Skip to content

Commit 7468938

Browse files
authored
feat: enhance API filetype detection (#445)
# Use the library for filetype detection The mimetype detection has always been very naive in the API - we rely on the file extension. If the user doesn't include a filename, we return an error that `Filetype None is not supported`. The library has a detect_filetype that actually inspects the file bytes, so let's reuse this. # Add a `content_type` param to override filetype detection Add an optional `content_type` param that allows the user to override the filetype detection. We'll use this value if it's set, or take the `file.content_type` which is based on the multipart `Content-Type` header. This provides an alternative when clients are unable to modify the header. # Testing The important thing is that `test_happy_path_all_types` passes in the docker smoke test - this contains all filetypes that we want the API to support. To test manually, you can try sending files to the server with and without the filename/content_type defined. Check out this branch and run `make run-web-app`. Example sending with no extension in filename. This correctly processes a pdf. ``` import requests filename = "sample-docs/layout-parser-paper-fast.pdf" url = "http://localhost:8000/general/v0/general" with open(filename, 'rb') as f: files = {'files': ("sample-doc", f)} response = requests.post(url, files=files) print(response.text) ``` For the new param, you can try modifying the content type for a text based file. 
Verify that you can change the `metadata.filetype` of the response using the new param: ``` curl --location 'http://localhost:8000/general/v0/general' \ --form 'files=@"sample-docs/family-day.eml"' \ --form 'content_type="text/plain"' [ { "type": "UncategorizedText", "element_id": "5cafe1ce2b0a96f8e3eba232e790db19", "text": "MIME-Version: 1.0 Date: Wed, 21 Dec 2022 10:28:53 -0600 Message-ID: <CAPgNNXQKR=o6AsOTr74VMrsDNhUJW0Keou9n3vLa2UO_Nv+tZw@mail.gmail.com> Subject: Family Day From: Mallori Harrell <[email protected]> To: Mallori Harrell <[email protected]> Content-Type: multipart/alternative; boundary=\"0000000000005c115405f0590ce4\"", "metadata": { "filename": "family-day.eml", "languages": [ "eng" ], "filetype": "text/plain" } }, ... ] ```
1 parent d5502d0 commit 7468938

File tree

9 files changed

+152
-302
lines changed

9 files changed

+152
-302
lines changed

CHANGELOG.md

+5
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
## 0.0.76
2+
* Use the library's `detect_filetype` in API to determine mimetype
3+
* Add `content_type` API parameter
4+
* Bump to `unstructured` 0.15.1
5+
16
## 0.0.75
27

38
* Remove constraint on `safetensors` that was preventing us from bumping `transformers`.

prepline_general/api/app.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
app = FastAPI(
1414
title="Unstructured Pipeline API",
1515
summary="Partition documents with the Unstructured library",
16-
version="0.0.75",
16+
version="0.0.76",
1717
docs_url="/general/docs",
1818
openapi_url="/general/openapi.json",
1919
servers=[

prepline_general/api/filetypes.py

+39-91
Original file line numberDiff line numberDiff line change
@@ -1,107 +1,55 @@
1-
import mimetypes
21
import os
3-
from fastapi import UploadFile, HTTPException
42
from typing import Optional
3+
from io import BytesIO
54

6-
DEFAULT_MIMETYPES = (
7-
"application/pdf,application/msword,image/jpeg,image/png,text/markdown,"
8-
"text/x-markdown,text/html,"
9-
"application/vnd.openxmlformats-officedocument.wordprocessingml.document,"
10-
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet,"
11-
"application/vnd.ms-excel,application/vnd.openxmlformats-officedocument."
12-
"presentationml.presentation,"
13-
"application/json,"
14-
"application/vnd.ms-powerpoint,"
15-
"text/html,message/rfc822,text/plain,image/png,"
16-
"application/epub,application/epub+zip,"
17-
"application/rtf,text/rtf,"
18-
"application/vnd.oasis.opendocument.text,"
19-
"text/csv,text/x-csv,application/csv,application/x-csv,"
20-
"text/comma-separated-values,text/x-comma-separated-values,"
21-
"application/xml,text/xml,text/x-rst,text/prs.fallenstein.rst,"
22-
"text/tsv,text/tab-separated-values,"
23-
"application/x-ole-storage,application/vnd.ms-outlook,"
24-
"application/yaml,"
25-
"application/x-yaml,"
26-
"text/x-yaml,"
27-
"text/yaml,"
28-
"image/bmp,"
29-
"image/heic,"
30-
"image/tiff,"
31-
"text/org,"
32-
)
5+
from fastapi import HTTPException, UploadFile
336

34-
if not os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES", None):
35-
os.environ["UNSTRUCTURED_ALLOWED_MIMETYPES"] = DEFAULT_MIMETYPES
7+
from unstructured.file_utils.filetype import detect_filetype
8+
from unstructured.file_utils.model import FileType
369

3710

38-
def _load_mimetypes() -> None:
39-
"""Call this on startup to ensure that all expected file extensions are present in the mimetypes
40-
lib"""
41-
expected_mimetypes = [
42-
(".bmp", "image/bmp"),
43-
(".csv", "application/csv"),
44-
(".doc", "application/msword"),
45-
(".docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
46-
(".eml", "message/rfc822"),
47-
(".epub", "application/epub"),
48-
(".gz", "application/gzip"),
49-
(".heic", "image/heic"),
50-
(".html", "text/html"),
51-
(".jpeg", "image/jpeg"),
52-
(".jpg", "image/jpeg"),
53-
(".json", "application/json"),
54-
(".md", "text/markdown"),
55-
(".msg", "application/x-ole-storage"),
56-
(".odt", "application/vnd.oasis.opendocument.text"),
57-
(".org", "text/org"),
58-
(".pdf", "application/pdf"),
59-
(".png", "image/png"),
60-
(".ppt", "application/vnd.ms-powerpoint"),
61-
(".pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"),
62-
(".rst", "text/prs.fallenstein.rst"),
63-
(".rtf", "application/rtf"),
64-
(".tiff", "image/tiff"),
65-
(".tsv", "text/tab-separated-values"),
66-
(".txt", "text/plain"),
67-
(".xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
68-
(".xml", "text/xml"),
69-
]
11+
def _remove_optional_info_from_mime_type(content_type: str | None) -> str | None:
12+
"""removes charset information from mime types, e.g.,
13+
"application/json; charset=utf-8" -> "application/json"
14+
"""
15+
if not content_type:
16+
return content_type
17+
return content_type.split(";")[0]
18+
7019

71-
for extension, mimetype in expected_mimetypes:
72-
mimetypes.add_type(mimetype, extension)
20+
def get_validated_mimetype(file: UploadFile, content_type_hint: str | None = None) -> Optional[str]:
21+
"""Given the incoming file, identify and return the correct mimetype.
7322
23+
Order of operations:
24+
- If user passed content_type as a form param, take it as truth.
25+
- Otherwise, use file.content_type (as set by the Content-Type header)
26+
- If no content_type was passed and the header wasn't useful, call the library's detect_filetype
7427
75-
_load_mimetypes()
28+
Once we have a filetype, check is_partitionable and return 400 if we don't support this file.
29+
"""
30+
content_type: str | None = None
7631

32+
if content_type_hint is not None:
33+
content_type = content_type_hint
34+
else:
35+
content_type = _remove_optional_info_from_mime_type(file.content_type)
7736

78-
def get_validated_mimetype(file: UploadFile) -> Optional[str]:
79-
"""The MIME-type of `file`.
37+
filetype = FileType.from_mime_type(content_type)
8038

81-
The mimetype is computed based on `file.content_type`, or the mimetypes lib if that's too
82-
generic. If the user has set UNSTRUCTURED_ALLOWED_MIMETYPES, validate against this list and
83-
return HTTP 400 for an invalid type.
84-
"""
85-
content_type = file.content_type
86-
filename = str(file.filename) # -- "None" when file.filename is None --
87-
if not content_type or content_type == "application/octet-stream":
88-
content_type = mimetypes.guess_type(filename)[0]
39+
# If content_type was not specified, use the library to identify the file
40+
# We inspect the bytes to do this, so we need to buffer the file
41+
if not filetype or filetype == FileType.UNK:
42+
file_buffer = BytesIO(file.file.read())
43+
file.file.seek(0)
8944

90-
# Some filetypes missing for this library, just hardcode them for now
91-
if not content_type:
92-
if filename.endswith(".md"):
93-
content_type = "text/markdown"
94-
elif filename.endswith(".msg"):
95-
content_type = "message/rfc822"
45+
file_buffer.name = file.filename
9646

97-
allowed_mimetypes_str = os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES")
98-
if allowed_mimetypes_str is not None:
99-
allowed_mimetypes = allowed_mimetypes_str.split(",")
47+
filetype = detect_filetype(file=file_buffer)
10048

101-
if content_type not in allowed_mimetypes:
102-
raise HTTPException(
103-
status_code=400,
104-
detail=(f"File type {content_type} is not supported."),
105-
)
49+
if not filetype.is_partitionable:
50+
raise HTTPException(
51+
status_code=400,
52+
detail=(f"File type {filetype.mime_type} is not supported."),
53+
)
10654

107-
return content_type
55+
return filetype.mime_type

prepline_general/api/general.py

+10-8
Original file line numberDiff line numberDiff line change
@@ -649,7 +649,7 @@ def return_content_type(filename: str):
649649

650650

651651
@router.get("/general/v0/general", include_in_schema=False)
652-
@router.get("/general/v0.0.75/general", include_in_schema=False)
652+
@router.get("/general/v0.0.76/general", include_in_schema=False)
653653
async def handle_invalid_get_request():
654654
raise HTTPException(
655655
status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Only POST requests are supported."
@@ -664,7 +664,7 @@ async def handle_invalid_get_request():
664664
description="Description",
665665
operation_id="partition_parameters",
666666
)
667-
@router.post("/general/v0.0.75/general", include_in_schema=False)
667+
@router.post("/general/v0.0.76/general", include_in_schema=False)
668668
def general_partition(
669669
request: Request,
670670
# cannot use annotated type here because of a bug described here:
@@ -683,13 +683,13 @@ def general_partition(
683683
detail=f"API key {api_key} is invalid", status_code=status.HTTP_401_UNAUTHORIZED
684684
)
685685

686-
content_type = request.headers.get("Accept")
686+
accept_type = request.headers.get("Accept")
687687

688688
# -- detect response content-type conflict when multiple files are uploaded --
689689
if (
690690
len(files) > 1
691-
and content_type
692-
and content_type
691+
and accept_type
692+
and accept_type
693693
not in [
694694
"*/*",
695695
"multipart/mixed",
@@ -698,7 +698,7 @@ def general_partition(
698698
]
699699
):
700700
raise HTTPException(
701-
detail=f"Conflict in media type {content_type} with response type 'multipart/mixed'.\n",
701+
detail=f"Conflict in media type {accept_type} with response type 'multipart/mixed'.\n",
702702
status_code=status.HTTP_406_NOT_ACCEPTABLE,
703703
)
704704

@@ -714,7 +714,9 @@ def general_partition(
714714

715715
def response_generator(is_multipart: bool):
716716
for file in files:
717-
file_content_type = get_validated_mimetype(file)
717+
file_content_type = get_validated_mimetype(
718+
file, content_type_hint=form_params.content_type
719+
)
718720

719721
_file = file.file
720722

@@ -781,7 +783,7 @@ def join_responses(
781783
MultipartMixedResponse(
782784
response_generator(is_multipart=True), content_type=form_params.output_format
783785
)
784-
if content_type == "multipart/mixed"
786+
if accept_type == "multipart/mixed"
785787
else (
786788
list(response_generator(is_multipart=False))[0]
787789
if len(files) == 1

prepline_general/api/models/form_params.py

+11
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ class GeneralFormParams(BaseModel):
2020
output_format: str
2121
coordinates: bool
2222
encoding: str
23+
content_type: Optional[str]
2324
hi_res_model_name: Optional[str]
2425
include_page_breaks: bool
2526
pdf_infer_table_structure: bool
@@ -100,6 +101,15 @@ def as_form(
100101
),
101102
BeforeValidator(SmartValueParser[bool]().value_or_first_element),
102103
] = False,
104+
content_type: Annotated[
105+
Optional[str],
106+
Form(
107+
title="Content type",
108+
description="A hint about the content type to use (such as text/markdown), when there are problems processing a specific file. This value is a MIME type in the format type/subtype.",
109+
example="text/markdown",
110+
),
111+
BeforeValidator(SmartValueParser[str]().value_or_first_element),
112+
] = None,
103113
encoding: Annotated[
104114
str,
105115
Form(
@@ -245,6 +255,7 @@ def as_form(
245255
gz_uncompressed_content_type=gz_uncompressed_content_type,
246256
output_format=output_format,
247257
coordinates=coordinates,
258+
content_type=content_type,
248259
encoding=encoding,
249260
hi_res_model_name=hi_res_model_name,
250261
include_page_breaks=include_page_breaks,

preprocessing-pipeline-family.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
name: general
2-
version: 0.0.75
2+
version: 0.0.76

0 commit comments

Comments
 (0)