|
1 |
| -import mimetypes |
2 | 1 | import os
|
3 |
| -from fastapi import UploadFile, HTTPException |
4 | 2 | from typing import Optional
|
| 3 | +from io import BytesIO |
5 | 4 |
|
6 |
| -DEFAULT_MIMETYPES = ( |
7 |
| - "application/pdf,application/msword,image/jpeg,image/png,text/markdown," |
8 |
| - "text/x-markdown,text/html," |
9 |
| - "application/vnd.openxmlformats-officedocument.wordprocessingml.document," |
10 |
| - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet," |
11 |
| - "application/vnd.ms-excel,application/vnd.openxmlformats-officedocument." |
12 |
| - "presentationml.presentation," |
13 |
| - "application/json," |
14 |
| - "application/vnd.ms-powerpoint," |
15 |
| - "text/html,message/rfc822,text/plain,image/png," |
16 |
| - "application/epub,application/epub+zip," |
17 |
| - "application/rtf,text/rtf," |
18 |
| - "application/vnd.oasis.opendocument.text," |
19 |
| - "text/csv,text/x-csv,application/csv,application/x-csv," |
20 |
| - "text/comma-separated-values,text/x-comma-separated-values," |
21 |
| - "application/xml,text/xml,text/x-rst,text/prs.fallenstein.rst," |
22 |
| - "text/tsv,text/tab-separated-values," |
23 |
| - "application/x-ole-storage,application/vnd.ms-outlook," |
24 |
| - "application/yaml," |
25 |
| - "application/x-yaml," |
26 |
| - "text/x-yaml," |
27 |
| - "text/yaml," |
28 |
| - "image/bmp," |
29 |
| - "image/heic," |
30 |
| - "image/tiff," |
31 |
| - "text/org," |
32 |
| -) |
| 5 | +from fastapi import HTTPException, UploadFile |
33 | 6 |
|
34 |
| -if not os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES", None): |
35 |
| - os.environ["UNSTRUCTURED_ALLOWED_MIMETYPES"] = DEFAULT_MIMETYPES |
| 7 | +from unstructured.file_utils.filetype import detect_filetype |
| 8 | +from unstructured.file_utils.model import FileType |
36 | 9 |
|
37 | 10 |
|
38 |
| -def _load_mimetypes() -> None: |
39 |
| - """Call this on startup to ensure that all expected file extensions are present in the mimetypes |
40 |
| - lib""" |
41 |
| - expected_mimetypes = [ |
42 |
| - (".bmp", "image/bmp"), |
43 |
| - (".csv", "application/csv"), |
44 |
| - (".doc", "application/msword"), |
45 |
| - (".docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"), |
46 |
| - (".eml", "message/rfc822"), |
47 |
| - (".epub", "application/epub"), |
48 |
| - (".gz", "application/gzip"), |
49 |
| - (".heic", "image/heic"), |
50 |
| - (".html", "text/html"), |
51 |
| - (".jpeg", "image/jpeg"), |
52 |
| - (".jpg", "image/jpeg"), |
53 |
| - (".json", "application/json"), |
54 |
| - (".md", "text/markdown"), |
55 |
| - (".msg", "application/x-ole-storage"), |
56 |
| - (".odt", "application/vnd.oasis.opendocument.text"), |
57 |
| - (".org", "text/org"), |
58 |
| - (".pdf", "application/pdf"), |
59 |
| - (".png", "image/png"), |
60 |
| - (".ppt", "application/vnd.ms-powerpoint"), |
61 |
| - (".pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"), |
62 |
| - (".rst", "text/prs.fallenstein.rst"), |
63 |
| - (".rtf", "application/rtf"), |
64 |
| - (".tiff", "image/tiff"), |
65 |
| - (".tsv", "text/tab-separated-values"), |
66 |
| - (".txt", "text/plain"), |
67 |
| - (".xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"), |
68 |
| - (".xml", "text/xml"), |
69 |
| - ] |
| 11 | +def _remove_optional_info_from_mime_type(content_type: str | None) -> str | None: |
| 12 | + """removes charset information from mime types, e.g., |
| 13 | + "application/json; charset=utf-8" -> "application/json" |
| 14 | + """ |
| 15 | + if not content_type: |
| 16 | + return content_type |
| 17 | + return content_type.split(";")[0] |
| 18 | + |
70 | 19 |
|
71 |
| - for extension, mimetype in expected_mimetypes: |
72 |
| - mimetypes.add_type(mimetype, extension) |
| 20 | +def get_validated_mimetype(file: UploadFile, content_type_hint: str | None = None) -> Optional[str]: |
| 21 | + """Given the incoming file, identify and return the correct mimetype. |
73 | 22 |
|
| 23 | + Order of operations: |
| 24 | + - If user passed content_type as a form param, take it as truth. |
| 25 | + - Otherwise, use file.content_type (as set by the Content-Type header) |
| 26 | + - If no content_type was passed and the header wasn't useful, call the library's detect_filetype |
74 | 27 |
|
75 |
| -_load_mimetypes() |
| 28 | + Once we have a filetype, check is_partitionable and return 400 if we don't support this file. |
| 29 | + """ |
| 30 | + content_type: str | None = None |
76 | 31 |
|
| 32 | + if content_type_hint is not None: |
| 33 | + content_type = content_type_hint |
| 34 | + else: |
| 35 | + content_type = _remove_optional_info_from_mime_type(file.content_type) |
77 | 36 |
|
78 |
| -def get_validated_mimetype(file: UploadFile) -> Optional[str]: |
79 |
| - """The MIME-type of `file`. |
| 37 | + filetype = FileType.from_mime_type(content_type) |
80 | 38 |
|
81 |
| - The mimetype is computed based on `file.content_type`, or the mimetypes lib if that's too |
82 |
| - generic. If the user has set UNSTRUCTURED_ALLOWED_MIMETYPES, validate against this list and |
83 |
| - return HTTP 400 for an invalid type. |
84 |
| - """ |
85 |
| - content_type = file.content_type |
86 |
| - filename = str(file.filename) # -- "None" when file.filename is None -- |
87 |
| - if not content_type or content_type == "application/octet-stream": |
88 |
| - content_type = mimetypes.guess_type(filename)[0] |
| 39 | + # If content_type was not specified, use the library to identify the file |
| 40 | + # We inspect the bytes to do this, so we need to buffer the file |
| 41 | + if not filetype or filetype == FileType.UNK: |
| 42 | + file_buffer = BytesIO(file.file.read()) |
| 43 | + file.file.seek(0) |
89 | 44 |
|
90 |
| - # Some filetypes missing for this library, just hardcode them for now |
91 |
| - if not content_type: |
92 |
| - if filename.endswith(".md"): |
93 |
| - content_type = "text/markdown" |
94 |
| - elif filename.endswith(".msg"): |
95 |
| - content_type = "message/rfc822" |
| 45 | + file_buffer.name = file.filename |
96 | 46 |
|
97 |
| - allowed_mimetypes_str = os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES") |
98 |
| - if allowed_mimetypes_str is not None: |
99 |
| - allowed_mimetypes = allowed_mimetypes_str.split(",") |
| 47 | + filetype = detect_filetype(file=file_buffer) |
100 | 48 |
|
101 |
| - if content_type not in allowed_mimetypes: |
102 |
| - raise HTTPException( |
103 |
| - status_code=400, |
104 |
| - detail=(f"File type {content_type} is not supported."), |
105 |
| - ) |
| 49 | + if not filetype.is_partitionable: |
| 50 | + raise HTTPException( |
| 51 | + status_code=400, |
| 52 | + detail=(f"File type {filetype.mime_type} is not supported."), |
| 53 | + ) |
106 | 54 |
|
107 |
| - return content_type |
| 55 | + return filetype.mime_type |
0 commit comments