Skip to content

Commit 1425726

Browse files
committed
fix case where extension is not available
1 parent 21c47eb commit 1425726

File tree

3 files changed

+46
-6
lines changed

3 files changed

+46
-6
lines changed

test_unstructured/file_utils/test_filetype.py

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -154,10 +154,10 @@ def test_it_identifies_NDJSON_for_file_like_object_with_no_name_but_NDJSON_conte
154154
assert detect_filetype(file=file, content_type=FileType.NDJSON.mime_type) == FileType.NDJSON
155155

156156

157-
# TODO: ideally this test should pass, currently fails
158-
# def test_it_identifies_NDJSON_for_file_with_ndjson_extension_but_JSON_content_type():
159-
# file_path = example_doc_path("simple.ndjson")
160-
# assert detect_filetype(file_path, content_type=FileType.JSON.mime_type) == FileType.NDJSON
157+
def test_it_identifies_NDJSON_for_file_with_ndjson_extension_but_JSON_content_type():
    """A `.ndjson` path is reported as NDJSON even when the caller asserts a JSON content-type."""
    ndjson_path = example_doc_path("simple.ndjson")
    detected = detect_filetype(ndjson_path, content_type=FileType.JSON.mime_type)
    assert detected == FileType.NDJSON
160+
161161

162162
# ================================================================================================
163163
# STRATEGY #3 - GUESS MIME-TYPE WITH LIBMAGIC/FILETYPE LIBRARY
@@ -396,6 +396,27 @@ def test_it_detects_HTML_from_guessed_mime_type_ending_with_xml_and_html_extensi
396396
assert file_type is FileType.HTML
397397

398398

399+
@pytest.mark.parametrize(
    ("expected_value", "file_name"),
    [
        (FileType.NDJSON, "simple.ndjson"),
        (FileType.JSON, "spring-weather.html.json"),
    ],
)
def test_it_detects_correct_json_type_without_extension(expected_value: FileType, file_name: str):
    """JSON and NDJSON are told apart when only an in-memory stream is supplied."""
    with open(example_doc_path(file_name), "rb") as source:
        payload = source.read()

    # -- only the bytes are handed over (no `file_path`), wrapped in a fresh in-memory stream --
    stream = io.BytesIO(payload)

    assert detect_filetype(file=stream) == expected_value
409+
410+
411+
@pytest.mark.parametrize(
    ("expected_value", "file_name"),
    [
        (FileType.NDJSON, "simple.ndjson"),
        (FileType.JSON, "spring-weather.html.json"),
    ],
)
def test_it_detects_correct_json_type_with_extension(expected_value: FileType, file_name: str):
    """JSON and NDJSON are told apart when the example document is passed by path."""
    doc_path = example_doc_path(file_name)
    assert detect_filetype(file_path=doc_path) == expected_value
418+
419+
399420
@pytest.mark.parametrize(
400421
("mime_type", "file_name"),
401422
[

unstructured/file_utils/filetype.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -179,7 +179,11 @@ def _file_type(self) -> FileType:
179179
if file_type := self._file_type_from_file_extension:
180180
return file_type
181181

182-
# -- strategy 5: give up and report FileType.UNK --
182+
# -- strategy 5: edge case of JSON/NDJSON content without a file extension --
183+
if file_type := self._disambiguate_json_file_type:
184+
return file_type
185+
186+
# -- strategy 6: give up and report FileType.UNK --
183187
return FileType.UNK
184188

185189
# == STRATEGIES ============================================================
@@ -210,6 +214,20 @@ def _file_type_from_content_type(self) -> FileType | None:
210214
# -- otherwise we trust the passed `content_type` as long as `FileType` recognizes it --
211215
return FileType.from_mime_type(self._ctx.content_type)
212216

217+
@property
def _disambiguate_json_file_type(self) -> FileType | None:
    """Tell JSON from NDJSON by inspecting the head of the file text.

    Only applies when the context's content-type is exactly `application/json`; any other
    content-type yields `None`. The JSON check runs first, so text that passes it is reported
    as `FileType.JSON` without consulting the NDJSON check. Returns `None` when neither check
    accepts the text.
    """
    if self._ctx.content_type == "application/json":
        # -- JSON takes precedence over NDJSON --
        if is_json_processable(file_text=self._ctx.text_head):
            return FileType.JSON
        if is_ndjson_processable(file_text=self._ctx.text_head):
            return FileType.NDJSON

    return None
230+
213231
@property
214232
def _file_type_from_guessed_mime_type(self) -> FileType | None:
215233
"""FileType based on auto-detection of MIME-type by libmagic.

unstructured/file_utils/model.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,8 @@ def from_mime_type(cls, mime_type: str | None) -> FileType | None:
8282
Returns `None` when `mime_type` is `None` or does not map to the canonical MIME-type of a
8383
`FileType` member or one of its alias MIME-types.
8484
"""
85-
if mime_type is None:
85+
if mime_type is None or mime_type == "application/json":
86+
# application/json is ambiguous as it may point to either the JSON or the NDJSON file type
8687
return None
8788
# -- not super efficient but plenty fast enough for once-or-twice-per-file use and avoids
8889
# -- limitations on defining a class variable on an Enum.

0 commit comments

Comments
 (0)