Skip to content

Commit 0159ec4

Browse files
committed
move check for ndjson into _file_type_from_guessed_mime_type
1 parent 20bc9cf · commit 0159ec4

File tree

1 file changed

+4
-2
lines changed

1 file changed

+4
-2
lines changed

unstructured/file_utils/filetype.py

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -172,8 +172,7 @@ def _file_type(self) -> FileType:
172 172
return file_type
173 173

174 174
# -- strategy 3: guess MIME-type using libmagic and use that --
175-
if file_type := self._file_type_from_guessed_mime_type and self._ctx.extension != ".ndjson":
176-
# ndjson gets detected as json mime type, will be incorrectly classified as JSON
175+
if file_type := self._file_type_from_guessed_mime_type:
177 176
return file_type
178 177

179 178
# -- strategy 4: use filename-extension, like ".docx" -> FileType.DOCX --
@@ -241,6 +240,9 @@ def _file_type_from_guessed_mime_type(self) -> FileType | None:
241 240
if mime_type.endswith("empty"):
242 241
return FileType.EMPTY
243 242

243+
if mime_type.endswith("json") and self._ctx.extension == ".ndjson":
244+
return FileType.NDJSON
245+
244 246
# -- if no more-specific rules apply, use the MIME-type -> FileType mapping when present --
245 247
file_type = FileType.from_mime_type(mime_type)
246 248
return file_type if file_type != FileType.UNK else None

0 commit comments

Comments
 (0)