diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1c4896e5df..dafc14d342 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,9 @@
+## 0.16.22-dev0
+
+### Fixes
+
+* **Handle filenames without extensions in file type detection**
+
 ## 0.16.21
 
 ### Enhancements
diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py
index 8a58e89964..4d199d05e2 100644
--- a/test_unstructured/file_utils/test_filetype.py
+++ b/test_unstructured/file_utils/test_filetype.py
@@ -609,6 +609,8 @@ def and_it_derives_the_extension_from_metadata_file_path_when_file_object_has_no
             None,
             # -- case 2: file-like object has `.name` attribute but it's value is the empty string
             "",
+            # -- case 3: file-like object has name with no extension --
+            "q3_invoices",
         ],
     )
     def and_it_returns_the_empty_string_as_the_extension_when_there_are_no_file_name_sources(
@@ -621,6 +623,26 @@ def and_it_returns_the_empty_string_as_the_extension_when_there_are_no_file_name
 
         assert _FileTypeDetectionContext(file=file).extension == ""
 
+    @pytest.mark.parametrize(
+        "file_name",
+        [
+            # -- case 1: file-like object has no `.name` attribute
+            None,
+            # -- case 2: file-like object has `.name` attribute but it's value is the empty string
+            "",
+            # -- case 3: file-like object has name with no extension --
+            "q3_invoices",
+        ],
+    )
+    def and_it_returns_the_empty_string_as_the_extension_when_there_are_no_file_name_nor_metadata(
+        self, file_name: str | None
+    ):
+        with open(example_doc_path("ideas-page.html"), "rb") as f:
+            file = io.BytesIO(f.read())
+            file.name = None
+
+        assert _FileTypeDetectionContext(file=file, metadata_file_path=file_name).extension == ""
+
     # -- .file_head ---------------------------------------------
 
     def it_grabs_the_first_8k_bytes_of_the_file_for_use_by_magic(self):
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index fb8bd1ff84..5614e0ae84 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.16.21"  # pragma: no cover
+__version__ = "0.16.22-dev0"  # pragma: no cover
diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py
index c34883daf2..1ea2e37b96 100644
--- a/unstructured/file_utils/filetype.py
+++ b/unstructured/file_utils/filetype.py
@@ -346,11 +346,13 @@ def extension(self) -> str:
         # -- get from file_path, or file when it has a name (path) --
         with self.open() as file:
             if hasattr(file, "name") and file.name:
-                return os.path.splitext(file.name)[1].lower()
+                # -- fall through to metadata when the name has no extension --
+                if ext := os.path.splitext(file.name)[1]:
+                    return ext.lower()
 
         # -- otherwise use metadata file-path when provided --
         if file_path := self._metadata_file_path:
             return os.path.splitext(file_path)[1].lower()
 
         # -- otherwise empty str means no extension, same as a path like "a/b/name-no-ext" --
         return ""