Skip to content

Commit 74ce2ae

Browse files
authored
fix: update detect_filetype to properly handle older office files (#161)
1 parent 08ccee0 commit 74ce2ae

File tree

4 files changed

+41
-2
lines changed

4 files changed

+41
-2
lines changed

Diff for: CHANGELOG.md

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
1-
## 0.4.3-dev1
1+
## 0.4.3-dev2
22

33
* Fix in `exceeds_cap_ratio` so the function doesn't break with empty text
44
* Fix bug in `_parse_received_data`.
5+
* Update `detect_filetype` to properly handle `.doc`, `.xls`, and `.ppt`.
56

67
## 0.4.2
78

Diff for: test_unstructured/file_utils/test_filetype.py

+18
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,24 @@ def test_detect_application_zip_files(monkeypatch, tmpdir):
8888
assert filetype == FileType.ZIP
8989

9090

91+
def test_detect_doc_file_from_mime_type(monkeypatch):
92+
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/msword")
93+
filetype = detect_filetype(filename="fake.doc")
94+
assert filetype == FileType.DOC
95+
96+
97+
def test_detect_ppt_file_from_mime_type(monkeypatch):
98+
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/vnd.ms-powerpoint")
99+
filetype = detect_filetype(filename="fake.ppt")
100+
assert filetype == FileType.PPT
101+
102+
103+
def test_detect_xls_file_from_mime_type(monkeypatch):
104+
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/vnd.ms-excel")
105+
filetype = detect_filetype(filename="fake.xls")
106+
assert filetype == FileType.XLS
107+
108+
91109
def test_detect_xlsx_filetype_application_octet_stream(monkeypatch):
92110
monkeypatch.setattr(magic, "from_buffer", lambda *args, **kwargs: "application/octet-stream")
93111
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-excel.xlsx")

Diff for: unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.4.3-dev1" # pragma: no cover
1+
__version__ = "0.4.3-dev2" # pragma: no cover

Diff for: unstructured/file_utils/filetype.py

+20
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,25 @@
1111

1212
DOCX_MIME_TYPES = [
1313
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
14+
]
15+
16+
DOC_MIME_TYPES = [
1417
"application/msword",
1518
]
19+
1620
XLSX_MIME_TYPES = [
1721
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
22+
]
23+
24+
XLS_MIME_TYPES = [
1825
"application/vnd.ms-excel",
1926
]
27+
2028
PPTX_MIME_TYPES = [
2129
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
30+
]
31+
32+
PPT_MIME_TYPES = [
2233
"application/vnd.ms-powerpoint",
2334
]
2435

@@ -125,6 +136,9 @@ def detect_filetype(
125136
elif mime_type in DOCX_MIME_TYPES:
126137
return FileType.DOCX
127138

139+
elif mime_type in DOC_MIME_TYPES:
140+
return FileType.DOC
141+
128142
elif mime_type == "image/jpeg":
129143
return FileType.JPG
130144

@@ -157,9 +171,15 @@ def detect_filetype(
157171
elif mime_type in XLSX_MIME_TYPES:
158172
return FileType.XLSX
159173

174+
elif mime_type in XLS_MIME_TYPES:
175+
return FileType.XLS
176+
160177
elif mime_type in PPTX_MIME_TYPES:
161178
return FileType.PPTX
162179

180+
elif mime_type in PPT_MIME_TYPES:
181+
return FileType.PPT
182+
163183
elif mime_type == "application/octet-stream":
164184
if file and not extension:
165185
return _detect_filetype_from_octet_stream(file=file)

0 commit comments

Comments
 (0)