1
1
import os
2
2
import pathlib
3
3
import pytest
4
+ import zipfile
4
5
5
6
import magic
6
7
7
8
from unstructured .file_utils .filetype import (
8
9
detect_filetype ,
9
10
FileType ,
10
- DOCX_MIME_TYPE ,
11
- XLSX_MIME_TYPE ,
11
+ DOCX_MIME_TYPES ,
12
+ XLSX_MIME_TYPES ,
12
13
)
13
14
14
15
# Absolute, resolved path of the directory that contains this test module.
FILE_DIRECTORY = pathlib .Path (__file__ ).parent .resolve ()
27
28
("example-10k.html" , FileType .HTML ),
28
29
("fake-html.html" , FileType .HTML ),
29
30
("fake-excel.xlsx" , FileType .XLSX ),
31
+ ("fake-power-point.pptx" , FileType .PPTX ),
30
32
],
31
33
)
32
34
def test_detect_filetype_from_filename (file , expected ):
@@ -46,6 +48,7 @@ def test_detect_filetype_from_filename(file, expected):
46
48
("example-10k.html" , FileType .XML ),
47
49
("fake-html.html" , FileType .HTML ),
48
50
("fake-excel.xlsx" , FileType .XLSX ),
51
+ ("fake-power-point.pptx" , FileType .PPTX ),
49
52
],
50
53
)
51
54
def test_detect_filetype_from_file (file , expected ):
@@ -69,6 +72,22 @@ def test_detect_docx_filetype_application_octet_stream_with_filename(monkeypatch
69
72
assert filetype == FileType .DOCX
70
73
71
74
75
def test_detect_docx_filetype_application_zip(monkeypatch):
    """A ``.docx`` path must be detected as DOCX when magic reports ``application/zip``."""

    def fake_from_file(*args, **kwargs):
        # Stand-in for libmagic: always report a generic zip container.
        return "application/zip"

    monkeypatch.setattr(magic, "from_file", fake_from_file)
    docx_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.docx")
    assert detect_filetype(filename=docx_path) == FileType.DOCX
80
+
81
+
82
def test_detect_application_zip_files(monkeypatch, tmpdir):
    """An empty zip archive reported as ``application/zip`` must be detected as ZIP.

    Args:
        monkeypatch: pytest fixture used to stub ``magic.from_file``.
        tmpdir: pytest fixture providing a temporary directory for the archive.
    """
    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/zip")
    filename = os.path.join(tmpdir, "test.zip")
    # Opening in "w" mode and closing without adding members writes an empty
    # archive; the context manager guarantees the handle is closed before
    # detection runs, even if archive creation raises.
    with zipfile.ZipFile(filename, "w"):
        pass
    filetype = detect_filetype(filename=filename)
    assert filetype == FileType.ZIP
89
+
90
+
72
91
def test_detect_xlsx_filetype_application_octet_stream (monkeypatch ):
73
92
monkeypatch .setattr (magic , "from_buffer" , lambda * args , ** kwargs : "application/octet-stream" )
74
93
filename = os .path .join (EXAMPLE_DOCS_DIRECTORY , "fake-excel.xlsx" )
@@ -84,24 +103,47 @@ def test_detect_xlsx_filetype_application_octet_stream_with_filename(monkeypatch
84
103
assert filetype == FileType .XLSX
85
104
86
105
106
def test_detect_pptx_filetype_application_octet_stream(monkeypatch):
    """An open ``.pptx`` stream must be detected as PPTX when magic reports octet-stream."""

    def fake_from_buffer(*args, **kwargs):
        # Stand-in for libmagic: always report an opaque binary blob.
        return "application/octet-stream"

    monkeypatch.setattr(magic, "from_buffer", fake_from_buffer)
    pptx_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
    with open(pptx_path, "rb") as stream:
        detected = detect_filetype(file=stream)
    assert detected == FileType.PPTX
112
+
113
+
114
def test_detect_pptx_filetype_application_octet_stream_with_filename(monkeypatch):
    """A ``.pptx`` path must be detected as PPTX when magic reports octet-stream."""

    def fake_from_file(*args, **kwargs):
        # Stand-in for libmagic: always report an opaque binary blob.
        return "application/octet-stream"

    monkeypatch.setattr(magic, "from_file", fake_from_file)
    pptx_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
    assert detect_filetype(filename=pptx_path) == FileType.PPTX
119
+
120
+
87
121
def test_detect_application_octet_stream_returns_none_with_unknown(monkeypatch):
    """A text file stream reported as octet-stream must be detected as ``FileType.UNK``.

    The test name still says "returns_none"; the expected value asserted here
    is ``FileType.UNK``.
    """

    def fake_from_buffer(*args, **kwargs):
        # Stand-in for libmagic: always report an opaque binary blob.
        return "application/octet-stream"

    monkeypatch.setattr(magic, "from_buffer", fake_from_buffer)
    text_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
    with open(text_path, "rb") as stream:
        detected = detect_filetype(file=stream)
    assert detected == FileType.UNK
127
+
128
+
129
def test_detect_application_zip_returns_zip_with_unknown(monkeypatch):
    """A text file stream reported as ``application/zip`` must be detected as ZIP."""

    def fake_from_buffer(*args, **kwargs):
        # Stand-in for libmagic: always report a generic zip container.
        return "application/zip"

    monkeypatch.setattr(magic, "from_buffer", fake_from_buffer)
    text_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
    with open(text_path, "rb") as stream:
        detected = detect_filetype(file=stream)
    assert detected == FileType.ZIP
93
135
94
136
95
137
def test_detect_docx_filetype_word_mime_type(monkeypatch):
    """A file object whose reported MIME type is the DOCX type must be detected as DOCX.

    Args:
        monkeypatch: pytest fixture used to stub the libmagic lookup.
    """
    # The other file-object tests in this module stub ``magic.from_buffer``;
    # this test previously stubbed ``magic.from_file`` while passing ``file=``.
    # NOTE(review): assumes detect_filetype(file=...) consults magic.from_buffer,
    # in which case the old stub was never exercised — confirm against the
    # implementation.
    monkeypatch.setattr(magic, "from_buffer", lambda *args, **kwargs: DOCX_MIME_TYPES[0])
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.docx")
    with open(filename, "rb") as f:
        filetype = detect_filetype(file=f)
    assert filetype == FileType.DOCX
101
143
102
144
103
145
def test_detect_xlsx_filetype_word_mime_type (monkeypatch ):
104
- monkeypatch .setattr (magic , "from_file" , lambda * args , ** kwargs : XLSX_MIME_TYPE )
146
+ monkeypatch .setattr (magic , "from_file" , lambda * args , ** kwargs : XLSX_MIME_TYPES [ 0 ] )
105
147
filename = os .path .join (EXAMPLE_DOCS_DIRECTORY , "fake-excel.xlsx" )
106
148
with open (filename , "rb" ) as f :
107
149
filetype = detect_filetype (file = f )
@@ -110,7 +152,17 @@ def test_detect_xlsx_filetype_word_mime_type(monkeypatch):
110
152
111
153
def test_detect_filetype_returns_none_with_unknown(monkeypatch):
    """An unrecognised MIME type on an unrecognised filename must yield ``FileType.UNK``.

    The test name still says "returns_none"; the expected value asserted here
    is ``FileType.UNK``.
    """

    def fake_from_file(*args, **kwargs):
        # Stand-in for libmagic: report a MIME type that is made up.
        return "application/fake"

    monkeypatch.setattr(magic, "from_file", fake_from_file)
    detected = detect_filetype(filename="made_up.fake")
    assert detected == FileType.UNK
156
+
157
+
158
def test_detect_filetype_detects_png(monkeypatch):
    """A path reported by magic as ``image/png`` must be detected as PNG."""

    def fake_from_file(*args, **kwargs):
        # Stand-in for libmagic: always report a PNG image.
        return "image/png"

    monkeypatch.setattr(magic, "from_file", fake_from_file)
    detected = detect_filetype(filename="made_up.png")
    assert detected == FileType.PNG
161
+
162
+
163
def test_detect_filetype_detects_unknown_text_types_as_txt(monkeypatch):
    """A path reported by magic as an unfamiliar ``text/*`` type must be detected as TXT."""

    def fake_from_file(*args, **kwargs):
        # Stand-in for libmagic: report a text subtype that is made up.
        return "text/new-type"

    monkeypatch.setattr(magic, "from_file", fake_from_file)
    # NOTE(review): the filename ends in .png while the stubbed MIME type is
    # text/new-type — presumably deliberate, to show the MIME type drives the
    # result; confirm.
    detected = detect_filetype(filename="made_up.png")
    assert detected == FileType.TXT
114
166
115
167
116
168
def test_detect_filetype_raises_with_both_specified ():
@@ -123,3 +175,7 @@ def test_detect_filetype_raises_with_both_specified():
123
175
def test_detect_filetype_raises_with_none_specified():
    """Calling ``detect_filetype`` with no arguments must raise ``ValueError``."""
    expected_error = ValueError
    with pytest.raises(expected_error):
        detect_filetype()
178
+
179
+
180
def test_filetype_order():
    """``FileType`` members are orderable and HTML compares less than XML."""
    lower, higher = FileType.HTML, FileType.XML
    assert lower < higher
0 commit comments