Skip to content

Commit eba4c80

Browse files
authored
feat: get_directory_file_info for exploring a directory of files (#142)
* added python-pptx to requirements * added filetype detection for powerpoint * add more filetypes to detect * more tests * added tests for filetype * reorder document types * tests for get_directory_file_info * added docs for get_directory_file_info * bump version * Word -> Office * added test for filetype * add group by filetype example
1 parent 7e3af6c commit eba4c80

File tree

11 files changed

+313
-29
lines changed

11 files changed

+313
-29
lines changed

CHANGELOG.md

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## 0.4.0-dev0
1+
## 0.4.0
22

33
* Added generic `partition` brick that detects the file type and routes a file to the appropriate
44
partitioning brick.
@@ -14,6 +14,7 @@
1414
* Added new function to parse plain text files `partition_text`
1515
* Added new cleaners functions `extract_ip_address`, `extract_ip_address_name`, `extract_mapi_id`, `extract_datetimetz`
1616
* Add new `Image` element and function to find embedded images `find_embedded_images`
17+
* Added `get_directory_file_info` for summarizing information about source documents
1718

1819
## 0.3.5
1920

docs/source/examples.rst

+64
Original file line numberDiff line numberDiff line change
@@ -270,3 +270,67 @@ You can also pass in a file-like object with:
270270
271271
To extract metadata from ``.docx`` or ``.xlsx``, use ``get_docx_metadata`` and
272272
``get_xlsx_metadata``. The interfaces are the same as ``get_jpg_metadata``.
273+
274+
275+
###########################
276+
Exploring Source Documents
277+
###########################
278+
279+
The ``unstructured`` library includes tools for helping you explore source documents.
280+
To get a summary of the size (in bytes) and type of documents in a directory, you can
281+
use the ``get_directory_file_info`` function, as shown below. The function will
282+
recursively explore files in subdirectories.
283+
284+
.. code:: python
285+
286+
from unstructured.file_utils.exploration import get_directory_file_info
287+
288+
file_info = get_directory_file_info("example-docs")
289+
file_info.filetype.value_counts()
290+
291+
292+
The output (``file_info``) is a ``pandas`` ``DataFrame``.
293+
The result should look similar to:
294+
295+
.. code:: python
296+
297+
FileType.EML 4
298+
FileType.TXT 3
299+
FileType.HTML 2
300+
FileType.XML 2
301+
FileType.PDF 2
302+
FileType.DOCX 1
303+
FileType.PPTX 1
304+
FileType.XLSX 1
305+
FileType.JPG 1
306+
Name: filetype, dtype: int64
307+
308+
309+
You can also find the average file size by file type by using the following command:
310+
311+
312+
.. code:: python
313+
314+
from unstructured.file_utils.exploration import get_directory_file_info
315+
316+
file_info = get_directory_file_info("example-docs")
317+
file_info.groupby("filetype").mean()
318+
319+
320+
The output should look similar to the following:
321+
322+
.. code:: python
323+
324+
325+
filesize
326+
filetype
327+
FileType.DOCX 3.660200e+04
328+
FileType.EML 1.490885e+05
329+
FileType.HTML 1.228404e+06
330+
FileType.JPG 3.276400e+04
331+
FileType.PDF 2.429245e+06
332+
FileType.PPTX 2.832900e+04
333+
FileType.TXT 6.113333e+02
334+
FileType.XLSX 4.765000e+03
335+
FileType.XML 7.135000e+02
336+

example-docs/fake-power-point.pptx

27.7 KB
Binary file not shown.

requirements/base.txt

+8-1
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ joblib==1.2.0
3131
lxml==4.9.2
3232
# via
3333
# python-docx
34+
# python-pptx
3435
# unstructured (setup.py)
3536
monotonic==1.6
3637
# via argilla
@@ -49,7 +50,9 @@ pandas==1.5.2
4950
# argilla
5051
# unstructured (setup.py)
5152
pillow==9.4.0
52-
# via unstructured (setup.py)
53+
# via
54+
# python-pptx
55+
# unstructured (setup.py)
5356
pydantic==1.10.4
5457
# via argilla
5558
python-dateutil==2.8.2
@@ -58,6 +61,8 @@ python-docx==0.8.11
5861
# via unstructured (setup.py)
5962
python-magic==0.4.27
6063
# via unstructured (setup.py)
64+
python-pptx==0.6.21
65+
# via unstructured (setup.py)
6166
pytz==2022.7
6267
# via pandas
6368
regex==2022.10.31
@@ -80,3 +85,5 @@ wrapt==1.13.3
8085
# via
8186
# argilla
8287
# deprecated
88+
xlsxwriter==3.0.6
89+
# via python-pptx

requirements/huggingface.txt

+8-1
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ langdetect==1.0.9
4848
lxml==4.9.2
4949
# via
5050
# python-docx
51+
# python-pptx
5152
# unstructured (setup.py)
5253
monotonic==1.6
5354
# via argilla
@@ -70,7 +71,9 @@ pandas==1.5.2
7071
# argilla
7172
# unstructured (setup.py)
7273
pillow==9.4.0
73-
# via unstructured (setup.py)
74+
# via
75+
# python-pptx
76+
# unstructured (setup.py)
7477
pydantic==1.10.4
7578
# via argilla
7679
python-dateutil==2.8.2
@@ -79,6 +82,8 @@ python-docx==0.8.11
7982
# via unstructured (setup.py)
8083
python-magic==0.4.27
8184
# via unstructured (setup.py)
85+
python-pptx==0.6.21
86+
# via unstructured (setup.py)
8287
pytz==2022.7
8388
# via pandas
8489
pyyaml==6.0
@@ -133,3 +138,5 @@ wrapt==1.13.3
133138
# via
134139
# argilla
135140
# deprecated
141+
xlsxwriter==3.0.6
142+
# via python-pptx

setup.py

+1
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@
5555
"pandas",
5656
"pillow",
5757
"python-docx",
58+
"python-pptx",
5859
"python-magic",
5960
# NOTE(robinson) - The following dependencies are pinned
6061
# to address security scans
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
import os
2+
3+
import pandas as pd
4+
5+
import unstructured.file_utils.exploration as exploration
6+
7+
8+
def test_get_directory_file_info(tmpdir):
    """Check that get_directory_file_info walks nested directories.

    Builds two sub-directories under ``tmpdir``, each holding one small text
    file, and verifies that the result is a DataFrame listing both files and
    that grouping by file type yields a mean over the ``filesize`` column only.
    """
    file_info_test = os.path.join(tmpdir, "file_info_test")
    if not os.path.exists(file_info_test):
        os.mkdir(file_info_test)

    directory1 = os.path.join(file_info_test, "directory1")
    if not os.path.exists(directory1):
        os.mkdir(directory1)

    filename1 = os.path.join(directory1, "filename1.txt")
    with open(filename1, "w") as f:
        f.write("hello there!")

    directory2 = os.path.join(file_info_test, "directory2")
    if not os.path.exists(directory2):
        os.mkdir(directory2)

    filename2 = os.path.join(directory2, "filename2.txt")
    with open(filename2, "w") as f:
        f.write("hello there!")

    file_info = exploration.get_directory_file_info(file_info_test)
    assert isinstance(file_info, pd.DataFrame)
    assert set(file_info["filename"].to_list()) == {"filename1.txt", "filename2.txt"}

    # Restrict the aggregation to numeric columns explicitly: the frame also
    # holds string columns (filename, path, extension), and newer pandas
    # versions raise instead of silently dropping them.
    means = file_info.groupby("filetype").mean(numeric_only=True)
    assert means.columns.to_list() == ["filesize"]

test_unstructured/file_utils/test_filetype.py

+62-6
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,15 @@
11
import os
22
import pathlib
33
import pytest
4+
import zipfile
45

56
import magic
67

78
from unstructured.file_utils.filetype import (
89
detect_filetype,
910
FileType,
10-
DOCX_MIME_TYPE,
11-
XLSX_MIME_TYPE,
11+
DOCX_MIME_TYPES,
12+
XLSX_MIME_TYPES,
1213
)
1314

1415
FILE_DIRECTORY = pathlib.Path(__file__).parent.resolve()
@@ -27,6 +28,7 @@
2728
("example-10k.html", FileType.HTML),
2829
("fake-html.html", FileType.HTML),
2930
("fake-excel.xlsx", FileType.XLSX),
31+
("fake-power-point.pptx", FileType.PPTX),
3032
],
3133
)
3234
def test_detect_filetype_from_filename(file, expected):
@@ -46,6 +48,7 @@ def test_detect_filetype_from_filename(file, expected):
4648
("example-10k.html", FileType.XML),
4749
("fake-html.html", FileType.HTML),
4850
("fake-excel.xlsx", FileType.XLSX),
51+
("fake-power-point.pptx", FileType.PPTX),
4952
],
5053
)
5154
def test_detect_filetype_from_file(file, expected):
@@ -69,6 +72,22 @@ def test_detect_docx_filetype_application_octet_stream_with_filename(monkeypatch
6972
assert filetype == FileType.DOCX
7073

7174

75+
def test_detect_docx_filetype_application_zip(monkeypatch):
76+
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/zip")
77+
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.docx")
78+
filetype = detect_filetype(filename=filename)
79+
assert filetype == FileType.DOCX
80+
81+
82+
def test_detect_application_zip_files(monkeypatch, tmpdir):
83+
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/zip")
84+
filename = os.path.join(tmpdir, "test.zip")
85+
zf = zipfile.ZipFile(filename, "w")
86+
zf.close()
87+
filetype = detect_filetype(filename=filename)
88+
assert filetype == FileType.ZIP
89+
90+
7291
def test_detect_xlsx_filetype_application_octet_stream(monkeypatch):
7392
monkeypatch.setattr(magic, "from_buffer", lambda *args, **kwargs: "application/octet-stream")
7493
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-excel.xlsx")
@@ -84,24 +103,47 @@ def test_detect_xlsx_filetype_application_octet_stream_with_filename(monkeypatch
84103
assert filetype == FileType.XLSX
85104

86105

106+
def test_detect_pptx_filetype_application_octet_stream(monkeypatch):
107+
monkeypatch.setattr(magic, "from_buffer", lambda *args, **kwargs: "application/octet-stream")
108+
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
109+
with open(filename, "rb") as f:
110+
filetype = detect_filetype(file=f)
111+
assert filetype == FileType.PPTX
112+
113+
114+
def test_detect_pptx_filetype_application_octet_stream_with_filename(monkeypatch):
115+
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/octet-stream")
116+
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
117+
filetype = detect_filetype(filename=filename)
118+
assert filetype == FileType.PPTX
119+
120+
87121
def test_detect_application_octet_stream_returns_none_with_unknown(monkeypatch):
88122
monkeypatch.setattr(magic, "from_buffer", lambda *args, **kwargs: "application/octet-stream")
89123
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
90124
with open(filename, "rb") as f:
91125
filetype = detect_filetype(file=f)
92-
assert filetype is None
126+
assert filetype == FileType.UNK
127+
128+
129+
def test_detect_application_zip_returns_zip_with_unknown(monkeypatch):
130+
monkeypatch.setattr(magic, "from_buffer", lambda *args, **kwargs: "application/zip")
131+
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
132+
with open(filename, "rb") as f:
133+
filetype = detect_filetype(file=f)
134+
assert filetype == FileType.ZIP
93135

94136

95137
def test_detect_docx_filetype_word_mime_type(monkeypatch):
96-
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: DOCX_MIME_TYPE)
138+
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: DOCX_MIME_TYPES[0])
97139
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.docx")
98140
with open(filename, "rb") as f:
99141
filetype = detect_filetype(file=f)
100142
assert filetype == FileType.DOCX
101143

102144

103145
def test_detect_xlsx_filetype_word_mime_type(monkeypatch):
104-
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: XLSX_MIME_TYPE)
146+
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: XLSX_MIME_TYPES[0])
105147
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-excel.xlsx")
106148
with open(filename, "rb") as f:
107149
filetype = detect_filetype(file=f)
@@ -110,7 +152,17 @@ def test_detect_xlsx_filetype_word_mime_type(monkeypatch):
110152

111153
def test_detect_filetype_returns_none_with_unknown(monkeypatch):
112154
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/fake")
113-
assert detect_filetype(filename="made_up.fake") is None
155+
assert detect_filetype(filename="made_up.fake") == FileType.UNK
156+
157+
158+
def test_detect_filetype_detects_png(monkeypatch):
159+
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "image/png")
160+
assert detect_filetype(filename="made_up.png") == FileType.PNG
161+
162+
163+
def test_detect_filetype_detects_unknown_text_types_as_txt(monkeypatch):
164+
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "text/new-type")
165+
assert detect_filetype(filename="made_up.png") == FileType.TXT
114166

115167

116168
def test_detect_filetype_raises_with_both_specified():
@@ -123,3 +175,7 @@ def test_detect_filetype_raises_with_both_specified():
123175
def test_detect_filetype_raises_with_none_specified():
124176
with pytest.raises(ValueError):
125177
detect_filetype()
178+
179+
180+
def test_filetype_order():
181+
assert FileType.HTML < FileType.XML

unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.4.0-dev0" # pragma: no cover
1+
__version__ = "0.4.0" # pragma: no cover
+32
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
import os
2+
from typing import Any, Dict, List
3+
4+
import pandas as pd
5+
6+
from unstructured.file_utils.filetype import detect_filetype
7+
8+
9+
def get_directory_file_info(directory: str) -> pd.DataFrame:
    """Summarize every file found under ``directory`` in a pandas DataFrame.

    The directory tree is traversed with ``os.walk``, so files in
    subdirectories are included. Each file contributes one row with these
    columns:

    * ``filename`` - the file name without its directory
    * ``path`` - the directory that contains the file
    * ``filesize`` - the result of ``os.path.getsize`` for the file
    * ``extension`` - the extension as split off by ``os.path.splitext``
    * ``filetype`` - the value returned by ``detect_filetype`` for the file
    """
    column_names = ("filename", "path", "filesize", "extension", "filetype")
    data: Dict[str, List[Any]] = {name: [] for name in column_names}

    for path, _, files in os.walk(directory):
        for base_name in files:
            full_path = os.path.join(path, base_name)
            record = {
                "filename": base_name,
                "path": path,
                "filesize": os.path.getsize(full_path),
                "extension": os.path.splitext(full_path)[1],
                "filetype": detect_filetype(full_path),
            }
            # Only extend the columns once the whole record has been built so
            # that all column lists always stay the same length.
            for name, value in record.items():
                data[name].append(value)

    return pd.DataFrame(data)

0 commit comments

Comments
 (0)