feat: get_directory_file_info for exploring a directory of files (#142)

MthwRobinson · web-flow · commit eba4c80b1eab · 2023-01-11T12:40:50.000-05:00
* added python-pptx to requirements

* added filetype detection for powerpoint

* add more filetypes to detect

* more tests

* added tests for filetype

* reorder document types

* tests for get_directory_file_info

* added docs for get_directory_file_info

* bump version

* Word -> Office

* added test for filetype

* add group by filetype example
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.4.0-dev0
+## 0.4.0
 
 * Added generic `partition` brick that detects the file type and routes a file to the appropriate
   partitioning brick.
@@ -14,6 +14,7 @@
 * Added new function to parse plain text files `partition_text`
 * Added new cleaners functions `extract_ip_address`, `extract_ip_address_name`, `extract_mapi_id`, `extract_datetimetz`
 * Add new `Image` element and function to find embedded images `find_embedded_images`
+* Added `get_directory_file_info` for summarizing information about source documents
 
 ## 0.3.5
 
diff --git a/docs/source/examples.rst b/docs/source/examples.rst
@@ -270,3 +270,67 @@ You can also pass in a file-like object with:
 
 To extract metadata from ``.docx`` or ``.xlsx``, use ``get_docx_metadata`` and
 ``get_xlsx_metadata``. The interfaces are the same as ``get_jpg_metadata``.
+
+
+###########################
+Exploring Source Documents
+###########################
+
+The ``unstructured`` library includes tools for helping you explore source documents.
+To get a summary of the size (in bytes) and type of documents in a directory, you can
+use the ``get_directory_file_info`` function, as shown below. The function will
+recursively explore files in subdirectories.
+
+.. code:: python
+
+    from unstructured.file_utils.exploration import get_directory_file_info
+
+    file_info = get_directory_file_info("example-docs")
+    file_info.filetype.value_counts()
+
+
+The output (``file_info``) is a ``pandas`` ``DataFrame``.
+The result should look similar to:
+
+.. code:: python
+
+    FileType.EML     4
+    FileType.TXT     3
+    FileType.HTML    2
+    FileType.XML     2
+    FileType.PDF     2
+    FileType.DOCX    1
+    FileType.PPTX    1
+    FileType.XLSX    1
+    FileType.JPG     1
+    Name: filetype, dtype: int64
+
+
+You can also find the average file size by file type by using the following command:
+
+
+.. code:: python
+
+    from unstructured.file_utils.exploration import get_directory_file_info
+
+    file_info = get_directory_file_info("example-docs")
+    file_info.groupby("filetype").mean()
+
+
+The output should look similar to the following:
+
+.. code:: python
+
+
+                       filesize
+    filetype
+    FileType.DOCX  3.660200e+04
+    FileType.EML   1.490885e+05
+    FileType.HTML  1.228404e+06
+    FileType.JPG   3.276400e+04
+    FileType.PDF   2.429245e+06
+    FileType.PPTX  2.832900e+04
+    FileType.TXT   6.113333e+02
+    FileType.XLSX  4.765000e+03
+    FileType.XML   7.135000e+02
+
diff --git a/example-docs/fake-power-point.pptx b/example-docs/fake-power-point.pptx
diff --git a/requirements/base.txt b/requirements/base.txt
@@ -31,6 +31,7 @@ joblib==1.2.0
 lxml==4.9.2
     # via
     #   python-docx
+    #   python-pptx
     #   unstructured (setup.py)
 monotonic==1.6
     # via argilla
@@ -49,7 +50,9 @@ pandas==1.5.2
     #   argilla
     #   unstructured (setup.py)
 pillow==9.4.0
-    # via unstructured (setup.py)
+    # via
+    #   python-pptx
+    #   unstructured (setup.py)
 pydantic==1.10.4
     # via argilla
 python-dateutil==2.8.2
@@ -58,6 +61,8 @@ python-docx==0.8.11
     # via unstructured (setup.py)
 python-magic==0.4.27
     # via unstructured (setup.py)
+python-pptx==0.6.21
+    # via unstructured (setup.py)
 pytz==2022.7
     # via pandas
 regex==2022.10.31
@@ -80,3 +85,5 @@ wrapt==1.13.3
     # via
     #   argilla
     #   deprecated
+xlsxwriter==3.0.6
+    # via python-pptx
diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt
@@ -48,6 +48,7 @@ langdetect==1.0.9
 lxml==4.9.2
     # via
     #   python-docx
+    #   python-pptx
     #   unstructured (setup.py)
 monotonic==1.6
     # via argilla
@@ -70,7 +71,9 @@ pandas==1.5.2
     #   argilla
     #   unstructured (setup.py)
 pillow==9.4.0
-    # via unstructured (setup.py)
+    # via
+    #   python-pptx
+    #   unstructured (setup.py)
 pydantic==1.10.4
     # via argilla
 python-dateutil==2.8.2
@@ -79,6 +82,8 @@ python-docx==0.8.11
     # via unstructured (setup.py)
 python-magic==0.4.27
     # via unstructured (setup.py)
+python-pptx==0.6.21
+    # via unstructured (setup.py)
 pytz==2022.7
     # via pandas
 pyyaml==6.0
@@ -133,3 +138,5 @@ wrapt==1.13.3
     # via
     #   argilla
     #   deprecated
+xlsxwriter==3.0.6
+    # via python-pptx
diff --git a/setup.py b/setup.py
@@ -55,6 +55,7 @@
         "pandas",
         "pillow",
         "python-docx",
+        "python-pptx",
         "python-magic",
         # NOTE(robinson) - The following dependencies are pinned
         # to address security scans
diff --git a/test_unstructured/file_utils/test_exploration.py b/test_unstructured/file_utils/test_exploration.py
@@ -0,0 +1,34 @@
+import os
+
+import pandas as pd
+
+import unstructured.file_utils.exploration as exploration
+
+
+def test_get_directory_file_info(tmpdir):
+    file_info_test = os.path.join(tmpdir, "file_info_test")
+    if not os.path.exists(file_info_test):
+        os.mkdir(file_info_test)
+
+    directory1 = os.path.join(file_info_test, "directory1")
+    if not os.path.exists(directory1):
+        os.mkdir(directory1)
+
+    filename1 = os.path.join(directory1, "filename1.txt")
+    with open(filename1, "w") as f:
+        f.write("hello there!")
+
+    directory2 = os.path.join(file_info_test, "directory2")
+    if not os.path.exists(directory2):
+        os.mkdir(directory2)
+
+    filename2 = os.path.join(directory2, "filename2.txt")
+    with open(filename2, "w") as f:
+        f.write("hello there!")
+
+    file_info = exploration.get_directory_file_info(file_info_test)
+    assert isinstance(file_info, pd.DataFrame)
+    assert set(file_info["filename"].to_list()) == {"filename1.txt", "filename2.txt"}
+
+    means = file_info.groupby("filetype").mean()
+    assert means.columns.to_list() == ["filesize"]
diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py
@@ -1,14 +1,15 @@
 import os
 import pathlib
 import pytest
+import zipfile
 
 import magic
 
 from unstructured.file_utils.filetype import (
     detect_filetype,
     FileType,
-    DOCX_MIME_TYPE,
-    XLSX_MIME_TYPE,
+    DOCX_MIME_TYPES,
+    XLSX_MIME_TYPES,
 )
 
 FILE_DIRECTORY = pathlib.Path(__file__).parent.resolve()
@@ -27,6 +28,7 @@
         ("example-10k.html", FileType.HTML),
         ("fake-html.html", FileType.HTML),
         ("fake-excel.xlsx", FileType.XLSX),
+        ("fake-power-point.pptx", FileType.PPTX),
     ],
 )
 def test_detect_filetype_from_filename(file, expected):
@@ -46,6 +48,7 @@ def test_detect_filetype_from_filename(file, expected):
         ("example-10k.html", FileType.XML),
         ("fake-html.html", FileType.HTML),
         ("fake-excel.xlsx", FileType.XLSX),
+        ("fake-power-point.pptx", FileType.PPTX),
     ],
 )
 def test_detect_filetype_from_file(file, expected):
@@ -69,6 +72,22 @@ def test_detect_docx_filetype_application_octet_stream_with_filename(monkeypatch
     assert filetype == FileType.DOCX
 
 
+def test_detect_docx_filetype_application_zip(monkeypatch):
+    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/zip")
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.docx")
+    filetype = detect_filetype(filename=filename)
+    assert filetype == FileType.DOCX
+
+
+def test_detect_application_zip_files(monkeypatch, tmpdir):
+    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/zip")
+    filename = os.path.join(tmpdir, "test.zip")
+    zf = zipfile.ZipFile(filename, "w")
+    zf.close()
+    filetype = detect_filetype(filename=filename)
+    assert filetype == FileType.ZIP
+
+
 def test_detect_xlsx_filetype_application_octet_stream(monkeypatch):
     monkeypatch.setattr(magic, "from_buffer", lambda *args, **kwargs: "application/octet-stream")
     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-excel.xlsx")
@@ -84,24 +103,47 @@ def test_detect_xlsx_filetype_application_octet_stream_with_filename(monkeypatch
     assert filetype == FileType.XLSX
 
 
+def test_detect_pptx_filetype_application_octet_stream(monkeypatch):
+    monkeypatch.setattr(magic, "from_buffer", lambda *args, **kwargs: "application/octet-stream")
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
+    with open(filename, "rb") as f:
+        filetype = detect_filetype(file=f)
+    assert filetype == FileType.PPTX
+
+
+def test_detect_pptx_filetype_application_octet_stream_with_filename(monkeypatch):
+    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/octet-stream")
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
+    filetype = detect_filetype(filename=filename)
+    assert filetype == FileType.PPTX
+
+
 def test_detect_application_octet_stream_returns_none_with_unknown(monkeypatch):
     monkeypatch.setattr(magic, "from_buffer", lambda *args, **kwargs: "application/octet-stream")
     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
     with open(filename, "rb") as f:
         filetype = detect_filetype(file=f)
-    assert filetype is None
+    assert filetype == FileType.UNK
+
+
+def test_detect_application_zip_returns_zip_with_unknown(monkeypatch):
+    monkeypatch.setattr(magic, "from_buffer", lambda *args, **kwargs: "application/zip")
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
+    with open(filename, "rb") as f:
+        filetype = detect_filetype(file=f)
+    assert filetype == FileType.ZIP
 
 
 def test_detect_docx_filetype_word_mime_type(monkeypatch):
-    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: DOCX_MIME_TYPE)
+    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: DOCX_MIME_TYPES[0])
     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.docx")
     with open(filename, "rb") as f:
         filetype = detect_filetype(file=f)
     assert filetype == FileType.DOCX
 
 
 def test_detect_xlsx_filetype_word_mime_type(monkeypatch):
-    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: XLSX_MIME_TYPE)
+    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: XLSX_MIME_TYPES[0])
     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-excel.xlsx")
     with open(filename, "rb") as f:
         filetype = detect_filetype(file=f)
@@ -110,7 +152,17 @@ def test_detect_xlsx_filetype_word_mime_type(monkeypatch):
 
 def test_detect_filetype_returns_none_with_unknown(monkeypatch):
     monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/fake")
-    assert detect_filetype(filename="made_up.fake") is None
+    assert detect_filetype(filename="made_up.fake") == FileType.UNK
+
+
+def test_detect_filetype_detects_png(monkeypatch):
+    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "image/png")
+    assert detect_filetype(filename="made_up.png") == FileType.PNG
+
+
+def test_detect_filetype_detects_unknown_text_types_as_txt(monkeypatch):
+    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "text/new-type")
+    assert detect_filetype(filename="made_up.png") == FileType.TXT
 
 
 def test_detect_filetype_raises_with_both_specified():
@@ -123,3 +175,7 @@ def test_detect_filetype_raises_with_both_specified():
 def test_detect_filetype_raises_with_none_specified():
     with pytest.raises(ValueError):
         detect_filetype()
+
+
+def test_filetype_order():
+    assert FileType.HTML < FileType.XML
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.4.0-dev0"  # pragma: no cover
+__version__ = "0.4.0"  # pragma: no cover
diff --git a/unstructured/file_utils/exploration.py b/unstructured/file_utils/exploration.py
@@ -0,0 +1,32 @@
+import os
+from typing import Any, Dict, List
+
+import pandas as pd
+
+from unstructured.file_utils.filetype import detect_filetype
+
+
+def get_directory_file_info(directory: str) -> pd.DataFrame:
+    """Recursively walks a directory and extracts key file information to support initial
+    exploration of text data sets. Returns a pandas DataFrame."""
+    data: Dict[str, List[Any]] = {
+        "filename": [],
+        "path": [],
+        "filesize": [],
+        "extension": [],
+        "filetype": [],
+    }
+    for path, _, files in os.walk(directory):
+        for filename_no_path in files:
+            filename = os.path.join(path, filename_no_path)
+            _, extension = os.path.splitext(filename)
+            filesize = os.path.getsize(filename)
+            filetype = detect_filetype(filename)
+
+            data["filename"].append(filename_no_path)
+            data["path"].append(path)
+            data["extension"].append(extension)
+            data["filesize"].append(filesize)
+            data["filetype"].append(filetype)
+
+    return pd.DataFrame(data)
diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.4.0-dev0" # pragma: no cover`
	`1`	`+__version__ = "0.4.0" # pragma: no cover`