feat: add support for .txt files in partition (#150)

MthwRobinson · web-flow · commit f12240c5e7a6 · 2023-01-13T16:39:53.000-05:00
* added partition_text for auto

* rename partition_text tests

* bump version and update docs
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,7 @@
+## 0.4.1-dev0
+
+* Added support for text files in the `partition` function
+
 ## 0.4.0
 
 * Added generic `partition` brick that detects the file type and routes a file to the appropriate
diff --git a/README.md b/README.md
@@ -62,7 +62,7 @@ To install the library, run `pip install unstructured`.
 You can run this [Colab notebook](https://colab.research.google.com/drive/1RnXEiSTUaru8vZSGbh1U2T2P9aUa5tQD#scrollTo=E_WN7p3JGcLJ) to run the examples below.
 
 The following examples show how to get started with the `unstructured` library.
-You can parse **HTML**, **PDF**, **EML** and **DOCX** documents with one line of code!
+You can parse **TXT**, **HTML**, **PDF**, **EML** and **DOCX** documents with one line of code!
 <br></br>
 See our [documentation page](https://unstructured-io.github.io/unstructured) for a full description
 of the features in the library.
@@ -76,7 +76,7 @@ If you are using the `partition` brick, ensure you first install `libmagic` usin
 instructions outlined [here](https://unstructured-io.github.io/unstructured/installing.html#filetype-detection)
 `partition` will always apply the default arguments. If you need
 advanced features, use a document-specific brick. The `partition` brick currently works for
-`.docx`, `eml`, `.html`, and `.pdf` documents.
+`.txt`, `.docx`, `.eml`, `.html`, and `.pdf` documents.
 
 ```python
 from unstructured.partition.auto import partition
diff --git a/docs/source/bricks.rst b/docs/source/bricks.rst
@@ -22,6 +22,7 @@ If you call the ``partition`` function, ``unstructured`` will attempt to detect
 file type and route it to the appropriate partitioning brick. All partitioning bricks
 called within ``partition`` are called using the default kwargs. Use the document-type
 specific bricks if you need to apply non-default settings.
+``partition`` currently supports ``.docx``, ``.eml``, ``.html``, ``.pdf``, and ``.txt`` files.
 
 
 .. code:: python
@@ -104,7 +105,7 @@ Examples:
 ``partition_pdf``
 ---------------------
 
-The ``partition_pdf`` function segments a PDF document by calling the document image analysis API. 
+The ``partition_pdf`` function segments a PDF document by calling the document image analysis API.
 The intent of the parameters ``url`` and ``token`` is to allow users to self host an inference API,
 if desired.
 
@@ -122,7 +123,7 @@ Examples:
 ---------------------
 
 The ``partition_email`` function partitions ``.eml`` documents and works with exports
-from email clients such as Microsoft Outlook and Gmail. The ``partition_email`` 
+from email clients such as Microsoft Outlook and Gmail. The ``partition_email``
 takes a filename, file-like object, or raw text as input and produces a list of
 document ``Element`` objects as output. Also ``content_source`` can be set to ``text/html``
 (default) or ``text/plain`` to process the html or plain text version of the email, respectively.
@@ -157,7 +158,7 @@ Examples:
 ``partition_text``
 ---------------------
 
-The ``partition_text`` function partitions text files. The ``partition_text`` 
+The ``partition_text`` function partitions text files. The ``partition_text``
 takes a filename, file-like object, or raw text as input and produces ``Element`` objects as output.
 
 Examples:
@@ -629,7 +630,7 @@ addresses in the input string.
 
   from unstructured.cleaners.extract import extract_email_address
 
-  text = """Me me@email.com and You <You@email.com> 
+  text = """Me me@email.com and You <You@email.com>
       ([ba23::58b5:2236:45g2:88h2]) (10.0.2.01)"""
 
   # Returns "['me@email.com', 'you@email.com']"
@@ -646,7 +647,7 @@ returns a list of all IP address in input string.
 
   from unstructured.cleaners.extract import extract_ip_address
 
-  text = """Me me@email.com and You <You@email.com> 
+  text = """Me me@email.com and You <You@email.com>
     ([ba23::58b5:2236:45g2:88h2]) (10.0.2.01)"""
 
   # Returns "['ba23::58b5:2236:45g2:88h2', '10.0.2.01']"
@@ -656,7 +657,7 @@ returns a list of all IP address in input string.
 ``extract_ip_address_name``
 ----------------------------
 
-Extracts the names of each IP address in the ``Received`` field(s) from an ``.eml`` 
+Extracts the names of each IP address in the ``Received`` field(s) from an ``.eml``
 file. ``extract_ip_address_name`` takes in a string and returns a list of all
 IP addresses in the input string.
 
@@ -675,7 +676,7 @@ IP addresses in the input string.
 ``extract_mapi_id``
 ----------------------
 
-Extracts the ``mapi id`` in the ``Received`` field(s) from an ``.eml`` 
+Extracts the ``mapi id`` in the ``Received`` field(s) from an ``.eml``
 file. ``extract_mapi_id`` takes in a string and returns a list of a string
 containing the ``mapi id`` in the input string.
 
@@ -694,7 +695,7 @@ containing the ``mapi id`` in the input string.
 ``extract_datetimetz``
 ----------------------
 
-Extracts the date, time, and timezone in the ``Received`` field(s) from an ``.eml`` 
+Extracts the date, time, and timezone in the ``Received`` field(s) from an ``.eml``
 file. ``extract_datetimetz`` takes in a string and returns a datetime.datetime
 object from the input string.
 
@@ -754,7 +755,7 @@ other languages.
 Parameters:
 
 * ``text``: the input string to translate.
-* ``source_lang``: the two letter language code for the source language of the text. 
+* ``source_lang``: the two letter language code for the source language of the text.
   If ``source_lang`` is not specified,
   the language will be detected using ``langdetect``.
 * ``target_lang``: the two letter language code for the target language for translation.
@@ -857,7 +858,7 @@ Examples:
 --------------------------
 
 Prepares ``Text`` elements for processing in ``transformers`` pipelines
-by splitting the elements into chunks that fit into the model's attention window. 
+by splitting the elements into chunks that fit into the model's attention window.
 
 Examples:
 
@@ -960,7 +961,7 @@ Examples:
       json.dump(label_studio_data, f, indent=4)
 
 
-You can also include pre-annotations and predictions as part of your LabelStudio upload. 
+You can also include pre-annotations and predictions as part of your LabelStudio upload.
 
 The ``annotations`` kwarg is a list of lists. If ``annotations`` is specified, there must be a list of
 annotations for each element in the ``elements`` list. If an element does not have any annotations,
@@ -1009,7 +1010,7 @@ task in LabelStudio:
 
 Similar to annotations, the ``predictions`` kwarg is also a list of lists. A ``prediction`` is an annotation with
 the addition of a ``score`` value. If ``predictions`` is specified, there must be a list of
-predictions for each element in the ``elements`` list. If an element does not have any predictions, use an empty list. 
+predictions for each element in the ``elements`` list. If an element does not have any predictions, use an empty list.
 The following shows an example of how to upload predictions for the "Text Classification"
 task in LabelStudio:
 
@@ -1167,13 +1168,13 @@ Examples:
 ``stage_for_label_box``
 --------------------------
 
-Formats outputs for use with `LabelBox <https://docs.labelbox.com/docs/overview>`_. LabelBox accepts cloud-hosted data 
+Formats outputs for use with `LabelBox <https://docs.labelbox.com/docs/overview>`_. LabelBox accepts cloud-hosted data
 and does not support importing text directly. The ``stage_for_label_box`` does the following:
 
 * Stages the data files in the ``output_directory`` specified in function arguments to be uploaded to a cloud storage service.
 * Returns a config of type ``List[Dict[str, Any]]`` that can be written to a ``json`` file and imported into LabelBox.
 
-**Note:** ``stage_for_label_box`` does not upload the data to remote storage such as S3. Users can upload the data to S3 
+**Note:** ``stage_for_label_box`` does not upload the data to remote storage such as S3. Users can upload the data to S3
 using ``aws s3 sync ${output_directory} ${url_prefix}`` after running the ``stage_for_label_box`` staging brick.
 
 Examples:
@@ -1197,7 +1198,7 @@ files to an S3 bucket.
 
   # The URL prefix where the data files will be accessed.
   S3_URL_PREFIX = f"https://{S3_BUCKET_NAME}.s3.amazonaws.com/{S3_BUCKET_KEY_PREFIX}"
-  
+
   # The local output directory where the data files will be staged for uploading to a Cloud Storage service.
   LOCAL_OUTPUT_DIRECTORY = "/tmp/labelbox-staging"
 
@@ -1232,7 +1233,7 @@ files to an S3 bucket.
 --------------------------
 Formats a list of ``Text`` elements as input to token based tasks in Datasaur.
 
-Example: 
+Example:
 
 .. code:: python
 
@@ -1243,7 +1244,7 @@ Example:
   datasaur_data = stage_for_datasaur(elements)
 
 The output is a list of dictionaries, each one with two keys:
-"text" with the content of the element and 
+"text" with the content of the element and
 "entities" with an empty list.
 
 You can also specify entities in the ``stage_for_datasaur`` brick. Entities
diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py
@@ -113,7 +113,28 @@ def test_auto_partition_html_from_file_rb():
     assert len(elements) > 0
 
 
-def test_auto_partition_pdf():
+EXPECTED_TEXT_OUTPUT = [
+    NarrativeText(text="This is a test document to use for unit tests."),
+    Title(text="Important points:"),
+    ListItem(text="Hamburgers are delicious"),
+    ListItem(text="Dogs are the best"),
+    ListItem(text="I love fuzzy blankets"),
+]
+
+
+def test_auto_partition_text_from_filename():
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
+    elements = partition(filename=filename)
+    assert len(elements) > 0
+    assert elements == EXPECTED_TEXT_OUTPUT
+
+
+def test_auto_partition_text_from_file():
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
+    with open(filename, "r") as f:
+        elements = partition(file=f)
+    assert len(elements) > 0
+    assert elements == EXPECTED_TEXT_OUTPUT
     filename = os.path.join(
         EXAMPLE_DOCS_DIRECTORY, "..", "..", "example-docs", "layout-parser-paper-fast.pdf"
     )
diff --git a/test_unstructured/partition/test_text.py b/test_unstructured/partition/test_text.py
@@ -16,22 +16,22 @@
 ]
 
 
-def test_partition_email_from_filename():
+def test_partition_text_from_filename():
     filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
     elements = partition_text(filename=filename)
     assert len(elements) > 0
     assert elements == EXPECTED_OUTPUT
 
 
-def test_partition_email_from_file():
+def test_partition_text_from_file():
     filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
     with open(filename, "r") as f:
         elements = partition_text(file=f)
     assert len(elements) > 0
     assert elements == EXPECTED_OUTPUT
 
 
-def test_partition_email_from_text():
+def test_partition_text_from_text():
     filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
     with open(filename, "r") as f:
         text = f.read()
@@ -40,12 +40,12 @@ def test_partition_email_from_text():
     assert elements == EXPECTED_OUTPUT
 
 
-def test_partition_email_raises_with_none_specified():
+def test_partition_text_raises_with_none_specified():
     with pytest.raises(ValueError):
         partition_text()
 
 
-def test_partition_email_raises_with_too_many_specified():
+def test_partition_text_raises_with_too_many_specified():
     filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
     with open(filename, "r") as f:
         text = f.read()
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.4.0"  # pragma: no cover
+__version__ = "0.4.1-dev0"  # pragma: no cover
diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py
@@ -5,6 +5,7 @@
 from unstructured.partition.email import partition_email
 from unstructured.partition.html import partition_html
 from unstructured.partition.pdf import partition_pdf
+from unstructured.partition.text import partition_text
 
 
 def partition(filename: Optional[str] = None, file: Optional[IO] = None):
@@ -33,6 +34,8 @@ def partition(filename: Optional[str] = None, file: Optional[IO] = None):
         return partition_html(filename=filename, file=file)
     elif filetype == FileType.PDF:
         return partition_pdf(filename=filename, file=file, url=None)  # type: ignore
+    elif filetype == FileType.TXT:
+        return partition_text(filename=filename, file=file)
     else:
         msg = "Invalid file" if not filename else f"Invalid file {filename}"
         raise ValueError(f"{msg}. File type not support in partition.")

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.4.0" # pragma: no cover`
	`1`	`+__version__ = "0.4.1-dev0" # pragma: no cover`