feat: add partition_csv function (#619)

MthwRobinson · web-flow · commit 21c821d651a0 · 2023-05-19T15:57:42.000-04:00
* add csv into filetype detection

* first pass on csv

* add tests for csv

* add csv to auto

* version bump

* update readme and docs

* fix doc strings
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,13 @@
+## 0.6.8
+
+### Enhancements
+
+### Features
+
+* Add `partition_csv` for CSV files.
+
+### Fixes
+
 ## 0.6.7
 
 ### Enhancements
diff --git a/README.md b/README.md
@@ -184,7 +184,7 @@ You can run this [Colab notebook](https://colab.research.google.com/drive/1U8VCj
 The following examples show how to get started with the `unstructured` library.
 
 You can parse **TXT**, **HTML**, **XML**, **PDF**, **EML**, **MSG**, **RTF**, **EPUB**, **DOC**, **DOCX**,
-**XLSX**, **ODT**, **PPT**, **PPTX**, **JPG**,
+**XLSX**, **CSV**, **ODT**, **PPT**, **PPTX**, **JPG**,
 and **PNG** documents with one line of code!
 <br></br>
 See our [documentation page](https://unstructured-io.github.io/unstructured) for a full description
diff --git a/docs/source/bricks.rst b/docs/source/bricks.rst
@@ -83,7 +83,7 @@ If you call the ``partition`` function, ``unstructured`` will attempt to detect
 file type and route it to the appropriate partitioning brick. All partitioning bricks
 called within ``partition`` are called using the default kwargs. Use the document-type
 specific bricks if you need to apply non-default settings.
-``partition`` currently supports ``.docx``, ``.doc``, ``.odt``, ``.pptx``, ``.ppt``, ``.xlsx``, ``.eml``, ``.msg``, ``.rtf``, ``.epub``, ``.html``, ``.xml``, ``.pdf``,
+``partition`` currently supports ``.docx``, ``.doc``, ``.odt``, ``.pptx``, ``.ppt``, ``.xlsx``, ``.csv``, ``.eml``, ``.msg``, ``.rtf``, ``.epub``, ``.html``, ``.xml``, ``.pdf``,
 ``.png``, ``.jpg``, and ``.txt`` files.
 If you set the ``include_page_breaks`` kwarg to ``True``, the output will include page breaks. This is only supported for ``.pptx``, ``.html``, ``.pdf``,
 ``.png``, and ``.jpg``.
@@ -269,6 +269,23 @@ Examples:
   print(elements[0].metadata.text_as_html)
 
 
+``partition_csv``
+------------------
+
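+The ``partition_csv`` function pre-processes CSV files. The output is a single
+``Table`` element. The ``text_as_html`` attribute in the element metadata will
+contain an HTML representation of the table. The first row of the file is read as
+the header row and is not included in the element text or the HTML output.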
+
+Examples:
+
+.. code:: python
+
+  from unstructured.partition.csv import partition_csv
+
+  elements = partition_csv(filename="example-docs/stanley-cups.csv")
+  print(elements[0].metadata.text_as_html)
+
+
 ``partition_odt``
 ------------------
 
diff --git a/example-docs/stanley-cups.csv b/example-docs/stanley-cups.csv
@@ -0,0 +1,5 @@
+Stanley Cups,,
+Team,Location,Stanley Cups
+Blues,STL,1
+Flyers,PHI,2
+Maple Leafs,TOR,13
diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py
@@ -36,6 +36,7 @@
         ("example-10k.html", FileType.HTML),
         ("fake-html.html", FileType.HTML),
         ("stanley-cups.xlsx", FileType.XLSX),
+        ("stanley-cups.csv", FileType.CSV),
         ("fake-power-point.pptx", FileType.PPTX),
         ("winter-sports.epub", FileType.EPUB),
         ("spring-weather.html.json", FileType.JSON),
@@ -59,6 +60,7 @@ def test_detect_filetype_from_filename(file, expected):
         ("example-10k.html", FileType.HTML),
         ("fake-html.html", FileType.HTML),
         ("stanley-cups.xlsx", FileType.XLSX),
+        ("stanley-cups.csv", FileType.CSV),
         ("fake-power-point.pptx", FileType.PPTX),
         ("winter-sports.epub", FileType.EPUB),
         ("fake-doc.rtf", FileType.RTF),
@@ -94,6 +96,7 @@ def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expecte
         ("example-10k.html", [FileType.HTML, FileType.XML]),
         ("fake-html.html", FileType.HTML),
         ("stanley-cups.xlsx", FileType.XLSX),
+        ("stanley-cups.csv", FileType.CSV),
         ("fake-power-point.pptx", FileType.PPTX),
         ("winter-sports.epub", FileType.EPUB),
     ],
diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py
@@ -693,3 +693,21 @@ def test_auto_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"
     assert elements[0].metadata.text_as_html == EXPECTED_XLSX_TABLE
     assert elements[0].metadata.page_number == 1
     assert elements[0].metadata.filetype == EXPECTED_XLSX_FILETYPE
+
+
+def test_auto_partition_csv_from_filename(filename="example-docs/stanley-cups.csv"):
+    elements = partition(filename=filename)
+
+    assert clean_extra_whitespace(elements[0].text) == EXPECTED_XLSX_TEXT
+    assert elements[0].metadata.text_as_html == EXPECTED_XLSX_TABLE
+    assert elements[0].metadata.filetype == "text/csv"
+
+
+def test_auto_partition_csv_from_file(filename="example-docs/stanley-cups.csv"):
+    with open(filename, "rb") as f:
+        elements = partition(file=f)
+
+    assert clean_extra_whitespace(elements[0].text) == EXPECTED_XLSX_TEXT
+    assert isinstance(elements[0], Table)
+    assert elements[0].metadata.text_as_html == EXPECTED_XLSX_TABLE
+    assert elements[0].metadata.filetype == "text/csv"
diff --git a/test_unstructured/partition/test_csv.py b/test_unstructured/partition/test_csv.py
@@ -0,0 +1,60 @@
+from unstructured.cleaners.core import clean_extra_whitespace
+from unstructured.documents.elements import Table
+from unstructured.partition.csv import partition_csv
+
+EXPECTED_TABLE = """<table border="1" class="dataframe">
+  <tbody>
+    <tr>
+      <td>Team</td>
+      <td>Location</td>
+      <td>Stanley Cups</td>
+    </tr>
+    <tr>
+      <td>Blues</td>
+      <td>STL</td>
+      <td>1</td>
+    </tr>
+    <tr>
+      <td>Flyers</td>
+      <td>PHI</td>
+      <td>2</td>
+    </tr>
+    <tr>
+      <td>Maple Leafs</td>
+      <td>TOR</td>
+      <td>13</td>
+    </tr>
+  </tbody>
+</table>"""  # first CSV row ("Stanley Cups,,") is absent: read as header, not rendered
+
+
+EXPECTED_TEXT = "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
+
+EXPECTED_FILETYPE = "text/csv"  # value the tests expect in element metadata.filetype
+
+
+def test_partition_csv_from_filename(filename="example-docs/stanley-cups.csv"):
+    elements = partition_csv(filename=filename)
+
+    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
+    assert elements[0].metadata.text_as_html == EXPECTED_TABLE
+    assert elements[0].metadata.filetype == EXPECTED_FILETYPE
+
+
+def test_partition_csv_from_file(filename="example-docs/stanley-cups.csv"):
+    with open(filename, "rb") as f:
+        elements = partition_csv(file=f)
+
+    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
+    assert isinstance(elements[0], Table)
+    assert elements[0].metadata.text_as_html == EXPECTED_TABLE
+    assert elements[0].metadata.filetype == EXPECTED_FILETYPE
+
+
+def test_partition_csv_can_exclude_metadata(filename="example-docs/stanley-cups.csv"):
+    elements = partition_csv(filename=filename, include_metadata=False)
+
+    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
+    assert isinstance(elements[0], Table)
+    assert elements[0].metadata.text_as_html is None
+    assert elements[0].metadata.filetype is None
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.6.7"  # pragma: no cover
+__version__ = "0.6.8"  # pragma: no cover
diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py
@@ -67,6 +67,7 @@ class FileType(Enum):
     RTF = 41
     TXT = 42
     JSON = 43
+    CSV = 44
 
     # Markup Types
     HTML = 50
@@ -92,6 +93,7 @@ def __lt__(self, other):
     "image/jpeg": FileType.JPG,
     "image/png": FileType.PNG,
     "text/plain": FileType.TXT,
+    "text/csv": FileType.CSV,
     "text/markdown": FileType.MD,
     "text/x-markdown": FileType.MD,
     "application/epub": FileType.EPUB,
@@ -139,6 +141,7 @@ def __lt__(self, other):
     ".epub": FileType.EPUB,
     ".msg": FileType.MSG,
     ".odt": FileType.ODT,
+    ".csv": FileType.CSV,
     None: FileType.UNK,
 }
 
diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py
@@ -11,6 +11,7 @@
 )
 from unstructured.logger import logger
 from unstructured.partition.common import exactly_one
+from unstructured.partition.csv import partition_csv
 from unstructured.partition.doc import partition_doc
 from unstructured.partition.docx import partition_docx
 from unstructured.partition.email import partition_email
@@ -198,6 +199,8 @@ def partition(
         elements = partition_json(filename=filename, file=file)
     elif filetype == FileType.XLSX:
         elements = partition_xlsx(filename=filename, file=file)
+    elif filetype == FileType.CSV:
+        elements = partition_csv(filename=filename, file=file)
     else:
         msg = "Invalid file" if not filename else f"Invalid file {filename}"
         raise ValueError(f"{msg}. The {filetype} file type is not supported in partition.")
diff --git a/unstructured/partition/csv.py b/unstructured/partition/csv.py
@@ -0,0 +1,53 @@
+from tempfile import SpooledTemporaryFile
+from typing import IO, BinaryIO, List, Optional, Union, cast
+
+import lxml.html
+import pandas as pd
+
+from unstructured.documents.elements import Element, ElementMetadata, Table
+from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
+from unstructured.partition.common import exactly_one, spooled_to_bytes_io_if_needed
+
+
+@add_metadata_with_filetype(FileType.CSV)
+def partition_csv(
+    filename: Optional[str] = None,
+    file: Optional[Union[IO, SpooledTemporaryFile]] = None,
+    metadata_filename: Optional[str] = None,
+    include_metadata: bool = True,
+) -> List[Element]:
+    """Partitions Microsoft Excel Documents in .csv format into its document elements.
+
+    Parameters
+    ----------
+    filename
+        A string defining the target filename path.
+    file
+        A file-like object using "rb" mode --> open(filename, "rb").
+    metadata_filename
+        The filename to use for the metadata.
+    include_metadata
+        Determines whether or not metadata is included in the output.
+    """
+    exactly_one(filename=filename, file=file)
+
+    if filename:
+        table = pd.read_csv(filename)
+    else:
+        f = spooled_to_bytes_io_if_needed(cast(Union[BinaryIO, SpooledTemporaryFile], file))
+        table = pd.read_csv(f)
+
+    metadata_filename = filename or metadata_filename
+
+    html_text = table.to_html(index=False, header=False, na_rep="")
+    text = lxml.html.document_fromstring(html_text).text_content()
+
+    if include_metadata:
+        metadata = ElementMetadata(
+            text_as_html=html_text,
+            filename=metadata_filename,
+        )
+    else:
+        metadata = ElementMetadata()
+
+    return [Table(text=text, metadata=metadata)]
diff --git a/unstructured/partition/xlsx.py b/unstructured/partition/xlsx.py
@@ -25,9 +25,7 @@ def partition_xlsx(
     file
         A file-like object using "rb" mode --> open(filename, "rb").
     metadata_filename
-        The filename to use for the metadata. Relevant because partition_doc converts the
-        document to .xlsx before partition. We want the original source filename in the
-        metadata.
+        The filename to use for the metadata.
     include_metadata
         Determines whether or not metadata is included in the output.
     """

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.6.7" # pragma: no cover`
	`1`	`+__version__ = "0.6.8" # pragma: no cover`