Skip to content

Commit 21c821d

Browse files
authored
feat: add partition_csv function (#619)
* add csv into filetype detection * first pass on csv * add tests for csv * add csv to auto * version bump * update readme and docs * fix doc strings
1 parent 046af73 commit 21c821d

File tree

12 files changed

+176
-6
lines changed

12 files changed

+176
-6
lines changed

Diff for: CHANGELOG.md

+10
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,13 @@
1+
## 0.6.8
2+
3+
### Enhancements
4+
5+
### Features
6+
7+
* Add `partition_csv` for CSV files.
8+
9+
### Fixes
10+
111
## 0.6.7
212

313
### Enhancements

Diff for: README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,7 @@ You can run this [Colab notebook](https://colab.research.google.com/drive/1U8VCj
184184
The following examples show how to get started with the `unstructured` library.
185185

186186
You can parse **TXT**, **HTML**, **XML**, **PDF**, **EML**, **MSG**, **RTF**, **EPUB**, **DOC**, **DOCX**,
187-
**XLSX**, **ODT**, **PPT**, **PPTX**, **JPG**,
187+
**XLSX**, **CSV**, **ODT**, **PPT**, **PPTX**, **JPG**,
188188
and **PNG** documents with one line of code!
189189
<br></br>
190190
See our [documentation page](https://unstructured-io.github.io/unstructured) for a full description

Diff for: docs/source/bricks.rst

+18-1
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ If you call the ``partition`` function, ``unstructured`` will attempt to detect
8383
file type and route it to the appropriate partitioning brick. All partitioning bricks
8484
called within ``partition`` are called using the default kwargs. Use the document-type
8585
specific bricks if you need to apply non-default settings.
86-
``partition`` currently supports ``.docx``, ``.doc``, ``.odt``, ``.pptx``, ``.ppt``, ``.xlsx``, ``.eml``, ``.msg``, ``.rtf``, ``.epub``, ``.html``, ``.xml``, ``.pdf``,
86+
``partition`` currently supports ``.docx``, ``.doc``, ``.odt``, ``.pptx``, ``.ppt``, ``.xlsx``, ``.csv``, ``.eml``, ``.msg``, ``.rtf``, ``.epub``, ``.html``, ``.xml``, ``.pdf``,
8787
``.png``, ``.jpg``, and ``.txt`` files.
8888
If you set the ``include_page_breaks`` kwarg to ``True``, the output will include page breaks. This is only supported for ``.pptx``, ``.html``, ``.pdf``,
8989
``.png``, and ``.jpg``.
@@ -269,6 +269,23 @@ Examples:
269269
print(elements[0].metadata.text_as_html)
270270
271271
272+
``partition_csv``
273+
------------------
274+
275+
The ``partition_csv`` function pre-processes CSV files. The output is a single
276+
``Table`` element. The ``text_as_html`` attribute in the element metadata will
277+
contain an HTML representation of the table.
278+
279+
Examples:
280+
281+
.. code:: python
282+
283+
from unstructured.partition.csv import partition_csv
284+
285+
elements = partition_csv(filename="example-docs/stanley-cups.csv")
286+
print(elements[0].metadata.text_as_html)
287+
288+
272289
``partition_odt``
273290
------------------
274291

Diff for: example-docs/stanley-cups.csv

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
Stanley Cups,,
2+
Team,Location,Stanley Cups
3+
Blues,STL,1
4+
Flyers,PHI,2
5+
Maple Leafs,TOR,13

Diff for: test_unstructured/file_utils/test_filetype.py

+3
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
("example-10k.html", FileType.HTML),
3737
("fake-html.html", FileType.HTML),
3838
("stanley-cups.xlsx", FileType.XLSX),
39+
("stanley-cups.csv", FileType.CSV),
3940
("fake-power-point.pptx", FileType.PPTX),
4041
("winter-sports.epub", FileType.EPUB),
4142
("spring-weather.html.json", FileType.JSON),
@@ -59,6 +60,7 @@ def test_detect_filetype_from_filename(file, expected):
5960
("example-10k.html", FileType.HTML),
6061
("fake-html.html", FileType.HTML),
6162
("stanley-cups.xlsx", FileType.XLSX),
63+
("stanley-cups.csv", FileType.CSV),
6264
("fake-power-point.pptx", FileType.PPTX),
6365
("winter-sports.epub", FileType.EPUB),
6466
("fake-doc.rtf", FileType.RTF),
@@ -94,6 +96,7 @@ def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expecte
9496
("example-10k.html", [FileType.HTML, FileType.XML]),
9597
("fake-html.html", FileType.HTML),
9698
("stanley-cups.xlsx", FileType.XLSX),
99+
("stanley-cups.csv", FileType.CSV),
97100
("fake-power-point.pptx", FileType.PPTX),
98101
("winter-sports.epub", FileType.EPUB),
99102
],

Diff for: test_unstructured/partition/test_auto.py

+18
Original file line numberDiff line numberDiff line change
@@ -693,3 +693,21 @@ def test_auto_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"
693693
assert elements[0].metadata.text_as_html == EXPECTED_XLSX_TABLE
694694
assert elements[0].metadata.page_number == 1
695695
assert elements[0].metadata.filetype == EXPECTED_XLSX_FILETYPE
696+
697+
698+
def test_auto_partition_csv_from_filename(filename="example-docs/stanley-cups.csv"):
    """Auto-detect a CSV passed by path and partition it into a single Table.

    The CSV fixture holds the same data as the XLSX fixture, so the expected
    text and HTML are shared with the XLSX tests.
    """
    elements = partition(filename=filename)

    assert clean_extra_whitespace(elements[0].text) == EXPECTED_XLSX_TEXT
    # Mirror the file-object test: the element type must be checked on this path too.
    assert isinstance(elements[0], Table)
    assert elements[0].metadata.text_as_html == EXPECTED_XLSX_TABLE
    assert elements[0].metadata.filetype == "text/csv"
704+
705+
706+
def test_auto_partition_csv_from_file(filename="example-docs/stanley-cups.csv"):
    """Auto-detect a CSV passed as a binary file object and partition it into a Table."""
    with open(filename, "rb") as csv_file:
        elements = partition(file=csv_file)

    first = elements[0]
    assert clean_extra_whitespace(first.text) == EXPECTED_XLSX_TEXT
    assert isinstance(first, Table)
    assert first.metadata.text_as_html == EXPECTED_XLSX_TABLE
    assert first.metadata.filetype == "text/csv"

Diff for: test_unstructured/partition/test_csv.py

+60
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
from unstructured.cleaners.core import clean_extra_whitespace
2+
from unstructured.documents.elements import Table
3+
from unstructured.partition.csv import partition_csv
4+
5+
EXPECTED_TABLE = """<table border="1" class="dataframe">
6+
<tbody>
7+
<tr>
8+
<td>Team</td>
9+
<td>Location</td>
10+
<td>Stanley Cups</td>
11+
</tr>
12+
<tr>
13+
<td>Blues</td>
14+
<td>STL</td>
15+
<td>1</td>
16+
</tr>
17+
<tr>
18+
<td>Flyers</td>
19+
<td>PHI</td>
20+
<td>2</td>
21+
</tr>
22+
<tr>
23+
<td>Maple Leafs</td>
24+
<td>TOR</td>
25+
<td>13</td>
26+
</tr>
27+
</tbody>
28+
</table>"""
29+
30+
31+
EXPECTED_TEXT = "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
32+
33+
EXPECTED_FILETYPE = "text/csv"
34+
35+
36+
def test_partition_csv_from_filename(filename="example-docs/stanley-cups.csv"):
    """Partition a CSV given by path and check text, element type, HTML and filetype."""
    elements = partition_csv(filename=filename)

    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
    # Keep this in step with the file-object test: verify the element type here as well.
    assert isinstance(elements[0], Table)
    assert elements[0].metadata.text_as_html == EXPECTED_TABLE
    assert elements[0].metadata.filetype == EXPECTED_FILETYPE
42+
43+
44+
def test_partition_csv_from_file(filename="example-docs/stanley-cups.csv"):
    """Partition a CSV supplied as a binary file object and check the resulting Table."""
    with open(filename, "rb") as csv_file:
        elements = partition_csv(file=csv_file)

    first = elements[0]
    assert clean_extra_whitespace(first.text) == EXPECTED_TEXT
    assert isinstance(first, Table)
    assert first.metadata.text_as_html == EXPECTED_TABLE
    assert first.metadata.filetype == EXPECTED_FILETYPE
52+
53+
54+
def test_partition_csv_can_exclude_metadata(filename="example-docs/stanley-cups.csv"):
    """With include_metadata=False the Table keeps its text but carries no metadata."""
    elements = partition_csv(filename=filename, include_metadata=False)

    first = elements[0]
    assert clean_extra_whitespace(first.text) == EXPECTED_TEXT
    assert isinstance(first, Table)
    assert first.metadata.text_as_html is None
    assert first.metadata.filetype is None

Diff for: unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.6.7" # pragma: no cover
1+
__version__ = "0.6.8" # pragma: no cover

Diff for: unstructured/file_utils/filetype.py

+3
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ class FileType(Enum):
6767
RTF = 41
6868
TXT = 42
6969
JSON = 43
70+
CSV = 44
7071

7172
# Markup Types
7273
HTML = 50
@@ -92,6 +93,7 @@ def __lt__(self, other):
9293
"image/jpeg": FileType.JPG,
9394
"image/png": FileType.PNG,
9495
"text/plain": FileType.TXT,
96+
"text/csv": FileType.CSV,
9597
"text/markdown": FileType.MD,
9698
"text/x-markdown": FileType.MD,
9799
"application/epub": FileType.EPUB,
@@ -139,6 +141,7 @@ def __lt__(self, other):
139141
".epub": FileType.EPUB,
140142
".msg": FileType.MSG,
141143
".odt": FileType.ODT,
144+
".csv": FileType.CSV,
142145
None: FileType.UNK,
143146
}
144147

Diff for: unstructured/partition/auto.py

+3
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
)
1212
from unstructured.logger import logger
1313
from unstructured.partition.common import exactly_one
14+
from unstructured.partition.csv import partition_csv
1415
from unstructured.partition.doc import partition_doc
1516
from unstructured.partition.docx import partition_docx
1617
from unstructured.partition.email import partition_email
@@ -198,6 +199,8 @@ def partition(
198199
elements = partition_json(filename=filename, file=file)
199200
elif filetype == FileType.XLSX:
200201
elements = partition_xlsx(filename=filename, file=file)
202+
elif filetype == FileType.CSV:
203+
elements = partition_csv(filename=filename, file=file)
201204
else:
202205
msg = "Invalid file" if not filename else f"Invalid file {filename}"
203206
raise ValueError(f"{msg}. The {filetype} file type is not supported in partition.")

Diff for: unstructured/partition/csv.py

+53
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
from tempfile import SpooledTemporaryFile
2+
from typing import IO, BinaryIO, List, Optional, Union, cast
3+
4+
import lxml.html
5+
import pandas as pd
6+
7+
from unstructured.documents.elements import Element, ElementMetadata, Table
8+
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
9+
from unstructured.partition.common import exactly_one, spooled_to_bytes_io_if_needed
10+
11+
12+
@add_metadata_with_filetype(FileType.CSV)
def partition_csv(
    filename: Optional[str] = None,
    file: Optional[Union[IO, SpooledTemporaryFile]] = None,
    metadata_filename: Optional[str] = None,
    include_metadata: bool = True,
) -> List[Element]:
    """Partitions a CSV (comma-separated values) file into its document elements.

    The whole file is returned as a single ``Table`` element whose text is the
    flattened cell content. When metadata is included, ``metadata.text_as_html``
    holds an HTML rendering of the table.

    Exactly one of ``filename`` and ``file`` must be provided.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    metadata_filename
        The filename to use for the metadata. When given, it takes precedence
        over ``filename``.
    include_metadata
        Determines whether or not metadata is included in the output.

    Returns
    -------
    A list containing one ``Table`` element.
    """
    exactly_one(filename=filename, file=file)

    # NOTE: pandas' default header inference consumes the first CSV row as
    # column names, and that row is not emitted below (header=False).
    if filename:
        table = pd.read_csv(filename)
    else:
        f = spooled_to_bytes_io_if_needed(cast(Union[BinaryIO, SpooledTemporaryFile], file))
        table = pd.read_csv(f)

    # An explicit metadata_filename is an override, so it must win over the
    # path the data was read from; fall back to filename otherwise.
    metadata_filename = metadata_filename or filename

    # Render without the index or header row; empty cells become empty strings.
    html_text = table.to_html(index=False, header=False, na_rep="")
    text = lxml.html.document_fromstring(html_text).text_content()

    if include_metadata:
        metadata = ElementMetadata(
            text_as_html=html_text,
            filename=metadata_filename,
        )
    else:
        metadata = ElementMetadata()

    return [Table(text=text, metadata=metadata)]

Diff for: unstructured/partition/xlsx.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,7 @@ def partition_xlsx(
2525
file
2626
A file-like object using "rb" mode --> open(filename, "rb").
2727
metadata_filename
28-
The filename to use for the metadata. Relevant because partition_doc converts the
29-
document to .xlsx before partition. We want the original source filename in the
30-
metadata.
28+
The filename to use for the metadata.
3129
include_metadata
3230
Determines whether or not metadata is included in the output.
3331
"""

0 commit comments

Comments
 (0)