Skip to content

Commit 9f7bd61

Browse files
authored
enhancement: Add include_header kwarg for xlsx, default True (#1125)
Closes GitHub issue #1121. Adds include_header kwarg to partition_xlsx and changes default behavior to True.
1 parent 22c12ef commit 9f7bd61

File tree

8 files changed

+67
-39
lines changed

8 files changed

+67
-39
lines changed

Diff for: CHANGELOG.md

+7-6
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,23 @@
1-
## 0.10.1-dev3
1+
## 0.10.1
22

33
### Enhancements
44
* Bump unstructured-inference==0.5.12:
5-
- fix to avoid trace for certain PDF's
6-
* Bump unstructured-inference==0.5.11:
7-
- better defaults for DPI for hi_res and Chipper
8-
* Bump unstructured-inference==0.5.10:
9-
- implement full-page OCR
5+
- fix to avoid trace for certain PDF's (0.5.12)
6+
- better defaults for DPI for hi_res and Chipper (0.5.11)
7+
- implement full-page OCR (0.5.10)
108

119
### Features
1210

1311
### Fixes
12+
1413
* Fix dead links in repository README (Quick Start > Install for local development, and Learn more > Batch Processing)
1514
* Update document dependencies to include tesseract-lang for additional language support (required for tests to pass)
1615

1716
## 0.10.0
1817

1918
### Enhancements
2019

20+
* Add `include_header` kwarg to `partition_xlsx` and change default behavior to `True`
2121
* Update the `links` and `emphasized_texts` metadata fields
2222

2323
### Features
@@ -26,6 +26,7 @@
2626

2727
* fix pdf partition of list items being detected as titles in OCR only mode
2828

29+
2930
## 0.9.3
3031

3132
### Enhancements

Diff for: test_unstructured/partition/test_auto.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -668,7 +668,7 @@ def test_auto_partition_xml_from_file_with_tags(filename="example-docs/factbook.
668668

669669

670670
def test_auto_partition_xlsx_from_filename(filename="example-docs/stanley-cups.xlsx"):
671-
elements = partition(filename=filename)
671+
elements = partition(filename=filename, include_header=False)
672672

673673
assert all(isinstance(element, Table) for element in elements)
674674
assert len(elements) == 2
@@ -681,7 +681,7 @@ def test_auto_partition_xlsx_from_filename(filename="example-docs/stanley-cups.x
681681

682682
def test_auto_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"):
683683
with open(filename, "rb") as f:
684-
elements = partition(file=f)
684+
elements = partition(file=f, include_header=False)
685685

686686
assert all(isinstance(element, Table) for element in elements)
687687
assert len(elements) == 2
@@ -774,7 +774,7 @@ def test_auto_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"
774774

775775
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
776776
def test_auto_partition_xls_from_filename(filename="example-docs/tests-example.xls"):
777-
elements = partition(filename=filename)
777+
elements = partition(filename=filename, include_header=False)
778778

779779
assert all(isinstance(element, Table) for element in elements)
780780
assert len(elements) == 3

Diff for: test_unstructured/partition/test_xlsx.py

+31-7
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010

1111
def test_partition_xlsx_from_filename(filename="example-docs/stanley-cups.xlsx"):
12-
elements = partition_xlsx(filename=filename)
12+
elements = partition_xlsx(filename=filename, include_header=False)
1313

1414
assert all(isinstance(element, Table) for element in elements)
1515
assert len(elements) == 2
@@ -23,7 +23,7 @@ def test_partition_xlsx_from_filename(filename="example-docs/stanley-cups.xlsx")
2323

2424

2525
def test_partition_xlsx_from_filename_with_emoji(filename="example-docs/emoji.xlsx"):
26-
elements = partition_xlsx(filename=filename)
26+
elements = partition_xlsx(filename=filename, include_header=False)
2727
assert all(isinstance(element, Table) for element in elements)
2828
assert len(elements) == 1
2929
assert clean_extra_whitespace(elements[0].text) == "🤠😅"
@@ -32,16 +32,27 @@ def test_partition_xlsx_from_filename_with_emoji(filename="example-docs/emoji.xl
3232
def test_partition_xlsx_from_filename_with_metadata_filename(
3333
filename="example-docs/stanley-cups.xlsx",
3434
):
35-
elements = partition_xlsx(filename=filename, metadata_filename="test")
35+
elements = partition_xlsx(filename=filename, metadata_filename="test", include_header=False)
3636

3737
assert all(isinstance(element, Table) for element in elements)
3838
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
3939
assert elements[0].metadata.filename == "test"
4040

4141

42+
def test_partition_xlsx_from_filename_with_header(filename="example-docs/stanley-cups.xlsx"):
43+
elements = partition_xlsx(filename=filename, include_header=True)
44+
assert all(isinstance(element, Table) for element in elements)
45+
assert len(elements) == 2
46+
assert (
47+
clean_extra_whitespace(elements[0].text)
48+
== "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT
49+
)
50+
assert "<thead>" in elements[0].metadata.text_as_html
51+
52+
4253
def test_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"):
4354
with open(filename, "rb") as f:
44-
elements = partition_xlsx(file=f)
55+
elements = partition_xlsx(file=f, include_header=False)
4556

4657
assert all(isinstance(element, Table) for element in elements)
4758
assert len(elements) == 2
@@ -55,15 +66,28 @@ def test_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"):
5566

5667
def test_partition_xlsx_from_file_with_metadata_filename(filename="example-docs/stanley-cups.xlsx"):
5768
with open(filename, "rb") as f:
58-
elements = partition_xlsx(file=f, metadata_filename="test")
69+
elements = partition_xlsx(file=f, metadata_filename="test", include_header=False)
5970

6071
assert all(isinstance(element, Table) for element in elements)
6172
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
6273
assert elements[0].metadata.filename == "test"
6374

6475

76+
def test_partition_xlsx_from_file_with_header(filename="example-docs/stanley-cups.xlsx"):
77+
with open(filename, "rb") as f:
78+
elements = partition_xlsx(file=f, include_header=True)
79+
80+
assert all(isinstance(element, Table) for element in elements)
81+
assert len(elements) == 2
82+
assert (
83+
clean_extra_whitespace(elements[0].text)
84+
== "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT
85+
)
86+
assert "<thead>" in elements[0].metadata.text_as_html
87+
88+
6589
def test_partition_xlsx_filename_exclude_metadata(filename="example-docs/stanley-cups.xlsx"):
66-
elements = partition_xlsx(filename=filename, include_metadata=False)
90+
elements = partition_xlsx(filename=filename, include_metadata=False, include_header=False)
6791

6892
assert all(isinstance(element, Table) for element in elements)
6993
assert len(elements) == 2
@@ -78,7 +102,7 @@ def test_partition_xlsx_filename_exclude_metadata(filename="example-docs/stanley
78102

79103
def test_partition_xlsx_from_file_exclude_metadata(filename="example-docs/stanley-cups.xlsx"):
80104
with open(filename, "rb") as f:
81-
elements = partition_xlsx(file=f, include_metadata=False)
105+
elements = partition_xlsx(file=f, include_metadata=False, include_header=False)
82106

83107
assert all(isinstance(element, Table) for element in elements)
84108
assert len(elements) == 2

Diff for: test_unstructured_ingest/expected-structured-output/Sharepoint/Shared Documents/stanley-cups.json

+6-6
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
[
22
{
33
"type": "Table",
4-
"element_id": "c00fc0e5ac303c40f9089791e5e485b1",
4+
"element_id": "3e65b02bec20bb1056bd23a3b4ecd0f6",
55
"metadata": {
66
"data_source": {
77
"record_locator": {
@@ -16,13 +16,13 @@
1616
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
1717
"page_number": 1,
1818
"page_name": "Stanley Cups",
19-
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
19+
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th>Stanley Cups</th>\n <th>Unnamed: 1</th>\n <th>Unnamed: 2</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
2020
},
21-
"text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n"
21+
"text": "\n\n\nStanley Cups\nUnnamed: 1\nUnnamed: 2\n\n\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n"
2222
},
2323
{
2424
"type": "Table",
25-
"element_id": "31421b5cd94fedb10dc82738503b4505",
25+
"element_id": "0699dddf33814117e04654068f5182f6",
2626
"metadata": {
2727
"data_source": {
2828
"record_locator": {
@@ -37,8 +37,8 @@
3737
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
3838
"page_number": 2,
3939
"page_name": "Stanley Cups Since 67",
40-
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>"
40+
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th>Stanley Cups Since 67</th>\n <th>Unnamed: 1</th>\n <th>Unnamed: 2</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>"
4141
},
42-
"text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n"
42+
"text": "\n\n\nStanley Cups Since 67\nUnnamed: 1\nUnnamed: 2\n\n\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n"
4343
}
4444
]
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,26 @@
11
[
22
{
33
"type": "Table",
4-
"element_id": "c00fc0e5ac303c40f9089791e5e485b1",
4+
"element_id": "3e65b02bec20bb1056bd23a3b4ecd0f6",
55
"metadata": {
66
"data_source": {},
77
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
88
"page_number": 1,
99
"page_name": "Stanley Cups",
10-
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
10+
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th>Stanley Cups</th>\n <th>Unnamed: 1</th>\n <th>Unnamed: 2</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
1111
},
12-
"text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n"
12+
"text": "\n\n\nStanley Cups\nUnnamed: 1\nUnnamed: 2\n\n\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n"
1313
},
1414
{
1515
"type": "Table",
16-
"element_id": "31421b5cd94fedb10dc82738503b4505",
16+
"element_id": "0699dddf33814117e04654068f5182f6",
1717
"metadata": {
1818
"data_source": {},
1919
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
2020
"page_number": 2,
2121
"page_name": "Stanley Cups Since 67",
22-
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>"
22+
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th>Stanley Cups Since 67</th>\n <th>Unnamed: 1</th>\n <th>Unnamed: 2</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>"
2323
},
24-
"text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n"
24+
"text": "\n\n\nStanley Cups Since 67\nUnnamed: 1\nUnnamed: 2\n\n\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n"
2525
}
2626
]

0 commit comments

Comments
 (0)