Skip to content

Commit 9552fbb

Browse files
chore: bump unstructured-inference 0.7.35 (#3205)
### Summary

- bump unstructured-inference to `0.7.35` which fixed syntax for generated HTML tables
- update unit tests and ingest test fixtures to reflect changes in the generated HTML tables
- cut a release for `0.14.6`

---------

Co-authored-by: ryannikolaidis <[email protected]>
Co-authored-by: christinestraub <[email protected]>
1 parent a6c09ec commit 9552fbb

File tree

18 files changed

+31
-23
lines changed

18 files changed

+31
-23
lines changed

CHANGELOG.md

+3-1
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,9 @@
1-
## 0.14.6-dev7
1+
## 0.14.6
22

33
### Enhancements
44

5+
* **Bump unstructured-inference==0.7.35** Fix syntax for generated HTML tables.
6+
57
### Features
68

79
* **tqdm ingest support** add optional flag to ingest flow to print out progress bar of each step in the process.

requirements/dev.txt

+1-1
Original file line number | Diff line number | Diff line change
@@ -82,7 +82,7 @@ executing==2.0.1
8282
# via stack-data
8383
fastjsonschema==2.19.1
8484
# via nbformat
85-
filelock==3.14.0
85+
filelock==3.15.1
8686
# via virtualenv
8787
fqdn==1.5.1
8888
# via jsonschema

requirements/extra-paddleocr.txt

+1-1
Original file line number | Diff line number | Diff line change
@@ -121,7 +121,7 @@ opencv-python==4.8.0.76
121121
# -c ././deps/constraints.txt
122122
# imgaug
123123
# unstructured-paddleocr
124-
openpyxl==3.1.3
124+
openpyxl==3.1.4
125125
# via unstructured-paddleocr
126126
packaging==23.2
127127
# via

requirements/extra-pdf-image.in

+1-1
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@ google-cloud-vision
1212
effdet
1313
# Do not move to constraints.in, otherwise unstructured-inference will not be upgraded
1414
# when unstructured library is.
15-
unstructured-inference==0.7.33
15+
unstructured-inference==0.7.35
1616
# unstructured fork of pytesseract that provides an interface to allow for multiple output formats
1717
# from one tesseract call
1818
unstructured.pytesseract>=0.3.12

requirements/extra-pdf-image.txt

+2-2
Original file line number | Diff line number | Diff line change
@@ -32,7 +32,7 @@ deprecated==1.2.14
3232
# via pikepdf
3333
effdet==0.4.1
3434
# via -r ./extra-pdf-image.in
35-
filelock==3.14.0
35+
filelock==3.15.1
3636
# via
3737
# huggingface-hub
3838
# torch
@@ -287,7 +287,7 @@ typing-extensions==4.12.2
287287
# torch
288288
tzdata==2024.1
289289
# via pandas
290-
unstructured-inference==0.7.33
290+
unstructured-inference==0.7.35
291291
# via -r ./extra-pdf-image.in
292292
unstructured-pytesseract==0.3.12
293293
# via

requirements/extra-xlsx.txt

+1-1
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,7 @@ numpy==1.26.4
1313
# -c ././deps/constraints.txt
1414
# -c ./base.txt
1515
# pandas
16-
openpyxl==3.1.3
16+
openpyxl==3.1.4
1717
# via -r ./extra-xlsx.in
1818
pandas==2.2.2
1919
# via -r ./extra-xlsx.in

requirements/huggingface.txt

+1-1
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,7 @@ click==8.1.7
1717
# via
1818
# -c ./base.txt
1919
# sacremoses
20-
filelock==3.14.0
20+
filelock==3.15.1
2121
# via
2222
# huggingface-hub
2323
# torch

requirements/ingest/chroma.txt

+1-1
Original file line number | Diff line number | Diff line change
@@ -52,7 +52,7 @@ exceptiongroup==1.2.1
5252
# via anyio
5353
fastapi==0.110.3
5454
# via chromadb
55-
filelock==3.14.0
55+
filelock==3.15.1
5656
# via huggingface-hub
5757
flatbuffers==24.3.25
5858
# via onnxruntime

requirements/ingest/clarifai.txt

+1-1
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@ charset-normalizer==3.3.2
1515
# requests
1616
clarifai==10.5.0
1717
# via -r ./ingest/clarifai.in
18-
clarifai-grpc==10.5.1
18+
clarifai-grpc==10.5.2
1919
# via clarifai
2020
contextlib2==21.6.0
2121
# via schema

requirements/ingest/embed-huggingface.txt

+1-1
Original file line number | Diff line number | Diff line change
@@ -31,7 +31,7 @@ dataclasses-json==0.6.7
3131
# via
3232
# -c ./ingest/../base.txt
3333
# langchain-community
34-
filelock==3.14.0
34+
filelock==3.15.1
3535
# via
3636
# huggingface-hub
3737
# torch

requirements/ingest/embed-octoai.txt

+1-1
Original file line number | Diff line number | Diff line change
@@ -38,7 +38,7 @@ idna==3.7
3838
# anyio
3939
# httpx
4040
# requests
41-
openai==1.33.0
41+
openai==1.34.0
4242
# via -r ./ingest/embed-octoai.in
4343
pydantic==2.7.4
4444
# via openai

requirements/ingest/embed-openai.txt

+1-1
Original file line number | Diff line number | Diff line change
@@ -98,7 +98,7 @@ numpy==1.26.4
9898
# -c ./ingest/../deps/constraints.txt
9999
# langchain
100100
# langchain-community
101-
openai==1.33.0
101+
openai==1.34.0
102102
# via -r ./ingest/embed-openai.in
103103
orjson==3.10.4
104104
# via langsmith

test_unstructured/partition/pdf_image/test_image.py

+6-3
Original file line number | Diff line number | Diff line change
@@ -155,7 +155,8 @@ def test_partition_image_with_table_extraction(
155155
)
156156
table = [el.metadata.text_as_html for el in elements if el.metadata.text_as_html]
157157
assert len(table) == 1
158-
assert "<table><thead><th>" in table[0]
158+
assert "<table><thead><tr>" in table[0]
159+
assert "</thead><tbody><tr>" in table[0]
159160

160161

161162
def test_partition_image_with_multipage_tiff(
@@ -180,7 +181,8 @@ def test_partition_image_with_bmp(
180181
)
181182
table = [el.metadata.text_as_html for el in elements if el.metadata.text_as_html]
182183
assert len(table) == 1
183-
assert "<table><thead><th>" in table[0]
184+
assert "<table><thead><tr>" in table[0]
185+
assert "</thead><tbody><tr>" in table[0]
184186

185187

186188
def test_partition_image_with_language_passed(filename="example-docs/example.jpg"):
@@ -657,7 +659,8 @@ def test_partition_image_hi_res_ocr_mode_with_table_extraction(ocr_mode):
657659
)
658660
table = [el.metadata.text_as_html for el in elements if el.metadata.text_as_html]
659661
assert len(table) == 1
660-
assert "<table><thead><th>" in table[0]
662+
assert "<table><thead><tr>" in table[0]
663+
assert "</thead><tbody><tr>" in table[0]
661664
assert "Layouts of history Japanese documents" in table[0]
662665
assert "Layouts of scanned modern magazines and scientific reports" in table[0]
663666

test_unstructured/partition/pdf_image/test_pdf.py

+4-2
Original file line number | Diff line number | Diff line change
@@ -494,7 +494,8 @@ def test_partition_pdf_hi_table_extraction_with_languages(ocr_mode):
494494
table = [el.metadata.text_as_html for el in elements if el.metadata.text_as_html]
495495
assert elements[0].metadata.languages == ["kor"]
496496
assert len(table) == 2
497-
assert "<table><thead><th>" in table[0]
497+
assert "<table><thead><tr>" in table[0]
498+
assert "</thead><tbody><tr>" in table[0]
498499
# FIXME(yuming): didn't test full sentence here since unit test and docker test have
499500
# some differences on spaces between characters
500501
assert "업" in table[0]
@@ -535,7 +536,8 @@ def test_partition_pdf_hi_res_ocr_mode_with_table_extraction(ocr_mode):
535536
)
536537
table = [el.metadata.text_as_html for el in elements if el.metadata.text_as_html]
537538
assert len(table) == 2
538-
assert "<table><thead><th>" in table[0]
539+
assert "<table><thead><tr>" in table[0]
540+
assert "</thead><tbody><tr>" in table[0]
539541
assert "Layouts of history Japanese documents" in table[0]
540542
assert "Layouts of scanned modern magazines and scientific report" in table[0]
541543
assert "Layouts of scanned US newspapers from the 20th century" in table[0]

test_unstructured/partition/test_auto.py

+2-1
Original file line number | Diff line number | Diff line change
@@ -1272,7 +1272,8 @@ def test_partition_image_with_bmp_with_auto(
12721272
)
12731273
table = [el.metadata.text_as_html for el in elements if el.metadata.text_as_html]
12741274
assert len(table) == 1
1275-
assert "<table><thead><th>" in table[0]
1275+
assert "<table><thead><tr>" in table[0]
1276+
assert "</thead><tbody><tr>" in table[0]
12761277

12771278

12781279
def test_auto_partition_eml_add_signature_to_metadata():

test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.json

+1-1
Original file line number | Diff line number | Diff line change
@@ -48,7 +48,7 @@
4848
"element_id": "dddac446da6c93dc1449ecb5d997c423",
4949
"text": "Dataset | Base Model\" Large Model | Notes PubLayNet [38] P/M M Layouts of modern scientific documents PRImA [3) M - Layouts of scanned modern magazines and scientific reports Newspaper [17] P - Layouts of scanned US newspapers from the 20th century \u2018TableBank (18) P P Table region on modern scientific and business document HJDataset (31) | F/M - Layouts of history Japanese documents",
5050
"metadata": {
51-
"text_as_html": "<table><thead><th>Dataset</th><th>| Base Model!|</th><th>Large Model</th><th>| Notes</th></thead><tr><td>PubLayNet [33]</td><td>P/M</td><td>M</td><td>Layouts of modern scientific documents</td></tr><tr><td>PRImA [3]</td><td>M</td><td></td><td>Layouts of scanned modern magazines and scientific reports</td></tr><tr><td>Newspaper [17]</td><td>P</td><td></td><td>Layouts of scanned US newspapers from the 20th century</td></tr><tr><td>TableBank [18]</td><td>P</td><td></td><td>Table region on modern scientific and business document</td></tr><tr><td>HIDataset [31]</td><td>P/M</td><td></td><td>Layouts of history Japanese documents</td></tr></table>",
51+
"text_as_html": "<table><thead><tr><th>Dataset</th><th>| Base Model!|</th><th>Large Model</th><th>| Notes</th></tr></thead><tbody><tr><td>PubLayNet [33]</td><td>P/M</td><td>M</td><td>Layouts of modern scientific documents</td></tr><tr><td>PRImA [3]</td><td>M</td><td></td><td>Layouts of scanned modern magazines and scientific reports</td></tr><tr><td>Newspaper [17]</td><td>P</td><td></td><td>Layouts of scanned US newspapers from the 20th century</td></tr><tr><td>TableBank [18]</td><td>P</td><td></td><td>Table region on modern scientific and business document</td></tr><tr><td>HIDataset [31]</td><td>P/M</td><td></td><td>Layouts of history Japanese documents</td></tr></tbody></table>",
5252
"filetype": "image/jpeg",
5353
"languages": [
5454
"eng"

test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json

+2-2
Original file line number | Diff line number | Diff line change
@@ -840,7 +840,7 @@
840840
"element_id": "2a62c55be8401908c18140e858ec3345",
841841
"text": "Dataset Base Model1 Large Model Notes PubLayNet [38] PRImA [3] Newspaper [17] TableBank [18] HJDataset [31] F / M M F F F / M M - - F - Layouts of modern scienti\ufb01c documents Layouts of scanned modern magazines and scienti\ufb01c reports Layouts of scanned US newspapers from the 20th century Table region on modern scienti\ufb01c and business document Layouts of history Japanese documents",
842842
"metadata": {
843-
"text_as_html": "<table><thead><th>Dataset</th><th>| Base Model'|</th><th>| Notes</th></thead><tr><td>PubLayNet B8]|</td><td>F/M</td><td>Layouts of modern scientific documents</td></tr><tr><td>PRImA</td><td>M</td><td>Layouts of scanned modern magazines and scientific report</td></tr><tr><td>Newspaper</td><td>F</td><td>Layouts of scanned US newspapers from the 20th century</td></tr><tr><td>TableBank</td><td>F</td><td>Table region on modern scientific and business document</td></tr><tr><td>HJDataset</td><td>F/M</td><td>Layouts of history Japanese documents</td></tr></table>",
843+
"text_as_html": "<table><thead><tr><th>Dataset</th><th>| Base Model'|</th><th>| Notes</th></tr></thead><tbody><tr><td>PubLayNet B8]|</td><td>F/M</td><td>Layouts of modern scientific documents</td></tr><tr><td>PRImA</td><td>M</td><td>Layouts of scanned modern magazines and scientific report</td></tr><tr><td>Newspaper</td><td>F</td><td>Layouts of scanned US newspapers from the 20th century</td></tr><tr><td>TableBank</td><td>F</td><td>Table region on modern scientific and business document</td></tr><tr><td>HJDataset</td><td>F/M</td><td>Layouts of history Japanese documents</td></tr></tbody></table>",
844844
"filetype": "application/pdf",
845845
"languages": [
846846
"eng"
@@ -1391,7 +1391,7 @@
13911391
"element_id": "64bc79d1132a89c71837f420d6e4e2dc",
13921392
"text": "Operation Name Description block.pad(top, bottom, right, left) Enlarge the current block according to the input block.scale(fx, fy) Scale the current block given the ratio in x and y direction block.shift(dx, dy) Move the current block with the shift distances in x and y direction block1.is in(block2) Whether block1 is inside of block2 block1.intersect(block2) Return the intersection region of block1 and block2. Coordinate type to be determined based on the inputs. block1.union(block2) Return the union region of block1 and block2. Coordinate type to be determined based on the inputs. block1.relative to(block2) Convert the absolute coordinates of block1 to relative coordinates to block2 block1.condition on(block2) Calculate the absolute coordinates of block1 given the canvas block2\u2019s absolute coordinates block.crop image(image) Obtain the image segments in the block region",
13931393
"metadata": {
1394-
"text_as_html": "<table><thead><th>block.pad(top, bottom,</th><th>right,</th><th>left)</th><th>Enlarge the current block according to the input</th></thead><tr><td>block.scale(fx, fy)</td><td></td><td></td><td>Scale the current block given the ratio in x and y direction</td></tr><tr><td>block.shift(dx, dy)</td><td></td><td></td><td>Move the current block with the shift distances in x and y direction</td></tr><tr><td>block1.is_in(block2)</td><td></td><td></td><td>Whether block] is inside of block2</td></tr><tr><td>block1. intersect (block2)</td><td></td><td></td><td>Return the intersection region of blockl and block2. Coordinate type to be determined based on the inputs</td></tr><tr><td>block1.union(block2)</td><td></td><td></td><td>Return the union region of blockl and block2. Coordinate type to be determined based on the inputs</td></tr><tr><td>block1.relative_to(block2)</td><td></td><td></td><td>Convert the absolute coordinates of block to relative coordinates to block2</td></tr><tr><td>block1.condition_on(block2)</td><td></td><td></td><td>Calculate the absolute coordinates of blockl given the canvas block2\u2019s absolute coordinates</td></tr><tr><td>block. crop_image (image)</td><td></td><td></td><td>Obtain the image segments in the block region</td></tr></table>",
1394+
"text_as_html": "<table><thead><tr><th>block.pad(top, bottom,</th><th>right,</th><th>left)</th><th>Enlarge the current block according to the input</th></tr></thead><tbody><tr><td>block.scale(fx, fy)</td><td></td><td></td><td>Scale the current block given the ratio in x and y direction</td></tr><tr><td>block.shift(dx, dy)</td><td></td><td></td><td>Move the current block with the shift distances in x and y direction</td></tr><tr><td>block1.is_in(block2)</td><td></td><td></td><td>Whether block] is inside of block2</td></tr><tr><td>block1. intersect (block2)</td><td></td><td></td><td>Return the intersection region of blockl and block2. Coordinate type to be determined based on the inputs</td></tr><tr><td>block1.union(block2)</td><td></td><td></td><td>Return the union region of blockl and block2. Coordinate type to be determined based on the inputs</td></tr><tr><td>block1.relative_to(block2)</td><td></td><td></td><td>Convert the absolute coordinates of block to relative coordinates to block2</td></tr><tr><td>block1.condition_on(block2)</td><td></td><td></td><td>Calculate the absolute coordinates of blockl given the canvas block2\u2019s absolute coordinates</td></tr><tr><td>block. crop_image (image)</td><td></td><td></td><td>Obtain the image segments in the block region</td></tr></tbody></table>",
13951395
"filetype": "application/pdf",
13961396
"languages": [
13971397
"eng"

unstructured/__version__.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -1 +1 @@
1-
__version__ = "0.14.6-dev7" # pragma: no cover
1+
__version__ = "0.14.6" # pragma: no cover

0 commit comments

Comments
 (0)