Skip to content

Commit f491b85

Browse files
authored
build(release): bump unstructured (#183)
Related to downstream issue #182 and upstream PR Unstructured-IO/unstructured-inference#165.
* Remove test_parallel_mode_correct_result
* Drop the file_directory field from elements metadata
1 parent c9b74d4 commit f491b85

File tree

6 files changed

+41
-89
lines changed

6 files changed

+41
-89
lines changed

Diff for: CHANGELOG.md

+2-1
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
1-
## 0.0.35-dev0
1+
## 0.0.35
22

3+
* Bump unstructured library to 0.9.2
34
* Fix a misleading error in make docker-test
45

56
## 0.0.34

Diff for: pipeline-notebooks/pipeline-general.ipynb

+3
Original file line number | Diff line number | Diff line change
@@ -888,6 +888,9 @@
888888
" \n",
889889
" if element.metadata.last_modified:\n",
890890
" elements[i].metadata.last_modified = None\n",
891+
"\n",
892+
" if element.metadata.file_directory:\n",
893+
" elements[i].metadata.file_directory = None\n",
891894
" \n",
892895
" if response_type == \"text/csv\":\n",
893896
" df = convert_to_dataframe(elements)\n",

Diff for: prepline_general/api/general.py

+3
Original file line number | Diff line number | Diff line change
@@ -367,6 +367,9 @@ def pipeline_api(
367367
if element.metadata.last_modified:
368368
elements[i].metadata.last_modified = None
369369

370+
if element.metadata.file_directory:
371+
elements[i].metadata.file_directory = None
372+
370373
if response_type == "text/csv":
371374
df = convert_to_dataframe(elements)
372375
return df.to_csv(index=False)

Diff for: requirements/base.txt

+18-16
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,9 @@ attrs==23.1.0
1717
autoflake==2.2.0
1818
# via unstructured-api-tools
1919
beautifulsoup4==4.12.2
20-
# via nbconvert
20+
# via
21+
# nbconvert
22+
# unstructured
2123
bleach==6.0.0
2224
# via nbconvert
2325
certifi==2023.7.22
@@ -40,8 +42,10 @@ coloredlogs==15.0.1
4042
# via onnxruntime
4143
contourpy==1.1.0
4244
# via matplotlib
43-
cryptography==41.0.3
44-
# via pdfminer-six
45+
cryptography==41.0.2
46+
# via
47+
# pdfminer-six
48+
# unstructured
4549
cycler==0.11.0
4650
# via matplotlib
4751
defusedxml==0.7.1
@@ -101,7 +105,7 @@ jinja2==3.1.2
101105
# nbconvert
102106
# torch
103107
# unstructured-api-tools
104-
joblib==1.3.1
108+
joblib==1.3.2
105109
# via nltk
106110
jsonschema==4.19.0
107111
# via nbformat
@@ -140,7 +144,7 @@ mpmath==1.3.0
140144
# via sympy
141145
msg-parser==1.2.0
142146
# via unstructured
143-
mypy==1.4.1
147+
mypy==1.5.0
144148
# via unstructured-api-tools
145149
mypy-extensions==1.0.0
146150
# via mypy
@@ -174,7 +178,7 @@ omegaconf==2.3.0
174178
# via effdet
175179
onnxruntime==1.15.1
176180
# via unstructured-inference
177-
opencv-python==4.8.0.74
181+
opencv-python==4.8.0.76
178182
# via
179183
# layoutparser
180184
# unstructured-inference
@@ -221,7 +225,7 @@ platformdirs==3.10.0
221225
# via jupyter-core
222226
portalocker==2.7.0
223227
# via iopath
224-
protobuf==4.23.4
228+
protobuf==4.24.0
225229
# via onnxruntime
226230
pycocotools==2.0.6
227231
# via effdet
@@ -274,15 +278,15 @@ pyyaml==6.0.1
274278
# timm
275279
# transformers
276280
# uvicorn
277-
pyzmq==25.1.0
281+
pyzmq==25.1.1
278282
# via jupyter-client
279283
ratelimit==2.2.1
280284
# via -r requirements/base.in
281285
referencing==0.30.2
282286
# via
283287
# jsonschema
284288
# jsonschema-specifications
285-
regex==2023.6.3
289+
regex==2023.8.8
286290
# via
287291
# nltk
288292
# transformers
@@ -297,7 +301,7 @@ rpds-py==0.9.2
297301
# via
298302
# jsonschema
299303
# referencing
300-
safetensors==0.3.1
304+
safetensors==0.3.2
301305
# via
302306
# timm
303307
# transformers
@@ -340,9 +344,9 @@ torchvision==0.15.2
340344
# effdet
341345
# layoutparser
342346
# timm
343-
tornado==6.3.2
347+
tornado==6.3.3
344348
# via jupyter-client
345-
tqdm==4.65.0
349+
tqdm==4.66.1
346350
# via
347351
# huggingface-hub
348352
# iopath
@@ -365,24 +369,22 @@ types-urllib3==1.26.25.14
365369
# via types-requests
366370
typing-extensions==4.7.1
367371
# via
368-
# annotated-types
369372
# fastapi
370373
# huggingface-hub
371374
# iopath
372375
# mypy
373376
# pydantic
374-
# pydantic-core
375377
# pypdf
376378
# starlette
377379
# torch
378380
# uvicorn
379381
tzdata==2023.3
380382
# via pandas
381-
unstructured[local-inference]==0.9.0
383+
unstructured[local-inference]==0.9.2
382384
# via -r requirements/base.in
383385
unstructured-api-tools==0.10.10
384386
# via -r requirements/base.in
385-
unstructured-inference==0.5.7
387+
unstructured-inference==0.5.9
386388
# via unstructured
387389
urllib3==2.0.4
388390
# via requests

Diff for: requirements/test.txt

+15-14
Original file line number | Diff line number | Diff line change
@@ -50,6 +50,7 @@ beautifulsoup4==4.12.2
5050
# via
5151
# -r requirements/base.txt
5252
# nbconvert
53+
# unstructured
5354
black==23.7.0
5455
# via -r requirements/test.in
5556
bleach==6.0.0
@@ -98,15 +99,16 @@ contourpy==1.1.0
9899
# matplotlib
99100
coverage[toml]==7.2.7
100101
# via pytest-cov
101-
cryptography==41.0.3
102+
cryptography==41.0.2
102103
# via
103104
# -r requirements/base.txt
104105
# pdfminer-six
106+
# unstructured
105107
cycler==0.11.0
106108
# via
107109
# -r requirements/base.txt
108110
# matplotlib
109-
debugpy==1.6.7
111+
debugpy==1.6.7.post1
110112
# via ipykernel
111113
decorator==5.1.1
112114
# via ipython
@@ -254,7 +256,7 @@ jinja2==3.1.2
254256
# nbconvert
255257
# torch
256258
# unstructured-api-tools
257-
joblib==1.3.1
259+
joblib==1.3.2
258260
# via
259261
# -r requirements/base.txt
260262
# nltk
@@ -366,7 +368,7 @@ msg-parser==1.2.0
366368
# via
367369
# -r requirements/base.txt
368370
# unstructured
369-
mypy==1.4.1
371+
mypy==1.5.0
370372
# via
371373
# -r requirements/base.txt
372374
# -r requirements/test.in
@@ -435,7 +437,7 @@ onnxruntime==1.15.1
435437
# via
436438
# -r requirements/base.txt
437439
# unstructured-inference
438-
opencv-python==4.8.0.74
440+
opencv-python==4.8.0.76
439441
# via
440442
# -r requirements/base.txt
441443
# layoutparser
@@ -529,7 +531,7 @@ prompt-toolkit==3.0.39
529531
# via
530532
# ipython
531533
# jupyter-console
532-
protobuf==4.23.4
534+
protobuf==4.24.0
533535
# via
534536
# -r requirements/base.txt
535537
# onnxruntime
@@ -598,7 +600,6 @@ pytest-mock==3.11.1
598600
python-dateutil==2.8.2
599601
# via
600602
# -r requirements/base.txt
601-
# arrow
602603
# jupyter-client
603604
# matplotlib
604605
# pandas
@@ -641,7 +642,7 @@ pyyaml==6.0.1
641642
# timm
642643
# transformers
643644
# uvicorn
644-
pyzmq==25.1.0
645+
pyzmq==25.1.1
645646
# via
646647
# -r requirements/base.txt
647648
# ipykernel
@@ -661,7 +662,7 @@ referencing==0.30.2
661662
# jsonschema
662663
# jsonschema-specifications
663664
# jupyter-events
664-
regex==2023.6.3
665+
regex==2023.8.8
665666
# via
666667
# -r requirements/base.txt
667668
# nltk
@@ -687,7 +688,7 @@ rpds-py==0.9.2
687688
# -r requirements/base.txt
688689
# jsonschema
689690
# referencing
690-
safetensors==0.3.1
691+
safetensors==0.3.2
691692
# via
692693
# -r requirements/base.txt
693694
# timm
@@ -769,7 +770,7 @@ torchvision==0.15.2
769770
# effdet
770771
# layoutparser
771772
# timm
772-
tornado==6.3.2
773+
tornado==6.3.3
773774
# via
774775
# -r requirements/base.txt
775776
# ipykernel
@@ -778,7 +779,7 @@ tornado==6.3.2
778779
# jupyterlab
779780
# notebook
780781
# terminado
781-
tqdm==4.65.0
782+
tqdm==4.66.1
782783
# via
783784
# -r requirements/base.txt
784785
# huggingface-hub
@@ -838,11 +839,11 @@ tzdata==2023.3
838839
# via
839840
# -r requirements/base.txt
840841
# pandas
841-
unstructured[local-inference]==0.9.0
842+
unstructured[local-inference]==0.9.2
842843
# via -r requirements/base.txt
843844
unstructured-api-tools==0.10.10
844845
# via -r requirements/base.txt
845-
unstructured-inference==0.5.7
846+
unstructured-inference==0.5.9
846847
# via
847848
# -r requirements/base.txt
848849
# unstructured

Diff for: test_general/api/test_app.py

-58
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,5 @@
11
from pathlib import Path
22

3-
import json
43
import io
54
import pytest
65
import re
@@ -10,8 +9,6 @@
109
from unstructured_api_tools.pipelines.api_conventions import get_pipeline_path
1110

1211
from prepline_general.api.app import app
13-
from unstructured.partition.auto import partition
14-
from unstructured.staging.base import convert_to_isd
1512
import tempfile
1613

1714
MAIN_API_ROUTE = get_pipeline_path("general")
@@ -400,61 +397,6 @@ def json(self):
400397
return self.body
401398

402399

403-
def mock_partition_file_via_api(url, **kwargs):
404-
file = kwargs["files"]["files"][1]
405-
406-
partition_kwargs = kwargs["data"]
407-
408-
# Hack - the api takes `coordinates` but regular partition does not
409-
del partition_kwargs["coordinates"]
410-
411-
elements = partition(file=file, **partition_kwargs)
412-
413-
response = MockResponse(200)
414-
response.body = elements
415-
response.text = json.dumps(convert_to_isd(elements))
416-
return response
417-
418-
419-
def test_parallel_mode_correct_result(monkeypatch):
420-
"""
421-
Validate that parallel processing mode merges the results
422-
to look the same as normal mode. The api call is mocked to
423-
use local partition, so this is just testing the merge logic.
424-
"""
425-
client = TestClient(app)
426-
test_file = Path("sample-docs") / "layout-parser-paper.pdf"
427-
428-
response = client.post(
429-
MAIN_API_ROUTE,
430-
files=[("files", (str(test_file), open(test_file, "rb"), "application/pdf"))],
431-
)
432-
433-
assert response.status_code == 200
434-
result_serial = response.json()
435-
436-
monkeypatch.setenv("UNSTRUCTURED_PARALLEL_MODE_ENABLED", "true")
437-
monkeypatch.setenv("UNSTRUCTURED_PARALLEL_MODE_URL", "unused")
438-
# Replace our callout with regular old partition
439-
monkeypatch.setattr(
440-
requests,
441-
"post",
442-
lambda *args, **kwargs: mock_partition_file_via_api(*args, **kwargs),
443-
)
444-
445-
response = client.post(
446-
MAIN_API_ROUTE,
447-
files=[("files", (str(test_file), open(test_file, "rb"), "application/pdf"))],
448-
)
449-
450-
assert response.status_code == 200
451-
result_parallel = response.json()
452-
453-
for pair in zip(result_serial, result_parallel):
454-
print(json.dumps(pair, indent=2))
455-
assert pair[0] == pair[1]
456-
457-
458400
def test_parallel_mode_returns_errors(monkeypatch):
459401
"""
460402
If we get an error sending a page to the api, bubble it up

0 commit comments

Comments
 (0)