Skip to content

Commit cc90c26

Browse files
authored
build(deps): bump unstructured and unstructured-inference (#210)
1 parent e70d47f commit cc90c26

File tree

8 files changed

+61
-98
lines changed

8 files changed

+61
-98
lines changed

Diff for: .github/workflows/ci.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ on:
77
branches: [ main ]
88

99
env:
10-
PYTHON_VERSION: "3.8"
10+
PYTHON_VERSION: "3.10"
1111
PIPELINE_FAMILY: "general"
1212

1313
jobs:

Diff for: .github/workflows/docker-publish.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ env:
1111
PACKAGE: "unstructured-api"
1212
PIPELINE_FAMILY: "general"
1313
PIP_VERSION: "22.2.1"
14-
PYTHON_VERSION: "3.8"
14+
PYTHON_VERSION: "3.10"
1515

1616
jobs:
1717
setup:

Diff for: CHANGELOG.md

+5
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
## 0.0.41
2+
3+
* Bump unstructured library to 0.10.8
4+
* Bump unstructured-inference to 0.5.17
5+
16
## 0.0.40
27

38
* Reject traffic when overloaded via `UNSTRUCTURED_MEMORY_FREE_MINIMUM_MB`

Diff for: pipeline-notebooks/pipeline-general.ipynb

+6-6
Original file line numberDiff line numberDiff line change
@@ -680,15 +680,15 @@
680680
"\n",
681681
" return elements\n",
682682
"\n",
683-
"def partition_pdf_splits(request, pdf_pages, file, file_filename, content_type, coordinates, **partition_kwargs):\n",
683+
"def partition_pdf_splits(request, pdf_pages, file, metadata_filename, content_type, coordinates, **partition_kwargs):\n",
684684
" '''\n",
685685
" Split a pdf into chunks and process in parallel with more api calls, or partition\n",
686686
" locally if the chunk is small enough. As soon as any remote call fails, bubble up\n",
687687
" the error.\n",
688688
" \n",
689689
" Arguments:\n",
690690
" request is used to forward relevant headers to the api calls\n",
691-
" file, file_filename and content_type are passed on in the file argument to requests.post\n",
691+
" file, metadata_filename and content_type are passed on in the file argument to requests.post\n",
692692
" coordinates is passed on to the api calls, but cannot be used in the local partition case\n",
693693
" partition_kwargs holds any other parameters that will be forwarded, or passed to partition\n",
694694
" ''' \n",
@@ -698,7 +698,7 @@
698698
" if len(pdf_pages) <= pages_per_pdf:\n",
699699
" return partition(\n",
700700
" file=file,\n",
701-
" file_filename=file_filename,\n",
701+
" metadata_filename=metadata_filename,\n",
702702
" content_type=content_type,\n",
703703
" **partition_kwargs\n",
704704
" )\n",
@@ -709,7 +709,7 @@
709709
" partition_func = partial(\n",
710710
" partition_file_via_api,\n",
711711
" request=request,\n",
712-
" filename=file_filename,\n",
712+
" filename=metadata_filename,\n",
713713
" content_type=content_type,\n",
714714
" coordinates=coordinates,\n",
715715
" **partition_kwargs\n",
@@ -864,7 +864,7 @@
864864
" request,\n",
865865
" pdf_pages=pdf.pages,\n",
866866
" file=file,\n",
867-
" file_filename=filename,\n",
867+
" metadata_filename=filename,\n",
868868
" content_type=file_content_type,\n",
869869
" coordinates=show_coordinates,\n",
870870
" # partition_kwargs\n",
@@ -880,7 +880,7 @@
880880
" else:\n",
881881
" elements = partition(\n",
882882
" file=file,\n",
883-
" file_filename=filename,\n",
883+
" metadata_filename=filename,\n",
884884
" content_type=file_content_type,\n",
885885
" # partition_kwargs\n",
886886
" encoding=encoding,\n",

Diff for: prepline_general/api/general.py

+10-7
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,7 @@ def partition_file_via_api(file_tuple, request, filename, content_type, **partit
153153

154154

155155
def partition_pdf_splits(
156-
request, pdf_pages, file, file_filename, content_type, coordinates, **partition_kwargs
156+
request, pdf_pages, file, metadata_filename, content_type, coordinates, **partition_kwargs
157157
):
158158
"""
159159
Split a pdf into chunks and process in parallel with more api calls, or partition
@@ -162,7 +162,7 @@ def partition_pdf_splits(
162162
163163
Arguments:
164164
request is used to forward relevant headers to the api calls
165-
file, file_filename and content_type are passed on in the file argument to requests.post
165+
file, metadata_filename and content_type are passed on in the file argument to requests.post
166166
coordinates is passed on to the api calls, but cannot be used in the local partition case
167167
partition_kwargs holds any other parameters that will be forwarded, or passed to partition
168168
"""
@@ -171,7 +171,10 @@ def partition_pdf_splits(
171171
# If it's small enough, just process locally
172172
if len(pdf_pages) <= pages_per_pdf:
173173
return partition(
174-
file=file, file_filename=file_filename, content_type=content_type, **partition_kwargs
174+
file=file,
175+
metadata_filename=metadata_filename,
176+
content_type=content_type,
177+
**partition_kwargs,
175178
)
176179

177180
results = []
@@ -180,7 +183,7 @@ def partition_pdf_splits(
180183
partition_func = partial(
181184
partition_file_via_api,
182185
request=request,
183-
filename=file_filename,
186+
filename=metadata_filename,
184187
content_type=content_type,
185188
coordinates=coordinates,
186189
**partition_kwargs,
@@ -340,7 +343,7 @@ def pipeline_api(
340343
request,
341344
pdf_pages=pdf.pages,
342345
file=file,
343-
file_filename=filename,
346+
metadata_filename=filename,
344347
content_type=file_content_type,
345348
coordinates=show_coordinates,
346349
# partition_kwargs
@@ -356,7 +359,7 @@ def pipeline_api(
356359
else:
357360
elements = partition(
358361
file=file,
359-
file_filename=filename,
362+
metadata_filename=filename,
360363
content_type=file_content_type,
361364
# partition_kwargs
362365
encoding=encoding,
@@ -502,7 +505,7 @@ def return_content_type(filename):
502505

503506

504507
@router.post("/general/v0/general")
505-
@router.post("/general/v0.0.40/general")
508+
@router.post("/general/v0.0.41/general")
506509
def pipeline_1(
507510
request: Request,
508511
gz_uncompressed_content_type: Optional[str] = Form(default=None),

Diff for: preprocessing-pipeline-family.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
name: general
2-
version: 0.0.40
2+
version: 0.0.41

Diff for: requirements/base.txt

+17-33
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#
2-
# This file is autogenerated by pip-compile with Python 3.8
2+
# This file is autogenerated by pip-compile with Python 3.10
33
# by the following command:
44
#
55
# pip-compile requirements/base.in
@@ -58,11 +58,11 @@ et-xmlfile==1.1.0
5858
# via openpyxl
5959
exceptiongroup==1.1.3
6060
# via anyio
61-
fastapi==0.101.1
61+
fastapi==0.103.0
6262
# via unstructured-api-tools
6363
fastjsonschema==2.18.0
6464
# via nbformat
65-
filelock==3.12.2
65+
filelock==3.12.3
6666
# via
6767
# huggingface-hub
6868
# torch
@@ -90,16 +90,6 @@ idna==3.4
9090
# via
9191
# anyio
9292
# requests
93-
importlib-metadata==6.8.0
94-
# via
95-
# jupyter-client
96-
# markdown
97-
# nbconvert
98-
importlib-resources==6.0.1
99-
# via
100-
# jsonschema
101-
# jsonschema-specifications
102-
# matplotlib
10393
iopath==0.1.10
10494
# via layoutparser
10595
jinja2==3.1.2
@@ -113,7 +103,7 @@ jsonschema==4.19.0
113103
# via nbformat
114104
jsonschema-specifications==2023.7.1
115105
# via jsonschema
116-
jupyter-client==8.3.0
106+
jupyter-client==8.3.1
117107
# via nbclient
118108
jupyter-core==5.3.1
119109
# via
@@ -153,7 +143,7 @@ mypy-extensions==1.0.0
153143
# via mypy
154144
nbclient==0.8.0
155145
# via nbconvert
156-
nbconvert==7.7.4
146+
nbconvert==7.8.0
157147
# via unstructured-api-tools
158148
nbformat==5.9.2
159149
# via
@@ -163,7 +153,7 @@ networkx==3.1
163153
# via torch
164154
nltk==3.8.1
165155
# via unstructured
166-
numpy==1.24.4
156+
numpy==1.25.2
167157
# via
168158
# contourpy
169159
# layoutparser
@@ -222,13 +212,11 @@ pillow==9.5.0
222212
# python-pptx
223213
# torchvision
224214
# unstructured
225-
pkgutil-resolve-name==1.3.10
226-
# via jsonschema
227215
platformdirs==3.10.0
228216
# via jupyter-core
229217
portalocker==2.7.0
230218
# via iopath
231-
protobuf==4.24.1
219+
protobuf==4.24.2
232220
# via onnxruntime
233221
psutil==5.9.5
234222
# via -r requirements/base.in
@@ -250,9 +238,9 @@ pypandoc==1.11
250238
# via unstructured
251239
pyparsing==3.0.9
252240
# via matplotlib
253-
pypdf==3.15.2
241+
pypdf==3.15.4
254242
# via -r requirements/base.in
255-
pypdfium2==4.18.0
243+
pypdfium2==4.19.0
256244
# via pdfplumber
257245
pytesseract==0.3.10
258246
# via layoutparser
@@ -271,7 +259,7 @@ python-multipart==0.0.6
271259
# via
272260
# unstructured-api-tools
273261
# unstructured-inference
274-
python-pptx==0.6.21
262+
python-pptx==0.6.22
275263
# via unstructured
276264
pytz==2023.3
277265
# via pandas
@@ -302,7 +290,7 @@ requests==2.31.0
302290
# torchvision
303291
# transformers
304292
# unstructured
305-
rpds-py==0.9.2
293+
rpds-py==0.10.0
306294
# via
307295
# jsonschema
308296
# referencing
@@ -311,7 +299,7 @@ safetensors==0.3.2
311299
# -c requirements/constraints.in
312300
# timm
313301
# transformers
314-
scipy==1.10.1
302+
scipy==1.11.2
315303
# via layoutparser
316304
six==1.16.0
317305
# via
@@ -366,7 +354,7 @@ traitlets==5.9.0
366354
# nbclient
367355
# nbconvert
368356
# nbformat
369-
transformers==4.32.0
357+
transformers==4.32.1
370358
# via unstructured-inference
371359
types-requests==2.31.0.2
372360
# via unstructured-api-tools
@@ -377,21 +365,21 @@ types-urllib3==1.26.25.14
377365
typing-extensions==4.7.1
378366
# via
379367
# fastapi
368+
# filelock
380369
# huggingface-hub
381370
# iopath
382371
# mypy
383372
# pydantic
384-
# pypdf
385-
# starlette
373+
# pydantic-core
386374
# torch
387375
# uvicorn
388376
tzdata==2023.3
389377
# via pandas
390-
unstructured[local-inference]==0.10.5
378+
unstructured[local-inference]==0.10.8
391379
# via -r requirements/base.in
392380
unstructured-api-tools==0.10.11
393381
# via -r requirements/base.in
394-
unstructured-inference==0.5.16
382+
unstructured-inference==0.5.17
395383
# via unstructured
396384
urllib3==2.0.4
397385
# via requests
@@ -411,7 +399,3 @@ xlrd==2.0.1
411399
# via unstructured
412400
xlsxwriter==3.1.2
413401
# via python-pptx
414-
zipp==3.16.2
415-
# via
416-
# importlib-metadata
417-
# importlib-resources

0 commit comments

Comments
 (0)