Skip to content

Commit 7de630e

Browse files
authored
Feat/bump numpy to 2 (#3961)
This PR updates a few dependencies so that they are compatible with `numpy>=2`.
1 parent 4e424ef commit 7de630e

14 files changed

+146
-167
lines changed

CHANGELOG.md

+4-1
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
1-
## 0.17.1-dev1
1+
## 0.17.1
22

33
### Enhancements
44

55
- **Add image_url of images in html partitioner** `<img>` tags with non-data content include a new image_url metadata field with the content of the src attribute.
6+
67
- **Use `lxml` instead of `bs4` to parse hOCR data.** `lxml` is much faster than `bs4` given the hOCR data format is regular (guaranteed because it is programmatically generated)
78

9+
- **Bump `numpy` to `>=2`.** Upgrade `paddlepaddle`, `unstructured-paddleocr`, and `onnx` so they are compatible with `numpy>=2`.
10+
811
### Features
912

1013
### Fixes

requirements/base.in

+1-3
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,7 @@ emoji
1010
dataclasses-json
1111
python-iso639
1212
langdetect
13-
# NOTE(robinson) - numpy pin is because ONNX model weights are only compatible
14-
# with numpy 1.x.x
15-
numpy<2
13+
numpy
1614
rapidfuzz
1715
backoff
1816
typing-extensions

requirements/base.txt

+4-4
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
#
55
# pip-compile ./base.in
66
#
7-
anyio==4.8.0
7+
anyio==4.9.0
88
# via httpx
99
backoff==2.2.1
1010
# via -r ./base.in
@@ -34,7 +34,7 @@ dataclasses-json==0.6.7
3434
# via
3535
# -r ./base.in
3636
# unstructured-client
37-
deepdiff==8.3.0
37+
deepdiff==8.4.2
3838
# via unstructured-client
3939
emoji==2.14.1
4040
# via -r ./base.in
@@ -76,7 +76,7 @@ nest-asyncio==1.6.0
7676
# via unstructured-client
7777
nltk==3.9.1
7878
# via -r ./base.in
79-
numpy==1.26.4
79+
numpy==2.0.2
8080
# via -r ./base.in
8181
olefile==0.47
8282
# via python-oxmsg
@@ -90,7 +90,7 @@ psutil==7.0.0
9090
# via -r ./base.in
9191
pycparser==2.22
9292
# via cffi
93-
pypdf==5.3.1
93+
pypdf==5.4.0
9494
# via unstructured-client
9595
python-dateutil==2.9.0.post0
9696
# via unstructured-client

requirements/dev.txt

+2-2
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,9 @@ click==8.1.8
1515
# pip-tools
1616
distlib==0.3.9
1717
# via virtualenv
18-
filelock==3.17.0
18+
filelock==3.18.0
1919
# via virtualenv
20-
identify==2.6.8
20+
identify==2.6.9
2121
# via pre-commit
2222
importlib-metadata==8.6.1
2323
# via

requirements/extra-csv.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
#
55
# pip-compile ./extra-csv.in
66
#
7-
numpy==1.26.4
7+
numpy==2.0.2
88
# via
99
# -c ./base.txt
1010
# pandas

requirements/extra-paddleocr.in

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
-c ./deps/constraints.txt
22
-c base.txt
33

4-
paddlepaddle==3.0.0b1
5-
unstructured.paddleocr==2.8.1.0
4+
paddlepaddle>=3.0.0b1
5+
unstructured.paddleocr==2.10.0

requirements/extra-paddleocr.txt

+62-53
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,24 @@
44
#
55
# pip-compile ./extra-paddleocr.in
66
#
7-
anyio==4.8.0
7+
albucore==0.0.23
8+
# via
9+
# albumentations
10+
# unstructured-paddleocr
11+
albumentations==2.0.5
12+
# via unstructured-paddleocr
13+
annotated-types==0.7.0
14+
# via pydantic
15+
anyio==4.9.0
816
# via
917
# -c ./base.txt
1018
# httpx
1119
astor==0.8.1
1220
# via paddlepaddle
21+
beautifulsoup4==4.13.3
22+
# via
23+
# -c ./base.txt
24+
# unstructured-paddleocr
1325
certifi==2025.1.31
1426
# via
1527
# -c ./base.txt
@@ -20,20 +32,20 @@ charset-normalizer==3.4.1
2032
# via
2133
# -c ./base.txt
2234
# requests
23-
contourpy==1.3.0
24-
# via matplotlib
25-
cycler==0.12.1
26-
# via matplotlib
2735
cython==3.0.12
2836
# via unstructured-paddleocr
2937
decorator==5.2.1
3038
# via paddlepaddle
39+
eval-type-backport==0.2.2
40+
# via albumentations
3141
exceptiongroup==1.2.2
3242
# via
3343
# -c ./base.txt
3444
# anyio
45+
fire==0.7.0
46+
# via unstructured-paddleocr
3547
fonttools==4.56.0
36-
# via matplotlib
48+
# via unstructured-paddleocr
3749
h11==0.14.0
3850
# via
3951
# -c ./base.txt
@@ -53,32 +65,26 @@ idna==3.10
5365
# httpx
5466
# requests
5567
imageio==2.37.0
56-
# via
57-
# imgaug
58-
# scikit-image
59-
imgaug==0.4.0
60-
# via unstructured-paddleocr
61-
importlib-resources==6.5.2
62-
# via matplotlib
63-
kiwisolver==1.4.7
64-
# via matplotlib
68+
# via scikit-image
6569
lazy-loader==0.4
6670
# via scikit-image
67-
matplotlib==3.9.4
68-
# via imgaug
71+
lxml==5.3.1
72+
# via
73+
# -c ./base.txt
74+
# python-docx
6975
networkx==3.2.1
7076
# via
7177
# paddlepaddle
7278
# scikit-image
73-
numpy==1.26.4
79+
numpy==2.0.2
7480
# via
7581
# -c ./base.txt
76-
# contourpy
82+
# albucore
83+
# albumentations
7784
# imageio
78-
# imgaug
79-
# matplotlib
8085
# opencv-contrib-python
8186
# opencv-python
87+
# opencv-python-headless
8288
# opt-einsum
8389
# paddlepaddle
8490
# scikit-image
@@ -89,44 +95,42 @@ numpy==1.26.4
8995
opencv-contrib-python==4.11.0.86
9096
# via unstructured-paddleocr
9197
opencv-python==4.11.0.86
98+
# via unstructured-paddleocr
99+
opencv-python-headless==4.11.0.86
92100
# via
93-
# imgaug
94-
# unstructured-paddleocr
101+
# albucore
102+
# albumentations
95103
opt-einsum==3.3.0
96104
# via paddlepaddle
97105
packaging==24.2
98106
# via
99107
# -c ./base.txt
100108
# lazy-loader
101-
# matplotlib
102109
# scikit-image
103-
paddlepaddle==3.0.0b1
110+
paddlepaddle==3.0.0rc1
104111
# via -r ./extra-paddleocr.in
105-
pdf2image==1.17.0
106-
# via unstructured-paddleocr
107112
pillow==11.1.0
108113
# via
109114
# imageio
110-
# imgaug
111-
# matplotlib
112115
# paddlepaddle
113-
# pdf2image
114116
# scikit-image
115117
# unstructured-paddleocr
116-
protobuf==6.30.0
118+
protobuf==6.30.1
117119
# via
118120
# -c ././deps/constraints.txt
119121
# paddlepaddle
120122
pyclipper==1.3.0.post6
121123
# via unstructured-paddleocr
122-
pyparsing==3.2.1
123-
# via matplotlib
124-
python-dateutil==2.9.0.post0
125-
# via
126-
# -c ./base.txt
127-
# matplotlib
128-
pyyaml==6.0.2
124+
pydantic==2.10.6
125+
# via albumentations
126+
pydantic-core==2.27.2
127+
# via pydantic
128+
python-docx==1.1.2
129129
# via unstructured-paddleocr
130+
pyyaml==6.0.2
131+
# via
132+
# albumentations
133+
# unstructured-paddleocr
130134
rapidfuzz==3.12.2
131135
# via
132136
# -c ./base.txt
@@ -136,26 +140,27 @@ requests==2.32.3
136140
# -c ./base.txt
137141
# unstructured-paddleocr
138142
scikit-image==0.24.0
139-
# via
140-
# imgaug
141-
# unstructured-paddleocr
143+
# via unstructured-paddleocr
142144
scipy==1.13.1
143145
# via
144-
# imgaug
146+
# albumentations
145147
# scikit-image
146148
shapely==2.0.7
147-
# via
148-
# imgaug
149-
# unstructured-paddleocr
150-
six==1.17.0
151-
# via
152-
# -c ./base.txt
153-
# imgaug
154-
# python-dateutil
149+
# via unstructured-paddleocr
150+
simsimd==6.2.1
151+
# via albucore
155152
sniffio==1.3.1
156153
# via
157154
# -c ./base.txt
158155
# anyio
156+
soupsieve==2.6
157+
# via
158+
# -c ./base.txt
159+
# beautifulsoup4
160+
stringzilla==3.12.3
161+
# via albucore
162+
termcolor==2.5.0
163+
# via fire
159164
tifffile==2024.8.30
160165
# via scikit-image
161166
tqdm==4.67.1
@@ -165,14 +170,18 @@ tqdm==4.67.1
165170
typing-extensions==4.12.2
166171
# via
167172
# -c ./base.txt
173+
# albucore
174+
# albumentations
168175
# anyio
176+
# beautifulsoup4
169177
# paddlepaddle
170-
unstructured-paddleocr==2.8.1.0
178+
# pydantic
179+
# pydantic-core
180+
# python-docx
181+
unstructured-paddleocr==2.10.0
171182
# via -r ./extra-paddleocr.in
172183
urllib3==1.26.20
173184
# via
174185
# -c ././deps/constraints.txt
175186
# -c ./base.txt
176187
# requests
177-
zipp==3.21.0
178-
# via importlib-resources

requirements/extra-pdf-image.in

+3-2
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
-c ./deps/constraints.txt
22
-c base.txt
33

4-
onnx
4+
onnx>=1.17.0
5+
onnxruntime>=1.19.0
56
pdf2image
67
pdfminer.six
78
pikepdf
@@ -11,5 +12,5 @@ google-cloud-vision
1112
effdet
1213
# Do not move to constraints.in, otherwise unstructured-inference will not be upgraded
1314
# when unstructured library is.
14-
unstructured-inference>=0.8.9
15+
unstructured-inference>=0.8.10
1516
unstructured.pytesseract>=0.3.12

0 commit comments

Comments
 (0)