Skip to content

Commit db173d0

Browse files
authored
fix: convert to pixels (#74)
Fixed a failure to convert points to pixels when loading embedded elements from a PDF. Also added a paddleocr dependency for x86_64 machines, and corrected an incompatibility between `LayoutElement` and its `unstructured` equivalent.
1 parent 43887e6 commit db173d0

File tree

8 files changed

+59
-27
lines changed

8 files changed

+59
-27
lines changed

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
## 0.3.0
2+
3+
* Fix for text block detection
4+
* Add paddleocr dependency to setup for x86_64 machines
5+
16
## 0.2.14
27

38
* Suppressed processing progress bars

requirements/base.txt

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -22,22 +22,22 @@ coloredlogs==15.0.1
2222
# via onnxruntime
2323
contourpy==1.0.7
2424
# via matplotlib
25-
cryptography==39.0.2
25+
cryptography==40.0.1
2626
# via pdfminer-six
2727
cycler==0.11.0
2828
# via matplotlib
2929
effdet==0.3.0
3030
# via layoutparser
3131
fastapi==0.95.0
3232
# via unstructured-inference (setup.py)
33-
filelock==3.10.0
33+
filelock==3.10.7
3434
# via
3535
# huggingface-hub
3636
# torch
3737
# transformers
3838
flatbuffers==23.3.3
3939
# via onnxruntime
40-
fonttools==4.39.2
40+
fonttools==4.39.3
4141
# via matplotlib
4242
h11==0.14.0
4343
# via uvicorn
@@ -86,7 +86,7 @@ omegaconf==2.3.0
8686
# via effdet
8787
onnxruntime==1.14.1
8888
# via unstructured-inference (setup.py)
89-
opencv-python==4.7.0.72
89+
opencv-python==4.6.0.66
9090
# via
9191
# layoutparser
9292
# unstructured-inference (setup.py)
@@ -121,7 +121,7 @@ pycocotools==2.0.6
121121
# via effdet
122122
pycparser==2.21
123123
# via cffi
124-
pydantic==1.10.6
124+
pydantic==1.10.7
125125
# via fastapi
126126
pyparsing==3.0.9
127127
# via matplotlib
@@ -133,7 +133,7 @@ python-dateutil==2.8.2
133133
# pandas
134134
python-multipart==0.0.6
135135
# via unstructured-inference (setup.py)
136-
pytz==2022.7.1
136+
pytz==2023.3
137137
# via pandas
138138
pyyaml==6.0
139139
# via
@@ -142,7 +142,7 @@ pyyaml==6.0
142142
# omegaconf
143143
# timm
144144
# transformers
145-
regex==2022.10.31
145+
regex==2023.3.23
146146
# via transformers
147147
requests==2.28.2
148148
# via
@@ -161,7 +161,7 @@ sympy==1.11.1
161161
# via
162162
# onnxruntime
163163
# torch
164-
timm==0.6.12
164+
timm==0.6.13
165165
# via effdet
166166
tokenizers==0.13.2
167167
# via transformers
@@ -181,7 +181,7 @@ tqdm==4.65.0
181181
# huggingface-hub
182182
# iopath
183183
# transformers
184-
transformers==4.27.2
184+
transformers==4.27.4
185185
# via unstructured-inference (setup.py)
186186
typing-extensions==4.5.0
187187
# via

requirements/dev.txt

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@
66
#
77
anyio==3.6.2
88
# via jupyter-server
9+
appnope==0.1.3
10+
# via
11+
# ipykernel
12+
# ipython
913
argon2-cffi==21.3.0
1014
# via
1115
# jupyter-server
@@ -31,7 +35,7 @@ cffi==1.15.1
3135
# via argon2-cffi-bindings
3236
click==8.1.3
3337
# via pip-tools
34-
comm==0.1.2
38+
comm==0.1.3
3539
# via ipykernel
3640
debugpy==1.6.6
3741
# via ipykernel
@@ -57,14 +61,15 @@ importlib-resources==5.12.0
5761
# via jsonschema
5862
ipykernel==6.22.0
5963
# via
64+
# ipywidgets
6065
# jupyter
6166
# jupyter-console
6267
# nbclassic
6368
# notebook
6469
# qtconsole
6570
ipython==8.11.0
6671
# via
67-
# -r dev.in
72+
# -r requirements/dev.in
6873
# ipykernel
6974
# ipywidgets
7075
# jupyter-console
@@ -73,7 +78,7 @@ ipython-genutils==0.2.0
7378
# nbclassic
7479
# notebook
7580
# qtconsole
76-
ipywidgets==8.0.5
81+
ipywidgets==8.0.6
7782
# via jupyter
7883
isoduration==20.11.0
7984
# via jsonschema
@@ -126,7 +131,7 @@ jupyter-server-terminals==0.4.4
126131
# via jupyter-server
127132
jupyterlab-pygments==0.2.2
128133
# via nbconvert
129-
jupyterlab-widgets==3.0.6
134+
jupyterlab-widgets==3.0.7
130135
# via ipywidgets
131136
markupsafe==2.1.2
132137
# via
@@ -184,7 +189,7 @@ pip-tools==6.12.3
184189
# via -r requirements/dev.in
185190
pkgutil-resolve-name==1.3.10
186191
# via jsonschema
187-
platformdirs==3.1.1
192+
platformdirs==3.2.0
188193
# via jupyter-core
189194
prometheus-client==0.16.0
190195
# via
@@ -234,7 +239,7 @@ pyzmq==25.0.2
234239
# qtconsole
235240
qtconsole==5.4.1
236241
# via jupyter
237-
qtpy==2.3.0
242+
qtpy==2.3.1
238243
# via qtconsole
239244
rfc3339-validator==0.1.4
240245
# via
@@ -269,6 +274,10 @@ terminado==0.17.1
269274
# notebook
270275
tinycss2==1.2.1
271276
# via nbconvert
277+
tomli==2.0.1
278+
# via
279+
# build
280+
# pyproject-hooks
272281
tornado==6.2
273282
# via
274283
# ipykernel
@@ -299,7 +308,7 @@ uri-template==1.2.0
299308
# via jsonschema
300309
wcwidth==0.2.6
301310
# via prompt-toolkit
302-
webcolors==1.12
311+
webcolors==1.13
303312
# via jsonschema
304313
webencodings==0.5.1
305314
# via
@@ -309,7 +318,7 @@ websocket-client==1.5.1
309318
# via jupyter-server
310319
wheel==0.40.0
311320
# via pip-tools
312-
widgetsnbextension==4.0.6
321+
widgetsnbextension==4.0.7
313322
# via ipywidgets
314323
zipp==3.15.0
315324
# via

requirements/test.txt

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ appdirs==1.4.4
1010
# via label-studio-tools
1111
attrs==22.2.0
1212
# via pytest
13-
black==23.1.0
13+
black==23.3.0
1414
# via -r requirements/test.in
1515
certifi==2022.12.7
1616
# via
@@ -29,7 +29,7 @@ coverage[toml]==7.2.2
2929
# pytest-cov
3030
exceptiongroup==1.1.1
3131
# via pytest
32-
filelock==3.10.0
32+
filelock==3.10.7
3333
# via huggingface-hub
3434
flake8==6.0.0
3535
# via
@@ -82,13 +82,13 @@ pdf2image==1.16.3
8282
# via -r requirements/test.in
8383
pillow==9.4.0
8484
# via pdf2image
85-
platformdirs==3.1.1
85+
platformdirs==3.2.0
8686
# via black
8787
pluggy==1.0.0
8888
# via pytest
8989
pycodestyle==2.10.0
9090
# via flake8
91-
pydantic==1.10.6
91+
pydantic==1.10.7
9292
# via label-studio-sdk
9393
pydocstyle==6.3.0
9494
# via flake8-docstrings

setup.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
limitations under the License.
1919
"""
2020
from setuptools import setup, find_packages
21-
from platform import machine
2221

2322
from unstructured_inference.__version__ import __version__
2423

@@ -60,6 +59,6 @@
6059
"opencv-python==4.6.0.66",
6160
"onnxruntime",
6261
"transformers",
62+
'unstructured.PaddleOCR ; platform_machine=="x86_64"',
6363
],
64-
extras_require={"paddle-ocr": "unstructured.PaddleOCR"},
6564
)
unstructured_inference/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.2.14" # pragma: no cover
1+
__version__ = "0.3.0" # pragma: no cover

unstructured_inference/inference/elements.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,11 @@ def is_in(self, other: Rectangle, error_margin: Optional[int] = None):
5858
]
5959
)
6060

61+
@property
62+
def coordinates(self):
63+
"""Gets coordinates of the rectangle"""
64+
return ((self.x1, self.y1), (self.x1, self.y2), (self.x2, self.y2), (self.x2, self.y1))
65+
6166

6267
@dataclass
6368
class TextRegion(Rectangle):
@@ -77,7 +82,12 @@ class LayoutElement(TextRegion):
7782

7883
def to_dict(self) -> dict:
7984
"""Converts the class instance to dictionary form."""
80-
return self.__dict__
85+
out_dict = {
86+
"coordinates": self.coordinates,
87+
"text": self.text,
88+
"type": self.type,
89+
}
90+
return out_dict
8191

8292
@classmethod
8393
def from_region(cls, region: Rectangle):

unstructured_inference/inference/layout.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -363,12 +363,21 @@ def load_pdf(
363363
)
364364
word_objs = [
365365
TextRegion(
366-
x1=word["x0"], y1=word["top"], x2=word["x1"], y2=word["bottom"], text=word["text"]
366+
x1=word["x0"] * dpi / 72,
367+
y1=word["top"] * dpi / 72,
368+
x2=word["x1"] * dpi / 72,
369+
y2=word["bottom"] * dpi / 72,
370+
text=word["text"],
367371
)
368372
for word in plumber_words
369373
]
370374
image_objs = [
371-
ImageTextRegion(x1=image["x0"], y1=image["y0"], x2=image["x1"], y2=image["y1"])
375+
ImageTextRegion(
376+
x1=image["x0"] * dpi / 72,
377+
y1=image["y0"] * dpi / 72,
378+
x2=image["x1"] * dpi / 72,
379+
y2=image["y1"] * dpi / 72,
380+
)
372381
for image in page.images
373382
]
374383
layout = word_objs + image_objs

0 commit comments

Comments
 (0)