Skip to content

Commit 90875b1

Browse files
authored
test: Reduced time for testing
* test: yolox extensive tests skipped * fix: formatting code and version correction * fix: removed unused imports * perf: now OCR runs in parallel for YoloX model * test: improving coverage * fix: changed method of calling multiprocessing.Pool that conflicts with pytest * test: added loremipsum with multiple pages * test: added configuration for pytest * fix: sometimes empty patches are produced during OCR * fix: added except for empty patch during OCR * refactor: use only one ONNX session to run YoloX * style: add more specific exception on OCR * fix: version sync
1 parent af402ce commit 90875b1

File tree

10 files changed

+304
-159
lines changed

10 files changed

+304
-159
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
## 0.2.8-dev0
22

3+
* Improved testing time
4+
35
## 0.2.7
46

57
* Fixed duplicated load_pdf call

Makefile

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,10 @@ stop-app-local:
9595
## test: runs all unittests
9696
.PHONY: test
9797
test:
98+
PYTHONPATH=. pytest -m "not slow" test_${PACKAGE_NAME} --cov=${PACKAGE_NAME} --cov-report term-missing
99+
100+
.PHONY: test-slow
101+
test-slow:
98102
PYTHONPATH=. pytest test_${PACKAGE_NAME} --cov=${PACKAGE_NAME} --cov-report term-missing
99103

100104
## check: runs linters (includes tests)

pytest.ini

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
[pytest]
2+
markers =
3+
slow: marks tests as slow (deselect with '-m "not slow"')
73.2 KB
Binary file not shown.
Lines changed: 197 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,197 @@
1+
import os
2+
import shutil
3+
4+
import jsons
5+
import pytest
6+
from fastapi.testclient import TestClient
7+
8+
from unstructured_inference import api
9+
from unstructured_inference.inference.layout import DocumentLayout
10+
from unstructured_inference.models.yolox import yolox_local_inference, get_model_loading_info
11+
from unstructured_inference.models.base import UnknownModelException
12+
13+
14+
@pytest.mark.slow
15+
def test_layout_v02_api_parsing_image():
16+
filename = os.path.join("sample-docs", "test-image.jpg")
17+
18+
client = TestClient(api.app)
19+
response = client.post(
20+
"/layout/yolox/image",
21+
headers={"Accept": "multipart/mixed"},
22+
files=[("file", (filename, open(filename, "rb"), "image/png"))],
23+
data={"version": "yolox"},
24+
)
25+
doc_layout = jsons.load(response.json(), DocumentLayout)
26+
assert len(doc_layout.pages) == 1
27+
# NOTE(benjamin) The example sent to the test contains 13 detections
28+
assert len(doc_layout.pages[0]["layout"]) == 13
29+
assert response.status_code == 200
30+
31+
32+
@pytest.mark.slow
33+
def test_layout_v02_api_parsing_pdf():
34+
filename = os.path.join("sample-docs", "loremipsum.pdf")
35+
36+
client = TestClient(api.app)
37+
response = client.post(
38+
"/layout/yolox/pdf",
39+
files={"file": (filename, open(filename, "rb"))},
40+
data={"version": "yolox"},
41+
)
42+
doc_layout = jsons.load(response.json(), DocumentLayout)
43+
assert len(doc_layout.pages) == 1
44+
# NOTE(benjamin) The example sent to the test contains 5 detections
45+
assert len(doc_layout.pages[0]["layout"]) == 5
46+
assert response.status_code == 200
47+
48+
49+
@pytest.mark.slow
50+
def test_layout_v02_api_parsing_pdf_ocr():
51+
filename = os.path.join("sample-docs", "non-embedded.pdf")
52+
53+
client = TestClient(api.app)
54+
response = client.post(
55+
"/layout/yolox/pdf",
56+
files={"file": (filename, open(filename, "rb"))},
57+
data={"force_ocr": True, "version": "yolox"},
58+
)
59+
doc_layout = jsons.load(response.json(), DocumentLayout)
60+
assert len(doc_layout.pages) == 10
61+
assert len(doc_layout.pages[0]["layout"]) > 1
62+
assert response.status_code == 200
63+
64+
65+
@pytest.mark.slow
66+
def test_layout_v02_local_parsing_image():
67+
filename = os.path.join("sample-docs", "test-image.jpg")
68+
OUTPUT_DIR = "yolox_output"
69+
# NOTE(benjamin) keep_output = True create a file for each image in
70+
# localstorage for visualization of the result
71+
if os.path.exists(OUTPUT_DIR):
72+
# NOTE(benjamin): should delete the default output folder on test?
73+
shutil.rmtree(OUTPUT_DIR)
74+
document_layout_1 = yolox_local_inference(
75+
filename, type="image", output_directory=OUTPUT_DIR, version="yolox"
76+
)
77+
assert len(document_layout_1.pages) == 1
78+
document_layout_2 = yolox_local_inference(filename, type="image", version="yolox")
79+
# NOTE(benjamin) The example image should result in one page result
80+
assert len(document_layout_2.pages) == 1
81+
# NOTE(benjamin) The example sent to the test contains 13 detections
82+
assert len(document_layout_2.pages[0].layout) == 13
83+
84+
85+
@pytest.mark.slow
86+
def test_layout_v02_local_parsing_pdf():
87+
filename = os.path.join("sample-docs", "loremipsum.pdf")
88+
document_layout = yolox_local_inference(filename, type="pdf", version="yolox")
89+
content = document_layout.to_string()
90+
assert "Lorem ipsum" in content
91+
assert len(document_layout.pages) == 1
92+
# NOTE(benjamin) The example sent to the test contains 5 detections
93+
assert len(document_layout.pages[0].layout) == 5
94+
95+
96+
@pytest.mark.slow
97+
def test_layout_v02_local_parsing_empty_pdf():
98+
filename = os.path.join("sample-docs", "empty-document.pdf")
99+
document_layout = yolox_local_inference(filename, type="pdf", version="yolox")
100+
assert len(document_layout.pages) == 1
101+
# NOTE(benjamin) The empty example document should produce no detections
102+
assert len(document_layout.pages[0].layout) == 0
103+
104+
105+
def test_invalid_model():
106+
with pytest.raises(UnknownModelException):
107+
get_model_loading_info("invalidmodel")
108+
109+
110+
########################
111+
# ONLY SHORT TESTS BELOW
112+
########################
113+
114+
115+
def test_layout_v02_api_parsing_image_soft():
116+
filename = os.path.join("sample-docs", "test-image.jpg")
117+
118+
client = TestClient(api.app)
119+
response = client.post(
120+
"/layout/yolox/image",
121+
headers={"Accept": "multipart/mixed"},
122+
files=[("file", (filename, open(filename, "rb"), "image/png"))],
123+
data={"version": "yolox_tiny"},
124+
)
125+
doc_layout = jsons.load(response.json(), DocumentLayout)
126+
assert len(doc_layout.pages) == 1
127+
# NOTE(benjamin) Soft version of the test, run make test-slow in order to run with full model
128+
assert len(doc_layout.pages[0]["layout"]) > 0
129+
assert response.status_code == 200
130+
131+
132+
def test_layout_v02_api_parsing_pdf_soft():
133+
filename = os.path.join("sample-docs", "loremipsum.pdf")
134+
135+
client = TestClient(api.app)
136+
response = client.post(
137+
"/layout/yolox/pdf",
138+
files={"file": (filename, open(filename, "rb"))},
139+
data={"version": "yolox_tiny"},
140+
)
141+
doc_layout = jsons.load(response.json(), DocumentLayout)
142+
assert len(doc_layout.pages) == 1
143+
# NOTE(benjamin) Soft version of the test, run make test-slow in order to run with full model
144+
assert len(doc_layout.pages[0]["layout"]) > 0
145+
assert response.status_code == 200
146+
147+
148+
def test_layout_v02_api_parsing_pdf_ocr_soft():
149+
filename = os.path.join("sample-docs", "non-embedded.pdf")
150+
151+
client = TestClient(api.app)
152+
response = client.post(
153+
"/layout/yolox/pdf",
154+
files={"file": (filename, open(filename, "rb"))},
155+
data={"force_ocr": True, "version": "yolox_tiny"},
156+
)
157+
doc_layout = jsons.load(response.json(), DocumentLayout)
158+
assert len(doc_layout.pages) == 10
159+
assert len(doc_layout.pages[0]["layout"]) > 1
160+
assert response.status_code == 200
161+
162+
163+
def test_layout_v02_local_parsing_image_soft():
164+
filename = os.path.join("sample-docs", "test-image.jpg")
165+
OUTPUT_DIR = "yolox_output"
166+
# NOTE(benjamin) keep_output = True create a file for each image in
167+
# localstorage for visualization of the result
168+
if os.path.exists(OUTPUT_DIR):
169+
# NOTE(benjamin): should delete the default output folder on test?
170+
shutil.rmtree(OUTPUT_DIR)
171+
document_layout_1 = yolox_local_inference(
172+
filename, type="image", output_directory=OUTPUT_DIR, version="yolox_tiny"
173+
)
174+
assert len(document_layout_1.pages) == 1
175+
document_layout_2 = yolox_local_inference(filename, type="image", version="yolox_tiny")
176+
# NOTE(benjamin) The example image should result in one page result
177+
assert len(document_layout_2.pages) == 1
178+
# NOTE(benjamin) Soft version of the test, run make test-slow in order to run with full model
179+
assert len(document_layout_2.pages[0].layout) > 0
180+
181+
182+
def test_layout_v02_local_parsing_pdf_soft():
183+
filename = os.path.join("sample-docs", "loremipsum.pdf")
184+
document_layout = yolox_local_inference(filename, type="pdf", version="yolox_tiny")
185+
content = document_layout.to_string()
186+
assert "Lorem ipsum" in content
187+
assert len(document_layout.pages) == 1
188+
# NOTE(benjamin) Soft version of the test, run make test-slow in order to run with full model
189+
assert len(document_layout.pages[0].layout) > 0
190+
191+
192+
def test_layout_v02_local_parsing_empty_pdf_soft():
193+
filename = os.path.join("sample-docs", "empty-document.pdf")
194+
document_layout = yolox_local_inference(filename, type="pdf", version="yolox_tiny")
195+
assert len(document_layout.pages) == 1
196+
# NOTE(benjamin) The empty example document should produce no detections
197+
assert len(document_layout.pages[0].layout) == 0

test_unstructured_inference/test_api.py

Lines changed: 2 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,11 @@
11
import os
2-
import shutil
32

4-
import jsons
53
import pytest
64
from fastapi.testclient import TestClient
75

86
from unstructured_inference import api
97
from unstructured_inference.models import base as models
108
from unstructured_inference.inference.layout import DocumentLayout
11-
from unstructured_inference.models.yolox import yolox_local_inference # DocumentLayout #maybe
129

1310

1411
class MockModel:
@@ -53,89 +50,10 @@ def test_layout_parsing_api(monkeypatch, filetype, ext, data, process_func, expe
5350
def test_bad_route_404():
5451
client = TestClient(api.app)
5552
filename = os.path.join("sample-docs", "loremipsum.pdf")
56-
response = client.post("/layout/badroute", files={"file": (filename, open(filename, "rb"))})
57-
assert response.status_code == 404
58-
59-
60-
def test_layout_v02_api_parsing_image():
61-
filename = os.path.join("sample-docs", "test-image.jpg")
62-
63-
client = TestClient(api.app)
64-
response = client.post(
65-
"/layout/yolox/image",
66-
headers={"Accept": "multipart/mixed"},
67-
files=[("file", (filename, open(filename, "rb"), "image/png"))],
68-
)
69-
doc_layout = jsons.load(response.json(), DocumentLayout)
70-
assert len(doc_layout.pages) == 1
71-
# NOTE(benjamin) The example sent to the test contains 13 detections
72-
assert len(doc_layout.pages[0]["layout"]) == 13
73-
assert response.status_code == 200
74-
75-
76-
def test_layout_v02_api_parsing_pdf():
77-
filename = os.path.join("sample-docs", "loremipsum.pdf")
78-
79-
client = TestClient(api.app)
8053
response = client.post(
81-
"/layout/yolox/pdf",
82-
files={"file": (filename, open(filename, "rb"))},
54+
"/layout/detectron/badroute", files={"file": (filename, open(filename, "rb"))}
8355
)
84-
doc_layout = jsons.load(response.json(), DocumentLayout)
85-
assert len(doc_layout.pages) == 1
86-
# NOTE(benjamin) The example sent to the test contains 5 detections
87-
assert len(doc_layout.pages[0]["layout"]) == 5
88-
assert response.status_code == 200
89-
90-
91-
def test_layout_v02_api_parsing_pdf_ocr():
92-
filename = os.path.join("sample-docs", "non-embedded.pdf")
93-
94-
client = TestClient(api.app)
95-
response = client.post(
96-
"/layout/yolox/pdf",
97-
files={"file": (filename, open(filename, "rb"))},
98-
data={"force_ocr": True},
99-
)
100-
doc_layout = jsons.load(response.json(), DocumentLayout)
101-
assert len(doc_layout.pages) == 10
102-
assert len(doc_layout.pages[0]["layout"]) > 1
103-
assert response.status_code == 200
104-
105-
106-
def test_layout_v02_local_parsing_image():
107-
filename = os.path.join("sample-docs", "test-image.jpg")
108-
OUTPUT_DIR = "yolox_output"
109-
# NOTE(benjamin) keep_output = True create a file for each image in
110-
# localstorage for visualization of the result
111-
if os.path.exists(OUTPUT_DIR):
112-
# NOTE(benjamin): should delete the default output folder on test?
113-
shutil.rmtree(OUTPUT_DIR)
114-
document_layout_1 = yolox_local_inference(filename, type="image", output_directory=OUTPUT_DIR)
115-
assert len(document_layout_1.pages) == 1
116-
document_layout_2 = yolox_local_inference(filename, type="image")
117-
# NOTE(benjamin) The example image should result in one page result
118-
assert len(document_layout_2.pages) == 1
119-
# NOTE(benjamin) The example sent to the test contains 13 detections
120-
assert len(document_layout_2.pages[0].layout) == 13
121-
122-
123-
def test_layout_v02_local_parsing_pdf():
124-
filename = os.path.join("sample-docs", "loremipsum.pdf")
125-
document_layout = yolox_local_inference(filename, type="pdf")
126-
content = document_layout.to_string()
127-
assert "Lorem ipsum" in content
128-
assert len(document_layout.pages) == 1
129-
# NOTE(benjamin) The example sent to the test contains 5 detections
130-
assert len(document_layout.pages[0].layout) == 5
131-
132-
133-
def test_layout_v02_local_parsing_empty_pdf():
134-
filename = os.path.join("sample-docs", "empty-document.pdf")
135-
document_layout = yolox_local_inference(filename, type="pdf")
136-
assert len(document_layout.pages) == 1
137-
# NOTE(benjamin) The example sent to the test contains 5 detections
138-
assert len(document_layout.pages[0].layout) == 0
56+
assert response.status_code == 404
13957

14058

14159
def test_healthcheck(monkeypatch):

unstructured_inference/api.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,10 +47,13 @@ async def layout_parsing_yolox(
4747
request: Request,
4848
file: List[UploadFile] = File(default=None),
4949
force_ocr=Form(default=False),
50+
version=Form(default="yolox"),
5051
):
5152
with tempfile.NamedTemporaryFile() as tmp_file:
5253
tmp_file.write(file[0].file.read())
53-
detections = yolox_local_inference(tmp_file.name, type=filetype, use_ocr=force_ocr)
54+
detections = yolox_local_inference(
55+
tmp_file.name, type=filetype, use_ocr=force_ocr, version=version
56+
)
5457

5558
return detections
5659

0 commit comments

Comments
 (0)