Skip to content

Commit a21325f

Browse files
authored
chore: cut release for image processing (#13)
- Added is_image flag for process_data_with_model and process_file_with_model
- Added support for the above in the API
- Added testing for all of the above
1 parent b8238fe commit a21325f

File tree

5 files changed

+41
-25
lines changed

5 files changed

+41
-25
lines changed

CHANGELOG.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## 0.2.2-dev1
1+
## 0.2.2
22

33
* Add capability to process image files
44
* Add logic to use OCR when layout text is full of unknown characters

test_unstructured_inference/test_api.py

Lines changed: 18 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -9,50 +9,52 @@
99
import unstructured_inference.models.detectron2 as detectron2
1010

1111

12-
@pytest.fixture
13-
def sample_pdf_content():
14-
return """
15-
this is the content of a sample pdf file.
16-
Title: ...
17-
Author: ...
18-
"""
19-
20-
2112
class MockModel:
2213
def __init__(self, *args, **kwargs):
2314
self.args = args
2415
self.kwargs = kwargs
2516

2617

27-
def test_layout_parsing_pdf_api(sample_pdf_content, tmpdir, monkeypatch):
18+
@pytest.mark.parametrize("filetype, ext", [("pdf", "pdf"), ("image", "png")])
19+
def test_layout_parsing_api(monkeypatch, filetype, ext):
2820
monkeypatch.setattr(models, "load_model", lambda *args, **kwargs: MockModel(*args, **kwargs))
2921
monkeypatch.setattr(models, "hf_hub_download", lambda *args, **kwargs: "fake-path")
3022
monkeypatch.setattr(detectron2, "is_detectron2_available", lambda *args: True)
3123
monkeypatch.setattr(
3224
DocumentLayout, "from_file", lambda *args, **kwargs: DocumentLayout.from_pages([])
3325
)
26+
monkeypatch.setattr(
27+
DocumentLayout, "from_image_file", lambda *args, **kwargs: DocumentLayout.from_pages([])
28+
)
3429

35-
filename = os.path.join(tmpdir.dirname, "sample.pdf")
36-
with open(filename, "w") as f:
37-
f.write(sample_pdf_content)
30+
filename = os.path.join("sample-docs", f"loremipsum.{ext}")
3831

3932
client = TestClient(app)
40-
response = client.post("/layout/pdf", files={"file": (filename, open(filename, "rb"))})
33+
response = client.post(f"/layout/{filetype}", files={"file": (filename, open(filename, "rb"))})
4134
assert response.status_code == 200
4235

4336
response = client.post(
44-
"/layout/pdf", files={"file": (filename, open(filename, "rb"))}, data={"model": "checkbox"}
37+
f"/layout/{filetype}",
38+
files={"file": (filename, open(filename, "rb"))},
39+
data={"model": "checkbox"},
4540
)
4641
assert response.status_code == 200
4742

4843
response = client.post(
49-
"/layout/pdf",
44+
f"/layout/{filetype}",
5045
files={"file": (filename, open(filename, "rb"))},
5146
data={"model": "fake_model"},
5247
)
5348
assert response.status_code == 422
5449

5550

51+
def test_bad_route_404():
52+
client = TestClient(app)
53+
filename = os.path.join("sample-docs", "loremipsum.pdf")
54+
response = client.post("/layout/badroute", files={"file": (filename, open(filename, "rb"))})
55+
assert response.status_code == 404
56+
57+
5658
def test_healthcheck(monkeypatch):
5759
client = TestClient(app)
5860
response = client.get("/healthcheck")
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.2.2-dev1" # pragma: no cover
1+
__version__ = "0.2.2" # pragma: no cover

unstructured_inference/api.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,16 +6,21 @@
66
app = FastAPI()
77

88
ALL_ELEMS = "_ALL"
9+
VALID_FILETYPES = ["pdf", "image"]
910

1011

11-
@app.post("/layout/pdf")
12-
async def layout_parsing_pdf(
12+
@app.post("/layout/{filetype:path}")
13+
async def layout_parsing(
14+
filetype: str,
1315
file: UploadFile = File(),
1416
include_elems: List[str] = Form(default=ALL_ELEMS),
1517
model: str = Form(default=None),
1618
):
19+
if filetype not in VALID_FILETYPES:
20+
raise HTTPException(status.HTTP_404_NOT_FOUND)
21+
is_image = filetype == "image"
1722
try:
18-
layout = process_data_with_model(file.file, model)
23+
layout = process_data_with_model(file.file, model, is_image)
1924
except UnknownModelException as e:
2025
raise HTTPException(status.HTTP_422_UNPROCESSABLE_ENTITY, str(e))
2126
pages_layout = [

unstructured_inference/inference/layout.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -159,21 +159,29 @@ def _get_image_array(self) -> Union[np.ndarray, None]:
159159
return self.image_array
160160

161161

162-
def process_data_with_model(data: BinaryIO, model_name: str) -> DocumentLayout:
162+
def process_data_with_model(
163+
data: BinaryIO, model_name: Optional[str], is_image: bool = False
164+
) -> DocumentLayout:
163165
"""Processes pdf file in the form of a file handler (supporting a read method) into a
164166
DocumentLayout by using a model identified by model_name."""
165167
with tempfile.NamedTemporaryFile() as tmp_file:
166168
tmp_file.write(data.read())
167-
layout = process_file_with_model(tmp_file.name, model_name)
169+
layout = process_file_with_model(tmp_file.name, model_name, is_image=is_image)
168170

169171
return layout
170172

171173

172-
def process_file_with_model(filename: str, model_name: str) -> DocumentLayout:
174+
def process_file_with_model(
175+
filename: str, model_name: Optional[str], is_image: bool = False
176+
) -> DocumentLayout:
173177
"""Processes pdf file with name filename into a DocumentLayout by using a model identified by
174178
model_name."""
175179
model = None if model_name is None else get_model(model_name)
176-
layout = DocumentLayout.from_file(filename, model=model)
180+
layout = (
181+
DocumentLayout.from_image_file(filename, model=model)
182+
if is_image
183+
else DocumentLayout.from_file(filename, model=model)
184+
)
177185
return layout
178186

179187

@@ -188,6 +196,7 @@ def cid_ratio(text: str) -> float:
188196

189197

190198
def is_cid_present(text: str) -> bool:
199+
"""Checks if a cid code is present in a text selection."""
191200
if len(text) < len("(cid:x)"):
192201
return False
193202
return text.find("(cid:") != -1

0 commit comments

Comments
 (0)