chore: cut release for image processing (#13)

qued · web-flow · commit a21325f6894b · 2023-01-12T13:21:54.000-06:00
- Added is_image flag for process_data_with_model and process_file_with_model
- Added support for the above in the API
- Added testing for all of the above
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.2.2-dev1
+## 0.2.2
 
 * Add capability to process image files
 * Add logic to use OCR when layout text is full of unknown characters
diff --git a/test_unstructured_inference/test_api.py b/test_unstructured_inference/test_api.py
@@ -9,50 +9,52 @@
 import unstructured_inference.models.detectron2 as detectron2
 
 
-@pytest.fixture
-def sample_pdf_content():
-    return """
-    this is the content of a sample pdf file.
-    Title: ...
-    Author: ...
-    """
-
-
 class MockModel:
     def __init__(self, *args, **kwargs):
         self.args = args
         self.kwargs = kwargs
 
 
-def test_layout_parsing_pdf_api(sample_pdf_content, tmpdir, monkeypatch):
+@pytest.mark.parametrize("filetype, ext", [("pdf", "pdf"), ("image", "png")])
+def test_layout_parsing_api(monkeypatch, filetype, ext):
     monkeypatch.setattr(models, "load_model", lambda *args, **kwargs: MockModel(*args, **kwargs))
     monkeypatch.setattr(models, "hf_hub_download", lambda *args, **kwargs: "fake-path")
     monkeypatch.setattr(detectron2, "is_detectron2_available", lambda *args: True)
     monkeypatch.setattr(
         DocumentLayout, "from_file", lambda *args, **kwargs: DocumentLayout.from_pages([])
     )
+    monkeypatch.setattr(
+        DocumentLayout, "from_image_file", lambda *args, **kwargs: DocumentLayout.from_pages([])
+    )
 
-    filename = os.path.join(tmpdir.dirname, "sample.pdf")
-    with open(filename, "w") as f:
-        f.write(sample_pdf_content)
+    filename = os.path.join("sample-docs", f"loremipsum.{ext}")
 
     client = TestClient(app)
-    response = client.post("/layout/pdf", files={"file": (filename, open(filename, "rb"))})
+    response = client.post(f"/layout/{filetype}", files={"file": (filename, open(filename, "rb"))})
     assert response.status_code == 200
 
     response = client.post(
-        "/layout/pdf", files={"file": (filename, open(filename, "rb"))}, data={"model": "checkbox"}
+        f"/layout/{filetype}",
+        files={"file": (filename, open(filename, "rb"))},
+        data={"model": "checkbox"},
     )
     assert response.status_code == 200
 
     response = client.post(
-        "/layout/pdf",
+        f"/layout/{filetype}",
         files={"file": (filename, open(filename, "rb"))},
         data={"model": "fake_model"},
     )
     assert response.status_code == 422
 
 
+def test_bad_route_404():
+    client = TestClient(app)
+    filename = os.path.join("sample-docs", "loremipsum.pdf")
+    response = client.post("/layout/badroute", files={"file": (filename, open(filename, "rb"))})
+    assert response.status_code == 404
+
+
 def test_healthcheck(monkeypatch):
     client = TestClient(app)
     response = client.get("/healthcheck")
diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py
@@ -1 +1 @@
-__version__ = "0.2.2-dev1"  # pragma: no cover
+__version__ = "0.2.2"  # pragma: no cover
diff --git a/unstructured_inference/api.py b/unstructured_inference/api.py
@@ -6,16 +6,21 @@
 app = FastAPI()
 
 ALL_ELEMS = "_ALL"
+VALID_FILETYPES = ["pdf", "image"]
 
 
-@app.post("/layout/pdf")
-async def layout_parsing_pdf(
+@app.post("/layout/{filetype:path}")
+async def layout_parsing(
+    filetype: str,
     file: UploadFile = File(),
     include_elems: List[str] = Form(default=ALL_ELEMS),
     model: str = Form(default=None),
 ):
+    if filetype not in VALID_FILETYPES:
+        raise HTTPException(status.HTTP_404_NOT_FOUND)
+    is_image = filetype == "image"
     try:
-        layout = process_data_with_model(file.file, model)
+        layout = process_data_with_model(file.file, model, is_image)
     except UnknownModelException as e:
         raise HTTPException(status.HTTP_422_UNPROCESSABLE_ENTITY, str(e))
     pages_layout = [
diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py
@@ -159,21 +159,29 @@ def _get_image_array(self) -> Union[np.ndarray, None]:
         return self.image_array
 
 
-def process_data_with_model(data: BinaryIO, model_name: str) -> DocumentLayout:
+def process_data_with_model(
+    data: BinaryIO, model_name: Optional[str], is_image: bool = False
+) -> DocumentLayout:
     """Processes pdf file in the form of a file handler (supporting a read method) into a
     DocumentLayout by using a model identified by model_name."""
     with tempfile.NamedTemporaryFile() as tmp_file:
         tmp_file.write(data.read())
-        layout = process_file_with_model(tmp_file.name, model_name)
+        layout = process_file_with_model(tmp_file.name, model_name, is_image=is_image)
 
     return layout
 
 
-def process_file_with_model(filename: str, model_name: str) -> DocumentLayout:
+def process_file_with_model(
+    filename: str, model_name: Optional[str], is_image: bool = False
+) -> DocumentLayout:
     """Processes pdf file with name filename into a DocumentLayout by using a model identified by
     model_name."""
     model = None if model_name is None else get_model(model_name)
-    layout = DocumentLayout.from_file(filename, model=model)
+    layout = (
+        DocumentLayout.from_image_file(filename, model=model)
+        if is_image
+        else DocumentLayout.from_file(filename, model=model)
+    )
     return layout
 
 
@@ -188,6 +196,7 @@ def cid_ratio(text: str) -> float:
 
 
 def is_cid_present(text: str) -> bool:
+    """Checks if a cid code is present in a text selection."""
     if len(text) < len("(cid:x)"):
         return False
     return text.find("(cid:") != -1

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -1,4 +1,4 @@` |
| `1` |  | `-## 0.2.2-dev1` |
|  | `1` | `+## 0.2.2` |
| `2` | `2` |  |
| `3` | `3` | `* Add capability to process image files` |
| `4` | `4` | `* Add logic to use OCR when layout text is full of unknown characters` |

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -1 +1 @@` |
| `1` |  | `-__version__ = "0.2.2-dev1" # pragma: no cover` |
|  | `1` | `+__version__ = "0.2.2" # pragma: no cover` |