Unstructured-IO · jlcsilva · Feb 19, 2024 · Feb 19, 2024 · Feb 19, 2024 · Feb 19, 2024
diff --git a/test_unstructured_inference/models/test_yolov8.py b/test_unstructured_inference/models/test_yolov8.py
@@ -0,0 +1,99 @@
+import os
+
+import pytest
+
+from unstructured_inference.inference.layout import process_file_with_model
+
+
+@pytest.mark.slow()
+def test_layout_yolov8_local_parsing_image():
+    """Slow test: the sample image yields one page holding 13 elements of known types."""
+    image_path = os.path.join("sample-docs", "test-image.jpg")
+    layout = process_file_with_model(image_path, model_name="yolov8s", is_image=True)
+
+    # A single image is expected to come back as exactly one page.
+    pages = layout.pages
+    assert len(pages) == 1
+
+    # Only elements whose type is one of these are counted for this sample.
+    expected_types = ["Text", "Section-header", "Page-header"]
+    elements = pages[0].elements
+    matching = [el for el in elements if el.type in expected_types]
+    assert len(matching) == 13
+
+    # The first element must carry its detection probability as a float.
+    first = elements[0]
+    assert hasattr(first, "prob")
+    assert isinstance(first.prob, float)
+
+
+@pytest.mark.slow()
+def test_layout_yolov8_local_parsing_pdf():
+    """Slow test: the lorem-ipsum PDF yields one page with five "Text" elements."""
+    pdf_path = os.path.join("sample-docs", "loremipsum.pdf")
+    layout = process_file_with_model(pdf_path, model_name="yolov8s")
+    pages = layout.pages
+    assert len(pages) == 1
+    # The sample document is expected to produce exactly five "Text" detections.
+    elements = pages[0].elements
+    text_count = sum(1 for el in elements if el.type == "Text")
+    assert text_count == 5
+
+    # The first element must carry its detection probability as a float.
+    first = elements[0]
+    assert hasattr(first, "prob")
+    assert isinstance(first.prob, float)
+
+
+@pytest.mark.slow()
+def test_layout_yolov8_local_parsing_empty_pdf():
+    """Slow test: an empty PDF yields a single page without any detected element."""
+    empty_pdf = os.path.join("sample-docs", "empty-document.pdf")
+    pages = process_file_with_model(empty_pdf, model_name="yolov8s").pages
+    assert len(pages) == 1
+    assert len(pages[0].elements) == 0
+
+
+########################
+# ONLY SHORT TESTS BELOW
+########################
+
+
+def test_layout_yolov8_local_parsing_image_soft():
+    """Short test: the sample table image yields one page with at least one element."""
+    image_path = os.path.join("sample-docs", "example_table.jpg")
+    layout = process_file_with_model(image_path, model_name="yolov8s", is_image=True)
+
+    # A single image is expected to come back as exactly one page.
+    pages = layout.pages
+    assert len(pages) == 1
+
+    # Soft version of the test, run make test-long in order to run with full model.
+    elements = pages[0].elements
+    assert len(elements) > 0
+
+    # The first element must carry its detection probability as a float.
+    first = elements[0]
+    assert hasattr(first, "prob")
+    assert isinstance(first.prob, float)
+
+
+def test_layout_yolov8_local_parsing_pdf_soft():
+    """Short test: the lorem-ipsum PDF yields one page with at least one element."""
+    pdf_path = os.path.join("sample-docs", "loremipsum.pdf")
+    pages = process_file_with_model(pdf_path, model_name="yolov8s").pages
+    assert len(pages) == 1
+    # Soft version of the test, run make test-long in order to run with full model.
+    elements = pages[0].elements
+    assert len(elements) > 0
+    assert hasattr(elements[0], "prob")
+    assert isinstance(elements[0].prob, float)  # same float check as the other prob tests
+
+
+def test_layout_yolov8_local_parsing_empty_pdf_soft():
+    """Short test: an empty PDF yields one page with no element other than images."""
+    empty_pdf = os.path.join("sample-docs", "empty-document.pdf")
+    pages = process_file_with_model(empty_pdf, model_name="yolov8s").pages
+    assert len(pages) == 1
+    non_image_elements = [item for item in pages[0].elements if item.type != "Image"]
+    assert len(non_image_elements) == 0
diff --git a/unstructured_inference/constants.py b/unstructured_inference/constants.py
@@ -8,6 +8,7 @@ class AnnotationResult(Enum):
 
 class Source(Enum):
     YOLOX = "yolox"
+    YOLOv8 = "yolov8"
     DETECTRON2_ONNX = "detectron2_onnx"
     DETECTRON2_LP = "detectron2_lp"
     CHIPPER = "chipper"

diff --git a/unstructured_inference/models/base.py b/unstructured_inference/models/base.py
@@ -26,6 +26,12 @@
 from unstructured_inference.models.yolox import (
     UnstructuredYoloXModel,
 )
+from unstructured_inference.models.yolov8 import (
+    MODEL_TYPES as YOLOV8_MODEL_TYPES,
+)
+from unstructured_inference.models.yolov8 import (
+    UnstructuredYolov8Model,
+)
 
 DEFAULT_MODEL = "yolox"
 
@@ -35,6 +41,7 @@
     **{name: UnstructuredDetectronModel for name in DETECTRON2_MODEL_TYPES},
     **{name: UnstructuredDetectronONNXModel for name in DETECTRON2_ONNX_MODEL_TYPES},
     **{name: UnstructuredYoloXModel for name in YOLOX_MODEL_TYPES},
+    **{name: UnstructuredYolov8Model for name in YOLOV8_MODEL_TYPES},
     **{name: UnstructuredChipperModel for name in CHIPPER_MODEL_TYPES},
     "super_gradients": UnstructuredSuperGradients,
 }
@@ -65,6 +72,8 @@ def get_model(model_name: Optional[str] = None) -> UnstructuredModel:
             initialize_params = DETECTRON2_ONNX_MODEL_TYPES[model_name]
         elif model_name in YOLOX_MODEL_TYPES:
             initialize_params = YOLOX_MODEL_TYPES[model_name]
+        elif model_name in YOLOV8_MODEL_TYPES:
+            initialize_params = YOLOV8_MODEL_TYPES[model_name]
         elif model_name in CHIPPER_MODEL_TYPES:
             initialize_params = CHIPPER_MODEL_TYPES[model_name]
         else:

diff --git a/unstructured_inference/models/yolov8.py b/unstructured_inference/models/yolov8.py
@@ -0,0 +1,95 @@
+from typing import List, cast
+
+import numpy as np
+from huggingface_hub import hf_hub_download
+from PIL import Image
+from torchvision.ops import nms
+
+from unstructured_inference.constants import ElementType, Source
+from unstructured_inference.inference.layoutelement import LayoutElement
+from unstructured_inference.models.unstructuredmodel import UnstructuredObjectDetectionModel
+from unstructured_inference.utils import LazyDict, LazyEvaluateInfo
+from ultralytics import YOLO
+
+YOLOv8_LABEL_MAP = {  # predicted class index -> layout element type
+    0: ElementType.CAPTION,
+    1: ElementType.FOOTNOTE,
+    2: ElementType.FORMULA,
+    3: ElementType.LIST_ITEM,
+    4: ElementType.PAGE_FOOTER,
+    5: ElementType.PAGE_HEADER,
+    6: ElementType.PICTURE,
+    7: ElementType.SECTION_HEADER,
+    8: ElementType.TABLE,
+    9: ElementType.TEXT,
+    10: ElementType.TITLE,
+}
+
+# Parameters handed to `initialize` for each model name; weights download is presumably lazy.
+MODEL_TYPES = {
+    "yolov8n": LazyDict(
+        model_path=LazyEvaluateInfo(
+            hf_hub_download,
+            "neuralshift/doc-layout-yolov8n",
+            "weights/best.pt",
+        ),
+        label_map=YOLOv8_LABEL_MAP,
+    ),
+    "yolov8s": LazyDict(
+        model_path=LazyEvaluateInfo(
+            hf_hub_download,
+            "neuralshift/doc-layout-yolov8s",
+            "weights/best.pt",
+        ),
+        label_map=YOLOv8_LABEL_MAP,
+    ),
+}
+
+
+class UnstructuredYolov8Model(UnstructuredObjectDetectionModel):
+    """Layout detection model that wraps an ultralytics YOLOv8 checkpoint."""
+
+    input_shape = (640, 640)  # size the page image is resized to before inference
+    nms_iou_threshold = 0.1  # IoU threshold passed to torchvision's NMS
+    min_confidence = 0.3  # only detections scoring above this are kept
+
+    def predict(self, x: Image.Image) -> List[LayoutElement]:
+        """Run the base-class `predict` on `x`, then return its layout elements."""
+        super().predict(x)
+        return self.image_processing(x)
+
+    def initialize(self, model_path: str, label_map: dict):
+        """Load the YOLOv8 weights from `model_path` and keep the class-index map."""
+        self.model = YOLO(model=model_path)
+        self.layout_classes = label_map
+
+    def image_processing(self, image: Image.Image = None) -> List[LayoutElement]:
+        """Detect layout regions in `image` with YOLOv8.
+
+        The image is resized to `input_shape` for inference and the predicted boxes
+        are scaled back to the original size. Elements are sorted by `bbox.y1`.
+        """
+        processed_image = image.resize(self.input_shape, Image.BILINEAR)
+        # Per-axis (x, y) scale factor from the original image to the resized one.
+        ratio_x, ratio_y = np.array(self.input_shape) / np.array(image.size)
+
+        boxes = self.model(processed_image, verbose=False)[0].boxes
+        # Suppress overlapping boxes first, then drop low-confidence detections.
+        boxes = boxes[nms(boxes.xyxy, boxes.conf, self.nms_iou_threshold)]
+        boxes = boxes[boxes.conf > self.min_confidence]
+
+        regions = []
+        for box in boxes:
+            x1, y1, x2, y2 = box.xyxy[0].tolist()  # corners in resized-image pixels
+            element = LayoutElement.from_coords(
+                x1 / ratio_x,
+                y1 / ratio_y,
+                x2 / ratio_x,
+                y2 / ratio_y,
+                text=None,
+                type=self.layout_classes[int(box.cls.item())],
+                prob=box.conf.item(),
+                source=Source.YOLOv8,
+            )
+            regions.append(element)
+        return cast(List[LayoutElement], sorted(regions, key=lambda el: el.bbox.y1))