Skip to content

Feature: yolov8 support #324

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 99 additions & 0 deletions test_unstructured_inference/models/test_yolov8.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
import os

import pytest

from unstructured_inference.inference.layout import process_file_with_model


@pytest.mark.slow()
def test_layout_yolov8_local_parsing_image():
    """Run the "yolov8s" model on a sample image and check the detected layout."""
    image_path = os.path.join("sample-docs", "test-image.jpg")
    document_layout = process_file_with_model(image_path, model_name="yolov8s", is_image=True)

    # NOTE(benjamin) The example image should result in one page result
    assert len(document_layout.pages) == 1
    page_elements = document_layout.pages[0].elements

    # NOTE(benjamin) The example sent to the test contains 13 detections
    expected_types = ("Text", "Section-header", "Page-header")
    matching_count = sum(1 for element in page_elements if element.type in expected_types)
    assert matching_count == 13

    first_element = page_elements[0]
    # NOTE(pravin) New Assertion to Make Sure LayoutElement has probabilities
    assert hasattr(first_element, "prob")
    # NOTE(pravin) New Assertion to Make Sure Populated Probability is Float
    assert isinstance(first_element.prob, float)


@pytest.mark.slow()
def test_layout_yolov8_local_parsing_pdf():
    """Run the "yolov8s" model on a one-page PDF and check the text detections."""
    pdf_path = os.path.join("sample-docs", "loremipsum.pdf")
    document_layout = process_file_with_model(pdf_path, model_name="yolov8s")

    assert len(document_layout.pages) == 1
    page_elements = document_layout.pages[0].elements

    # NOTE(benjamin) The example sent to the test contains 5 text detections
    text_count = sum(1 for element in page_elements if element.type == "Text")
    assert text_count == 5

    first_element = page_elements[0]
    # NOTE(pravin) New Assertion to Make Sure LayoutElement has probabilities
    assert hasattr(first_element, "prob")
    # NOTE(pravin) New Assertion to Make Sure Populated Probability is Float
    assert isinstance(first_element.prob, float)


@pytest.mark.slow()
def test_layout_yolov8_local_parsing_empty_pdf():
    """Run the "yolov8s" model on an empty PDF and expect a single page with no elements."""
    pdf_path = os.path.join("sample-docs", "empty-document.pdf")
    document_layout = process_file_with_model(pdf_path, model_name="yolov8s")

    pages = document_layout.pages
    assert len(pages) == 1
    # NOTE(benjamin) The example sent to the test contains 0 detections
    assert len(pages[0].elements) == 0


########################
# ONLY SHORT TESTS BELOW
########################


def test_layout_yolov8_local_parsing_image_soft():
    """Check that the model finds at least one element with a float `prob` on a sample image."""
    # NOTE(review): this "soft" test loads the same "yolov8s" weights as the slow
    # tests; confirm whether the "yolov8n" entry was the intended model here.
    image_path = os.path.join("sample-docs", "example_table.jpg")
    document_layout = process_file_with_model(image_path, model_name="yolov8s", is_image=True)

    # NOTE(benjamin) The example image should result in one page result
    assert len(document_layout.pages) == 1
    page_elements = document_layout.pages[0].elements

    # NOTE(benjamin) Soft version of the test, run make test-long in order to run with full model
    assert len(page_elements) > 0

    first_element = page_elements[0]
    # NOTE(pravin) New Assertion to Make Sure LayoutElement has probabilities
    assert hasattr(first_element, "prob")
    # NOTE(pravin) New Assertion to Make Sure Populated Probability is Float
    assert isinstance(first_element.prob, float)


def test_layout_yolov8_local_parsing_pdf_soft():
    """Check that the model finds at least one element carrying `prob` on a one-page PDF."""
    # NOTE(review): this "soft" test loads the same "yolov8s" weights as the slow
    # tests; confirm whether the "yolov8n" entry was the intended model here.
    pdf_path = os.path.join("sample-docs", "loremipsum.pdf")
    document_layout = process_file_with_model(pdf_path, model_name="yolov8s")

    assert len(document_layout.pages) == 1
    page_elements = document_layout.pages[0].elements

    # NOTE(benjamin) Soft version of the test, run make test-long in order to run with full model
    assert len(page_elements) > 0
    # NOTE(pravin) New Assertion to Make Sure LayoutElement has probabilities
    assert hasattr(page_elements[0], "prob")


def test_layout_yolov8_local_parsing_empty_pdf_soft():
    """Check that an empty PDF yields one page with no elements other than "Image" ones."""
    pdf_path = os.path.join("sample-docs", "empty-document.pdf")
    document_layout = process_file_with_model(pdf_path, model_name="yolov8s")

    assert len(document_layout.pages) == 1
    # NOTE(benjamin) The example sent to the test contains 0 detections
    non_image_count = sum(
        1 for element in document_layout.pages[0].elements if element.type != "Image"
    )
    assert non_image_count == 0
1 change: 1 addition & 0 deletions unstructured_inference/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ class AnnotationResult(Enum):

class Source(Enum):
YOLOX = "yolox"
YOLOv8 = "yolov8"
DETECTRON2_ONNX = "detectron2_onnx"
DETECTRON2_LP = "detectron2_lp"
CHIPPER = "chipper"
Expand Down
9 changes: 9 additions & 0 deletions unstructured_inference/models/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,12 @@
from unstructured_inference.models.yolox import (
UnstructuredYoloXModel,
)
from unstructured_inference.models.yolov8 import (
MODEL_TYPES as YOLOV8_MODEL_TYPES,
)
from unstructured_inference.models.yolov8 import (
UnstructuredYolov8Model,
)

DEFAULT_MODEL = "yolox"

Expand All @@ -35,6 +41,7 @@
**{name: UnstructuredDetectronModel for name in DETECTRON2_MODEL_TYPES},
**{name: UnstructuredDetectronONNXModel for name in DETECTRON2_ONNX_MODEL_TYPES},
**{name: UnstructuredYoloXModel for name in YOLOX_MODEL_TYPES},
**{name: UnstructuredYolov8Model for name in YOLOV8_MODEL_TYPES},
**{name: UnstructuredChipperModel for name in CHIPPER_MODEL_TYPES},
"super_gradients": UnstructuredSuperGradients,
}
Expand Down Expand Up @@ -65,6 +72,8 @@ def get_model(model_name: Optional[str] = None) -> UnstructuredModel:
initialize_params = DETECTRON2_ONNX_MODEL_TYPES[model_name]
elif model_name in YOLOX_MODEL_TYPES:
initialize_params = YOLOX_MODEL_TYPES[model_name]
elif model_name in YOLOV8_MODEL_TYPES:
initialize_params = YOLOV8_MODEL_TYPES[model_name]
elif model_name in CHIPPER_MODEL_TYPES:
initialize_params = CHIPPER_MODEL_TYPES[model_name]
else:
Expand Down
95 changes: 95 additions & 0 deletions unstructured_inference/models/yolov8.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
from typing import List, cast

import numpy as np
from huggingface_hub import hf_hub_download
from PIL import Image
from torchvision.ops import nms

from unstructured_inference.constants import ElementType, Source
from unstructured_inference.inference.layoutelement import LayoutElement
from unstructured_inference.models.unstructuredmodel import UnstructuredObjectDetectionModel
from unstructured_inference.utils import LazyDict, LazyEvaluateInfo
from ultralytics import YOLO

# Maps the integer class index predicted for a box to its layout element type.
# The position in the tuple IS the class index, so the order must not change.
YOLOv8_LABEL_MAP = dict(
    enumerate(
        (
            ElementType.CAPTION,  # 0
            ElementType.FOOTNOTE,  # 1
            ElementType.FORMULA,  # 2
            ElementType.LIST_ITEM,  # 3
            ElementType.PAGE_FOOTER,  # 4
            ElementType.PAGE_HEADER,  # 5
            ElementType.PICTURE,  # 6
            ElementType.SECTION_HEADER,  # 7
            ElementType.TABLE,  # 8
            ElementType.TEXT,  # 9
            ElementType.TITLE,  # 10
        ),
    ),
)

# Registry of the available YOLOv8 variants, keyed by model name. Each entry
# provides `model_path` and `label_map`, the same names as the parameters of
# `UnstructuredYolov8Model.initialize`.
#
# The weights file is described by a LazyEvaluateInfo wrapping
# `hf_hub_download` (presumably so the download only happens when the value is
# first read -- confirm against LazyDict). Nothing is loaded at import time:
# a previous module-level `YOLO(...)` call pointing at a developer-local
# checkpoint path was removed because it made importing this module fail on
# any machine without that file, and its result was never used.
MODEL_TYPES = {
    "yolov8n": LazyDict(
        model_path=LazyEvaluateInfo(
            hf_hub_download,
            "neuralshift/doc-layout-yolov8n",
            "weights/best.pt",
        ),
        label_map=YOLOv8_LABEL_MAP,
    ),
    "yolov8s": LazyDict(
        model_path=LazyEvaluateInfo(
            hf_hub_download,
            "neuralshift/doc-layout-yolov8s",
            "weights/best.pt",
        ),
        label_map=YOLOv8_LABEL_MAP,
    ),
}


class UnstructuredYolov8Model(UnstructuredObjectDetectionModel):
    """Layout detection model that wraps an ultralytics `YOLO` checkpoint."""

    # (width, height) every input image is resized to before inference.
    INPUT_SHAPE = (640, 640)
    # IoU threshold of the torchvision NMS pass applied to the predicted boxes.
    NMS_IOU_THRESHOLD = 0.1
    # Boxes whose confidence is not strictly above this value are dropped.
    CONFIDENCE_THRESHOLD = 0.3

    def predict(self, x: Image.Image) -> List[LayoutElement]:
        """Predict using Yolov8 model.

        Delegates to the base-class `predict` first, then runs
        `image_processing` on `x` and returns its result.
        """
        super().predict(x)
        return self.image_processing(x)

    def initialize(self, model_path: str, label_map: dict) -> None:
        """Start inference session for Yolov8 model.

        Parameters
        ----------
        model_path
            Path of the weights file handed to `YOLO`.
        label_map
            Mapping from predicted class index to element type.
        """
        self.model = YOLO(model=model_path)
        self.layout_classes = label_map

    def image_processing(
        self,
        image: Image.Image = None,
    ) -> List[LayoutElement]:
        """Method running Yolov8 for layout detection, returns a list of
        LayoutElement

        Parameters
        ----------
        image
            Image to process

        Returns
        -------
        The detected elements, in coordinates of the original (un-resized)
        image, sorted by ascending `bbox.y1`.

        Raises
        ------
        ValueError
            If `image` is None.
        """
        if image is None:
            # The default exists only for signature compatibility; fail with a
            # clear message instead of an AttributeError on `None.resize`.
            raise ValueError("image_processing requires an image, got None")

        input_shape = self.INPUT_SHAPE
        processed_image = image.resize(input_shape, Image.BILINEAR)
        # Per-axis (x, y) factor between the resized and the original image;
        # dividing by it maps box coordinates back onto the original image.
        ratio = np.array(input_shape) / np.array(image.size)

        # NMS, followed by a confidence cut-off
        boxes = self.model(processed_image, verbose=False)[0].boxes
        kept_indices = nms(boxes.xyxy, boxes.conf, self.NMS_IOU_THRESHOLD)
        boxes = boxes[kept_indices]
        boxes = boxes[boxes.conf > self.CONFIDENCE_THRESHOLD]

        regions = []
        for box in boxes:
            # Each `box` holds a single detection, so row 0 is its (x1, y1, x2, y2).
            x1, y1, x2, y2 = box.xyxy[0].tolist()
            regions.append(
                LayoutElement.from_coords(
                    x1 / ratio[0],
                    y1 / ratio[1],
                    x2 / ratio[0],
                    y2 / ratio[1],
                    text=None,
                    type=self.layout_classes[int(box.cls.item())],
                    prob=box.conf.item(),
                    source=Source.YOLOv8,
                ),
            )
        regions.sort(key=lambda element: element.bbox.y1)

        page_layout = cast(List[LayoutElement], regions)  # TODO(benjamin): encode image as base64?

        return page_layout