Skip to content

Commit 80167f5

Browse files
authored
Merge pull request #1636 from anuprulez/doclayout_yolo
Add inference engine for segmenting texts in documents using Yolo
2 parents 67e0e1d + 5979718 commit 80167f5

File tree

6 files changed

+225
-1
lines changed

6 files changed

+225
-1
lines changed
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
name: doclayoutyolo
2+
owner: bgruening
3+
description: Tool for segmenting text in images.
4+
long_description: Tool for segmenting text in images using a Yolo model pretrained on a large number of documents.
5+
remote_repository_url: https://github.com/bgruening/galaxytools/tree/master/tools/image_processing/yolo-utils/doclayoutyolo
6+
homepage_url: https://github.com/bgruening/galaxytools/tree/master/tools/image_processing/yolo-utils/doclayoutyolo
7+
type:
8+
categories:
9+
- Machine Learning
10+
- Imaging
11+
maintainers:
12+
- anuprulez
Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
<tool id="doclayoutyolo" name="DocLayout-YOLO" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <description>Enhancing document layout analysis</description>
    <!-- Tokens substituted into the tool attributes above and the container tag below -->
    <macros>
        <token name="@TOOL_VERSION@">0.0.4.1</token>
        <token name="@VERSION_SUFFIX@">0</token>
        <token name="@PROFILE@">24.2</token>
    </macros>
    <creator>
        <organization name="European Galaxy Team" url="https://galaxyproject.org/eu/"/>
        <person givenName="Anup" familyName="Kumar" email="kumara@informatik.uni-freiburg.de"/>
    </creator>
    <!-- The tool runs inside a docker container tagged with the tool version -->
    <requirements>
        <container type="docker">quay.io/galaxy/doclayout-yolo:@TOOL_VERSION@</container>
    </requirements>
    <required_files>
        <include path="segment_text_yolo.py"/>
    </required_files>
    <!-- Every option of segment_text_yolo.py is mandatory; the dataset extension is passed
         separately so the script can name its working files with it -->
    <command detect_errors="aggressive"><![CDATA[
        python '$__tool_directory__/segment_text_yolo.py'
            --yolo_model '$input_yolo_model'
            --input_image '$input_image'
            --input_image_ext '$input_image.ext'
            --input_confidence '$input_confidence'
            --input_image_size '$input_image_size'
            --output_image '$output_image'
            --output_geojson '$output_segmentation_coordinates'
    ]]>
    </command>
    <inputs>
        <!-- NOTE(review): the model is declared as zip while the help asks for a .pt file;
             presumably .pt checkpoints are detected as zip archives on upload. Confirm. -->
        <param name="input_yolo_model" type="data" format="zip" label="Yolo model" help="Please upload a Yolo model."/>
        <param name="input_image" type="data" format="tiff,jpg,png" label="Input image" help="Please provide an input image for the analysis."/>
        <param name="input_confidence" type="float" label="Confidence" value="0.5" min="0.0" max="1.0" help="Set confidence threshold between 0.0 and 1.0 for drawing bounding boxes. Higher values indicate higher probability of segmentation."/>
        <param name="input_image_size" type="integer" label="Image size" value="1024" min="1" max="1500" help="Set input image size for image resize by Doclayout Yolo model. Larger values may provide better accuracy in segmentation but could be slower. Lower values might be faster with lower accuracy."/>
    </inputs>
    <outputs>
        <!-- The annotated image inherits the datatype of the input image -->
        <data format_source="input_image" name="output_image" label="Segmented image"/>
        <data format="geojson" name="output_segmentation_coordinates" label="Segmented coordinates"/>
    </outputs>
    <!-- One test per accepted image format (png, jpg, tiff); each checks the size of the
         annotated image and the presence of GeoJSON keywords in the coordinates file -->
    <tests>
        <test>
            <param name="input_yolo_model" value="input_yolo_model.zip" location="https://huggingface.co/juliozhao/DocLayout-YOLO-DocLayNet-Docsynth300K_pretrained/resolve/main/doclayout_yolo_doclaynet_imgsz1120_docsynth_pretrain.pt?download=true"/>
            <param name="input_image" value="input_image_png.png"/>
            <param name="input_confidence" value="0.5"/>
            <param name="input_image_size" value="1024"/>
            <output name="output_image" ftype="png">
                <assert_contents>
                    <has_size size="920950" delta="100" />
                </assert_contents>
            </output>
            <output name="output_segmentation_coordinates" ftype="geojson">
                <assert_contents>
                    <has_text text="Polygon" />
                    <has_text text="Feature" />
                    <has_text text="coordinates" />
                </assert_contents>
            </output>
        </test>
        <test>
            <param name="input_yolo_model" value="input_yolo_model.zip" location="https://huggingface.co/juliozhao/DocLayout-YOLO-DocLayNet-Docsynth300K_pretrained/resolve/main/doclayout_yolo_doclaynet_imgsz1120_docsynth_pretrain.pt?download=true"/>
            <param name="input_image" value="input_image_jpg.jpg" location="https://zenodo.org/records/15649779/files/input_image_jpg.jpg?download=1"/>
            <param name="input_confidence" value="0.5"/>
            <param name="input_image_size" value="1024"/>
            <output name="output_image" ftype="jpg">
                <assert_contents>
                    <has_size size="2753175" delta="100" />
                </assert_contents>
            </output>
            <output name="output_segmentation_coordinates" ftype="geojson">
                <assert_contents>
                    <has_text text="Polygon" />
                    <has_text text="Feature" />
                    <has_text text="coordinates" />
                </assert_contents>
            </output>
        </test>
        <test>
            <param name="input_yolo_model" value="input_yolo_model.zip" location="https://huggingface.co/juliozhao/DocLayout-YOLO-DocLayNet-Docsynth300K_pretrained/resolve/main/doclayout_yolo_doclaynet_imgsz1120_docsynth_pretrain.pt?download=true"/>
            <param name="input_image" value="input_image_tiff.tif"/>
            <param name="input_confidence" value="0.5"/>
            <param name="input_image_size" value="1024"/>
            <output name="output_image" ftype="tiff">
                <assert_contents>
                    <has_size size="510756" delta="100" />
                </assert_contents>
            </output>
            <output name="output_segmentation_coordinates" ftype="geojson">
                <assert_contents>
                    <has_text text="Polygon" />
                    <has_text text="Feature" />
                    <has_text text="coordinates" />
                </assert_contents>
            </output>
        </test>
    </tests>
    <!-- Help is reStructuredText: bullet lists must be separated from the preceding
         paragraph by a blank line, otherwise they render as running text -->
    <help>
<![CDATA[
**What it does**

The tool takes a Yolo model trained for annotating bounding boxes around text. It takes a pretrained Yolo model and predicts bounding boxes in the input image where any text is found.
It is based on document layout analysis: https://github.com/opendatalab/DocLayout-YOLO. The Yolo model can be downloaded from: https://huggingface.co/juliozhao/DocLayout-YOLO-DocLayNet-Docsynth300K_pretrained/tree/main


**Input files**

- Yolo model (as ``.pt`` file)
- Input image containing text
- Confidence score to be used for drawing bounding boxes
- Image size to be resized to by Yolo model

**Output files**

- Segmented image
- Coordinates of bounding boxes as Geojson file

]]>
    </help>
    <citations>
        <citation type="bibtex">
            @ARTICLE{zhao2024doclayoutyoloenhancingdocumentlayout,
                Author = {Zhao, Zhiyuan and others},
                title = {{DocLayout-YOLO: Enhancing Document Layout Analysis through Diverse Synthetic Data and Global-to-Local Adaptive Perception}},
                year = {2024},
                url = {https://github.com/opendatalab/DocLayout-YOLO}
            }
        </citation>
    </citations>
</tool>
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
"""
2+
Segment text using DocLayout Yolo model
3+
"""
4+
5+
import argparse
6+
import json
7+
import os
8+
9+
import cv2
10+
from doclayout_yolo import YOLOv10
11+
from geojson import Feature, FeatureCollection
12+
from shapely.geometry import box, mapping
13+
14+
15+
def load_model_and_predict(
    model_path, input_image_path, input_confidence, image_size, output_image_path
):
    """
    Run a DocLayout-YOLO model on one image and save the annotated picture.

    Args:
        model_path: Path to the model weights, passed to ``YOLOv10(model=...)``.
        input_image_path: Path of the image handed to ``model.predict``.
        input_confidence: Confidence threshold; converted with ``float()``.
        image_size: Inference image size; converted with ``int()``.
        output_image_path: Destination path given to ``cv2.imwrite``.

    Returns:
        The first element of the list returned by ``model.predict``.

    Raises:
        OSError: If ``cv2.imwrite`` reports that the image was not written.
    """
    model = YOLOv10(model=model_path)

    det_res = model.predict(
        input_image_path, imgsz=int(image_size), conf=float(input_confidence)
    )
    first_result = det_res[0]
    annotated_frame = first_result.plot(pil=True, line_width=5, font_size=20)
    # cv2.imwrite signals failure through its boolean return value rather than
    # an exception; without this check a failed write would go unnoticed.
    if not cv2.imwrite(output_image_path, annotated_frame):
        raise OSError(f"Could not write annotated image to '{output_image_path}'")
    return first_result
27+
28+
29+
def extract_bb_crop(results, output_segmentation_coordiates):
    """
    Save the bounding boxes of a prediction result as a GeoJSON file.

    Each row of ``results.boxes.xyxy`` is unpacked into four values that are
    passed, in order, to ``shapely.geometry.box``. The resulting polygon is
    stored as a feature whose ``id`` property is the zero-based row index.

    Args:
        results: Prediction result exposing ``boxes.xyxy.cpu().numpy()``.
        output_segmentation_coordiates: Path of the file the feature
            collection is dumped to as JSON.
    """
    corner_rows = results.boxes.xyxy.cpu().numpy()
    features = [
        Feature(
            geometry=mapping(box(left, top, right, bottom)),
            properties={"id": index},
        )
        for index, (left, top, right, bottom) in enumerate(corner_rows)
    ]

    with open(output_segmentation_coordiates, "w") as handle:
        json.dump(FeatureCollection(features), handle)
45+
46+
47+
if __name__ == "__main__":
    # (short flag, long flag, help text) of each mandatory command-line option
    cli_options = (
        ("-im", "--yolo_model", "Input Yolo model"),
        ("-ii", "--input_image", "Input image file"),
        ("-ie", "--input_image_ext", "Input image file extension"),
        ("-ic", "--input_confidence", "Input confidence"),
        ("-is", "--input_image_size", "Input image size"),
        ("-oi", "--output_image", "Output image"),
        ("-ogj", "--output_geojson", "Output segmented coordinates"),
    )
    parser = argparse.ArgumentParser()
    for short_flag, long_flag, help_text in cli_options:
        parser.add_argument(short_flag, long_flag, required=True, help=help_text)
    cli = parser.parse_args()

    # Fixed names in the current working directory; both image links carry the
    # extension given on the command line.
    # NOTE(review): presumably the links exist so the model loader and OpenCV
    # see a recognisable file extension. Confirm.
    weights_link = "yolo_model.pt"
    image_link = f"input_image.{cli.input_image_ext}"
    annotated_link = f"output_image.{cli.input_image_ext}"

    for target, link_name in (
        (cli.yolo_model, weights_link),
        (cli.input_image, image_link),
        (cli.output_image, annotated_link),
    ):
        os.symlink(target, link_name)

    # Predict through the links, then dump the box coordinates as GeoJSON
    detections = load_model_and_predict(
        weights_link,
        image_link,
        cli.input_confidence,
        cli.input_image_size,
        annotated_link,
    )
    extract_bb_crop(detections, cli.output_geojson)
802 KB
Loading
Binary file not shown.

tools/image_processing/yolo/.shed.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,4 +14,4 @@ suite:
1414
name: "suite_yolo"
1515
description: "A suite of tools that brings the yolo workflows into Galaxy."
1616
long_description: |
17-
collection of ready-to-use tools for training yolo models and making predictions.
17+
collection of ready-to-use tools for training yolo models and making predictions.

0 commit comments

Comments
 (0)