Merge pull request #1636 from anuprulez/doclayout_yolo

bgruening · web-flow · commit 80167f52fb9b · 2025-06-13T16:31:38.000+02:00
Add inference engine for segmenting texts in documents using Yolo
diff --git a/tools/image_processing/yolo-utils/doclayoutyolo/.shed.yml b/tools/image_processing/yolo-utils/doclayoutyolo/.shed.yml
@@ -0,0 +1,12 @@
+name: doclayoutyolo
+owner: bgruening
+description: Tool for segmenting text in images.
+long_description: Tool for segmenting text in images using a Yolo model pretrained on a large number of documents.
+remote_repository_url: https://github.com/bgruening/galaxytools/tree/master/tools/image_processing/yolo-utils/doclayoutyolo
+homepage_url: https://github.com/bgruening/galaxytools/tree/master/tools/image_processing/yolo-utils/doclayoutyolo
+type:
+categories:
+  - Machine Learning
+  - Imaging
+maintainers:
+  - anuprulez
diff --git a/tools/image_processing/yolo-utils/doclayoutyolo/doclayoutyolo.xml b/tools/image_processing/yolo-utils/doclayoutyolo/doclayoutyolo.xml
@@ -0,0 +1,124 @@
+<tool id="doclayoutyolo" name="DocLayout-YOLO" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
+    <description>Enhancing document layout analysis</description>
+    <macros>
+        <token name="@TOOL_VERSION@">0.0.4.1</token>
+        <token name="@VERSION_SUFFIX@">0</token>
+        <token name="@PROFILE@">24.2</token>
+    </macros>
+    <creator>
+        <organization name="European Galaxy Team" url="https://galaxyproject.org/eu/"/>
+        <person givenName="Anup" familyName="Kumar" email="kumara@informatik.uni-freiburg.de"/>
+    </creator>
+    <requirements>
+        <container type="docker">quay.io/galaxy/doclayout-yolo:@TOOL_VERSION@</container>
+    </requirements>
+    <required_files>
+        <include path="segment_text_yolo.py"/>
+    </required_files>
+    <command detect_errors="aggressive"><![CDATA[
+    python '$__tool_directory__/segment_text_yolo.py'
+            --yolo_model '$input_yolo_model'
+            --input_image '$input_image'
+            --input_image_ext '$input_image.ext'
+            --input_confidence '$input_confidence'
+            --input_image_size '$input_image_size'
+            --output_image '$output_image'
+            --output_geojson '$output_segmentation_coordinates'
+]]>
+    </command>
+    <inputs>
+        <param name="input_yolo_model" type="data" format="zip" label="Yolo model" help="Please upload a Yolo model."/>
+        <param name="input_image" type="data" format="tiff,jpg,png" label="Input image" help="Please provide an input image for the analysis."/>
+        <param name="input_confidence" type="float" label="Confidence" value="0.5" min="0.0" max="1.0" help="Set confidence threshold between 0.0 and 1.0 for drawing bounding boxes. Higher values indicate higher probability of segmentation."/>
+        <param name="input_image_size" type="integer" label="Image size" value="1024" min="1" max="1500" help="Set input image size for image resize by Doclayout Yolo model. Larger values may provide better accuracy in segmentation but could be slower. Lower values might be faster with lower accuracy."/>
+    </inputs>
+    <outputs>
+        <data format_source="input_image" name="output_image" label="Segmented image"></data>
+        <data format="geojson" name="output_segmentation_coordinates" label="Segmented coordinates"></data>
+    </outputs>
+    <tests>
+        <test>
+            <param name="input_yolo_model" value="input_yolo_model.zip" location="https://huggingface.co/juliozhao/DocLayout-YOLO-DocLayNet-Docsynth300K_pretrained/resolve/main/doclayout_yolo_doclaynet_imgsz1120_docsynth_pretrain.pt?download=true"/>
+            <param name="input_image" value="input_image_png.png"/>
+            <param name="input_confidence" value="0.5"/>
+            <param name="input_image_size" value="1024"/>
+            <output name="output_image" ftype="png">
+                <assert_contents>
+                    <has_size size="920950" delta="100" />
+                </assert_contents>
+            </output>
+            <output name="output_segmentation_coordinates" ftype="geojson">
+                <assert_contents>
+                    <has_text text="Polygon" />
+                    <has_text text="Feature" />
+                    <has_text text="coordinates" />
+                </assert_contents>
+            </output>
+        </test>
+        <test>
+            <param name="input_yolo_model" value="input_yolo_model.zip" location="https://huggingface.co/juliozhao/DocLayout-YOLO-DocLayNet-Docsynth300K_pretrained/resolve/main/doclayout_yolo_doclaynet_imgsz1120_docsynth_pretrain.pt?download=true"/>
+            <param name="input_image" value="input_image_jpg.jpg" location="https://zenodo.org/records/15649779/files/input_image_jpg.jpg?download=1"/>
+            <param name="input_confidence" value="0.5"/>
+            <param name="input_image_size" value="1024"/>
+            <output name="output_image" ftype="jpg">
+                <assert_contents>
+                    <has_size size="2753175" delta="100" />
+                </assert_contents>
+            </output>
+            <output name="output_segmentation_coordinates" ftype="geojson">
+                <assert_contents>
+                    <has_text text="Polygon" />
+                    <has_text text="Feature" />
+                    <has_text text="coordinates" />
+                </assert_contents>
+            </output>
+        </test>
+        <test>
+            <param name="input_yolo_model" value="input_yolo_model.zip" location="https://huggingface.co/juliozhao/DocLayout-YOLO-DocLayNet-Docsynth300K_pretrained/resolve/main/doclayout_yolo_doclaynet_imgsz1120_docsynth_pretrain.pt?download=true"/>
+            <param name="input_image" value="input_image_tiff.tif"/>
+            <param name="input_confidence" value="0.5"/>
+            <param name="input_image_size" value="1024"/>
+            <output name="output_image" ftype="tiff">
+                <assert_contents>
+                    <has_size size="510756" delta="100" />
+                </assert_contents>
+            </output>
+            <output name="output_segmentation_coordinates" ftype="geojson">
+                <assert_contents>
+                    <has_text text="Polygon" />
+                    <has_text text="Feature" />
+                    <has_text text="coordinates" />
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help>
+        <![CDATA[
+**What it does**
+
+The tool takes a pretrained Yolo model trained for annotating bounding boxes around text and predicts bounding boxes in the input image wherever text is found.
+It is based on document layout analysis: https://github.com/opendatalab/DocLayout-YOLO. The Yolo model can be downloaded from: https://huggingface.co/juliozhao/DocLayout-YOLO-DocLayNet-Docsynth300K_pretrained/tree/main
+ 
+
+**Input files**
+  - Yolo model (as `.pt` file)
+  - Input image containing text
+  - Confidence score to be used for drawing bounding boxes
+  - Image size to be resized to by Yolo model
+
+**Output files**
+  - Segmented image
+  - Coordinates of bounding boxes as Geojson file
+
+        ]]>
+    </help>
+    <citations>
+        <citation type="bibtex">
+            @ARTICLE{zhao2024doclayoutyoloenhancingdocumentlayout,
+                Author = {Zhao, Zhiyuan and et al.},
+                title = {{DocLayout-YOLO: Enhancing Document Layout Analysis through Diverse Synthetic Data and Global-to-Local Adaptive Perception}},
+                url = {https://github.com/opendatalab/DocLayout-YOLO}
+            }
+        </citation>
+    </citations>
+</tool>
diff --git a/tools/image_processing/yolo-utils/doclayoutyolo/segment_text_yolo.py b/tools/image_processing/yolo-utils/doclayoutyolo/segment_text_yolo.py
@@ -0,0 +1,88 @@
+"""
+Segment text using DocLayout Yolo model
+"""
+
+import argparse
+import json
+import os
+
+import cv2
+from doclayout_yolo import YOLOv10
+from geojson import Feature, FeatureCollection
+from shapely.geometry import box, mapping
+
+
+def load_model_and_predict(
+    model_path, input_image_path, input_confidence, image_size, output_image_path
+):
+
+    model = YOLOv10(model=model_path)
+
+    det_res = model.predict(
+        input_image_path, imgsz=int(image_size), conf=float(input_confidence)
+    )
+    annotated_frame = det_res[0].plot(pil=True, line_width=5, font_size=20)
+    cv2.imwrite(output_image_path, annotated_frame)
+    return det_res[0]
+
+
+def extract_bb_crop(results, output_segmentation_coordiates):
+    bounding_boxes = []
+    features = []
+    for bx in results.boxes.xyxy.cpu().numpy():
+        x1, y1, x2, y2 = bx
+        bounding_boxes.append((x1, y1, x2, y2))
+
+    for i, (x1, y1, x2, y2) in enumerate(bounding_boxes):
+        poly = box(x1, y1, x2, y2)
+        feature = Feature(geometry=mapping(poly), properties={"id": i})
+        features.append(feature)
+
+    geojson_obj = FeatureCollection(features)
+
+    with open(output_segmentation_coordiates, "w") as f:
+        json.dump(geojson_obj, f)
+
+
+if __name__ == "__main__":  # CLI entry point: parse args, stage files via symlinks, predict, export boxes
+    arg_parser = argparse.ArgumentParser()
+    arg_parser.add_argument(
+        "-im", "--yolo_model", required=True, help="Input Yolo model"
+    )
+    arg_parser.add_argument(
+        "-ii", "--input_image", required=True, help="Input image file"
+    )
+    arg_parser.add_argument(
+        "-ie", "--input_image_ext", required=True, help="Input image file extension"
+    )
+    arg_parser.add_argument(
+        "-ic", "--input_confidence", required=True, help="Input confidence"
+    )
+    arg_parser.add_argument(
+        "-is", "--input_image_size", required=True, help="Input image size"
+    )
+    arg_parser.add_argument("-oi", "--output_image", required=True, help="Output image")
+    arg_parser.add_argument(
+        "-ogj", "--output_geojson", required=True, help="Output segmented coordinates"
+    )
+    args = vars(arg_parser.parse_args())  # plain dict keyed by long option name
+    model_path = args["yolo_model"]
+    input_image_path = args["input_image"]
+    input_ext = args["input_image_ext"]
+    confidence = args["input_confidence"]  # still a string; float() is applied in load_model_and_predict
+    image_size = args["input_image_size"]  # still a string; int() is applied in load_model_and_predict
+    output_image_path = args["output_image"]
+    output_segmentation_coordiates = args["output_geojson"]
+
+    model_link = "yolo_model.pt"  # fixed local name with a .pt suffix for the model
+    input_image = f"input_image.{input_ext}"  # local name carrying the input's extension
+    output_image = f"output_image.{input_ext}"  # output name reuses the input's extension
+
+    os.symlink(model_path, model_link)  # link names are relative, so they are created in the current working directory
+    os.symlink(input_image_path, input_image)
+    os.symlink(output_image_path, output_image)  # presumably lets cv2.imwrite pick the format from the suffix while writing to output_image_path -- TODO confirm
+
+    segmented_image = load_model_and_predict(  # writes the annotated image and returns the first prediction result
+        model_link, input_image, confidence, image_size, output_image
+    )
+    extract_bb_crop(segmented_image, output_segmentation_coordiates)  # dumps the boxes as a GeoJSON FeatureCollection
diff --git a/tools/image_processing/yolo-utils/doclayoutyolo/test-data/input_image_png.png b/tools/image_processing/yolo-utils/doclayoutyolo/test-data/input_image_png.png
diff --git a/tools/image_processing/yolo-utils/doclayoutyolo/test-data/input_image_tiff.tif b/tools/image_processing/yolo-utils/doclayoutyolo/test-data/input_image_tiff.tif
diff --git a/tools/image_processing/yolo/.shed.yml b/tools/image_processing/yolo/.shed.yml
@@ -14,4 +14,4 @@ suite:
   name: "suite_yolo"
   description: "A suite of tools that brings the yolo workflows into Galaxy."
   long_description: |
-      collection of ready-to-use tools for training yolo models and making predictions.
+      collection of ready-to-use tools for training yolo models and making predictions.