Merge branch 'master' into master

GINAMO-EBVs · web-flow · commit 7e0032f20355 · 2026-04-13T09:43:29.000+02:00
diff --git a/tools/Sam3/sam3_semantic_segmentation.py b/tools/Sam3/sam3_semantic_segmentation.py
@@ -80,11 +80,22 @@ def parse_arguments() -> argparse.Namespace:
         default="copy",  # original quality by default
         help="Video bitrate: 'copy' (original), '2000k', '4000k', '8000k'",
     )
+    parser.add_argument(
+        "--coco_video_mode",
+        type=str,
+        default="no_coco",
+        choices=["video", "frames", "no_coco"],
+        help="For video input with COCO output: 'video' annotates the video "
+        "as a single source, 'frames' extracts each processed frame as an "
+        "individual image and annotates per frame, 'no_coco' disables "
+        "COCO output",
+    )
     return parser.parse_args()
 
 
 # -------- Functions --------
 
+
 def convert_avi_to_mp4(directory_path, quality):
     """
     Convert AVI file to MP4.
@@ -189,8 +200,15 @@ def create_coco_output(
 
         polygons = result.masks.xyn if is_normalized else result.masks.xy
         boxes = result.boxes.xyxyn if is_normalized else result.boxes.xyxy
+        track_ids = (
+            result.boxes.id.int().tolist()
+            if result.boxes.id is not None
+            else [None] * len(result.boxes.cls)
+        )
 
-        for polygon, bbox, class_id in zip(polygons, boxes, result.boxes.cls):
+        for polygon, bbox, class_id, track_id in zip(
+            polygons, boxes, result.boxes.cls, track_ids
+        ):
             # Flatten polygon coordinates
             polygon_flat = polygon.flatten().tolist()
 
@@ -206,6 +224,7 @@ def create_coco_output(
                     "id": annotation_id,
                     "image_id": image_id,
                     "category_id": int(class_id) + 1,
+                    "track_id": track_id,
                     "segmentation": [polygon_flat],
                     "area": area,
                     "bbox": [x1, y1, bbox_w, bbox_h],
@@ -293,6 +312,117 @@ def create_yolo_output(
     print(f"✓ Created {len(results)} images and labels in {output_dir}")
 
 
+def _coco_annotations_for_result(
+    result: Any,
+    image_id: int,
+    first_annotation_id: int,
+    is_normalized: bool,
+) -> List[Dict[str, Any]]:
+    """Build the COCO annotation dicts for a single frame result.
+
+    Annotation ids are assigned consecutively starting at
+    ``first_annotation_id``. Returns an empty list when the result
+    carries no masks.
+    """
+    if result.masks is None:
+        return []
+
+    polygons = result.masks.xyn if is_normalized else result.masks.xy
+    boxes = result.boxes.xyxyn if is_normalized else result.boxes.xyxy
+    # Results without tracking information get a ``None`` track id.
+    track_ids = (
+        result.boxes.id.int().tolist()
+        if result.boxes.id is not None
+        else [None] * len(result.boxes.cls)
+    )
+
+    annotations = []
+    instances = zip(polygons, boxes, result.boxes.cls, track_ids)
+    for offset, (polygon, bbox, class_id, track_id) in enumerate(instances):
+        polygon_flat = polygon.flatten().tolist()
+        x1, y1, x2, y2 = bbox[:4].tolist()
+        area = float(cv2.contourArea(polygon.astype(np.float32)))
+
+        annotations.append(
+            {
+                "id": first_annotation_id + offset,
+                "image_id": image_id,
+                # COCO category ids are 1-based, class ids are 0-based.
+                "category_id": int(class_id) + 1,
+                "track_id": track_id,
+                "segmentation": [polygon_flat],
+                "area": area,
+                # COCO boxes are [x, y, width, height].
+                "bbox": [x1, y1, x2 - x1, y2 - y1],
+                "iscrowd": 0,
+            }
+        )
+    return annotations
+
+
+def create_coco_video_frames_output(
+    results: List[Any],
+    text_prompts: List[str],
+    metadata: Dict[str, Any],
+    is_normalized: bool,
+    video_path: str,
+    stride: int,
+    outdir: Path,
+) -> Dict[str, Any]:
+    """Convert SAM3 video results to COCO format with one image entry
+    per extracted frame.
+
+    The video is re-read with OpenCV. Every frame whose counter is a
+    multiple of ``stride`` is written as a JPEG to ``outdir / "frames"``
+    and paired, in order, with the next entry of ``results``.
+
+    Args:
+        results: Per-frame prediction results, in processing order.
+        text_prompts: Prompts passed to ``create_coco_categories``.
+        metadata: Stored unchanged under the COCO ``"info"`` key.
+        is_normalized: Use normalized (``xyn`` / ``xyxyn``) coordinates
+            instead of pixel coordinates.
+        video_path: Path of the video the frames are extracted from.
+        stride: Frame stride used to select the frames to extract.
+        outdir: Output directory; frames go to ``outdir / "frames"``.
+
+    Returns:
+        COCO dictionary with ``info``, ``images``, ``annotations`` and
+        ``categories`` keys.
+
+    Raises:
+        RuntimeError: If the video cannot be opened.
+    """
+    frames_dir = outdir / "frames"
+    frames_dir.mkdir(parents=True, exist_ok=True)
+
+    coco_output = {
+        "info": metadata,
+        "images": [],
+        "annotations": [],
+        "categories": create_coco_categories(text_prompts),
+    }
+
+    cap = cv2.VideoCapture(video_path)
+    if not cap.isOpened():
+        raise RuntimeError(f"Failed to open video: {video_path}")
+
+    video_name = Path(video_path).stem
+    annotation_id = 1
+    # NOTE(review): the counter starts at 1 when stride > 1, presumably
+    # to mirror which frames the predictor sampled - confirm against the
+    # predictor's vid_stride behaviour.
+    frame_idx = 1 if stride > 1 else 0
+    saved_idx = 0
+
+    print(
+        f"Extracting frames and building per-frame COCO annotations "
+        f"(stride={stride})..."
+    )
+
+    # try/finally guarantees the capture handle is released even when
+    # writing a frame or reading a result raises.
+    try:
+        while cap.isOpened():
+            ret, frame = cap.read()
+            if not ret:
+                break
+
+            if frame_idx % stride == 0:
+                if saved_idx >= len(results):
+                    print(
+                        f"Warning: No result available for frame {frame_idx}"
+                    )
+                    break
+
+                frame_name = f"{video_name}_frame_{frame_idx:06d}.jpg"
+                frame_path = frames_dir / frame_name
+                # cv2.imwrite reports failure through its return value;
+                # surface it so a missing JPEG does not go unnoticed.
+                if not cv2.imwrite(str(frame_path), frame):
+                    print(f"Warning: Failed to write frame {frame_path}")
+
+                result = results[saved_idx]
+                image_id = saved_idx + 1
+                height, width = result.orig_shape
+
+                coco_output["images"].append(
+                    {
+                        "id": image_id,
+                        "file_name": frame_name,
+                        "width": width,
+                        "height": height,
+                        "frame_index": frame_idx,
+                    }
+                )
+
+                new_annotations = _coco_annotations_for_result(
+                    result, image_id, annotation_id, is_normalized
+                )
+                coco_output["annotations"].extend(new_annotations)
+                annotation_id += len(new_annotations)
+
+                saved_idx += 1
+                if saved_idx % 10 == 0:
+                    print(f"  Extracted {saved_idx} frames...")
+
+            frame_idx += 1
+    finally:
+        cap.release()
+
+    print(f"✓ Extracted {saved_idx} frames to {frames_dir}")
+    return coco_output
+
+
 def create_yolo_video_output(
     annotation_type: str,
     results: List[Any],
@@ -475,7 +605,7 @@ def patched_postprocess(preds, img, orig_imgs):
     # print(f"\n Running prediction on {source_path}...")
     results = predictor(source=source_path, text=text_prompts, stream=False)
     if is_video(file_paths[0]):
-        convert_avi_to_mp4(outputs_annotated)
+        convert_avi_to_mp4(outputs_annotated, args.quality)
 
     if not results:
         raise RuntimeError("SAM3 returned no results")
@@ -492,9 +622,22 @@ def patched_postprocess(preds, img, orig_imgs):
 
     if "coco" in output_formats:
         print("\n→ Converting to COCO format...")
-        coco_output = create_coco_output(
-            results, text_prompts, metadata, is_normalized
-        )
+
+        if is_video(file_paths[0]) and args.coco_video_mode == "frames":
+            print("  Mode: per-frame (extracting individual frames)...")
+            coco_output = create_coco_video_frames_output(
+                results,
+                text_prompts,
+                metadata,
+                is_normalized,
+                file_paths[0],
+                args.vid_stride,
+                outdir,
+            )
+        else:
+            coco_output = create_coco_output(
+                results, text_prompts, metadata, is_normalized
+            )
 
         annotation_file = outdir / "annotations.json"
         with open(annotation_file, "w") as f:
@@ -510,7 +653,11 @@ def patched_postprocess(preds, img, orig_imgs):
 
         if is_video(file_paths[0]):
             create_yolo_video_output(
-                "bbox", results, yolo_bbox_dir, file_paths[0], args.vid_stride,
+                "bbox",
+                results,
+                yolo_bbox_dir,
+                file_paths[0],
+                args.vid_stride,
                 is_normalized,
             )
         else:
@@ -524,7 +671,11 @@ def patched_postprocess(preds, img, orig_imgs):
 
         if is_video(file_paths[0]):
             create_yolo_video_output(
-                "seg", results, yolo_seg_dir, file_paths[0], args.vid_stride,
+                "seg",
+                results,
+                yolo_seg_dir,
+                file_paths[0],
+                args.vid_stride,
                 is_normalized,
             )
         else:
diff --git a/tools/Sam3/sam3_semantic_segmentation.xml b/tools/Sam3/sam3_semantic_segmentation.xml
@@ -1,9 +1,9 @@
-<tool id="sam3_semantic_segmentation" name="SAM3 Semantic Segmentation" version="1.0.1+galaxy2" profile="25.1">
+<tool id="sam3_semantic_segmentation" name="SAM3 Semantic Segmentation" version="1.0.1+galaxy3" profile="25.1">
     <description>
         SAM3 performs text-prompted semantic segmentation on images or videos. 
     </description>
     <requirements>
-        <container type="docker">quay.io/arthur_barreau/sam3_tool:1.0.0</container>
+        <container type="docker">quay.io/arthur_barreau/sam3_tool:1.0.1</container>
     </requirements>
     <required_files>
         <include path="sam3_semantic_segmentation.py" />
@@ -26,9 +26,21 @@
         --conf '$conf' 
         --vid_stride '$vid_stride' 
         --outdir outputs 
-        --outputs $outputs_format
         --name_file '$name_file'
-        --quality '$input.quality'
+        #if $input.input_kind == "video"
+            --quality '$input.quality'
+        #end if
+        #if $input.input_kind == "image"
+            --outputs $input.outputs_format
+        #else
+            #if $input.coco_video_mode != "no_coco"
+                --outputs 'coco, $input.outputs_format'
+                --coco_video_mode '$input.coco_video_mode'
+            #else
+                --outputs $input.outputs_format
+            #end if
+        #end if
+        --do_normalization '$do_normalization' 
     ]]></command>
     <inputs>
         <param name="sam3_models" label="Model data" type="select" help="Contact the administrator of our Galaxy instance if you miss model data">
@@ -48,10 +60,17 @@
                 <param name="source" type="data" format="jpg,png,tiff" multiple="true" label="Input images">
                     <validator type="expression" message="TIFF images must contain exactly 3 channels (RGB).">value.ext not in ('tiff') or value.metadata.channels == 3</validator>
                 </param>
+                <param name="outputs_format" type="select" multiple="true" optional="true"
+                    label="Output formats"
+                    help="Select one or more annotation formats to generate.">
+                    <option value="coco">COCO</option>
+                    <option value="yolo_bbox">YOLO bounding boxes</option>
+                    <option value="yolo_seg">YOLO segmentation masks</option>
+                </param>
             </when>
             <when value="video">
                 <param name="source" type="data" format="mp4,avi,mov,gif"
-                    multiple="false" label="Input video"/>
+                    multiple="false" label="Input video file"/>
                 <param name="quality" type="select" label="Video quality"
                     help="Select output video bitrate,does not affect processing speed or annotations. 
                           Higher quality than the original is not useful and will only increase file size.">
@@ -61,6 +80,19 @@
                     <option value="4000k">4000k - Good (1080p)</option>
                     <option value="8000k">8000k - High quality (1080p)</option>
                 </param>
+                <param name="coco_video_mode" type="select"
+                    label="COCO output mode"
+                    help="Controls whether COCO annotations are generated, and how frames are referenced.">
+                    <option value="video" selected="true">Annotate the video — one COCO entry per frame, referencing the video file</option>
+                    <option value="frames">Annotate extracted frames — saves JPGs and one COCO entry per frame image</option>
+                    <option value="no_coco">No COCO output</option>
+                </param>
+                <param name="outputs_format" type="select" multiple="true" optional="true"
+                    label="Additional output formats"
+                    help="YOLO formats are optional. COCO output is controlled separately above.">
+                    <option value="yolo_bbox">YOLO bounding boxes</option>
+                    <option value="yolo_seg">YOLO segmentation masks</option>
+                </param>
             </when>
         </conditional>
         <param name="text_prompt" type="text" label="Text prompt" >
@@ -77,12 +109,6 @@
         <param name="vid_stride" type="integer" value="5" min="1" max="300"
             label="Video frame stride"
             help="For video input: process one frame every N frames."/>
-        <param name="outputs_format" type="select" multiple="true" optional="true"
-            label="Output formats">
-            <option value="coco">COCO</option>
-            <option value="yolo_bbox">YOLO bounding boxes</option>
-            <option value="yolo_seg">YOLO segmentation</option>
-        </param>
         <param name="do_normalization" type="boolean" checked="false" label="Normalize outputs?" >
             <help><![CDATA[
 This option will be applied to all selected formats above.<br/>
@@ -93,26 +119,30 @@ This option will be applied to all selected formats above.<br/>
         </param>
     </inputs>
     <outputs>
-        <data name="Annotations_coco" format="json" from_work_dir="./outputs/annotations.json" label="Annotation COCO" >
-            <filter>outputs_format and "coco" in outputs_format</filter>
+        <data name="Annotations_coco" format="json" from_work_dir="./outputs/annotations.json" label="Annotation COCO">
+            <!-- 'coco_video_mode' only exists in the video case; image input selects COCO through 'outputs_format'. -->
+            <filter>(input['input_kind'] == "video" and input['coco_video_mode'] != "no_coco") or (input['input_kind'] == "image" and input['outputs_format'] and "coco" in input['outputs_format'])</filter>
         </data>
-        <collection name="Outputs_annotated" type="list">
+        <collection name="Coco_Frames" type="list" label="COCO Extracted Frames">
+            <!-- Guard on input_kind: 'coco_video_mode' is not defined for image input. -->
+            <filter>input['input_kind'] == "video" and input['coco_video_mode'] == "frames"</filter>
+            <discover_datasets pattern="__name_and_ext__" directory="outputs/frames"/>
+        </collection>
+        <collection name="Outputs_annotated" type="list" label="Annotated Outputs">
             <discover_datasets pattern="__name_and_ext__" directory="outputs/outputs_annotated"/>
         </collection>
         <collection name="Yolo_Bbox_Image" type="list" label="YOLO Bbox Images">
-            <filter>outputs_format and 'yolo_bbox' in outputs_format</filter>
+            <filter>input['outputs_format'] and 'yolo_bbox' in input['outputs_format']</filter>
             <discover_datasets pattern="__name_and_ext__" directory="outputs/yolo_bbox/images"/>
         </collection>
         <collection name="Yolo_Bbox_Label" type="list" label="YOLO Bbox Labels">
-            <filter>outputs_format and 'yolo_bbox' in outputs_format</filter>
+            <filter>input['outputs_format'] and 'yolo_bbox' in input['outputs_format']</filter>
             <discover_datasets pattern="__name_and_ext__" directory="outputs/yolo_bbox/labels"/>
         </collection>
         <collection name="Yolo_Seg_Image" type="list" label="YOLO Seg Images">
-            <filter>outputs_format and 'yolo_seg' in outputs_format</filter>
+            <filter>input['outputs_format'] and 'yolo_seg' in input['outputs_format']</filter>
             <discover_datasets pattern="__name_and_ext__" directory="outputs/yolo_seg/images"/>
         </collection>
         <collection name="Yolo_Seg_Label" type="list" label="YOLO Seg Labels">
-            <filter>outputs_format and 'yolo_seg' in outputs_format</filter>
+            <filter>input['outputs_format'] and 'yolo_seg' in input['outputs_format']</filter>
             <discover_datasets pattern="__name_and_ext__" directory="outputs/yolo_seg/labels"/>
         </collection>
     </outputs>
@@ -122,10 +152,10 @@ This option will be applied to all selected formats above.<br/>
             <conditional name="input">
                 <param name="input_kind" value="image" />
                 <param name="source" value="5827603936_3f1d5d715c_z.jpg,shrimp.png"/>
+                <param name="outputs_format" value="coco,yolo_bbox"/>
             </conditional>
             <param name="text_prompt" value="elephant"/>
             <param name="conf" value="0.25"/>
-            <param name="outputs_format" value="coco,yolo_bbox"/>
             <assert_stdout>
                 <has_text text="Invalid model!"/>
             </assert_stdout>