@@ -80,11 +80,22 @@ def parse_arguments() -> argparse.Namespace:
8080 default = "copy" , # original quality by default
8181 help = "Video bitrate: 'copy' (original), '2000k', '4000k', '8000k'" ,
8282 )
83+ parser .add_argument (
84+ "--coco_video_mode" ,
85+ type = str ,
86+ default = "no_coco" ,
87+ choices = ["video" , "frames" , "no_coco" ],
88+ help = "For video input with COCO output: 'video' annotates the video"
89+ "as a single source, 'frames' extracts each processed frame as an "
90+ "individual image and annotates per frame, 'no_coco' disables "
91+ "COCO output" ,
92+ )
8393 return parser .parse_args ()
8494
8595
8696# -------- Functions --------
8797
98+
8899def convert_avi_to_mp4 (directory_path , quality ):
89100 """
90101 Convert AVI file to MP4.
@@ -189,8 +200,15 @@ def create_coco_output(
189200
190201 polygons = result .masks .xyn if is_normalized else result .masks .xy
191202 boxes = result .boxes .xyxyn if is_normalized else result .boxes .xyxy
203+ track_ids = (
204+ result .boxes .id .int ().tolist ()
205+ if result .boxes .id is not None
206+ else [None ] * len (result .boxes .cls )
207+ )
192208
193- for polygon , bbox , class_id in zip (polygons , boxes , result .boxes .cls ):
209+ for polygon , bbox , class_id , track_id in zip (
210+ polygons , boxes , result .boxes .cls , track_ids
211+ ):
194212 # Flatten polygon coordinates
195213 polygon_flat = polygon .flatten ().tolist ()
196214
@@ -206,6 +224,7 @@ def create_coco_output(
206224 "id" : annotation_id ,
207225 "image_id" : image_id ,
208226 "category_id" : int (class_id ) + 1 ,
227+ "track_id" : track_id ,
209228 "segmentation" : [polygon_flat ],
210229 "area" : area ,
211230 "bbox" : [x1 , y1 , bbox_w , bbox_h ],
@@ -293,6 +312,117 @@ def create_yolo_output(
293312 print (f"✓ Created { len (results )} images and labels in { output_dir } " )
294313
295314
def _coco_annotations_for_result(
    result: Any,
    image_id: int,
    first_annotation_id: int,
    is_normalized: bool,
) -> List[Dict[str, Any]]:
    """Build the COCO annotation dicts for a single prediction result.

    Args:
        result: One prediction result exposing ``masks`` and ``boxes``.
        image_id: COCO image id the annotations are attached to.
        first_annotation_id: Id assigned to the first annotation; subsequent
            annotations get consecutive ids.
        is_normalized: When true, read the ``xyn`` / ``xyxyn`` variants of
            the polygons and boxes instead of ``xy`` / ``xyxy``.

    Returns:
        A list of annotation dicts; empty when ``result.masks`` is ``None``.
    """
    if result.masks is None:
        return []

    polygons = result.masks.xyn if is_normalized else result.masks.xy
    boxes = result.boxes.xyxyn if is_normalized else result.boxes.xyxy
    # Results without tracking ids still get a "track_id" key, set to None.
    track_ids = (
        result.boxes.id.int().tolist()
        if result.boxes.id is not None
        else [None] * len(result.boxes.cls)
    )

    annotations: List[Dict[str, Any]] = []
    for offset, (polygon, bbox, class_id, track_id) in enumerate(
        zip(polygons, boxes, result.boxes.cls, track_ids)
    ):
        polygon_flat = polygon.flatten().tolist()
        x1, y1, x2, y2 = bbox[:4].tolist()
        # Area is computed in the same coordinate space as the polygon.
        area = float(cv2.contourArea(polygon.astype(np.float32)))

        annotations.append(
            {
                "id": first_annotation_id + offset,
                "image_id": image_id,
                # Category ids are the class index shifted by one.
                "category_id": int(class_id) + 1,
                "track_id": track_id,
                "segmentation": [polygon_flat],
                "area": area,
                # Stored as [x, y, width, height].
                "bbox": [x1, y1, x2 - x1, y2 - y1],
                "iscrowd": 0,
            }
        )
    return annotations


def create_coco_video_frames_output(
    results: List[Any],
    text_prompts: List[str],
    metadata: Dict[str, Any],
    is_normalized: bool,
    video_path: str,
    stride: int,
    outdir: Path,
) -> Dict[str, Any]:
    """Convert SAM3 video results to COCO format with one image entry
    per extracted frame.

    The video is re-read with OpenCV; every frame selected by ``stride`` is
    written as a JPEG under ``outdir / "frames"`` and paired, in order, with
    the next entry of ``results``.

    Args:
        results: Per-frame prediction results, in processing order.
        text_prompts: Prompts passed to ``create_coco_categories``.
        metadata: Stored verbatim under the ``"info"`` key.
        is_normalized: Use normalized polygon/box coordinates when true.
        video_path: Path of the source video.
        stride: Frame stride used to select frames; must be >= 1.
        outdir: Output directory; frames go into its ``frames`` subfolder.

    Returns:
        A dict with ``info``, ``images``, ``annotations`` and ``categories``.

    Raises:
        ValueError: If ``stride`` is smaller than 1.
        RuntimeError: If the video cannot be opened.
    """
    if stride < 1:
        # Fail early with a clear message instead of a ZeroDivisionError
        # (stride == 0) in the middle of frame extraction.
        raise ValueError(f"stride must be >= 1, got {stride}")

    frames_dir = outdir / "frames"
    frames_dir.mkdir(parents=True, exist_ok=True)

    coco_output: Dict[str, Any] = {
        "info": metadata,
        "images": [],
        "annotations": [],
        "categories": create_coco_categories(text_prompts),
    }

    video_name = Path(video_path).stem
    annotation_id = 1
    # With stride > 1 the counter starts at 1, so the first saved frame is
    # the stride-th frame read; with stride == 1 every frame is saved and
    # indices start at 0.
    # NOTE(review): presumably this matches how the predictor picks frames
    # for a given stride - confirm against the prediction source loader.
    frame_idx = 1 if stride > 1 else 0
    saved_idx = 0

    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            raise RuntimeError(f"Failed to open video: {video_path}")

        print(
            f"Extracting frames and building per-frame COCO annotations "
            f"(stride={stride})..."
        )

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            if frame_idx % stride == 0:
                if saved_idx >= len(results):
                    # More selected frames than results: stop instead of
                    # emitting images without predictions.
                    print(f"Warning: No result available for frame {frame_idx}")
                    break

                frame_name = f"{video_name}_frame_{frame_idx:06d}.jpg"
                frame_path = frames_dir / frame_name
                if not cv2.imwrite(str(frame_path), frame):
                    # imwrite reports failure through its return value; make
                    # it visible because the image entry below would point
                    # at a file that does not exist.
                    print(f"Warning: Failed to write frame image {frame_path}")

                result = results[saved_idx]
                image_id = saved_idx + 1
                height, width = result.orig_shape

                coco_output["images"].append(
                    {
                        "id": image_id,
                        "file_name": frame_name,
                        "width": width,
                        "height": height,
                        "frame_index": frame_idx,
                    }
                )

                frame_annotations = _coco_annotations_for_result(
                    result, image_id, annotation_id, is_normalized
                )
                coco_output["annotations"].extend(frame_annotations)
                annotation_id += len(frame_annotations)

                saved_idx += 1

                if saved_idx % 10 == 0:
                    print(f" Extracted {saved_idx} frames...")

            frame_idx += 1
    finally:
        # Always free the capture handle, including on errors raised while
        # reading frames or building annotations.
        cap.release()

    print(f"✓ Extracted {saved_idx} frames to {frames_dir}")
    return coco_output
424+
425+
296426def create_yolo_video_output (
297427 annotation_type : str ,
298428 results : List [Any ],
@@ -475,7 +605,7 @@ def patched_postprocess(preds, img, orig_imgs):
475605 # print(f"\n Running prediction on {source_path}...")
476606 results = predictor (source = source_path , text = text_prompts , stream = False )
477607 if is_video (file_paths [0 ]):
478- convert_avi_to_mp4 (outputs_annotated )
608+ convert_avi_to_mp4 (outputs_annotated , args . quality )
479609
480610 if not results :
481611 raise RuntimeError ("SAM3 returned no results" )
@@ -492,9 +622,22 @@ def patched_postprocess(preds, img, orig_imgs):
492622
493623 if "coco" in output_formats :
494624 print ("\n → Converting to COCO format..." )
495- coco_output = create_coco_output (
496- results , text_prompts , metadata , is_normalized
497- )
625+
626+ if is_video (file_paths [0 ]) and args .coco_video_mode == "frames" :
627+ print (" Mode: per-frame (extracting individual frames)..." )
628+ coco_output = create_coco_video_frames_output (
629+ results ,
630+ text_prompts ,
631+ metadata ,
632+ is_normalized ,
633+ file_paths [0 ],
634+ args .vid_stride ,
635+ outdir ,
636+ )
637+ else :
638+ coco_output = create_coco_output (
639+ results , text_prompts , metadata , is_normalized
640+ )
498641
499642 annotation_file = outdir / "annotations.json"
500643 with open (annotation_file , "w" ) as f :
@@ -510,7 +653,11 @@ def patched_postprocess(preds, img, orig_imgs):
510653
511654 if is_video (file_paths [0 ]):
512655 create_yolo_video_output (
513- "bbox" , results , yolo_bbox_dir , file_paths [0 ], args .vid_stride ,
656+ "bbox" ,
657+ results ,
658+ yolo_bbox_dir ,
659+ file_paths [0 ],
660+ args .vid_stride ,
514661 is_normalized ,
515662 )
516663 else :
@@ -524,7 +671,11 @@ def patched_postprocess(preds, img, orig_imgs):
524671
525672 if is_video (file_paths [0 ]):
526673 create_yolo_video_output (
527- "seg" , results , yolo_seg_dir , file_paths [0 ], args .vid_stride ,
674+ "seg" ,
675+ results ,
676+ yolo_seg_dir ,
677+ file_paths [0 ],
678+ args .vid_stride ,
528679 is_normalized ,
529680 )
530681 else :
0 commit comments