Skip to content

Commit 273e030

Browse files
committed
Add tiling mode in VideoSegmentationSam3Boxes
1 parent 8af9d20 commit 273e030

3 files changed

Lines changed: 279 additions & 68 deletions

File tree

meshroom/imageSegmentation/VideoSegmentationSam3Boxes.py

Lines changed: 144 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
__version__ = "1.0"
1+
__version__ = "2.0"
22

33
import os
44
from pathlib import Path
@@ -16,8 +16,42 @@ class VideoSegmentationSam3Boxes(desc.Node):
1616

1717
category = "Segmentation"
1818
documentation = """
19-
Based on the Segment Anything video predictor model 3, the node generates binary masks from a set of
20-
bounding boxes contained in a json file.
19+
## Video Segmentation with SAM3 Bounding Boxes
20+
21+
This node generates binary segmentation masks for video sequences using the **Segment Anything Model 3 (SAM3)** video predictor.
22+
23+
### Inputs
24+
Segmentation is driven by bounding boxes provided in a `bboxes.json` file, typically generated by the **VideoSegmentationSam3Text** node.
25+
26+
### Multi-Resolution Support
27+
To improve segmentation quality on small objects, the node can combine source images at three resolutions:
28+
- **Native resolution** (required)
29+
- **Upscaled x2** (optional)
30+
- **Upscaled x4** (optional)
31+
32+
When tiling is disabled, the resolution used for each bounding box is selected automatically based on its size:
33+
- Box smaller than **252×252** pixels → x4 image (if available)
34+
- Box smaller than **504×504** pixels → x2 image (if available)
35+
- Otherwise → native resolution image
36+
37+
The `Round Crop Size` option (only available when tiling is disabled) snaps crop dimensions to **252, 504, or 1008** pixels, which can improve model accuracy for small bounding boxes.
38+
39+
### Tiling Mode
40+
When **Enable Tiling** is active, large bounding boxes are subdivided into overlapping tiles before being passed to the model. This allows processing of high-resolution regions that would otherwise exceed the model's input capacity.
41+
Key parameters:
42+
- **Target Tile Size**: Target size (in pixels) for each tile.
43+
- **Minimal Overlap**: Minimum pixel overlap between adjacent tiles to avoid boundary artifacts.
44+
45+
> **Note:** Tiling and multi-resolution upscaling are mutually exclusive. When tiling is enabled, native resolution images are always used.
46+
47+
### Computation Logic
48+
For each tracked object (identified by a text prompt and an object ID):
49+
1. The bounding boxes are extracted from `bboxes.json` and grouped into temporal chunks.
50+
2. Each chunk is optionally split into tiles.
51+
3. Cropped image sequences are fed to the SAM3 video predictor.
52+
4. The model propagates masks across all frames in the chunk.
53+
5. Predicted masks are resized and composited back into full-resolution mask images.
54+
6. Final masks are saved to disk, optionally inverted.
2155
"""
2256

2357
inputs = [
@@ -51,6 +85,33 @@ class VideoSegmentationSam3Boxes(desc.Node):
5185
description="Folder containing the bboxes.json file associated to the sfmData used as input.",
5286
value="",
5387
),
88+
desc.BoolParam(
89+
name="enableTiling",
90+
label="Enable Tiling",
91+
description="Enable tiling in big boxes.",
92+
value=True,
93+
),
94+
desc.IntParam(
95+
name="targetTileSize",
96+
label="Target Tile Size",
97+
description="Tile size.",
98+
value=504,
99+
enabled=lambda node: node.enableTiling.value,
100+
),
101+
desc.IntParam(
102+
name="minimalOverlap",
103+
label="Minimal Overlap",
104+
description="minimal tile overlap.",
105+
value=16,
106+
enabled=lambda node: node.enableTiling.value,
107+
),
108+
desc.BoolParam(
109+
name="roundCropSize",
110+
label="Round Crop Size",
111+
description="Round crop size to 252, 504 or 1008 for tube with smaller bounding boxes.",
112+
value=True,
113+
enabled=lambda node: not node.enableTiling.value,
114+
),
54115
desc.File(
55116
name="segmentationModelPath",
56117
label="Segmentation Model",
@@ -157,9 +218,11 @@ def processChunk(self, chunk):
157218
frame_w = chunk_image_paths[0][3]
158219
frame_h = chunk_image_paths[0][4]
159220
par = chunk_image_paths[0][5]
221+
firstFrameId = chunk_image_paths[0][2]
160222
x2_ok = os.path.exists(chunk.node.inputx2.value)
161223
x4_ok = os.path.exists(chunk.node.inputx4.value)
162-
bboxes = bboxUtils.extract_tracking(json_path, frame_w, frame_h, x2_ok, x4_ok, par)
224+
roundCrop = chunk.node.roundCropSize.value
225+
bboxes = bboxUtils.extract_tracking(json_path, frame_w, frame_h, x2_ok, x4_ok, roundCrop, par)
163226

164227
logger.debug(f"bboxes.keys() = {bboxes.keys()}")
165228

@@ -178,65 +241,85 @@ def processChunk(self, chunk):
178241
logger.info(f"key = {key} ; text prompt = {textPrompt} ; obj_id = {obj_id}")
179242

180243
for frame_chunk in frame_chunks:
181-
logger.info(frame_chunk)
182-
pil_images = []
183-
firstFrameId = frame_chunk.start_frame
184-
for frame_idx, box in sorted(frame_chunk.boxes.items()):
185-
x1, y1, x2, y2 = bboxUtils.box_to_display(box, sourceInfo["PAR"])
186-
box_w = x2 - x1
187-
box_h = y2 - y1
188-
189-
if box_w == 252 and box_h == 252:
190-
img, h_ori, w_ori, p_a_r, orientation = image.loadImage(str(chunk_image_paths[frame_idx - firstFrameId][7]), True)
191-
imgBuf = oiio.ImageBuf(img)
192-
imgBuf = oiio.ImageBufAlgo.crop(imgBuf, roi=oiio.ROI(4*x1, 4*x2, 4*y1, 4*y2))
193-
elif box_w == 504 and box_h == 504:
194-
img, h_ori, w_ori, p_a_r, orientation = image.loadImage(str(chunk_image_paths[frame_idx - firstFrameId][6]), True)
195-
imgBuf = oiio.ImageBuf(img)
196-
imgBuf = oiio.ImageBufAlgo.crop(imgBuf, roi=oiio.ROI(2*x1, 2*x2, 2*y1, 2*y2))
197-
else:
198-
img, h_ori, w_ori, p_a_r, orientation = image.loadImage(str(chunk_image_paths[frame_idx - firstFrameId][0]), True)
199-
imgBuf = oiio.ImageBuf(img)
200-
imgBuf = oiio.ImageBufAlgo.crop(imgBuf, roi=oiio.ROI(x1, x2, y1, y2))
201-
202-
img_crop = imgBuf.get_pixels(format=oiio.FLOAT)
203-
pil_images.append(Image.fromarray((255.0*img_crop).astype("uint8")))
204-
205-
response = video_predictor.handle_request(
206-
request=dict(
207-
type="start_session",
208-
resource_path=pil_images,
244+
logger.info(f"frame_chunk: {frame_chunk}")
245+
logger.debug(f"{frame_chunk.boxes}")
246+
247+
chunk_tiles = [frame_chunk]
248+
if chunk.node.enableTiling.value:
249+
chunk_tiles = bboxUtils.tile_chunk(frame_chunk, chunk.node.targetTileSize.value,
250+
chunk.node.minimalOverlap.value, sourceInfo["PAR"], logger)
251+
# In tiling mode, avoid loading all frames for every new tiles
252+
full_pil_images = {}
253+
if chunk.node.enableTiling.value:
254+
for frameId, _ in chunk_tiles[0].boxes.items():
255+
img, h_ori, w_ori, PAR, orientation = image.loadImage(str(chunk_image_paths[frameId - firstFrameId][0]), True)
256+
full_pil_images[frameId] = img
257+
258+
logger.info(f"chunk_tiles: {chunk_tiles}")
259+
260+
for chunk_tile in chunk_tiles:
261+
logger.debug(f"{chunk_tile.boxes}")
262+
263+
pil_images = []
264+
for frame_idx, box in sorted(chunk_tile.boxes.items()):
265+
x1, y1, x2, y2 = bboxUtils.box_to_display(box, sourceInfo["PAR"])
266+
box_w = x2 - x1
267+
box_h = y2 - y1
268+
269+
if box_w <= 252 and box_h <= 252 and x4_ok and not chunk.node.enableTiling.value:
270+
img, h_ori, w_ori, p_a_r, orientation = image.loadImage(str(chunk_image_paths[frame_idx - firstFrameId][7]), True)
271+
imgBuf = oiio.ImageBuf(img)
272+
imgBuf = oiio.ImageBufAlgo.crop(imgBuf, roi=oiio.ROI(4*x1, 4*x2, 4*y1, 4*y2))
273+
elif box_w <= 504 and box_h <= 504 and x2_ok and not chunk.node.enableTiling.value:
274+
img, h_ori, w_ori, p_a_r, orientation = image.loadImage(str(chunk_image_paths[frame_idx - firstFrameId][6]), True)
275+
imgBuf = oiio.ImageBuf(img)
276+
imgBuf = oiio.ImageBufAlgo.crop(imgBuf, roi=oiio.ROI(2*x1, 2*x2, 2*y1, 2*y2))
277+
elif not chunk.node.enableTiling.value:
278+
img, h_ori, w_ori, p_a_r, orientation = image.loadImage(str(chunk_image_paths[frame_idx - firstFrameId][0]), True)
279+
imgBuf = oiio.ImageBuf(img)
280+
imgBuf = oiio.ImageBufAlgo.crop(imgBuf, roi=oiio.ROI(x1, x2, y1, y2))
281+
else:
282+
# use already loaded images
283+
imgBuf = oiio.ImageBuf(full_pil_images[frame_idx])
284+
imgBuf = oiio.ImageBufAlgo.crop(imgBuf, roi=oiio.ROI(x1, x2, y1, y2))
285+
286+
img_crop = imgBuf.get_pixels(format=oiio.FLOAT)
287+
pil_images.append(Image.fromarray((255.0*img_crop).astype("uint8")))
288+
289+
response = video_predictor.handle_request(
290+
request=dict(
291+
type="start_session",
292+
resource_path=pil_images,
293+
)
294+
)
295+
session_id = response["session_id"]
296+
297+
video_predictor.handle_request(
298+
request=dict(
299+
type="add_prompt",
300+
session_id=session_id,
301+
frame_index=0,
302+
text=textPrompt,
209303
)
210-
)
211-
session_id = response["session_id"]
212-
213-
video_predictor.handle_request(
214-
request=dict(
215-
type="add_prompt",
216-
session_id=session_id,
217-
frame_index=0,
218-
text=textPrompt,
219304
)
220-
)
221-
outputs_per_frame = sam3Utils.propagateInVideo(video_predictor, session_id) #, fIdx, max_frame_num_to_track, track_dir)
222-
outputs_per_frame_visu = sam3Utils.prepareMasksForVisualization(outputs_per_frame)
223-
224-
for frame_idx, box in sorted(frame_chunk.boxes.items()):
225-
x1, y1, x2, y2 = box
226-
box_w = x2 - x1
227-
box_h = y2 - y1
228-
frameId = frame_idx - firstFrameId
229-
for key, maskBoxProb in outputs_per_frame_visu[frameId].items():
230-
mask = maskBoxProb["mask"]
231-
buf_in = oiio.ImageBuf(mask.astype('float32'))
232-
buf_out = oiio.ImageBufAlgo.resample(buf_in, roi=oiio.ROI(0, box_w, 0, box_h))
233-
mask = buf_out.get_pixels().reshape(box_h, box_w, 1)
234-
tgt = full_mask_images[frame_idx][y1:y2 ,x1:x2, :]
235-
bool_mask = mask.squeeze() > 0
236-
tgt[bool_mask] = [255, 255, 255]
237-
238-
video_predictor.handle_request(request=dict(type="close_session", session_id=session_id))
239-
305+
outputs_per_frame = sam3Utils.propagateInVideo(video_predictor, session_id)
306+
outputs_per_frame_visu = sam3Utils.prepareMasksForVisualization(outputs_per_frame)
307+
308+
for frame_idx, box in sorted(chunk_tile.boxes.items()):
309+
x1, y1, x2, y2 = box
310+
box_w = x2 - x1
311+
box_h = y2 - y1
312+
frameId = frame_idx - chunk_tile.start_frame
313+
for key, maskBoxProb in outputs_per_frame_visu[frameId].items():
314+
mask = maskBoxProb["mask"]
315+
buf_in = oiio.ImageBuf(mask.astype('float32'))
316+
buf_out = oiio.ImageBufAlgo.resample(buf_in, roi=oiio.ROI(0, box_w, 0, box_h))
317+
mask = buf_out.get_pixels().reshape(box_h, box_w, 1)
318+
tgt = full_mask_images[frame_idx][y1:y2 ,x1:x2, :]
319+
bool_mask = mask.squeeze() > 0
320+
tgt[bool_mask] = [255, 255, 255]
321+
322+
video_predictor.handle_request(request=dict(type="close_session", session_id=session_id))
240323

241324
for frameId, image_path in enumerate(chunk_image_paths):
242325
if chunk.node.maskInvert.value:

meshroom/rotoPersons.mg

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
{
2+
"header": {
3+
"releaseVersion": "2026.1.0+develop",
4+
"fileVersion": "2.0",
5+
"nodesVersions": {
6+
"CameraInit": "12.1",
7+
"CopyFiles": "1.3",
8+
"VideoSegmentationSam3Boxes": "2.0",
9+
"VideoSegmentationSam3Text": "1.0"
10+
},
11+
"template": true
12+
},
13+
"graph": {
14+
"CameraInit_1": {
15+
"nodeType": "CameraInit",
16+
"position": [
17+
-452,
18+
94
19+
],
20+
"inputs": {}
21+
},
22+
"CopyFiles_1": {
23+
"nodeType": "CopyFiles",
24+
"position": [
25+
229,
26+
73
27+
],
28+
"inputs": {
29+
"output": "{VideoSegmentationSam3Boxes_1.output}"
30+
}
31+
},
32+
"VideoSegmentationSam3Boxes_1": {
33+
"nodeType": "VideoSegmentationSam3Boxes",
34+
"position": [
35+
9,
36+
41
37+
],
38+
"inputs": {
39+
"input": "{VideoSegmentationSam3Text_1.input}",
40+
"masksFolder": "{VideoSegmentationSam3Text_1.output}",
41+
"bboxesFolder": "{VideoSegmentationSam3Text_1.output}",
42+
"verboseLevel": "debug"
43+
}
44+
},
45+
"VideoSegmentationSam3Text_1": {
46+
"nodeType": "VideoSegmentationSam3Text",
47+
"position": [
48+
-221,
49+
61
50+
],
51+
"inputs": {
52+
"input": "{CameraInit_1.output}",
53+
"timeSlicing": true,
54+
"sliceSize": 64,
55+
"verboseLevel": "debug"
56+
}
57+
}
58+
}
59+
}

0 commit comments

Comments
 (0)