Skip to content

Commit 9355410

Browse files
authored
Merge pull request #40 from meshroomHub/bugfix/sam3VideoNodeSizeAndFrameId
Bugfix SAM3D and SDMatte
2 parents 07ecbaa + 42f5ede commit 9355410

2 files changed

Lines changed: 35 additions & 6 deletions

File tree

meshroom/imageSegmentation/SDMatte.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -340,15 +340,18 @@ def build_SDMatte_model(self, modelFolder, checkpoint, device, promptType):
340340
conv_scale=3,
341341
num_inference_steps=1,
342342
aux_input=promptType,
343+
aux_input_list=["point_mask", "bbox_mask", "mask", "trimap"],
344+
attn_mask_aux_input=["point_mask", "bbox_mask", "mask", "trimap"],
343345
add_noise=False,
344346
use_dis_loss=True,
345347
use_aux_input=True,
346348
use_coor_input=True,
347349
use_attention_mask=True,
350+
use_encoder_attention_mask=True,
348351
residual_connection=False,
349352
use_encoder_hidden_states=True,
350353
use_attention_mask_list=[True, True, True],
351-
use_encoder_hidden_states_list=[False, True, False],
354+
use_encoder_hidden_states_list=[True, True, True],
352355
)
353356
model.to(device)
354357
DetectionCheckpointer(model).load(checkpoint)
@@ -421,6 +424,7 @@ def processChunk(self, chunk):
421424
if promptType == "":
422425
raise ValueError("Some images have no valid prompt to drive the matting process !!!")
423426
else:
427+
logger.info(f"prompt type: {promptType}")
424428

425429
if not os.path.exists(chunk.node.output.value):
426430
os.mkdir(chunk.node.output.value)
@@ -465,9 +469,14 @@ def processChunk(self, chunk):
465469
mask = maskRGB[:,:,0]
466470
mask_sized = cv2.resize(mask, inference_size, interpolation=cv2.INTER_NEAREST)
467471
mask_scaled = mask_sized.copy() * 2 - 1
468-
sample["mask"] = F.to_tensor(mask_scaled).float().unsqueeze(0)
469-
sample["mask_coords"] = np.array([0, 0, 1, 1])
470-
sample["mask_coords"] = torch.from_numpy(sample["mask_coords"]).float().unsqueeze(0)
472+
if promptType == "mask":
473+
sample["mask"] = F.to_tensor(mask_scaled).float().unsqueeze(0)
474+
sample["mask_coords"] = np.array([0, 0, 1, 1])
475+
sample["mask_coords"] = torch.from_numpy(sample["mask_coords"]).float().unsqueeze(0)
476+
else:
477+
sample["trimap"] = F.to_tensor(mask_scaled).float().unsqueeze(0)
478+
sample["trimap_coords"] = np.array([0, 0, 1, 1])
479+
sample["trimap_coords"] = torch.from_numpy(sample["trimap_coords"]).float().unsqueeze(0)
471480
elif promptType == "auto_mask":
472481
mask = np.ones_like(img)[:,:,0]
473482
mask_sized = cv2.resize(mask, inference_size, interpolation=cv2.INTER_NEAREST)

meshroom/imageSegmentation/VideoSegmentationSam3.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,28 @@
1212

1313
class Sam3VideoNodeSize(desc.MultiDynamicNodeSize):
1414
def computeSize(self, node):
15+
if node.attribute(self._params[0]).isLink:
16+
return node.attribute(self._params[0]).inputLink.node.size
17+
18+
from pathlib import Path
19+
20+
input_path_param = node.attribute(self._params[0])
21+
extension_param = node.attribute(self._params[1])
22+
input_path = input_path_param.value
23+
extension = extension_param.value
24+
include_suffixes = [extension.lower(), extension.upper()]
25+
1526
size = 1
27+
if Path(input_path).is_dir():
28+
import itertools
29+
image_paths = list(itertools.chain(*(Path(input_path).glob(f'*.{suffix}') for suffix in include_suffixes)))
30+
size = len(image_paths)
31+
1632
return size
1733

1834
class VideoSegmentationSam3(desc.Node):
1935
size = Sam3VideoNodeSize(['input', 'extensionIn'])
20-
gpu = desc.Level.INTENSIVE
36+
gpu = desc.Level.EXTREME
2137

2238
category = "Utils"
2339
documentation = """
@@ -322,14 +338,18 @@ def processChunk(self, chunk):
322338
bboxes = {}
323339

324340
colorPalette = image.paletteGenerator()
341+
firstFrameId = chunk_image_paths[0][2]
325342

326343
for idx, path in enumerate(chunk_image_paths):
327344
img, h_ori, w_ori, PAR, orientation = image.loadImage(str(chunk_image_paths[idx][0]), True)
328345
pil_images.append(Image.fromarray((255.0*img).astype("uint8")))
329346
sourceInfo = {"h_ori": h_ori, "w_ori": w_ori, "PAR": PAR, "orientation": orientation}
330347

331348
viewId = chunk_image_paths[idx][1]
332-
frameId = chunk_image_paths[idx][2]
349+
if firstFrameId is None or chunk_image_paths[idx][2] is None:
350+
frameId = idx
351+
else:
352+
frameId = chunk_image_paths[idx][2] - firstFrameId
333353

334354
objects = {}
335355
if viewId is not None and viewId in posClickDictFromShape:

0 commit comments

Comments (0)