Skip to content

Commit 5295d63

Browse files
committed
Used Vocal Fold extents for mesh extent computation
1 parent 0a8f729 commit 5295d63

4 files changed

Lines changed: 63 additions & 18 deletions

File tree

source/NeuralSegmentation.py

Lines changed: 20 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -254,6 +254,8 @@ def __init__(self, images, path="assets/model.pth.tar"):
254254

255255
self.generateSegmentationData()
256256

257+
self.vocalfold_extents = None
258+
257259

258260
def class_to_color(self, prediction, class_colors):
259261
prediction = np.expand_dims(prediction, 1)
@@ -267,6 +269,7 @@ def class_to_color(self, prediction, class_colors):
267269

268270
return output
269271

272+
270273
def segmentImage(self, frame):
271274
segmentation = self.model(torch.from_numpy(frame).unsqueeze(0).unsqueeze(0).to(DEVICE).float()).argmax(dim=1).detach().cpu().numpy().squeeze().astype(np.uint8)
272275

@@ -281,6 +284,21 @@ def segmentImage(self, frame):
281284

282285
glottal_roi = np.zeros(segmentation.shape, np.uint8)
283286
x, y, w, h = sorted_stats[-2][1]
287+
288+
if self.vocalfold_extents is None:
289+
self.vocalfold_extents = [x, y, w, h]
290+
else:
291+
px, py, ph, pw = self.vocalfold_extents
292+
293+
# Convert to corner coordinates
294+
x1 = min(px, x)
295+
y1 = min(py, y)
296+
x2 = max(px + pw, x + w)
297+
y2 = max(py + ph, y + h)
298+
299+
# Store union box
300+
self.vocalfold_extents = [x1, y1, x2 - x1, y2 - y1]
301+
284302
glottal_roi[y:y+h, x:x+w] = 1
285303
filtered_glottis = ((segmentation == 2) * 255 * glottal_roi).astype(np.uint8)
286304

@@ -301,22 +319,10 @@ def computeLocalMaxima(self, index, kernelsize=7):
301319

302320
return maxima
303321

304-
def generateROI(self):
305-
minX = 0
306-
maxX = 0
307-
minY = 0
308-
maxY = 0
309-
310-
for laserdotSegmentation in self.laserdotSegmentations:
311-
ys, xs = np.nonzero(laserdotSegmentation)
312322

313-
maxY = np.max(ys)
314-
minY = np.min(ys)
315-
maxX = np.max(xs)
316-
minX = np.min(xs)
317323

318-
319-
return [minX, maxX-minX, minY, maxY-minY]
324+
def generateROI(self):
325+
return self.vocalfold_extents
320326

321327

322328
def estimateClosedGlottis(self):

source/Segmentator.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ def __init__(self, images):
1616
# List of 2x2 points
1717
self.glottalMidlines = list()
1818

19+
self.vocalfoldOutlines = list()
20+
1921
# List of extracted local Maxima
2022
self.localMaxima = list()
2123

@@ -43,6 +45,24 @@ def segmentImageIndex(self, index):
4345
def getSegmentation(self, index):
4446
return self.segmentations[index]
4547

48+
def computeVocalfoldOutline(self, index):
49+
segmentation = self.segmentations[index]
50+
contours, hierarchy = cv2.findContours(segmentation, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
51+
52+
i = 0
53+
contour_points = list()
54+
while (i != -1):
55+
contour_points.append(contours[hierarchy[0][i][0]][:, 0, :])
56+
i = hierarchy[0][i][0]
57+
58+
contourArray = None
59+
if len(contour_points) > 1:
60+
contourArray = np.concatenate(contour_points, axis=0)
61+
else:
62+
contourArray = contour_points[0]
63+
return contourArray - np.ones(contourArray.shape)
64+
65+
4666
def computeGlottalOutline(self, index):
4767
segmentation = self.segmentations[index]
4868
contours, hierarchy = cv2.findContours(segmentation, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

source/SiliconeSurfaceReconstruction.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -208,9 +208,16 @@ def controlPointBasedARAP(triangulatedPoints, camera, segmentator, zSubdivisions
208208
t = helper.rayPlaneIntersectionMat(centroid, np.expand_dims(planeNormal, 0), np.zeros(glottalCameraRays.shape), glottalCameraRays)
209209
glottalOutlinePoints = t * glottalCameraRays
210210

211+
xy, wh = segmentator.vocalfoldBoundingBox()
212+
mesh_extents = torch.stack([xy, xy+wh]).detach().cpu().numpy()
213+
meshCameraRays = camera.getRayMat(mesh_extents)
214+
t = helper.rayPlaneIntersectionMat(centroid, np.expand_dims(planeNormal, 0), np.zeros(meshCameraRays.shape), meshCameraRays)
215+
meshExtentPoints = t * meshCameraRays
211216

212-
# Project Glottal Midline Extrema into Pointcloud
213-
217+
218+
219+
220+
# Project Glottal Midline Extrema into Pointcloud
214221
upperMidLine, lowerMidLine = segmentator.glottalMidlines()[i]
215222

216223
# Search for the next best midline if the computation didn't work.
@@ -235,6 +242,7 @@ def controlPointBasedARAP(triangulatedPoints, camera, segmentator, zSubdivisions
235242
glottalOutlinePoints = glottalOutlinePoints - centroid
236243
gml_point1 = np.expand_dims(gml_point1, 0) - centroid
237244
gml_point2 = np.expand_dims(gml_point2, 0) - centroid
245+
meshExtentPoints = meshExtentPoints - centroid
238246

239247
# Compute rotation matrix, aligning the plane normal to the +Y Axis
240248
rotPlane = helper.rotateAlign(planeNormal/np.linalg.norm(planeNormal), np.array([0.0, 1.0, 0.0]))
@@ -244,6 +252,7 @@ def controlPointBasedARAP(triangulatedPoints, camera, segmentator, zSubdivisions
244252
glottalOutlinePoints = np.matmul(rotPlane, glottalOutlinePoints.T).T
245253
gml_point1 = np.matmul(rotPlane, gml_point1.T).T
246254
gml_point2 = np.matmul(rotPlane, gml_point2.T).T
255+
meshExtentPoints = np.matmul(rotPlane, meshExtentPoints.T).T
247256

248257

249258

@@ -258,6 +267,7 @@ def controlPointBasedARAP(triangulatedPoints, camera, segmentator, zSubdivisions
258267
gml_point1 = rotateX(gml_point1, -gmplAngle, deg=False)
259268
gml_point2 = rotateX(gml_point2, -gmplAngle, deg=False)
260269
glottalOutlinePoints = rotateX(glottalOutlinePoints, -gmplAngle, deg=False)
270+
meshExtentPoints = rotateX(meshExtentPoints, -gmplAngle, deg=False)
261271

262272

263273
# Move everything, such that the glottal midline lies directly on top of the Z Axis
@@ -266,16 +276,19 @@ def controlPointBasedARAP(triangulatedPoints, camera, segmentator, zSubdivisions
266276
gml_point1 -= np.array([[0.0, 0.0, zOffset]])
267277
gml_point2 -= np.array([[0.0, 0.0, zOffset]])
268278
glottalOutlinePoints -= np.array([0.0, 0.0, zOffset])
279+
meshExtentPoints -= np.array([0.0, 0.0, zOffset])
269280

270281
# Rotate everything around by 90 degrees again
271282
alignedPoints = rotateX(alignedPoints, -90).astype(np.float)
272283
gml_point1 = rotateX(gml_point1, -90).astype(np.float)
273284
gml_point2 = rotateX(gml_point2, -90).astype(np.float)
274285
glottalOutlinePoints = rotateX(glottalOutlinePoints, -90).astype(np.float)
286+
meshExtentPoints = rotateX(meshExtentPoints, -90).astype(np.float)
275287

276288

277289
# Set Y Values to zero of the glottal outline points
278290
glottalOutlinePoints[:, 1] = 0.0
291+
meshExtentPoints[:, 1] = 0.0
279292

280293
if flip_y:
281294
alignedPoints[:, 1] = -alignedPoints[:, 1]
@@ -294,7 +307,7 @@ def controlPointBasedARAP(triangulatedPoints, camera, segmentator, zSubdivisions
294307

295308
# Find X-Y-Z Extent of Vocalfolds to generate fitting M5 Model
296309
if first:
297-
minX, maxX, minY, maxY, minZ, maxZ = findXYZExtent(aligned)
310+
minX, maxX, minY, maxY, minZ, maxZ = findXYZExtent(meshExtentPoints)
298311
first = False
299312

300313
# Generate M5 Model for left and right vocalfold

source/feature_estimation.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -310,6 +310,8 @@ def compute_features(self, video: torch.tensor) -> None:
310310

311311
self._glottal_midlines = []
312312

313+
self._vocalfold_bounding_box = None
314+
313315
num_frames = video.shape[0]
314316
# video_clone = (video.clone().unsqueeze(1).float() / 255).repeat(1, 3, 1, 1)
315317
batch_size = 8
@@ -367,12 +369,16 @@ def compute_features(self, video: torch.tensor) -> None:
367369

368370
glottal_roi = torch.zeros(label.shape, device=labels.device)
369371
x, y, w, h = sorted_stats[-2][1]
372+
373+
if self._vocalfold_bounding_box is None:
374+
self._vocalfold_bounding_box = [torch.tensor([x, y]), torch.tensor([w, h])]
375+
376+
370377
glottal_roi[y:y+h, x:x+w] = 1
371378
wat.append(glottal_roi)
372379
wat = torch.stack(wat)
373380
labels = labels * wat
374381

375-
376382
end_event_nn.record()
377383
self._laserpoint_segmentations[i:i+batch_size] = (labels == 3) * 1
378384

0 commit comments

Comments
 (0)