Merge branch 'fix/GH-679-svd2' into 'main'

shi-eric · shi-eric · commit 93e3dd142eb3 · 2025-05-06T12:07:18.000-07:00
Fix GH-679: improve svd2 robustness and accuracy. See merge request omniverse/warp!1285
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -57,6 +57,7 @@
 - Fix 2D tile load when source array and tile have incompatible strides
   ([GH-688](https://github.com/NVIDIA/warp/issues/688)).
 - Fixed inconsistency in orientation of 2D geometry side normals ([GH-629](https://github.com/NVIDIA/warp/issues/629)).
+- Fixed `wp.svd2()` with duplicate singular values and improved accuracy ([GH-679](https://github.com/NVIDIA/warp/issues/679)).
 
 ## [1.7.1] - 2025-04-30
 
diff --git a/warp/native/svd.h b/warp/native/svd.h
@@ -60,17 +60,17 @@ struct _svd_config<double> {
     static constexpr int JACOBI_ITERATIONS = 8;
 };
 
-
-
-// TODO: replace sqrt with rsqrt
-
-template<typename Type>
-inline CUDA_CALLABLE
-Type accurateSqrt(Type x)
+template <typename Type> inline CUDA_CALLABLE Type recipSqrt(Type x)
 {
-  return x / sqrt(x);
+#if defined(__CUDA_ARCH__)
+    return ::rsqrt(x);
+#else
+    return Type(1) / sqrt(x);
+#endif
 }
 
+template <> inline CUDA_CALLABLE wp::half recipSqrt(wp::half x) { return wp::half(1) / sqrt(x); }
+
 template<typename Type>
 inline CUDA_CALLABLE
 void condSwap(bool c, Type &X, Type &Y)
@@ -175,7 +175,7 @@ void approximateGivensQuaternion(Type a11, Type a12, Type a22, Type &ch, Type &s
     ch = Type(2)*(a11-a22);
     sh = a12;
     bool b = Type(_gamma)*sh*sh < ch*ch;
-    Type w = Type(1) / sqrt(ch*ch+sh*sh);
+    Type w = recipSqrt(ch*ch+sh*sh);
     ch=b?w*ch:Type(_cstar);
     sh=b?w*sh:Type(_sstar);
 }
@@ -304,13 +304,13 @@ void QRGivensQuaternion(Type a1, Type a2, Type &ch, Type &sh)
     // a1 = pivot point on diagonal
     // a2 = lower triangular entry we want to annihilate
     const Type epsilon = _svd_config<Type>::QR_GIVENS_EPSILON;
-    Type rho = accurateSqrt(a1*a1 + a2*a2);
+    Type rho = sqrt(a1*a1 + a2*a2);
 
     sh = rho > epsilon ? a2 : Type(0);
     ch = abs(a1) + max(rho,epsilon);
     bool b = a1 < Type(0);
     condSwap(b,sh,ch);
-    Type w = Type(1) / sqrt(ch*ch+sh*sh);
+    Type w = recipSqrt(ch*ch+sh*sh);
     ch *= w;
     sh *= w;
 }
@@ -432,21 +432,15 @@ void _svd(// input A
     );
 }
 
-
-template<typename Type>
-inline CUDA_CALLABLE
-void _svd_2(// input A
-        Type a11, Type a12,
-        Type a21, Type a22,
-        // output U
-        Type &u11, Type &u12,
-        Type &u21, Type &u22,
-        // output S
-        Type &s11, Type &s12,
-        Type &s21, Type &s22,
-        // output V
-        Type &v11, Type &v12,
-        Type &v21, Type &v22)
+template <typename Type>
+inline CUDA_CALLABLE void _svd_2( // input A
+    Type a11, Type a12, Type a21, Type a22,
+    // output U
+    Type& u11, Type& u12, Type& u21, Type& u22,
+    // output S
+    Type& s1, Type& s2,
+    // output V
+    Type& v11, Type& v12, Type& v21, Type& v22)
 {
     // Step 1: Compute ATA
     Type ATA11 = a11 * a11 + a21 * a21;
@@ -455,39 +449,56 @@ void _svd_2(// input A
 
     // Step 2: Eigenanalysis
     Type trace = ATA11 + ATA22;
-    Type det = ATA11 * ATA22 - ATA12 * ATA12;
-    Type sqrt_term = sqrt(trace * trace - Type(4.0) * det);
-    Type lambda1 = (trace + sqrt_term) * Type(0.5);
-    Type lambda2 = (trace - sqrt_term) * Type(0.5);
+    Type diff = ATA11 - ATA22;
+    Type discriminant = diff * diff + Type(4) * ATA12 * ATA12;
 
     // Step 3: Singular values
-    Type sigma1 = sqrt(lambda1);
+    if (discriminant == Type(0))
+    {
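+        // Duplicate eigenvalue: A^T A = s^2 Id; assumes A ~ s Id (NOTE(review): a scaled rotation/reflection also lands here -- confirm U = V = Id is intended)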
+        s1 = s2 = sqrt(Type(0.5) * trace);
+        u11 = v11 = Type(1);
+        u12 = v12 = Type(0);
+        u21 = v21 = Type(0);
+        u22 = v22 = Type(1);
+        return;
+    }
+
+    // General case
+    Type sqrt_term = sqrt(discriminant);
+    Type lambda1 = (trace + sqrt_term) * Type(0.5);
+    Type lambda2 = (trace - sqrt_term) * Type(0.5);
+    Type inv_sigma1 = recipSqrt(lambda1);
+    Type sigma1 = Type(1) / inv_sigma1;
     Type sigma2 = sqrt(lambda2);
 
     // Step 4: Eigenvectors (find V)
-    Type v1x = ATA12, v1y = lambda1 - ATA11; // For first eigenvector
-    Type v2x = ATA12, v2y = lambda2 - ATA11; // For second eigenvector
-    Type norm1 = sqrt(v1x * v1x + v1y * v1y);
-    Type norm2 = sqrt(v2x * v2x + v2y * v2y);
-
-    v11 = v1x / norm1; v12 = v2x / norm2;
-    v21 = v1y / norm1; v22 = v2y / norm2;
+    Type v1y = diff - sqrt_term + Type(2) * ATA12, v1x = diff + sqrt_term - Type(2) * ATA12;
+    Type len1_sq = v1x * v1x + v1y * v1y;
+    if (len1_sq == Type(0)) {
+        v11 = Type(0.707106781186547524401); // M_SQRT1_2
+        v21 = v11;
+    } else {
+        Type inv_len1 = recipSqrt(len1_sq);
+        v11 = v1x * inv_len1;
+        v21 = v1y * inv_len1;
+    }
+    v12 = -v21;
+    v22 = v11;
 
     // Step 5: Compute U
-    Type inv_sigma1 = (sigma1 > Type(1e-6)) ? Type(1.0) / sigma1 : Type(0.0);
-    Type inv_sigma2 = (sigma2 > Type(1e-6)) ? Type(1.0) / sigma2 : Type(0.0);
-
     u11 = (a11 * v11 + a12 * v21) * inv_sigma1;
-    u12 = (a11 * v12 + a12 * v22) * inv_sigma2;
     u21 = (a21 * v11 + a22 * v21) * inv_sigma1;
-    u22 = (a21 * v12 + a22 * v22) * inv_sigma2;
+    // sigma2 may be zero, but we can complete U orthogonally up to determinant's sign
+    Type det_sign = wp::sign(a11 * a22 - a12 * a21);
+    u12 = -u21 * det_sign;
+    u22 = u11 * det_sign;
 
     // Step 6: Set S
-    s11 = sigma1; s12 = Type(0.0);
-    s21 = Type(0.0); s22 = sigma2;
+    s1 = sigma1;
+    s2 = sigma2;
 }
 
-
 template<typename Type>
 inline CUDA_CALLABLE void svd3(const mat_t<3,3,Type>& A, mat_t<3,3,Type>& U, vec_t<3,Type>& sigma, mat_t<3,3,Type>& V) {
   Type s12, s13, s21, s23, s31, s32;
@@ -550,15 +561,14 @@ inline CUDA_CALLABLE void adj_svd3(const mat_t<3,3,Type>& A,
 
 template<typename Type>
 inline CUDA_CALLABLE void svd2(const mat_t<2,2,Type>& A, mat_t<2,2,Type>& U, vec_t<2,Type>& sigma, mat_t<2,2,Type>& V) {
-  Type s12, s21;
   _svd_2(A.data[0][0], A.data[0][1],
        A.data[1][0], A.data[1][1],
 
        U.data[0][0], U.data[0][1],
        U.data[1][0], U.data[1][1],
 
-       sigma[0], s12,
-       s21, sigma[1],
+       sigma[0],
+       sigma[1],
 
        V.data[0][0], V.data[0][1],
        V.data[1][0], V.data[1][1]);
diff --git a/warp/tests/test_mat.py b/warp/tests/test_mat.py
@@ -1061,15 +1061,21 @@ def check_mat_svd2(
         Vout: wp.array(dtype=mat22),
         outcomponents: wp.array(dtype=wptype),
     ):
+        tid = wp.tid()
+
         U = mat22()
         sigma = vec2()
         V = mat22()
 
-        wp.svd2(m2[0], U, sigma, V)  # Assuming there's a 2D SVD kernel
+        wp.svd2(m2[tid], U, sigma, V)  # 2x2 SVD of this thread's input matrix
 
-        Uout[0] = U
-        sigmaout[0] = sigma
-        Vout[0] = V
+        Uout[tid] = U
+        sigmaout[tid] = sigma
+        Vout[tid] = V
+
+        # backprop test only for first input
+        if tid > 0:
+            return
 
         # multiply outputs by 2 so we've got something to backpropagate:
         idx = 0
@@ -1094,22 +1100,46 @@ def check_mat_svd2(
     if register_kernels:
         return
 
-    m2 = wp.array(randvals(rng, [1, 2, 2], dtype) + np.eye(2), dtype=mat22, requires_grad=True, device=device)
+    mats = np.concatenate(
+        (
+            randvals(rng, [24, 2, 2], dtype) + np.eye(2),
+            # rng unlikely to hit edge cases, build them manually
+            [
+                np.zeros((2, 2)),
+                np.eye(2),
+                5.0 * np.eye(2),
+                np.array([[1.0, 0.0], [0.0, 0.0]]),
+                np.array([[0.0, 0.0], [0.0, 2.0]]),
+                np.array([[1.0, 1.0], [-1.0, -1.0]]),
+                np.array([[3.0, 0.0], [4.0, 5.0]]),
+                np.eye(2) + tol * np.array([[1.0, 1.0], [-1.0, -1.0]]),
+            ],
+        ),
+        axis=0,
+    )
+    M = len(mats)
+    m2 = wp.array(mats, dtype=mat22, requires_grad=True, device=device)
 
     outcomponents = wp.zeros(2 * 2 * 2 + 2, dtype=wptype, requires_grad=True, device=device)
-    Uout = wp.zeros(1, dtype=mat22, requires_grad=True, device=device)
-    sigmaout = wp.zeros(1, dtype=vec2, requires_grad=True, device=device)
-    Vout = wp.zeros(1, dtype=mat22, requires_grad=True, device=device)
+    Uout = wp.zeros(M, dtype=mat22, requires_grad=True, device=device)
+    sigmaout = wp.zeros(M, dtype=vec2, requires_grad=True, device=device)
+    Vout = wp.zeros(M, dtype=mat22, requires_grad=True, device=device)
 
-    wp.launch(kernel, dim=1, inputs=[m2], outputs=[Uout, sigmaout, Vout, outcomponents], device=device)
+    wp.launch(kernel, dim=M, inputs=[m2], outputs=[Uout, sigmaout, Vout, outcomponents], device=device)
 
-    Uout_np = Uout.numpy()[0].astype(np.float64)
-    sigmaout_np = np.diag(sigmaout.numpy()[0].astype(np.float64))
-    Vout_np = Vout.numpy()[0].astype(np.float64)
+    Uout_np = Uout.numpy().astype(np.float64)
+    sigmaout_np = sigmaout.numpy().astype(np.float64)
+    Vout_np = Vout.numpy().astype(np.float64)
+
+    USVt_np = Uout_np @ (sigmaout_np[..., None] * np.transpose(Vout_np, axes=(0, 2, 1)))
 
     assert_np_equal(
-        np.matmul(Uout_np, np.matmul(sigmaout_np, Vout_np.T)), m2.numpy()[0].astype(np.float64), tol=30 * tol
+        Uout_np @ np.transpose(Uout_np, axes=(0, 2, 1)), np.broadcast_to(np.eye(2), shape=(M, 2, 2)), tol=30 * tol
     )
+    assert_np_equal(
+        Vout_np @ np.transpose(Vout_np, axes=(0, 2, 1)), np.broadcast_to(np.eye(2), shape=(M, 2, 2)), tol=30 * tol
+    )
+    assert_np_equal(USVt_np, m2.numpy().astype(np.float64), tol=30 * tol)
 
     if dtype == np.float16:
         # Skip gradient check for float16 due to rounding errors
@@ -1128,7 +1158,7 @@ def check_mat_svd2(
 
         tape.zero()
 
-        dx = 0.0001
+        dx = 0.001
         fdtol = 5.0e-4 if dtype == np.float64 else 2.0e-2
         for ii in range(2):
             for jj in range(2):
@@ -1163,9 +1193,9 @@ def test_qr(test, device, dtype, register_kernels=False):
     rng = np.random.default_rng(123)
 
     tol = {
-        np.float16: 2.0e-3,
+        np.float16: 2.5e-3,
         np.float32: 1.0e-6,
-        np.float64: 1.0e-6,
+        np.float64: 1.0e-12,
     }.get(dtype, 0)
 
     wptype = wp.types.np_dtype_to_warp_type[np.dtype(dtype)]