falcon: generalize the SIMD around pubkey parsing

Rexicon226 · Rexicon226 · commit 9844b27fa666 · 2026-02-17T00:34:05.000-08:00
diff --git a/src/signatures/falcon.zig b/src/signatures/falcon.zig
@@ -27,7 +27,9 @@ fn Falcon(Hash: type, N: u32) type {
         pub const PublicKey = struct {
             h: Polynomial(N, Fq),
 
-            const V = @Vector(4, i16);
+            const length = 16;
+            const stride = (length / 4) * 7;
+            const V = @Vector(length, i16);
             const QV: V = @splat(Q);
 
             const BITS_PER_VALUE = 14;
@@ -45,32 +47,36 @@ fn Falcon(Hash: type, N: u32) type {
                 // values are compressed into a bit sequence of 14 * N bits, or 14N/8 bytes.
                 const h = bytes[1..];
                 var coeff: [N]Fq = undefined;
-                inline for (0..N / 4) |i| {
+                inline for (0..N / length) |i| {
                     // Given that each element is 14 bits, 7 bytes hold 4 elements (56 / 14 = 4).
-                    // We represent the elements as u32 for efficient arithmetics.
+                    // `stride` bytes hold `length` elements; we represent them as u32 for efficient arithmetic.
-                    const in = h[i * 7 ..][0..7];
-                    const out: *[4]u32 = @ptrCast(coeff[i * 4 ..][0..4]);
-                    const mask: @Vector(4, u32) = @splat((1 << 14) - 1);
+                    const in = h[i * stride ..][0..stride];
+                    const out: *[length]u32 = @ptrCast(coeff[i * length ..][0..length]);
+                    const mask: @Vector(length, u32) = @splat((1 << 14) - 1);
 
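-                    // We perform 2 movs to load words at `in` and `in + 3`.
-                    // The vector now contains 4 compressed elements (end-exclusive ranges):
+                    // For each 7-byte group, load the 4-byte words at group offsets 0 and 3 (twice each).
+                    // Each run of 4 lanes now holds 4 compressed elements (end-exclusive bit ranges):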
                     // 1. 00..14 (bytes 0, 1)
                     // 2. 14..28 (bytes 1, 2, 3)
                     // 3. 28..42 (bytes 3, 4, 5)
                     // 4. 42..56 (bytes 5, 6)
-                    const compressed: @Vector(4, u32) = .{
-                        @bitCast(in[0..4].*),
-                        @bitCast(in[0..4].*),
-                        @bitCast(in[3..7].*),
-                        @bitCast(in[3..7].*),
-                    };
-                    const shifted = @byteSwap(compressed) >> .{ 18, 4, 14, 0 };
+                    var compressed: @Vector(length, u32) = undefined;
+                    inline for (0..length / 4) |j| {
+                        @setEvalBranchQuota(length * 1_000);
+                        compressed[(j * 4) + 0] = @bitCast(in[j * 7 ..][0..4].*);
+                        compressed[(j * 4) + 1] = @bitCast(in[j * 7 ..][0..4].*);
+                        compressed[(j * 4) + 2] = @bitCast(in[j * 7 ..][3..7].*);
+                        compressed[(j * 4) + 3] = @bitCast(in[j * 7 ..][3..7].*);
+                    }
+                    // The same four per-lane shift amounts repeat for every group of four lanes.
+                    const lane_shifts = std.simd.repeat(length, [_]u5{ 18, 4, 14, 0 });
+                    const shifted = @byteSwap(compressed) >> lane_shifts;
                     // After the mask, each element fits into 14-bits, so it'll always fit into signed 16 bits.
                     const masked: V = @intCast(shifted & mask);
                     // We perform the modulus check in parallel, checking each element and returning
-                    // an error if any of the elements are greater than greater than or equal to the modulus.
+                    // an error if any of the elements are greater than or equal to the modulus.
                     if (@reduce(.Or, masked >= QV)) return error.InvalidCoeff;
-                    out.* = Fq.Vector(4).init(masked);
+                    out.* = Fq.Vector(length).init(masked);
                 }
 
                 return .{ .h = .{ .coeff = coeff } };
@@ -696,9 +702,7 @@ fn Falcon(Hash: type, N: u32) type {
 
                     // a[j] = a[j] * n^-1 mod q
                     const ninv = T.precompute.ninv;
-                    for (&a) |*aj| {
-                        aj.* = aj.mul(ninv);
-                    }
+                    for (&a) |*aj| aj.* = aj.mul(ninv);
 
                     return .{ .coeff = a };
                 }