Skip to content

Commit 5574995

Browse files
authored
polyval: refactor FieldElement::karatsuba_mul (#295)
Implements Karatsuba as a crate-private method on `FieldElement` that returns a 256-bit `Product` type. `Product` values can either be XORed together (via `BitXor`) or reduced back into a 128-bit `FieldElement`. This should make it easier for the software backend to implement Karatsuba + powers-of-H in the event we can't use the intrinsics-based R/F algorithm.
1 parent a327d42 commit 5574995

File tree

3 files changed

+191
-165
lines changed

3 files changed

+191
-165
lines changed

polyval/src/field_element.rs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -149,8 +149,7 @@ impl Mul for FieldElement {
149149
/// Perform carryless multiplication within POLYVAL's field modulo its polynomial.
150150
#[inline]
151151
fn mul(self, rhs: Self) -> Self {
152-
let v = mul::karatsuba(self, rhs);
153-
mul::mont_reduce(v)
152+
self.karatsuba_mul(rhs).mont_reduce()
154153
}
155154
}
156155

@@ -168,7 +167,7 @@ impl Zeroize for FieldElement {
168167
}
169168
}
170169

171-
/// Multiplication in GF(2)[X], implemented generically and wrapped as `bmul32` and `bmul64`.
170+
/// Multiplication in GF(2)[X], implemented generically for use with `u32` and `u64`.
172171
///
173172
/// Uses "holes" (sequences of zeroes) to avoid carry spilling, as specified in the mask operand
174173
/// `m0` which should have a full-width value with the following bit pattern:

polyval/src/field_element/mul32.rs

Lines changed: 115 additions & 103 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,98 @@
2525
//! In other words, if we bit-reverse (over 32-bits) the operands then we bit-reverse (over 64-bits)
2626
//! the result.
2727
28+
use super::bmul;
2829
use crate::field_element::FieldElement;
30+
use core::{array, ops::BitXor};
2931

3032
impl FieldElement {
33+
/// Compute the unreduced 256-bit carryless product of two 128-bit field elements using 32-bit
34+
/// limbs.
35+
///
36+
/// Uses a Karatsuba decomposition in which the 128x128 multiplication is reduced to three 64x64
37+
/// multiplications, hence nine 32x32 multiplications. With the bit-reversal trick, we have to
38+
/// perform 18 32x32 multiplications.
39+
#[inline]
40+
pub(crate) fn karatsuba_mul(self, rhs: FieldElement) -> Product {
41+
let hw = self.to_u32x4();
42+
let yw = rhs.to_u32x4();
43+
let hwr = [
44+
hw[0].reverse_bits(),
45+
hw[1].reverse_bits(),
46+
hw[2].reverse_bits(),
47+
hw[3].reverse_bits(),
48+
];
49+
50+
// Karatsuba input decomposition for `rhs` (limbs `yw`), normal and bit-reversed
51+
let mut a = [0u32; 18];
52+
a[0] = yw[0];
53+
a[1] = yw[1];
54+
a[2] = yw[2];
55+
a[3] = yw[3];
56+
a[4] = a[0] ^ a[1];
57+
a[5] = a[2] ^ a[3];
58+
a[6] = a[0] ^ a[2];
59+
a[7] = a[1] ^ a[3];
60+
a[8] = a[6] ^ a[7];
61+
a[9] = yw[0].reverse_bits();
62+
a[10] = yw[1].reverse_bits();
63+
a[11] = yw[2].reverse_bits();
64+
a[12] = yw[3].reverse_bits();
65+
a[13] = a[9] ^ a[10];
66+
a[14] = a[11] ^ a[12];
67+
a[15] = a[9] ^ a[11];
68+
a[16] = a[10] ^ a[12];
69+
a[17] = a[15] ^ a[16];
70+
71+
// Karatsuba input decomposition for `self` (limbs `hw`), normal and bit-reversed
72+
let mut b = [0u32; 18];
73+
b[0] = hw[0];
74+
b[1] = hw[1];
75+
b[2] = hw[2];
76+
b[3] = hw[3];
77+
b[4] = b[0] ^ b[1];
78+
b[5] = b[2] ^ b[3];
79+
b[6] = b[0] ^ b[2];
80+
b[7] = b[1] ^ b[3];
81+
b[8] = b[6] ^ b[7];
82+
b[9] = hwr[0];
83+
b[10] = hwr[1];
84+
b[11] = hwr[2];
85+
b[12] = hwr[3];
86+
b[13] = b[9] ^ b[10];
87+
b[14] = b[11] ^ b[12];
88+
b[15] = b[9] ^ b[11];
89+
b[16] = b[10] ^ b[12];
90+
b[17] = b[15] ^ b[16];
91+
92+
// 18 carryless 32x32 multiplications
93+
let mut c = [0u32; 18];
94+
for i in 0..18 {
95+
c[i] = bmul(a[i], b[i], 0x1111_1111);
96+
}
97+
98+
// Karatsuba recombination (normal)
99+
c[4] ^= c[0] ^ c[1];
100+
c[5] ^= c[2] ^ c[3];
101+
c[8] ^= c[6] ^ c[7];
102+
103+
// Karatsuba recombination (bit-reversed)
104+
c[13] ^= c[9] ^ c[10];
105+
c[14] ^= c[11] ^ c[12];
106+
c[17] ^= c[15] ^ c[16];
107+
108+
// Assemble the final 256-bit product as `u32` x 8.
109+
let zw0 = c[0];
110+
let zw1 = c[4] ^ c[9].reverse_bits() >> 1;
111+
let zw2 = c[1] ^ c[0] ^ c[2] ^ c[6] ^ c[13].reverse_bits() >> 1;
112+
let zw3 = c[4] ^ c[5] ^ c[8] ^ (c[10] ^ c[9] ^ c[11] ^ c[15]).reverse_bits() >> 1;
113+
let zw4 = c[2] ^ c[1] ^ c[3] ^ c[7] ^ (c[13] ^ c[14] ^ c[17]).reverse_bits() >> 1;
114+
let zw5 = c[5] ^ (c[11] ^ c[10] ^ c[12] ^ c[16]).reverse_bits() >> 1;
115+
let zw6 = c[3] ^ c[14].reverse_bits() >> 1;
116+
let zw7 = c[12].reverse_bits() >> 1;
117+
Product([zw0, zw1, zw2, zw3, zw4, zw5, zw6, zw7])
118+
}
119+
31120
#[inline]
32121
fn from_u32x4(v: [u32; 4]) -> FieldElement {
33122
let mut bytes = [0u8; 16];
@@ -49,112 +138,35 @@ impl FieldElement {
49138
}
50139
}
51140

52-
/// Compute the unreduced 256-bit carryless product of two 128-bit field elements using 32-bit
53-
/// limbs.
54-
///
55-
/// Uses a Karatsuba decomposition in which the 128x128 multiplication is reduced to three 64x64
56-
/// multiplications, hence nine 32x32 multiplications. With the bit-reversal trick, we have to
57-
/// perform 18 32x32 multiplications.
58-
#[inline]
59-
pub(super) fn karatsuba(h: FieldElement, y: FieldElement) -> [u32; 8] {
60-
let hw = h.to_u32x4();
61-
let yw = y.to_u32x4();
62-
let hwr = [
63-
hw[0].reverse_bits(),
64-
hw[1].reverse_bits(),
65-
hw[2].reverse_bits(),
66-
hw[3].reverse_bits(),
67-
];
68-
69-
// Karatsuba input decomposition for H
70-
let mut a = [0u32; 18];
71-
a[0] = yw[0];
72-
a[1] = yw[1];
73-
a[2] = yw[2];
74-
a[3] = yw[3];
75-
a[4] = a[0] ^ a[1];
76-
a[5] = a[2] ^ a[3];
77-
a[6] = a[0] ^ a[2];
78-
a[7] = a[1] ^ a[3];
79-
a[8] = a[6] ^ a[7];
80-
a[9] = yw[0].reverse_bits();
81-
a[10] = yw[1].reverse_bits();
82-
a[11] = yw[2].reverse_bits();
83-
a[12] = yw[3].reverse_bits();
84-
a[13] = a[9] ^ a[10];
85-
a[14] = a[11] ^ a[12];
86-
a[15] = a[9] ^ a[11];
87-
a[16] = a[10] ^ a[12];
88-
a[17] = a[15] ^ a[16];
89-
90-
// Karatsuba input decomposition for Y
91-
let mut b = [0u32; 18];
92-
b[0] = hw[0];
93-
b[1] = hw[1];
94-
b[2] = hw[2];
95-
b[3] = hw[3];
96-
b[4] = b[0] ^ b[1];
97-
b[5] = b[2] ^ b[3];
98-
b[6] = b[0] ^ b[2];
99-
b[7] = b[1] ^ b[3];
100-
b[8] = b[6] ^ b[7];
101-
b[9] = hwr[0];
102-
b[10] = hwr[1];
103-
b[11] = hwr[2];
104-
b[12] = hwr[3];
105-
b[13] = b[9] ^ b[10];
106-
b[14] = b[11] ^ b[12];
107-
b[15] = b[9] ^ b[11];
108-
b[16] = b[10] ^ b[12];
109-
b[17] = b[15] ^ b[16];
110-
111-
// 18 carryless 32x32 multiplications
112-
let mut c = [0u32; 18];
113-
for i in 0..18 {
114-
c[i] = bmul32(a[i], b[i]);
115-
}
141+
/// Unreduced 256-bit carryless product.
142+
pub(crate) struct Product([u32; 8]);
116143

117-
// Karatsuba recombination (normal)
118-
c[4] ^= c[0] ^ c[1];
119-
c[5] ^= c[2] ^ c[3];
120-
c[8] ^= c[6] ^ c[7];
121-
122-
// Karatsuba recombination (bit-reversed)
123-
c[13] ^= c[9] ^ c[10];
124-
c[14] ^= c[11] ^ c[12];
125-
c[17] ^= c[15] ^ c[16];
126-
127-
// Assemble the final 256-bit product as `u32` x 8.
128-
let zw0 = c[0];
129-
let zw1 = c[4] ^ c[9].reverse_bits() >> 1;
130-
let zw2 = c[1] ^ c[0] ^ c[2] ^ c[6] ^ c[13].reverse_bits() >> 1;
131-
let zw3 = c[4] ^ c[5] ^ c[8] ^ (c[10] ^ c[9] ^ c[11] ^ c[15]).reverse_bits() >> 1;
132-
let zw4 = c[2] ^ c[1] ^ c[3] ^ c[7] ^ (c[13] ^ c[14] ^ c[17]).reverse_bits() >> 1;
133-
let zw5 = c[5] ^ (c[11] ^ c[10] ^ c[12] ^ c[16]).reverse_bits() >> 1;
134-
let zw6 = c[3] ^ c[14].reverse_bits() >> 1;
135-
let zw7 = c[12].reverse_bits() >> 1;
136-
[zw0, zw1, zw2, zw3, zw4, zw5, zw6, zw7]
137-
}
144+
impl Product {
145+
/// Reduce the 256-bit carryless Karatsuba product modulo the POLYVAL polynomial.
146+
///
147+
/// This performs constant-time folding using shifts and XORs corresponding to the irreducible
148+
/// polynomial `x^128 + x^127 + x^126 + x^121 + 1`.
149+
///
150+
/// This is closely related to GHASH reduction but the bit order is reversed in POLYVAL.
151+
#[inline]
152+
pub(crate) fn mont_reduce(self) -> FieldElement {
153+
let mut zw = self.0;
138154

139-
/// Carryless multiplication in GF(2)[X], truncated to the low 32-bits.
140-
#[inline]
141-
fn bmul32(x: u32, y: u32) -> u32 {
142-
super::bmul(x, y, 0x1111_1111)
143-
}
155+
for i in 0..4 {
156+
let lw = zw[i];
157+
zw[i + 4] ^= lw ^ (lw >> 1) ^ (lw >> 2) ^ (lw >> 7);
158+
zw[i + 3] ^= (lw << 31) ^ (lw << 30) ^ (lw << 25);
159+
}
144160

145-
/// Reduce the 256-bit carryless product of Karatsuba modulo the POLYVAL polynomial.
146-
///
147-
/// This performs constant-time folding using shifts and XORs corresponding to the irreducible
148-
/// polynomial `x^128 + x^127 + x^126 + x^121 + 1`.
149-
///
150-
/// This is closely related to GHASH reduction but the bit order is reversed in POLYVAL.
151-
#[inline]
152-
pub(super) fn mont_reduce(mut zw: [u32; 8]) -> FieldElement {
153-
for i in 0..4 {
154-
let lw = zw[i];
155-
zw[i + 4] ^= lw ^ (lw >> 1) ^ (lw >> 2) ^ (lw >> 7);
156-
zw[i + 3] ^= (lw << 31) ^ (lw << 30) ^ (lw << 25);
161+
FieldElement::from_u32x4([zw[4], zw[5], zw[6], zw[7]])
157162
}
163+
}
164+
165+
impl BitXor for Product {
166+
type Output = Self;
158167

159-
FieldElement::from_u32x4([zw[4], zw[5], zw[6], zw[7]])
168+
#[inline]
169+
fn bitxor(self, rhs: Self) -> Self::Output {
170+
Self(array::from_fn(|n| self.0[n] ^ rhs.0[n]))
171+
}
160172
}

0 commit comments

Comments
 (0)