|
| 1 | +//! AVX2 + PCLMULQDQ optimized POLYVAL implementation using R/F Algorithm |
| 2 | +//! Adapted from the implementation in the Apache 2.0+MIT-licensed HPCrypt library |
| 3 | +//! Copyright (c) 2024 HPCrypt Contributors |
| 4 | +//! |
| 5 | +//! Uses the R/F algorithm from "Efficient GHASH Implementation Using CLMUL": |
| 6 | +//! - 4 CLMULs per block for multiplication (R and F terms) |
| 7 | +//! - 1 CLMUL for reduction (Lemma 3) |
| 8 | +//! - 4-block aggregated processing with single reduction |
| 9 | +//! |
| 10 | +//! Key equations: |
| 11 | +//! - D = swap(H) ⊕ (H0 × P1) |
| 12 | +//! - R = M0×D1 ⊕ M1×H1 |
| 13 | +//! - F = M0×D0 ⊕ M1×H0 |
| 14 | +//! - Result = R ⊕ F1 ⊕ (x^64×F0) ⊕ (P1×F0) |
| 15 | +//! |
| 16 | +//! Performance: ~1.7x faster than Karatsuba with scalar reduction |
| 17 | +//! |
| 18 | +//! POLYVAL operates in GF(2^128) with polynomial x^128 + x^127 + x^126 + x^121 + 1 |
| 19 | +//! Unlike GHASH, POLYVAL uses little-endian byte ordering (no byte swap needed). |
| 20 | +//! |
| 21 | +//! <https://eprint.iacr.org/2025/2171.pdf> |
| 22 | +
|
| 23 | +#![allow(unsafe_op_in_unsafe_fn)] |
| 24 | + |
| 25 | +#[cfg(target_arch = "x86")] |
| 26 | +use core::arch::x86::*; |
| 27 | +#[cfg(target_arch = "x86_64")] |
| 28 | +use core::arch::x86_64::*; |
| 29 | + |
| 30 | +use crate::ParBlocks; |
| 31 | + |
| 32 | +/// P1 polynomial: x^63 + x^62 + x^57 = 0xC200000000000000 |
| 33 | +const P1: u64 = 0xC200000000000000; |
| 34 | + |
| 35 | +cpufeatures::new!(clmul, "pclmulqdq"); |
| 36 | +pub(crate) use clmul::InitToken; |
| 37 | + |
/// POLYVAL state using AVX2 + PCLMULQDQ with R/F algorithm
///
/// Holds the expanded key schedule together with the running 128-bit
/// accumulator. `Copy` is cheap here: the whole state is a fixed set of
/// `__m128i` values.
#[derive(Clone, Copy)]
pub(crate) struct State {
    /// Precomputed key powers H^1..H^4 and derived D constants
    key: ExpandedKey,
    /// Current accumulator
    acc: __m128i,
}
| 45 | + |
| 46 | +impl State { |
| 47 | + /// Create a new POLYVAL instance |
| 48 | + /// |
| 49 | + /// # Safety |
| 50 | + /// Requires AVX2 and PCLMULQDQ support |
| 51 | + #[target_feature(enable = "avx2", enable = "pclmulqdq")] |
| 52 | + pub(crate) unsafe fn new(h: &[u8; 16]) -> Self { |
| 53 | + Self { |
| 54 | + key: ExpandedKey::new(h), |
| 55 | + acc: _mm_setzero_si128(), |
| 56 | + } |
| 57 | + } |
| 58 | + |
| 59 | + /// Update with a single block (5 CLMULs) |
| 60 | + /// |
| 61 | + /// # Safety |
| 62 | + /// Requires AVX2 and PCLMULQDQ support |
| 63 | + #[target_feature(enable = "avx2", enable = "pclmulqdq")] |
| 64 | + #[inline] |
| 65 | + pub(crate) unsafe fn update_block(&mut self, block: &[u8; 16]) { |
| 66 | + // Load directly (POLYVAL uses little-endian, no byte swap) |
| 67 | + let data = _mm_loadu_si128(block.as_ptr() as *const __m128i); |
| 68 | + |
| 69 | + // XOR with accumulator |
| 70 | + self.acc = _mm_xor_si128(self.acc, data); |
| 71 | + |
| 72 | + // Multiply by H using R/F algorithm |
| 73 | + self.acc = gf128_mul_rf(self.acc, self.key.h1, self.key.d1); |
| 74 | + } |
| 75 | + |
| 76 | + /// Process 4 blocks with R/F algorithm and aggregated reduction |
| 77 | + /// |
| 78 | + /// Uses 16 CLMULs for multiplication (4 per block) + 1 CLMUL for reduction = 17 CLMULs total |
| 79 | + #[target_feature(enable = "avx2", enable = "pclmulqdq")] |
| 80 | + #[inline] |
| 81 | + pub(crate) unsafe fn proc_par_blocks(&mut self, par_blocks: &ParBlocks) { |
| 82 | + // Load all 4 blocks (no byte swap for POLYVAL) |
| 83 | + let m0 = _mm_loadu_si128(par_blocks[0].as_ptr() as *const __m128i); |
| 84 | + let m1 = _mm_loadu_si128(par_blocks[1].as_ptr() as *const __m128i); |
| 85 | + let m2 = _mm_loadu_si128(par_blocks[2].as_ptr() as *const __m128i); |
| 86 | + let m3 = _mm_loadu_si128(par_blocks[3].as_ptr() as *const __m128i); |
| 87 | + |
| 88 | + // XOR first block with accumulator |
| 89 | + let y0 = _mm_xor_si128(self.acc, m0); |
| 90 | + |
| 91 | + // R/F multiply all 4 blocks (16 CLMULs) |
| 92 | + let (r0, f0) = rf_mul_unreduced(y0, self.key.h4, self.key.d4); |
| 93 | + let (r1, f1) = rf_mul_unreduced(m1, self.key.h3, self.key.d3); |
| 94 | + let (r2, f2) = rf_mul_unreduced(m2, self.key.h2, self.key.d2); |
| 95 | + let (r3, f3) = rf_mul_unreduced(m3, self.key.h1, self.key.d1); |
| 96 | + |
| 97 | + // Aggregate R and F values |
| 98 | + let r = _mm_xor_si128(_mm_xor_si128(r0, r1), _mm_xor_si128(r2, r3)); |
| 99 | + let f = _mm_xor_si128(_mm_xor_si128(f0, f1), _mm_xor_si128(f2, f3)); |
| 100 | + |
| 101 | + // Single reduction (1 CLMUL) |
| 102 | + self.acc = reduce_rf(r, f); |
| 103 | + } |
| 104 | + |
| 105 | + /// Finalize and return the POLYVAL tag |
| 106 | + /// |
| 107 | + /// # Safety |
| 108 | + /// Requires AVX2 and PCLMULQDQ support |
| 109 | + #[target_feature(enable = "avx2", enable = "pclmulqdq")] |
| 110 | + pub(crate) unsafe fn finalize(self) -> [u8; 16] { |
| 111 | + // Output directly (POLYVAL uses little-endian, no byte swap) |
| 112 | + let mut output = [0u8; 16]; |
| 113 | + _mm_storeu_si128(output.as_mut_ptr() as *mut __m128i, self.acc); |
| 114 | + output |
| 115 | + } |
| 116 | + |
| 117 | + /// Reset for reuse with the same key |
| 118 | + /// |
| 119 | + /// # Safety |
| 120 | + /// Requires AVX2 and PCLMULQDQ support |
| 121 | + #[target_feature(enable = "avx2", enable = "pclmulqdq")] |
| 122 | + pub(crate) unsafe fn reset(&mut self) { |
| 123 | + self.acc = _mm_setzero_si128(); |
| 124 | + } |
| 125 | + |
| 126 | + /// Zeroize the internal state. |
| 127 | + #[cfg(feature = "zeroize")] |
| 128 | + #[target_feature(enable = "avx2", enable = "pclmulqdq")] |
| 129 | + pub(crate) unsafe fn zeroize(&mut self) { |
| 130 | + // TODO(tarcieri): zeroize |
| 131 | + } |
| 132 | +} |
| 133 | + |
/// Precomputed key material for POLYVAL using R/F algorithm
///
/// Stores H and D values for each power, where D = swap(H) ⊕ (H0 × P1).
/// Powers H^1..H^4 allow 4-block aggregated processing with one reduction.
#[derive(Clone, Copy)]
pub(crate) struct ExpandedKey {
    /// H^1 packed as [h1_hi : h1_lo]
    h1: __m128i,
    /// D^1 = swap(H^1) ⊕ (H^1_lo × P1), computed from H^1
    d1: __m128i,
    /// H^2 = H·H
    h2: __m128i,
    /// D^2, computed from H^2
    d2: __m128i,
    /// H^3 = H^2·H
    h3: __m128i,
    /// D^3, computed from H^3
    d3: __m128i,
    /// H^4 = H^2·H^2
    h4: __m128i,
    /// D^4, computed from H^4
    d4: __m128i,
}
| 156 | + |
| 157 | +impl ExpandedKey { |
| 158 | + /// Create a new POLYVAL key with R/F algorithm |
| 159 | + /// |
| 160 | + /// # Safety |
| 161 | + /// Requires AVX2 and PCLMULQDQ support |
| 162 | + #[target_feature(enable = "avx2", enable = "pclmulqdq")] |
| 163 | + pub(crate) unsafe fn new(h: &[u8; 16]) -> Self { |
| 164 | + // Load H directly (POLYVAL uses little-endian, no byte swap needed) |
| 165 | + let h1 = _mm_loadu_si128(h.as_ptr() as *const __m128i); |
| 166 | + let d1 = compute_d(h1); |
| 167 | + |
| 168 | + // Compute powers using R/F multiplication (same as GHASH) |
| 169 | + let h2 = gf128_mul_rf(h1, h1, d1); |
| 170 | + let d2 = compute_d(h2); |
| 171 | + |
| 172 | + let h3 = gf128_mul_rf(h2, h1, d1); |
| 173 | + let d3 = compute_d(h3); |
| 174 | + |
| 175 | + let h4 = gf128_mul_rf(h2, h2, d2); |
| 176 | + let d4 = compute_d(h4); |
| 177 | + |
| 178 | + Self { |
| 179 | + h1, |
| 180 | + d1, |
| 181 | + h2, |
| 182 | + d2, |
| 183 | + h3, |
| 184 | + d3, |
| 185 | + h4, |
| 186 | + d4, |
| 187 | + } |
| 188 | + } |
| 189 | +} |
| 190 | + |
| 191 | +/// Compute D from H using the R/F algorithm |
| 192 | +/// |
| 193 | +/// D = swap(H) ⊕ (H0 × P1) |
| 194 | +#[target_feature(enable = "pclmulqdq")] |
| 195 | +#[inline] |
| 196 | +unsafe fn compute_d(h: __m128i) -> __m128i { |
| 197 | + let p = _mm_set_epi64x(P1 as i64, 0); |
| 198 | + |
| 199 | + // Swap halves: [H1 : H0] -> [H0 : H1] |
| 200 | + let h_swap = _mm_shuffle_epi32(h, 0x4e); |
| 201 | + |
| 202 | + // T = H0 × P1 |
| 203 | + let t = _mm_clmulepi64_si128(h, p, 0x10); |
| 204 | + |
| 205 | + // D = swap(H) ⊕ T |
| 206 | + _mm_xor_si128(h_swap, t) |
| 207 | +} |
| 208 | + |
/// Unreduced R/F multiply: 4 CLMULs producing the (R, F) pair.
///
/// With M = [M1 : M0], H = [H1 : H0] and D = [D1 : D0]:
/// - R = M0×D1 ⊕ M1×H1 (2 CLMULs)
/// - F = M0×D0 ⊕ M1×H0 (2 CLMULs)
///
/// The pair is reduced later by [`reduce_rf`].
#[target_feature(enable = "pclmulqdq")]
#[inline]
unsafe fn rf_mul_unreduced(m: __m128i, h: __m128i, d: __m128i) -> (__m128i, __m128i) {
    // CLMUL selector: bit 0 chooses the 64-bit half of `m`,
    // bit 4 chooses the half of the second operand (0 = low, 1 = high).
    let m0_d1 = _mm_clmulepi64_si128(m, d, 0x10);
    let m1_h1 = _mm_clmulepi64_si128(m, h, 0x11);
    let m0_d0 = _mm_clmulepi64_si128(m, d, 0x00);
    let m1_h0 = _mm_clmulepi64_si128(m, h, 0x01);

    (
        _mm_xor_si128(m0_d1, m1_h1), // R
        _mm_xor_si128(m0_d0, m1_h0), // F
    )
}
| 231 | + |
| 232 | +/// Reduction using Lemma 3: Result = R ⊕ F1 ⊕ (x^64×F0) ⊕ (P1×F0) |
| 233 | +/// |
| 234 | +/// Uses 1 CLMUL for reduction |
| 235 | +#[target_feature(enable = "pclmulqdq")] |
| 236 | +#[inline] |
| 237 | +unsafe fn reduce_rf(r: __m128i, f: __m128i) -> __m128i { |
| 238 | + let p1 = _mm_set_epi64x(0, P1 as i64); |
| 239 | + |
| 240 | + // F1 in low position |
| 241 | + let f1 = _mm_srli_si128(f, 8); |
| 242 | + |
| 243 | + // x^64×F0 (shift F0 to high position) |
| 244 | + let f0_shifted = _mm_slli_si128(f, 8); |
| 245 | + |
| 246 | + // P1×F0 |
| 247 | + let p1_f0 = _mm_clmulepi64_si128(f, p1, 0x00); |
| 248 | + |
| 249 | + // Result = R ⊕ F1 ⊕ (x^64×F0) ⊕ (P1×F0) |
| 250 | + let result = _mm_xor_si128(r, f1); |
| 251 | + let result = _mm_xor_si128(result, f0_shifted); |
| 252 | + _mm_xor_si128(result, p1_f0) |
| 253 | +} |
| 254 | + |
| 255 | +/// Complete R/F multiplication with reduction (5 CLMULs total) |
| 256 | +#[target_feature(enable = "pclmulqdq")] |
| 257 | +#[inline] |
| 258 | +unsafe fn gf128_mul_rf(m: __m128i, h: __m128i, d: __m128i) -> __m128i { |
| 259 | + let (r, f) = rf_mul_unreduced(m, h, d); |
| 260 | + reduce_rf(r, f) |
| 261 | +} |
0 commit comments