Skip to content

Commit 1e1f2e5

Browse files
committed
[WIP] polyval: implement R/F algorithm
1 parent fed0f15 commit 1e1f2e5

File tree

16 files changed

+849
-784
lines changed

16 files changed

+849
-784
lines changed

polyval/src/backend.rs

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
// Portable pure-Rust fallback; always compiled because the autodetect
// dispatcher also uses it as its non-intrinsics path.
mod soft;

// Compile-time backend selection. The custom `polyval_backend = "soft"` cfg
// forces the portable implementation regardless of target architecture.
//
// NOTE(review): `cpubits::cfg_if!` — the `cfg_if!` macro conventionally comes
// from the `cfg_if` crate; confirm that `cpubits` really re-exports it.
cpubits::cfg_if! {
    if #[cfg(all(target_arch = "aarch64", not(polyval_backend = "soft")))] {
        // PMULL/NEON backend for aarch64
        mod autodetect;
        mod neon;
        pub(crate) use autodetect::State;
        pub(crate) use neon::InitToken;
    } else if #[cfg(all(
        any(target_arch = "x86_64", target_arch = "x86"),
        not(polyval_backend = "soft")
    ))] {
        // CLMUL/AVX2 backend for x86/x86-64
        mod autodetect;
        mod avx2;
        pub(crate) use autodetect::State;
        pub(crate) use avx2::InitToken;
    } else {
        // "soft" pure Rust portable fallback implementation for other targets
        pub(crate) use soft::{State, InitToken};
    }
}

polyval/src/backend/autodetect.rs

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
//! Support for CPU feature autodetection with a portable pure Rust fallback.
2+
3+
use super::{InitToken, soft};
4+
use crate::{Block, FieldElement, ParBlocks, Tag};
5+
6+
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
7+
use super::avx2 as intrinsics;
8+
#[cfg(target_arch = "aarch64")]
9+
use super::neon as intrinsics;
10+
11+
/// Backend state held as an *untagged* union of the intrinsics-accelerated
/// state and the portable "soft" state.
///
/// Which field is live is NOT stored here: every method on this type takes an
/// `InitToken`, and soundness requires that the token's `get()` result is the
/// same as it was when `new` chose which field to initialize. Reading the
/// other field is undefined behavior.
pub(crate) union State {
    // Live when `has_intrinsics.get()` was true at construction.
    intrinsics: intrinsics::State,
    // Live otherwise; portable pure-Rust fallback state.
    soft: soft::State,
}
15+
16+
impl State {
    /// Initialize POLYVAL state from the hash key `h`.
    ///
    /// The `has_intrinsics` token decides which union field is initialized;
    /// every subsequent call MUST pass a token yielding the same `get()`
    /// value, otherwise the wrong union field will be accessed (UB).
    pub(crate) fn new(h: FieldElement, has_intrinsics: InitToken) -> Self {
        if has_intrinsics.get() {
            Self {
                // SAFETY-relevant: caller-side CPU detection gates this path.
                intrinsics: unsafe { intrinsics::State::new(&h.into()) },
            }
        } else {
            Self {
                soft: soft::State::new(h, soft::InitToken::init()),
            }
        }
    }

    /// Absorb a single block into the accumulator.
    pub(crate) fn proc_block(&mut self, block: &Block, has_intrinsics: InitToken) {
        if has_intrinsics.get() {
            // Accesses the `intrinsics` union field; valid only if it is live.
            unsafe { self.intrinsics.update_block(&block.0) }
        } else {
            unsafe { self.soft.proc_block(block, soft::InitToken::init()) }
        }
    }

    /// Absorb several blocks at once (backend may aggregate the reduction).
    pub(crate) fn proc_par_blocks(&mut self, par_blocks: &ParBlocks, has_intrinsics: InitToken) {
        if has_intrinsics.get() {
            unsafe { self.intrinsics.proc_par_blocks(par_blocks) }
        } else {
            unsafe {
                self.soft
                    .proc_par_blocks(par_blocks, soft::InitToken::init())
            }
        }
    }

    /// Produce the authentication tag from the current accumulator.
    /// Does not consume the state; processing may continue afterwards.
    pub(crate) fn finalize(&self, has_intrinsics: InitToken) -> Tag {
        if has_intrinsics.get() {
            unsafe { self.intrinsics.finalize().into() }
        } else {
            unsafe { self.soft.finalize(soft::InitToken::init()) }
        }
    }

    /// Reset the accumulator for reuse with the same key.
    pub(crate) fn reset(&mut self, has_intrinsics: InitToken) {
        if has_intrinsics.get() {
            unsafe { self.intrinsics.reset() }
        } else {
            unsafe { self.soft.reset(soft::InitToken::init()) }
        }
    }

    /// Clone the state. A plain `#[derive(Clone)]` is impossible on a union,
    /// so the token is needed to know which field to copy.
    pub(crate) fn clone_with_intrinsics(&self, has_intrinsics: InitToken) -> Self {
        if has_intrinsics.get() {
            Self {
                intrinsics: unsafe { self.intrinsics.clone() },
            }
        } else {
            Self {
                soft: unsafe { self.soft.clone() },
            }
        }
    }

    /// Wipe secret state. Delegates to the live backend's `zeroize`.
    #[cfg(feature = "zeroize")]
    pub(crate) fn zeroize(&mut self, has_intrinsics: InitToken) {
        if has_intrinsics.get() {
            unsafe {
                self.intrinsics.zeroize();
            }
        } else {
            unsafe {
                self.soft.zeroize(soft::InitToken::init());
            }
        }
    }
}

polyval/src/backend/avx2.rs

Lines changed: 261 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,261 @@
1+
//! AVX2 + PCLMULQDQ optimized POLYVAL implementation using R/F Algorithm
2+
//! Adapted from the implementation in the Apache 2.0+MIT-licensed HPCrypt library
3+
//! Copyright (c) 2024 HPCrypt Contributors
4+
//!
5+
//! Uses the R/F algorithm from "Efficient GHASH Implementation Using CLMUL":
6+
//! - 4 CLMULs per block for multiplication (R and F terms)
7+
//! - 1 CLMUL for reduction (Lemma 3)
8+
//! - 4-block aggregated processing with single reduction
9+
//!
10+
//! Key equations:
11+
//! - D = swap(H) ⊕ (H0 × P1)
12+
//! - R = M0×D1 ⊕ M1×H1
13+
//! - F = M0×D0 ⊕ M1×H0
14+
//! - Result = R ⊕ F1 ⊕ (x^64×F0) ⊕ (P1×F0)
15+
//!
16+
//! Performance: ~1.7x faster than Karatsuba with scalar reduction
17+
//!
18+
//! POLYVAL operates in GF(2^128) with polynomial x^128 + x^127 + x^126 + x^121 + 1
19+
//! Unlike GHASH, POLYVAL uses little-endian byte ordering (no byte swap needed).
20+
//!
21+
//! <https://eprint.iacr.org/2025/2171.pdf>
22+
23+
#![allow(unsafe_op_in_unsafe_fn)]
24+
25+
#[cfg(target_arch = "x86")]
26+
use core::arch::x86::*;
27+
#[cfg(target_arch = "x86_64")]
28+
use core::arch::x86_64::*;
29+
30+
use crate::ParBlocks;
31+
32+
/// P1 polynomial: x^63 + x^62 + x^57 = 0xC200000000000000
33+
const P1: u64 = 0xC200000000000000;
34+
35+
cpufeatures::new!(clmul, "pclmulqdq");
36+
pub(crate) use clmul::InitToken;
37+
38+
/// POLYVAL state using AVX2 + PCLMULQDQ with R/F algorithm
#[derive(Clone, Copy)]
pub(crate) struct State {
    /// Expanded key material: H^1..H^4 plus their precomputed D companions.
    key: ExpandedKey,
    /// Current accumulator
    acc: __m128i,
}
45+
46+
impl State {
47+
/// Create a new POLYVAL instance
48+
///
49+
/// # Safety
50+
/// Requires AVX2 and PCLMULQDQ support
51+
#[target_feature(enable = "avx2", enable = "pclmulqdq")]
52+
pub(crate) unsafe fn new(h: &[u8; 16]) -> Self {
53+
Self {
54+
key: ExpandedKey::new(h),
55+
acc: _mm_setzero_si128(),
56+
}
57+
}
58+
59+
/// Update with a single block (5 CLMULs)
60+
///
61+
/// # Safety
62+
/// Requires AVX2 and PCLMULQDQ support
63+
#[target_feature(enable = "avx2", enable = "pclmulqdq")]
64+
#[inline]
65+
pub(crate) unsafe fn update_block(&mut self, block: &[u8; 16]) {
66+
// Load directly (POLYVAL uses little-endian, no byte swap)
67+
let data = _mm_loadu_si128(block.as_ptr() as *const __m128i);
68+
69+
// XOR with accumulator
70+
self.acc = _mm_xor_si128(self.acc, data);
71+
72+
// Multiply by H using R/F algorithm
73+
self.acc = gf128_mul_rf(self.acc, self.key.h1, self.key.d1);
74+
}
75+
76+
/// Process 4 blocks with R/F algorithm and aggregated reduction
77+
///
78+
/// Uses 16 CLMULs for multiplication (4 per block) + 1 CLMUL for reduction = 17 CLMULs total
79+
#[target_feature(enable = "avx2", enable = "pclmulqdq")]
80+
#[inline]
81+
pub(crate) unsafe fn proc_par_blocks(&mut self, par_blocks: &ParBlocks) {
82+
// Load all 4 blocks (no byte swap for POLYVAL)
83+
let m0 = _mm_loadu_si128(par_blocks[0].as_ptr() as *const __m128i);
84+
let m1 = _mm_loadu_si128(par_blocks[1].as_ptr() as *const __m128i);
85+
let m2 = _mm_loadu_si128(par_blocks[2].as_ptr() as *const __m128i);
86+
let m3 = _mm_loadu_si128(par_blocks[3].as_ptr() as *const __m128i);
87+
88+
// XOR first block with accumulator
89+
let y0 = _mm_xor_si128(self.acc, m0);
90+
91+
// R/F multiply all 4 blocks (16 CLMULs)
92+
let (r0, f0) = rf_mul_unreduced(y0, self.key.h4, self.key.d4);
93+
let (r1, f1) = rf_mul_unreduced(m1, self.key.h3, self.key.d3);
94+
let (r2, f2) = rf_mul_unreduced(m2, self.key.h2, self.key.d2);
95+
let (r3, f3) = rf_mul_unreduced(m3, self.key.h1, self.key.d1);
96+
97+
// Aggregate R and F values
98+
let r = _mm_xor_si128(_mm_xor_si128(r0, r1), _mm_xor_si128(r2, r3));
99+
let f = _mm_xor_si128(_mm_xor_si128(f0, f1), _mm_xor_si128(f2, f3));
100+
101+
// Single reduction (1 CLMUL)
102+
self.acc = reduce_rf(r, f);
103+
}
104+
105+
/// Finalize and return the POLYVAL tag
106+
///
107+
/// # Safety
108+
/// Requires AVX2 and PCLMULQDQ support
109+
#[target_feature(enable = "avx2", enable = "pclmulqdq")]
110+
pub(crate) unsafe fn finalize(self) -> [u8; 16] {
111+
// Output directly (POLYVAL uses little-endian, no byte swap)
112+
let mut output = [0u8; 16];
113+
_mm_storeu_si128(output.as_mut_ptr() as *mut __m128i, self.acc);
114+
output
115+
}
116+
117+
/// Reset for reuse with the same key
118+
///
119+
/// # Safety
120+
/// Requires AVX2 and PCLMULQDQ support
121+
#[target_feature(enable = "avx2", enable = "pclmulqdq")]
122+
pub(crate) unsafe fn reset(&mut self) {
123+
self.acc = _mm_setzero_si128();
124+
}
125+
126+
/// Zeroize the internal state.
127+
#[cfg(feature = "zeroize")]
128+
#[target_feature(enable = "avx2", enable = "pclmulqdq")]
129+
pub(crate) unsafe fn zeroize(&mut self) {
130+
// TODO(tarcieri): zeroize
131+
}
132+
}
133+
134+
/// Precomputed key material for POLYVAL using R/F algorithm
///
/// Stores H and D values for each power, where D = swap(H) ⊕ (H0 × P1).
/// Powers up to H^4 allow 4-block aggregated processing with one reduction.
#[derive(Clone, Copy)]
pub(crate) struct ExpandedKey {
    /// H^1 packed as [h1_hi : h1_lo]
    h1: __m128i,
    /// D^1 = computed from H^1
    d1: __m128i,
    /// H^2
    h2: __m128i,
    /// D^2
    d2: __m128i,
    /// H^3
    h3: __m128i,
    /// D^3
    d3: __m128i,
    /// H^4
    h4: __m128i,
    /// D^4
    d4: __m128i,
}
156+
157+
impl ExpandedKey {
158+
/// Create a new POLYVAL key with R/F algorithm
159+
///
160+
/// # Safety
161+
/// Requires AVX2 and PCLMULQDQ support
162+
#[target_feature(enable = "avx2", enable = "pclmulqdq")]
163+
pub(crate) unsafe fn new(h: &[u8; 16]) -> Self {
164+
// Load H directly (POLYVAL uses little-endian, no byte swap needed)
165+
let h1 = _mm_loadu_si128(h.as_ptr() as *const __m128i);
166+
let d1 = compute_d(h1);
167+
168+
// Compute powers using R/F multiplication (same as GHASH)
169+
let h2 = gf128_mul_rf(h1, h1, d1);
170+
let d2 = compute_d(h2);
171+
172+
let h3 = gf128_mul_rf(h2, h1, d1);
173+
let d3 = compute_d(h3);
174+
175+
let h4 = gf128_mul_rf(h2, h2, d2);
176+
let d4 = compute_d(h4);
177+
178+
Self {
179+
h1,
180+
d1,
181+
h2,
182+
d2,
183+
h3,
184+
d3,
185+
h4,
186+
d4,
187+
}
188+
}
189+
}
190+
191+
/// Compute D from H using the R/F algorithm
192+
///
193+
/// D = swap(H) ⊕ (H0 × P1)
194+
#[target_feature(enable = "pclmulqdq")]
195+
#[inline]
196+
unsafe fn compute_d(h: __m128i) -> __m128i {
197+
let p = _mm_set_epi64x(P1 as i64, 0);
198+
199+
// Swap halves: [H1 : H0] -> [H0 : H1]
200+
let h_swap = _mm_shuffle_epi32(h, 0x4e);
201+
202+
// T = H0 × P1
203+
let t = _mm_clmulepi64_si128(h, p, 0x10);
204+
205+
// D = swap(H) ⊕ T
206+
_mm_xor_si128(h_swap, t)
207+
}
208+
209+
/// Half-reduced R/F product of a message block with a key pair (H, D).
///
/// With M = [M1 : M0], H = [H1 : H0] and D = [D1 : D0], returns (R, F):
/// - R = M0×D1 ⊕ M1×H1
/// - F = M0×D0 ⊕ M1×H0
///
/// Exactly four carry-less multiplications; the caller combines one or more
/// (R, F) pairs and finishes with `reduce_rf`.
#[target_feature(enable = "pclmulqdq")]
#[inline]
unsafe fn rf_mul_unreduced(m: __m128i, h: __m128i, d: __m128i) -> (__m128i, __m128i) {
    // imm8 bit 0 selects the half of the first operand,
    // bit 4 the half of the second.
    let m0_d1 = _mm_clmulepi64_si128(m, d, 0x10);
    let m0_d0 = _mm_clmulepi64_si128(m, d, 0x00);
    let m1_h1 = _mm_clmulepi64_si128(m, h, 0x11);
    let m1_h0 = _mm_clmulepi64_si128(m, h, 0x01);

    (
        _mm_xor_si128(m0_d1, m1_h1), // R
        _mm_xor_si128(m0_d0, m1_h0), // F
    )
}
231+
232+
/// Reduction using Lemma 3: Result = R ⊕ F1 ⊕ (x^64×F0) ⊕ (P1×F0)
233+
///
234+
/// Uses 1 CLMUL for reduction
235+
#[target_feature(enable = "pclmulqdq")]
236+
#[inline]
237+
unsafe fn reduce_rf(r: __m128i, f: __m128i) -> __m128i {
238+
let p1 = _mm_set_epi64x(0, P1 as i64);
239+
240+
// F1 in low position
241+
let f1 = _mm_srli_si128(f, 8);
242+
243+
// x^64×F0 (shift F0 to high position)
244+
let f0_shifted = _mm_slli_si128(f, 8);
245+
246+
// P1×F0
247+
let p1_f0 = _mm_clmulepi64_si128(f, p1, 0x00);
248+
249+
// Result = R ⊕ F1 ⊕ (x^64×F0) ⊕ (P1×F0)
250+
let result = _mm_xor_si128(r, f1);
251+
let result = _mm_xor_si128(result, f0_shifted);
252+
_mm_xor_si128(result, p1_f0)
253+
}
254+
255+
/// Complete R/F multiplication with reduction (5 CLMULs total)
256+
#[target_feature(enable = "pclmulqdq")]
257+
#[inline]
258+
unsafe fn gf128_mul_rf(m: __m128i, h: __m128i, d: __m128i) -> __m128i {
259+
let (r, f) = rf_mul_unreduced(m, h, d);
260+
reduce_rf(r, f)
261+
}

0 commit comments

Comments
 (0)