Skip to content

Commit 030ef72

Browse files
committed
polyval: consolidate single PolyvalGeneric struct
Consolidates the many (nested, via unions!) repeated `Polyval` structs found throughout the crate for each backend into a single `Polyval(Generic)` struct which imlements the `universal-hash` interface in a single place. `FieldElement` now defers decoding into SIMD registers until we're ready to actually perform a field operation, so we can use that as a common type for all field backends (and already implemented this in previous commits). Backends are now just free functions that operate on `FieldElement`, making them dramatically simpler.
1 parent 611b691 commit 030ef72

File tree

7 files changed

+398
-539
lines changed

7 files changed

+398
-539
lines changed

polyval/src/field_element.rs

Lines changed: 57 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -3,31 +3,15 @@
33
mod soft;
44

55
use crate::{BLOCK_SIZE, Block};
6-
use core::fmt;
7-
use core::fmt::Debug;
8-
use core::ops::{Add, Mul, MulAssign};
6+
use core::{
7+
fmt::{self, Debug},
8+
ops::{Add, Mul, MulAssign},
9+
};
910
use cpubits::cfg_if;
1011

1112
#[cfg(feature = "zeroize")]
1213
use zeroize::Zeroize;
1314

14-
cfg_if! {
15-
if #[cfg(all(target_arch = "aarch64", not(polyval_backend = "soft")))] {
16-
mod autodetect;
17-
mod armv8;
18-
pub use autodetect::Polyval as PolyvalGeneric;
19-
} else if #[cfg(all(
20-
any(target_arch = "x86_64", target_arch = "x86"),
21-
not(polyval_backend = "soft")
22-
))] {
23-
mod autodetect;
24-
mod x86;
25-
pub use autodetect::Polyval as PolyvalGeneric;
26-
} else {
27-
pub use soft::Polyval as PolyvalGeneric;
28-
}
29-
}
30-
3115
/// An element in POLYVAL's field.
3216
///
3317
/// This type represents an element of the binary field GF(2^128) modulo the irreducible polynomial
@@ -48,11 +32,62 @@ cfg_if! {
4832
#[repr(C, align(16))] // Make ABI and alignment compatible with SIMD registers
4933
pub(crate) struct FieldElement([u8; BLOCK_SIZE]);
5034

35+
cfg_if! {
36+
if #[cfg(all(target_arch = "aarch64", not(polyval_backend = "soft")))] {
37+
// aarch64
38+
mod autodetect;
39+
mod armv8;
40+
pub(crate) use autodetect::{InitToken, detect_intrinsics};
41+
} else if #[cfg(all(
42+
any(target_arch = "x86_64", target_arch = "x86"),
43+
not(polyval_backend = "soft")
44+
))] {
45+
// x86/x86-64
46+
mod autodetect;
47+
mod x86;
48+
pub(crate) use autodetect::{InitToken, detect_intrinsics};
49+
} else {
50+
// Pure Rust fallback implementation for other targets
51+
use universal_hash::array::{Array, ArraySize};
52+
53+
pub(crate) type InitToken = ();
54+
pub(crate) fn detect_intrinsics() -> (InitToken, bool) {
55+
((), false)
56+
}
57+
58+
impl FieldElement {
59+
/// Default degree of parallelism, i.e. how many powers of `H` to compute.
60+
pub const DEFAULT_PARALLELISM: usize = 8;
61+
62+
/// Process an individual block.
63+
pub(crate) fn proc_block(
64+
h: FieldElement,
65+
y: FieldElement,
66+
x: &Block,
67+
_has_intrinsics: InitToken
68+
) -> FieldElement {
69+
soft::proc_block(h, y, x)
70+
}
71+
72+
/// Process multiple blocks in parallel.
73+
// TODO(tarcieri): currently just calls `proc_block` for each block on `soft`-only
74+
pub(crate) fn proc_par_blocks<const N: usize, U: ArraySize>(
75+
powers_of_h: &[FieldElement; N],
76+
y: FieldElement,
77+
blocks: &Array<Block, U>,
78+
_has_intrinsics: InitToken
79+
) -> FieldElement {
80+
soft::proc_par_blocks(powers_of_h, y, blocks)
81+
}
82+
}
83+
}
84+
}
85+
5186
impl FieldElement {
5287
/// Compute the first N powers of h, in reverse order.
5388
#[inline]
5489
#[allow(dead_code)] // We may not use this in some configurations
55-
fn powers_of_h<const N: usize>(self) -> [Self; N] {
90+
pub(crate) fn powers_of_h<const N: usize>(self) -> [Self; N] {
5691
// TODO: improve pipelining by using more square operations?
5792
let mut pow = [Self::default(); N];
5893
let mut prev = self;
@@ -144,8 +179,7 @@ impl Mul for FieldElement {
144179

145180
#[inline]
146181
fn mul(self, rhs: Self) -> Self {
147-
let v = soft::karatsuba(self, rhs);
148-
soft::mont_reduce(v)
182+
soft::polymul(self, rhs)
149183
}
150184
}
151185

polyval/src/field_element/armv8.rs

Lines changed: 71 additions & 133 deletions
Original file line numberDiff line numberDiff line change
@@ -13,154 +13,96 @@
1313
//! For more information about PMULL, see:
1414
//! - <https://developer.arm.com/documentation/100069/0608/A64-SIMD-Vector-Instructions/PMULL--PMULL2--vector->
1515
//! - <https://eprint.iacr.org/2015/688.pdf>
16+
1617
#![allow(unsafe_op_in_unsafe_fn)]
1718

1819
use super::FieldElement;
19-
use crate::{Block, Key, Tag};
20+
use crate::Block;
2021
use core::{arch::aarch64::*, mem};
21-
use universal_hash::{
22-
KeyInit, ParBlocks, Reset, UhfBackend,
23-
array::ArraySize,
24-
common::{BlockSizeUser, KeySizeUser, ParBlocksSizeUser},
25-
consts::U16,
26-
typenum::{Const, ToUInt, U},
27-
};
22+
use universal_hash::array::{Array, ArraySize};
2823

29-
/// POLYVAL reduction polynomial (`x^128 + x^127 + x^126 + x^121 + 1`) encoded in little-endian
30-
/// GF(2)[x] form with reflected reduction terms arising from folding the upper 128-bits of the
31-
/// product into the lower half during modular reduction.
32-
const POLY: u128 = (1 << 127) | (1 << 126) | (1 << 121) | (1 << 63) | (1 << 62) | (1 << 57);
24+
/// 128-bit SIMD register type.
25+
pub(super) type Simd128 = uint8x16_t;
3326

34-
/// **POLYVAL**: GHASH-like universal hash over GF(2^128).
35-
///
36-
/// Parameterized on a constant that determines how many
37-
/// blocks to process at once: higher numbers use more memory,
38-
/// and require more time to re-key, but process data significantly
39-
/// faster.
27+
/// Perform carryless multiplication of `y` by `h` and return the result.
4028
///
41-
/// (This constant is not used when acceleration is not enabled.)
42-
#[derive(Clone)]
43-
pub struct Polyval<const N: usize = 8> {
44-
/// Powers of H in descending order.
45-
///
46-
/// (H^N, H^(N-1)...H)
47-
h: [FieldElement; N],
48-
y: FieldElement,
49-
}
50-
51-
impl<const N: usize> KeySizeUser for Polyval<N> {
52-
type KeySize = U16;
53-
}
54-
55-
impl<const N: usize> Polyval<N> {
56-
/// Initialize POLYVAL with the given `H` field element and initial block
57-
pub fn new_with_init_block(h: &Key, init_block: u128) -> Self {
58-
Self {
59-
h: FieldElement::from(h).powers_of_h(),
60-
y: init_block.into(),
61-
}
62-
}
63-
}
64-
65-
impl<const N: usize> KeyInit for Polyval<N> {
66-
/// Initialize POLYVAL with the given `H` field element
67-
fn new(h: &Key) -> Self {
68-
Self::new_with_init_block(h, 0)
69-
}
70-
}
71-
72-
impl<const N: usize> BlockSizeUser for Polyval<N> {
73-
type BlockSize = U16;
29+
/// # Safety
30+
/// It is the caller's responsibility to ensure the host CPU is capable of PMULL and NEON
31+
/// instructions.
32+
// TODO(tarcieri): investigate ordering optimizations and fusions e.g.`fuse-crypto-eor`
33+
#[inline]
34+
#[target_feature(enable = "aes,neon")]
35+
pub(super) unsafe fn polymul(y: Simd128, h: Simd128) -> Simd128 {
36+
let (h, m, l) = karatsuba1(h, y);
37+
let (h, l) = karatsuba2(h, m, l);
38+
mont_reduce(h, l)
7439
}
7540

76-
impl<const N: usize> ParBlocksSizeUser for Polyval<N>
77-
where
78-
U<N>: ArraySize,
79-
Const<N>: ToUInt,
80-
{
81-
type ParBlocksSize = U<N>;
41+
/// Process an individual block.
42+
///
43+
/// # Safety
44+
/// It is the caller's responsibility to ensure the host CPU is capable of PMULL and NEON
45+
/// instructions.
46+
#[inline]
47+
#[target_feature(enable = "aes,neon")]
48+
pub(super) unsafe fn proc_block(h: FieldElement, y: FieldElement, x: &Block) -> FieldElement {
49+
let y = veorq_u8(y.into(), vld1q_u8(x.as_ptr()));
50+
polymul(y, h.into()).into()
8251
}
8352

84-
impl<const N: usize> UhfBackend for Polyval<N>
85-
where
86-
U<N>: ArraySize,
87-
Const<N>: ToUInt,
88-
{
89-
fn proc_par_blocks(&mut self, blocks: &ParBlocks<Self>) {
90-
unsafe {
91-
let mut h = vdupq_n_u8(0);
92-
let mut m = vdupq_n_u8(0);
93-
let mut l = vdupq_n_u8(0);
94-
95-
// Note: Manually unrolling this loop did not help in benchmarks.
96-
for i in (0..N).rev() {
97-
let mut x = vld1q_u8(blocks[i].as_ptr());
98-
if i == 0 {
99-
x = veorq_u8(x, self.y.into());
100-
}
101-
let y = self.h[i];
102-
let (hh, mm, ll) = karatsuba1(x, y.into());
103-
h = veorq_u8(h, hh);
104-
m = veorq_u8(m, mm);
105-
l = veorq_u8(l, ll);
53+
/// Process multiple blocks in parallel.
54+
///
55+
/// # Safety
56+
/// It is the caller's responsibility to ensure the host CPU is capable of PMULL and NEON
57+
/// instructions.
58+
#[target_feature(enable = "aes,neon")]
59+
pub(super) unsafe fn proc_par_blocks<const N: usize, U: ArraySize>(
60+
powers_of_h: &[FieldElement; N],
61+
y: FieldElement,
62+
blocks: &Array<Block, U>,
63+
) -> FieldElement {
64+
unsafe {
65+
let mut h = vdupq_n_u8(0);
66+
let mut m = vdupq_n_u8(0);
67+
let mut l = vdupq_n_u8(0);
68+
69+
// Note: Manually unrolling this loop did not help in benchmarks.
70+
for i in (0..N).rev() {
71+
let mut x = vld1q_u8(blocks[i].as_ptr());
72+
if i == 0 {
73+
x = veorq_u8(x, y.into());
10674
}
107-
108-
let (h, l) = karatsuba2(h, m, l);
109-
self.y = mont_reduce(h, l).into();
75+
let (hh, mm, ll) = karatsuba1(x, powers_of_h[i].into());
76+
h = veorq_u8(h, hh);
77+
m = veorq_u8(m, mm);
78+
l = veorq_u8(l, ll);
11079
}
111-
}
112-
113-
fn proc_block(&mut self, x: &Block) {
114-
unsafe {
115-
let y = veorq_u8(self.y.into(), vld1q_u8(x.as_ptr()));
116-
self.y = polymul(y, self.h[N - 1].into()).into();
117-
}
118-
}
119-
}
12080

121-
impl<const N: usize> Reset for Polyval<N> {
122-
fn reset(&mut self) {
123-
self.y = FieldElement::default();
81+
let (h, l) = karatsuba2(h, m, l);
82+
mont_reduce(h, l).into()
12483
}
12584
}
12685

127-
impl<const N: usize> Polyval<N> {
128-
/// Get POLYVAL output.
129-
pub(crate) fn finalize(self) -> Tag {
130-
self.y.into()
131-
}
132-
}
133-
134-
impl From<FieldElement> for uint8x16_t {
86+
impl From<FieldElement> for Simd128 {
13587
#[inline]
136-
fn from(fe: FieldElement) -> uint8x16_t {
88+
fn from(fe: FieldElement) -> Simd128 {
13789
unsafe { vld1q_u8(fe.0.as_ptr()) }
13890
}
13991
}
14092

141-
impl From<uint8x16_t> for FieldElement {
93+
impl From<Simd128> for FieldElement {
14294
#[inline]
143-
fn from(fe: uint8x16_t) -> FieldElement {
95+
fn from(fe: Simd128) -> FieldElement {
14496
let mut ret = FieldElement::default();
14597
unsafe { vst1q_u8(ret.0.as_mut_ptr(), fe) }
14698
ret
14799
}
148100
}
149101

150-
/// Multiply "y" by "h" and return the result.
151-
// TODO(tarcieri): investigate ordering optimizations and fusions e.g.`fuse-crypto-eor`
152-
#[inline]
153-
#[target_feature(enable = "neon")]
154-
unsafe fn polymul(y: uint8x16_t, h: uint8x16_t) -> uint8x16_t {
155-
let (h, m, l) = karatsuba1(h, y);
156-
let (h, l) = karatsuba2(h, m, l);
157-
mont_reduce(h, l)
158-
}
159-
160102
/// Karatsuba decomposition for `x*y`.
161103
#[inline]
162-
#[target_feature(enable = "neon")]
163-
unsafe fn karatsuba1(x: uint8x16_t, y: uint8x16_t) -> (uint8x16_t, uint8x16_t, uint8x16_t) {
104+
#[target_feature(enable = "aes,neon")]
105+
unsafe fn karatsuba1(x: Simd128, y: Simd128) -> (Simd128, Simd128, Simd128) {
164106
// First Karatsuba step: decompose x and y.
165107
//
166108
// (x1*y0 + x0*y1) = (x1+x0) * (y1+y0) + (x1*y1) + (x0*y0)
@@ -179,7 +121,7 @@ unsafe fn karatsuba1(x: uint8x16_t, y: uint8x16_t) -> (uint8x16_t, uint8x16_t, u
179121
/// Karatsuba combine.
180122
#[inline]
181123
#[target_feature(enable = "neon")]
182-
unsafe fn karatsuba2(h: uint8x16_t, m: uint8x16_t, l: uint8x16_t) -> (uint8x16_t, uint8x16_t) {
124+
unsafe fn karatsuba2(h: Simd128, m: Simd128, l: Simd128) -> (Simd128, Simd128) {
183125
// Second Karatsuba step: combine into a 2n-bit product.
184126
//
185127
// m0 ^= l0 ^ h0 // = m0^(l0^h0)
@@ -218,9 +160,14 @@ unsafe fn karatsuba2(h: uint8x16_t, m: uint8x16_t, l: uint8x16_t) -> (uint8x16_t
218160
(x23, x01)
219161
}
220162

163+
/// POLYVAL reduction polynomial (`x^128 + x^127 + x^126 + x^121 + 1`) encoded in little-endian
164+
/// GF(2)[x] form with reflected reduction terms arising from folding the upper 128-bits of the
165+
/// product into the lower half during modular reduction.
166+
const POLY: u128 = (1 << 127) | (1 << 126) | (1 << 121) | (1 << 63) | (1 << 62) | (1 << 57);
167+
221168
#[inline]
222-
#[target_feature(enable = "neon")]
223-
unsafe fn mont_reduce(x23: uint8x16_t, x01: uint8x16_t) -> uint8x16_t {
169+
#[target_feature(enable = "aes,neon")]
170+
unsafe fn mont_reduce(x23: Simd128, x01: Simd128) -> Simd128 {
224171
// Perform the Montgomery reduction over the 256-bit X.
225172
// [A1:A0] = X0 • poly
226173
// [B1:B0] = [X0 ⊕ A1 : X1 ⊕ A0]
@@ -236,8 +183,8 @@ unsafe fn mont_reduce(x23: uint8x16_t, x01: uint8x16_t) -> uint8x16_t {
236183

237184
/// Multiplies the low bits in `a` and `b`.
238185
#[inline]
239-
#[target_feature(enable = "neon")]
240-
unsafe fn pmull(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
186+
#[target_feature(enable = "aes,neon")]
187+
unsafe fn pmull(a: Simd128, b: Simd128) -> Simd128 {
241188
mem::transmute(vmull_p64(
242189
vgetq_lane_u64(vreinterpretq_u64_u8(a), 0),
243190
vgetq_lane_u64(vreinterpretq_u64_u8(b), 0),
@@ -246,19 +193,10 @@ unsafe fn pmull(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
246193

247194
/// Multiplies the high bits in `a` and `b`.
248195
#[inline]
249-
#[target_feature(enable = "neon")]
250-
unsafe fn pmull2(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
196+
#[target_feature(enable = "aes,neon")]
197+
unsafe fn pmull2(a: Simd128, b: Simd128) -> Simd128 {
251198
mem::transmute(vmull_p64(
252199
vgetq_lane_u64(vreinterpretq_u64_u8(a), 1),
253200
vgetq_lane_u64(vreinterpretq_u64_u8(b), 1),
254201
))
255202
}
256-
// TODO(tarcieri): zeroize support
257-
// #[cfg(feature = "zeroize")]
258-
// impl Drop for Polyval<N> {
259-
// fn drop(&mut self) {
260-
// use zeroize::Zeroize;
261-
// self.h.zeroize();
262-
// self.y.zeroize();
263-
// }
264-
// }

0 commit comments

Comments
 (0)