1313//! For more information about PMULL, see:
1414//! - <https://developer.arm.com/documentation/100069/0608/A64-SIMD-Vector-Instructions/PMULL--PMULL2--vector->
1515//! - <https://eprint.iacr.org/2015/688.pdf>
16+
1617#![ allow( unsafe_op_in_unsafe_fn) ]
1718
1819use super :: FieldElement ;
19- use crate :: { Block , Key , Tag } ;
20+ use crate :: Block ;
2021use core:: { arch:: aarch64:: * , mem} ;
21- use universal_hash:: {
22- KeyInit , ParBlocks , Reset , UhfBackend ,
23- array:: ArraySize ,
24- common:: { BlockSizeUser , KeySizeUser , ParBlocksSizeUser } ,
25- consts:: U16 ,
26- typenum:: { Const , ToUInt , U } ,
27- } ;
22+ use universal_hash:: array:: { Array , ArraySize } ;
2823
29- /// POLYVAL reduction polynomial (`x^128 + x^127 + x^126 + x^121 + 1`) encoded in little-endian
30- /// GF(2)[x] form with reflected reduction terms arising from folding the upper 128-bits of the
31- /// product into the lower half during modular reduction.
32- const POLY : u128 = ( 1 << 127 ) | ( 1 << 126 ) | ( 1 << 121 ) | ( 1 << 63 ) | ( 1 << 62 ) | ( 1 << 57 ) ;
24+ /// 128-bit SIMD register type.
25+ pub ( super ) type Simd128 = uint8x16_t ;
3326
34- /// **POLYVAL**: GHASH-like universal hash over GF(2^128).
35- ///
36- /// Parameterized on a constant that determines how many
37- /// blocks to process at once: higher numbers use more memory,
38- /// and require more time to re-key, but process data significantly
39- /// faster.
27+ /// Perform carryless multiplication of `y` by `h` and return the result.
4028///
41- /// (This constant is not used when acceleration is not enabled.)
42- #[ derive( Clone ) ]
43- pub struct Polyval < const N : usize = 8 > {
44- /// Powers of H in descending order.
45- ///
46- /// (H^N, H^(N-1)...H)
47- h : [ FieldElement ; N ] ,
48- y : FieldElement ,
49- }
50-
51- impl < const N : usize > KeySizeUser for Polyval < N > {
52- type KeySize = U16 ;
53- }
54-
55- impl < const N : usize > Polyval < N > {
56- /// Initialize POLYVAL with the given `H` field element and initial block
57- pub fn new_with_init_block ( h : & Key , init_block : u128 ) -> Self {
58- Self {
59- h : FieldElement :: from ( h) . powers_of_h ( ) ,
60- y : init_block. into ( ) ,
61- }
62- }
63- }
64-
65- impl < const N : usize > KeyInit for Polyval < N > {
66- /// Initialize POLYVAL with the given `H` field element
67- fn new ( h : & Key ) -> Self {
68- Self :: new_with_init_block ( h, 0 )
69- }
70- }
71-
72- impl < const N : usize > BlockSizeUser for Polyval < N > {
73- type BlockSize = U16 ;
29+ /// # Safety
30+ /// It is the caller's responsibility to ensure the host CPU is capable of PMULL and NEON
31+ /// instructions.
32+ // TODO(tarcieri): investigate ordering optimizations and fusions e.g.`fuse-crypto-eor`
33+ #[ inline]
34+ #[ target_feature( enable = "aes,neon" ) ]
35+ pub ( super ) unsafe fn polymul ( y : Simd128 , h : Simd128 ) -> Simd128 {
36+ let ( h, m, l) = karatsuba1 ( h, y) ;
37+ let ( h, l) = karatsuba2 ( h, m, l) ;
38+ mont_reduce ( h, l)
7439}
7540
76- impl < const N : usize > ParBlocksSizeUser for Polyval < N >
77- where
78- U < N > : ArraySize ,
79- Const < N > : ToUInt ,
80- {
81- type ParBlocksSize = U < N > ;
41+ /// Process an individual block.
42+ ///
43+ /// # Safety
44+ /// It is the caller's responsibility to ensure the host CPU is capable of PMULL and NEON
45+ /// instructions.
46+ #[ inline]
47+ #[ target_feature( enable = "aes,neon" ) ]
48+ pub ( super ) unsafe fn proc_block ( h : FieldElement , y : FieldElement , x : & Block ) -> FieldElement {
49+ let y = veorq_u8 ( y. into ( ) , vld1q_u8 ( x. as_ptr ( ) ) ) ;
50+ polymul ( y, h. into ( ) ) . into ( )
8251}
8352
84- impl < const N : usize > UhfBackend for Polyval < N >
85- where
86- U < N > : ArraySize ,
87- Const < N > : ToUInt ,
88- {
89- fn proc_par_blocks ( & mut self , blocks : & ParBlocks < Self > ) {
90- unsafe {
91- let mut h = vdupq_n_u8 ( 0 ) ;
92- let mut m = vdupq_n_u8 ( 0 ) ;
93- let mut l = vdupq_n_u8 ( 0 ) ;
94-
95- // Note: Manually unrolling this loop did not help in benchmarks.
96- for i in ( 0 ..N ) . rev ( ) {
97- let mut x = vld1q_u8 ( blocks[ i] . as_ptr ( ) ) ;
98- if i == 0 {
99- x = veorq_u8 ( x, self . y . into ( ) ) ;
100- }
101- let y = self . h [ i] ;
102- let ( hh, mm, ll) = karatsuba1 ( x, y. into ( ) ) ;
103- h = veorq_u8 ( h, hh) ;
104- m = veorq_u8 ( m, mm) ;
105- l = veorq_u8 ( l, ll) ;
53+ /// Process multiple blocks in parallel.
54+ ///
55+ /// # Safety
56+ /// It is the caller's responsibility to ensure the host CPU is capable of PMULL and NEON
57+ /// instructions.
58+ #[ target_feature( enable = "aes,neon" ) ]
59+ pub ( super ) unsafe fn proc_par_blocks < const N : usize , U : ArraySize > (
60+ powers_of_h : & [ FieldElement ; N ] ,
61+ y : FieldElement ,
62+ blocks : & Array < Block , U > ,
63+ ) -> FieldElement {
64+ unsafe {
65+ let mut h = vdupq_n_u8 ( 0 ) ;
66+ let mut m = vdupq_n_u8 ( 0 ) ;
67+ let mut l = vdupq_n_u8 ( 0 ) ;
68+
69+ // Note: Manually unrolling this loop did not help in benchmarks.
70+ for i in ( 0 ..N ) . rev ( ) {
71+ let mut x = vld1q_u8 ( blocks[ i] . as_ptr ( ) ) ;
72+ if i == 0 {
73+ x = veorq_u8 ( x, y. into ( ) ) ;
10674 }
107-
108- let ( h, l) = karatsuba2 ( h, m, l) ;
109- self . y = mont_reduce ( h, l) . into ( ) ;
75+ let ( hh, mm, ll) = karatsuba1 ( x, powers_of_h[ i] . into ( ) ) ;
76+ h = veorq_u8 ( h, hh) ;
77+ m = veorq_u8 ( m, mm) ;
78+ l = veorq_u8 ( l, ll) ;
11079 }
111- }
112-
113- fn proc_block ( & mut self , x : & Block ) {
114- unsafe {
115- let y = veorq_u8 ( self . y . into ( ) , vld1q_u8 ( x. as_ptr ( ) ) ) ;
116- self . y = polymul ( y, self . h [ N - 1 ] . into ( ) ) . into ( ) ;
117- }
118- }
119- }
12080
121- impl < const N : usize > Reset for Polyval < N > {
122- fn reset ( & mut self ) {
123- self . y = FieldElement :: default ( ) ;
81+ let ( h, l) = karatsuba2 ( h, m, l) ;
82+ mont_reduce ( h, l) . into ( )
12483 }
12584}
12685
127- impl < const N : usize > Polyval < N > {
128- /// Get POLYVAL output.
129- pub ( crate ) fn finalize ( self ) -> Tag {
130- self . y . into ( )
131- }
132- }
133-
134- impl From < FieldElement > for uint8x16_t {
86+ impl From < FieldElement > for Simd128 {
13587 #[ inline]
136- fn from ( fe : FieldElement ) -> uint8x16_t {
88+ fn from ( fe : FieldElement ) -> Simd128 {
13789 unsafe { vld1q_u8 ( fe. 0 . as_ptr ( ) ) }
13890 }
13991}
14092
141- impl From < uint8x16_t > for FieldElement {
93+ impl From < Simd128 > for FieldElement {
14294 #[ inline]
143- fn from ( fe : uint8x16_t ) -> FieldElement {
95+ fn from ( fe : Simd128 ) -> FieldElement {
14496 let mut ret = FieldElement :: default ( ) ;
14597 unsafe { vst1q_u8 ( ret. 0 . as_mut_ptr ( ) , fe) }
14698 ret
14799 }
148100}
149101
150- /// Multiply "y" by "h" and return the result.
151- // TODO(tarcieri): investigate ordering optimizations and fusions e.g.`fuse-crypto-eor`
152- #[ inline]
153- #[ target_feature( enable = "neon" ) ]
154- unsafe fn polymul ( y : uint8x16_t , h : uint8x16_t ) -> uint8x16_t {
155- let ( h, m, l) = karatsuba1 ( h, y) ;
156- let ( h, l) = karatsuba2 ( h, m, l) ;
157- mont_reduce ( h, l)
158- }
159-
160102/// Karatsuba decomposition for `x*y`.
161103#[ inline]
162- #[ target_feature( enable = "neon" ) ]
163- unsafe fn karatsuba1 ( x : uint8x16_t , y : uint8x16_t ) -> ( uint8x16_t , uint8x16_t , uint8x16_t ) {
104+ #[ target_feature( enable = "aes, neon" ) ]
105+ unsafe fn karatsuba1 ( x : Simd128 , y : Simd128 ) -> ( Simd128 , Simd128 , Simd128 ) {
164106 // First Karatsuba step: decompose x and y.
165107 //
166108 // (x1*y0 + x0*y1) = (x1+x0) * (y1+y0) + (x1*y1) + (x0*y0)
@@ -179,7 +121,7 @@ unsafe fn karatsuba1(x: uint8x16_t, y: uint8x16_t) -> (uint8x16_t, uint8x16_t, u
179121/// Karatsuba combine.
180122#[ inline]
181123#[ target_feature( enable = "neon" ) ]
182- unsafe fn karatsuba2 ( h : uint8x16_t , m : uint8x16_t , l : uint8x16_t ) -> ( uint8x16_t , uint8x16_t ) {
124+ unsafe fn karatsuba2 ( h : Simd128 , m : Simd128 , l : Simd128 ) -> ( Simd128 , Simd128 ) {
183125 // Second Karatsuba step: combine into a 2n-bit product.
184126 //
185127 // m0 ^= l0 ^ h0 // = m0^(l0^h0)
@@ -218,9 +160,14 @@ unsafe fn karatsuba2(h: uint8x16_t, m: uint8x16_t, l: uint8x16_t) -> (uint8x16_t
218160 ( x23, x01)
219161}
220162
163+ /// POLYVAL reduction polynomial (`x^128 + x^127 + x^126 + x^121 + 1`) encoded in little-endian
164+ /// GF(2)[x] form with reflected reduction terms arising from folding the upper 128-bits of the
165+ /// product into the lower half during modular reduction.
166+ const POLY : u128 = ( 1 << 127 ) | ( 1 << 126 ) | ( 1 << 121 ) | ( 1 << 63 ) | ( 1 << 62 ) | ( 1 << 57 ) ;
167+
221168#[ inline]
222- #[ target_feature( enable = "neon" ) ]
223- unsafe fn mont_reduce ( x23 : uint8x16_t , x01 : uint8x16_t ) -> uint8x16_t {
169+ #[ target_feature( enable = "aes, neon" ) ]
170+ unsafe fn mont_reduce ( x23 : Simd128 , x01 : Simd128 ) -> Simd128 {
224171 // Perform the Montgomery reduction over the 256-bit X.
225172 // [A1:A0] = X0 • poly
226173 // [B1:B0] = [X0 ⊕ A1 : X1 ⊕ A0]
@@ -236,8 +183,8 @@ unsafe fn mont_reduce(x23: uint8x16_t, x01: uint8x16_t) -> uint8x16_t {
236183
237184/// Multiplies the low bits in `a` and `b`.
238185#[ inline]
239- #[ target_feature( enable = "neon" ) ]
240- unsafe fn pmull ( a : uint8x16_t , b : uint8x16_t ) -> uint8x16_t {
186+ #[ target_feature( enable = "aes, neon" ) ]
187+ unsafe fn pmull ( a : Simd128 , b : Simd128 ) -> Simd128 {
241188 mem:: transmute ( vmull_p64 (
242189 vgetq_lane_u64 ( vreinterpretq_u64_u8 ( a) , 0 ) ,
243190 vgetq_lane_u64 ( vreinterpretq_u64_u8 ( b) , 0 ) ,
@@ -246,19 +193,10 @@ unsafe fn pmull(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
246193
247194/// Multiplies the high bits in `a` and `b`.
248195#[ inline]
249- #[ target_feature( enable = "neon" ) ]
250- unsafe fn pmull2 ( a : uint8x16_t , b : uint8x16_t ) -> uint8x16_t {
196+ #[ target_feature( enable = "aes, neon" ) ]
197+ unsafe fn pmull2 ( a : Simd128 , b : Simd128 ) -> Simd128 {
251198 mem:: transmute ( vmull_p64 (
252199 vgetq_lane_u64 ( vreinterpretq_u64_u8 ( a) , 1 ) ,
253200 vgetq_lane_u64 ( vreinterpretq_u64_u8 ( b) , 1 ) ,
254201 ) )
255202}
256- // TODO(tarcieri): zeroize support
257- // #[cfg(feature = "zeroize")]
258- // impl Drop for Polyval<N> {
259- // fn drop(&mut self) {
260- // use zeroize::Zeroize;
261- // self.h.zeroize();
262- // self.y.zeroize();
263- // }
264- // }
0 commit comments