-
Notifications
You must be signed in to change notification settings - Fork 573
[FEAT] implement three scalar multiplication #858
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
zz-sol
wants to merge
6
commits into
dalek-cryptography:main
Choose a base branch
from
zz-sol:zz/three_scalar_mul
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
+1,119
−5
Open
Changes from 1 commit
Commits
Show all changes
6 commits
Select commit
Hold shift + click to select a range
04f0913
add constants for B*2^128 pre-computated table
invalid-email-address a0a6b03
impl vartime triple base mul algorithm
invalid-email-address 8da112b
add benchmark and tests
invalid-email-address 3681989
add proptest
invalid-email-address f40a373
fix CI
zz-sol 1b8a8f0
remove ethnum
zz-sol File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
145 changes: 145 additions & 0 deletions
145
curve25519-dalek/src/backend/serial/scalar_mul/vartime_triple_base.rs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,145 @@ | ||
| // -*- mode: rust; -*- | ||
| // | ||
| // This file is part of curve25519-dalek. | ||
| // Copyright (c) 2016-2021 isis lovecruft | ||
| // Copyright (c) 2016-2019 Henry de Valence | ||
| // See LICENSE for licensing information. | ||
|
|
||
| #![allow(non_snake_case)] | ||
|
|
||
| use core::cmp::Ordering; | ||
|
|
||
| use crate::backend::serial::curve_models::{ProjectiveNielsPoint, ProjectivePoint}; | ||
| use crate::constants; | ||
| use crate::edwards::EdwardsPoint; | ||
| use crate::scalar::{HEEA_MAX_INDEX, Scalar}; | ||
| use crate::traits::Identity; | ||
| use crate::window::NafLookupTable5; | ||
|
|
||
| /// Compute \\(a_1 A_1 + a_2 A_2 + b B\\) in variable time, where \\(B\\) is the Ed25519 basepoint. | ||
| /// | ||
| /// This function is optimized for the case where \\(a_1\\) and \\(a_2\\) are known to be less than | ||
| /// \\(2^{128}\\), while \\(b\\) is a full 256-bit scalar. | ||
| /// | ||
| /// # Optimization Strategy | ||
| /// | ||
| /// The function decomposes the 256-bit scalar \\(b\\) as \\(b = b_{lo} + b_{hi} \cdot 2^{128}\\), | ||
| /// where both \\(b_{lo}\\) and \\(b_{hi}\\) are 128-bit values. This allows computing: | ||
| /// | ||
| /// \\[ | ||
| /// a_1 A_1 + a_2 A_2 + b_{lo} B + b_{hi} B' | ||
| /// \\] | ||
| /// | ||
| /// where \\(B' = B \cdot 2^{128}\\) is a precomputed constant. Now all four scalars | ||
| /// (\\(a_1, a_2, b_{lo}, b_{hi}\\)) are 128 bits, and two of the bases (\\(B\\) and \\(B'\\)) | ||
| /// use precomputed tables. | ||
| /// | ||
| /// # Implementation | ||
| /// | ||
| /// - For \\(A_1\\) and \\(A_2\\): NAF with window width 5 (8 precomputed points each) | ||
| /// - For \\(B\\): NAF with window width 8 when precomputed tables available (64 points) | ||
| /// - For \\(B'\\): NAF with window width 5 (could be optimized with precomputed table) | ||
| /// | ||
| /// The algorithm shares doublings across all four scalar multiplications, processing | ||
| /// only 128 bits instead of 256, providing approximately 2x speedup over the naive approach. | ||
| pub fn mul_128_128_256( | ||
| a1: &Scalar, | ||
| A1: &EdwardsPoint, | ||
| a2: &Scalar, | ||
| A2: &EdwardsPoint, | ||
| b: &Scalar, | ||
| ) -> EdwardsPoint { | ||
| // assert that a1 and a2 are less than 2^128 | ||
| debug_assert!(a1.as_bytes()[16..32].iter().all(|&b| b == 0)); | ||
| debug_assert!(a2.as_bytes()[16..32].iter().all(|&b| b == 0)); | ||
|
|
||
| // Decompose b into b_lo (lower 128 bits) and b_hi (upper 128 bits) | ||
| // b = b_lo + b_hi * 2^128 | ||
| let b_bytes = b.as_bytes(); | ||
|
|
||
| let mut b_lo_bytes = [0u8; 32]; | ||
| let mut b_hi_bytes = [0u8; 32]; | ||
|
|
||
| // Copy lower 16 bytes to b_lo, upper 16 bytes to b_hi | ||
| b_lo_bytes[..16].copy_from_slice(&b_bytes[..16]); | ||
| b_hi_bytes[..16].copy_from_slice(&b_bytes[16..]); | ||
|
|
||
| let b_lo = Scalar::from_canonical_bytes(b_lo_bytes).unwrap(); | ||
| let b_hi = Scalar::from_canonical_bytes(b_hi_bytes).unwrap(); | ||
|
|
||
| // Compute NAF representations (all scalars are now ~128 bits) | ||
| let a1_naf = a1.non_adjacent_form(5); | ||
| let a2_naf = a2.non_adjacent_form(5); | ||
| let b_lo_naf = b_lo.non_adjacent_form(5); | ||
| let b_hi_naf = b_hi.non_adjacent_form(5); | ||
|
|
||
| // Find starting index - check all NAFs up to bit 127 | ||
| // (with potential carry to bit 128 or 129) | ||
| let mut i = HEEA_MAX_INDEX; | ||
| for j in (0..HEEA_MAX_INDEX).rev() { | ||
| i = j; | ||
| if a1_naf[i] != 0 || a2_naf[i] != 0 || b_lo_naf[i] != 0 || b_hi_naf[i] != 0 { | ||
| break; | ||
| } | ||
| } | ||
|
|
||
| // Create lookup tables | ||
| let table_A1 = NafLookupTable5::<ProjectiveNielsPoint>::from(A1); | ||
| let table_A2 = NafLookupTable5::<ProjectiveNielsPoint>::from(A2); | ||
|
|
||
| #[cfg(feature = "precomputed-tables")] | ||
| let table_B = &constants::AFFINE_ODD_MULTIPLES_OF_BASEPOINT; | ||
| #[cfg(not(feature = "precomputed-tables"))] | ||
| let table_B = | ||
| &NafLookupTable5::<ProjectiveNielsPoint>::from(&constants::ED25519_BASEPOINT_POINT); | ||
|
|
||
| // B' = B * 2^128 (precomputed constant point) | ||
| #[cfg(feature = "precomputed-tables")] | ||
| let table_B_128 = &constants::AFFINE_ODD_MULTIPLES_OF_BASEPOINT_128; | ||
| #[cfg(not(feature = "precomputed-tables"))] | ||
| let table_B_128 = | ||
| &NafLookupTable5::<ProjectiveNielsPoint>::from(&constants::ED25519_BASEPOINT_128_POINT); | ||
|
|
||
| let mut r = ProjectivePoint::identity(); | ||
|
|
||
| loop { | ||
| let mut t = r.double(); | ||
|
|
||
| // Add contributions from a1*A1 | ||
| match a1_naf[i].cmp(&0) { | ||
| Ordering::Greater => t = &t.as_extended() + &table_A1.select(a1_naf[i] as usize), | ||
| Ordering::Less => t = &t.as_extended() - &table_A1.select(-a1_naf[i] as usize), | ||
| Ordering::Equal => {} | ||
| } | ||
|
|
||
| // Add contributions from a2*A2 | ||
| match a2_naf[i].cmp(&0) { | ||
| Ordering::Greater => t = &t.as_extended() + &table_A2.select(a2_naf[i] as usize), | ||
| Ordering::Less => t = &t.as_extended() - &table_A2.select(-a2_naf[i] as usize), | ||
| Ordering::Equal => {} | ||
| } | ||
|
|
||
| // Add contributions from b_lo*B | ||
| match b_lo_naf[i].cmp(&0) { | ||
| Ordering::Greater => t = &t.as_extended() + &table_B.select(b_lo_naf[i] as usize), | ||
| Ordering::Less => t = &t.as_extended() - &table_B.select(-b_lo_naf[i] as usize), | ||
| Ordering::Equal => {} | ||
| } | ||
|
|
||
| // Add contributions from b_hi*B' where B' = B * 2^128 | ||
| match b_hi_naf[i].cmp(&0) { | ||
| Ordering::Greater => t = &t.as_extended() + &table_B_128.select(b_hi_naf[i] as usize), | ||
| Ordering::Less => t = &t.as_extended() - &table_B_128.select(-b_hi_naf[i] as usize), | ||
| Ordering::Equal => {} | ||
| } | ||
|
|
||
| r = t.as_projective(); | ||
|
|
||
| if i == 0 { | ||
| break; | ||
| } | ||
| i -= 1; | ||
| } | ||
|
|
||
| r.as_extended() | ||
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
185 changes: 185 additions & 0 deletions
185
curve25519-dalek/src/backend/vector/scalar_mul/vartime_triple_base.rs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,185 @@ | ||
| // -*- mode: rust; -*- | ||
| // | ||
| // This file is part of curve25519-dalek. | ||
| // Copyright (c) 2016-2021 isis lovecruft | ||
| // Copyright (c) 2016-2019 Henry de Valence | ||
| // See LICENSE for licensing information. | ||
|
|
||
| #![allow(non_snake_case)] | ||
|
|
||
| #[curve25519_dalek_derive::unsafe_target_feature_specialize( | ||
| "avx2", | ||
| conditional("avx512ifma,avx512vl", nightly) | ||
| )] | ||
| pub mod spec { | ||
|
|
||
| use core::cmp::Ordering; | ||
|
|
||
| #[for_target_feature("avx2")] | ||
| use crate::backend::vector::avx2::{CachedPoint, ExtendedPoint}; | ||
|
|
||
| #[for_target_feature("avx512ifma")] | ||
| use crate::backend::vector::ifma::{CachedPoint, ExtendedPoint}; | ||
|
|
||
| #[cfg(feature = "precomputed-tables")] | ||
| #[for_target_feature("avx2")] | ||
| use crate::backend::vector::avx2::constants::BASEPOINT_ODD_LOOKUP_TABLE; | ||
|
|
||
| #[cfg(feature = "precomputed-tables")] | ||
| #[for_target_feature("avx512ifma")] | ||
| use crate::backend::vector::ifma::constants::BASEPOINT_ODD_LOOKUP_TABLE; | ||
|
|
||
| use crate::constants; | ||
| use crate::edwards::EdwardsPoint; | ||
| use crate::scalar::HEEA_MAX_INDEX; | ||
| use crate::scalar::Scalar; | ||
| use crate::traits::Identity; | ||
| use crate::window::NafLookupTable5; | ||
|
|
||
| /// Compute \\(a_1 A_1 + a_2 A_2 + b B\\) in variable time, where \\(B\\) is the Ed25519 basepoint. | ||
| /// | ||
| /// This function is optimized for the case where \\(a_1\\) and \\(a_2\\) are known to be less than | ||
| /// \\(2^{128}\\), while \\(b\\) is a full 256-bit scalar. | ||
| /// | ||
| /// # Optimization Strategy | ||
| /// | ||
| /// The function decomposes the 256-bit scalar \\(b\\) as \\(b = b_{lo} + b_{hi} \cdot 2^{128}\\), | ||
| /// where both \\(b_{lo}\\) and \\(b_{hi}\\) are 128-bit values. This allows computing: | ||
| /// | ||
| /// \\[ | ||
| /// a_1 A_1 + a_2 A_2 + b_{lo} B + b_{hi} B' | ||
| /// \\] | ||
| /// | ||
| /// where \\(B' = B \cdot 2^{128}\\) is a precomputed constant. Now all four scalars | ||
| /// (\\(a_1, a_2, b_{lo}, b_{hi}\\)) are 128 bits, and two of the bases (\\(B\\) and \\(B'\\)) | ||
| /// use precomputed tables. | ||
| /// | ||
| /// # Implementation | ||
| /// | ||
| /// - For \\(A_1\\) and \\(A_2\\): NAF with window width 5 (8 precomputed points each) | ||
| /// - For \\(B\\): NAF with window width 8 when precomputed tables available (64 points), otherwise width 5 | ||
| /// - For \\(B'\\): NAF with window width 5 | ||
| /// | ||
| /// The algorithm shares doublings across all four scalar multiplications, processing | ||
| /// only 128 bits instead of 256, providing approximately 2x speedup over the naive approach. | ||
| /// | ||
| /// This SIMD implementation uses vectorized point operations (AVX2 or AVX512-IFMA) for | ||
| /// improved performance over the serial backend. | ||
| pub fn mul_128_128_256( | ||
| a1: &Scalar, | ||
| A1: &EdwardsPoint, | ||
| a2: &Scalar, | ||
| A2: &EdwardsPoint, | ||
| b: &Scalar, | ||
| ) -> EdwardsPoint { | ||
| // assert that a1 and a2 are less than 2^128 | ||
| debug_assert!(a1.as_bytes()[16..32].iter().all(|&b| b == 0)); | ||
| debug_assert!(a2.as_bytes()[16..32].iter().all(|&b| b == 0)); | ||
|
|
||
| // Decompose b into b_lo (lower 128 bits) and b_hi (upper 128 bits) | ||
| // b = b_lo + b_hi * 2^128 | ||
| let b_bytes = b.as_bytes(); | ||
|
|
||
| let mut b_lo_bytes = [0u8; 32]; | ||
| let mut b_hi_bytes = [0u8; 32]; | ||
|
|
||
| // Copy lower 16 bytes to b_lo, upper 16 bytes to b_hi | ||
| b_lo_bytes[..16].copy_from_slice(&b_bytes[..16]); | ||
| b_hi_bytes[..16].copy_from_slice(&b_bytes[16..]); | ||
|
|
||
| let b_lo = Scalar::from_canonical_bytes(b_lo_bytes).unwrap(); | ||
| let b_hi = Scalar::from_canonical_bytes(b_hi_bytes).unwrap(); | ||
|
|
||
| // Compute NAF representations (all scalars are now ~128 bits) | ||
| let a1_naf = a1.non_adjacent_form(5); | ||
| let a2_naf = a2.non_adjacent_form(5); | ||
|
|
||
| #[cfg(feature = "precomputed-tables")] | ||
| let b_lo_naf = b_lo.non_adjacent_form(8); | ||
| #[cfg(not(feature = "precomputed-tables"))] | ||
| let b_lo_naf = b_lo.non_adjacent_form(5); | ||
|
|
||
| let b_hi_naf = b_hi.non_adjacent_form(5); | ||
|
|
||
| // Find starting index - check all NAFs up to bit 127 | ||
| // (with potential carry to bit 128 or 129) | ||
| let mut i: usize = HEEA_MAX_INDEX; | ||
| for j in (0..=HEEA_MAX_INDEX).rev() { | ||
| i = j; | ||
| if a1_naf[i] != 0 || a2_naf[i] != 0 || b_lo_naf[i] != 0 || b_hi_naf[i] != 0 { | ||
| break; | ||
| } | ||
| } | ||
|
|
||
| // Create lookup tables using SIMD-optimized CachedPoint | ||
| let table_A1 = NafLookupTable5::<CachedPoint>::from(A1); | ||
| let table_A2 = NafLookupTable5::<CachedPoint>::from(A2); | ||
|
|
||
| #[cfg(feature = "precomputed-tables")] | ||
| let table_B = &BASEPOINT_ODD_LOOKUP_TABLE; | ||
| #[cfg(not(feature = "precomputed-tables"))] | ||
| let table_B = &NafLookupTable5::<CachedPoint>::from(&constants::ED25519_BASEPOINT_POINT); | ||
|
|
||
| // B' = B * 2^128 (precomputed constant point) | ||
| // TODO: For optimal performance, this should also use the wider lookup table when precomputed-tables is enabled | ||
| let table_B_128 = | ||
| &NafLookupTable5::<CachedPoint>::from(&constants::ED25519_BASEPOINT_128_POINT); | ||
|
|
||
| let mut Q = ExtendedPoint::identity(); | ||
|
|
||
| loop { | ||
| Q = Q.double(); | ||
|
|
||
| // Add contributions from a1*A1 | ||
| match a1_naf[i].cmp(&0) { | ||
| Ordering::Greater => { | ||
| Q = &Q + &table_A1.select(a1_naf[i] as usize); | ||
| } | ||
| Ordering::Less => { | ||
| Q = &Q - &table_A1.select(-a1_naf[i] as usize); | ||
| } | ||
| Ordering::Equal => {} | ||
| } | ||
|
|
||
| // Add contributions from a2*A2 | ||
| match a2_naf[i].cmp(&0) { | ||
| Ordering::Greater => { | ||
| Q = &Q + &table_A2.select(a2_naf[i] as usize); | ||
| } | ||
| Ordering::Less => { | ||
| Q = &Q - &table_A2.select(-a2_naf[i] as usize); | ||
| } | ||
| Ordering::Equal => {} | ||
| } | ||
|
|
||
| // Add contributions from b_lo*B | ||
| match b_lo_naf[i].cmp(&0) { | ||
| Ordering::Greater => { | ||
| Q = &Q + &table_B.select(b_lo_naf[i] as usize); | ||
| } | ||
| Ordering::Less => { | ||
| Q = &Q - &table_B.select(-b_lo_naf[i] as usize); | ||
| } | ||
| Ordering::Equal => {} | ||
| } | ||
|
|
||
| // Add contributions from b_hi*B' where B' = B * 2^128 | ||
| match b_hi_naf[i].cmp(&0) { | ||
| Ordering::Greater => { | ||
| Q = &Q + &table_B_128.select(b_hi_naf[i] as usize); | ||
| } | ||
| Ordering::Less => { | ||
| Q = &Q - &table_B_128.select(-b_hi_naf[i] as usize); | ||
| } | ||
| Ordering::Equal => {} | ||
| } | ||
|
|
||
| if i == 0 { | ||
| break; | ||
| } | ||
| i -= 1; | ||
| } | ||
|
|
||
| Q.into() | ||
| } | ||
| } |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.