Skip to content

Commit 09f94ff

Browse files
authored
Add null hasher for Pubkey (anza-xyz#96)
* null hasher for Pubkey that allows 4x faster hashmaps * truncate the counter in new_unique to 4 bytes and fill the rest with pseudorandom bytes to improve statistical properties of the resulting pubkeys --------- Co-authored-by: Alex Pyattaev <[email protected]>
1 parent 4db5fba commit 09f94ff

File tree

1 file changed

+183
-7
lines changed

1 file changed

+183
-7
lines changed

pubkey/src/lib.rs

Lines changed: 183 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,9 @@ use {
2323
core::{
2424
array,
2525
convert::{Infallible, TryFrom},
26-
fmt, mem,
26+
fmt,
27+
hash::{Hash, Hasher},
28+
mem,
2729
str::{from_utf8, FromStr},
2830
},
2931
num_traits::{FromPrimitive, ToPrimitive},
@@ -158,10 +160,163 @@ impl From<u64> for PubkeyError {
158160
#[cfg_attr(all(feature = "borsh", feature = "std"), derive(BorshSchema))]
159161
#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))]
160162
#[cfg_attr(feature = "bytemuck", derive(Pod, Zeroable))]
161-
#[derive(Clone, Copy, Default, Eq, Hash, Ord, PartialEq, PartialOrd)]
163+
#[derive(Clone, Copy, Default, Eq, Ord, PartialEq, PartialOrd)]
162164
#[cfg_attr(feature = "dev-context-only-utils", derive(Arbitrary))]
163165
pub struct Pubkey(pub(crate) [u8; 32]);
164166

167+
/// Custom impl of Hash for Pubkey
168+
/// allows us to skip hashing the length of the pubkey
169+
/// which is always the same anyway
170+
impl Hash for Pubkey {
171+
fn hash<H: Hasher>(&self, state: &mut H) {
172+
state.write(self.as_array());
173+
}
174+
}
175+
176+
#[cfg(all(feature = "rand", not(target_os = "solana")))]
177+
mod hasher {
178+
use {
179+
crate::PUBKEY_BYTES,
180+
core::{
181+
cell::Cell,
182+
hash::{BuildHasher, Hasher},
183+
},
184+
rand::{thread_rng, Rng},
185+
};
186+
187+
/// A faster, but less collision resistant hasher for pubkeys.
188+
///
189+
/// Specialized hasher that uses a random 8 bytes subslice of the
190+
/// pubkey as the hash value. Should not be used when collisions
191+
/// might be used to mount DOS attacks.
192+
///
193+
/// Using this results in about 4x faster lookups in a typical hashmap.
194+
#[derive(Default)]
195+
pub struct PubkeyHasher {
196+
offset: usize,
197+
state: u64,
198+
}
199+
200+
impl Hasher for PubkeyHasher {
201+
#[inline]
202+
fn finish(&self) -> u64 {
203+
self.state
204+
}
205+
#[inline]
206+
fn write(&mut self, bytes: &[u8]) {
207+
debug_assert_eq!(
208+
bytes.len(),
209+
PUBKEY_BYTES,
210+
"This hasher is intended to be used with pubkeys and nothing else"
211+
);
212+
// This slice/unwrap can never panic since offset is < PUBKEY_BYTES - size_of::<u64>()
213+
let chunk: &[u8; size_of::<u64>()] = bytes[self.offset..self.offset + size_of::<u64>()]
214+
.try_into()
215+
.unwrap();
216+
self.state = u64::from_ne_bytes(*chunk);
217+
}
218+
}
219+
220+
/// A builder for faster, but less collision resistant hasher for pubkeys.
221+
///
222+
/// Initializes `PubkeyHasher` instances that use an 8-byte
223+
/// slice of the pubkey as the hash value. Should not be used when
224+
/// collisions might be used to mount DOS attacks.
225+
///
226+
/// Using this results in about 4x faster lookups in a typical hashmap.
227+
#[derive(Clone)]
228+
pub struct PubkeyHasherBuilder {
229+
offset: usize,
230+
}
231+
232+
impl Default for PubkeyHasherBuilder {
233+
/// Default construct the PubkeyHasherBuilder.
234+
///
235+
/// The position of the slice is determined initially
236+
/// through random draw and then by incrementing a thread-local
237+
/// This way each hashmap can be expected to use a slightly different
238+
/// slice. This is essentially the same mechanism as what is used by
239+
/// `RandomState`
240+
fn default() -> Self {
241+
std::thread_local!(static OFFSET: Cell<usize> = {
242+
let mut rng = thread_rng();
243+
Cell::new(rng.gen_range(0..PUBKEY_BYTES - size_of::<u64>()))
244+
});
245+
246+
let offset = OFFSET.with(|offset| {
247+
let mut next_offset = offset.get() + 1;
248+
if next_offset > PUBKEY_BYTES - size_of::<u64>() {
249+
next_offset = 0;
250+
}
251+
offset.set(next_offset);
252+
next_offset
253+
});
254+
PubkeyHasherBuilder { offset }
255+
}
256+
}
257+
258+
impl BuildHasher for PubkeyHasherBuilder {
259+
type Hasher = PubkeyHasher;
260+
#[inline]
261+
fn build_hasher(&self) -> Self::Hasher {
262+
PubkeyHasher {
263+
offset: self.offset,
264+
state: 0,
265+
}
266+
}
267+
}
268+
269+
#[cfg(test)]
mod tests {
    use {
        super::PubkeyHasherBuilder,
        crate::Pubkey,
        core::hash::{BuildHasher, Hasher},
    };

    /// Hashers from one builder must agree; hashers from other builders
    /// are expected to disagree because they read a different window.
    #[test]
    fn test_pubkey_hasher_builder() {
        let key = Pubkey::new_unique();
        let builder = PubkeyHasherBuilder::default();
        let mut hasher1 = builder.build_hasher();
        let mut hasher2 = builder.build_hasher();
        hasher1.write(key.as_array());
        hasher2.write(key.as_array());
        assert_eq!(
            hasher1.finish(),
            hasher2.finish(),
            "Hashers made with same builder should be identical"
        );
        // Make sure that when we make new builders we get different slices
        // chosen for hashing. A fresh builder is created on every attempt:
        // reusing a single builder would compare the very same window each
        // time and make the retries meaningless.
        for _ in 0..64 {
            let builder2 = PubkeyHasherBuilder::default();
            let mut hasher3 = builder2.build_hasher();
            hasher3.write(key.as_array());
            if hasher1.finish() != hasher3.finish() {
                return;
            }
        }
        panic!("Hashers built with different builder should be different due to random offset");
    }

    /// Two distinct pubkeys hashed through the same builder should not
    /// produce the same hash value.
    #[test]
    fn test_pubkey_hasher() {
        let key1 = Pubkey::new_unique();
        let key2 = Pubkey::new_unique();
        let builder = PubkeyHasherBuilder::default();
        let mut hasher1 = builder.build_hasher();
        let mut hasher2 = builder.build_hasher();
        hasher1.write(key1.as_array());
        hasher2.write(key2.as_array());
        assert_ne!(hasher1.finish(), hasher2.finish());
    }
}
316+
}
317+
#[cfg(all(feature = "rand", not(target_os = "solana")))]
318+
pub use hasher::{PubkeyHasher, PubkeyHasherBuilder};
319+
165320
impl solana_sanitize::Sanitize for Pubkey {}
166321

167322
// Use strum when testing to ensure our FromPrimitive
@@ -322,12 +477,33 @@ impl Pubkey {
322477
pub fn new_unique() -> Self {
323478
use solana_atomic_u64::AtomicU64;
324479
static I: AtomicU64 = AtomicU64::new(1);
325-
326-
let mut b = [0u8; 32];
327-
let i = I.fetch_add(1);
480+
type T = u32;
481+
const COUNTER_BYTES: usize = size_of::<T>();
482+
let mut b = [0u8; PUBKEY_BYTES];
483+
let mut i = I.fetch_add(1) as T;
328484
// use big endian representation to ensure that recent unique pubkeys
329-
// are always greater than less recent unique pubkeys
330-
b[0..8].copy_from_slice(&i.to_be_bytes());
485+
// are always greater than less recent unique pubkeys.
486+
b[0..COUNTER_BYTES].copy_from_slice(&i.to_be_bytes());
487+
// fill the rest of the pubkey with pseudorandom numbers to make
488+
// data statistically similar to real pubkeys.
489+
#[cfg(any(feature = "std", target_arch = "wasm32"))]
490+
{
491+
extern crate std;
492+
let mut hash = std::hash::DefaultHasher::new();
493+
for slice in b[COUNTER_BYTES..].chunks_mut(COUNTER_BYTES) {
494+
hash.write_u32(i);
495+
i += 1;
496+
slice.copy_from_slice(&hash.finish().to_ne_bytes()[0..COUNTER_BYTES]);
497+
}
498+
}
499+
// if std is not available, just replicate last byte of the counter.
500+
// this is not as good as a proper hash, but at least it is uniform
501+
#[cfg(not(any(feature = "std", target_arch = "wasm32")))]
502+
{
503+
for b in b[COUNTER_BYTES..].iter_mut() {
504+
*b = (i & 0xFF) as u8;
505+
}
506+
}
331507
Self::from(b)
332508
}
333509

0 commit comments

Comments
 (0)