Skip to content

Commit 076242a

Browse files
committed
Store a function pointer to the correct compare256 in a static
When a target feature (e.g. avx2) is not enabled at compile time but is available at runtime, this approach reduces branching: the feature check runs once, on the first call, and every later call goes straight through the stored function pointer. We still dispatch statically if the target feature is already enabled at compile time.
1 parent 1c5ad01 commit 076242a

File tree

1 file changed

+56
-12
lines changed

1 file changed

+56
-12
lines changed

Diff for: zlib-rs/src/deflate/compare256.rs

+56-12
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,74 @@
11
#[cfg(test)]
22
const MAX_COMPARE_SIZE: usize = 256;
33

4+
#[inline(always)]
45
pub fn compare256_slice(src0: &[u8], src1: &[u8]) -> usize {
56
let src0 = first_chunk::<_, 256>(src0).unwrap();
67
let src1 = first_chunk::<_, 256>(src1).unwrap();
78

89
compare256(src0, src1)
910
}
1011

12+
/// Call the best available `compare256` implementation
13+
///
14+
/// We attempt to call a specific version if its target feature is enabled at compile time
15+
/// (e.g. via `-Ctarget-cpu`). If the desired target feature is not found, we defer to
16+
/// [`compare256_via_function_pointer`].
17+
#[inline(always)]
1118
fn compare256(src0: &[u8; 256], src1: &[u8; 256]) -> usize {
12-
#[cfg(target_arch = "x86_64")]
13-
if crate::cpu_features::is_enabled_avx2() {
14-
return unsafe { avx2::compare256(src0, src1) };
15-
}
19+
#[cfg(target_feature = "avx2")]
20+
return avx2::compare256(src0, src1);
1621

17-
#[cfg(target_arch = "aarch64")]
18-
if crate::cpu_features::is_enabled_neon() {
19-
return unsafe { neon::compare256(src0, src1) };
20-
}
22+
#[cfg(target_feature = "neon")]
23+
return neon::compare256(src0, src1);
2124

22-
#[cfg(target_arch = "wasm32")]
23-
if crate::cpu_features::is_enabled_simd128() {
24-
return wasm32::compare256(src0, src1);
25+
#[cfg(target_feature = "simd128")]
26+
return wasm32::compare256(src0, src1);
27+
28+
#[allow(unreachable_code)]
29+
compare256_via_function_pointer(src0, src1)
30+
}
31+
32+
/// Choose the best available implementation at runtime
33+
///
34+
/// We store the function pointer to the most optimal implementation in an AtomicPtr; every call
35+
/// loads this function pointer and then calls it.
36+
///
37+
/// The value is initially set to `initializer`, which on the first call will determine what the
38+
/// most efficient implementation is, and overwrite the value in the atomic, so that on subsequent
39+
/// calls the best implementation is called immediately.
40+
#[inline(always)]
41+
fn compare256_via_function_pointer(src0: &[u8; 256], src1: &[u8; 256]) -> usize {
42+
use core::sync::atomic::{AtomicPtr, Ordering};
43+
44+
type F = unsafe fn(&[u8; 256], &[u8; 256]) -> usize;
45+
46+
static PTR: AtomicPtr<()> = AtomicPtr::new(initializer as *mut ());
47+
48+
fn initializer(src0: &[u8; 256], src1: &[u8; 256]) -> usize {
49+
let ptr = match () {
50+
#[cfg(target_arch = "x86_64")]
51+
_ if crate::cpu_features::is_enabled_avx2() => avx2::compare256 as F,
52+
#[cfg(target_arch = "aarch64")]
53+
_ if crate::cpu_features::is_enabled_neon() => neon::compare256 as F,
54+
#[cfg(target_arch = "wasm32")]
55+
_ if crate::cpu_features::is_enabled_simd128() => wasm32::compare256 as F,
56+
_ => rust::compare256 as F,
57+
};
58+
59+
PTR.store(ptr as *mut (), Ordering::Relaxed);
60+
61+
// Safety: we've validated the target feature requirements
62+
unsafe { ptr(src0, src1) }
2563
}
2664

27-
rust::compare256(src0, src1)
65+
let ptr = PTR.load(Ordering::Relaxed);
66+
67+
// Safety: PTR is private to this function and only ever holds `initializer` or a pointer of type `F` stored by `initializer`
68+
let dynamic_compare256 = unsafe { core::mem::transmute::<*mut (), F>(ptr) };
69+
70+
// Safety: the pointer is either `initializer` (a safe fn) or an implementation whose target feature `initializer` checked at runtime
71+
unsafe { dynamic_compare256(src0, src1) }
2872
}
2973

3074
pub fn compare256_rle_slice(byte: u8, src: &[u8]) -> usize {

0 commit comments

Comments
 (0)