@@ -1,30 +1,74 @@
 #[cfg(test)]
 const MAX_COMPARE_SIZE: usize = 256;
 
+#[inline(always)]
 pub fn compare256_slice(src0: &[u8], src1: &[u8]) -> usize {
     let src0 = first_chunk::<_, 256>(src0).unwrap();
     let src1 = first_chunk::<_, 256>(src1).unwrap();
 
     compare256(src0, src1)
 }
 
+/// Call the most optimal compare256
+///
+/// We attempt to call a specific version if its target feature is enabled at compile time
+/// (e.g. via `-Ctarget-cpu`). If the desired target feature is not found, we defer to
+/// [`compare256_via_function_pointer`].
+#[inline(always)]
 fn compare256(src0: &[u8; 256], src1: &[u8; 256]) -> usize {
-    #[cfg(target_arch = "x86_64")]
-    if crate::cpu_features::is_enabled_avx2() {
-        return unsafe { avx2::compare256(src0, src1) };
-    }
+    #[cfg(target_feature = "avx2")]
+    return avx2::compare256(src0, src1);
 
-    #[cfg(target_arch = "aarch64")]
-    if crate::cpu_features::is_enabled_neon() {
-        return unsafe { neon::compare256(src0, src1) };
-    }
+    #[cfg(target_feature = "neon")]
+    return neon::compare256(src0, src1);
 
-    #[cfg(target_arch = "wasm32")]
-    if crate::cpu_features::is_enabled_simd128() {
-        return wasm32::compare256(src0, src1);
+    #[cfg(target_feature = "simd128")]
+    return wasm32::compare256(src0, src1);
+
+    #[allow(unreachable_code)]
+    compare256_via_function_pointer(src0, src1)
+}
+
+/// Choose the most optimal implementation at runtime
+///
+/// We store the function pointer to the most optimal implementation in an AtomicPtr; every call
+/// loads this function pointer and then calls it.
+///
+/// The value is initially set to `initializer`, which on the first call will determine what the
+/// most efficient implementation is, and overwrite the value in the atomic, so that on subsequent
+/// calls the best implementation is called immediately.
+#[inline(always)]
+fn compare256_via_function_pointer(src0: &[u8; 256], src1: &[u8; 256]) -> usize {
+    use core::sync::atomic::{AtomicPtr, Ordering};
+
+    type F = unsafe fn(&[u8; 256], &[u8; 256]) -> usize;
+
+    static PTR: AtomicPtr<()> = AtomicPtr::new(initializer as *mut ());
+
+    fn initializer(src0: &[u8; 256], src1: &[u8; 256]) -> usize {
+        let ptr = match () {
+            #[cfg(target_arch = "x86_64")]
+            _ if crate::cpu_features::is_enabled_avx2() => avx2::compare256 as F,
+            #[cfg(target_arch = "aarch64")]
+            _ if crate::cpu_features::is_enabled_neon() => neon::compare256 as F,
+            #[cfg(target_arch = "wasm32")]
+            _ if crate::cpu_features::is_enabled_simd128() => wasm32::compare256 as F,
+            _ => rust::compare256 as F,
+        };
+
+        PTR.store(ptr as *mut (), Ordering::Relaxed);
+
+        // Safety: we've validated the target feature requirements
+        unsafe { ptr(src0, src1) }
     }
 
-    rust::compare256(src0, src1)
+    let ptr = PTR.load(Ordering::Relaxed);
+
+    // Safety: we trust this function pointer (PTR is local to the function)
+    let dynamic_compare256 = unsafe { core::mem::transmute::<*mut (), F>(ptr) };
+
+    // Safety: we've validated the target feature requirements
+    unsafe { dynamic_compare256(src0, src1) }
 }
 
 pub fn compare256_rle_slice(byte: u8, src: &[u8]) -> usize {
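A note on the compile-time fast path in the new `compare256`: `cfg(target_feature = "...")` is only true when the feature is enabled for the entire compilation, for example via `-Ctarget-cpu=native` or `-Ctarget-feature=+avx2`, so the early `return` statements compile away entirely otherwise. The shape relies on `#[allow(unreachable_code)]` to silence the warning when one of the returns is active. A minimal sketch of that shape, using hypothetical `simd_sum`/`scalar_sum` helpers not present in the commit:

fn scalar_sum(xs: &[u8]) -> u32 {
    xs.iter().map(|&b| u32::from(b)).sum()
}

#[cfg(target_feature = "avx2")]
fn simd_sum(xs: &[u8]) -> u32 {
    // Stand-in body; a real version would use AVX2 intrinsics.
    xs.iter().map(|&b| u32::from(b)).sum()
}

fn sum(xs: &[u8]) -> u32 {
    // Present only when rustc may assume AVX2 unconditionally.
    #[cfg(target_feature = "avx2")]
    return simd_sum(xs);

    // Dead code when the cfg above is active; the attribute keeps the
    // compiler quiet in that configuration and is harmless otherwise.
    #[allow(unreachable_code)]
    scalar_sum(xs)
}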
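The runtime fallback is a self-replacing function pointer: the atomic starts out pointing at an initializer that probes the CPU once, stores the winner, and then gets out of the way. Below is a self-contained sketch of the same pattern, where `have_fast_impl`, `fast_impl`, and `fallback_impl` are hypothetical stand-ins for the feature probe and the SIMD/scalar implementations:

use core::sync::atomic::{AtomicPtr, Ordering};

type F = fn(u32) -> u32;

fn fast_impl(x: u32) -> u32 { x * 2 }
fn fallback_impl(x: u32) -> u32 { x + x }
fn have_fast_impl() -> bool { true } // stand-in for a runtime CPU-feature probe

// Initially points at `initializer`; after the first call it points at the
// selected implementation, so the probe runs at most once per process (racing
// threads may each probe, but all of them store the same winner).
static PTR: AtomicPtr<()> = AtomicPtr::new(initializer as *mut ());

fn initializer(x: u32) -> u32 {
    let chosen = if have_fast_impl() { fast_impl as F } else { fallback_impl as F };
    PTR.store(chosen as *mut (), Ordering::Relaxed);
    chosen(x)
}

pub fn dispatch(x: u32) -> u32 {
    let ptr = PTR.load(Ordering::Relaxed);
    // Safety: PTR only ever holds pointers to functions of type F.
    let f = unsafe { core::mem::transmute::<*mut (), F>(ptr) };
    f(x)
}

fn main() {
    assert_eq!(dispatch(21), 42); // first call runs the probe and self-replaces
    assert_eq!(dispatch(21), 42); // later calls jump straight to the winner
}

The commit's version makes `F` an unsafe fn because the SIMD variants require target features that the runtime probe, not the compiler, has verified. `Ordering::Relaxed` suffices because every value ever stored in the atomic is a valid implementation; no other memory is synchronized through it.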