Skip to content

Commit ec38437

Browse files
authored
cmov: slice cleanups (#1379)
- Simplify and clean up macros
- Get rid of 32-bit impls until we can benchmark them and show they actually improve performance (less complexity for now)
- Get rid of rustdoc on `Cmov`/`CmovEq` impl blocks: it varies by platform (so docs.rs will show how it works on `x86_64`) and really it's just describing implementation details better suited for a non-doc comment
- Extract `cast_slice` and `cast_slice_mut` for slice conversions
1 parent 7c9ca7b commit ec38437

File tree

2 files changed

+107
-175
lines changed

2 files changed

+107
-175
lines changed

cmov/src/array.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@
44
55
use crate::{Cmov, CmovEq, Condition};
66

7-
/// Optimized implementation for byte arrays which coalesces them into word-sized chunks first,
8-
/// then performs [`Cmov`] at the word-level to cut down on the total number of instructions.
7+
// Optimized implementation for arrays which coalesces them into word-sized chunks first,
8+
// then performs [`Cmov`] at the word-level to cut down on the total number of instructions.
99
impl<T, const N: usize> Cmov for [T; N]
1010
where
1111
[T]: Cmov,
@@ -16,8 +16,8 @@ where
1616
}
1717
}
1818

19-
/// Optimized implementation for byte arrays which coalesces them into word-sized chunks first,
20-
/// then performs [`CmovEq`] at the word-level to cut down on the total number of instructions.
19+
// Optimized implementation for arrays which coalesces them into word-sized chunks first,
20+
// then performs [`CmovEq`] at the word-level to cut down on the total number of instructions.
2121
impl<T, const N: usize> CmovEq for [T; N]
2222
where
2323
[T]: CmovEq,

cmov/src/slice.rs

Lines changed: 103 additions & 171 deletions
Original file line numberDiff line numberDiff line change
@@ -25,11 +25,8 @@ macro_rules! assert_lengths_eq {
2525
};
2626
}
2727

28-
/// Optimized implementation for byte slices which coalesces them into word-sized chunks first,
29-
/// then performs [`Cmov`] at the word-level to cut down on the total number of instructions.
30-
///
31-
/// # Panics
32-
/// - if slices have unequal lengths
28+
// Optimized implementation for byte slices which coalesces them into word-sized chunks first,
29+
// then performs [`Cmov`] at the word-level to cut down on the total number of instructions.
3330
impl Cmov for [u8] {
3431
#[inline]
3532
#[track_caller]
@@ -50,11 +47,8 @@ impl Cmov for [u8] {
5047
}
5148
}
5249

53-
/// Optimized implementation for slices of `u16` which coalesces them into word-sized chunks first,
54-
/// then performs [`Cmov`] at the word-level to cut down on the total number of instructions.
55-
///
56-
/// # Panics
57-
/// - if slices have unequal lengths
50+
// Optimized implementation for slices of `u16` which coalesces them into word-sized chunks first,
51+
// then performs [`Cmov`] at the word-level to cut down on the total number of instructions.
5852
#[cfg(not(target_pointer_width = "64"))]
5953
#[cfg_attr(docsrs, doc(cfg(true)))]
6054
impl Cmov for [u16] {
@@ -78,11 +72,8 @@ impl Cmov for [u16] {
7872
}
7973
}
8074

81-
/// Optimized implementation for slices of `u16` which coalesces them into word-sized chunks first,
82-
/// then performs [`Cmov`] at the word-level to cut down on the total number of instructions.
83-
///
84-
/// # Panics
85-
/// - if slices have unequal lengths
75+
// Optimized implementation for slices of `u16` which coalesces them into word-sized chunks first,
76+
// then performs [`Cmov`] at the word-level to cut down on the total number of instructions.
8677
#[cfg(target_pointer_width = "64")]
8778
#[cfg_attr(docsrs, doc(cfg(true)))]
8879
impl Cmov for [u16] {
@@ -116,90 +107,36 @@ impl Cmov for [u16] {
116107
}
117108
}
118109

119-
/// Implementation for slices of `u32` on 32-bit platforms, where we can just loop.
120-
///
121-
/// # Panics
122-
/// - if slices have unequal lengths
123-
#[cfg(not(target_pointer_width = "64"))]
124-
#[cfg_attr(docsrs, doc(cfg(true)))]
125-
impl Cmov for [u32] {
126-
#[inline]
127-
#[track_caller]
128-
fn cmovnz(&mut self, value: &Self, condition: Condition) {
129-
assert_lengths_eq!(self.len(), value.len());
130-
131-
for (a, b) in self.iter_mut().zip(value.iter()) {
132-
a.cmovnz(b, condition);
133-
}
134-
}
135-
}
136-
137-
/// Optimized implementation for slices of `u32` which coalesces them into word-sized chunks first,
138-
/// then performs [`Cmov`] at the word-level to cut down on the total number of instructions.
139-
///
140-
/// # Panics
141-
/// - if slices have unequal lengths
142-
#[cfg(target_pointer_width = "64")]
143-
#[cfg_attr(docsrs, doc(cfg(true)))]
144-
impl Cmov for [u32] {
145-
#[inline]
146-
#[track_caller]
147-
fn cmovnz(&mut self, value: &Self, condition: Condition) {
148-
assert_lengths_eq!(self.len(), value.len());
149-
150-
let (dst_chunks, dst_remainder) = slice_as_chunks_mut::<u32, 2>(self);
151-
let (src_chunks, src_remainder) = slice_as_chunks::<u32, 2>(value);
152-
153-
for (dst_chunk, src_chunk) in dst_chunks.iter_mut().zip(src_chunks.iter()) {
154-
let mut a = Word::from(dst_chunk[0]) | (Word::from(dst_chunk[1]) << 32);
155-
let b = Word::from(src_chunk[0]) | (Word::from(src_chunk[1]) << 32);
156-
a.cmovnz(&b, condition);
157-
dst_chunk[0] = (a & 0xFFFF_FFFF) as u32;
158-
dst_chunk[1] = (a >> 32) as u32;
159-
}
160-
161-
cmovnz_remainder(dst_remainder, src_remainder, condition);
162-
}
163-
}
164-
165110
/// Implement [`Cmov`] using a simple loop.
166111
macro_rules! impl_cmov_with_loop {
167-
($int:ty, $doc:expr) => {
168-
#[doc = $doc]
169-
#[doc = "# Panics"]
170-
#[doc = "- if slices have unequal lengths"]
171-
impl Cmov for [$int] {
172-
#[inline]
173-
#[track_caller]
174-
fn cmovnz(&mut self, value: &Self, condition: Condition) {
175-
assert_lengths_eq!(self.len(), value.len());
176-
for (a, b) in self.iter_mut().zip(value.iter()) {
177-
a.cmovnz(b, condition);
112+
( $($int:ty),+ ) => {
113+
$(
114+
impl Cmov for [$int] {
115+
#[inline]
116+
#[track_caller]
117+
fn cmovnz(&mut self, value: &Self, condition: Condition) {
118+
assert_lengths_eq!(self.len(), value.len());
119+
for (a, b) in self.iter_mut().zip(value.iter()) {
120+
a.cmovnz(b, condition);
121+
}
178122
}
179123
}
180-
}
124+
)+
181125
};
182126
}
183127

184-
impl_cmov_with_loop!(
185-
u64,
186-
"Implementation for `u64` slices where we can just loop."
187-
);
188-
impl_cmov_with_loop!(
189-
u128,
190-
"Implementation for `u128` slices where we can just loop."
191-
);
128+
impl_cmov_with_loop!(u32, u64, u128);
192129

193130
macro_rules! assert_size_and_alignment_eq {
194-
($signed:ty, $unsigned:ty) => {
131+
($int:ty, $uint:ty) => {
195132
const {
196133
assert!(
197-
size_of::<$signed>() == size_of::<$unsigned>(),
134+
size_of::<$int>() == size_of::<$uint>(),
198135
"integers are of unequal size"
199136
);
200137

201138
assert!(
202-
align_of::<$signed>() == align_of::<$unsigned>(),
139+
align_of::<$int>() == align_of::<$uint>(),
203140
"integers have unequal alignment"
204141
);
205142
}
@@ -208,34 +145,20 @@ macro_rules! assert_size_and_alignment_eq {
208145

209146
/// Implement [`Cmov`] for a signed type by invoking the corresponding unsigned impl.
210147
macro_rules! impl_cmov_for_signed_with_unsigned {
211-
($signed:ty, $unsigned:ty) => {
212-
impl_cmov_for_signed_with_unsigned!(
213-
$signed,
214-
$unsigned,
215-
"Delegating implementation of `Cmov` for signed type which delegates to unsigned."
216-
);
217-
};
218-
($signed:ty, $unsigned:ty, $doc:expr) => {
219-
#[doc = $doc]
220-
#[doc = "# Panics"]
221-
#[doc = "- if slices have unequal lengths"]
222-
impl Cmov for [$signed] {
148+
($int:ty, $uint:ty) => {
149+
impl Cmov for [$int] {
223150
#[inline]
224151
#[track_caller]
152+
#[allow(unsafe_code)]
225153
fn cmovnz(&mut self, value: &Self, condition: Condition) {
226-
assert_size_and_alignment_eq!($signed, $unsigned);
154+
assert_size_and_alignment_eq!($int, $uint);
227155

228156
// SAFETY:
229157
// - Slices being constructed are of same-sized integers as asserted above.
230158
// - We source the slice length directly from the other valid slice.
231-
#[allow(unsafe_code)]
232-
let (self_unsigned, value_unsigned) = unsafe {
233-
(
234-
slice::from_raw_parts_mut(self.as_mut_ptr() as *mut $unsigned, self.len()),
235-
slice::from_raw_parts(value.as_ptr() as *const $unsigned, value.len()),
236-
)
237-
};
238159

160+
let self_unsigned = unsafe { cast_slice_mut::<$int, $uint>(self) };
161+
let value_unsigned = unsafe { cast_slice::<$int, $uint>(value) };
239162
self_unsigned.cmovnz(value_unsigned, condition);
240163
}
241164
}
@@ -244,58 +167,43 @@ macro_rules! impl_cmov_for_signed_with_unsigned {
244167

245168
/// Implement [`CmovEq`] for a signed type by invoking the corresponding unsigned impl.
246169
macro_rules! impl_cmoveq_for_signed_with_unsigned {
247-
($signed:ty, $unsigned:ty) => {
248-
impl_cmoveq_for_signed_with_unsigned!(
249-
$signed,
250-
$unsigned,
251-
"Delegating implementation of `CmovEq` for signed type which delegates to unsigned."
252-
);
253-
};
254-
($signed:ty, $unsigned:ty, $doc:expr) => {
255-
#[doc = $doc]
256-
#[doc = "# Panics"]
257-
#[doc = "- if slices have unequal lengths"]
258-
impl CmovEq for [$signed] {
170+
($int:ty, $uint:ty) => {
171+
impl CmovEq for [$int] {
259172
#[inline]
173+
#[allow(unsafe_code)]
260174
fn cmovne(&self, rhs: &Self, input: Condition, output: &mut Condition) {
261-
assert_size_and_alignment_eq!($signed, $unsigned);
262-
263175
// SAFETY:
264176
// - Slices being constructed are of same-sized integers as asserted above.
265177
// - We source the slice length directly from the other valid slice.
266-
#[allow(unsafe_code)]
267-
let (self_unsigned, rhs_unsigned) = unsafe {
268-
(
269-
slice::from_raw_parts(self.as_ptr() as *const $unsigned, self.len()),
270-
slice::from_raw_parts(rhs.as_ptr() as *const $unsigned, rhs.len()),
271-
)
272-
};
273-
178+
let self_unsigned = unsafe { cast_slice::<$int, $uint>(self) };
179+
let rhs_unsigned = unsafe { cast_slice::<$int, $uint>(rhs) };
274180
self_unsigned.cmovne(rhs_unsigned, input, output);
275181
}
276182
}
277183
};
278184
}
279185

280186
/// Implement [`Cmov`] and [`CmovEq`] for the given signed/unsigned type pair.
187+
// TODO(tarcieri): use `cast_unsigned`/`cast_signed` to get rid of the `=> u*`
281188
macro_rules! impl_cmov_traits_for_signed_with_unsigned {
282-
($signed:ty, $unsigned:ty) => {
283-
impl_cmov_for_signed_with_unsigned!($signed, $unsigned);
284-
impl_cmoveq_for_signed_with_unsigned!($signed, $unsigned);
189+
( $($int:ty => $uint:ty),+ ) => {
190+
$(
191+
impl_cmov_for_signed_with_unsigned!($int, $uint);
192+
impl_cmoveq_for_signed_with_unsigned!($int, $uint);
193+
)+
285194
};
286195
}
287196

288-
impl_cmov_traits_for_signed_with_unsigned!(i8, u8);
289-
impl_cmov_traits_for_signed_with_unsigned!(i16, u16);
290-
impl_cmov_traits_for_signed_with_unsigned!(i32, u32);
291-
impl_cmov_traits_for_signed_with_unsigned!(i64, u64);
292-
impl_cmov_traits_for_signed_with_unsigned!(i128, u128);
197+
impl_cmov_traits_for_signed_with_unsigned!(
198+
i8 => u8,
199+
i16 => u16,
200+
i32 => u32,
201+
i64 => u64,
202+
i128 => u128
203+
);
293204

294-
/// Optimized implementation for byte slices which coalesces them into word-sized chunks first,
295-
/// then performs [`CmovEq`] at the word-level to cut down on the total number of instructions.
296-
///
297-
/// This is only constant-time for equal-length slices, and will short-circuit and set `output`
298-
/// in the event the slices are of unequal length.
205+
// Optimized implementation for byte slices which coalesces them into word-sized chunks first,
206+
// then performs [`CmovEq`] at the word-level to cut down on the total number of instructions.
299207
impl CmovEq for [u8] {
300208
#[inline]
301209
fn cmovne(&self, rhs: &Self, input: Condition, output: &mut Condition) {
@@ -321,43 +229,67 @@ impl CmovEq for [u8] {
321229

322230
/// Implement [`CmovEq`] using a simple loop.
323231
macro_rules! impl_cmoveq_with_loop {
324-
($int:ty, $doc:expr) => {
325-
#[doc = $doc]
326-
impl CmovEq for [$int] {
327-
#[inline]
328-
fn cmovne(&self, rhs: &Self, input: Condition, output: &mut Condition) {
329-
// Short-circuit the comparison if the slices are of different lengths, and set the output
330-
// condition to the input condition.
331-
if self.len() != rhs.len() {
332-
*output = input;
333-
return;
334-
}
335-
336-
for (a, b) in self.iter().zip(rhs.iter()) {
337-
a.cmovne(b, input, output);
232+
( $($int:ty),+ ) => {
233+
$(
234+
impl CmovEq for [$int] {
235+
#[inline]
236+
fn cmovne(&self, rhs: &Self, input: Condition, output: &mut Condition) {
237+
// Short-circuit the comparison if the slices are of different lengths, and set the output
238+
// condition to the input condition.
239+
if self.len() != rhs.len() {
240+
*output = input;
241+
return;
242+
}
243+
244+
for (a, b) in self.iter().zip(rhs.iter()) {
245+
a.cmovne(b, input, output);
246+
}
338247
}
339248
}
340-
}
249+
)+
341250
};
342251
}
343252

344-
// TODO(tarcieri): optimized coalescing implementations of `CmovEq` for `u16` and `u32`
345-
impl_cmoveq_with_loop!(
346-
u16,
347-
"Implementation for `u16` slices where we can just loop."
348-
);
349-
impl_cmoveq_with_loop!(
350-
u32,
351-
"Implementation for `u32` slices where we can just loop."
352-
);
353-
impl_cmoveq_with_loop!(
354-
u64,
355-
"Implementation for `u64` slices where we can just loop."
356-
);
357-
impl_cmoveq_with_loop!(
358-
u128,
359-
"Implementation for `u128` slices where we can just loop."
360-
);
253+
// TODO(tarcieri): investigate word-coalescing impls
254+
impl_cmoveq_with_loop!(u16, u32, u64, u128);
255+
256+
/// Performs an unsafe pointer cast from one slice type to the other.
257+
///
258+
/// # Panics
259+
/// - If `T` and `U` differ in size
260+
/// - If `T` and `U` differ in alignment
261+
#[allow(unsafe_code)]
262+
unsafe fn cast_slice<T, U>(slice: &[T]) -> &[U] {
263+
const {
264+
assert!(size_of::<T>() == size_of::<U>(), "T/U size differs");
265+
assert!(align_of::<T>() == align_of::<U>(), "T/U alignment differs");
266+
}
267+
268+
// SAFETY:
269+
// - Slices being constructed are of same-sized/aligned types as asserted above.
270+
// - We source the slice length directly from the other valid slice.
271+
// - It's up to the caller to ensure the pointer cast itself is valid.
272+
unsafe { slice::from_raw_parts(slice.as_ptr() as *const U, slice.len()) }
273+
}
274+
275+
/// Performs an unsafe pointer cast from one mutable slice type to the other.
276+
///
277+
/// # Panics
278+
/// - If `T` and `U` differ in size
279+
/// - If `T` and `U` differ in alignment
280+
#[allow(unsafe_code)]
281+
unsafe fn cast_slice_mut<T, U>(slice: &mut [T]) -> &mut [U] {
282+
const {
283+
assert!(size_of::<T>() == size_of::<U>(), "T/U size differs");
284+
assert!(align_of::<T>() == align_of::<U>(), "T/U alignment differs");
285+
}
286+
287+
// SAFETY:
288+
// - Slices being constructed are of same-sized/aligned types as asserted above.
289+
// - We source the slice length directly from the other valid slice.
290+
// - It's up to the caller to ensure the pointer cast itself is valid.
291+
unsafe { slice::from_raw_parts_mut(slice.as_mut_ptr() as *mut U, slice.len()) }
292+
}
361293

362294
/// Compare the two remainder slices by loading a `Word` then performing `cmovne`.
363295
#[inline]

0 commit comments

Comments (0)