Skip to content

Commit f791aa5

Browse files
committed
Auto merge of #136594 - pascaldekloe:fmt-int128, r=<try>
Faster fmt::Display of 128-bit integers, without unsafe pointer. In follow-up to #135265, hereby the 128-bit part. * Batches per 16 instead of 19 digits * Buffer access as array instead of unsafe pointer * Added test coverage for i128 and u128 r? tgross35 ChrisDenton
2 parents c6a9554 + 0483c93 commit f791aa5

File tree

2 files changed

+248
-188
lines changed

2 files changed

+248
-188
lines changed

library/core/src/fmt/num.rs

Lines changed: 127 additions & 152 deletions
Original file line numberDiff line numberDiff line change
@@ -279,7 +279,7 @@ macro_rules! impl_Display {
279279
// Format per two digits from the lookup table.
280280
if remain > 9 {
281281
// SAFETY: All of the decimals fit in buf due to MAX_DEC_N
282-
// and the while condition ensures at least 2 more decimals.
282+
// and the if condition ensures at least 2 more decimals.
283283
unsafe { core::hint::assert_unchecked(offset >= 2) }
284284
// SAFETY: The offset counts down from its initial buf.len()
285285
// without underflow due to the previous precondition.
@@ -565,93 +565,6 @@ mod imp {
565565
}
566566
impl_Exp!(i128, u128 as u128 via to_u128 named exp_u128);
567567

568-
/// Helper function for writing a u64 into `buf` going from last to first, with `curr`.
569-
fn parse_u64_into<const N: usize>(mut n: u64, buf: &mut [MaybeUninit<u8>; N], curr: &mut usize) {
570-
let buf_ptr = MaybeUninit::slice_as_mut_ptr(buf);
571-
let lut_ptr = DEC_DIGITS_LUT.as_ptr();
572-
assert!(*curr > 19);
573-
574-
// SAFETY:
575-
// Writes at most 19 characters into the buffer. Guaranteed that any ptr into LUT is at most
576-
// 198, so will never OOB. There is a check above that there are at least 19 characters
577-
// remaining.
578-
unsafe {
579-
if n >= 1e16 as u64 {
580-
let to_parse = n % 1e16 as u64;
581-
n /= 1e16 as u64;
582-
583-
// Some of these are nops but it looks more elegant this way.
584-
let d1 = ((to_parse / 1e14 as u64) % 100) << 1;
585-
let d2 = ((to_parse / 1e12 as u64) % 100) << 1;
586-
let d3 = ((to_parse / 1e10 as u64) % 100) << 1;
587-
let d4 = ((to_parse / 1e8 as u64) % 100) << 1;
588-
let d5 = ((to_parse / 1e6 as u64) % 100) << 1;
589-
let d6 = ((to_parse / 1e4 as u64) % 100) << 1;
590-
let d7 = ((to_parse / 1e2 as u64) % 100) << 1;
591-
let d8 = ((to_parse / 1e0 as u64) % 100) << 1;
592-
593-
*curr -= 16;
594-
595-
ptr::copy_nonoverlapping(lut_ptr.add(d1 as usize), buf_ptr.add(*curr + 0), 2);
596-
ptr::copy_nonoverlapping(lut_ptr.add(d2 as usize), buf_ptr.add(*curr + 2), 2);
597-
ptr::copy_nonoverlapping(lut_ptr.add(d3 as usize), buf_ptr.add(*curr + 4), 2);
598-
ptr::copy_nonoverlapping(lut_ptr.add(d4 as usize), buf_ptr.add(*curr + 6), 2);
599-
ptr::copy_nonoverlapping(lut_ptr.add(d5 as usize), buf_ptr.add(*curr + 8), 2);
600-
ptr::copy_nonoverlapping(lut_ptr.add(d6 as usize), buf_ptr.add(*curr + 10), 2);
601-
ptr::copy_nonoverlapping(lut_ptr.add(d7 as usize), buf_ptr.add(*curr + 12), 2);
602-
ptr::copy_nonoverlapping(lut_ptr.add(d8 as usize), buf_ptr.add(*curr + 14), 2);
603-
}
604-
if n >= 1e8 as u64 {
605-
let to_parse = n % 1e8 as u64;
606-
n /= 1e8 as u64;
607-
608-
// Some of these are nops but it looks more elegant this way.
609-
let d1 = ((to_parse / 1e6 as u64) % 100) << 1;
610-
let d2 = ((to_parse / 1e4 as u64) % 100) << 1;
611-
let d3 = ((to_parse / 1e2 as u64) % 100) << 1;
612-
let d4 = ((to_parse / 1e0 as u64) % 100) << 1;
613-
*curr -= 8;
614-
615-
ptr::copy_nonoverlapping(lut_ptr.add(d1 as usize), buf_ptr.add(*curr + 0), 2);
616-
ptr::copy_nonoverlapping(lut_ptr.add(d2 as usize), buf_ptr.add(*curr + 2), 2);
617-
ptr::copy_nonoverlapping(lut_ptr.add(d3 as usize), buf_ptr.add(*curr + 4), 2);
618-
ptr::copy_nonoverlapping(lut_ptr.add(d4 as usize), buf_ptr.add(*curr + 6), 2);
619-
}
620-
// `n` < 1e8 < (1 << 32)
621-
let mut n = n as u32;
622-
if n >= 1e4 as u32 {
623-
let to_parse = n % 1e4 as u32;
624-
n /= 1e4 as u32;
625-
626-
let d1 = (to_parse / 100) << 1;
627-
let d2 = (to_parse % 100) << 1;
628-
*curr -= 4;
629-
630-
ptr::copy_nonoverlapping(lut_ptr.add(d1 as usize), buf_ptr.add(*curr + 0), 2);
631-
ptr::copy_nonoverlapping(lut_ptr.add(d2 as usize), buf_ptr.add(*curr + 2), 2);
632-
}
633-
634-
// `n` < 1e4 < (1 << 16)
635-
let mut n = n as u16;
636-
if n >= 100 {
637-
let d1 = (n % 100) << 1;
638-
n /= 100;
639-
*curr -= 2;
640-
ptr::copy_nonoverlapping(lut_ptr.add(d1 as usize), buf_ptr.add(*curr), 2);
641-
}
642-
643-
// decode last 1 or 2 chars
644-
if n < 10 {
645-
*curr -= 1;
646-
*buf_ptr.add(*curr) = (n as u8) + b'0';
647-
} else {
648-
let d1 = n << 1;
649-
*curr -= 2;
650-
ptr::copy_nonoverlapping(lut_ptr.add(d1 as usize), buf_ptr.add(*curr), 2);
651-
}
652-
}
653-
}
654-
655568
#[stable(feature = "rust1", since = "1.0.0")]
656569
impl fmt::Display for u128 {
657570
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
@@ -662,90 +575,152 @@ impl fmt::Display for u128 {
662575
#[stable(feature = "rust1", since = "1.0.0")]
663576
impl fmt::Display for i128 {
664577
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
665-
let is_nonnegative = *self >= 0;
666-
let n = if is_nonnegative {
667-
self.to_u128()
668-
} else {
669-
// convert the negative num to positive by summing 1 to its 2s complement
670-
(!self.to_u128()).wrapping_add(1)
671-
};
672-
fmt_u128(n, is_nonnegative, f)
578+
fmt_u128(self.unsigned_abs(), *self >= 0, f)
673579
}
674580
}
675581

676-
/// Specialized optimization for u128. Instead of taking two items at a time, it splits
677-
/// into at most 2 u64s, and then chunks by 10e16, 10e8, 10e4, 10e2, and then 10e1.
678-
/// It also has to handle 1 last item, as 10^40 > 2^128 > 10^39, whereas
679-
/// 10^20 > 2^64 > 10^19.
582+
/// Format optimized for u128. Computation of 128 bits is limited by processing
583+
/// in batches of 16 decimals at a time.
680584
fn fmt_u128(n: u128, is_nonnegative: bool, f: &mut fmt::Formatter<'_>) -> fmt::Result {
585+
// Optimize common-case zero, which would also need special treatment due to
586+
// its "leading" zero.
587+
if n == 0 {
588+
return f.pad_integral(true, "", "0");
589+
}
590+
591+
// u128::MAX has 39 significant decimals.
681592
const MAX_DEC_N: usize = u128::MAX.ilog(10) as usize + 1;
593+
// Buffer decimals with right alignment.
682594
let mut buf = [MaybeUninit::<u8>::uninit(); MAX_DEC_N];
683-
let mut curr = buf.len();
684-
685-
let (n, rem) = udiv_1e19(n);
686-
parse_u64_into(rem, &mut buf, &mut curr);
687-
688-
if n != 0 {
689-
// 0 pad up to point
690-
let target = buf.len() - 19;
691-
// SAFETY: Guaranteed that we wrote at most 19 bytes, and there must be space
692-
// remaining since it has length 39
693-
unsafe {
694-
ptr::write_bytes(
695-
MaybeUninit::slice_as_mut_ptr(&mut buf).add(target),
696-
b'0',
697-
curr - target,
698-
);
699-
}
700-
curr = target;
701-
702-
let (n, rem) = udiv_1e19(n);
703-
parse_u64_into(rem, &mut buf, &mut curr);
704-
// Should this following branch be annotated with unlikely?
705-
if n != 0 {
706-
let target = buf.len() - 38;
707-
// The raw `buf_ptr` pointer is only valid until `buf` is used the next time,
708-
// buf `buf` is not used in this scope so we are good.
709-
let buf_ptr = MaybeUninit::slice_as_mut_ptr(&mut buf);
710-
// SAFETY: At this point we wrote at most 38 bytes, pad up to that point,
711-
// There can only be at most 1 digit remaining.
712-
unsafe {
713-
ptr::write_bytes(buf_ptr.add(target), b'0', curr - target);
714-
curr = target - 1;
715-
*buf_ptr.add(curr) = (n as u8) + b'0';
716-
}
595+
596+
// Take the 16 least-significant decimals.
597+
let (quot_1e16, mod_1e16) = div_rem_1e16(n);
598+
let (mut remain, mut offset) = if quot_1e16 == 0 {
599+
(mod_1e16, MAX_DEC_N)
600+
} else {
601+
// Write digits at buf[23..39].
602+
enc_16lsd::<{ MAX_DEC_N - 16 }>(&mut buf, mod_1e16);
603+
604+
// Take another 16 decimals.
605+
let (quot2, mod2) = div_rem_1e16(quot_1e16);
606+
if quot2 == 0 {
607+
(mod2, MAX_DEC_N - 16)
608+
} else {
609+
// Write digits at buf[7..23].
610+
enc_16lsd::<{ MAX_DEC_N - 32 }>(&mut buf, mod2);
611+
// Quot2 has at most 7 decimals remaining after two 1e16 divisions.
612+
(quot2 as u64, MAX_DEC_N - 32)
717613
}
614+
};
615+
616+
// Format per four digits from the lookup table.
617+
while remain > 999 {
618+
// SAFETY: All of the decimals fit in buf due to MAX_DEC_N
619+
// and the while condition ensures at least 4 more decimals.
620+
unsafe { core::hint::assert_unchecked(offset >= 4) }
621+
// SAFETY: The offset counts down from its initial buf.len()
622+
// without underflow due to the previous precondition.
623+
unsafe { core::hint::assert_unchecked(offset <= buf.len()) }
624+
offset -= 4;
625+
626+
// pull two pairs
627+
let quad = remain % 1_00_00;
628+
remain /= 1_00_00;
629+
let pair1 = (quad / 100) as usize;
630+
let pair2 = (quad % 100) as usize;
631+
buf[offset + 0].write(DEC_DIGITS_LUT[pair1 * 2 + 0]);
632+
buf[offset + 1].write(DEC_DIGITS_LUT[pair1 * 2 + 1]);
633+
buf[offset + 2].write(DEC_DIGITS_LUT[pair2 * 2 + 0]);
634+
buf[offset + 3].write(DEC_DIGITS_LUT[pair2 * 2 + 1]);
635+
}
636+
637+
// Format per two digits from the lookup table.
638+
if remain > 9 {
639+
// SAFETY: All of the decimals fit in buf due to MAX_DEC_N
640+
// and the if condition ensures at least 2 more decimals.
641+
unsafe { core::hint::assert_unchecked(offset >= 2) }
642+
// SAFETY: The offset counts down from its initial buf.len()
643+
// without underflow due to the previous precondition.
644+
unsafe { core::hint::assert_unchecked(offset <= buf.len()) }
645+
offset -= 2;
646+
647+
let pair = (remain % 100) as usize;
648+
remain /= 100;
649+
buf[offset + 0].write(DEC_DIGITS_LUT[pair * 2 + 0]);
650+
buf[offset + 1].write(DEC_DIGITS_LUT[pair * 2 + 1]);
651+
}
652+
653+
// Format the last remaining digit, if any.
654+
if remain != 0 {
655+
// SAFETY: All of the decimals fit in buf due to MAX_DEC_N
656+
// and the if condition ensures (at least) 1 more decimal.
657+
unsafe { core::hint::assert_unchecked(offset >= 1) }
658+
// SAFETY: The offset counts down from its initial buf.len()
659+
// without underflow due to the previous precondition.
660+
unsafe { core::hint::assert_unchecked(offset <= buf.len()) }
661+
offset -= 1;
662+
663+
// Either the compiler sees that remain < 10, or it prevents
664+
// a boundary check up next.
665+
let last = (remain & 15) as usize;
666+
buf[offset].write(DEC_DIGITS_LUT[last * 2 + 1]);
667+
// not used: remain = 0;
718668
}
719669

720-
// SAFETY: `curr` > 0 (since we made `buf` large enough), and all the chars are valid
721-
// UTF-8 since `DEC_DIGITS_LUT` is
722-
let buf_slice = unsafe {
670+
// SAFETY: All buf content from offset onward has been written.
671+
let written = unsafe { buf.get_unchecked(offset..) };
672+
// SAFETY: Writes use ASCII from the lookup table exclusively.
673+
let as_str = unsafe {
723674
str::from_utf8_unchecked(slice::from_raw_parts(
724-
MaybeUninit::slice_as_mut_ptr(&mut buf).add(curr),
725-
buf.len() - curr,
675+
MaybeUninit::slice_as_ptr(written),
676+
written.len(),
726677
))
727678
};
728-
f.pad_integral(is_nonnegative, "", buf_slice)
679+
f.pad_integral(is_nonnegative, "", as_str)
729680
}
730681

731-
/// Partition of `n` into n > 1e19 and rem <= 1e19
682+
/// Encodes the 16 least-significant decimals of n into `buf[OFFSET .. OFFSET +
683+
/// 16 ]`.
684+
fn enc_16lsd<const OFFSET: usize>(buf: &mut [MaybeUninit<u8>; 39], n: u64) {
685+
// Consume the least-significant decimals from a working copy.
686+
let mut remain = n;
687+
688+
// Format per four digits from the lookup table.
689+
for quad_index in (0..4).rev() {
690+
// pull two pairs
691+
let quad = remain % 1_00_00;
692+
remain /= 1_00_00;
693+
let pair1 = (quad / 100) as usize;
694+
let pair2 = (quad % 100) as usize;
695+
buf[quad_index * 4 + OFFSET + 0].write(DEC_DIGITS_LUT[pair1 * 2 + 0]);
696+
buf[quad_index * 4 + OFFSET + 1].write(DEC_DIGITS_LUT[pair1 * 2 + 1]);
697+
buf[quad_index * 4 + OFFSET + 2].write(DEC_DIGITS_LUT[pair2 * 2 + 0]);
698+
buf[quad_index * 4 + OFFSET + 3].write(DEC_DIGITS_LUT[pair2 * 2 + 1]);
699+
}
700+
}
701+
702+
/// Euclidean division plus remainder with constant 1E16 basically consumes 16
703+
/// decimals from n.
732704
///
733-
/// Integer division algorithm is based on the following paper:
705+
/// The integer division algorithm is based on the following paper:
734706
///
735707
/// T. Granlund and P. Montgomery, “Division by Invariant Integers Using Multiplication”
736708
/// in Proc. of the SIGPLAN94 Conference on Programming Language Design and
737709
/// Implementation, 1994, pp. 61–72
738710
///
739-
fn udiv_1e19(n: u128) -> (u128, u64) {
740-
const DIV: u64 = 1e19 as u64;
741-
const FACTOR: u128 = 156927543384667019095894735580191660403;
711+
#[inline]
712+
fn div_rem_1e16(n: u128) -> (u128, u64) {
713+
const D: u128 = 1_0000_0000_0000_0000;
714+
// The check inlines well with the caller flow.
715+
if n < D {
716+
return (0, n as u64);
717+
}
742718

743-
let quot = if n < 1 << 83 {
744-
((n >> 19) as u64 / (DIV >> 19)) as u128
745-
} else {
746-
n.widening_mul(FACTOR).1 >> 62
747-
};
719+
// These constant values are computed with the CHOOSE_MULTIPLIER procedure.
720+
const M_HIGH: u128 = 76624777043294442917917351357515459181;
721+
const SH_POST: u8 = 51;
748722

749-
let rem = (n - quot * DIV as u128) as u64;
750-
(quot, rem)
723+
let quot = n.widening_mul(M_HIGH).1 >> SH_POST;
724+
let rem = n - quot * D;
725+
(quot, rem as u64)
751726
}

0 commit comments

Comments
 (0)