Skip to content

Commit cdf782b

Browse files
committed
make improvements
1 parent 1e8931d commit cdf782b

File tree

5 files changed

+414
-73
lines changed

5 files changed

+414
-73
lines changed

src/checkers.rs

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -132,19 +132,20 @@ const PATH_SIG_TABLE: [u8; 256] = {
132132
t
133133
};
134134

135-
/// Compute a path-signature byte via an unrolled table lookup — branch-free.
136-
/// Returns a bitmask of the flags above.
135+
/// Compute a path-signature byte via Ada's exact 8-at-a-time unrolled lookup.
136+
///
137+
/// Ada C++ uses `for (; i + 7 < size; i += 8)` — we match that exactly.
137138
pub fn path_signature(input: &str) -> u8 {
138139
let b = input.as_bytes();
139140
let mut acc = 0u8;
140141
let mut i = 0;
141-
// Unrolled 4-at-a-time — same as Ada C++ style
142-
while i + 4 <= b.len() {
143-
acc |= PATH_SIG_TABLE[b[i] as usize]
144-
| PATH_SIG_TABLE[b[i+1] as usize]
145-
| PATH_SIG_TABLE[b[i+2] as usize]
146-
| PATH_SIG_TABLE[b[i+3] as usize];
147-
i += 4;
142+
// 8-at-a-time — Ada C++ uses this exact unroll factor
143+
while i + 8 <= b.len() {
144+
acc |= PATH_SIG_TABLE[b[i ] as usize] | PATH_SIG_TABLE[b[i+1] as usize]
145+
| PATH_SIG_TABLE[b[i+2] as usize] | PATH_SIG_TABLE[b[i+3] as usize]
146+
| PATH_SIG_TABLE[b[i+4] as usize] | PATH_SIG_TABLE[b[i+5] as usize]
147+
| PATH_SIG_TABLE[b[i+6] as usize] | PATH_SIG_TABLE[b[i+7] as usize];
148+
i += 8;
148149
}
149150
while i < b.len() { acc |= PATH_SIG_TABLE[b[i] as usize]; i += 1; }
150151
acc

src/helpers.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ pub fn remove_ascii_tab_or_newline(s: &mut String) {
6666
/// Trim leading and trailing C0 control characters and ASCII space.
6767
/// Returns a `&str` slice into the original — **zero allocation**.
6868
#[inline]
69+
#[allow(dead_code)]
6970
pub fn trim_c0_whitespace(s: &str) -> &str {
7071
let start = s
7172
.as_bytes()

src/parser.rs

Lines changed: 266 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,11 @@ use crate::helpers::{
1818
find_authority_delimiter, find_authority_delimiter_special, get_host_delimiter_location,
1919
shorten_path, strip_tabs_newlines, trim_c0_whitespace,
2020
};
21-
use crate::scheme::SchemeType as Scheme;
22-
use crate::unicode::{is_alnum_plus, percent_encode};
21+
use crate::scheme::{get_scheme_type_lower, SchemeType as Scheme};
22+
use crate::unicode::{
23+
contains_xn_prefix_pub, is_alnum_plus,
24+
percent_encode,
25+
};
2326

2427
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
2528
enum State {
@@ -54,23 +57,29 @@ pub fn parse_url(user_input: &str, base: Option<&Url>) -> Option<Url> {
5457
return None;
5558
}
5659

57-
// Strip tabs/newlines — CoW: zero allocation when none present (common case)
58-
let stripped = strip_tabs_newlines(user_input);
59-
60-
// Trim C0 whitespace — borrow from the (possibly borrowed) cow, no allocation
61-
let url_data = trim_c0_whitespace(&stripped);
60+
// ── Fast path ─────────────────────────────────────────────────────────
61+
// For absolute URLs without a base we try the ultra-fast path BEFORE
62+
// any pre-processing. It inlines C0 trim, tab detection, fragment split,
63+
// host validation, and buffer build in a single forward scan, saving the
64+
// three separate pre-processing passes that the slow path performs.
65+
if base.is_none()
66+
&& let Some(url) = try_parse_absolute_fast(user_input) {
67+
return Some(url);
68+
}
6269

63-
// Extract fragment — split in-place, no allocation (fragment borrows from url_data)
64-
let (url_data, fragment) = match url_data.find('#') {
65-
None => (url_data, None),
66-
Some(p) => (&url_data[..p], Some(&url_data[p + 1..])),
70+
// ── Slow path: full pre-processing + state machine ────────────────────
71+
let stripped = strip_tabs_newlines(user_input);
72+
let trimmed = trim_c0_whitespace(&stripped);
73+
let (url_data, fragment): (&str, Option<&str>) = match trimmed.find('#') {
74+
None => (trimmed, None),
75+
Some(p) => (&trimmed[..p], Some(&trimmed[p + 1..])),
6776
};
6877

6978
let input_size = url_data.len();
7079
let b = url_data.as_bytes();
7180

7281
let mut url = Url::empty();
73-
url.buffer.reserve(input_size + 16);
82+
url.buffer.reserve(input_size + 4);
7483

7584
let mut state = State::SchemeStart;
7685
let mut pos: usize = 0;
@@ -542,3 +551,248 @@ pub fn parse_url(user_input: &str, base: Option<&Url>) -> Option<Url> {
542551
}
543552
Some(url)
544553
}
554+
555+
// =============================================================================
556+
// Fast-path builder for absolute special URLs
557+
// =============================================================================
558+
//
559+
// Handles the overwhelmingly common case: an absolute URL like
560+
// `https://hostname/path?query`
561+
// with no base, no credentials, ASCII-only lowercase host, and no path dots.
562+
//
563+
// Single forward scan replaces the full state-machine loop. Falls back to
564+
// `None` for anything unusual so the slow path covers all edge cases.
565+
566+
/// Try to parse `url_data` (already C0-trimmed, tab/newline-free, fragment-stripped)
567+
/// as a simple absolute special URL without invoking the state machine.
568+
/// Fast-path builder for absolute special URLs — takes RAW (unprocessed) input.
569+
///
570+
/// Inlines C0 trim, tab/newline detection, fragment split, host validation,
571+
/// and buffer construction into a minimal number of forward scans.
572+
/// Falls back to `None` (→ state machine) for anything unusual.
573+
#[inline]
574+
pub(crate) fn try_parse_absolute_fast(raw_input: &str) -> Option<Url> {
575+
use crate::{HostKind, OMITTED};
576+
use crate::character_sets::FRAGMENT_PERCENT_ENCODE;
577+
use crate::unicode::DOMAIN_CHECK;
578+
579+
let raw = raw_input.as_bytes();
580+
581+
// ── C0 whitespace trim (branchless for typical URLs with none) ─────────
582+
let start = if !raw.is_empty() && raw[0] <= b' ' {
583+
raw.iter().position(|&b| b > b' ')?
584+
} else { 0 };
585+
let end = if !raw.is_empty() && raw[raw.len()-1] <= b' ' {
586+
raw.iter().rposition(|&b| b > b' ').map(|i| i + 1)?
587+
} else { raw.len() };
588+
if start >= end { return None; }
589+
let b = &raw[start..end];
590+
591+
// ── Scheme detection (≤ 6 bytes) ──────────────────────────────────────
592+
if b.is_empty() || !is_alpha(b[0]) { return None; }
593+
594+
let colon = {
595+
let mut i = 1usize;
596+
loop {
597+
if i >= b.len().min(7) { return None; }
598+
match b[i] {
599+
b':' => break i,
600+
c if !is_alnum_plus(c) => return None, // invalid scheme char (incl. \t\n\r)
601+
_ => i += 1,
602+
}
603+
}
604+
};
605+
606+
// Perfect-hash scheme type (no string comparison)
607+
let scheme_bytes = &b[..colon];
608+
let scheme = {
609+
let s = unsafe { core::str::from_utf8_unchecked(scheme_bytes) };
610+
let t = get_scheme_type_lower(s);
611+
if t == Scheme::NotSpecial { crate::scheme::get_scheme_type(s) } else { t }
612+
};
613+
if !scheme.is_special() || scheme == Scheme::File { return None; }
614+
615+
// ── Require "://" ──────────────────────────────────────────────────────
616+
if b.len() < colon + 3 || b[colon+1] != b'/' || b[colon+2] != b'/' { return None; }
617+
let auth_start = colon + 3;
618+
619+
// ── Single-pass authority scan ─────────────────────────────────────────
620+
// Simultaneously: find auth end, detect '@' / tabs / non-ASCII / uppercase,
621+
// IPv4 flag (only digits+dots), xn-- flag ('x' present), forbidden chars.
622+
let mut auth_end = auth_start;
623+
let mut port_colon: Option<usize> = None;
624+
let mut has_x = false; // xn-- candidate ('x' seen in host)
625+
626+
while auth_end < b.len() {
627+
let c = b[auth_end];
628+
match c {
629+
b'/' | b'?' | b'#' | b'\\' => break,
630+
b'@' => return None,
631+
b':' if port_colon.is_none() => { port_colon = Some(auth_end); auth_end += 1; }
632+
b'\t' | b'\n' | b'\r' => return None,
633+
c if c >= 0x80 => return None,
634+
b'0'..=b'9' | b'.' => { auth_end += 1; }
635+
c => {
636+
if c == b'x' { has_x = true; }
637+
if DOMAIN_CHECK[c as usize] != 0 { return None; }
638+
auth_end += 1;
639+
}
640+
}
641+
}
642+
643+
let host_end_in_input = port_colon.unwrap_or(auth_end);
644+
let host = &b[auth_start..host_end_in_input];
645+
if host.is_empty() { return None; }
646+
647+
// IPv4 quick-filter: check the last *significant* (non-dot) byte of the host.
648+
// is_ipv4 strips trailing dots internally, so we mirror that here.
649+
// For TLD hostnames (.com/.org/.net) the last letter is 'm','g','t' — never
650+
// in {0-9, a-f, x} — so is_ipv4 is not called for typical domain names.
651+
{
652+
let last_sig = host.iter().rev().find(|&&c| c != b'.').copied().unwrap_or(0);
653+
let maybe_ipv4 = last_sig.is_ascii_digit()
654+
|| matches!(last_sig, b'a'..=b'f')
655+
|| last_sig == b'x';
656+
if maybe_ipv4 {
657+
let host_str = unsafe { core::str::from_utf8_unchecked(host) };
658+
if crate::checkers::is_ipv4(host_str) { return None; }
659+
}
660+
}
661+
662+
// xn-- check: only if 'x' was seen (zero cost for typical .com/.org hosts)
663+
if has_x {
664+
let host_str = unsafe { core::str::from_utf8_unchecked(host) };
665+
if contains_xn_prefix_pub(host_str) { return None; }
666+
}
667+
668+
// ── Port ────────────────────────────────────────────────────────────────
669+
let port_val: u32 = if let Some(pc) = port_colon {
670+
let port_bytes = &b[pc + 1..auth_end];
671+
if port_bytes.is_empty() {
672+
OMITTED
673+
} else {
674+
if !port_bytes.iter().all(|&c| c.is_ascii_digit()) { return None; }
675+
let n: u32 = port_bytes.iter().fold(0u32, |a, &c| a * 10 + (c - b'0') as u32);
676+
if n > 65535 { return None; }
677+
let def = scheme.default_port();
678+
if def != 0 && n as u16 == def { OMITTED } else { n }
679+
}
680+
} else {
681+
OMITTED
682+
};
683+
684+
// ── Path + query + fragment scan ───────────────────────────────────────
685+
let path_start = auth_end;
686+
let mut query_start: Option<usize> = None;
687+
let mut frag_start: Option<usize> = None;
688+
let path_end: usize;
689+
690+
{
691+
let mut i = path_start;
692+
loop {
693+
if i >= b.len() { path_end = i; break; }
694+
match b[i] {
695+
b'?' => { path_end = i; query_start = Some(i); break; }
696+
b'#' => { path_end = i; frag_start = Some(i); break; }
697+
b'\\' => return None, // backslash needs normalisation
698+
_ => {}
699+
}
700+
i += 1;
701+
}
702+
}
703+
704+
// Path: use path_signature to detect encoding needs + dot-segments.
705+
let path_bytes = &b[path_start..path_end];
706+
let path_sig = crate::checkers::path_signature(
707+
unsafe { core::str::from_utf8_unchecked(path_bytes) }
708+
);
709+
if path_sig & 0x0B != 0 { return None; } // needs encoding / backslash / percent
710+
if path_sig & 0x04 != 0 {
711+
// Has a dot — check for actual dot-segments (SIMD str::contains)
712+
let path_str = unsafe { core::str::from_utf8_unchecked(path_bytes) };
713+
if path_str.contains("/.") { return None; }
714+
}
715+
716+
// Query: check for characters needing encoding
717+
let query_end = frag_start.unwrap_or(b.len());
718+
if let Some(qs) = query_start {
719+
let qbytes = &b[qs + 1..query_end];
720+
let encode_set = if scheme.default_port() != 0 {
721+
&crate::character_sets::SPECIAL_QUERY_PERCENT_ENCODE
722+
} else {
723+
&crate::character_sets::QUERY_PERCENT_ENCODE
724+
};
725+
if crate::unicode::percent_encode_index(
726+
unsafe { core::str::from_utf8_unchecked(qbytes) },
727+
encode_set,
728+
) != qbytes.len() {
729+
return None; // query needs encoding
730+
}
731+
}
732+
733+
// Fragment from '#' in the URL (may be None if no '#')
734+
let fragment: Option<&str> = frag_start.map(|fs| {
735+
unsafe { core::str::from_utf8_unchecked(&b[fs + 1..]) }
736+
});
737+
738+
// ── Build URL buffer in a single forward write pass ────────────────────
739+
let total = colon + 1 + 2 // scheme: //
740+
+ host.len()
741+
+ if port_val != OMITTED { 6 } else { 0 } // :NNNNN
742+
+ (path_end - path_start).max(1)
743+
+ query_start.map_or(0, |qs| query_end - qs)
744+
+ fragment.map_or(0, |f| f.len() + 1);
745+
746+
let mut url = Url::empty();
747+
url.scheme = scheme;
748+
url.buffer.reserve(total + 4);
749+
750+
// Scheme (lowercase) + ':'
751+
for &c in scheme_bytes { url.buffer.push((c | 0x20) as char); }
752+
url.buffer.push(':');
753+
url.components.protocol_end = url.buffer.len() as u32;
754+
755+
// "//"
756+
url.buffer.push('/');
757+
url.buffer.push('/');
758+
url.components.username_end = url.buffer.len() as u32;
759+
url.components.host_start = url.buffer.len() as u32;
760+
761+
// Host (already validated: lowercase ASCII, no forbidden chars)
762+
url.buffer.push_str(unsafe { core::str::from_utf8_unchecked(host) });
763+
url.components.host_end = url.buffer.len() as u32;
764+
765+
// Port
766+
if port_val != OMITTED {
767+
url.buffer.push(':');
768+
let mut tmp = [0u8; 5]; let mut n = port_val; let mut len = 0usize;
769+
loop { tmp[len] = b'0' + (n % 10) as u8; n /= 10; len += 1; if n == 0 { break; } }
770+
for k in (0..len).rev() { url.buffer.push(tmp[k] as char); }
771+
url.components.port = port_val;
772+
}
773+
url.components.pathname_start = url.buffer.len() as u32;
774+
775+
// Path
776+
if path_bytes.is_empty() {
777+
url.buffer.push('/');
778+
} else {
779+
url.buffer.push_str(unsafe { core::str::from_utf8_unchecked(path_bytes) });
780+
}
781+
782+
// Query (with leading '?')
783+
if let Some(qs) = query_start {
784+
url.components.search_start = url.buffer.len() as u32;
785+
url.buffer.push_str(unsafe { core::str::from_utf8_unchecked(&b[qs..query_end]) });
786+
}
787+
788+
// Fragment (percent-encoded, with leading '#')
789+
if let Some(frag) = fragment {
790+
url.components.hash_start = url.buffer.len() as u32;
791+
url.buffer.push('#');
792+
let enc = percent_encode(frag, &FRAGMENT_PERCENT_ENCODE);
793+
url.buffer.push_str(&enc);
794+
}
795+
796+
url.host_kind = HostKind::Domain;
797+
Some(url)
798+
}

0 commit comments

Comments
 (0)