@@ -18,8 +18,11 @@ use crate::helpers::{
1818 find_authority_delimiter, find_authority_delimiter_special, get_host_delimiter_location,
1919 shorten_path, strip_tabs_newlines, trim_c0_whitespace,
2020} ;
21- use crate :: scheme:: SchemeType as Scheme ;
22- use crate :: unicode:: { is_alnum_plus, percent_encode} ;
21+ use crate :: scheme:: { get_scheme_type_lower, SchemeType as Scheme } ;
22+ use crate :: unicode:: {
23+ contains_xn_prefix_pub, is_alnum_plus,
24+ percent_encode,
25+ } ;
2326
2427#[ derive( Debug , Clone , Copy , PartialEq , Eq ) ]
2528enum State {
@@ -54,23 +57,29 @@ pub fn parse_url(user_input: &str, base: Option<&Url>) -> Option<Url> {
5457 return None ;
5558 }
5659
57- // Strip tabs/newlines — CoW: zero allocation when none present (common case)
58- let stripped = strip_tabs_newlines ( user_input) ;
59-
60- // Trim C0 whitespace — borrow from the (possibly borrowed) cow, no allocation
61- let url_data = trim_c0_whitespace ( & stripped) ;
60+ // ── Fast path ─────────────────────────────────────────────────────────
61+ // For absolute URLs without a base we try the ultra-fast path BEFORE
62+ // any pre-processing. It inlines C0 trim, tab detection, fragment split,
63+ // host validation, and buffer build in a single forward scan, saving the
64+ // three separate pre-processing passes that the slow path performs.
65+ if base. is_none ( )
66+ && let Some ( url) = try_parse_absolute_fast ( user_input) {
67+ return Some ( url) ;
68+ }
6269
63- // Extract fragment — split in-place, no allocation (fragment borrows from url_data)
64- let ( url_data, fragment) = match url_data. find ( '#' ) {
65- None => ( url_data, None ) ,
66- Some ( p) => ( & url_data[ ..p] , Some ( & url_data[ p + 1 ..] ) ) ,
70+ // ── Slow path: full pre-processing + state machine ────────────────────
71+ let stripped = strip_tabs_newlines ( user_input) ;
72+ let trimmed = trim_c0_whitespace ( & stripped) ;
73+ let ( url_data, fragment) : ( & str , Option < & str > ) = match trimmed. find ( '#' ) {
74+ None => ( trimmed, None ) ,
75+ Some ( p) => ( & trimmed[ ..p] , Some ( & trimmed[ p + 1 ..] ) ) ,
6776 } ;
6877
6978 let input_size = url_data. len ( ) ;
7079 let b = url_data. as_bytes ( ) ;
7180
7281 let mut url = Url :: empty ( ) ;
73- url. buffer . reserve ( input_size + 16 ) ;
82+ url. buffer . reserve ( input_size + 4 ) ;
7483
7584 let mut state = State :: SchemeStart ;
7685 let mut pos: usize = 0 ;
@@ -542,3 +551,248 @@ pub fn parse_url(user_input: &str, base: Option<&Url>) -> Option<Url> {
542551 }
543552 Some ( url)
544553}
554+
555+ // =============================================================================
556+ // Fast-path builder for absolute special URLs
557+ // =============================================================================
558+ //
559+ // Handles the overwhelmingly common case: an absolute URL like
560+ // `https://hostname/path?query`
561+ // with no base, no credentials, ASCII-only lowercase host, and no path dots.
562+ //
563+ // Single forward scan replaces the full state-machine loop. Falls back to
564+ // `None` for anything unusual so the slow path covers all edge cases.
565+
566+ /// Try to parse `url_data` (already C0-trimmed, tab/newline-free, fragment-stripped)
567+ /// as a simple absolute special URL without invoking the state machine.
568+ /// Fast-path builder for absolute special URLs — takes RAW (unprocessed) input.
569+ ///
570+ /// Inlines C0 trim, tab/newline detection, fragment split, host validation,
571+ /// and buffer construction into a minimal number of forward scans.
572+ /// Falls back to `None` (→ state machine) for anything unusual.
573+ #[ inline]
574+ pub ( crate ) fn try_parse_absolute_fast ( raw_input : & str ) -> Option < Url > {
575+ use crate :: { HostKind , OMITTED } ;
576+ use crate :: character_sets:: FRAGMENT_PERCENT_ENCODE ;
577+ use crate :: unicode:: DOMAIN_CHECK ;
578+
579+ let raw = raw_input. as_bytes ( ) ;
580+
581+ // ── C0 whitespace trim (branchless for typical URLs with none) ─────────
582+ let start = if !raw. is_empty ( ) && raw[ 0 ] <= b' ' {
583+ raw. iter ( ) . position ( |& b| b > b' ' ) ?
584+ } else { 0 } ;
585+ let end = if !raw. is_empty ( ) && raw[ raw. len ( ) -1 ] <= b' ' {
586+ raw. iter ( ) . rposition ( |& b| b > b' ' ) . map ( |i| i + 1 ) ?
587+ } else { raw. len ( ) } ;
588+ if start >= end { return None ; }
589+ let b = & raw [ start..end] ;
590+
591+ // ── Scheme detection (≤ 6 bytes) ──────────────────────────────────────
592+ if b. is_empty ( ) || !is_alpha ( b[ 0 ] ) { return None ; }
593+
594+ let colon = {
595+ let mut i = 1usize ;
596+ loop {
597+ if i >= b. len ( ) . min ( 7 ) { return None ; }
598+ match b[ i] {
599+ b':' => break i,
600+ c if !is_alnum_plus ( c) => return None , // invalid scheme char (incl. \t\n\r)
601+ _ => i += 1 ,
602+ }
603+ }
604+ } ;
605+
606+ // Perfect-hash scheme type (no string comparison)
607+ let scheme_bytes = & b[ ..colon] ;
608+ let scheme = {
609+ let s = unsafe { core:: str:: from_utf8_unchecked ( scheme_bytes) } ;
610+ let t = get_scheme_type_lower ( s) ;
611+ if t == Scheme :: NotSpecial { crate :: scheme:: get_scheme_type ( s) } else { t }
612+ } ;
613+ if !scheme. is_special ( ) || scheme == Scheme :: File { return None ; }
614+
615+ // ── Require "://" ──────────────────────────────────────────────────────
616+ if b. len ( ) < colon + 3 || b[ colon+1 ] != b'/' || b[ colon+2 ] != b'/' { return None ; }
617+ let auth_start = colon + 3 ;
618+
619+ // ── Single-pass authority scan ─────────────────────────────────────────
620+ // Simultaneously: find auth end, detect '@' / tabs / non-ASCII / uppercase,
621+ // IPv4 flag (only digits+dots), xn-- flag ('x' present), forbidden chars.
622+ let mut auth_end = auth_start;
623+ let mut port_colon: Option < usize > = None ;
624+ let mut has_x = false ; // xn-- candidate ('x' seen in host)
625+
626+ while auth_end < b. len ( ) {
627+ let c = b[ auth_end] ;
628+ match c {
629+ b'/' | b'?' | b'#' | b'\\' => break ,
630+ b'@' => return None ,
631+ b':' if port_colon. is_none ( ) => { port_colon = Some ( auth_end) ; auth_end += 1 ; }
632+ b'\t' | b'\n' | b'\r' => return None ,
633+ c if c >= 0x80 => return None ,
634+ b'0' ..=b'9' | b'.' => { auth_end += 1 ; }
635+ c => {
636+ if c == b'x' { has_x = true ; }
637+ if DOMAIN_CHECK [ c as usize ] != 0 { return None ; }
638+ auth_end += 1 ;
639+ }
640+ }
641+ }
642+
643+ let host_end_in_input = port_colon. unwrap_or ( auth_end) ;
644+ let host = & b[ auth_start..host_end_in_input] ;
645+ if host. is_empty ( ) { return None ; }
646+
647+ // IPv4 quick-filter: check the last *significant* (non-dot) byte of the host.
648+ // is_ipv4 strips trailing dots internally, so we mirror that here.
649+ // For TLD hostnames (.com/.org/.net) the last letter is 'm','g','t' — never
650+ // in {0-9, a-f, x} — so is_ipv4 is not called for typical domain names.
651+ {
652+ let last_sig = host. iter ( ) . rev ( ) . find ( |& & c| c != b'.' ) . copied ( ) . unwrap_or ( 0 ) ;
653+ let maybe_ipv4 = last_sig. is_ascii_digit ( )
654+ || matches ! ( last_sig, b'a' ..=b'f' )
655+ || last_sig == b'x' ;
656+ if maybe_ipv4 {
657+ let host_str = unsafe { core:: str:: from_utf8_unchecked ( host) } ;
658+ if crate :: checkers:: is_ipv4 ( host_str) { return None ; }
659+ }
660+ }
661+
662+ // xn-- check: only if 'x' was seen (zero cost for typical .com/.org hosts)
663+ if has_x {
664+ let host_str = unsafe { core:: str:: from_utf8_unchecked ( host) } ;
665+ if contains_xn_prefix_pub ( host_str) { return None ; }
666+ }
667+
668+ // ── Port ────────────────────────────────────────────────────────────────
669+ let port_val: u32 = if let Some ( pc) = port_colon {
670+ let port_bytes = & b[ pc + 1 ..auth_end] ;
671+ if port_bytes. is_empty ( ) {
672+ OMITTED
673+ } else {
674+ if !port_bytes. iter ( ) . all ( |& c| c. is_ascii_digit ( ) ) { return None ; }
675+ let n: u32 = port_bytes. iter ( ) . fold ( 0u32 , |a, & c| a * 10 + ( c - b'0' ) as u32 ) ;
676+ if n > 65535 { return None ; }
677+ let def = scheme. default_port ( ) ;
678+ if def != 0 && n as u16 == def { OMITTED } else { n }
679+ }
680+ } else {
681+ OMITTED
682+ } ;
683+
684+ // ── Path + query + fragment scan ───────────────────────────────────────
685+ let path_start = auth_end;
686+ let mut query_start: Option < usize > = None ;
687+ let mut frag_start: Option < usize > = None ;
688+ let path_end: usize ;
689+
690+ {
691+ let mut i = path_start;
692+ loop {
693+ if i >= b. len ( ) { path_end = i; break ; }
694+ match b[ i] {
695+ b'?' => { path_end = i; query_start = Some ( i) ; break ; }
696+ b'#' => { path_end = i; frag_start = Some ( i) ; break ; }
697+ b'\\' => return None , // backslash needs normalisation
698+ _ => { }
699+ }
700+ i += 1 ;
701+ }
702+ }
703+
704+ // Path: use path_signature to detect encoding needs + dot-segments.
705+ let path_bytes = & b[ path_start..path_end] ;
706+ let path_sig = crate :: checkers:: path_signature (
707+ unsafe { core:: str:: from_utf8_unchecked ( path_bytes) }
708+ ) ;
709+ if path_sig & 0x0B != 0 { return None ; } // needs encoding / backslash / percent
710+ if path_sig & 0x04 != 0 {
711+ // Has a dot — check for actual dot-segments (SIMD str::contains)
712+ let path_str = unsafe { core:: str:: from_utf8_unchecked ( path_bytes) } ;
713+ if path_str. contains ( "/." ) { return None ; }
714+ }
715+
716+ // Query: check for characters needing encoding
717+ let query_end = frag_start. unwrap_or ( b. len ( ) ) ;
718+ if let Some ( qs) = query_start {
719+ let qbytes = & b[ qs + 1 ..query_end] ;
720+ let encode_set = if scheme. default_port ( ) != 0 {
721+ & crate :: character_sets:: SPECIAL_QUERY_PERCENT_ENCODE
722+ } else {
723+ & crate :: character_sets:: QUERY_PERCENT_ENCODE
724+ } ;
725+ if crate :: unicode:: percent_encode_index (
726+ unsafe { core:: str:: from_utf8_unchecked ( qbytes) } ,
727+ encode_set,
728+ ) != qbytes. len ( ) {
729+ return None ; // query needs encoding
730+ }
731+ }
732+
733+ // Fragment from '#' in the URL (may be None if no '#')
734+ let fragment: Option < & str > = frag_start. map ( |fs| {
735+ unsafe { core:: str:: from_utf8_unchecked ( & b[ fs + 1 ..] ) }
736+ } ) ;
737+
738+ // ── Build URL buffer in a single forward write pass ────────────────────
739+ let total = colon + 1 + 2 // scheme: //
740+ + host. len ( )
741+ + if port_val != OMITTED { 6 } else { 0 } // :NNNNN
742+ + ( path_end - path_start) . max ( 1 )
743+ + query_start. map_or ( 0 , |qs| query_end - qs)
744+ + fragment. map_or ( 0 , |f| f. len ( ) + 1 ) ;
745+
746+ let mut url = Url :: empty ( ) ;
747+ url. scheme = scheme;
748+ url. buffer . reserve ( total + 4 ) ;
749+
750+ // Scheme (lowercase) + ':'
751+ for & c in scheme_bytes { url. buffer . push ( ( c | 0x20 ) as char ) ; }
752+ url. buffer . push ( ':' ) ;
753+ url. components . protocol_end = url. buffer . len ( ) as u32 ;
754+
755+ // "//"
756+ url. buffer . push ( '/' ) ;
757+ url. buffer . push ( '/' ) ;
758+ url. components . username_end = url. buffer . len ( ) as u32 ;
759+ url. components . host_start = url. buffer . len ( ) as u32 ;
760+
761+ // Host (already validated: lowercase ASCII, no forbidden chars)
762+ url. buffer . push_str ( unsafe { core:: str:: from_utf8_unchecked ( host) } ) ;
763+ url. components . host_end = url. buffer . len ( ) as u32 ;
764+
765+ // Port
766+ if port_val != OMITTED {
767+ url. buffer . push ( ':' ) ;
768+ let mut tmp = [ 0u8 ; 5 ] ; let mut n = port_val; let mut len = 0usize ;
769+ loop { tmp[ len] = b'0' + ( n % 10 ) as u8 ; n /= 10 ; len += 1 ; if n == 0 { break ; } }
770+ for k in ( 0 ..len) . rev ( ) { url. buffer . push ( tmp[ k] as char ) ; }
771+ url. components . port = port_val;
772+ }
773+ url. components . pathname_start = url. buffer . len ( ) as u32 ;
774+
775+ // Path
776+ if path_bytes. is_empty ( ) {
777+ url. buffer . push ( '/' ) ;
778+ } else {
779+ url. buffer . push_str ( unsafe { core:: str:: from_utf8_unchecked ( path_bytes) } ) ;
780+ }
781+
782+ // Query (with leading '?')
783+ if let Some ( qs) = query_start {
784+ url. components . search_start = url. buffer . len ( ) as u32 ;
785+ url. buffer . push_str ( unsafe { core:: str:: from_utf8_unchecked ( & b[ qs..query_end] ) } ) ;
786+ }
787+
788+ // Fragment (percent-encoded, with leading '#')
789+ if let Some ( frag) = fragment {
790+ url. components . hash_start = url. buffer . len ( ) as u32 ;
791+ url. buffer . push ( '#' ) ;
792+ let enc = percent_encode ( frag, & FRAGMENT_PERCENT_ENCODE ) ;
793+ url. buffer . push_str ( & enc) ;
794+ }
795+
796+ url. host_kind = HostKind :: Domain ;
797+ Some ( url)
798+ }
0 commit comments