33// For the full copyright and license information, please view the LICENSE
44// file that was distributed with this source code.
55
6- use std:: { cmp:: Ordering , sync:: OnceLock } ;
6+ use std:: { cell :: RefCell , cmp:: Ordering , collections :: HashMap , sync:: OnceLock } ;
77
88use icu_collator:: { self , CollatorBorrowed } ;
99
@@ -15,6 +15,16 @@ pub use icu_collator::options::{
1515
1616static COLLATOR : OnceLock < CollatorBorrowed > = OnceLock :: new ( ) ;
1717
/// Maximum number of memoized comparisons kept per thread before the cache is
/// emptied and refilled from scratch.
const CACHE_SIZE_LIMIT: usize = 1000;

/// Cache key: owned copies of the `(left, right)` byte strings of a comparison.
type ComparisonKey = (Vec<u8>, Vec<u8>);

/// Per-thread map from a compared pair to its previously computed ordering.
type ComparisonCache = RefCell<HashMap<ComparisonKey, Ordering>>;

thread_local! {
    // Thread-local, so a `RefCell` is enough for interior mutability and no
    // lock is needed.
    static COMPARISON_CACHE: ComparisonCache = ComparisonCache::default();
}
27+
1828/// Will initialize the collator if not already initialized.
1929/// returns `true` if initialization happened
2030pub fn try_init_collator ( opts : CollatorOptions ) -> bool {
@@ -86,3 +96,76 @@ pub fn locale_cmp(left: &[u8], right: &[u8]) -> Ordering {
8696 . compare_utf8 ( left, right)
8797 }
8898}
99+
100+ /// Get a reference to the initialized collator for performance-critical paths
101+ #[ inline]
102+ pub fn get_collator ( ) -> & ' static CollatorBorrowed < ' static > {
103+ COLLATOR . get ( ) . expect ( "Collator was not initialized" )
104+ }
105+
106+ /// Hybrid comparison: byte-first with caching and locale fallback
107+ #[ inline]
108+ pub fn locale_cmp_unchecked ( left : & [ u8 ] , right : & [ u8 ] ) -> Ordering {
109+ // Fast path: try byte comparison first
110+ let byte_cmp = left. cmp ( right) ;
111+
112+ // If strings are identical by bytes, they're identical by locale too
113+ if byte_cmp == Ordering :: Equal {
114+ return Ordering :: Equal ;
115+ }
116+
117+ // If both are pure ASCII, byte comparison is sufficient for most locales
118+ if left. is_ascii ( ) && right. is_ascii ( ) {
119+ // For ASCII in en_US and similar locales, byte order equals collation order
120+ // This covers the vast majority of cases
121+ return byte_cmp;
122+ }
123+
124+ // Check cache for repeated comparisons (common in join operations)
125+ let cache_key = if left. len ( ) + right. len ( ) < 64 {
126+ // Only cache small strings
127+ Some ( ( left. to_vec ( ) , right. to_vec ( ) ) )
128+ } else {
129+ None
130+ } ;
131+
132+ if let Some ( ref key) = cache_key {
133+ if let Ok ( Some ( cached_result) ) = COMPARISON_CACHE . try_with ( |c| c. borrow ( ) . get ( key) . copied ( ) )
134+ {
135+ return cached_result;
136+ }
137+ }
138+
139+ // Compute result using ICU for non-ASCII data
140+ let result = match ( std:: str:: from_utf8 ( left) , std:: str:: from_utf8 ( right) ) {
141+ ( Ok ( l) , Ok ( r) ) => {
142+ let l_ascii = l. is_ascii ( ) ;
143+ let r_ascii = r. is_ascii ( ) ;
144+
145+ // If one is ASCII and other isn't, use ICU
146+ if l_ascii != r_ascii {
147+ get_collator ( ) . compare ( l, r)
148+ } else if !l_ascii {
149+ // Both non-ASCII, use ICU
150+ get_collator ( ) . compare ( l, r)
151+ } else {
152+ // Both ASCII - byte comparison should be correct
153+ byte_cmp
154+ }
155+ }
156+ _ => byte_cmp, // Invalid UTF-8, use byte comparison
157+ } ;
158+
159+ // Cache the result for future lookups
160+ if let Some ( key) = cache_key {
161+ let _ = COMPARISON_CACHE . try_with ( |c| {
162+ let mut cache = c. borrow_mut ( ) ;
163+ if cache. len ( ) >= CACHE_SIZE_LIMIT {
164+ cache. clear ( ) ; // Simple eviction policy
165+ }
166+ cache. insert ( key, result) ;
167+ } ) ;
168+ }
169+
170+ result
171+ }
0 commit comments