@@ -4,17 +4,23 @@ use pyo3::prelude::*;
44use rand:: { Rng , SeedableRng } ;
55use rand_pcg:: Pcg64 ;
66use regex:: Regex ;
7- use std:: sync:: Arc ;
7+ use std:: sync:: { Arc , OnceLock } ;
88use xxhash_rust:: xxh3;
99
10+ static WHITESPACE_RE : OnceLock < Regex > = OnceLock :: new ( ) ;
11+
12+ fn whitespace_regex ( ) -> & ' static Regex {
13+ WHITESPACE_RE . get_or_init ( || Regex :: new ( r"\s+" ) . unwrap ( ) )
14+ }
15+
1016/// Clean text using the SlimPajama text cleaning process.
1117/// 1. Lowercase
1218/// 2. Remove punctuation
1319/// 3. Replace multiple whitespace with single space
1420/// 4. Trim
1521pub fn clean_text ( arr : & StringArray ) -> PyResult < Arc < StringArray > > {
1622 let mut builder = StringBuilder :: with_capacity ( arr. len ( ) , arr. len ( ) * 50 ) ;
17- let whitespace_re = Regex :: new ( r"\s+" ) . map_err ( |e| PyValueError :: new_err ( e . to_string ( ) ) ) ? ;
23+ let whitespace_re = whitespace_regex ( ) ;
1824 let punctuation: & [ char ] = & [
1925 '!' , '"' , '#' , '$' , '%' , '&' , '\'' , '(' , ')' , '*' , '+' , ',' , '-' , '.' , '/' , ':' , ';' , '<' ,
2026 '=' , '>' , '?' , '@' , '[' , '\\' , ']' , '^' , '_' , '`' , '{' , '|' , '}' , '~' ,
@@ -73,9 +79,15 @@ pub fn compute_minhash(
7379 let hash = xxh3:: xxh3_64 ( text. as_bytes ( ) ) as u128 ;
7480 update_signature ( & mut signature, hash, & coeffs) ;
7581 } else {
82+ // Reusable buffer for encoding char windows to bytes, avoiding
83+ // a String allocation per ngram.
84+ let mut ngram_buf = Vec :: with_capacity ( ngram_size * 4 ) ;
7685 for window in chars. windows ( ngram_size) {
77- let s: String = window. iter ( ) . collect ( ) ;
78- let hash = xxh3:: xxh3_64 ( s. as_bytes ( ) ) as u128 ;
86+ ngram_buf. clear ( ) ;
87+ for & ch in window {
88+ ngram_buf. extend_from_slice ( ch. encode_utf8 ( & mut [ 0 ; 4 ] ) . as_bytes ( ) ) ;
89+ }
90+ let hash = xxh3:: xxh3_64 ( & ngram_buf) as u128 ;
7991 update_signature ( & mut signature, hash, & coeffs) ;
8092 }
8193 }
0 commit comments