@@ -3,7 +3,27 @@ mod lua;
33pub use lua:: LuaTokenizer ;
44
55use rust_stemmers:: { Algorithm , Stemmer } ;
6- use std:: { collections:: HashMap , path:: Path , sync:: Arc } ;
6+ use std:: {
7+ collections:: { HashMap , HashSet } ,
8+ path:: Path ,
9+ sync:: { Arc , LazyLock } ,
10+ } ;
11+
/// Common English stop words removed during tokenization before stemming.
/// Maintained by hand because the `rust-stemmers` crate only stems and does
/// not ship stop-word lists.
static ENGLISH_STOP_WORDS: LazyLock<HashSet<&'static str>> = LazyLock::new(|| {
    // Kept as a flat const slice so additions are a one-line diff; collected
    // into a `HashSet` once, on first access, for O(1) membership checks.
    const WORDS: &[&str] = &[
        "a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for", "of", "with", "by",
        "from", "as", "is", "was", "are", "were", "be", "been", "being", "have", "has", "had",
        "do", "does", "did", "will", "would", "could", "should", "may", "might", "must", "shall",
        "can", "this", "that", "these", "those", "it", "its", "i", "me", "my", "we", "us", "our",
        "you", "your", "he", "him", "his", "she", "her", "they", "them", "their", "who", "whom",
        "which", "what", "where", "when", "how", "why", "if", "then", "so", "than", "too", "very",
        "just", "only", "also", "into", "out", "up", "down", "off", "over", "under", "again",
        "here", "there", "all", "any", "both", "each", "few", "more", "most", "other", "some",
        "no", "not", "nor",
    ];
    WORDS.iter().copied().collect()
});
727
828use crate :: models:: DEFAULT_TOKENIZER ;
929
@@ -56,31 +76,47 @@ impl Tokenizer for SimpleTokenizer {
5676/// Default tokenizer using Snowball stemmer for a specific language.
5777pub struct DefaultTokenizer {
5878 stemmer : Stemmer ,
79+ stop_words : Option < & ' static LazyLock < HashSet < & ' static str > > > ,
5980}
6081
6182impl DefaultTokenizer {
62- pub fn new ( algorithm : Algorithm ) -> Self {
83+ pub fn new (
84+ algorithm : Algorithm ,
85+ stop_words : Option < & ' static LazyLock < HashSet < & ' static str > > > ,
86+ ) -> Self {
6387 Self {
6488 stemmer : Stemmer :: create ( algorithm) ,
89+ stop_words,
6590 }
6691 }
6792}
6893
6994// The default tokenizer for stemmers in the `rust-stemmers` lib.
7095impl Tokenizer for DefaultTokenizer {
7196 fn tokenize ( & self , text : & str , _lang : & str ) -> Result < Vec < String > , TokenizerError > {
72- Ok ( text
73- . split_whitespace ( )
74- . map ( |word| self . stemmer . stem ( & word. to_lowercase ( ) ) . to_string ( ) )
75- . collect ( ) )
97+ let iter = text. split_whitespace ( ) ;
98+ let tokens: Vec < String > = if let Some ( sw) = & self . stop_words {
99+ iter. filter ( |word| !sw. contains ( word. to_lowercase ( ) . as_str ( ) ) )
100+ . map ( |word| self . stemmer . stem ( & word. to_lowercase ( ) ) . to_string ( ) )
101+ . collect ( )
102+ } else {
103+ iter. map ( |word| self . stemmer . stem ( & word. to_lowercase ( ) ) . to_string ( ) )
104+ . collect ( )
105+ } ;
106+ Ok ( tokens)
76107 }
77108
78109 fn to_query ( & self , text : & str , _lang : & str ) -> Result < String , TokenizerError > {
79110 let cleaned = clean_query ( text) ;
80- let terms: Vec < String > = cleaned
81- . split_whitespace ( )
82- . map ( |word| self . stemmer . stem ( & word. to_lowercase ( ) ) . to_string ( ) )
83- . collect ( ) ;
111+ let iter = cleaned. split_whitespace ( ) ;
112+ let terms: Vec < String > = if let Some ( sw) = & self . stop_words {
113+ iter. filter ( |word| !sw. contains ( word. to_lowercase ( ) . as_str ( ) ) )
114+ . map ( |word| self . stemmer . stem ( & word. to_lowercase ( ) ) . to_string ( ) )
115+ . collect ( )
116+ } else {
117+ iter. map ( |word| self . stemmer . stem ( & word. to_lowercase ( ) ) . to_string ( ) )
118+ . collect ( )
119+ } ;
84120 Ok ( terms. join ( " " ) )
85121 }
86122}
@@ -117,9 +153,14 @@ pub fn load_all(dir: &Path) -> Result<Tokenizers, TokenizerError> {
117153 ( "turkish" , Algorithm :: Turkish ) ,
118154 ] ;
119155 for ( name, algorithm) in default_stemmers {
156+ let stop_words = if name == "english" {
157+ Some ( & ENGLISH_STOP_WORDS )
158+ } else {
159+ None
160+ } ;
120161 out. insert (
121162 format ! ( "default:{}" , name) ,
122- Arc :: new ( DefaultTokenizer :: new ( algorithm) ) ,
163+ Arc :: new ( DefaultTokenizer :: new ( algorithm, stop_words ) ) ,
123164 ) ;
124165 }
125166
0 commit comments