Skip to content

Commit d24e261

Browse files
committed
Implement manual English stop-word removal in the tokenizer because rust-stemmers doesn't support it.
1 parent 184d8ff commit d24e261

File tree

1 file changed

+52
-11
lines changed

1 file changed

+52
-11
lines changed

src/tokenizer/mod.rs

Lines changed: 52 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,27 @@ mod lua;
33
pub use lua::LuaTokenizer;
44

55
use rust_stemmers::{Algorithm, Stemmer};
6-
use std::{collections::HashMap, path::Path, sync::Arc};
6+
use std::{
7+
collections::{HashMap, HashSet},
8+
path::Path,
9+
sync::{Arc, LazyLock},
10+
};
11+
12+
/// Common English stop words to filter out during tokenization.
///
/// Kept as a hand-maintained list because the `rust-stemmers` crate only
/// performs stemming and ships no stop-word removal of its own.
static ENGLISH_STOP_WORDS: LazyLock<HashSet<&'static str>> = LazyLock::new(|| {
    const WORDS: &[&str] = &[
        "a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for", "of", "with", "by",
        "from", "as", "is", "was", "are", "were", "be", "been", "being", "have", "has", "had",
        "do", "does", "did", "will", "would", "could", "should", "may", "might", "must", "shall",
        "can", "this", "that", "these", "those", "it", "its", "i", "me", "my", "we", "us", "our",
        "you", "your", "he", "him", "his", "she", "her", "they", "them", "their", "who", "whom",
        "which", "what", "where", "when", "how", "why", "if", "then", "so", "than", "too", "very",
        "just", "only", "also", "into", "out", "up", "down", "off", "over", "under", "again",
        "here", "there", "all", "any", "both", "each", "few", "more", "most", "other", "some",
        "no", "not", "nor",
    ];
    WORDS.iter().copied().collect()
});
727

828
use crate::models::DEFAULT_TOKENIZER;
929

@@ -56,31 +76,47 @@ impl Tokenizer for SimpleTokenizer {
5676
/// Default tokenizer using Snowball stemmer for a specific language.
pub struct DefaultTokenizer {
    // Snowball stemmer created for the language's `Algorithm`.
    stemmer: Stemmer,
    // Optional stop-word list to drop before stemming; `None` disables
    // stop-word removal (per `load_all`, only English supplies a list).
    stop_words: Option<&'static LazyLock<HashSet<&'static str>>>,
}
6081

6182
impl DefaultTokenizer {
    /// Builds a tokenizer that stems with `algorithm` and, when `stop_words`
    /// is `Some`, drops those words during tokenization.
    pub fn new(
        algorithm: Algorithm,
        stop_words: Option<&'static LazyLock<HashSet<&'static str>>>,
    ) -> Self {
        Self {
            stemmer: Stemmer::create(algorithm),
            stop_words,
        }
    }
}
6893

6994
// The default tokenizer for stemmers in the `rust-stemmers` lib.
7095
impl Tokenizer for DefaultTokenizer {
7196
fn tokenize(&self, text: &str, _lang: &str) -> Result<Vec<String>, TokenizerError> {
72-
Ok(text
73-
.split_whitespace()
74-
.map(|word| self.stemmer.stem(&word.to_lowercase()).to_string())
75-
.collect())
97+
let iter = text.split_whitespace();
98+
let tokens: Vec<String> = if let Some(sw) = &self.stop_words {
99+
iter.filter(|word| !sw.contains(word.to_lowercase().as_str()))
100+
.map(|word| self.stemmer.stem(&word.to_lowercase()).to_string())
101+
.collect()
102+
} else {
103+
iter.map(|word| self.stemmer.stem(&word.to_lowercase()).to_string())
104+
.collect()
105+
};
106+
Ok(tokens)
76107
}
77108

78109
fn to_query(&self, text: &str, _lang: &str) -> Result<String, TokenizerError> {
79110
let cleaned = clean_query(text);
80-
let terms: Vec<String> = cleaned
81-
.split_whitespace()
82-
.map(|word| self.stemmer.stem(&word.to_lowercase()).to_string())
83-
.collect();
111+
let iter = cleaned.split_whitespace();
112+
let terms: Vec<String> = if let Some(sw) = &self.stop_words {
113+
iter.filter(|word| !sw.contains(word.to_lowercase().as_str()))
114+
.map(|word| self.stemmer.stem(&word.to_lowercase()).to_string())
115+
.collect()
116+
} else {
117+
iter.map(|word| self.stemmer.stem(&word.to_lowercase()).to_string())
118+
.collect()
119+
};
84120
Ok(terms.join(" "))
85121
}
86122
}
@@ -117,9 +153,14 @@ pub fn load_all(dir: &Path) -> Result<Tokenizers, TokenizerError> {
117153
("turkish", Algorithm::Turkish),
118154
];
119155
for (name, algorithm) in default_stemmers {
156+
let stop_words = if name == "english" {
157+
Some(&ENGLISH_STOP_WORDS)
158+
} else {
159+
None
160+
};
120161
out.insert(
121162
format!("default:{}", name),
122-
Arc::new(DefaultTokenizer::new(algorithm)),
163+
Arc::new(DefaultTokenizer::new(algorithm, stop_words)),
123164
);
124165
}
125166

0 commit comments

Comments
 (0)