Commit fd4a214

Add max_features and tokenizer to CountVectorizer (#376)
1 parent 6ab89bf commit fd4a214

9 files changed, 188 insertions(+), 57 deletions(-)

algorithms/linfa-preprocessing/Cargo.toml

Lines changed: 2 additions & 0 deletions
@@ -32,6 +32,7 @@ encoding = "0.2"
 sprs = { version = "=0.11.1", default-features = false }
 
 serde_regex = { version = "1.1", optional = true }
+itertools = "0.14.0"
 
 [dependencies.serde_crate]
 package = "serde"
@@ -44,6 +45,7 @@ features = ["std", "derive"]
 linfa-datasets = { version = "0.7.1", path = "../../datasets", features = [
     "diabetes",
     "winequality",
+    "generate"
 ] }
 linfa-bayes = { version = "0.7.1", path = "../linfa-bayes" }
 iai = "0.1"

algorithms/linfa-preprocessing/benches/vectorizer_bench.rs

Lines changed: 4 additions & 2 deletions
@@ -118,7 +118,8 @@ fn fit_transform_vectorizer(file_names: &[std::path::PathBuf]) {
             file_names,
             encoding::all::ISO_8859_1,
             encoding::DecoderTrap::Strict,
-        );
+        )
+        .unwrap();
 }
 fn fit_transform_tf_idf(file_names: &[std::path::PathBuf]) {
     TfIdfVectorizer::default()
@@ -134,7 +135,8 @@ fn fit_transform_tf_idf(file_names: &[std::path::PathBuf]) {
             file_names,
             encoding::all::ISO_8859_1,
             encoding::DecoderTrap::Strict,
-        );
+        )
+        .unwrap();
 }
 
 fn bench(c: &mut Criterion) {
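
The added .unwrap() calls reflect that fitting and transforming from files is now fallible. Below is a minimal sketch of the same call pattern that propagates the error instead of panicking; the import path, the builder entry point, and the exact error type are assumptions rather than code from the commit:

use linfa_preprocessing::CountVectorizerParams;
use std::path::PathBuf;

fn fit_transform_checked(file_names: &[PathBuf]) -> Result<(), Box<dyn std::error::Error>> {
    // Fitting from files was already fallible, so `?` applies here.
    let vectorizer = CountVectorizerParams::default().fit_files(
        file_names,
        encoding::all::ISO_8859_1,
        encoding::DecoderTrap::Strict,
    )?;
    // After this commit, transform_files returns a Result as well.
    let _counts = vectorizer.transform_files(
        file_names,
        encoding::all::ISO_8859_1,
        encoding::DecoderTrap::Strict,
    )?;
    Ok(())
}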

algorithms/linfa-preprocessing/examples/count_vectorization.rs

Lines changed: 2 additions & 0 deletions
@@ -126,6 +126,7 @@ fn main() {
     // Transforming gives a sparse dataset, we make it dense in order to be able to fit the Naive Bayes model
     let training_records = vectorizer
         .transform_files(&training_filenames, ISO_8859_1, Strict)
+        .unwrap()
         .to_dense();
     // Currently linfa only allows real valued features so we have to transform the integer counts to floats
     let training_records = training_records.mapv(|c| c as f32);
@@ -164,6 +165,7 @@ fn main() {
     );
     let test_records = vectorizer
         .transform_files(&test_filenames, ISO_8859_1, Strict)
+        .unwrap()
         .to_dense();
     let test_records = test_records.mapv(|c| c as f32);
     let test_dataset: Dataset<f32, usize, Ix1> = (test_records, test_targets).into();

algorithms/linfa-preprocessing/examples/tfidf_vectorization.rs

Lines changed: 2 additions & 0 deletions
@@ -126,6 +126,7 @@ fn main() {
     // Transforming gives a sparse dataset, we make it dense in order to be able to fit the Naive Bayes model
     let training_records = vectorizer
         .transform_files(&training_filenames, ISO_8859_1, Strict)
+        .unwrap()
         .to_dense();
 
     println!(
@@ -162,6 +163,7 @@ fn main() {
     );
     let test_records = vectorizer
         .transform_files(&test_filenames, ISO_8859_1, Strict)
+        .unwrap()
         .to_dense();
     let test_dataset: Dataset<f64, usize, Ix1> = (test_records, test_targets).into();
     // Let's predict the test data targets

algorithms/linfa-preprocessing/src/countgrams/hyperparams.rs

Lines changed: 41 additions & 6 deletions
@@ -7,6 +7,8 @@ use std::collections::HashSet;
 #[cfg(feature = "serde")]
 use serde_crate::{Deserialize, Serialize};
 
+use super::{Tokenizer, Tokenizerfp};
+
 #[derive(Clone, Debug)]
 #[cfg(not(feature = "serde"))]
 struct SerdeRegex(Regex);
@@ -71,9 +73,21 @@ pub struct CountVectorizerValidParams {
     normalize: bool,
     document_frequency: (f32, f32),
     stopwords: Option<HashSet<String>>,
+    max_features: Option<usize>,
+    #[cfg_attr(feature = "serde", serde(skip))]
+    pub(crate) tokenizer_function: Option<Tokenizerfp>,
+    pub(crate) tokenizer_deserialization_guard: bool,
 }
 
 impl CountVectorizerValidParams {
+    pub fn tokenizer_function(&self) -> Option<Tokenizerfp> {
+        self.tokenizer_function
+    }
+
+    pub fn max_features(&self) -> Option<usize> {
+        self.max_features
+    }
+
     pub fn convert_to_lowercase(&self) -> bool {
         self.convert_to_lowercase
     }
@@ -117,20 +131,41 @@ impl std::default::Default for CountVectorizerParams {
             normalize: true,
             document_frequency: (0., 1.),
             stopwords: None,
+            max_features: None,
+            tokenizer_function: None,
+            tokenizer_deserialization_guard: false,
         })
     }
 }
 
 impl CountVectorizerParams {
-    ///If true, all documents used for fitting will be converted to lowercase.
-    pub fn convert_to_lowercase(mut self, convert_to_lowercase: bool) -> Self {
-        self.0.convert_to_lowercase = convert_to_lowercase;
+    // Set the tokenizer as either a function pointer or a regex
+    // If this method is not called, the default is to use regex "\b\w\w+\b"
+    pub fn tokenizer(mut self, tokenizer: Tokenizer) -> Self {
+        match tokenizer {
+            Tokenizer::Function(fp) => {
+                self.0.tokenizer_function = Some(fp);
+                self.0.tokenizer_deserialization_guard = true;
+            }
+            Tokenizer::Regex(regex_str) => {
+                self.0.split_regex_expr = regex_str.to_string();
+                self.0.tokenizer_deserialization_guard = false;
+            }
+        }
+
+        self
+    }
+
+    /// When building the vocabulary, only consider the top max_features (by term frequency).
+    /// If None, all features are used.
+    pub fn max_features(mut self, max_features: Option<usize>) -> Self {
+        self.0.max_features = max_features;
        self
     }
 
-    /// Sets the regex espression used to split decuments into tokens
-    pub fn split_regex(mut self, regex_str: &str) -> Self {
-        self.0.split_regex_expr = regex_str.to_string();
+    ///If true, all documents used for fitting will be converted to lowercase.
+    pub fn convert_to_lowercase(mut self, convert_to_lowercase: bool) -> Self {
+        self.0.convert_to_lowercase = convert_to_lowercase;
         self
     }
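
Taken together, the two new knobs can be used roughly as below. This is a usage sketch, not code from the commit: it assumes CountVectorizerParams and Tokenizer are re-exported from the crate root, that Tokenizerfp is a fn(&str) -> Vec<&str> function pointer, and that Tokenizer::Regex takes a string slice (consistent with the .to_string() call in the diff above):

use linfa_preprocessing::{CountVectorizerParams, Tokenizer};

// Custom tokenizer supplied as a plain function pointer (assumed signature).
fn whitespace_tokens(text: &str) -> Vec<&str> {
    text.split_whitespace().collect()
}

fn build_example_params() {
    // Function-pointer tokenizer; the pointer is skipped by serde, which is
    // presumably what `tokenizer_deserialization_guard` is meant to catch
    // after deserialization.
    let _with_fn = CountVectorizerParams::default()
        .tokenizer(Tokenizer::Function(whitespace_tokens))
        .max_features(Some(1000)); // keep only the 1000 most frequent tokens

    // Regex tokenizer, equivalent to the pre-existing default pattern "\b\w\w+\b".
    let _with_regex = CountVectorizerParams::default()
        .tokenizer(Tokenizer::Regex(r"\b\w\w+\b"))
        .max_features(None); // None keeps every feature
}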