Skip to content

Commit ffabf1f

Browse files
committed
Refactor and simplify tokenizer config to use prefixed identifiers like "default:" and "lua:"
1 parent 274ec9d commit ffabf1f

File tree

6 files changed

+91
-94
lines changed

6 files changed

+91
-94
lines changed

config.sample.toml

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ tokenizers_dir = "tokenizers"
3535
dicts = [["english", "english"]]
3636

3737

38+
################################################################################
39+
3840

3941
[db]
4042
# Maximum connections.
@@ -63,6 +65,9 @@ max_disk_mb = 512
6365
dir = "/tmp/dictpress-cache"
6466

6567

68+
################################################################################
69+
70+
6671
[api_results]
6772
# Default number of entries to return per page when paginated.
6873
per_page = 10
@@ -101,22 +106,16 @@ max_per_page = 100
101106
num_page_nums = 10
102107

103108

104-
# Tokenizer-specific configuration.
105-
# Passed to Lua tokenizers as `config` table.
106-
[tokenizer.indicphone]
107-
kn_num_keys = 2
108-
ml_num_keys = 2
109-
109+
################################################################################
110110

111111
[lang.english]
112112
name = "English"
113113

114-
# Either 'default' or 'lua'.
115-
tokenizer_type = "default"
116-
117-
# For 'default' tokenizer_type, supported = arabic, danish, dutch, english, finnish, french, german, greek, hungarian, italian, norwegian, portuguese, romanian, russian, spanish, swedish, tamil, turkish
118-
# For 'lua', path to the Lua script file in the tokenizers_dir, eg: malayalam.lua
119-
tokenizer = "english"
114+
# Format: "default:$lang" or "lua:filename.lua"
115+
# For "default", built-in languages are: simple, arabic, danish, dutch, english, finnish, french, german, greek, hungarian, italian, norwegian, portuguese, romanian, russian, spanish, swedish, tamil, turkish
116+
# Example: "default:english"
117+
# For "lua": path to a Lua script in tokenizers_dir, e.g. "lua:malayalam.lua"
118+
tokenizer = "default:english"
120119

121120
[lang.english.types]
122121
noun = "Noun"

src/importer.rs

Lines changed: 1 addition & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ use sqlx::Row;
66
use crate::{
77
db,
88
models::{LangMap, STATUS_ENABLED},
9-
tokenizer::Tokenizers,
9+
tokenizer::{parse_tokenizer_field, Tokenizers},
1010
};
1111

1212
const INSERT_BATCH_SIZE: usize = 5000;
@@ -358,25 +358,3 @@ fn split_string(s: &str) -> Vec<String> {
358358
.filter(|s| !s.is_empty())
359359
.collect()
360360
}
361-
362-
/// Parse tokenizer field and return the tokenizer name for lookup.
363-
fn parse_tokenizer_field(tokenizer: &str) -> Option<String> {
364-
if tokenizer.is_empty() {
365-
return None;
366-
}
367-
368-
if let Some(name) = tokenizer.strip_prefix("default:") {
369-
// default:english -> "english"
370-
Some(name.to_string())
371-
} else if let Some(filename) = tokenizer.strip_prefix("lua:") {
372-
// lua:indicphone_kn.lua -> "indicphone_kn.lua"
373-
Some(filename.to_string())
374-
} else {
375-
// Unknown format.
376-
log::warn!(
377-
"unknown tokenizer format '{}'. expected 'default:name' or 'lua:filename.lua'",
378-
tokenizer
379-
);
380-
None
381-
}
382-
}

src/init.rs

Lines changed: 29 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
1-
use std::path::Path;
2-
31
use crate::cache::{Cache, CacheConfig, CacheError};
4-
use crate::models::{Config, Dicts, Lang, LangMap};
5-
use crate::tokenizer::{TokenizerError, Tokenizers};
2+
use crate::models::{Config, Dicts, Lang, LangMap, DEFAULT_TOKENIZER};
3+
use crate::tokenizer::Tokenizers;
64

75
/// Initialize logger.
86
pub fn logger() {
@@ -29,27 +27,38 @@ pub fn logger() {
2927
.init();
3028
}
3129

32-
/// Initialize languages from config.
33-
pub fn langs(config: &Config) -> LangMap {
30+
/// Initialize languages from config, validating tokenizers against loaded tokenizers.
31+
pub fn langs(config: &Config, tokenizers: &Tokenizers) -> LangMap {
3432
let mut langs = LangMap::new();
3533

3634
for (id, cfg) in &config.lang {
37-
// Validate tokenizer_type.
38-
let typ = if cfg.tokenizer_type.is_empty() {
39-
"default".to_string()
35+
let tokenizer = if cfg.tokenizer.is_empty() {
36+
// If the tokenizer is not specified, use default.
37+
DEFAULT_TOKENIZER.to_string()
38+
} else if (!cfg.tokenizer.starts_with("default:")) && (!cfg.tokenizer.starts_with("lua:")) {
39+
// Tokenizer name must start with "default:" or "lua:".
40+
log::error!(
41+
"invalid tokenizer format '{}' for language '{}'. defaulting to '{}'",
42+
cfg.tokenizer,
43+
id,
44+
DEFAULT_TOKENIZER
45+
);
46+
DEFAULT_TOKENIZER.to_string()
47+
} else if tokenizers.contains_key(&cfg.tokenizer) {
48+
// Yep, it's valid.
49+
cfg.tokenizer.clone()
4050
} else {
41-
cfg.tokenizer_type.clone()
42-
};
43-
44-
if typ != "default" && typ != "lua" {
51+
// Unknown tokenizer.
4552
log::error!(
46-
"unknown tokenizer_type '{}' for language '{}'. Must be 'default' or 'lua'.",
47-
typ,
48-
id
53+
"tokenizer '{}' not found for language '{}'. defaulting to '{}'",
54+
cfg.tokenizer,
55+
id,
56+
DEFAULT_TOKENIZER
4957
);
50-
std::process::exit(1);
51-
}
58+
DEFAULT_TOKENIZER.to_string()
59+
};
5260

61+
// Create the language instance.
5362
let lang = Lang {
5463
id: id.clone(),
5564
name: if cfg.name.is_empty() {
@@ -58,20 +67,10 @@ pub fn langs(config: &Config) -> LangMap {
5867
cfg.name.clone()
5968
},
6069
types: cfg.types.clone(),
61-
tokenizer: if cfg.tokenizer.is_empty() {
62-
"simple".to_string()
63-
} else {
64-
cfg.tokenizer.clone()
65-
},
66-
tokenizer_type: typ,
70+
tokenizer: tokenizer.clone(),
6771
};
6872

69-
log::info!(
70-
"language: {} (tokenizer: {}, type: {})",
71-
id,
72-
lang.tokenizer,
73-
lang.tokenizer_type
74-
);
73+
log::info!("language: {} (tokenizer: {})", id, tokenizer);
7574

7675
langs.insert(id.clone(), lang);
7776
}
@@ -171,11 +170,6 @@ pub fn i18n(
171170
Ok(i18n)
172171
}
173172

174-
/// Initialize lua tokenizers from a given directory.
175-
pub fn tokenizers(dir: &str) -> Result<Tokenizers, TokenizerError> {
176-
crate::tokenizer::load_all(Path::new(dir))
177-
}
178-
179173
/// Initialize cache from configuration.
180174
pub async fn cache(cfg: &CacheConfig) -> Result<Cache, CacheError> {
181175
log::info!(

src/main.rs

Lines changed: 17 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ mod tokenizer;
1616
#[global_allocator]
1717
static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;
1818

19-
use std::sync::Arc;
19+
use std::{path::Path, sync::Arc};
2020

2121
use clap::Parser;
2222

@@ -79,16 +79,19 @@ async fn main() {
7979
db::exists(&cli.db_path);
8080

8181
let config = config::load_all(&cli.config);
82-
let langs = init::langs(&config);
8382

84-
let tokenizers = match init::tokenizers(&config.app.tokenizers_dir) {
83+
// Load tokenizers first for validation.
84+
let tokenizers = match tokenizer::load_all(Path::new(&config.app.tokenizers_dir)) {
8585
Ok(t) => t,
8686
Err(e) => {
8787
log::error!("error loading tokenizers: {}", e);
8888
std::process::exit(1);
8989
}
9090
};
9191

92+
// Load languages with tokenizer validation.
93+
let langs = init::langs(&config, &tokenizers);
94+
9295
if let Err(e) = importer::import_csv(&file, &db_path, &tokenizers, langs).await {
9396
log::error!("error importing: {}", e);
9497
std::process::exit(1);
@@ -136,8 +139,17 @@ async fn main() {
136139
// Load config.
137140
let config = config::load_all(&cli.config);
138141

139-
// Initialize languages and dicts config.
140-
let langs = init::langs(&config);
142+
// Initialize tokenizers first for validation.
143+
let tokenizers = match tokenizer::load_all(Path::new(&config.app.tokenizers_dir)) {
144+
Ok(t) => t,
145+
Err(e) => {
146+
log::error!("error loading tokenizers: {}", e);
147+
std::process::exit(1);
148+
}
149+
};
150+
151+
// Initialize languages with tokenizer validation.
152+
let langs = init::langs(&config, &tokenizers);
141153
let dicts = init::dicts(&langs, &config);
142154

143155
// Create database pool.
@@ -192,15 +204,6 @@ async fn main() {
192204
std::collections::HashMap::new()
193205
};
194206

195-
// Initialize tokenizers.
196-
let tokenizers = match init::tokenizers(&config.app.tokenizers_dir) {
197-
Ok(t) => t,
198-
Err(e) => {
199-
log::error!("error loading tokenizers: {}", e);
200-
std::process::exit(1);
201-
}
202-
};
203-
204207
// Initialize manager.
205208
let mgr = match Manager::new(db, tokenizers, langs.clone(), dicts.clone()).await {
206209
Ok(m) => Arc::new(m),

src/models/models.rs

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,9 @@ pub const STATUS_ENABLED: &str = "enabled";
1616
#[allow(dead_code)]
1717
pub const STATUS_DISABLED: &str = "disabled";
1818

19+
/// Default tokenizer used when none is specified or when the configured tokenizer is invalid.
20+
pub const DEFAULT_TOKENIZER: &str = "default:simple";
21+
1922
/// JSON array wrapper for SQLite TEXT columns storing JSON arrays.
2023
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
2124
pub struct StringArray(pub Vec<String>);
@@ -347,9 +350,6 @@ pub struct Lang {
347350

348351
#[serde(default)]
349352
pub tokenizer: String,
350-
351-
#[serde(default)]
352-
pub tokenizer_type: String,
353353
}
354354

355355
pub type LangMap = HashMap<String, Lang>;
@@ -534,9 +534,6 @@ pub struct LangConfig {
534534
#[serde(default)]
535535
pub tokenizer: String,
536536

537-
#[serde(default)]
538-
pub tokenizer_type: String,
539-
540537
#[serde(default)]
541538
pub types: HashMap<String, String>,
542539
}

src/tokenizer/mod.rs

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@ pub use lua::LuaTokenizer;
55
use rust_stemmers::{Algorithm, Stemmer};
66
use std::{collections::HashMap, path::Path, sync::Arc};
77

8+
use crate::models::DEFAULT_TOKENIZER;
9+
810
/// Tokenizer trait for converting text to searchable tokens.
911
pub trait Tokenizer: Send + Sync {
1012
/// Convert text to tokens for indexing.
@@ -68,12 +70,13 @@ impl Tokenizer for DefaultTokenizer {
6870

6971
pub type Tokenizers = HashMap<String, Arc<dyn Tokenizer>>;
7072

71-
/// Load tokenizers from directory. Each .lua file becomes a tokenizer.
73+
/// Load all tokenizers into a map, the default bundled ones and the Lua
74+
/// ones from the given directory. Each .lua file becomes a tokenizer.
7275
pub fn load_all(dir: &Path) -> Result<Tokenizers, TokenizerError> {
7376
let mut out: Tokenizers = HashMap::new();
7477

7578
// Always include the simple tokenizer.
76-
out.insert("simple".to_string(), Arc::new(SimpleTokenizer));
79+
out.insert(DEFAULT_TOKENIZER.to_string(), Arc::new(SimpleTokenizer));
7780

7881
// Add built-in default stemmers.
7982
let default_stemmers = [
@@ -97,7 +100,10 @@ pub fn load_all(dir: &Path) -> Result<Tokenizers, TokenizerError> {
97100
("turkish", Algorithm::Turkish),
98101
];
99102
for (name, algorithm) in default_stemmers {
100-
out.insert(name.to_string(), Arc::new(DefaultTokenizer::new(algorithm)));
103+
out.insert(
104+
format!("default:{}", name),
105+
Arc::new(DefaultTokenizer::new(algorithm)),
106+
);
101107
}
102108

103109
// If no dir has been specified, skip loading from disk.
@@ -156,8 +162,28 @@ pub fn load_all(dir: &Path) -> Result<Tokenizers, TokenizerError> {
156162
}
157163

158164
log::info!("loaded '{}'", fname);
159-
out.insert(name, Arc::new(tk));
165+
out.insert(format!("lua:{}", name), Arc::new(tk));
160166
}
161167

162168
Ok(out)
163169
}
170+
171+
/// Parse and validate tokenizer field in format "default:name" or "lua:filename.lua".
172+
/// Returns the validated tokenizer string for lookup in the tokenizers map.
173+
pub fn parse_tokenizer_field(tokenizer: &str) -> Option<String> {
174+
if tokenizer.is_empty() {
175+
return None;
176+
}
177+
178+
if tokenizer.starts_with("default:") && tokenizer.len() > 8 {
179+
Some(tokenizer.to_string())
180+
} else if tokenizer.starts_with("lua:") && tokenizer.len() > 4 {
181+
Some(tokenizer.to_string())
182+
} else {
183+
log::warn!(
184+
"unknown tokenizer format '{}'. expected 'default:name' or 'lua:filename.lua'",
185+
tokenizer
186+
);
187+
None
188+
}
189+
}

0 commit comments

Comments
 (0)