1- use std:: path:: Path ;
2-
31use crate :: cache:: { Cache , CacheConfig , CacheError } ;
4- use crate :: models:: { Config , Dicts , Lang , LangMap } ;
5- use crate :: tokenizer:: { TokenizerError , Tokenizers } ;
2+ use crate :: models:: { Config , Dicts , Lang , LangMap , DEFAULT_TOKENIZER } ;
3+ use crate :: tokenizer:: Tokenizers ;
64
75/// Initialize logger.
86pub fn logger ( ) {
@@ -29,27 +27,38 @@ pub fn logger() {
2927 . init ( ) ;
3028}
3129
32- /// Initialize languages from config.
33- pub fn langs ( config : & Config ) -> LangMap {
30+ /// Initialize languages from config, validating tokenizers against loaded tokenizers .
31+ pub fn langs ( config : & Config , tokenizers : & Tokenizers ) -> LangMap {
3432 let mut langs = LangMap :: new ( ) ;
3533
3634 for ( id, cfg) in & config. lang {
37- // Validate tokenizer_type.
38- let typ = if cfg. tokenizer_type . is_empty ( ) {
39- "default" . to_string ( )
35+ let tokenizer = if cfg. tokenizer . is_empty ( ) {
36+ // If the tokenizer is not specified, use default.
37+ DEFAULT_TOKENIZER . to_string ( )
38+ } else if ( !cfg. tokenizer . starts_with ( "default:" ) ) && ( !cfg. tokenizer . starts_with ( "lua:" ) ) {
39+ // Tokenizer name must start with "default:" or "lua:".
40+ log:: error!(
41+ "invalid tokenizer format '{}' for language '{}'. defaulting to '{}'" ,
42+ cfg. tokenizer,
43+ id,
44+ DEFAULT_TOKENIZER
45+ ) ;
46+ DEFAULT_TOKENIZER . to_string ( )
47+ } else if tokenizers. contains_key ( & cfg. tokenizer ) {
48+ // Yep, it's valid.
49+ cfg. tokenizer . clone ( )
4050 } else {
41- cfg. tokenizer_type . clone ( )
42- } ;
43-
44- if typ != "default" && typ != "lua" {
51+ // Unknown tokenizer.
4552 log:: error!(
46- "unknown tokenizer_type '{}' for language '{}'. Must be 'default' or 'lua'." ,
47- typ,
48- id
53+ "tokenizer '{}' not found for language '{}'. defaulting to '{}'" ,
54+ cfg. tokenizer,
55+ id,
56+ DEFAULT_TOKENIZER
4957 ) ;
50- std :: process :: exit ( 1 ) ;
51- }
58+ DEFAULT_TOKENIZER . to_string ( )
59+ } ;
5260
61+ // Create the language instance.
5362 let lang = Lang {
5463 id : id. clone ( ) ,
5564 name : if cfg. name . is_empty ( ) {
@@ -58,20 +67,10 @@ pub fn langs(config: &Config) -> LangMap {
5867 cfg. name . clone ( )
5968 } ,
6069 types : cfg. types . clone ( ) ,
61- tokenizer : if cfg. tokenizer . is_empty ( ) {
62- "simple" . to_string ( )
63- } else {
64- cfg. tokenizer . clone ( )
65- } ,
66- tokenizer_type : typ,
70+ tokenizer : tokenizer. clone ( ) ,
6771 } ;
6872
69- log:: info!(
70- "language: {} (tokenizer: {}, type: {})" ,
71- id,
72- lang. tokenizer,
73- lang. tokenizer_type
74- ) ;
73+ log:: info!( "language: {} (tokenizer: {})" , id, tokenizer) ;
7574
7675 langs. insert ( id. clone ( ) , lang) ;
7776 }
@@ -171,11 +170,6 @@ pub fn i18n(
171170 Ok ( i18n)
172171}
173172
174- /// Initialize lua tokenizers from a given directory.
175- pub fn tokenizers ( dir : & str ) -> Result < Tokenizers , TokenizerError > {
176- crate :: tokenizer:: load_all ( Path :: new ( dir) )
177- }
178-
179173/// Initialize cache from configuration.
180174pub async fn cache ( cfg : & CacheConfig ) -> Result < Cache , CacheError > {
181175 log:: info!(
0 commit comments