@@ -511,6 +511,47 @@ def get_model(pretrained_model_name_or_path: str) -> str:
     return pretrained_model_name_or_path
 
 
+def _load_glm_moe_dsa_tokenizer(pretrained_model_name_or_path: str) -> "PreTrainedTokenizerFast":
+    """Load GLM-Moe-Dsa / GLM-5 tokenizer directly from tokenizer.json.
+
+    Works around incompatibilities when the checkpoint was saved with
+    transformers 5.x (TokenizersBackend / list-style extra_special_tokens).
+    """
+    import json
+    from pathlib import Path
+
+    from tokenizers import Tokenizer as RustTokenizer
+    from transformers import PreTrainedTokenizerFast
+
+    _SAFE_CONFIG_KEYS = (
+        "pad_token", "pad_token_id", "eos_token", "eos_token_id",
+        "bos_token", "bos_token_id", "unk_token", "unk_token_id",
+        "model_max_length", "padding_side", "truncation_side",
+    )
+
+    path = Path(pretrained_model_name_or_path)
+    tokenizer_json = path / "tokenizer.json"
+    if not tokenizer_json.exists():
+        raise FileNotFoundError(
+            f"Expected tokenizer.json at {tokenizer_json}. "
+            "GlmMoeDsaTokenizer loads from tokenizer.json only."
+        )
+
+    rust_tok = RustTokenizer.from_file(str(tokenizer_json))
+    init_kwargs = {}
+    config_path = path / "tokenizer_config.json"
+    if config_path.exists():
+        with open(config_path, encoding="utf-8") as f:
+            config = json.load(f)
+        for key in _SAFE_CONFIG_KEYS:
+            if key in config:
+                init_kwargs[key] = config[key]
+        if "extra_special_tokens" in config:
+            init_kwargs["additional_special_tokens"] = config["extra_special_tokens"]
+
+    return PreTrainedTokenizerFast(tokenizer_object=rust_tok, **init_kwargs)
+
+
 def get_tokenizer(
     pretrained_model_name_or_path: str,
     tokenizer_mode: str = "auto",
@@ -535,13 +576,11 @@ def get_tokenizer(
             ) from e
         return MistralTokenizer.from_pretrained(str(pretrained_model_name_or_path))
     if custom_tokenizer:
-        from tensorrt_llm.llmapi.llm_args import TOKENIZER_ALIASES
-
-        tokenizer_path = TOKENIZER_ALIASES.get(custom_tokenizer,
-                                               custom_tokenizer)
+        if custom_tokenizer == "glm_moe_dsa":
+            return _load_glm_moe_dsa_tokenizer(pretrained_model_name_or_path)
         from importlib import import_module
         try:
-            module_path, class_name = tokenizer_path.rsplit('.', 1)
+            module_path, class_name = custom_tokenizer.rsplit('.', 1)
             module = import_module(module_path)
             tokenizer_class = getattr(module, class_name)
             return tokenizer_class.from_pretrained(
@@ -552,7 +591,7 @@ def get_tokenizer(
         except (ValueError, ImportError, AttributeError) as e:
             raise ValueError(
                 f"Failed to load custom_tokenizer '{custom_tokenizer}'. "
-                "Expected alias or 'module.path.ClassName'.") from e
+                "Expected 'glm_moe_dsa' or 'module.path.ClassName'.") from e
     else:
         return AutoTokenizer.from_pretrained(
             pretrained_model_name_or_path,
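
A quick way to sanity-check the new loader is to point it at a throwaway directory holding a toy tokenizer.json plus a tokenizer_config.json that exercises both the key whitelist and the extra_special_tokens remapping. The sketch below is illustrative only: the checkpoint path and vocabulary are made up, and the import path for the private helper is an assumption based on the file under review.

    # Hypothetical smoke test for the new loader; the module path below
    # is assumed, not confirmed by this diff.
    import json
    from pathlib import Path

    from tokenizers import Tokenizer
    from tokenizers.models import WordLevel
    from tokenizers.pre_tokenizers import Whitespace

    from tensorrt_llm.llmapi.tokenizer import _load_glm_moe_dsa_tokenizer

    ckpt = Path("/tmp/glm_moe_dsa_demo")  # made-up checkpoint directory
    ckpt.mkdir(parents=True, exist_ok=True)

    # Write a toy tokenizer.json for the loader to read.
    toy = Tokenizer(
        WordLevel({"<unk>": 0, "<pad>": 1, "hello": 2, "world": 3},
                  unk_token="<unk>"))
    toy.pre_tokenizer = Whitespace()
    toy.save(str(ckpt / "tokenizer.json"))

    # Only whitelisted keys survive; list-style extra_special_tokens is
    # remapped to additional_special_tokens; anything else is dropped.
    (ckpt / "tokenizer_config.json").write_text(
        json.dumps({
            "pad_token": "<pad>",
            "model_max_length": 8192,
            "extra_special_tokens": ["<think>", "</think>"],
            "tokenizer_class": "TokenizersBackend",  # not whitelisted, ignored
        }), encoding="utf-8")

    tok = _load_glm_moe_dsa_tokenizer(str(ckpt))
    print(tok("hello world").input_ids)         # [2, 3]
    print(tok.pad_token, tok.model_max_length)  # <pad> 8192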
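
For callers, custom_tokenizer now accepts either the literal alias 'glm_moe_dsa' or a fully qualified class path, replacing the old TOKENIZER_ALIASES lookup. A minimal usage sketch, again assuming the module path and using a hypothetical model directory:

    from tensorrt_llm.llmapi.tokenizer import get_tokenizer  # assumed module path

    # Alias form: dispatches straight to the tokenizer.json loader above.
    tok = get_tokenizer("/models/GLM-5", custom_tokenizer="glm_moe_dsa")

    # Dotted-path form: import_module("transformers"), then
    # AutoTokenizer.from_pretrained("/models/GLM-5").
    tok = get_tokenizer("/models/GLM-5",
                        custom_tokenizer="transformers.AutoTokenizer")

    # A value with no dot fails the rsplit unpacking and surfaces as:
    #   ValueError: Failed to load custom_tokenizer 'bogus'.
    #   Expected 'glm_moe_dsa' or 'module.path.ClassName'.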