@@ -565,10 +565,52 @@ def _fix_v5_tokenizer_components(tokenizer, model_name_or_path):
565565 backend .decoder = raw .decoder
566566
567567
def _load_glm_moe_dsa_tokenizer(pretrained_model_name_or_path: str) -> "PreTrainedTokenizerFast":
    """Load a GLM-Moe-Dsa / GLM-5 tokenizer directly from ``tokenizer.json``.

    Works around incompatibilities when the checkpoint was saved with
    transformers 5.x (TokenizersBackend / list-style extra_special_tokens)
    by wrapping the raw ``tokenizers`` object in a ``PreTrainedTokenizerFast``.

    Only an allow-list of keys is copied from ``tokenizer_config.json`` (when
    that file exists); every other key in the config is ignored.
    ``extra_special_tokens`` is forwarded as ``additional_special_tokens``.

    Args:
        pretrained_model_name_or_path: Local directory that contains
            ``tokenizer.json`` and, optionally, ``tokenizer_config.json``.

    Returns:
        A ``PreTrainedTokenizerFast`` built around the loaded tokenizer.

    Raises:
        FileNotFoundError: If ``tokenizer.json`` is missing from the directory.
    """
    import json
    from pathlib import Path

    safe_config_keys = (
        "pad_token", "pad_token_id", "eos_token", "eos_token_id",
        "bos_token", "bos_token_id", "unk_token", "unk_token_id",
        "model_max_length", "padding_side", "truncation_side",
    )

    model_dir = Path(pretrained_model_name_or_path)
    tokenizer_json = model_dir / "tokenizer.json"
    if not tokenizer_json.exists():
        raise FileNotFoundError(
            f"Expected tokenizer.json at {tokenizer_json}. "
            "GlmMoeDsaTokenizer loads from tokenizer.json only."
        )

    # Third-party imports are deferred until the cheap path check has passed,
    # so a wrong path is reported with the descriptive error above.
    from tokenizers import AddedToken
    from tokenizers import Tokenizer as RustTokenizer
    from transformers import PreTrainedTokenizerFast

    def _as_token(value):
        """Rebuild an AddedToken from its serialized dict form, if needed."""
        # tokenizer_config.json may store a token as
        # {"__type": "AddedToken", "content": ..., "lstrip": ..., ...};
        # the tokenizer constructor only accepts str or AddedToken.
        if isinstance(value, dict) and "content" in value:
            fields = {k: v for k, v in value.items() if k != "__type"}
            return AddedToken(**fields)
        return value

    rust_tok = RustTokenizer.from_file(str(tokenizer_json))
    init_kwargs = {}
    config_path = model_dir / "tokenizer_config.json"
    if config_path.exists():
        with open(config_path, encoding="utf-8") as f:
            config = json.load(f)

        for key in safe_config_keys:
            if key in config:
                init_kwargs[key] = _as_token(config[key])

        extra_tokens = config.get("extra_special_tokens")
        if isinstance(extra_tokens, dict):
            # Dict-style (name -> token) entries: only the token values can be
            # registered through ``additional_special_tokens``, which must be
            # a list.
            extra_tokens = list(extra_tokens.values())
        if extra_tokens is not None:
            init_kwargs["additional_special_tokens"] = [
                _as_token(token) for token in extra_tokens
            ]

    return PreTrainedTokenizerFast(tokenizer_object=rust_tok, **init_kwargs)
607+
608+
568609def get_tokenizer (
569610 pretrained_model_name_or_path : str ,
570611 tokenizer_mode : str = "auto" ,
571612 trust_remote_code : bool = False ,
613+ custom_tokenizer : str | None = None ,
572614 ** kwargs ,
573615) -> PreTrainedTokenizer | PreTrainedTokenizerFast :
574616 if pretrained_model_name_or_path is not None and not os .path .exists (pretrained_model_name_or_path ):
@@ -587,14 +629,31 @@ def get_tokenizer(
587629 "to use mistral tokenizer mode."
588630 ) from e
589631 return MistralTokenizer .from_pretrained (str (pretrained_model_name_or_path ))
590- else :
591- tokenizer = AutoTokenizer .from_pretrained (
592- pretrained_model_name_or_path ,
593- trust_remote_code = trust_remote_code ,
594- ** kwargs ,
595- )
596- _fix_v5_tokenizer_components (tokenizer , pretrained_model_name_or_path )
597- return tokenizer
632+ if custom_tokenizer :
633+ if custom_tokenizer == "glm_moe_dsa" :
634+ return _load_glm_moe_dsa_tokenizer (pretrained_model_name_or_path )
635+ from importlib import import_module
636+ try :
637+ module_path , class_name = custom_tokenizer .rsplit ('.' , 1 )
638+ module = import_module (module_path )
639+ tokenizer_class = getattr (module , class_name )
640+ return tokenizer_class .from_pretrained (
641+ pretrained_model_name_or_path ,
642+ trust_remote_code = trust_remote_code ,
643+ ** kwargs ,
644+ )
645+ except (ValueError , ImportError , AttributeError ) as e :
646+ raise ValueError (
647+ f"Failed to load custom_tokenizer '{ custom_tokenizer } '. "
648+ "Expected 'glm_moe_dsa' or 'module.path.ClassName'." ) from e
649+
650+ tokenizer = AutoTokenizer .from_pretrained (
651+ pretrained_model_name_or_path ,
652+ trust_remote_code = trust_remote_code ,
653+ ** kwargs ,
654+ )
655+ _fix_v5_tokenizer_components (tokenizer , pretrained_model_name_or_path )
656+ return tokenizer
598657
599658
600659ASYNC_REQUEST_FUNCS = {
0 commit comments