Fix sa-bench tokenizer loading for GLM-5

richardhuo-nv · richardhuo-nv · commit d36ebc42f525 · 2026-04-07T16:50:05.000-07:00
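Add a --custom-tokenizer option to sa-bench (with a built-in "glm_moe_dsa" loader that reads tokenizer.json directly) and make --use-chat-template configurable.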
diff --git a/src/srtctl/benchmarks/sa_bench.py b/src/srtctl/benchmarks/sa_bench.py
@@ -97,5 +97,7 @@ def build_command(
             str(prefill_gpus),
             str(decode_gpus),
             str(b.random_range_ratio) if b.random_range_ratio is not None else "0.8",
+            b.custom_tokenizer or "",
+            str(b.use_chat_template).lower(),
         ]
         return cmd
diff --git a/src/srtctl/benchmarks/scripts/sa-bench/backend_request_func.py b/src/srtctl/benchmarks/scripts/sa-bench/backend_request_func.py
@@ -511,10 +511,52 @@ def get_model(pretrained_model_name_or_path: str) -> str:
     return pretrained_model_name_or_path
 
 
+def _load_glm_moe_dsa_tokenizer(pretrained_model_name_or_path: str) -> "PreTrainedTokenizerFast":
+    """Load GLM-Moe-Dsa / GLM-5 tokenizer directly from tokenizer.json.
+
+    Works around incompatibilities when the checkpoint was saved with
+    transformers 5.x (TokenizersBackend / list-style extra_special_tokens).
+    Raises FileNotFoundError if tokenizer.json is missing from the directory."""
+    import json
+    from pathlib import Path
+
+    from tokenizers import Tokenizer as RustTokenizer
+    from transformers import PreTrainedTokenizerFast
+
+    _SAFE_CONFIG_KEYS = (  # the only tokenizer_config.json keys copied verbatim below
+        "pad_token", "pad_token_id", "eos_token", "eos_token_id",
+        "bos_token", "bos_token_id", "unk_token", "unk_token_id",
+        "model_max_length", "padding_side", "truncation_side",
+    )
+    # The argument is used as a local directory; tokenizer.json must sit inside it.
+    path = Path(pretrained_model_name_or_path)
+    tokenizer_json = path / "tokenizer.json"
+    if not tokenizer_json.exists():  # fail early with an explicit message
+        raise FileNotFoundError(
+            f"Expected tokenizer.json at {tokenizer_json}. "
+            "GlmMoeDsaTokenizer loads from tokenizer.json only."
+        )
+    # Build the raw tokenizer from tokenizer.json, then gather optional config values.
+    rust_tok = RustTokenizer.from_file(str(tokenizer_json))
+    init_kwargs = {}
+    config_path = path / "tokenizer_config.json"
+    if config_path.exists():  # the config file is optional
+        with open(config_path, encoding="utf-8") as f:
+            config = json.load(f)
+        for key in _SAFE_CONFIG_KEYS:
+            if key in config:
+                init_kwargs[key] = config[key]
+        if "extra_special_tokens" in config:  # NOTE(review): assumes a list-style value - confirm
+            init_kwargs["additional_special_tokens"] = config["extra_special_tokens"]
+    # Wrap the raw tokenizer; init_kwargs stays empty when no config file exists.
+    return PreTrainedTokenizerFast(tokenizer_object=rust_tok, **init_kwargs)
+
+
 def get_tokenizer(
     pretrained_model_name_or_path: str,
     tokenizer_mode: str = "auto",
     trust_remote_code: bool = False,
+    custom_tokenizer: str | None = None,
     **kwargs,
 ) -> PreTrainedTokenizer | PreTrainedTokenizerFast:
     if pretrained_model_name_or_path is not None and not os.path.exists(pretrained_model_name_or_path):
@@ -533,12 +575,28 @@ def get_tokenizer(
                 "to use mistral tokenizer mode."
             ) from e
         return MistralTokenizer.from_pretrained(str(pretrained_model_name_or_path))
-    else:
-        return AutoTokenizer.from_pretrained(
-            pretrained_model_name_or_path,
-            trust_remote_code=trust_remote_code,
-            **kwargs,
-        )
+    if custom_tokenizer:
+        if custom_tokenizer == "glm_moe_dsa":
+            return _load_glm_moe_dsa_tokenizer(pretrained_model_name_or_path)
+        from importlib import import_module
+        try:
+            module_path, class_name = custom_tokenizer.rsplit('.', 1)
+            module = import_module(module_path)
+            tokenizer_class = getattr(module, class_name)
+            return tokenizer_class.from_pretrained(
+                pretrained_model_name_or_path,
+                trust_remote_code=trust_remote_code,
+                **kwargs,
+            )
+        except (ValueError, ImportError, AttributeError) as e:
+            raise ValueError(
+                f"Failed to load custom_tokenizer '{custom_tokenizer}'. "
+                "Expected 'glm_moe_dsa' or 'module.path.ClassName'.") from e
+    return AutoTokenizer.from_pretrained(
+        pretrained_model_name_or_path,
+        trust_remote_code=trust_remote_code,
+        **kwargs,
+    )
 
 
 ASYNC_REQUEST_FUNCS = {
diff --git a/src/srtctl/benchmarks/scripts/sa-bench/bench.sh b/src/srtctl/benchmarks/scripts/sa-bench/bench.sh
@@ -60,6 +60,20 @@ TOTAL_GPUS=${9:-0}
 PREFILL_GPUS=${10:-0}
 DECODE_GPUS=${11:-0}
 RANDOM_RANGE_RATIO=${12:-0.8}
+CUSTOM_TOKENIZER=${13:-}
+USE_CHAT_TEMPLATE=${14:-true}
+
+# Build optional custom tokenizer args
+CUSTOM_TOKENIZER_ARGS=()
+if [ -n "$CUSTOM_TOKENIZER" ]; then
+    CUSTOM_TOKENIZER_ARGS=(--custom-tokenizer "$CUSTOM_TOKENIZER")
+fi
+
+# Build optional chat template args
+CHAT_TEMPLATE_ARGS=()
+if [ "$USE_CHAT_TEMPLATE" = "true" ]; then
+    CHAT_TEMPLATE_ARGS=(--use-chat-template)
+fi
 
 # Parse endpoint into host:port
 HOST=$(echo "$ENDPOINT" | sed 's|http://||' | cut -d: -f1)
@@ -119,7 +133,8 @@ for concurrency in "${CONCURRENCY_LIST[@]}"; do
         --request-rate 250 \
         --percentile-metrics ttft,tpot,itl,e2el \
         --max-concurrency "$concurrency" \
-        --trust-remote-code
+        --trust-remote-code \
+        "${CUSTOM_TOKENIZER_ARGS[@]}"
 
     num_prompts=$((concurrency * 10))
     
@@ -149,7 +164,8 @@ for concurrency in "${CONCURRENCY_LIST[@]}"; do
         --percentile-metrics ttft,tpot,itl,e2el \
         --max-concurrency "$concurrency" \
         --trust-remote-code \
-        --use-chat-template \
+        "${CHAT_TEMPLATE_ARGS[@]}" \
+        "${CUSTOM_TOKENIZER_ARGS[@]}" \
         --save-result --result-dir "$result_dir" --result-filename "$result_filename"
     set +x
 
diff --git a/src/srtctl/benchmarks/scripts/sa-bench/benchmark_serving.py b/src/srtctl/benchmarks/scripts/sa-bench/benchmark_serving.py
@@ -837,6 +837,7 @@ def main(args: argparse.Namespace):
         tokenizer_id,
         tokenizer_mode=tokenizer_mode,
         trust_remote_code=args.trust_remote_code,
+        custom_tokenizer=args.custom_tokenizer,
     )
 
     if args.dataset is not None:
@@ -1279,6 +1280,14 @@ def main(args: argparse.Namespace):
         '"custom" will use --tokenizer to select the preregistered tokenizer.',
     )
 
+    parser.add_argument(
+        "--custom-tokenizer",
+        type=str,
+        default=None,
+        help="Custom tokenizer to use (e.g., 'glm_moe_dsa' or 'module.path.ClassName'). "
+        "When set, overrides the default tokenizer loading.",
+    )
+
     parser.add_argument(
         "--served-model-name",
         type=str,
diff --git a/src/srtctl/core/schema.py b/src/srtctl/core/schema.py
@@ -539,6 +539,8 @@ class BenchmarkConfig:
     ttft_threshold_ms: int | None = None  # Goodput TTFT threshold in ms (default: 2000)
     itl_threshold_ms: int | None = None  # Goodput ITL threshold in ms (default: 25)
     random_range_ratio: float | None = None  # Random input/output length range ratio (default: 0.8)
+    custom_tokenizer: str | None = None  # Custom tokenizer class (e.g., "module.path.ClassName")
+    use_chat_template: bool = True  # Pass --use-chat-template to benchmark (default: true)
 
     def get_concurrency_list(self) -> list[int]:
         if self.concurrencies is None:

Original file line number	Diff line number	Diff line change
`@@ -97,5 +97,7 @@ def build_command(`
`97`	`97`	`str(prefill_gpus),`
`98`	`98`	`str(decode_gpus),`
`99`	`99`	`str(b.random_range_ratio) if b.random_range_ratio is not None else "0.8",`
	`100`	`+ b.custom_tokenizer or "",`
	`101`	`+ str(b.use_chat_template).lower(),`
`100`	`102`	`]`
`101`	`103`	`return cmd`