Skip to content

Commit b398e02

Browse files
committed
[TRTLLM-10076,TRTLLM-10079,TRTLLM-10229,TRTLLM-10078][feat] Serve CLI improvements: renames, new flags, and mm_embedding_serve enhancements
- TRTLLM-10076: Update --tokenizer description for PyTorch backend, add --hf_revision alias for --revision with deprecation warning, support hf_revision key in YAML config, add --enable_attention_dp flag
- TRTLLM-10079: mm_embedding_serve: add --config alias for --extra_encoder_options, expose --hf_revision, --free_gpu_memory_fraction, --tensor_parallel_size
- TRTLLM-10229: Add --config alias for --config_file in disaggregated and disaggregated_mpi_worker commands
- TRTLLM-10078: Improve --server_role help message with role descriptions

Signed-off-by: Junyi Xu <219237550+JunyiXu-nv@users.noreply.github.com>
Made-with: Cursor
1 parent 2afe11d commit b398e02

File tree

2 files changed

+87
-24
lines changed

2 files changed

+87
-24
lines changed

tensorrt_llm/commands/serve.py

Lines changed: 84 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import socket
77
import subprocess # nosec B404
88
import sys
9+
import warnings
910
from pathlib import Path
1011
from typing import Any, Dict, Literal, Mapping, Optional, Sequence
1112

@@ -154,6 +155,7 @@ def get_llm_args(
154155
fail_fast_on_attention_window_too_large: bool = False,
155156
otlp_traces_endpoint: Optional[str] = None,
156157
enable_chunked_prefill: bool = False,
158+
enable_attention_dp: bool = False,
157159
**llm_args_extra_dict: Any):
158160

159161
if gpus_per_node is None:
@@ -228,6 +230,8 @@ def get_llm_args(
228230
num_postprocess_workers,
229231
"enable_chunked_prefill":
230232
enable_chunked_prefill,
233+
"enable_attention_dp":
234+
enable_attention_dp,
231235
"revision":
232236
revision,
233237
"reasoning_parser":
@@ -508,11 +512,13 @@ def convert(self, value: Any, param: Optional["click.Parameter"],
508512

509513
@click.command("serve")
510514
@click.argument("model", type=str)
511-
@click.option("--tokenizer",
512-
type=str,
513-
default=None,
514-
help=help_info_with_stability_tag("Path | Name of the tokenizer.",
515-
"beta"))
515+
@click.option(
516+
"--tokenizer",
517+
type=str,
518+
default=None,
519+
help=help_info_with_stability_tag(
520+
"Path or name of the tokenizer. When using the PyTorch backend, "
521+
"this replaces the default HuggingFace tokenizer.", "beta"))
516522
@click.option(
517523
"--custom_tokenizer",
518524
type=str,
@@ -641,12 +647,15 @@ def convert(self, value: Any, param: Optional["click.Parameter"],
641647
default=False,
642648
help=help_info_with_stability_tag("Flag for HF transformers.",
643649
"beta"))
644-
@click.option("--revision",
650+
@click.option("--hf_revision",
651+
"--revision",
652+
"revision",
645653
type=str,
646654
default=None,
647655
help=help_info_with_stability_tag(
648656
"The revision to use for the HuggingFace model "
649-
"(branch name, tag name, or commit id).", "beta"))
657+
"(branch name, tag name, or commit id). "
658+
"Prefer --hf_revision over --revision.", "beta"))
650659
@click.option(
651660
"--config",
652661
"--extra_llm_api_options",
@@ -681,8 +690,9 @@ def convert(self, value: Any, param: Optional["click.Parameter"],
681690
type=str,
682691
default=None,
683692
help=help_info_with_stability_tag(
684-
"Server role. Specify this value only if running in disaggregated mode.",
685-
"prototype"))
693+
"Server role for disaggregated serving. "
694+
"CONTEXT=prefill (prompt processing), GENERATION=decode (token generation). "
695+
"Required when using service registry.", "prototype"))
686696
@click.option(
687697
"--fail_fast_on_attention_window_too_large",
688698
is_flag=True,
@@ -706,6 +716,11 @@ def convert(self, value: Any, param: Optional["click.Parameter"],
706716
default=False,
707717
help=help_info_with_stability_tag("Enable chunked prefill",
708718
"prototype"))
719+
@click.option("--enable_attention_dp",
720+
is_flag=True,
721+
default=False,
722+
help=help_info_with_stability_tag(
723+
"Enable attention data parallel.", "beta"))
709724
@click.option("--media_io_kwargs",
710725
type=str,
711726
default=None,
@@ -752,16 +767,22 @@ def serve(
752767
metadata_server_config_file: Optional[str], server_role: Optional[str],
753768
fail_fast_on_attention_window_too_large: bool,
754769
otlp_traces_endpoint: Optional[str], enable_chunked_prefill: bool,
755-
disagg_cluster_uri: Optional[str], media_io_kwargs: Optional[str],
756-
custom_module_dirs: list[Path], chat_template: Optional[str],
757-
grpc: bool, served_model_name: Optional[str],
770+
enable_attention_dp: bool, disagg_cluster_uri: Optional[str],
771+
media_io_kwargs: Optional[str], custom_module_dirs: list[Path],
772+
chat_template: Optional[str], grpc: bool,
773+
served_model_name: Optional[str],
758774
extra_visual_gen_options: Optional[str]):
759775
"""Running an OpenAI API compatible server
760776
761777
MODEL: model name | HF checkpoint path | TensorRT engine path
762778
"""
763779
logger.set_level(log_level)
764780

781+
if "--revision" in sys.argv:
782+
warnings.warn("--revision is deprecated, use --hf_revision instead.",
783+
DeprecationWarning,
784+
stacklevel=2)
785+
765786
for custom_module_dir in custom_module_dirs:
766787
try:
767788
import_custom_module_from_dir(custom_module_dir)
@@ -796,7 +817,8 @@ def _serve_llm():
796817
fail_fast_on_attention_window_too_large=
797818
fail_fast_on_attention_window_too_large,
798819
otlp_traces_endpoint=otlp_traces_endpoint,
799-
enable_chunked_prefill=enable_chunked_prefill)
820+
enable_chunked_prefill=enable_chunked_prefill,
821+
enable_attention_dp=enable_attention_dp)
800822

801823
llm_args_extra_dict = {}
802824
if extra_llm_api_options is not None:
@@ -923,33 +945,62 @@ def _serve_visual_gen():
923945
default=False,
924946
help="Flag for HF transformers.")
925947
@click.option(
948+
"--config",
926949
"--extra_encoder_options",
950+
"extra_encoder_options",
927951
type=str,
928952
default=None,
929953
help=
930-
"Path to a YAML file that overwrites the parameters specified by trtllm-serve."
931-
)
954+
"Path to a YAML file that overwrites the parameters specified by trtllm-serve. "
955+
"Prefer --config over --extra_encoder_options.")
956+
@click.option("--hf_revision",
957+
"--revision",
958+
"revision",
959+
type=str,
960+
default=None,
961+
help="The revision to use for the HuggingFace model "
962+
"(branch name, tag name, or commit id).")
963+
@click.option("--free_gpu_memory_fraction",
964+
type=float,
965+
default=0.9,
966+
help="Free GPU memory fraction reserved for KV Cache, "
967+
"after allocating model weights and buffers.")
968+
@click.option("--tensor_parallel_size",
969+
"--tp_size",
970+
type=int,
971+
default=1,
972+
help="Tensor parallelism size.")
932973
@click.option("--metadata_server_config_file",
933974
type=str,
934975
default=None,
935976
help="Path to metadata server config file")
936977
def serve_encoder(model: str, host: str, port: int, log_level: str,
937978
max_batch_size: int, max_num_tokens: int,
938979
gpus_per_node: Optional[int], trust_remote_code: bool,
939-
extra_encoder_options: Optional[str],
980+
extra_encoder_options: Optional[str], revision: Optional[str],
981+
free_gpu_memory_fraction: float, tensor_parallel_size: int,
940982
metadata_server_config_file: Optional[str]):
941983
"""Running an OpenAI API compatible server
942984
943985
MODEL: model name | HF checkpoint path | TensorRT engine path
944986
"""
945987
logger.set_level(log_level)
946988

947-
# TODO: expose more arguments progressively
948-
llm_args, _ = get_llm_args(model=model,
949-
max_batch_size=max_batch_size,
950-
max_num_tokens=max_num_tokens,
951-
gpus_per_node=gpus_per_node,
952-
trust_remote_code=trust_remote_code)
989+
if "--extra_encoder_options" in sys.argv:
990+
warnings.warn(
991+
"--extra_encoder_options is deprecated, use --config instead.",
992+
DeprecationWarning,
993+
stacklevel=2)
994+
995+
llm_args, _ = get_llm_args(
996+
model=model,
997+
max_batch_size=max_batch_size,
998+
max_num_tokens=max_num_tokens,
999+
gpus_per_node=gpus_per_node,
1000+
trust_remote_code=trust_remote_code,
1001+
revision=revision,
1002+
free_gpu_memory_fraction=free_gpu_memory_fraction,
1003+
tensor_parallel_size=tensor_parallel_size)
9531004

9541005
encoder_args_extra_dict = {}
9551006
if extra_encoder_options is not None:
@@ -966,10 +1017,12 @@ def serve_encoder(model: str, host: str, port: int, log_level: str,
9661017

9671018
@click.command("disaggregated")
9681019
@click.option("-c",
1020+
"--config",
9691021
"--config_file",
1022+
"config_file",
9701023
type=str,
9711024
default=None,
972-
help="Specific option for disaggregated mode.")
1025+
help="Path to the disaggregated serving configuration YAML file.")
9731026
@click.option("-m",
9741027
"--metadata_server_config_file",
9751028
type=str,
@@ -1009,6 +1062,11 @@ def disaggregated(
10091062

10101063
logger.set_level(log_level)
10111064

1065+
if "--config_file" in sys.argv:
1066+
warnings.warn("--config_file is deprecated, use --config instead.",
1067+
DeprecationWarning,
1068+
stacklevel=2)
1069+
10121070
disagg_cfg = parse_disagg_config_file(config_file)
10131071

10141072
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
@@ -1061,10 +1119,12 @@ def set_cuda_device():
10611119

10621120
@click.command("disaggregated_mpi_worker")
10631121
@click.option("-c",
1122+
"--config",
10641123
"--config_file",
1124+
"config_file",
10651125
type=str,
10661126
default=None,
1067-
help="Specific option for disaggregated mode.")
1127+
help="Path to the disaggregated serving configuration YAML file.")
10681128
@click.option('--log_level',
10691129
type=click.Choice(severity_map.keys()),
10701130
default='info',

tensorrt_llm/llmapi/llm_args.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3461,6 +3461,9 @@ def update_llm_args_with_extra_dict(
34613461
llm_args_dict: Dict,
34623462
extra_llm_api_options: Optional[str] = None) -> Dict:
34633463

3464+
if 'hf_revision' in llm_args_dict:
3465+
llm_args_dict.setdefault('revision', llm_args_dict.pop('hf_revision'))
3466+
34643467
# Deep merge kv_cache_config to prevent partial YAML kv_cache_config from replacing the complete kv_cache_config
34653468
if 'kv_cache_config' in llm_args and 'kv_cache_config' in llm_args_dict:
34663469
# Convert KvCacheConfig object to dict if necessary

0 commit comments

Comments
 (0)