Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 34 additions & 12 deletions tensorrt_llm/commands/serve.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ def get_llm_args(
trust_remote_code: bool = False,
revision: Optional[str] = None,
reasoning_parser: Optional[str] = None,
-fail_fast_on_attention_window_too_large: bool = False,
+fail_fast_on_attention_window_too_large: bool = True,
otlp_traces_endpoint: Optional[str] = None,
enable_chunked_prefill: bool = False,
**llm_args_extra_dict: Any):
Expand Down Expand Up @@ -602,12 +602,15 @@ def convert(self, value: Any, param: Optional["click.Parameter"],
default=None,
help=help_info_with_stability_tag("expert parallelism size",
"beta"))
-@click.option("--moe_cluster_parallel_size",
-"--cluster_size",
-type=int,
-default=None,
-help=help_info_with_stability_tag(
-"expert cluster parallelism size", "beta"))
+@click.option(
+"--moe_cluster_parallel_size",
+"--cluster_size",
+type=int,
+default=None,
+help=help_info_with_stability_tag(
+"[Deprecated] Expert cluster parallelism size. "
+"This option is no longer supported and will be removed in a future release.",
+"deprecated"))
@click.option(
"--gpus_per_node",
type=int,
Expand Down Expand Up @@ -686,10 +689,12 @@ def convert(self, value: Any, param: Optional["click.Parameter"],
@click.option(
"--fail_fast_on_attention_window_too_large",
is_flag=True,
-default=False,
+default=True,
help=help_info_with_stability_tag(
-"Exit with runtime error when attention window is too large to fit even a single sequence in the KV cache.",
-"prototype"))
+"[Deprecated] Exit with runtime error when attention window is too large "
+"to fit even a single sequence in the KV cache. Now defaults to True. "
+"This flag only affects the TRT backend and will be removed in a future release.",
+"deprecated"))
@click.option("--otlp_traces_endpoint",
type=str,
default=None,
Expand Down Expand Up @@ -762,6 +767,18 @@ def serve(
"""
logger.set_level(log_level)

+if moe_cluster_parallel_size is not None:
+logger.warning(
+"--moe_cluster_parallel_size / --cluster_size is deprecated and "
+"no longer supported. This option will be removed in a future release."
+)
+
+if "--fail_fast_on_attention_window_too_large" in sys.argv:
+logger.warning(
+"--fail_fast_on_attention_window_too_large is deprecated. "
+"It now defaults to True and will be removed in a future release. "
+"This flag only affects the TRT backend.")
+
for custom_module_dir in custom_module_dirs:
try:
import_custom_module_from_dir(custom_module_dir)
Expand Down Expand Up @@ -994,8 +1011,8 @@ def serve_encoder(model: str, host: str, port: int, log_level: str,
"--metrics-log-interval",
type=int,
default=0,
-help=
-"The interval of logging metrics in seconds. Set to 0 to disable metrics logging."
+help="[Deprecated] The interval of logging metrics in seconds. "
+"This option is not connected to any functionality and will be removed in a future release."
)
def disaggregated(
config_file: Optional[str],
Expand All @@ -1009,6 +1026,11 @@ def disaggregated(

logger.set_level(log_level)

+if metrics_log_interval != 0:
+logger.warning(
+"--metrics-log-interval is deprecated and not connected to any "
+"functionality. This option will be removed in a future release.")
+
disagg_cfg = parse_disagg_config_file(config_file)

with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
Expand Down
6 changes: 3 additions & 3 deletions tensorrt_llm/llmapi/llm_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -2233,7 +2233,7 @@ class BaseLlmArgs(StrictBaseModel):
moe_cluster_parallel_size: Optional[int] = Field(
default=None,
description="The cluster parallel size for MoE model's expert weights.",
-status="beta")
+status="deprecated")

moe_tensor_parallel_size: Optional[int] = Field(
default=None,
Expand Down Expand Up @@ -2625,10 +2625,10 @@ class TrtLlmArgs(BaseLlmArgs):
description="The workspace for the model.")

fail_fast_on_attention_window_too_large: bool = Field(
-default=False,
+default=True,
description=
"Fail fast when attention window is too large to fit even a single sequence in the KV cache.",
-status="prototype")
+status="deprecated")

# Once set, the model will reuse the build_cache
enable_build_cache: Union[BuildCacheConfig,
Expand Down
2 changes: 1 addition & 1 deletion tests/unittest/api_stability/references/llm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ methods:
moe_cluster_parallel_size:
annotation: Optional[int]
default: null
-status: beta
+status: deprecated
enable_attention_dp:
annotation: bool
default: False
Expand Down
Loading