10 changes: 10 additions & 0 deletions docs/cli_user_guide.md
@@ -161,6 +161,16 @@ Use `--generator-config path/to/file.yaml` to provide ServiceConfig/K8sConfig/Dy
- `--generator-set ServiceConfig.model_path=Qwen/Qwen3-32B-FP8`
- `--generator-set K8sConfig.k8s_namespace=dynamo \`

#### Rule Plugin Selection
You can switch the generator rule set via `--generator-set rule=benchmark`. This selects a rule plugin folder under `src/aiconfigurator/generator/rule_plugin/`.

- **Default (production)**: if `rule` is not provided, the generator uses the default production rules. These are tuned for deployment (e.g., adjusted max batch size and CUDA graph batch sizes).
- **Benchmark**: `--generator-set rule=benchmark` enables rules designed to align generated configs with AIC SDK results, including:
  - CUDA graph batch sizes covering every size from 1 up to the max batch size, to match simulated results
  - a stricter max batch size that follows the simulated batch size

You can also define your own rule sets by adding a new folder under `src/aiconfigurator/generator/rule_plugin/` and selecting it with `--generator-set rule=<folder_name>`.

Run `aiconfigurator cli default --generator-help` to print information that is sourced directly from `src/aiconfigurator/generator/config/deployment_config.yaml` and `backend_config_mapping.yaml`.

The `--generator-help` command supports three section options:
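As a minimal sketch of how this selection reaches the rule engine programmatically (the parameter dict below is illustrative; only the `apply_rule_plugins` signature and the `rule` key are taken from this change):

```python
# Illustrative only: param_values is normally built by the generator's mapping
# layer; the fields shown here are placeholders.
from aiconfigurator.generator.rendering.rule_engine import apply_rule_plugins

param_values = {
    "rule": "benchmark",   # same effect as --generator-set rule=benchmark
    "max_batch_size": 64,  # hypothetical mapped field
}
updated = apply_rule_plugins(param_values, backend="trtllm")
```

When no `rule` key is present (or `rule=default`), the engine falls back to the default production rules directly under `rule_plugin/`.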
2 changes: 2 additions & 0 deletions docs/generator_overview.md
@@ -47,6 +47,8 @@ flowchart TD
prefill max_batch_size = (max_batch_size if max_batch_size else 1)
```
DSL rules users can extend to influence generated configs. Field names come from `backend_config_mapping.yaml` and `deployment_config.yaml`; prefixes like `agg_`, `prefill_`, and `decode_` scope the impact to that role’s generated outputs.

**Rule selection**: Use `--generator-set rule=benchmark` to switch to a different rule plugin folder under `src/aiconfigurator/generator/rule_plugin/`. If `rule` is not provided, the default production rules are used; these are tuned for deployment and include max batch size and CUDA graph batch size adjustments. The `benchmark` rules align generated configs with AIC simulation by covering every CUDA graph batch size up to the max batch size and enforcing a stricter max batch size derived from the simulated batch size. You can add your own rule sets by creating a folder under `rule_plugin/` and selecting it via `--generator-set rule=<folder_name>`.

- Backend templates (`config/backend_templates/<backend>/`):
Jinja templates that turn mapped parameters into CLI args, engine configs, run scripts, and Kubernetes manifests (optionally versioned).
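As a sketch of the extension path described above, a hypothetical custom rule set; the folder name `myrules` and the single rule line are illustrative only:

```python
# Hypothetical example: create a custom rule plugin folder with one backend rule.
# Rule files are named per backend (trtllm.rule, sglang.rule, vllm.rule).
from pathlib import Path

plugin_dir = Path("src/aiconfigurator/generator/rule_plugin/myrules")
plugin_dir.mkdir(parents=True, exist_ok=True)
(plugin_dir / "trtllm.rule").write_text(
    "prefill max_batch_size = (max_batch_size if max_batch_size else 1)\n"
)
```

The new rule set would then be selected with `--generator-set rule=myrules`.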
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -93,7 +93,7 @@ aiconfigurator = "aiconfigurator.main:main"

[tool.setuptools.package-data]
"aiconfigurator.cli" = ["*.yaml", "exps/*.yaml"]
"aiconfigurator.generator" = ["config/**/*.yaml", "config/backend_templates/**/*.j2", "rule_plugin/*.rule"]
"aiconfigurator.generator" = ["config/**/*.yaml", "config/backend_templates/**/*.j2", "rule_plugin/**/*.rule"]
"aiconfigurator" = ["systems/*.yaml", "systems/**/*.txt", "model_configs/*.json"]
"aiconfigurator.webapp.components.profiling" = ["styles.css", "js/*.js"]

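A quick way to check what the widened glob now picks up (a sketch, assuming it is run from the repository root; pathlib's `**` also matches the directory itself, so top-level rule files stay included):

```python
# Old pattern rule_plugin/*.rule matched only top-level rule files; the
# recursive pattern also covers nested plugin folders such as benchmark/.
from pathlib import Path

pkg = Path("src/aiconfigurator/generator")
for p in sorted(pkg.glob("rule_plugin/**/*.rule")):
    print(p.relative_to(pkg))  # e.g. rule_plugin/trtllm.rule, rule_plugin/benchmark/trtllm.rule
```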
3 changes: 3 additions & 0 deletions src/aiconfigurator/generator/aggregators.py
@@ -229,4 +229,7 @@ def generate_config_from_input_dict(
)
if target.get("ModelConfig"):
params["ModelConfig"] = target.get("ModelConfig", {})
rule_name = input_params.get("rule")
if rule_name:
params["rule"] = rule_name
return params
3 changes: 3 additions & 0 deletions src/aiconfigurator/generator/module_bridge.py
@@ -221,5 +221,8 @@ def _build_worker_params(prefix: str, extra_overrides: dict | None) -> tuple[dic
)

params = _deep_merge(params, overrides.get("Params"))
rule_name = overrides.get("rule")
if rule_name:
params["rule"] = rule_name
params["ModelConfig"] = model_cfg
return params
5 changes: 4 additions & 1 deletion src/aiconfigurator/generator/rendering/engine.py
@@ -31,7 +31,10 @@


def render_backend_templates(
param_values: dict[str, Any], backend: str, templates_dir: Optional[str] = None, version: Optional[str] = None
param_values: dict[str, Any],
backend: str,
templates_dir: Optional[str] = None,
version: Optional[str] = None,
) -> dict[str, str]:
"""
Render templates for a specific backend with version-specific template selection.
24 changes: 22 additions & 2 deletions src/aiconfigurator/generator/rendering/rule_engine.py
@@ -12,6 +12,21 @@
logger = logging.getLogger(__name__)
_BASE_DIR = Path(__file__).resolve().parent
_RULES_DIR = (_BASE_DIR.parent / "rule_plugin").resolve()
_DEFAULT_RULE_PLUGIN = "default"


def _resolve_rule_plugin_dir(plugin: Optional[str], base_dir: Optional[str] = None) -> str:
base = Path(base_dir).resolve() if base_dir else _RULES_DIR
if not plugin or plugin == _DEFAULT_RULE_PLUGIN:
return str(base)
if os.path.sep in plugin or (os.path.altsep and os.path.altsep in plugin):
raise ValueError(f"Rule plugin must be a simple name, got: {plugin!r}")
candidate = (base / plugin).resolve()
if not candidate.exists() or not candidate.is_dir():
raise FileNotFoundError(f"Rule plugin directory not found: {candidate}")
if base not in candidate.parents and candidate != base:
raise ValueError(f"Rule plugin path escapes base directory: {candidate}")
return str(candidate)


def _ensure_scope(pv: dict[str, Any], scope: str) -> dict[str, Any]:
@@ -157,8 +172,13 @@ def _load_rule_path(base_dir: str, backend: str) -> Optional[str]:
return p if os.path.exists(p) else None


def apply_rule_plugins(param_values: dict[str, Any], backend: str, dsl_dir: Optional[str] = None) -> dict[str, Any]:
base = str(Path(dsl_dir).resolve()) if dsl_dir else str(_RULES_DIR)
def apply_rule_plugins(
param_values: dict[str, Any],
backend: str,
dsl_dir: Optional[str] = None,
) -> dict[str, Any]:
rule_name = param_values.get("rule")
base = _resolve_rule_plugin_dir(rule_name, base_dir=dsl_dir)
rule_path = _load_rule_path(base, backend)
if not rule_path:
return param_values
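A short sketch of how `_resolve_rule_plugin_dir` behaves in the common cases (the helper is module-private, so this is for understanding rather than public use; paths in the comments are abbreviated):

```python
from aiconfigurator.generator.rendering import rule_engine

# No rule, or the literal "default": the base rule_plugin/ directory is used.
rule_engine._resolve_rule_plugin_dir(None)         # .../generator/rule_plugin
rule_engine._resolve_rule_plugin_dir("default")    # .../generator/rule_plugin

# A named plugin resolves to a subfolder that must already exist.
rule_engine._resolve_rule_plugin_dir("benchmark")  # .../rule_plugin/benchmark

# Path-like or unknown names are rejected, keeping lookups inside rule_plugin/.
rule_engine._resolve_rule_plugin_dir("../escape")  # raises ValueError
rule_engine._resolve_rule_plugin_dir("missing")    # raises FileNotFoundError
```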
21 changes: 21 additions & 0 deletions src/aiconfigurator/generator/rule_plugin/benchmark/sglang.rule
@@ -0,0 +1,21 @@
prefill max_batch_size = (max_batch_size if max_batch_size else 1)
agg_decode max_batch_size = (max_batch_size if max_batch_size else 128)

agg_prefill_decode max_prefill_tokens = SlaConfig.isl + 1500
agg enable_mixed_chunk = true

agg_prefill_decode cuda_graph_batch_sizes = ((range(1, max_batch_size + 1) | list) if max_batch_size else [])

# GPUs per worker follow the same TP/PP/DP product that SGLang expects
agg_prefill_decode gpus_per_worker = (tensor_parallel_size or 1) * (pipeline_parallel_size or 1) * (data_parallel_size or 1)

agg_prefill_decode kv_cache_dtype = ("fp8_e4m3" if kv_cache_dtype == "fp8" else kv_cache_dtype)
prefill_decode kv_transfer_backend = (kv_transfer_backend if kv_transfer_backend else "nixl")

when (ModelConfig.prefix or 0) > 0:
disable_prefix_cache = false
DynConfig.enable_router = true

when (ModelConfig.nextn or 0) > 0:
speculative_decoding_type = "NEXTN"
speculative_num_steps = ModelConfig.nextn
31 changes: 31 additions & 0 deletions src/aiconfigurator/generator/rule_plugin/benchmark/trtllm.rule
@@ -0,0 +1,31 @@
prefill max_batch_size = (max_batch_size if max_batch_size else 1)
agg_decode max_batch_size = (max_batch_size if max_batch_size else 128)

prefill disable_overlap_scheduler = true
decode disable_overlap_scheduler = false
agg disable_overlap_scheduler = false

prefill max_num_tokens = SlaConfig.isl + 1500
decode max_num_tokens = max_batch_size
agg max_num_tokens = max_batch_size + SlaConfig.isl + 1500

agg_prefill_decode cuda_graph_batch_sizes = ((range(1, max_batch_size + 1) | list) if max_batch_size else [])

# Enforce TensorRT-LLM MoE parallelism: moe_tp × moe_ep = tp
when ModelConfig.is_moe and (moe_tensor_parallel_size and moe_expert_parallel_size):
agg_prefill_decode tensor_parallel_size = moe_tensor_parallel_size * moe_expert_parallel_size

# GPUs per worker (fallback to 1 if any dimension missing)
agg_prefill_decode gpus_per_worker = (tensor_parallel_size or 1) * (pipeline_parallel_size or 1) * (data_parallel_size or 1)

agg_prefill_decode enable_attention_dp = ((data_parallel_size or 1) > 1) and ModelConfig.is_moe

when (ModelConfig.prefix or 0) > 0:
agg_prefill_decode disable_prefix_cache = false
DynConfig.enable_router = true


# Speculative decoding
when (ModelConfig.nextn or 0) > 0:
agg_decode speculative_decoding_type = "MTP"
agg_decode num_nextn_predict_layers = ModelConfig.nextn
21 changes: 21 additions & 0 deletions src/aiconfigurator/generator/rule_plugin/benchmark/vllm.rule
Contributor: Do we want to include cuda_graph_batch_sizes to vllm.rule as well?

Contributor Author: Thanks for the review, I've added the cuda_graph_batch_sizes to vllm.rule on line 8 to stay consistent with trtllm and sglang. Please let me know if this is not appropriate.

@@ -0,0 +1,21 @@
prefill max_batch_size = (max_batch_size if max_batch_size else 1)
decode max_batch_size = (max_batch_size if max_batch_size else 128)


agg_prefill_decode gpus_per_worker = (tensor_parallel_size or 1) * (pipeline_parallel_size or 1) * (data_parallel_size or 1)
agg_prefill_decode enable_expert_parallel = ((moe_expert_parallel_size or 1) > 1)

agg_prefill_decode cuda_graph_batch_sizes = ((range(1, max_batch_size + 1) | list) if max_batch_size else [])

prefill max_num_tokens = (SlaConfig.isl or 0) + 1500
decode max_num_tokens = max_batch_size
agg max_num_tokens = (max_batch_size or 0) + (SlaConfig.isl or 0) + 1500
agg max_seq_len = (SlaConfig.isl or 0) + (SlaConfig.osl or 0) + 1500

when (ModelConfig.prefix or 0) > 0:
disable_prefix_cache = false
DynConfig.enable_router = true

when (ModelConfig.nextn or 0) > 0:
speculative_decoding_type = "mtp"
num_nextn_predict_layers = ModelConfig.nextn
9 changes: 6 additions & 3 deletions src/aiconfigurator/generator/rule_plugin/sglang.rule
@@ -1,11 +1,13 @@
agg_decode cuda_graph_batch_sizes = ((range(1, 16 + 1, 1) | list) + (range(16, 32 + 1, 4) | list) + (range(32, 64 + 1, 8) | list) + (range(64, 128 + 1, 16) | list) + (range(128, 256 + 1, 32) | list) + (range(256, 512 + 1, 64) | list) + (range((1 if (max_batch_size or 0) <= 8 else (max_batch_size or 0) - 8), (max_batch_size or 0) + 8 + 1, 1) | list))

agg_decode cuda_graph_enable_padding = true

prefill max_batch_size = (max_batch_size if max_batch_size else 1)
agg_decode max_batch_size = (max_batch_size if max_batch_size else 128)
agg_decode max_batch_size = (512 if (max_batch_size or 0) < 512 else (max_batch_size * 2))

agg_prefill_decode max_prefill_tokens = SlaConfig.isl + 1500
agg enable_mixed_chunk = true

agg_prefill_decode cuda_graph_batch_sizes = ((range(1, max_batch_size + 1) | list) if max_batch_size else [])

# GPUs per worker follow the same TP/PP/DP product that SGLang expects
agg_prefill_decode gpus_per_worker = (tensor_parallel_size or 1) * (pipeline_parallel_size or 1) * (data_parallel_size or 1)

@@ -14,6 +16,7 @@ prefill_decode kv_transfer_backend = (kv_transfer_backend if kv_transfer_backend

when (ModelConfig.prefix or 0) > 0:
disable_prefix_cache = false
DynConfig.enable_router = true

when (ModelConfig.nextn or 0) > 0:
speculative_decoding_type = "NEXTN"
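For readers parsing the long default `cuda_graph_batch_sizes` expression above, a minimal Python re-implementation (the `.rule` files use Jinja-style `range(...) | list` syntax; `max_batch_size = 64` is just an example value):

```python
# Mirrors the default rule: small sizes densely, larger sizes at coarser steps
# up to 512, plus a dense window around the configured max_batch_size.
def default_cuda_graph_batch_sizes(max_batch_size):
    mbs = max_batch_size or 0
    window_start = 1 if mbs <= 8 else mbs - 8
    return (
        list(range(1, 16 + 1, 1))
        + list(range(16, 32 + 1, 4))
        + list(range(32, 64 + 1, 8))
        + list(range(64, 128 + 1, 16))
        + list(range(128, 256 + 1, 32))
        + list(range(256, 512 + 1, 64))
        + list(range(window_start, mbs + 8 + 1, 1))  # for mbs=64: 56..72
    )

print(default_cuda_graph_batch_sizes(64))
```

The benchmark rule set instead emits `range(1, max_batch_size + 1)`, i.e. every batch size up to the simulated maximum and nothing beyond it.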
8 changes: 5 additions & 3 deletions src/aiconfigurator/generator/rule_plugin/trtllm.rule
@@ -1,5 +1,9 @@
agg_decode cuda_graph_batch_sizes = ((range(1, 16 + 1, 1) | list) + (range(16, 32 + 1, 4) | list) + (range(32, 64 + 1, 8) | list) + (range(64, 128 + 1, 16) | list) + (range(128, 256 + 1, 32) | list) + (range(256, 512 + 1, 64) | list) + (range((1 if (max_batch_size or 0) <= 8 else (max_batch_size or 0) - 8), (max_batch_size or 0) + 8 + 1, 1) | list))

agg_decode cuda_graph_enable_padding = true

prefill max_batch_size = (max_batch_size if max_batch_size else 1)
agg_decode max_batch_size = (max_batch_size if max_batch_size else 128)
agg_decode max_batch_size = (512 if (max_batch_size or 0) < 512 else (max_batch_size * 2))

prefill disable_overlap_scheduler = true
decode disable_overlap_scheduler = false
@@ -9,8 +13,6 @@ prefill max_num_tokens = SlaConfig.isl + 1500
decode max_num_tokens = max_batch_size
agg max_num_tokens = max_batch_size + SlaConfig.isl + 1500

agg_prefill_decode cuda_graph_batch_sizes = ((range(1, max_batch_size + 1) | list) if max_batch_size else [])

# Enforce TensorRT-LLM MoE parallelism: moe_tp × moe_ep = tp
when ModelConfig.is_moe and (moe_tensor_parallel_size and moe_expert_parallel_size):
agg_prefill_decode tensor_parallel_size = moe_tensor_parallel_size * moe_expert_parallel_size
3 changes: 2 additions & 1 deletion src/aiconfigurator/generator/rule_plugin/vllm.rule
@@ -1,5 +1,6 @@
agg_decode max_batch_size = (512 if (max_batch_size or 0) < 512 else (max_batch_size * 2))
prefill max_batch_size = (max_batch_size if max_batch_size else 1)
decode max_batch_size = (max_batch_size if max_batch_size else 128)
agg_decode max_batch_size = (512 if (max_batch_size or 0) < 512 else (max_batch_size * 2))


agg_prefill_decode gpus_per_worker = (tensor_parallel_size or 1) * (pipeline_parallel_size or 1) * (data_parallel_size or 1)