feat: generator add benchmark mode by setting rule plugins #290
Open · Ethan-ES wants to merge 1 commit into main from etshen/generator_add_benchmark_mode
+131 −11
src/aiconfigurator/generator/rule_plugin/benchmark/sglang.rule (21 additions, 0 deletions)
```
prefill max_batch_size = (max_batch_size if max_batch_size else 1)
agg_decode max_batch_size = (max_batch_size if max_batch_size else 128)

agg_prefill_decode max_prefill_tokens = SlaConfig.isl + 1500
agg enable_mixed_chunk = true

agg_prefill_decode cuda_graph_batch_sizes = ((range(1, max_batch_size + 1) | list) if max_batch_size else [])

# GPUs per worker follow the same TP/PP/DP product that SGLang expects
agg_prefill_decode gpus_per_worker = (tensor_parallel_size or 1) * (pipeline_parallel_size or 1) * (data_parallel_size or 1)

agg_prefill_decode kv_cache_dtype = ("fp8_e4m3" if kv_cache_dtype == "fp8" else kv_cache_dtype)
prefill_decode kv_transfer_backend = (kv_transfer_backend if kv_transfer_backend else "nixl")

when (ModelConfig.prefix or 0) > 0:
    disable_prefix_cache = false
    DynConfig.enable_router = true

when (ModelConfig.nextn or 0) > 0:
    speculative_decoding_type = "NEXTN"
    speculative_num_steps = ModelConfig.nextn
```
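The rule expressions use a small Python/Jinja-like expression language (note the `range(...) | list` filter syntax). As a rough illustration of what the `cuda_graph_batch_sizes` rule computes, here is a plain-Python sketch; the standalone function and its name are mine, not part of the rule-plugin API:

```python
def cuda_graph_batch_sizes(max_batch_size):
    # Mirrors ((range(1, max_batch_size + 1) | list) if max_batch_size else []):
    # capture every batch size from 1..max_batch_size when one is set,
    # otherwise fall back to an empty list (no CUDA graphs captured).
    return list(range(1, max_batch_size + 1)) if max_batch_size else []

# cuda_graph_batch_sizes(4)    -> [1, 2, 3, 4]
# cuda_graph_batch_sizes(None) -> []
```

A falsy `max_batch_size` (`None` or `0`) disables CUDA-graph capture entirely, which is why the rule guards with `if max_batch_size` rather than checking for `None` alone.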
src/aiconfigurator/generator/rule_plugin/benchmark/trtllm.rule (31 additions, 0 deletions)
```
prefill max_batch_size = (max_batch_size if max_batch_size else 1)
agg_decode max_batch_size = (max_batch_size if max_batch_size else 128)

prefill disable_overlap_scheduler = true
decode disable_overlap_scheduler = false
agg disable_overlap_scheduler = false

prefill max_num_tokens = SlaConfig.isl + 1500
decode max_num_tokens = max_batch_size
agg max_num_tokens = max_batch_size + SlaConfig.isl + 1500

agg_prefill_decode cuda_graph_batch_sizes = ((range(1, max_batch_size + 1) | list) if max_batch_size else [])

# Enforce TensorRT-LLM MoE parallelism: moe_tp × moe_ep = tp
when ModelConfig.is_moe and (moe_tensor_parallel_size and moe_expert_parallel_size):
    agg_prefill_decode tensor_parallel_size = moe_tensor_parallel_size * moe_expert_parallel_size

# GPUs per worker (fallback to 1 if any dimension missing)
agg_prefill_decode gpus_per_worker = (tensor_parallel_size or 1) * (pipeline_parallel_size or 1) * (data_parallel_size or 1)

agg_prefill_decode enable_attention_dp = ((data_parallel_size or 1) > 1) and ModelConfig.is_moe

when (ModelConfig.prefix or 0) > 0:
    agg_prefill_decode disable_prefix_cache = false
    DynConfig.enable_router = true

# Speculative decoding
when (ModelConfig.nextn or 0) > 0:
    agg_decode speculative_decoding_type = "MTP"
    agg_decode num_nextn_predict_layers = ModelConfig.nextn
```
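The parallelism arithmetic in trtllm.rule reduces to two small formulas: the TP/PP/DP product that sizes each worker, and the MoE constraint that ties tensor parallelism to `moe_tp * moe_ep`. A hypothetical plain-Python rendering (function names are illustrative, not identifiers from aiconfigurator):

```python
def gpus_per_worker(tp=None, pp=None, dp=None):
    # Each missing parallelism dimension falls back to 1, as in the rule:
    # (tensor_parallel_size or 1) * (pipeline_parallel_size or 1) * (data_parallel_size or 1)
    return (tp or 1) * (pp or 1) * (dp or 1)

def moe_tensor_parallel_size(moe_tp, moe_ep):
    # TensorRT-LLM MoE parallelism constraint: tp = moe_tp * moe_ep.
    return moe_tp * moe_ep

# gpus_per_worker(tp=4, pp=2, dp=2) -> 16
# moe_tensor_parallel_size(2, 4)   -> 8
```

Using `or 1` rather than a default argument matches the rule's behavior when a dimension is present but set to `None` or `0`.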
src/aiconfigurator/generator/rule_plugin/benchmark/vllm.rule (21 additions, 0 deletions)
```
prefill max_batch_size = (max_batch_size if max_batch_size else 1)
decode max_batch_size = (max_batch_size if max_batch_size else 128)

agg_prefill_decode gpus_per_worker = (tensor_parallel_size or 1) * (pipeline_parallel_size or 1) * (data_parallel_size or 1)
agg_prefill_decode enable_expert_parallel = ((moe_expert_parallel_size or 1) > 1)

agg_prefill_decode cuda_graph_batch_sizes = ((range(1, max_batch_size + 1) | list) if max_batch_size else [])

prefill max_num_tokens = (SlaConfig.isl or 0) + 1500
decode max_num_tokens = max_batch_size
agg max_num_tokens = (max_batch_size or 0) + (SlaConfig.isl or 0) + 1500
agg max_seq_len = (SlaConfig.isl or 0) + (SlaConfig.osl or 0) + 1500

when (ModelConfig.prefix or 0) > 0:
    disable_prefix_cache = false
    DynConfig.enable_router = true

when (ModelConfig.nextn or 0) > 0:
    speculative_decoding_type = "mtp"
    num_nextn_predict_layers = ModelConfig.nextn
```
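The token-budget rules above all add a fixed 1500-token slack on top of the SLA input/output lengths. A hypothetical Python sketch of that arithmetic (the `HEADROOM` constant and function names are my labels, not identifiers from the codebase):

```python
HEADROOM = 1500  # fixed slack the rules add on top of isl/osl

def prefill_max_num_tokens(isl):
    # prefill max_num_tokens = (SlaConfig.isl or 0) + 1500
    return (isl or 0) + HEADROOM

def agg_max_num_tokens(max_batch_size, isl):
    # agg max_num_tokens = (max_batch_size or 0) + (SlaConfig.isl or 0) + 1500
    return (max_batch_size or 0) + (isl or 0) + HEADROOM

def agg_max_seq_len(isl, osl):
    # agg max_seq_len = (SlaConfig.isl or 0) + (SlaConfig.osl or 0) + 1500
    return (isl or 0) + (osl or 0) + HEADROOM

# agg_max_num_tokens(128, 4096) -> 5724
# agg_max_seq_len(4096, 1024)   -> 6620
```

The `(x or 0)` guards make the vLLM budgets degrade gracefully when an SLA field is unset, unlike the trtllm.rule variants, which assume `SlaConfig.isl` is always present.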
Do we want to include cuda_graph_batch_sizes in vllm.rule as well?
Thanks for the review. I've added cuda_graph_batch_sizes to vllm.rule on line 8 to stay consistent with trtllm.rule and sglang.rule. Please let me know if this isn't appropriate.