Skip to content

Commit 359765d

Browse files
authored
feat: load-based scaling in SLA Planner (ai-dynamo#6145)
Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
1 parent 815b129 commit 359765d

27 files changed

+2833
-410
lines changed

components/src/dynamo/planner/README.md

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,5 +19,28 @@ limitations under the License.
1919

2020
SLA-driven autoscaling controller for Dynamo inference graphs.
2121

22-
- **User docs**: [docs/planner/](/docs/pages/components/planner/) (deployment, configuration, examples)
23-
- **Design docs**: [docs/pages/design-docs/planner-design.md](/docs/pages/design-docs/planner-design.md) (architecture, algorithms)
22+
## Scaling Modes
23+
24+
The SLA Planner supports two scaling modes that can be used independently or together:
25+
26+
### Throughput-Based Scaling
27+
28+
Uses pre-deployment profiling data and traffic prediction to compute the number of prefill/decode replicas needed to meet TTFT and ITL SLA targets. Requires profiling data from the Dynamo profiler.
29+
30+
### Load-Based Scaling (Experimental)
31+
32+
Uses real-time per-worker load metrics (active prefill tokens, active KV blocks) from the router to make SLA-aware scaling decisions via online linear regression. Does not require profiling data. Responds quickly to traffic bursts.
33+
34+
When both modes are enabled, throughput-based scaling provides a lower bound on replicas while load-based scaling handles real-time adjustments.
35+
36+
### Support Matrix
37+
38+
| Deployment Type | Throughput-Based | Load-Based (Experimental) |
39+
|-----------------|:----------------:|:-------------------------:|
40+
| Disaggregated | Supported | Supported |
41+
| Aggregated | Unsupported | Supported |
42+
43+
## Documentation
44+
45+
- **User docs**: [Planner Guide](../../../../docs/pages/components/planner/planner-guide.md) (deployment, configuration, examples)
46+
- **Design docs**: [Planner Design](../../../../docs/pages/design-docs/planner-design.md) (architecture, algorithms)

components/src/dynamo/planner/__init__.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,17 +5,12 @@
55
"PlannerConnector",
66
"KubernetesConnector",
77
"VirtualConnector",
8-
"LoadPlannerDefaults",
98
"SLAPlannerDefaults",
109
"TargetReplica",
1110
"SubComponentType",
1211
]
1312
# Import the classes
14-
from dynamo.planner.defaults import (
15-
LoadPlannerDefaults,
16-
SLAPlannerDefaults,
17-
SubComponentType,
18-
)
13+
from dynamo.planner.defaults import SLAPlannerDefaults, SubComponentType
1914
from dynamo.planner.kubernetes_connector import KubernetesConnector, TargetReplica
2015
from dynamo.planner.planner_connector import PlannerConnector
2116
from dynamo.planner.virtual_connector import VirtualConnector

components/src/dynamo/planner/defaults.py

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -48,14 +48,6 @@ class BasePlannerDefaults:
4848
metric_reporting_prometheus_port = int(os.environ.get("PLANNER_PROMETHEUS_PORT", 0))
4949

5050

51-
class LoadPlannerDefaults(BasePlannerDefaults):
52-
metric_pulling_interval = 10 # in seconds
53-
decode_kv_scale_up_threshold = 0.9
54-
decode_kv_scale_down_threshold = 0.5
55-
prefill_queue_scale_up_threshold = 5.0
56-
prefill_queue_scale_down_threshold = 0.2
57-
58-
5951
class SLAPlannerDefaults(BasePlannerDefaults):
6052
# Prometheus endpoint URL for pulling/querying metrics
6153
metric_pulling_prometheus_endpoint = os.environ.get(
@@ -81,6 +73,20 @@ class SLAPlannerDefaults(BasePlannerDefaults):
8173
no_correction = False # disable correction factor, might be useful under some conditions like long cold start time
8274
mode = "disagg" # ["disagg", "prefill", "decode"]
8375

76+
# Scaling mode flags
77+
enable_throughput_scaling = True
78+
enable_loadbased_scaling = False
79+
80+
# Load-based scaling settings
81+
loadbased_router_metrics_url: Optional[
82+
str
83+
] = None # will be auto-discovered from the DGD in kubernetes mode if not provided
84+
loadbased_adjustment_interval = 5 # in seconds, must be < adjustment_interval
85+
loadbased_learning_window = 50 # sliding window size for regression
86+
loadbased_scaling_down_sensitivity = 80 # 0-100
87+
loadbased_metric_samples = 10 # number of samples per interval
88+
loadbased_min_observations = 5 # cold start threshold
89+
8490

8591
class VllmComponentName:
8692
prefill_worker_k8s_name = "VllmPrefillWorker"

components/src/dynamo/planner/kubernetes_connector.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,28 @@ def get_gpu_counts(
278278

279279
return prefill_gpu_count, decode_gpu_count
280280

281+
def get_frontend_metrics_url(self, port: int = 8000) -> Optional[str]:
    """Auto-discover the Frontend service's metrics URL from the DGD.

    Scans ``spec.services`` of the graph deployment for the entry whose
    ``componentType`` is ``"frontend"``, then builds the in-cluster URL
    following the operator's naming convention:
    ``http://{dgd_name}-{service_key_lowercase}:{port}/metrics``.

    Args:
        port: Port the frontend exposes its metrics endpoint on.

    Returns:
        The metrics URL string, or None if no frontend service is found.
    """
    dgd = self.kube_api.get_graph_deployment(self.graph_deployment_name)
    service_map = dgd.get("spec", {}).get("services", {})

    # First service (in declaration order) whose componentType is "frontend".
    frontend_key = next(
        (
            key
            for key, spec in service_map.items()
            if spec.get("componentType", "") == "frontend"
        ),
        None,
    )
    if frontend_key is None:
        return None

    # Operator names the Service "<dgd-name>-<service-key-lowercased>".
    service_name = f"{self.graph_deployment_name}-{frontend_key.lower()}"
    url = f"http://{service_name}:{port}/metrics"
    logger.info(f"Auto-discovered frontend metrics URL: {url}")
    return url
302+
281303
async def wait_for_deployment_ready(self):
282304
"""Wait for the deployment to be ready"""
283305
await self.kube_api.wait_for_graph_deployment_ready(

components/src/dynamo/planner/planner_sla.py

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,20 @@
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
1515

16+
import argparse
1617
import asyncio
1718
import logging
1819

1920
from pydantic import BaseModel
2021

21-
from dynamo.planner.utils.planner_argparse import create_sla_planner_parser
22-
from dynamo.planner.utils.planner_core import start_sla_planner
22+
from dynamo.planner.utils.agg_planner import AggPlanner
23+
from dynamo.planner.utils.decode_planner import DecodePlanner
24+
from dynamo.planner.utils.disagg_planner import DisaggPlanner
25+
from dynamo.planner.utils.planner_argparse import (
26+
create_sla_planner_parser,
27+
validate_sla_planner_args,
28+
)
29+
from dynamo.planner.utils.prefill_planner import PrefillPlanner
2330
from dynamo.runtime import DistributedRuntime, dynamo_worker
2431

2532
logger = logging.getLogger(__name__)
@@ -33,6 +40,24 @@ class RequestType(BaseModel):
3340
text: str
3441

3542

43+
async def start_sla_planner(runtime: DistributedRuntime, args: argparse.Namespace):
    """Validate arguments, build the planner for the configured mode, and run it.

    Args:
        runtime: The distributed runtime handle passed through to the planner.
        args: Parsed SLA-planner CLI arguments; ``args.mode`` selects the
            planner implementation (defaults to "disagg" when absent).

    Raises:
        ValueError: If ``args.mode`` is not one of the supported modes.
    """
    validate_sla_planner_args(args)

    # Map each supported mode to its planner implementation.
    planner_classes = {
        "disagg": DisaggPlanner,
        "prefill": PrefillPlanner,
        "decode": DecodePlanner,
        "agg": AggPlanner,
    }
    mode = getattr(args, "mode", "disagg")
    if mode not in planner_classes:
        raise ValueError(f"Invalid planner mode: {mode}")

    planner = planner_classes[mode](runtime, args)
    await planner._async_init()
    await planner.run()
60+
3661
@dynamo_worker()
3762
async def init_planner(runtime: DistributedRuntime, args):
3863
await asyncio.sleep(INIT_PLANNER_START_DELAY)

0 commit comments

Comments
 (0)