Skip to content

Commit 359765d

Browse files
authored
feat: load-based scaling in SLA Planner (ai-dynamo#6145)
Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
1 parent 815b129 commit 359765d

27 files changed

+2833
-410
lines changed

components/src/dynamo/planner/README.md

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,5 +19,28 @@ limitations under the License.
1919

2020
SLA-driven autoscaling controller for Dynamo inference graphs.
2121

22-
- **User docs**: [docs/planner/](/docs/pages/components/planner/) (deployment, configuration, examples)
23-
- **Design docs**: [docs/pages/design-docs/planner-design.md](/docs/pages/design-docs/planner-design.md) (architecture, algorithms)
22+
## Scaling Modes
23+
24+
The SLA Planner supports two scaling modes that can be used independently or together:
25+
26+
### Throughput-Based Scaling
27+
28+
Uses pre-deployment profiling data and traffic prediction to compute the number of prefill/decode replicas needed to meet TTFT and ITL SLA targets. Requires profiling data from the Dynamo profiler.
29+
30+
### Load-Based Scaling (Experimental)
31+
32+
Uses real-time per-worker load metrics (active prefill tokens, active KV blocks) from the router to make SLA-aware scaling decisions via online linear regression. Does not require profiling data. Responds quickly to traffic bursts.
33+
34+
When both modes are enabled, throughput-based scaling provides a lower bound on replicas while load-based scaling handles real-time adjustments.
35+
36+
### Support Matrix
37+
38+
| Deployment Type | Throughput-Based | Load-Based (Experimental) |
39+
|-----------------|:----------------:|:-------------------------:|
40+
| Disaggregated | Supported | Supported |
41+
| Aggregated | Unsupported | Supported |
42+
43+
## Documentation
44+
45+
- **User docs**: [Planner Guide](../../../../docs/pages/components/planner/planner-guide.md) (deployment, configuration, examples)
46+
- **Design docs**: [Planner Design](../../../../docs/pages/design-docs/planner-design.md) (architecture, algorithms)

components/src/dynamo/planner/__init__.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,17 +5,12 @@
55
"PlannerConnector",
66
"KubernetesConnector",
77
"VirtualConnector",
8-
"LoadPlannerDefaults",
98
"SLAPlannerDefaults",
109
"TargetReplica",
1110
"SubComponentType",
1211
]
1312
# Import the classes
14-
from dynamo.planner.defaults import (
15-
LoadPlannerDefaults,
16-
SLAPlannerDefaults,
17-
SubComponentType,
18-
)
13+
from dynamo.planner.defaults import SLAPlannerDefaults, SubComponentType
1914
from dynamo.planner.kubernetes_connector import KubernetesConnector, TargetReplica
2015
from dynamo.planner.planner_connector import PlannerConnector
2116
from dynamo.planner.virtual_connector import VirtualConnector

components/src/dynamo/planner/defaults.py

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -48,14 +48,6 @@ class BasePlannerDefaults:
4848
metric_reporting_prometheus_port = int(os.environ.get("PLANNER_PROMETHEUS_PORT", 0))
4949

5050

51-
class LoadPlannerDefaults(BasePlannerDefaults):
52-
metric_pulling_interval = 10 # in seconds
53-
decode_kv_scale_up_threshold = 0.9
54-
decode_kv_scale_down_threshold = 0.5
55-
prefill_queue_scale_up_threshold = 5.0
56-
prefill_queue_scale_down_threshold = 0.2
57-
58-
5951
class SLAPlannerDefaults(BasePlannerDefaults):
6052
# Prometheus endpoint URL for pulling/querying metrics
6153
metric_pulling_prometheus_endpoint = os.environ.get(
@@ -81,6 +73,20 @@ class SLAPlannerDefaults(BasePlannerDefaults):
8173
no_correction = False # disable correction factor, might be useful under some conditions like long cold start time
8274
mode = "disagg" # ["disagg", "prefill", "decode"]
8375

76+
# Scaling mode flags
77+
enable_throughput_scaling = True
78+
enable_loadbased_scaling = False
79+
80+
# Load-based scaling settings
81+
loadbased_router_metrics_url: Optional[
82+
str
83+
] = None # will be auto-discovered from the DGD in kubernetes mode if not provided
84+
loadbased_adjustment_interval = 5 # in seconds, must be < adjustment_interval
85+
loadbased_learning_window = 50 # sliding window size for regression
86+
loadbased_scaling_down_sensitivity = 80 # 0-100
87+
loadbased_metric_samples = 10 # number of samples per interval
88+
loadbased_min_observations = 5 # cold start threshold
89+
8490

8591
class VllmComponentName:
8692
prefill_worker_k8s_name = "VllmPrefillWorker"

components/src/dynamo/planner/kubernetes_connector.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,28 @@ def get_gpu_counts(
278278

279279
return prefill_gpu_count, decode_gpu_count
280280

281+
def get_frontend_metrics_url(self, port: int = 8000) -> Optional[str]:
    """Auto-discover the Frontend service's metrics URL from the DGD.

    Scans ``spec.services`` of the graph deployment for the entry whose
    ``componentType`` is ``"frontend"``, then builds the in-cluster URL
    following the operator's naming convention:
    ``http://{dgd_name}-{service_key_lowercase}:{port}/metrics``.

    Args:
        port: Port the frontend exposes its metrics endpoint on.

    Returns:
        The metrics URL string, or None if no frontend service is found.
    """
    dgd = self.kube_api.get_graph_deployment(self.graph_deployment_name)
    service_map = dgd.get("spec", {}).get("services", {})

    # First service (in declaration order) whose componentType is "frontend".
    frontend_key = next(
        (
            key
            for key, spec in service_map.items()
            if spec.get("componentType", "") == "frontend"
        ),
        None,
    )
    if frontend_key is None:
        return None

    # Operator names the Service "<dgd-name>-<service-key-lowercased>".
    service_name = f"{self.graph_deployment_name}-{frontend_key.lower()}"
    url = f"http://{service_name}:{port}/metrics"
    logger.info(f"Auto-discovered frontend metrics URL: {url}")
    return url
302+
281303
async def wait_for_deployment_ready(self):
282304
"""Wait for the deployment to be ready"""
283305
await self.kube_api.wait_for_graph_deployment_ready(

components/src/dynamo/planner/planner_sla.py

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,20 @@
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
1515

16+
import argparse
1617
import asyncio
1718
import logging
1819

1920
from pydantic import BaseModel
2021

21-
from dynamo.planner.utils.planner_argparse import create_sla_planner_parser
22-
from dynamo.planner.utils.planner_core import start_sla_planner
22+
from dynamo.planner.utils.agg_planner import AggPlanner
23+
from dynamo.planner.utils.decode_planner import DecodePlanner
24+
from dynamo.planner.utils.disagg_planner import DisaggPlanner
25+
from dynamo.planner.utils.planner_argparse import (
26+
create_sla_planner_parser,
27+
validate_sla_planner_args,
28+
)
29+
from dynamo.planner.utils.prefill_planner import PrefillPlanner
2330
from dynamo.runtime import DistributedRuntime, dynamo_worker
2431

2532
logger = logging.getLogger(__name__)
@@ -33,6 +40,24 @@ class RequestType(BaseModel):
3340
text: str
3441

3542

43+
async def start_sla_planner(runtime: DistributedRuntime, args: argparse.Namespace):
    """Validate arguments, build the planner for the configured mode, and run it.

    Args:
        runtime: The distributed runtime handle passed through to the planner.
        args: Parsed SLA-planner CLI arguments; ``args.mode`` selects the
            planner implementation (defaults to "disagg" when absent).

    Raises:
        ValueError: If ``args.mode`` is not one of the supported modes.
    """
    validate_sla_planner_args(args)

    # Map each supported mode to its planner implementation.
    planner_classes = {
        "disagg": DisaggPlanner,
        "prefill": PrefillPlanner,
        "decode": DecodePlanner,
        "agg": AggPlanner,
    }
    mode = getattr(args, "mode", "disagg")
    if mode not in planner_classes:
        raise ValueError(f"Invalid planner mode: {mode}")

    planner = planner_classes[mode](runtime, args)
    await planner._async_init()
    await planner.run()
60+
3661
@dynamo_worker()
3762
async def init_planner(runtime: DistributedRuntime, args):
3863
await asyncio.sleep(INIT_PLANNER_START_DELAY)

0 commit comments

Comments
 (0)