Skip to content

Commit 6846bd7

Browse files
anfredette and claude authored
and committed
Standardize SLO metrics to p90 and add minimum QPS constraint
Changed all E2E latency SLO targets from p95 to p90 for consistency across the system. All SLO targets now use p90 percentile (TTFT p90, TPOT p90, E2E p90). Key changes: - Updated SLO templates with calculated e2e_p90 values using formula: ttft_p90 + (generation_tokens_mean - 1) * tpot_p90 - Modified schemas to use e2e_p90_target_ms and predicted_e2e_p90_ms - Updated capacity planner to calculate p90 E2E latency (removed 1.2x p95 buffer) - Changed all UI labels and metrics from "E2E p95" to "E2E p90" - Updated documentation, test files, and data files for consistency - Added minimum QPS of 0.1 in traffic profile generation to handle small workloads (prevents errors with <100 user scenarios) This change makes SLO percentiles consistent across all metrics and provides more realistic E2E latency targets based on actual token generation requirements. Co-Authored-By: Claude <noreply@anthropic.com> Signed-off-by: Andre Fredette <afredette@redhat.com>
1 parent 6d30658 commit 6846bd7

22 files changed

+87
-87
lines changed

CLAUDE.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ The system translates high-level user intent into technical specifications:
6161
- **User says**: "I need a chatbot for 1000 users, low latency is critical"
6262
- **System generates**:
6363
- Traffic profile (avg prompt: 150 tokens, gen: 200 tokens, peak QPS: 100)
64-
- SLO targets (TTFT p90: 200ms, TPOT p90: 50ms, E2E p95: 2000ms)
64+
- SLO targets (TTFT p90: 200ms, TPOT p90: 50ms, E2E p90: 10150ms)
6565
- GPU capacity plan (e.g., "2x NVIDIA L4 GPUs, independent replicas")
6666
- Cost estimate ($800/month)
6767

@@ -110,7 +110,7 @@ The system translates high-level user intent into technical specifications:
110110
3. **SLO metrics are mandatory**:
111111
- TTFT (Time to First Token): p50, p90, p99 - **stored in benchmarks**
112112
- TPOT (Time Per Output Token): p50, p90, p99 - **stored in benchmarks**
113-
- E2E Latency: p50, p95, p99 - **calculated dynamically** from TTFT + (generation_tokens × TPOT)
113+
- E2E Latency: p50, p90, p99 - **calculated dynamically** from TTFT + (generation_tokens × TPOT)
114114
- Throughput: requests/sec and tokens/sec
115115
- Rationale: E2E latency varies by workload (generation length, streaming mode, use case), so it's calculated per-request rather than stored as a fixed benchmark value
116116

backend/TESTING.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ print()
7676
print('SLO Targets:')
7777
print(f' TTFT p90: {slo.ttft_p90_target_ms}ms')
7878
print(f' TPOT p90: {slo.tpot_p90_target_ms}ms')
79-
print(f' E2E p95: {slo.e2e_p95_target_ms}ms')
79+
print(f' E2E p90: {slo.e2e_p90_target_ms}ms')
8080
"
8181
```
8282

backend/src/api/routes.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -521,7 +521,7 @@ async def get_deployment_status(deployment_id: str):
521521
"tpot_p90_ms": base_tpot + random.randint(-3, 5),
522522
"tpot_target_ms": 50,
523523
"tpot_compliant": True,
524-
"e2e_p95_ms": base_e2e + random.randint(-50, 100),
524+
"e2e_p90_ms": base_e2e + random.randint(-50, 100),
525525
"e2e_target_ms": 2000,
526526
"e2e_compliant": True,
527527
"throughput_qps": 122 + random.randint(-5, 10),

backend/src/context_intent/schema.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ class SLOTargets(BaseModel):
2323

2424
ttft_p90_target_ms: int = Field(..., description="Time to First Token p90 target (ms)")
2525
tpot_p90_target_ms: int = Field(..., description="Time Per Output Token p90 target (ms)")
26-
e2e_p95_target_ms: int = Field(..., description="End-to-end latency p95 target (ms)")
26+
e2e_p90_target_ms: int = Field(..., description="End-to-end latency p90 target (ms)")
2727

2828

2929
class GPUConfig(BaseModel):
@@ -93,7 +93,7 @@ class DeploymentRecommendation(BaseModel):
9393
# Performance predictions
9494
predicted_ttft_p90_ms: int
9595
predicted_tpot_p90_ms: int
96-
predicted_e2e_p95_ms: int
96+
predicted_e2e_p90_ms: int
9797
predicted_throughput_qps: float
9898

9999
# Cost estimation

backend/src/deployment/generator.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ def _prepare_template_context(
163163

164164
# Calculate max_num_seqs based on expected QPS and latency
165165
# Rule of thumb: concurrent requests = QPS × avg_latency_seconds
166-
avg_latency_sec = slo.e2e_p95_target_ms / 1000.0
166+
avg_latency_sec = slo.e2e_p90_target_ms / 1000.0
167167
max_num_seqs = max(32, int(traffic.expected_qps * avg_latency_sec * 1.5))
168168

169169
# Max batched tokens (vLLM parameter)
@@ -210,7 +210,7 @@ def _prepare_template_context(
210210
# SLO targets
211211
"ttft_target": slo.ttft_p90_target_ms,
212212
"tpot_target": slo.tpot_p90_target_ms,
213-
"e2e_target": slo.e2e_p95_target_ms,
213+
"e2e_target": slo.e2e_p90_target_ms,
214214
"target_qps": traffic.expected_qps,
215215
# Traffic profile
216216
"expected_qps": traffic.expected_qps,

backend/src/deployment/templates/vllm-config.yaml.j2

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ performance_tuning:
4545
slo_targets:
4646
ttft_p90_ms: {{ ttft_target }}
4747
tpot_p90_ms: {{ tpot_target }}
48-
e2e_p95_ms: {{ e2e_target }}
48+
e2e_p90_ms: {{ e2e_target }}
4949
target_qps: {{ target_qps }}
5050

5151
traffic_profile:

backend/src/knowledge_base/slo_templates.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ def __init__(self, use_case: str, data: dict):
1919
slo = data["slo_targets"]
2020
self.ttft_p90_target_ms = slo["ttft_p90_target_ms"]
2121
self.tpot_p90_target_ms = slo["tpot_p90_target_ms"]
22-
self.e2e_p95_target_ms = slo["e2e_p95_target_ms"]
22+
self.e2e_p90_target_ms = slo["e2e_p90_target_ms"]
2323

2424
# Typical traffic characteristics
2525
traffic = data["typical_traffic"]
@@ -44,7 +44,7 @@ def to_dict(self) -> dict:
4444
"slo_targets": {
4545
"ttft_p90_target_ms": self.ttft_p90_target_ms,
4646
"tpot_p90_target_ms": self.tpot_p90_target_ms,
47-
"e2e_p95_target_ms": self.e2e_p95_target_ms,
47+
"e2e_p90_target_ms": self.e2e_p90_target_ms,
4848
},
4949
"typical_traffic": {
5050
"prompt_tokens_mean": self.prompt_tokens_mean,

backend/src/orchestration/workflow.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,7 @@ def generate_recommendation(
137137
"gpu_config": rec.gpu_config.dict(),
138138
"predicted_ttft_p90_ms": rec.predicted_ttft_p90_ms,
139139
"predicted_tpot_p90_ms": rec.predicted_tpot_p90_ms,
140-
"predicted_e2e_p95_ms": rec.predicted_e2e_p95_ms,
140+
"predicted_e2e_p90_ms": rec.predicted_e2e_p90_ms,
141141
"predicted_throughput_qps": rec.predicted_throughput_qps,
142142
"cost_per_hour_usd": rec.cost_per_hour_usd,
143143
"cost_per_month_usd": rec.cost_per_month_usd,
@@ -237,7 +237,7 @@ def generate_recommendation_from_specs(self, specifications: dict) -> Deployment
237237
"gpu_config": rec.gpu_config.dict(),
238238
"predicted_ttft_p90_ms": rec.predicted_ttft_p90_ms,
239239
"predicted_tpot_p90_ms": rec.predicted_tpot_p90_ms,
240-
"predicted_e2e_p95_ms": rec.predicted_e2e_p95_ms,
240+
"predicted_e2e_p90_ms": rec.predicted_e2e_p90_ms,
241241
"predicted_throughput_qps": rec.predicted_throughput_qps,
242242
"cost_per_hour_usd": rec.cost_per_hour_usd,
243243
"cost_per_month_usd": rec.cost_per_month_usd,
@@ -285,10 +285,10 @@ def validate_recommendation(self, recommendation: DeploymentRecommendation) -> b
285285
return False
286286

287287
# Check E2E
288-
if recommendation.predicted_e2e_p95_ms > recommendation.slo_targets.e2e_p95_target_ms:
288+
if recommendation.predicted_e2e_p90_ms > recommendation.slo_targets.e2e_p90_target_ms:
289289
logger.warning(
290-
f"E2E {recommendation.predicted_e2e_p95_ms}ms exceeds target "
291-
f"{recommendation.slo_targets.e2e_p95_target_ms}ms"
290+
f"E2E {recommendation.predicted_e2e_p90_ms}ms exceeds target "
291+
f"{recommendation.slo_targets.e2e_p90_target_ms}ms"
292292
)
293293
return False
294294

backend/src/recommendation/capacity_planner.py

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -94,10 +94,10 @@ def plan_capacity(
9494
predicted_e2e = self._estimate_e2e_latency(bench, traffic_profile)
9595

9696
# Check if E2E also meets target
97-
if predicted_e2e > slo_targets.e2e_p95_target_ms:
97+
if predicted_e2e > slo_targets.e2e_p90_target_ms:
9898
logger.debug(
9999
f"Skipping {bench.gpu_type} TP={bench.tensor_parallel}: "
100-
f"E2E {predicted_e2e}ms > target {slo_targets.e2e_p95_target_ms}ms"
100+
f"E2E {predicted_e2e}ms > target {slo_targets.e2e_p90_target_ms}ms"
101101
)
102102
continue
103103

@@ -135,7 +135,7 @@ def plan_capacity(
135135
gpu_config=gpu_config,
136136
predicted_ttft_p90_ms=bench.ttft_p90_ms,
137137
predicted_tpot_p90_ms=bench.tpot_p90_ms,
138-
predicted_e2e_p95_ms=predicted_e2e,
138+
predicted_e2e_p90_ms=predicted_e2e,
139139
predicted_throughput_qps=bench.max_qps * replicas,
140140
cost_per_hour_usd=cost_per_hour,
141141
cost_per_month_usd=cost_per_month,
@@ -164,7 +164,7 @@ def plan_capacity(
164164
"gpu_config": rec.gpu_config.dict(),
165165
"predicted_ttft_p90_ms": rec.predicted_ttft_p90_ms,
166166
"predicted_tpot_p90_ms": rec.predicted_tpot_p90_ms,
167-
"predicted_e2e_p95_ms": rec.predicted_e2e_p95_ms,
167+
"predicted_e2e_p90_ms": rec.predicted_e2e_p90_ms,
168168
"predicted_throughput_qps": rec.predicted_throughput_qps,
169169
"cost_per_hour_usd": rec.cost_per_hour_usd,
170170
"cost_per_month_usd": rec.cost_per_month_usd,
@@ -213,7 +213,7 @@ def _estimate_e2e_latency(self, bench: BenchmarkData, traffic_profile: TrafficPr
213213
traffic_profile: Traffic characteristics
214214
215215
Returns:
216-
Estimated E2E p95 latency (ms)
216+
Estimated E2E p90 latency (ms)
217217
"""
218218
# For streaming: E2E ≈ TTFT + (first ~20 tokens × TPOT)
219219
# This represents the time until the user has a meaningful response
@@ -226,10 +226,7 @@ def _estimate_e2e_latency(self, bench: BenchmarkData, traffic_profile: TrafficPr
226226

227227
e2e_p90 = ttft + (tpot * perceived_gen_tokens)
228228

229-
# Add ~20% buffer for p95
230-
e2e_p95 = int(e2e_p90 * 1.2)
231-
232-
return e2e_p95
229+
return e2e_p90
233230

234231
def _generate_reasoning(
235232
self,

backend/src/recommendation/traffic_profile.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -78,13 +78,13 @@ def generate_slo_targets(self, intent: DeploymentIntent) -> SLOTargets:
7878
template.tpot_p90_target_ms, intent.latency_requirement
7979
)
8080
e2e_target = self._adjust_slo_for_latency(
81-
template.e2e_p95_target_ms, intent.latency_requirement
81+
template.e2e_p90_target_ms, intent.latency_requirement
8282
)
8383

8484
return SLOTargets(
8585
ttft_p90_target_ms=ttft_target,
8686
tpot_p90_target_ms=tpot_target,
87-
e2e_p95_target_ms=e2e_target,
87+
e2e_p90_target_ms=e2e_target,
8888
)
8989

9090
def _estimate_qps(
@@ -121,6 +121,9 @@ def _estimate_qps(
121121
# Apply peak ratio
122122
peak_qps = avg_qps_peak * peak_ratio
123123

124+
# Ensure minimum QPS of 0.1 for small workloads
125+
peak_qps = max(0.1, peak_qps)
126+
124127
return round(peak_qps, 2)
125128

126129
def _adjust_slo_for_latency(self, base_target_ms: int, latency_requirement: str) -> int:
@@ -169,4 +172,4 @@ def _generate_default_slo(self, intent: DeploymentIntent) -> SLOTargets:
169172

170173
ttft, tpot, e2e = slo_map.get(intent.latency_requirement, (500, 80, 5000))
171174

172-
return SLOTargets(ttft_p90_target_ms=ttft, tpot_p90_target_ms=tpot, e2e_p95_target_ms=e2e)
175+
return SLOTargets(ttft_p90_target_ms=ttft, tpot_p90_target_ms=tpot, e2e_p90_target_ms=e2e)

0 commit comments

Comments
 (0)