Skip to content

Commit 6846bd7

Browse files
anfredette and claude authored
and committed
Standardize SLO metrics to p90 and add minimum QPS constraint
Changed all E2E latency SLO targets from p95 to p90 for consistency across the system. All SLO targets now use p90 percentile (TTFT p90, TPOT p90, E2E p90). Key changes: - Updated SLO templates with calculated e2e_p90 values using formula: ttft_p90 + (generation_tokens_mean - 1) * tpot_p90 - Modified schemas to use e2e_p90_target_ms and predicted_e2e_p90_ms - Updated capacity planner to calculate p90 E2E latency (removed 1.2x p95 buffer) - Changed all UI labels and metrics from "E2E p95" to "E2E p90" - Updated documentation, test files, and data files for consistency - Added minimum QPS of 0.1 in traffic profile generation to handle small workloads (prevents errors with <100 user scenarios) This change makes SLO percentiles consistent across all metrics and provides more realistic E2E latency targets based on actual token generation requirements. Co-Authored-By: Claude <noreply@anthropic.com> Signed-off-by: Andre Fredette <afredette@redhat.com>
1 parent 6d30658 commit 6846bd7

22 files changed

+87
-87
lines changed

CLAUDE.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ The system translates high-level user intent into technical specifications:
6161
- **User says**: "I need a chatbot for 1000 users, low latency is critical"
6262
- **System generates**:
6363
- Traffic profile (avg prompt: 150 tokens, gen: 200 tokens, peak QPS: 100)
64-
- SLO targets (TTFT p90: 200ms, TPOT p90: 50ms, E2E p95: 2000ms)
64+
- SLO targets (TTFT p90: 200ms, TPOT p90: 50ms, E2E p90: 10150ms)
6565
- GPU capacity plan (e.g., "2x NVIDIA L4 GPUs, independent replicas")
6666
- Cost estimate ($800/month)
6767

@@ -110,7 +110,7 @@ The system translates high-level user intent into technical specifications:
110110
3. **SLO metrics are mandatory**:
111111
- TTFT (Time to First Token): p50, p90, p99 - **stored in benchmarks**
112112
- TPOT (Time Per Output Token): p50, p90, p99 - **stored in benchmarks**
113-
- E2E Latency: p50, p95, p99 - **calculated dynamically** from TTFT + (generation_tokens × TPOT)
113+
- E2E Latency: p50, p90, p99 - **calculated dynamically** from TTFT + (generation_tokens × TPOT)
114114
- Throughput: requests/sec and tokens/sec
115115
- Rationale: E2E latency varies by workload (generation length, streaming mode, use case), so it's calculated per-request rather than stored as a fixed benchmark value
116116

backend/TESTING.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ print()
7676
print('SLO Targets:')
7777
print(f' TTFT p90: {slo.ttft_p90_target_ms}ms')
7878
print(f' TPOT p90: {slo.tpot_p90_target_ms}ms')
79-
print(f' E2E p95: {slo.e2e_p95_target_ms}ms')
79+
print(f' E2E p90: {slo.e2e_p90_target_ms}ms')
8080
"
8181
```
8282

backend/src/api/routes.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -521,7 +521,7 @@ async def get_deployment_status(deployment_id: str):
521521
"tpot_p90_ms": base_tpot + random.randint(-3, 5),
522522
"tpot_target_ms": 50,
523523
"tpot_compliant": True,
524-
"e2e_p95_ms": base_e2e + random.randint(-50, 100),
524+
"e2e_p90_ms": base_e2e + random.randint(-50, 100),
525525
"e2e_target_ms": 2000,
526526
"e2e_compliant": True,
527527
"throughput_qps": 122 + random.randint(-5, 10),

backend/src/context_intent/schema.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ class SLOTargets(BaseModel):
2323

2424
ttft_p90_target_ms: int = Field(..., description="Time to First Token p90 target (ms)")
2525
tpot_p90_target_ms: int = Field(..., description="Time Per Output Token p90 target (ms)")
26-
e2e_p95_target_ms: int = Field(..., description="End-to-end latency p95 target (ms)")
26+
e2e_p90_target_ms: int = Field(..., description="End-to-end latency p90 target (ms)")
2727

2828

2929
class GPUConfig(BaseModel):
@@ -93,7 +93,7 @@ class DeploymentRecommendation(BaseModel):
9393
# Performance predictions
9494
predicted_ttft_p90_ms: int
9595
predicted_tpot_p90_ms: int
96-
predicted_e2e_p95_ms: int
96+
predicted_e2e_p90_ms: int
9797
predicted_throughput_qps: float
9898

9999
# Cost estimation

backend/src/deployment/generator.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ def _prepare_template_context(
163163

164164
# Calculate max_num_seqs based on expected QPS and latency
165165
# Rule of thumb: concurrent requests = QPS × avg_latency_seconds
166-
avg_latency_sec = slo.e2e_p95_target_ms / 1000.0
166+
avg_latency_sec = slo.e2e_p90_target_ms / 1000.0
167167
max_num_seqs = max(32, int(traffic.expected_qps * avg_latency_sec * 1.5))
168168

169169
# Max batched tokens (vLLM parameter)
@@ -210,7 +210,7 @@ def _prepare_template_context(
210210
# SLO targets
211211
"ttft_target": slo.ttft_p90_target_ms,
212212
"tpot_target": slo.tpot_p90_target_ms,
213-
"e2e_target": slo.e2e_p95_target_ms,
213+
"e2e_target": slo.e2e_p90_target_ms,
214214
"target_qps": traffic.expected_qps,
215215
# Traffic profile
216216
"expected_qps": traffic.expected_qps,

backend/src/deployment/templates/vllm-config.yaml.j2

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ performance_tuning:
4545
slo_targets:
4646
ttft_p90_ms: {{ ttft_target }}
4747
tpot_p90_ms: {{ tpot_target }}
48-
e2e_p95_ms: {{ e2e_target }}
48+
e2e_p90_ms: {{ e2e_target }}
4949
target_qps: {{ target_qps }}
5050

5151
traffic_profile:

backend/src/knowledge_base/slo_templates.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ def __init__(self, use_case: str, data: dict):
1919
slo = data["slo_targets"]
2020
self.ttft_p90_target_ms = slo["ttft_p90_target_ms"]
2121
self.tpot_p90_target_ms = slo["tpot_p90_target_ms"]
22-
self.e2e_p95_target_ms = slo["e2e_p95_target_ms"]
22+
self.e2e_p90_target_ms = slo["e2e_p90_target_ms"]
2323

2424
# Typical traffic characteristics
2525
traffic = data["typical_traffic"]
@@ -44,7 +44,7 @@ def to_dict(self) -> dict:
4444
"slo_targets": {
4545
"ttft_p90_target_ms": self.ttft_p90_target_ms,
4646
"tpot_p90_target_ms": self.tpot_p90_target_ms,
47-
"e2e_p95_target_ms": self.e2e_p95_target_ms,
47+
"e2e_p90_target_ms": self.e2e_p90_target_ms,
4848
},
4949
"typical_traffic": {
5050
"prompt_tokens_mean": self.prompt_tokens_mean,

backend/src/orchestration/workflow.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,7 @@ def generate_recommendation(
137137
"gpu_config": rec.gpu_config.dict(),
138138
"predicted_ttft_p90_ms": rec.predicted_ttft_p90_ms,
139139
"predicted_tpot_p90_ms": rec.predicted_tpot_p90_ms,
140-
"predicted_e2e_p95_ms": rec.predicted_e2e_p95_ms,
140+
"predicted_e2e_p90_ms": rec.predicted_e2e_p90_ms,
141141
"predicted_throughput_qps": rec.predicted_throughput_qps,
142142
"cost_per_hour_usd": rec.cost_per_hour_usd,
143143
"cost_per_month_usd": rec.cost_per_month_usd,
@@ -237,7 +237,7 @@ def generate_recommendation_from_specs(self, specifications: dict) -> Deployment
237237
"gpu_config": rec.gpu_config.dict(),
238238
"predicted_ttft_p90_ms": rec.predicted_ttft_p90_ms,
239239
"predicted_tpot_p90_ms": rec.predicted_tpot_p90_ms,
240-
"predicted_e2e_p95_ms": rec.predicted_e2e_p95_ms,
240+
"predicted_e2e_p90_ms": rec.predicted_e2e_p90_ms,
241241
"predicted_throughput_qps": rec.predicted_throughput_qps,
242242
"cost_per_hour_usd": rec.cost_per_hour_usd,
243243
"cost_per_month_usd": rec.cost_per_month_usd,
@@ -285,10 +285,10 @@ def validate_recommendation(self, recommendation: DeploymentRecommendation) -> b
285285
return False
286286

287287
# Check E2E
288-
if recommendation.predicted_e2e_p95_ms > recommendation.slo_targets.e2e_p95_target_ms:
288+
if recommendation.predicted_e2e_p90_ms > recommendation.slo_targets.e2e_p90_target_ms:
289289
logger.warning(
290-
f"E2E {recommendation.predicted_e2e_p95_ms}ms exceeds target "
291-
f"{recommendation.slo_targets.e2e_p95_target_ms}ms"
290+
f"E2E {recommendation.predicted_e2e_p90_ms}ms exceeds target "
291+
f"{recommendation.slo_targets.e2e_p90_target_ms}ms"
292292
)
293293
return False
294294

backend/src/recommendation/capacity_planner.py

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -94,10 +94,10 @@ def plan_capacity(
9494
predicted_e2e = self._estimate_e2e_latency(bench, traffic_profile)
9595

9696
# Check if E2E also meets target
97-
if predicted_e2e > slo_targets.e2e_p95_target_ms:
97+
if predicted_e2e > slo_targets.e2e_p90_target_ms:
9898
logger.debug(
9999
f"Skipping {bench.gpu_type} TP={bench.tensor_parallel}: "
100-
f"E2E {predicted_e2e}ms > target {slo_targets.e2e_p95_target_ms}ms"
100+
f"E2E {predicted_e2e}ms > target {slo_targets.e2e_p90_target_ms}ms"
101101
)
102102
continue
103103

@@ -135,7 +135,7 @@ def plan_capacity(
135135
gpu_config=gpu_config,
136136
predicted_ttft_p90_ms=bench.ttft_p90_ms,
137137
predicted_tpot_p90_ms=bench.tpot_p90_ms,
138-
predicted_e2e_p95_ms=predicted_e2e,
138+
predicted_e2e_p90_ms=predicted_e2e,
139139
predicted_throughput_qps=bench.max_qps * replicas,
140140
cost_per_hour_usd=cost_per_hour,
141141
cost_per_month_usd=cost_per_month,
@@ -164,7 +164,7 @@ def plan_capacity(
164164
"gpu_config": rec.gpu_config.dict(),
165165
"predicted_ttft_p90_ms": rec.predicted_ttft_p90_ms,
166166
"predicted_tpot_p90_ms": rec.predicted_tpot_p90_ms,
167-
"predicted_e2e_p95_ms": rec.predicted_e2e_p95_ms,
167+
"predicted_e2e_p90_ms": rec.predicted_e2e_p90_ms,
168168
"predicted_throughput_qps": rec.predicted_throughput_qps,
169169
"cost_per_hour_usd": rec.cost_per_hour_usd,
170170
"cost_per_month_usd": rec.cost_per_month_usd,
@@ -213,7 +213,7 @@ def _estimate_e2e_latency(self, bench: BenchmarkData, traffic_profile: TrafficPr
213213
traffic_profile: Traffic characteristics
214214
215215
Returns:
216-
Estimated E2E p95 latency (ms)
216+
Estimated E2E p90 latency (ms)
217217
"""
218218
# For streaming: E2E ≈ TTFT + (first ~20 tokens × TPOT)
219219
# This represents the time until the user has a meaningful response
@@ -226,10 +226,7 @@ def _estimate_e2e_latency(self, bench: BenchmarkData, traffic_profile: TrafficPr
226226

227227
e2e_p90 = ttft + (tpot * perceived_gen_tokens)
228228

229-
# Add ~20% buffer for p95
230-
e2e_p95 = int(e2e_p90 * 1.2)
231-
232-
return e2e_p95
229+
return e2e_p90
233230

234231
def _generate_reasoning(
235232
self,

backend/src/recommendation/traffic_profile.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -78,13 +78,13 @@ def generate_slo_targets(self, intent: DeploymentIntent) -> SLOTargets:
7878
template.tpot_p90_target_ms, intent.latency_requirement
7979
)
8080
e2e_target = self._adjust_slo_for_latency(
81-
template.e2e_p95_target_ms, intent.latency_requirement
81+
template.e2e_p90_target_ms, intent.latency_requirement
8282
)
8383

8484
return SLOTargets(
8585
ttft_p90_target_ms=ttft_target,
8686
tpot_p90_target_ms=tpot_target,
87-
e2e_p95_target_ms=e2e_target,
87+
e2e_p90_target_ms=e2e_target,
8888
)
8989

9090
def _estimate_qps(
@@ -121,6 +121,9 @@ def _estimate_qps(
121121
# Apply peak ratio
122122
peak_qps = avg_qps_peak * peak_ratio
123123

124+
# Ensure minimum QPS of 0.1 for small workloads
125+
peak_qps = max(0.1, peak_qps)
126+
124127
return round(peak_qps, 2)
125128

126129
def _adjust_slo_for_latency(self, base_target_ms: int, latency_requirement: str) -> int:
@@ -169,4 +172,4 @@ def _generate_default_slo(self, intent: DeploymentIntent) -> SLOTargets:
169172

170173
ttft, tpot, e2e = slo_map.get(intent.latency_requirement, (500, 80, 5000))
171174

172-
return SLOTargets(ttft_p90_target_ms=ttft, tpot_p90_target_ms=tpot, e2e_p95_target_ms=e2e)
175+
return SLOTargets(ttft_p90_target_ms=ttft, tpot_p90_target_ms=tpot, e2e_p90_target_ms=e2e)

0 commit comments

Comments
 (0)