feat: Add backend APIs for SLO defaults and expected RPS calculation

anfredette · anfredette · commit 80a7c0ce8b07 · 2026-01-06T18:09:02.000-05:00
Add two new endpoints to replace hardcoded UI calculations:
- /api/v1/slo-defaults/{use_case}: Returns default SLO targets at the "75th percentile", i.e. 75% of the way between each metric's min and max, rounded to the nearest 5.
- /api/v1/expected-rps/{use_case}: Calculates RPS using research-based workload patterns (user_count * active_fraction * requests_per_min / 60).

Update UI to fetch defaults from backend APIs instead of using heuristics. Fix Streamlit widget state issue where RPS input wouldn't update when use case or user count changed.

Fixes: #50
Assisted-by: Claude <noreply@anthropic.com>
Signed-off-by: Andre Fredette <afredette@redhat.com>
diff --git a/backend/src/api/routes.py b/backend/src/api/routes.py
@@ -247,6 +247,144 @@ async def list_use_cases():
         raise HTTPException(status_code=500, detail=str(e)) from e
 
 
+def _round_to_nearest(value: float, nearest: int = 5) -> int:
+    """Round `value` to the nearest multiple of `nearest`, sending ties upward.
+
+    The built-in ``round`` resolves an exact half to the even neighbour, which
+    made tie handling inconsistent here (12.5 -> 10 but 17.5 -> 20). Adding 0.5
+    and flooring always moves a tie to the higher multiple instead
+    (12.5 -> 15, 17.5 -> 20, 22.5 -> 25).
+
+    Args:
+        value: Number to round.
+        nearest: Step size; the result is an integer multiple of it.
+
+    Returns:
+        The multiple of `nearest` closest to `value`, as an int.
+
+    Raises:
+        ZeroDivisionError: If `nearest` is 0.
+    """
+    import math
+
+    return int(math.floor(value / nearest + 0.5) * nearest)
+
+
+def _calculate_percentile_value(min_val: int, max_val: int, percentile: float = 0.75) -> int:
+    """Pick the point a given fraction of the way from `min_val` to `max_val`.
+
+    Despite the name this is linear interpolation across the range rather than
+    a statistical percentile: 0.0 yields `min_val`, 1.0 yields `max_val`.
+    The interpolated value is then snapped to a multiple of 5 by
+    `_round_to_nearest`.
+    """
+    span = max_val - min_val
+    return _round_to_nearest(min_val + span * percentile, 5)
+
+
+@app.get("/api/v1/slo-defaults/{use_case}")
+async def get_slo_defaults(use_case: str):
+    """Return the SLO range and a suggested default per latency metric.
+
+    For each of ``ttft_ms``, ``itl_ms`` and ``e2e_ms`` the response carries the
+    configured ``min`` and ``max`` plus a ``default`` placed 75% of the way
+    from min to max and rounded to a multiple of 5.
+
+    Args:
+        use_case: Key looked up under ``use_case_slo_workload`` in
+            ``usecase_slo_workload.json``.
+
+    Returns:
+        ``{"success": True, "slo_defaults": {...}}`` where ``slo_defaults``
+        holds ``use_case``, ``description`` and one entry per metric.
+
+    Raises:
+        HTTPException: 404 when the config file or the use case is missing;
+            500 when an SLO field is absent or any other error occurs.
+    """
+    try:
+        import json
+        json_path = Path(__file__).parent.parent.parent.parent / "data" / "business_context" / "use_case" / "configs" / "usecase_slo_workload.json"
+
+        if not json_path.exists():
+            logger.error(f"SLO workload config not found at: {json_path}")
+            raise HTTPException(status_code=404, detail="SLO workload configuration not found")
+
+        # Explicit encoding: the platform default is not UTF-8 everywhere.
+        with open(json_path, encoding="utf-8") as f:
+            data = json.load(f)
+
+        use_case_data = data.get("use_case_slo_workload", {}).get(use_case)
+        if not use_case_data:
+            raise HTTPException(status_code=404, detail=f"Use case '{use_case}' not found")
+
+        slo_targets = use_case_data.get("slo_targets", {})
+
+        # Resolve all three ranges first so a missing metric fails (KeyError
+        # -> 500) before any part of the response is assembled.
+        metric_names = ("ttft_ms", "itl_ms", "e2e_ms")
+        ranges = {name: slo_targets[name] for name in metric_names}
+
+        defaults = {
+            "use_case": use_case,
+            "description": use_case_data.get("description", ""),
+        }
+        for name, bounds in ranges.items():
+            defaults[name] = {
+                "min": bounds["min"],
+                "max": bounds["max"],
+                "default": _calculate_percentile_value(bounds["min"], bounds["max"], 0.75),
+            }
+
+        return {"success": True, "slo_defaults": defaults}
+
+    except HTTPException:
+        # Already carries the intended status code; pass it through untouched.
+        raise
+    except KeyError as e:
+        logger.error(f"Missing SLO data for {use_case}: {e}")
+        raise HTTPException(status_code=500, detail=f"Missing SLO data: {e}") from e
+    except Exception as e:
+        # Unexpected failure: keep the traceback in the log for diagnosis.
+        logger.exception(f"Failed to get SLO defaults for {use_case}: {e}")
+        raise HTTPException(status_code=500, detail=str(e)) from e
+
+
+@app.get("/api/v1/expected-rps/{use_case}")
+async def get_expected_rps(use_case: str, user_count: int = 1000):
+    """Estimate steady-state and peak requests per second for a use case.
+
+    Workload parameters are read from the ``workload`` section of the use
+    case's entry in ``usecase_slo_workload.json``:
+    - active_fraction: percentage of users active at any time (required)
+    - requests_per_active_user_per_min: request rate per active user (required)
+    - peak_multiplier: optional, defaults to 2.0
+    - distribution: optional, defaults to "poisson"
+
+    Formula:
+        expected_concurrent = int(user_count * active_fraction)  (truncated)
+        expected_rps = (expected_concurrent * requests_per_min) / 60,
+            rounded to 2 decimals and never below 1
+        peak_rps = expected_rps * peak_multiplier, rounded to 2 decimals
+
+    Args:
+        use_case: Key looked up under ``use_case_slo_workload``.
+        user_count: Total number of users; must not be negative.
+
+    Returns:
+        Dict with ``success``, ``use_case``, ``user_count``,
+        ``workload_params``, ``expected_rps``, ``expected_concurrent_users``
+        and ``peak_rps``.
+
+    Raises:
+        HTTPException: 422 when ``user_count`` is negative; 404 when the
+            config file or the use case is missing; 500 when a required
+            workload field is absent or any other error occurs.
+    """
+    # A negative population has no meaningful load. Reject it instead of
+    # silently reporting the 1 RPS floor with a negative concurrent-user count.
+    if user_count < 0:
+        raise HTTPException(status_code=422, detail="user_count must be non-negative")
+
+    try:
+        import json
+        json_path = Path(__file__).parent.parent.parent.parent / "data" / "business_context" / "use_case" / "configs" / "usecase_slo_workload.json"
+
+        if not json_path.exists():
+            logger.error(f"SLO workload config not found at: {json_path}")
+            raise HTTPException(status_code=404, detail="SLO workload configuration not found")
+
+        # Explicit encoding: the platform default is not UTF-8 everywhere.
+        with open(json_path, encoding="utf-8") as f:
+            data = json.load(f)
+
+        use_case_data = data.get("use_case_slo_workload", {}).get(use_case)
+        if not use_case_data:
+            raise HTTPException(status_code=404, detail=f"Use case '{use_case}' not found")
+
+        workload = use_case_data.get("workload", {})
+
+        # Required parameters: a missing key raises KeyError -> 500 below.
+        active_fraction = workload["active_fraction"]
+        requests_per_min = workload["requests_per_active_user_per_min"]
+        # Optional parameters fall back to fixed defaults.
+        peak_multiplier = workload.get("peak_multiplier", 2.0)
+        distribution = workload.get("distribution", "poisson")
+
+        # Concurrent users are truncated to a whole number before the rate
+        # is derived from them.
+        expected_concurrent = int(user_count * active_fraction)
+        expected_rps = (expected_concurrent * requests_per_min) / 60
+        expected_rps = max(1, round(expected_rps, 2))  # Minimum 1 RPS, round to 2 decimals
+
+        # Peak is derived from the already rounded and floored expected_rps.
+        peak_rps = expected_rps * peak_multiplier
+
+        return {
+            "success": True,
+            "use_case": use_case,
+            "user_count": user_count,
+            "workload_params": {
+                "active_fraction": active_fraction,
+                "requests_per_active_user_per_min": requests_per_min,
+                "peak_multiplier": peak_multiplier,
+                "distribution": distribution
+            },
+            "expected_rps": expected_rps,
+            "expected_concurrent_users": expected_concurrent,
+            "peak_rps": round(peak_rps, 2)
+        }
+
+    except HTTPException:
+        # Already carries the intended status code; pass it through untouched.
+        raise
+    except KeyError as e:
+        logger.error(f"Missing workload data for {use_case}: {e}")
+        raise HTTPException(status_code=500, detail=f"Missing workload data: {e}") from e
+    except Exception as e:
+        # Unexpected failure: keep the traceback in the log for diagnosis.
+        logger.exception(f"Failed to calculate expected RPS for {use_case}: {e}")
+        raise HTTPException(status_code=500, detail=str(e)) from e
+
+
 # Get benchmark data
 @app.get("/api/v1/benchmarks")
 async def get_benchmarks():
diff --git a/ui/app.py b/ui/app.py
@@ -2020,6 +2020,55 @@ def _get_hardware_selection_reason(priority: str, hw_option: dict, slo_targets:
 # RANKED RECOMMENDATIONS (Backend API Integration)
 # =============================================================================
 
+@st.cache_data(ttl=300)
+def fetch_slo_defaults(use_case: str) -> dict | None:
+    """Fetch default SLO values for a use case from the backend API.
+
+    Args:
+        use_case: Use case identifier; placed in the request path.
+
+    Returns:
+        Dict with ttft_ms, itl_ms, e2e_ms each containing min, max, default,
+        or None when the request fails or the backend does not report success.
+
+    Cached for 5 minutes.
+    NOTE(review): st.cache_data presumably caches the None failure result for
+    the same TTL, so a transient backend outage may stick for up to 5 minutes
+    - confirm against the Streamlit caching docs.
+    """
+    from urllib.parse import quote
+
+    try:
+        # Percent-encode the identifier so characters such as "/", "?" or
+        # spaces cannot alter the request path.
+        path_segment = quote(str(use_case), safe="")
+        response = requests.get(
+            f"{API_BASE_URL}/api/v1/slo-defaults/{path_segment}",
+            timeout=10,
+        )
+        response.raise_for_status()
+        data = response.json()
+        if data.get("success"):
+            return data.get("slo_defaults")
+        return None
+    except Exception as e:
+        # Best effort: callers fall back to built-in defaults on None.
+        logger.warning(f"Failed to fetch SLO defaults for {use_case}: {e}")
+        return None
+
+
+@st.cache_data(ttl=300)
+def fetch_expected_rps(use_case: str, user_count: int) -> dict | None:
+    """Fetch expected RPS for a use case from the backend API.
+
+    Args:
+        use_case: Use case identifier; placed in the request path.
+        user_count: Number of users, sent as the ``user_count`` query parameter.
+
+    Returns:
+        The full response payload when the backend reports success, including:
+        - expected_rps: average requests per second
+        - peak_rps: peak capacity needed
+        - workload_params: active_fraction, requests_per_min, etc.
+        None when the request fails or success is not reported.
+
+    Cached for 5 minutes.
+    NOTE(review): st.cache_data presumably caches the None failure result for
+    the same TTL, so a transient backend outage may stick for up to 5 minutes
+    - confirm against the Streamlit caching docs.
+    """
+    from urllib.parse import quote
+
+    try:
+        # Percent-encode the identifier so characters such as "/", "?" or
+        # spaces cannot alter the request path.
+        path_segment = quote(str(use_case), safe="")
+        response = requests.get(
+            f"{API_BASE_URL}/api/v1/expected-rps/{path_segment}",
+            params={"user_count": user_count},
+            timeout=10,
+        )
+        response.raise_for_status()
+        data = response.json()
+        if data.get("success"):
+            return data
+        return None
+    except Exception as e:
+        # Best effort: callers fall back to a simple heuristic on None.
+        logger.warning(f"Failed to fetch expected RPS for {use_case}: {e}")
+        return None
+
+
 def fetch_ranked_recommendations(
     use_case: str,
     user_count: int,
@@ -4747,10 +4796,27 @@ def render_slo_cards(use_case: str, user_count: int, priority: str = "balanced",
     ttft_default = int(ttft_max_raw * ttft_factor)
     itl_default = int(itl_max_raw * itl_factor)
     e2e_default = int(e2e_max_raw * e2e_factor)
-    
-    # Calculate QPS based on user count
-    estimated_qps = max(1, user_count // 50)
-    
+
+    # Fetch expected RPS from backend using research-based workload patterns
+    rps_data = fetch_expected_rps(use_case, user_count)
+    if rps_data:
+        estimated_qps = int(rps_data.get("expected_rps", 1))
+    else:
+        # Fallback to simple heuristic if API fails
+        estimated_qps = max(1, user_count // 50)
+
+    # Track if use_case or user_count changed - if so, reset custom_qps to use new default
+    last_use_case = st.session_state.get("_last_rps_use_case")
+    last_user_count = st.session_state.get("_last_rps_user_count")
+    if last_use_case != use_case or last_user_count != user_count:
+        # Use case or user count changed - reset to new calculated default
+        st.session_state.custom_qps = None
+        st.session_state._last_rps_use_case = use_case
+        st.session_state._last_rps_user_count = user_count
+        # Clear the widget key to force re-render with new value
+        if "edit_qps" in st.session_state:
+            del st.session_state["edit_qps"]
+
     # Use custom values if set, otherwise use MAX as default (shows all configs)
     # Ensure all values are integers for slider compatibility
     ttft = int(st.session_state.custom_ttft) if st.session_state.custom_ttft else ttft_default
@@ -4873,6 +4939,12 @@ def render_slo_cards(use_case: str, user_count: int, priority: str = "balanced",
         prev_itl = st.session_state.get("_last_itl")
         prev_e2e = st.session_state.get("_last_e2e")
 
+        # Fetch use-case specific SLO defaults from backend API
+        slo_defaults = fetch_slo_defaults(use_case)
+        default_ttft = slo_defaults["ttft_ms"]["default"] if slo_defaults else 500
+        default_itl = slo_defaults["itl_ms"]["default"] if slo_defaults else 50
+        default_e2e = slo_defaults["e2e_ms"]["default"] if slo_defaults else 10000
+
         # Helper function to get range for a metric and percentile
         def get_metric_range(metric: str, percentile_key: str) -> tuple:
             if metric == "ttft":
@@ -4894,7 +4966,7 @@ def get_metric_range(metric: str, percentile_key: str) -> tuple:
         # === TTFT ===
         ttft_min, ttft_max = get_metric_range("ttft", st.session_state.ttft_percentile)
         if 'input_ttft' not in st.session_state:
-            st.session_state.input_ttft = ttft_max
+            st.session_state.input_ttft = default_ttft
 
         st.markdown('<div style="margin-top: 0.5rem; margin-bottom: 0.25rem;"><span style="color: #EE0000; font-weight: 700; font-size: 0.95rem;">TTFT (Time to First Token)</span></div>', unsafe_allow_html=True)
         ttft_val_col, ttft_pct_col = st.columns([2, 1])
@@ -4910,7 +4982,7 @@ def get_metric_range(metric: str, percentile_key: str) -> tuple:
         # === ITL ===
         itl_min, itl_max = get_metric_range("itl", st.session_state.itl_percentile)
         if 'input_itl' not in st.session_state:
-            st.session_state.input_itl = itl_max
+            st.session_state.input_itl = default_itl
 
         st.markdown('<div style="margin-top: 1rem; margin-bottom: 0.25rem;"><span style="color: #EE0000; font-weight: 700; font-size: 0.95rem;">ITL (Inter-Token Latency)</span></div>', unsafe_allow_html=True)
         itl_val_col, itl_pct_col = st.columns([2, 1])
@@ -4926,7 +4998,7 @@ def get_metric_range(metric: str, percentile_key: str) -> tuple:
         # === E2E ===
         e2e_min, e2e_max = get_metric_range("e2e", st.session_state.e2e_percentile)
         if 'input_e2e' not in st.session_state:
-            st.session_state.input_e2e = e2e_max
+            st.session_state.input_e2e = default_e2e
 
         st.markdown('<div style="margin-top: 1rem; margin-bottom: 0.25rem;"><span style="color: #EE0000; font-weight: 700; font-size: 0.95rem;">E2E (End-to-End Latency)</span></div>', unsafe_allow_html=True)
         e2e_val_col, e2e_pct_col = st.columns([2, 1])