@@ -2020,6 +2020,55 @@ def _get_hardware_selection_reason(priority: str, hw_option: dict, slo_targets:
20202020# RANKED RECOMMENDATIONS (Backend API Integration)
20212021# =============================================================================
20222022
@st.cache_data(ttl=300)
def fetch_slo_defaults(use_case: str) -> dict | None:
    """Fetch default SLO values for a use case from the backend API.

    Args:
        use_case: Use-case identifier. It is percent-encoded and appended to
            the ``/api/v1/slo-defaults/`` path.

    Returns:
        The ``slo_defaults`` value of the JSON response when the response
        reports ``success``. ``None`` when it does not, or when the request,
        the HTTP status check, or the JSON handling raises. The payload is
        expected to contain ``ttft_ms``, ``itl_ms`` and ``e2e_ms``, each with
        ``min``, ``max`` and ``default`` (not validated here).

    Results are cached by Streamlit for 5 minutes (``ttl=300``).
    """
    # Function-scope import so the block brings its own dependency with it.
    from urllib.parse import quote

    # Escape the identifier so characters such as "/", "?" or "#" cannot
    # change which route is requested.
    encoded_use_case = quote(str(use_case), safe="")
    url = f"{API_BASE_URL}/api/v1/slo-defaults/{encoded_use_case}"

    # NOTE(review): st.cache_data presumably caches the None returned on
    # failure as well, so a transient backend outage may persist for the
    # full TTL - confirm whether that is acceptable.
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        data = response.json()
        if data.get("success"):
            return data.get("slo_defaults")
        return None
    except Exception as e:
        # Deliberately best-effort: callers fall back to built-in defaults
        # when this returns None, so never let a backend problem propagate.
        logger.warning("Failed to fetch SLO defaults for %s: %s", use_case, e)
        return None
2043+
2044+
@st.cache_data(ttl=300)
def fetch_expected_rps(use_case: str, user_count: int) -> dict | None:
    """Fetch the expected requests-per-second for a use case from the backend.

    Args:
        use_case: Use-case identifier. It is percent-encoded and appended to
            the ``/api/v1/expected-rps/`` path.
        user_count: Number of users, sent as the ``user_count`` query
            parameter.

    Returns:
        The full JSON response dict when it reports ``success``. ``None``
        when it does not, or when the request, the HTTP status check, or the
        JSON handling raises. The response is expected to carry
        ``expected_rps`` (average requests per second), ``peak_rps`` (peak
        capacity needed) and ``workload_params`` (active_fraction,
        requests_per_min, etc.); these keys are not validated here.

    Results are cached by Streamlit for 5 minutes (``ttl=300``).
    """
    # Function-scope import so the block brings its own dependency with it.
    from urllib.parse import quote

    # Escape the identifier so characters such as "/", "?" or "#" cannot
    # change which route is requested.
    encoded_use_case = quote(str(use_case), safe="")
    url = f"{API_BASE_URL}/api/v1/expected-rps/{encoded_use_case}"

    # NOTE(review): st.cache_data presumably caches the None returned on
    # failure as well, so a transient backend outage may persist for the
    # full TTL - confirm whether that is acceptable.
    try:
        response = requests.get(
            url,
            params={"user_count": user_count},
            timeout=10,
        )
        response.raise_for_status()
        data = response.json()
        if data.get("success"):
            return data
        return None
    except Exception as e:
        # Deliberately best-effort: the caller falls back to a simple
        # heuristic when this returns None.
        logger.warning("Failed to fetch expected RPS for %s: %s", use_case, e)
        return None
2070+
2071+
20232072def fetch_ranked_recommendations (
20242073 use_case : str ,
20252074 user_count : int ,
@@ -4747,10 +4796,27 @@ def render_slo_cards(use_case: str, user_count: int, priority: str = "balanced",
47474796 ttft_default = int (ttft_max_raw * ttft_factor )
47484797 itl_default = int (itl_max_raw * itl_factor )
47494798 e2e_default = int (e2e_max_raw * e2e_factor )
4750-
4751- # Calculate QPS based on user count
4752- estimated_qps = max (1 , user_count // 50 )
4753-
4799+
4800+ # Fetch expected RPS from backend using research-based workload patterns
4801+ rps_data = fetch_expected_rps (use_case , user_count )
4802+ if rps_data :
4803+ estimated_qps = int (rps_data .get ("expected_rps" , 1 ))
4804+ else :
4805+ # Fallback to simple heuristic if API fails
4806+ estimated_qps = max (1 , user_count // 50 )
4807+
4808+ # Track if use_case or user_count changed - if so, reset custom_qps to use new default
4809+ last_use_case = st .session_state .get ("_last_rps_use_case" )
4810+ last_user_count = st .session_state .get ("_last_rps_user_count" )
4811+ if last_use_case != use_case or last_user_count != user_count :
4812+ # Use case or user count changed - reset to new calculated default
4813+ st .session_state .custom_qps = None
4814+ st .session_state ._last_rps_use_case = use_case
4815+ st .session_state ._last_rps_user_count = user_count
4816+ # Clear the widget key to force re-render with new value
4817+ if "edit_qps" in st .session_state :
4818+ del st .session_state ["edit_qps" ]
4819+
47544820 # Use custom values if set, otherwise use MAX as default (shows all configs)
47554821 # Ensure all values are integers for slider compatibility
47564822 ttft = int (st .session_state .custom_ttft ) if st .session_state .custom_ttft else ttft_default
@@ -4873,6 +4939,12 @@ def render_slo_cards(use_case: str, user_count: int, priority: str = "balanced",
48734939 prev_itl = st .session_state .get ("_last_itl" )
48744940 prev_e2e = st .session_state .get ("_last_e2e" )
48754941
4942+ # Fetch use-case specific SLO defaults from backend API
4943+ slo_defaults = fetch_slo_defaults (use_case )
4944+ default_ttft = slo_defaults ["ttft_ms" ]["default" ] if slo_defaults else 500
4945+ default_itl = slo_defaults ["itl_ms" ]["default" ] if slo_defaults else 50
4946+ default_e2e = slo_defaults ["e2e_ms" ]["default" ] if slo_defaults else 10000
4947+
48764948 # Helper function to get range for a metric and percentile
48774949 def get_metric_range (metric : str , percentile_key : str ) -> tuple :
48784950 if metric == "ttft" :
@@ -4894,7 +4966,7 @@ def get_metric_range(metric: str, percentile_key: str) -> tuple:
48944966 # === TTFT ===
48954967 ttft_min , ttft_max = get_metric_range ("ttft" , st .session_state .ttft_percentile )
48964968 if 'input_ttft' not in st .session_state :
4897- st .session_state .input_ttft = ttft_max
4969+ st .session_state .input_ttft = default_ttft
48984970
48994971 st .markdown ('<div style="margin-top: 0.5rem; margin-bottom: 0.25rem;"><span style="color: #EE0000; font-weight: 700; font-size: 0.95rem;">TTFT (Time to First Token)</span></div>' , unsafe_allow_html = True )
49004972 ttft_val_col , ttft_pct_col = st .columns ([2 , 1 ])
@@ -4910,7 +4982,7 @@ def get_metric_range(metric: str, percentile_key: str) -> tuple:
49104982 # === ITL ===
49114983 itl_min , itl_max = get_metric_range ("itl" , st .session_state .itl_percentile )
49124984 if 'input_itl' not in st .session_state :
4913- st .session_state .input_itl = itl_max
4985+ st .session_state .input_itl = default_itl
49144986
49154987 st .markdown ('<div style="margin-top: 1rem; margin-bottom: 0.25rem;"><span style="color: #EE0000; font-weight: 700; font-size: 0.95rem;">ITL (Inter-Token Latency)</span></div>' , unsafe_allow_html = True )
49164988 itl_val_col , itl_pct_col = st .columns ([2 , 1 ])
@@ -4926,7 +4998,7 @@ def get_metric_range(metric: str, percentile_key: str) -> tuple:
49264998 # === E2E ===
49274999 e2e_min , e2e_max = get_metric_range ("e2e" , st .session_state .e2e_percentile )
49285000 if 'input_e2e' not in st .session_state :
4929- st .session_state .input_e2e = e2e_max
5001+ st .session_state .input_e2e = default_e2e
49305002
49315003 st .markdown ('<div style="margin-top: 1rem; margin-bottom: 0.25rem;"><span style="color: #EE0000; font-weight: 700; font-size: 0.95rem;">E2E (End-to-End Latency)</span></div>' , unsafe_allow_html = True )
49325004 e2e_val_col , e2e_pct_col = st .columns ([2 , 1 ])
0 commit comments