1+ """
2+ Main Page
3+ """
4+
5+ from matplotlib import pyplot as plt
6+ import streamlit as st
7+ import db
8+ import util
9+ import numpy as np
10+ from src .config_explorer .capacity_planner import *
11+ from huggingface_hub .errors import *
12+
def update_gpu_spec():
    """
    Sync the GPU spec chosen by the user into the session-state scenario.

    Reads the selected spec name from session state and copies the matching
    spec dict onto the scenario object.
    """
    state = st.session_state
    chosen_name = state['selected_gpu_spec']
    state['scenario'].gpu_spec = state['gpu_spec'][chosen_name]
18+
def update_gpu_count_avail():
    """
    Sync the user-selected available GPU count into the session-state scenario.
    """
    state = st.session_state
    state['scenario'].gpu_count_avail = state['selected_gpu_count_avail']
24+
@st.dialog("Register a new accelerator")
def register_new_accelerator():
    """
    Modal dialog for adding a custom accelerator type.

    On submit, stores a ``{"name": ..., "memory": ...}`` entry under the
    accelerator name in the session-state ``gpu_spec`` table and reruns
    the app so the new entry appears in the selectbox.
    """
    name = st.text_input("Name", placeholder="NVIDIA-A100-40GB")
    memory_gb = st.number_input("Memory (GB)", min_value=1, step=1)

    clicked = st.button("Register", use_container_width=True)
    # Only register when a non-empty name was provided.
    if clicked and name:
        st.session_state["gpu_spec"][name] = {
            "name": name,
            "memory": memory_gb,
        }
        st.rerun()
40+
def model_specification():
    """
    Render the model-specification panel.

    Collects a Hugging Face model name from the user, fetches the model
    info and model config from the Hub (prompting for an HF token when the
    model is gated), stores both on the session scenario, and displays the
    model's precision keys, total parameter count, and approximate GPU
    memory requirement.

    Returns:
        None in all cases; returns early (after showing a warning) when
        any fetch or metadata-derivation step fails.
    """
    user_scenario = st.session_state[util.USER_SCENARIO_KEY]
    model_info = None

    # Model
    with st.container(border=True):
        st.write("**Model Specification**")

        selected_model = st.text_input("Model (Hugging Face format)",
                                       value=user_scenario.get_model_name(),
                                       key=util.SELECTED_MODEL_KEY,
                                       on_change=util.on_update_model_name,
                                       )
        hf_token = None

        # Guard clause: nothing to fetch until a model name is entered.
        # (The original `selected_model and selected_model != ""` check was
        # redundant — a non-empty string is already truthy.)
        if not selected_model:
            return None

        # Fetch model info (parameter counts / safetensors metadata).
        try:
            model_info = get_model_info_from_hf(selected_model)
            user_scenario.model_info = model_info
        except Exception as e:
            st.warning("Cannot access model information, see error below.")
            st.warning(e)
            return None

        # Fetch model config (architecture details, e.g. context length).
        try:
            model_config = get_model_config_from_hf(selected_model, hf_token=hf_token)
            user_scenario.model_config = model_config
        except Exception as e:
            if "gated" in str(e):
                st.warning("This is a gated model, please submit a HF token to view information")
                hf_token = st.text_input("HF token")
                if hf_token:
                    # BUGFIX: this retry was previously unguarded, so an
                    # invalid/expired token raised an uncaught exception and
                    # crashed the page. Surface the error instead.
                    try:
                        model_config = get_model_config_from_hf(selected_model, hf_token=hf_token)
                        user_scenario.model_config = model_config
                    except Exception as retry_err:
                        st.warning("Cannot access model config, see error below.")
                        st.warning(retry_err)
                        return None
            else:
                st.warning("Cannot access model config, see error below.")
                st.warning(e)
                return None

        # Derive display values from the fetched metadata.
        try:
            total_params = model_total_params(model_info)
            precision_keys = model_precision_keys(model_info)
            model_gpu_memory_req = round(model_memory_req(model_info))
        except Exception as e:
            st.warning(f"Cannot retrieve relevant information about the model, {e}")
            return None

        # Display first precision
        st.caption(f"Precision: {', '.join(precision_keys)}")
        st.caption(f"Total parameters: {total_params}")
        st.caption(f"GPU memory requirement: ~{model_gpu_memory_req} GB")
102+
def hardware_specification():
    """
    Render the hardware-specification panel.

    Lets the user pick an accelerator type (or register a new one via a
    dialog) and the number of accelerators available, then validates that
    the selection provides enough total memory to load the model.
    """
    user_scenario = st.session_state[util.USER_SCENARIO_KEY]

    # Hardware
    with st.container(border=True):
        st.write("**Hardware Specification**")

        col1, col2 = st.columns([0.7, 0.3])

        # Pre-select the scenario's GPU when it is a known spec.
        known_names = list(db.gpu_specs.keys())
        default_index = (known_names.index(user_scenario.gpu_name)
                         if user_scenario.gpu_name in db.gpu_specs
                         else 0)

        # Select GPU type
        selected_gpu_name = col1.selectbox("Accelerator",
                                           key=util.SELECTED_GPU_NAME_KEY,
                                           index=default_index,
                                           options=db.gpu_specs,
                                           on_change=util.update_scenario,
                                           args=[util.SELECTED_GPU_NAME_KEY, "gpu_name"],
                                           )
        # Dialog for registering accelerator types not in the built-in table.
        col2.info("Don't see your accelerator? Register a new one below")
        if col2.button("Register new accelerator", use_container_width=True):
            register_new_accelerator()

        if selected_gpu_name:
            gpu_memory = user_scenario.get_gpu_memory(db.gpu_specs)
            st.caption(f"GPU memory: {gpu_memory} GB")

        # Number of GPUs available
        num_acc_avail = st.number_input("Number accelerators available",
                                        key=util.SELECTED_GPU_COUNT_AVAIL_KEY,
                                        value=user_scenario.gpu_count_avail,
                                        step=1,
                                        min_value=0,
                                        on_change=util.on_update_gpu_count,
                                        )

        # Warn when the available GPUs cannot hold the model weights.
        if selected_gpu_name and num_acc_avail:
            min_gpu_needed = min_gpu_req(user_scenario.model_info, gpu_memory)
            if num_acc_avail < min_gpu_needed:
                st.error(f"Not enough GPU memory to load the model. At least {min_gpu_needed} is required.")
    return None
154+
def workload_specification():
    """
    Estimate total memory needed for KV cache.

    Renders the workload panel: max model length and concurrency inputs,
    plus informational messages about per-request KV-cache size and the
    maximum concurrency achievable with the configured hardware.

    Requires model_specification() to have populated model_info and
    model_config on the session scenario; returns early otherwise.
    """

    user_scenario = st.session_state[util.USER_SCENARIO_KEY]
    model_info = user_scenario.model_info
    model_config = user_scenario.model_config

    # Workload
    with st.container(border=True):
        st.write("**Workload Characteristics (KV Cache Estimator)**")
        st.caption("Estimate KV cache memory requirements for the selected model based on workload.")

        # Both pieces of model metadata are required for the estimate.
        if model_info is None:
            st.warning("Model information not yet selected")
            return None
        if model_config is None:
            st.warning("Model config not available, cannot estimate KV cache size")
            return None

        col1, col2 = st.columns(2)

        min_gpu_required = min_gpu_req(model_info, user_scenario.get_gpu_memory(db.gpu_specs))
        model_max_context_len = max_context_len(model_config)
        # Max model length input, capped at the model's own context window.
        selected_max_model_len = col1.number_input(
            f"Max model len (max model context length is: {model_max_context_len})",
            min_value=1,
            max_value=model_max_context_len,
            value=user_scenario.max_model_len,
            key=util.SELECTED_MAX_MODEL_LEN_KEY,
            on_change=util.update_scenario,
            args=[util.SELECTED_MAX_MODEL_LEN_KEY, "max_model_len"]
        )
        col1.caption("Maximum model length for the model: how many tokens (input + output) the model can process. \
            Higher max model length means fewer concurrent requests can be served, \
            because for the same GPU memory available for KV cache, \
            each request requires more memory allocation. \
            ")

        max_concurrency = None
        if selected_max_model_len:
            # Calculate max concurrent requests available given GPU count
            if user_scenario.gpu_count_avail:
                max_concurrency = max_concurrent_req(model_info,
                                                     model_config,
                                                     selected_max_model_len,
                                                     user_scenario.gpu_count_avail,
                                                     user_scenario.get_gpu_memory(db.gpu_specs),
                                                     )

        # Concurrency input; max_value=None leaves the input unbounded.
        selected_concurrency = col2.number_input("Concurrency",
                                                 min_value=0,
                                                 max_value=max_concurrency,
                                                 step=1,
                                                 key=util.SELECTED_CONCURRENCY_KEY,
                                                 value=user_scenario.concurrency,
                                                 on_change=util.update_scenario,
                                                 args=[util.SELECTED_CONCURRENCY_KEY, "concurrency"]
                                                 )

        # Display missing information messages
        if user_scenario.gpu_count_avail:
            if user_scenario.gpu_count_avail < min_gpu_required:
                col2.info("Not enough GPU memory available to load model.")
        else:
            col2.info("Input accelerator count above.")

        if not selected_max_model_len:
            col2.info("Input maximum model length to estimate max concurrency that can be achieved.")
        elif max_concurrency is not None:
            # Per-request KV-cache footprint at the selected context length.
            per_req_kv_req = kv_cache_req(model_info,
                                          model_config,
                                          context_len=selected_max_model_len,
                                          )
            col2.info(f"Each request will take ~{round(per_req_kv_req, 2)} GB of KV cache, and there is enough KV cache to process up to {max_concurrency} requests concurrently.")
        else:
            col2.info("Not enough information to calculate max concurrency. Need model info, accelerator type, count, and max model length.")
233+
def memory_util_chart():
    """
    Render a donut chart of GPU memory utilization.

    Splits the total memory across all available GPUs into three slices —
    model weights, KV cache, and free memory — and draws them with a
    legend and a center label showing the total. Drawn only when the
    scenario has enough information (model, GPU type/count, workload) to
    compute all three values; warns and skips the chart when the
    configured workload exceeds available memory.
    """
    user_scenario = st.session_state[util.USER_SCENARIO_KEY]
    model_info = user_scenario.model_info
    model_config = user_scenario.model_config
    min_gpu_required = min_gpu_req(model_info, user_scenario.get_gpu_memory(db.gpu_specs))

    # Guard clause instead of wrapping the whole body in an `if`.
    if not user_scenario.can_show_mem_util_chart(min_gpu_required):
        return None

    # Removed dead initializations (kv_cache/total/free were assigned 0 and
    # immediately overwritten in the original).
    model_size = round(model_memory_req(model_info), 2)
    kv_cache = round(kv_cache_req(model_info,
                                  model_config,
                                  context_len=user_scenario.max_model_len,
                                  batch_size=user_scenario.concurrency,
                                  ), 2)
    total = user_scenario.gpu_count_avail * user_scenario.get_gpu_memory(db.gpu_specs)
    free = round(total - model_size - kv_cache, 2)

    # Over-committed: warn and skip the chart. (The original also assigned
    # `free = 0` here, which was dead code before the return.)
    if free < 0:
        st.warning(f'Memory usage exceeds available by {-free:.1f} GB')
        return None

    # Display chart iff model and cache size are selected
    labels = ["Model", "KV Cache", "Free"]
    sizes = [model_size, kv_cache, free]
    colors = ["#ff9999", "#66b3ff", "#99ff99"]

    # Create donut chart (wedge width < 1 turns the pie into a ring).
    fig, ax = plt.subplots(figsize=(4, 4))
    wedges, texts = ax.pie(
        sizes,
        colors=colors,
        startangle=90,               # Start at top
        wedgeprops=dict(width=0.4),  # Makes it a donut
        labeldistance=1.1,           # Push labels outward
        pctdistance=0.7,             # Adjust percentage position
    )

    # Add total as text in the center of the donut
    ax.text(0, 0, f"Total\n{total} GB", ha="center", va="center", fontsize=12, fontweight="bold")

    # Create a custom legend, including the total
    legend_labels = [f"{label}: {size} GB" for label, size in zip(labels, sizes)]

    # Position legend on the right
    ax.legend(
        wedges + [plt.Line2D([0], [0], color="#CCCCCC", lw=10)],  # Add fake handle for total
        legend_labels,
        title="Storage Breakdown",
        loc="center left",
        bbox_to_anchor=(1, 0, 0.5, 1)
    )

    # Render in Streamlit, centered in the middle column.
    _, col, _ = st.columns([.5, 1, .5])
    with col:
        st.pyplot(fig, bbox_inches="tight")
300+
if __name__ == '__main__':

    # Set up streamlit config
    st.set_page_config(page_title="Configuration Explorer",
                       page_icon=None,
                       layout="wide",
                       initial_sidebar_state="expanded",
                       menu_items=None)

    st.title("Configuration Explorer")
    st.caption("This tool helps you find the most cost-effective, optimal configuration for serving models on llm-d based on hardware specification, workload characteristics, and SLO requirements.")

    # Initialize per-session defaults (scenario object and widget keys)
    # before any panel reads them.
    util.init_session_state()

    # Display Capacity Planner headings
    st.subheader("Capacity Planner")
    st.caption("Determine how many GPUs you need to fit your model and how many requests can be served at once depending on request patterns.")

    # Get user inputs and show outputs — call order defines the page layout:
    # each function renders its own container top-to-bottom.
    model_specification()
    hardware_specification()
    workload_specification()
    memory_util_chart()