Skip to content

Commit 6e1315b

Browse files
jgchen and nmasluk authored
Configuration Explorer initial capability: capacity planner (llm-d#237)
* Initial mock of config recommender Signed-off-by: Jing Chen <jing.chen2@ibm.com> * Add a fail case Signed-off-by: Jing Chen <jing.chen2@ibm.com> * Intiial gpu memory math Signed-off-by: Jing Chen <jing.chen2@ibm.com> * Edge cases Signed-off-by: Jing Chen <jing.chen2@ibm.com> * Fix math Signed-off-by: Jing Chen <jing.chen2@ibm.com> * Incorporate feedback Signed-off-by: Jing Chen <jing.chen2@ibm.com> * Add some charts Signed-off-by: Jing Chen <jing.chen2@ibm.com> * Add some viz Signed-off-by: Jing Chen <jing.chen2@ibm.com> * Warn if no result Signed-off-by: Jing Chen <jing.chen2@ibm.com> * Enhance plt charts and viz Signed-off-by: Jing Chen <jing.chen2@ibm.com> * Update memory usage plot Signed-off-by: Nick Masluk <nick@randombytes.net> * Update requirements Signed-off-by: Nick Masluk <nick@randombytes.net> * Update plots, results table, default values Signed-off-by: Nick Masluk <nick@randombytes.net> * Fix memory usage calculation Signed-off-by: Nick Masluk <nick@randombytes.net> * Fix widget behavior Signed-off-by: Jing Chen <jing.chen2@ibm.com> * Fixes Signed-off-by: Jing Chen <jing.chen2@ibm.com> * Update KV cache estimator Signed-off-by: Jing Chen <jing.chen2@ibm.com> * Major cleanup on capacity planner Signed-off-by: Jing Chen <jing.chen2@ibm.com> * Fix second page Signed-off-by: Jing Chen <jing.chen2@ibm.com> * Refactor lib Signed-off-by: Jing Chen <jing.chen2@ibm.com> * Add docs Signed-off-by: Jing Chen <jing.chen2@ibm.com> * Add test coverage Signed-off-by: Jing Chen <jing.chen2@ibm.com> * Add version to pyproject Signed-off-by: Jing Chen <jing.chen2@ibm.com> * Update pyproject Signed-off-by: Jing Chen <jing.chen2@ibm.com> * Update dependencies Signed-off-by: Jing Chen <jing.chen2@ibm.com> * Update readme Signed-off-by: Jing Chen <jing.chen2@ibm.com> * Fix test Signed-off-by: Jing Chen <jing.chen2@ibm.com> * Address comments Signed-off-by: Jing Chen <jing.chen2@ibm.com> --------- Signed-off-by: Jing Chen <jing.chen2@ibm.com> Signed-off-by: Nick Masluk 
<nick@randombytes.net> Co-authored-by: Nick Masluk <nick@randombytes.net>
1 parent d6204d0 commit 6e1315b

File tree

12 files changed

+884
-0
lines changed

12 files changed

+884
-0
lines changed
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
name: Config Explorer Test

on: [push, pull_request]

jobs:
  config-explorer-pytest:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.11", "3.12", "3.13"]

    steps:
      - uses: actions/checkout@v5

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'

      - name: Display Python version
        run: python -c "import sys; print(sys.version)"

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r config_explorer/requirements.txt

      - name: Test with pytest
        run: |
          pip install pytest pytest-cov
          # NOTE: --cov previously pointed at "com", a package that does not
          # exist in this repo, so coverage was always empty. Measure the
          # actual package instead.
          pytest -s config_explorer/tests/ --doctest-modules --junitxml=junit/test-results.xml --cov=config_explorer --cov-report=xml --cov-report=html

config_explorer/Home.py

Lines changed: 323 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,323 @@
1+
"""
2+
Main Page
3+
"""
4+
5+
from matplotlib import pyplot as plt
6+
import streamlit as st
7+
import db
8+
import util
9+
import numpy as np
10+
from src.config_explorer.capacity_planner import *
11+
from huggingface_hub.errors import *
12+
13+
def update_gpu_spec():
    """Sync the GPU spec chosen in the widget into the scenario state."""
    chosen_name = st.session_state['selected_gpu_spec']
    spec_catalog = st.session_state['gpu_spec']
    st.session_state['scenario'].gpu_spec = spec_catalog[chosen_name]
18+
19+
def update_gpu_count_avail():
    """Sync the GPU count chosen in the widget into the scenario state."""
    scenario = st.session_state['scenario']
    scenario.gpu_count_avail = st.session_state['selected_gpu_count_avail']
24+
25+
@st.dialog("Register a new accelerator")
def register_new_accelerator():
    """
    Dialog to register a new accelerator type.

    Adds the accelerator (name plus memory in GB) to the session-level
    ``gpu_spec`` catalog and reruns the app so selection widgets pick it up.
    """
    acc_name = st.text_input("Name", placeholder="NVIDIA-A100-40GB")
    acc_mem = st.number_input("Memory (GB)", min_value=1, step=1)

    if st.button("Register", use_container_width=True):
        if acc_name:
            st.session_state["gpu_spec"][acc_name] = {
                "name": acc_name,
                "memory": acc_mem
            }
            st.rerun()
        else:
            # Previously a click with an empty name failed silently
            # (no registration, no rerun, no message). Surface the problem.
            st.error("Please enter an accelerator name before registering.")
40+
41+
def model_specification():
    """
    Render the model-specification section and populate session state.

    Prompts for a model id in Hugging Face format, fetches its metadata and
    config (prompting for an HF token when the model is gated), stores both
    on the shared scenario in session state, and displays precision,
    parameter count, and an estimated GPU memory requirement.

    Returns:
        None. Returns early (after showing a warning) whenever model
        information cannot be retrieved.
    """

    user_scenario = st.session_state[util.USER_SCENARIO_KEY]
    model_info = None

    # Model
    with st.container(border=True):
        st.write("**Model Specification**")

        selected_model = st.text_input("Model (Hugging Face format)",
                                       value=user_scenario.get_model_name(),
                                       key=util.SELECTED_MODEL_KEY,
                                       on_change=util.on_update_model_name,
                                       )
        hf_token = None

        if selected_model and selected_model != "":
            # Fetch model info
            try:
                model_info = get_model_info_from_hf(selected_model)
                user_scenario.model_info = model_info
            except Exception as e:
                st.warning("Cannot access model information, see error below.")
                st.warning(e)
                return None

            # Fetch model config
            try:
                model_config = get_model_config_from_hf(selected_model, hf_token=hf_token)
                user_scenario.model_config = model_config
            except Exception as e:
                e_str = str(e)
                if "gated" in e_str:
                    st.warning("This is a gated model, please submit a HF token to view information")
                    hf_token = st.text_input("HF token")
                    if hf_token:
                        # Retry with the supplied token. This call was
                        # previously unguarded, so a bad/expired token
                        # crashed the page with a raw traceback.
                        try:
                            model_config = get_model_config_from_hf(selected_model, hf_token=hf_token)
                            user_scenario.model_config = model_config
                        except Exception as retry_err:
                            st.warning("Cannot access model config with the provided token, see error below.")
                            st.warning(retry_err)
                            return None
                else:
                    st.warning("Cannot access model config, see error below.")
                    st.warning(e)
                    return None

            # Derive display values from the fetched metadata.
            try:
                total_params = model_total_params(model_info)
                precision_keys = model_precision_keys(model_info)
                model_gpu_memory_req = round(model_memory_req(model_info))
            except Exception as e:
                st.warning(f"Cannot retrieve relevant information about the model, {e}")
                return None

            # Display first precision
            st.caption(f"Precision: {', '.join(precision_keys)}")
            st.caption(f"Total parameters: {total_params}")
            st.caption(f"GPU memory requirement: ~{model_gpu_memory_req} GB")

        else:
            return None
102+
103+
def hardware_specification():
    """
    Render the hardware-specification section and populate session state.

    Lets the user pick an accelerator type (or register a new one via a
    dialog) and the number of accelerators available, then validates that
    enough total GPU memory exists to load the currently selected model.

    Returns:
        None.
    """

    user_scenario = st.session_state[util.USER_SCENARIO_KEY]

    # Hardware
    with st.container(border=True):
        st.write("**Hardware Specification**")

        col1, col2 = st.columns([0.7, 0.3])

        # Preselect the scenario's GPU when it exists in the spec catalog.
        index = 0
        if user_scenario.gpu_name in db.gpu_specs.keys():
            index = list(db.gpu_specs.keys()).index(user_scenario.gpu_name)

        # Select GPU type
        selected_gpu_name = col1.selectbox("Accelerator",
                                           key=util.SELECTED_GPU_NAME_KEY,
                                           index=index,
                                           options=db.gpu_specs,
                                           on_change=util.update_scenario,
                                           args=[util.SELECTED_GPU_NAME_KEY, "gpu_name"],
                                           )
        # Dialog for registering new accelerator data
        col2.info("Don't see your accelerator? Register a new one below")
        if col2.button("Register new accelerator", use_container_width=True):
            register_new_accelerator()

        if selected_gpu_name:
            gpu_memory = user_scenario.get_gpu_memory(db.gpu_specs)
            st.caption(f"GPU memory: {gpu_memory} GB")

        # Number of GPUs available
        num_acc_avail = st.number_input("Number accelerators available",
                                        key=util.SELECTED_GPU_COUNT_AVAIL_KEY,
                                        value=user_scenario.gpu_count_avail,
                                        step=1,
                                        min_value=0,
                                        on_change=util.on_update_gpu_count,
                                        )

        # Calculate the minimum number of GPUs required. Guard on model_info:
        # this section can render before a model is chosen, and min_gpu_req
        # has nothing to size against when model_info is still None.
        if selected_gpu_name and num_acc_avail and user_scenario.model_info is not None:
            min_gpu_needed = min_gpu_req(user_scenario.model_info, gpu_memory)
            if num_acc_avail < min_gpu_needed:
                st.error(f"Not enough GPU memory to load the model. At least {min_gpu_needed} is required.")
        return None
154+
155+
def workload_specification():
    """
    Estimate total memory needed for KV cache.

    Renders the workload-characteristics section: the user picks a max model
    length and a concurrency; the section reports per-request KV-cache size
    and the maximum concurrency the available GPUs can sustain.

    Reads model info/config and GPU settings from the shared scenario in
    session state; returns early with a warning when either is missing.
    """

    user_scenario = st.session_state[util.USER_SCENARIO_KEY]
    model_info = user_scenario.model_info
    model_config = user_scenario.model_config

    # Workload
    with st.container(border=True):
        st.write("**Workload Characteristics (KV Cache Estimator)**")
        st.caption("Estimate KV cache memory requirements for the selected model based on workload.")

        # Both pieces of model data are required for every estimate below.
        if model_info is None:
            st.warning("Model information not yet selected")
            return None
        if model_config is None:
            st.warning("Model config not available, cannot estimate KV cache size")
            return None

        col1, col2 = st.columns(2)

        min_gpu_required = min_gpu_req(model_info, user_scenario.get_gpu_memory(db.gpu_specs))
        model_max_context_len = max_context_len(model_config)
        # Max model length is capped at the model's own context window.
        selected_max_model_len = col1.number_input(
            f"Max model len (max model context length is: {model_max_context_len})",
            min_value=1,
            max_value=model_max_context_len,
            value=user_scenario.max_model_len,
            key=util.SELECTED_MAX_MODEL_LEN_KEY,
            on_change=util.update_scenario,
            args=[util.SELECTED_MAX_MODEL_LEN_KEY, "max_model_len"]
        )
        col1.caption("Maximum model length for the model: how many tokens (input + output) the model can process. \
            Higher max model length means fewer concurrent requests can be served, \
            because for the same GPU memory available for KV cache, \
            each request requires more memory allocation. \
            ")

        # max_concurrency stays None unless both max model length and GPU
        # count are known; None also means "no upper bound" for the widget.
        max_concurrency = None
        if selected_max_model_len:
            # Calculate max concurrent requests available given GPU count
            if user_scenario.gpu_count_avail:
                max_concurrency = max_concurrent_req(model_info,
                                                     model_config,
                                                     selected_max_model_len,
                                                     user_scenario.gpu_count_avail,
                                                     user_scenario.get_gpu_memory(db.gpu_specs),
                                                     )

        selected_concurrency = col2.number_input("Concurrency",
                                                 min_value=0,
                                                 max_value=max_concurrency,
                                                 step=1,
                                                 key=util.SELECTED_CONCURRENCY_KEY,
                                                 value=user_scenario.concurrency,
                                                 on_change=util.update_scenario,
                                                 args=[util.SELECTED_CONCURRENCY_KEY, "concurrency"]
                                                 )

        # Display missing information messages
        if user_scenario.gpu_count_avail:
            if user_scenario.gpu_count_avail < min_gpu_required:
                col2.info("Not enough GPU memory available to load model.")
        else:
            col2.info("Input accelerator count above.")

        if not selected_max_model_len:
            col2.info("Input maximum model length to estimate max concurrency that can be achieved.")
        elif max_concurrency is not None:
            # Per-request KV cache at the chosen context length (batch of 1).
            per_req_kv_req = kv_cache_req(model_info,
                                          model_config,
                                          context_len=selected_max_model_len,
                                          )
            col2.info(f"Each request will take ~{round(per_req_kv_req, 2)} GB of KV cache, and there is enough KV cache to process up to {max_concurrency} requests concurrently.")
        else:
            col2.info("Not enough information to calculate max concurrency. Need model info, accelerator type, count, and max model length.")
233+
234+
def memory_util_chart():
    """
    Show memory utilization as a donut chart.

    Splits total GPU memory (GPU count x per-GPU memory) into model weights,
    KV cache at the configured max model length and concurrency, and free
    memory. Warns and skips the chart when usage exceeds availability.

    Returns:
        None.
    """

    user_scenario = st.session_state[util.USER_SCENARIO_KEY]
    model_info = user_scenario.model_info
    model_config = user_scenario.model_config

    # Nothing to chart (and min_gpu_req has nothing to size against) until
    # both model info and config have been fetched.
    if model_info is None or model_config is None:
        return None

    min_gpu_required = min_gpu_req(model_info, user_scenario.get_gpu_memory(db.gpu_specs))

    # Display GPU + KV pie chart
    if user_scenario.can_show_mem_util_chart(min_gpu_required):
        model_size = round(model_memory_req(model_info), 2)

        kv_cache = kv_cache_req(model_info,
                                model_config,
                                context_len=user_scenario.max_model_len,
                                batch_size=user_scenario.concurrency,
                                )
        kv_cache = round(kv_cache, 2)
        total = user_scenario.gpu_count_avail * user_scenario.get_gpu_memory(db.gpu_specs)
        free = round(total - model_size - kv_cache, 2)

        if free < 0:
            st.warning(f'Memory usage exceeds available by {-free:.1f} GB')
            return None

        # Display chart iff model and cache size are selected
        labels = ["Model", "KV Cache", "Free"]
        sizes = [model_size, kv_cache, free]
        colors = ["#ff9999", "#66b3ff", "#99ff99"]

        # Create donut chart
        fig, ax = plt.subplots(figsize=(4, 4))
        wedges, texts = ax.pie(
            sizes,
            colors=colors,
            startangle=90,              # Start at top
            wedgeprops=dict(width=0.4), # <-- Makes it a donut,
            labeldistance=1.1,          # Push labels outward
            pctdistance=0.7,            # Adjust percentage position
        )

        # Add total as text in the center of the donut
        ax.text(0, 0, f"Total\n{total} GB", ha="center", va="center", fontsize=12, fontweight="bold")

        # Create a custom legend, including the total. The extra grey handle
        # needs a matching label: legend() pairs handles with labels, so an
        # unmatched handle was previously dropped silently.
        legend_labels = [f"{labels[i]}: {sizes[i]} GB" for i in range(len(labels))]
        legend_labels.append(f"Total: {total} GB")

        # Position legend on the right
        ax.legend(
            wedges + [plt.Line2D([0], [0], color="#CCCCCC", lw=10)],  # Add fake handle for total
            legend_labels,
            title="Storage Breakdown",
            loc="center left",
            bbox_to_anchor=(1, 0, 0.5, 1)
        )

        # Render in Streamlit
        _, col, _ = st.columns([.5, 1, .5])
        with col:
            st.pyplot(fig, bbox_inches="tight")
300+
301+
if __name__ == '__main__':

    # Set up streamlit config
    st.set_page_config(
        page_title="Configuration Explorer",
        page_icon=None,
        layout="wide",
        initial_sidebar_state="expanded",
        menu_items=None,
    )

    st.title("Configuration Explorer")
    st.caption("This tool helps you find the most cost-effective, optimal configuration for serving models on llm-d based on hardware specification, workload characteristics, and SLO requirements.")

    util.init_session_state()

    # Display Capacity Planner headings
    st.subheader("Capacity Planner")
    st.caption("Determine how many GPUs you need to fit your model and how many requests can be served at once depending on request patterns.")

    # Get user inputs and show outputs. Each section reads/writes the shared
    # scenario in session state, so render order matters.
    for render_section in (model_specification,
                           hardware_specification,
                           workload_specification,
                           memory_util_chart):
        render_section()

0 commit comments

Comments
 (0)