Skip to content

Commit ec13ebc

Browse files
[Setup] chore: switch step 04 to python implementation (llm-d#247)
[Setup] chore: switch step 04 to python implementation (llm-d#247)

* [Setup] chore: switch step 04 to python implementation. Some additional fixes were required, in particular, auto-detection of "default" storage class. Also cleaned up further the model attribute detection. Added `GitPython` as a dependency in `install_deps.sh`. Finally, added the "wait for creation" step on `standalone`. Added python implementation for step 0.
* Added patch suggested by @kalantar
* Fixed the issue with "dry run" on step 04 in python
* Additional fixes

---------

Signed-off-by: maugustosilva <maugusto.silva@gmail.com>
1 parent 00fbb58 commit ec13ebc

File tree

12 files changed

+116
-78
lines changed

12 files changed

+116
-78
lines changed

.github/workflows/ci-pr-benchmark.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -37,7 +37,7 @@ jobs:
3737

3838
- name: Populate python deps
3939
run: |
40-
echo -e "pandas\ngrip>=4.6.0\nmatplotlib>=3.7.0\nnumpy>=1.22.0\nseaborn>=0.12.0\nkubernetes>=28.0.0" > requirements.txt
40+
echo -e "pandas\ngrip>=4.6.0\nmatplotlib>=3.7.0\nnumpy>=1.22.0\nseaborn>=0.12.0\nkubernetes>=28.0.0\npykube\nkubernetes-asyncio\nGitPython" > requirements.txt
4141
4242
- name: Install python deps
4343
uses: actions/setup-python@v5

scenarios/cicd.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
export LLMDBENCH_CONTROL_WORK_DIR=/tmp/cicd/
22
export LLMDBENCH_DEPLOY_MODEL_LIST="facebook/opt-125m"
33
export LLMDBENCH_VLLM_COMMON_NAMESPACE=llmdbenchcicd
4-
export LLMDBENCH_VLLM_COMMON_AFFINITY=nvidia.com/gpu.product:NVIDIA-A100-SXM4-80GB
4+
export LLMDBENCH_VLLM_COMMON_AFFINITY=nvidia.com/gpu.product:NVIDIA-L40S
55
export LLMDBENCH_VLLM_COMMON_PVC_STORAGE_CLASS=ocs-storagecluster-cephfs
66
export LLMDBENCH_VLLM_MODELSERVICE_RELEASE=llmdbenchcicd
77
export LLMDBENCH_VLLM_COMMON_REPLICAS=1

setup/env.sh

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -154,11 +154,11 @@ export LLMDBENCH_CONTROL_STANDUP_ALL_STEPS=${LLMDBENCH_CONTROL_STANDUP_ALL_STEPS
154154
export LLMDBENCH_CONTROL_WAIT_TIMEOUT=${LLMDBENCH_CONTROL_WAIT_TIMEOUT:-900}
155155
export LLMDBENCH_CONTROL_CHECK_CLUSTER_AUTHORIZATIONS=${LLMDBENCH_CONTROL_CHECK_CLUSTER_AUTHORIZATIONS:-0}
156156
export LLMDBENCH_CONTROL_RESOURCE_LIST=${LLMDBENCH_CONTROL_RESOURCE_LIST:-deployment,httproute,service,gateway,gatewayparameters,inferencepool,inferencemodel,cm,ing,pod,job}
157-
export LLMDBENCH_CONTROL_STEP_00_IMPLEMENTATION=${LLMDBENCH_CONTROL_STEP_00_IMPLEMENTATION:-sh}
158-
export LLMDBENCH_CONTROL_STEP_01_IMPLEMENTATION=${LLMDBENCH_CONTROL_STEP_01_IMPLEMENTATION:-sh}
157+
export LLMDBENCH_CONTROL_STEP_00_IMPLEMENTATION=${LLMDBENCH_CONTROL_STEP_00_IMPLEMENTATION:-py}
158+
export LLMDBENCH_CONTROL_STEP_01_IMPLEMENTATION=${LLMDBENCH_CONTROL_STEP_01_IMPLEMENTATION:-py}
159159
export LLMDBENCH_CONTROL_STEP_02_IMPLEMENTATION=${LLMDBENCH_CONTROL_STEP_02_IMPLEMENTATION:-sh}
160160
export LLMDBENCH_CONTROL_STEP_03_IMPLEMENTATION=${LLMDBENCH_CONTROL_STEP_03_IMPLEMENTATION:-sh}
161-
export LLMDBENCH_CONTROL_STEP_04_IMPLEMENTATION=${LLMDBENCH_CONTROL_STEP_04_IMPLEMENTATION:-sh}
161+
export LLMDBENCH_CONTROL_STEP_04_IMPLEMENTATION=${LLMDBENCH_CONTROL_STEP_04_IMPLEMENTATION:-py}
162162
export LLMDBENCH_CONTROL_STEP_05_IMPLEMENTATION=${LLMDBENCH_CONTROL_STEP_05_IMPLEMENTATION:-sh}
163163
export LLMDBENCH_CONTROL_STEP_06_IMPLEMENTATION=${LLMDBENCH_CONTROL_STEP_06_IMPLEMENTATION:-sh}
164164
export LLMDBENCH_CONTROL_STEP_07_IMPLEMENTATION=${LLMDBENCH_CONTROL_STEP_07_IMPLEMENTATION:-sh}

setup/functions.py

Lines changed: 52 additions & 32 deletions
Original file line number | Diff line number | Diff line change
@@ -6,8 +6,9 @@
66
import time
77
from pathlib import Path
88
import subprocess
9-
import inspect
9+
import inspect
1010
import pykube
11+
import hashlib
1112
from pykube.exceptions import PyKubeError
1213

1314
import yaml
@@ -21,7 +22,7 @@
2122

2223
import asyncio
2324

24-
import logging
25+
import logging
2526
logging.basicConfig(
2627
level=logging.INFO,
2728
format='%(asctime)s - %(levelname)s - %(message)s'
@@ -32,15 +33,15 @@
3233
def announce(message: str, logfile : str = None):
3334
work_dir = os.getenv("LLMDBENCH_CONTROL_WORK_DIR", '.')
3435
log_dir = os.path.join(work_dir, 'logs')
35-
36+
3637
# ensure logs dir exists
3738
os.makedirs(log_dir, exist_ok=True)
3839

3940

4041
if not logfile:
4142
cur_step = os.getenv("CURRENT_STEP_NAME", 'step')
4243
logfile = cur_step + '.log'
43-
44+
4445
logpath = os.path.join(log_dir, logfile)
4546

4647
logger.info(message)
@@ -66,10 +67,10 @@ def kube_connect(config_path : str = '~/.kube/config'):
6667
sys.exit(1)
6768

6869
return api
69-
7070

7171

72-
72+
73+
7374
def llmdbench_execute_cmd(
7475
actual_cmd: str,
7576
dry_run: bool = True,
@@ -81,11 +82,11 @@ def llmdbench_execute_cmd(
8182
) -> int:
8283
work_dir_str = os.getenv("LLMDBENCH_CONTROL_WORK_DIR", ".")
8384
log_dir = Path(work_dir_str) / "setup" / "commands"
84-
85+
8586
log_dir.mkdir(parents=True, exist_ok=True)
8687

8788
command_tstamp = int(time.time() * 1_000_000_000)
88-
89+
8990
if dry_run:
9091
msg = f"---> would have executed the command \"{actual_cmd}\""
9192
announce(msg)
@@ -105,11 +106,11 @@ def llmdbench_execute_cmd(
105106
ecode = -1
106107
last_stdout_log = None
107108
last_stderr_log = None
108-
109+
109110
for counter in range(1, attempts + 1):
110111
command_tstamp = int(time.time() * 1_000_000_000)
111-
112-
# log file paths
112+
113+
# log file paths
113114
stdout_log = log_dir / f"{command_tstamp}_stdout.log"
114115
stderr_log = log_dir / f"{command_tstamp}_stderr.log"
115116
last_stdout_log = stdout_log
@@ -128,31 +129,31 @@ def llmdbench_execute_cmd(
128129
# run with verbose
129130
announce(msg)
130131
result = subprocess.run(actual_cmd, shell=True, check=False)
131-
132+
132133
ecode = result.returncode
133134

134135
except Exception as e:
135136
announce(f"An unexpected error occurred while running the command: {e}")
136137
ecode = -1
137138

138139
if ecode == 0:
139-
break
140-
140+
break
141+
141142
if counter < attempts:
142143
announce(f"Command failed with exit code {ecode}. Retrying in {delay} seconds... ({counter}/{attempts})")
143144
time.sleep(delay)
144145

145146
if ecode != 0:
146147
announce(f"\nERROR while executing command \"{actual_cmd}\"")
147-
148+
148149
if last_stdout_log and last_stdout_log.exists():
149150
try:
150151
announce(last_stdout_log.read_text())
151152
except IOError:
152153
announce("(stdout not captured)")
153154
else:
154155
announce("(stdout not captured)")
155-
156+
156157
# print stderr log if it exists
157158
if last_stderr_log and last_stderr_log.exists():
158159
try:
@@ -206,12 +207,18 @@ def validate_and_create_pvc(
206207
if '/' not in download_model:
207208
announce(f"'{download_model}' is not in Hugging Face format <org>/<repo>")
208209
sys.exit(1)
209-
210+
210211
announce(f"🔍 Checking storage class '{pvc_class}'...")
211212
try:
212213
k8s_config.load_kube_config()
213214
storage_v1_api = k8s_client.StorageV1Api()
214-
215+
216+
if pvc_class == "default" :
217+
for x in storage_v1_api.list_storage_class().items :
218+
if x.metadata.annotations and "storageclass.kubernetes.io/is-default-class" in x.metadata.annotations :
219+
if x.metadata.annotations["storageclass.kubernetes.io/is-default-class"] == "true" :
220+
announce(f"ℹ️ Environment variable LLMDBENCH_VLLM_COMMON_PVC_STORAGE_CLASS automatically set to \"{x.metadata.name}\"")
221+
pvc_class = x.metadata.name
215222
storage_v1_api.read_storage_class(name=pvc_class)
216223
announce(f"StorageClass '{pvc_class}' found.")
217224

@@ -270,7 +277,7 @@ def launch_download_job(
270277
dry_run: bool = False,
271278
verbose: bool = False
272279
):
273-
280+
274281
work_dir_str = os.getenv("LLMDBENCH_CONTROL_WORK_DIR", ".")
275282
current_step = os.getenv("LLMDBENCH_CURRENT_STEP", "step")
276283
kcmd = os.getenv("LLMDBENCH_CONTROL_KCMD", "kubectl")
@@ -343,7 +350,7 @@ def launch_download_job(
343350
sys.exit(1)
344351

345352
delete_cmd = f"{kcmd} delete job {job_name} -n {namespace} --ignore-not-found=true"
346-
353+
347354
announce(f"--> Deleting previous job '{job_name}' (if it exists) to prevent conflicts...")
348355
llmdbench_execute_cmd(
349356
actual_cmd=delete_cmd,
@@ -362,10 +369,13 @@ def launch_download_job(
362369
)
363370

364371

365-
async def wait_for_job(job_name, namespace, timeout=7200):
372+
async def wait_for_job(job_name, namespace, timeout=7200, dry_run: bool = False):
366373
"""Wait for the job to complete"""
367374
announce(f"Waiting for job {job_name} to complete...")
368375

376+
if dry_run :
377+
return True
378+
369379
# use async config loading
370380
await k8s_async_config.load_kube_config()
371381
api_client = k8s_async_client.ApiClient()
@@ -391,7 +401,7 @@ async def wait_for_job(job_name, namespace, timeout=7200):
391401
announce(f"Evaluation job {job_name} failed")
392402
return False
393403

394-
404+
395405
except asyncio.TimeoutError:
396406
announce(f"Timeout waiting for evaluation job {job_name} after {timeout} seconds.")
397407
return False
@@ -401,29 +411,37 @@ async def wait_for_job(job_name, namespace, timeout=7200):
401411
await api_client.close()
402412

403413
def model_attribute(model: str, attribute: str) -> str:
404-
414+
415+
model, modelid = model.split(':', 1) if ':' in model else (model, model)
416+
405417
# split the model name into provider and rest
406418
provider, model_part = model.split('/', 1) if '/' in model else ("", model)
407419

420+
hash_object = hashlib.sha256()
421+
hash_object.update(modelid.encode('utf-8'))
422+
digest = hash_object.hexdigest()
423+
modelid_label = f"{provider[:8]}-{digest[:8]}-{model_part[-8:]}"
424+
408425
# create a list of components from the model part
409426
# equiv to: tr '[:upper:]' '[:lower:]' | sed -e 's^qwen^qwen-^g' -e 's^-^\n^g'
410427
model_components_str = model_part.lower().replace("qwen", "qwen-")
411428
model_components = model_components_str.split('-')
412429

413-
# get individual attributes using regex
430+
# get individual attributes using regex
414431
type_str = ""
415432
for comp in model_components:
416-
if re.search(r"nstruct|hf|chat|speech|vision", comp, re.IGNORECASE):
433+
if re.search(r"nstruct|hf|chat|speech|vision|opt", comp, re.IGNORECASE):
417434
type_str = comp
418435
break
419436

420437
parameters = ""
421438
for comp in model_components:
422439
if re.search(r"[0-9].*[bm]", comp, re.IGNORECASE):
423-
parameters = comp.replace('.', 'p')
440+
parameters = re.sub(r'^[a-z]', '', comp, count=1)
441+
parameters = parameters.replace('.', 'p')
424442
break
425-
426-
major_version = ""
443+
444+
major_version = "1"
427445
for comp in model_components:
428446
# find component that starts with a digit but is not the parameter string
429447
if comp.isdigit() or (comp and comp[0].isdigit() and not re.search(r"b|m", comp, re.IGNORECASE)):
@@ -433,19 +451,21 @@ def model_attribute(model: str, attribute: str) -> str:
433451
break
434452

435453
kind = model_components[0] if model_components else ""
436-
454+
437455
as_label = model.lower().replace('/', '-').replace('.', '-')
438-
456+
439457
# build label and clean it up
440458
label_parts = [part for part in [kind, major_version, parameters] if part]
441459
label = '-'.join(label_parts)
442460
label = re.sub(r'-+', '-', label).strip('-') # replace multiple hyphens and strip from ends
443461

444462
folder = model.lower().replace('/', '_').replace('-', '_')
445463

446-
# storing all attributes in a dictionary
464+
# storing all attributes in a dictionary
447465
attributes = {
448466
"model": model,
467+
"modelid": modelid,
468+
"modelid_label": modelid_label,
449469
"provider": provider,
450470
"type": type_str,
451471
"parameters": parameters,
@@ -458,7 +478,7 @@ def model_attribute(model: str, attribute: str) -> str:
458478

459479
# return requested attrib
460480
result = attributes.get(attribute, "")
461-
481+
462482
# The original script lowercases everything except the model attribute
463483
if attribute != "model":
464484
return result.lower()

setup/functions.sh

Lines changed: 11 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -28,19 +28,14 @@ function model_attribute {
2828
local modelid=$(echo $model | cut -d: -f2)
2929
local modelid_label="$(echo -n $modelid | cut -d '/' -f 1 | cut -c1-8)-$(echo -n $modelid | sha256sum | awk '{print $1}' | cut -c1-8)-$(echo -n $modelid | cut -d '/' -f 2 | rev | cut -c1-8 | rev)"
3030

31-
# TODO handle this in a more appropriate way
32-
# Hack to get all attributes for facebook/opt-125m
33-
case "$model" in
34-
"facebook/opt-125m") local model_hack=facebook/opt-1.0-125m-hf ;;
35-
*)
36-
model_hack=$model ;;
37-
esac
38-
39-
local modelcomponents=$(echo $model_hack | cut -d '/' -f 2 | tr '[:upper:]' '[:lower:]' | $LLMDBENCH_CONTROL_SCMD -e 's^qwen^qwen-^g' -e 's^-^\n^g')
31+
local modelcomponents=$(echo $model | cut -d '/' -f 2 | tr '[:upper:]' '[:lower:]' | $LLMDBENCH_CONTROL_SCMD -e 's^qwen^qwen-^g' -e 's^-^\n^g')
4032
local provider=$(echo $model | cut -d '/' -f 1)
41-
local type=$(echo "${modelcomponents}" | grep -Ei "nstruct|hf|chat|speech|vision")
33+
local type=$(echo "${modelcomponents}" | grep -Ei "nstruct|hf|chat|speech|vision|opt")
4234
local parameters=$(echo "${modelcomponents}" | grep -Ei "[0-9].*b|[0-9].*m" | $LLMDBENCH_CONTROL_SCMD -e 's^a^^' -e 's^\.^p^')
4335
local majorversion=$(echo "${modelcomponents}" | grep -Ei "^[0-9]" | grep -Evi "b|E" | $LLMDBENCH_CONTROL_SCMD -e "s/$parameters//g" | cut -d '.' -f 1)
36+
if [[ -z $majorversion ]]; then
37+
local majorversion=1
38+
fi
4439
local kind=$(echo "${modelcomponents}" | head -n 1 | cut -d '/' -f 1)
4540
local as_label=$(echo $model | tr '[:upper:]' '[:lower:]' | $LLMDBENCH_CONTROL_SCMD -e "s^/^-^g")
4641
local label=$(echo ${kind}-${majorversion}-${parameters} | $LLMDBENCH_CONTROL_SCMD -e 's^-$^^g' -e 's^--^^g')
@@ -729,6 +724,10 @@ function run_step {
729724
source $script_path
730725
elif [[ ${!script_implementaton} == py ]]; then
731726
python3 $script_path
727+
local ec=$?
728+
if [[ $ec -ne 0 ]]; then
729+
exit $ec
730+
fi
732731
else
733732
announce "ERROR: Unsupported script type for \"$script_path\""
734733
fi
@@ -766,6 +765,8 @@ spec:
766765
- name: harness
767766
image: $(get_image ${LLMDBENCH_IMAGE_REGISTRY} ${LLMDBENCH_IMAGE_REPO} ${LLMDBENCH_IMAGE_NAME} ${LLMDBENCH_IMAGE_TAG})
768767
imagePullPolicy: Always
768+
securityContext:
769+
runAsUser: 0
769770
command: ["sh", "-c"]
770771
args:
771772
- "${LLMDBENCH_HARNESS_EXECUTABLE}"

setup/install_deps.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -152,4 +152,4 @@ for dep in $python_deps; do
152152
done
153153
echo "---------------------------"
154154

155-
popd &>/dev/null
155+
popd &>/dev/null

setup/run.sh

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -224,18 +224,19 @@ for method in ${LLMDBENCH_DEPLOY_METHODS//,/ }; do
224224
if [[ $LLMDBENCH_CONTROL_ENVIRONMENT_TYPE_STANDALONE_ACTIVE -eq 0 && $LLMDBENCH_CONTROL_ENVIRONMENT_TYPE_MODELSERVICE_ACTIVE -eq 0 ]]; then
225225
announce "🔍 Deployment method - $LLMDBENCH_DEPLOY_METHODS - is neither \"standalone\" nor \"modelservice\". Trying to find a matching endpoint name..."
226226
export LLMDBENCH_HARNESS_STACK_TYPE=vllm-prod
227-
export LLMDBENCH_HARNESS_STACK_ENDPOINT_NAME=$(${LLMDBENCH_CONTROL_KCMD} --namespace "$LLMDBENCH_VLLM_COMMON_NAMESPACE" get service --no-headers | awk '{print $1}' | grep -x ${LLMDBENCH_DEPLOY_METHODS} || true)
227+
export LLMDBENCH_HARNESS_STACK_ENDPOINT_NAME=$(${LLMDBENCH_CONTROL_KCMD} --namespace "$LLMDBENCH_VLLM_COMMON_NAMESPACE" get service --no-headers | awk '{print $1}' | grep ${LLMDBENCH_DEPLOY_METHODS} || true)
228228
if [[ ! -z $LLMDBENCH_HARNESS_STACK_ENDPOINT_NAME ]]; then
229229
export LLMDBENCH_HARNESS_STACK_ENDPOINT_PORT=$(${LLMDBENCH_CONTROL_KCMD} --namespace "$LLMDBENCH_VLLM_COMMON_NAMESPACE" get service/$LLMDBENCH_HARNESS_STACK_ENDPOINT_NAME --no-headers -o json | jq -r '.spec.ports[0].port')
230230
else
231-
export LLMDBENCH_HARNESS_STACK_ENDPOINT_NAME=$(${LLMDBENCH_CONTROL_KCMD} --namespace "$LLMDBENCH_VLLM_COMMON_NAMESPACE" get pod --no-headers | awk '{print $1}' | grep -x ${LLMDBENCH_DEPLOY_METHODS} | head -n 1 || true)
231+
export LLMDBENCH_HARNESS_STACK_ENDPOINT_NAME=$(${LLMDBENCH_CONTROL_KCMD} --namespace "$LLMDBENCH_VLLM_COMMON_NAMESPACE" get pod --no-headers | awk '{print $1}' | grep ${LLMDBENCH_DEPLOY_METHODS} | head -n 1 || true)
232232
export LLMDBENCH_VLLM_FQDN=
233233
if [[ ! -z $LLMDBENCH_HARNESS_STACK_ENDPOINT_NAME ]]; then
234234
announce "ℹ️ Stack Endpoint name detected is \"$LLMDBENCH_HARNESS_STACK_ENDPOINT_NAME\""
235235
export LLMDBENCH_HARNESS_STACK_ENDPOINT_PORT=$(${LLMDBENCH_CONTROL_KCMD} --namespace "$LLMDBENCH_VLLM_COMMON_NAMESPACE" get pod/$LLMDBENCH_HARNESS_STACK_ENDPOINT_NAME --no-headers -o json | jq -r ".spec.containers[0].ports[0].containerPort")
236236
export LLMDBENCH_HARNESS_STACK_ENDPOINT_NAME=$(${LLMDBENCH_CONTROL_KCMD} --namespace "$LLMDBENCH_VLLM_COMMON_NAMESPACE" get pod/$LLMDBENCH_HARNESS_STACK_ENDPOINT_NAME --no-headers -o json | jq -r ".status.podIP")
237237
fi
238238
fi
239+
export LLMDBENCH_DEPLOY_CURRENT_MODEL="auto"
239240
fi
240241

241242
if [[ $LLMDBENCH_CONTROL_DRY_RUN -eq 1 ]]; then
@@ -260,8 +261,11 @@ for method in ${LLMDBENCH_DEPLOY_METHODS//,/ }; do
260261
announce "ℹ️ Stack model detected is \"mock\""
261262
else
262263
received_model_name=$(get_model_name_from_pod $LLMDBENCH_VLLM_COMMON_NAMESPACE $(get_image ${LLMDBENCH_IMAGE_REGISTRY} ${LLMDBENCH_IMAGE_REPO} ${LLMDBENCH_IMAGE_NAME} ${LLMDBENCH_IMAGE_TAG}) ${LLMDBENCH_HARNESS_STACK_ENDPOINT_URL} 80)
263-
if [[ ${received_model_name} == ${LLMDBENCH_DEPLOY_CURRENT_MODEL} ]]; then
264+
if [[ $LLMDBENCH_DEPLOY_CURRENT_MODEL == "auto" ]]; then
265+
export LLMDBENCH_DEPLOY_CURRENT_MODEL=$received_model_name
264266
announce "ℹ️ Stack model detected is \"$received_model_name\""
267+
elif [[ ${received_model_name} == ${LLMDBENCH_DEPLOY_CURRENT_MODEL} ]]; then
268+
announce "ℹ️ Stack model detected is \"$received_model_name\", matches requested \"$LLMDBENCH_DEPLOY_CURRENT_MODEL\""
265269
else
266270
announce "❌ Stack model detected is \"$received_model_name\" (instead of $LLMDBENCH_DEPLOY_CURRENT_MODEL)!"
267271
exit 1

0 commit comments

Comments (0)