Skip to content

Commit b58823b

Browse files
authored
Enhances step 04, removes llm-d-deployer dep. (llm-d#145)
Signed-off-by: vezio <tyler.rimaldi@ibm.com>
1 parent 4358609 commit b58823b

File tree

2 files changed

+280
-44
lines changed

2 files changed

+280
-44
lines changed

setup/env.sh

Lines changed: 211 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ export LLMDBENCH_VLLM_COMMON_PVC_NAME=${LLMDBENCH_VLLM_COMMON_PVC_NAME:-"model-p
5656
# Defaults for the shared vLLM settings. Each variable keeps a value already
# present in the environment; the default is applied only when the variable is
# unset or empty.
: "${LLMDBENCH_VLLM_COMMON_PVC_STORAGE_CLASS:=default}"
: "${LLMDBENCH_VLLM_COMMON_PVC_MODEL_CACHE_SIZE:=300Gi}"
: "${LLMDBENCH_VLLM_COMMON_PVC_DOWNLOAD_TIMEOUT:=2400}"
# Key under which the Hugging Face token is stored inside the secret.
: "${LLMDBENCH_VLLM_COMMON_HF_TOKEN_KEY:=HF_TOKEN}"
# Name of the secret holding the Hugging Face token.
: "${LLMDBENCH_VLLM_COMMON_HF_TOKEN_NAME:=llm-d-hf-token}"
: "${LLMDBENCH_VLLM_COMMON_INFERENCE_PORT:=8000}"
: "${LLMDBENCH_VLLM_COMMON_FQDN:=.svc.cluster.local}"

export \
  LLMDBENCH_VLLM_COMMON_PVC_STORAGE_CLASS \
  LLMDBENCH_VLLM_COMMON_PVC_MODEL_CACHE_SIZE \
  LLMDBENCH_VLLM_COMMON_PVC_DOWNLOAD_TIMEOUT \
  LLMDBENCH_VLLM_COMMON_HF_TOKEN_KEY \
  LLMDBENCH_VLLM_COMMON_HF_TOKEN_NAME \
  LLMDBENCH_VLLM_COMMON_INFERENCE_PORT \
  LLMDBENCH_VLLM_COMMON_FQDN
@@ -474,6 +475,7 @@ function llmdbench_execute_cmd {
474475
local verbose=${3:-0}
475476
local silent=${4:-1}
476477
local attempts=${5:-1}
478+
local fatal=${6:-0}
477479
local counter=1
478480
local delay=10
479481

@@ -522,7 +524,16 @@ function llmdbench_execute_cmd {
522524
fi
523525

524526
set -euo pipefail
525-
return $ecode
527+
528+
if [[ ${fatal} -eq 1 ]];
529+
then
530+
if [[ ${ecode} -ne 0 ]]
531+
then
532+
exit ${ecode}
533+
fi
534+
fi
535+
536+
return ${ecode}
526537
}
527538
export -f llmdbench_execute_cmd
528539

@@ -676,3 +687,202 @@ function announce {
676687
fi
677688
}
678689
export -f announce
690+
691+
#######################################
# Abort the script when a mandatory value is missing.
# Arguments:
#   $1 - name of the variable (used only in the error message)
#   $2 - value to validate
# Outputs:
#   Reports the failure through `announce`.
# Returns:
#   0 when the value is non-empty; otherwise exits the shell with status 1.
#######################################
require_var() {
  local name="$1"
  local value="$2"

  # Guard clause: nothing to do when a value is present.
  [[ -n "${value}" ]] && return 0

  announce "❌ Required variable '${name}' is empty"
  exit 1
}
699+
export -f require_var
700+
701+
#######################################
# Create a namespace, or leave it in place when it already exists.
# The manifest is rendered client-side and piped into `apply`.
# Arguments:
#   $1 - kubectl-compatible command (may carry extra flags, hence unquoted use)
#   $2 - namespace to create
# Outputs:
#   Progress and failure messages through `announce`; command output is discarded.
# Returns:
#   0 on success; exits the shell with status 1 when the pipeline fails or the
#   namespace is empty.
#######################################
create_namespace() {
  local kube_cli="$1"
  local ns="$2"

  require_var "namespace" "${ns}"
  announce "📦 Creating namespace ${ns}..."

  # ${kube_cli} is intentionally unquoted so a command with arguments splits.
  if ! ${kube_cli} create namespace "${ns}" --dry-run=client -o yaml \
    | ${kube_cli} apply -f - &>/dev/null; then
    announce "❌ Failed to create/apply namespace ${ns}"
    exit 1
  fi

  announce "✅ Namespace ready"
}
712+
export -f create_namespace
713+
714+
#######################################
# Create (or replace) the generic secret that stores the Hugging Face token.
# Any existing secret of that name is deleted through llmdbench_execute_cmd,
# then a freshly rendered manifest is piped into `apply`.
# Globals:
#   LLMDBENCH_CONTROL_DRY_RUN, LLMDBENCH_CONTROL_VERBOSE (read; forwarded to
#   llmdbench_execute_cmd for the delete step)
# Arguments:
#   $1 - kubectl-compatible command (may carry extra flags, hence unquoted use)
#   $2 - namespace that holds the secret
#   $3 - secret name
#   $4 - key inside the secret under which the token is stored
#   $5 - Hugging Face token value
# Outputs:
#   Progress and failure messages through `announce`.
# Returns:
#   0 on success; exits the shell with status 1 when a required argument is
#   empty or the create/apply pipeline fails.
#######################################
create_or_update_hf_secret() {
  local kcmd="$1"
  local namespace="$2"
  local secret_name="$3"
  local secret_key="$4"
  local hf_token="$5"

  require_var "namespace" "${namespace}"
  require_var "secret_name" "${secret_name}"
  # An empty key would render --from-literal="=<token>", which cannot work;
  # fail early with a clear message instead.
  require_var "secret_key" "${secret_key}"
  require_var "hf_token" "${hf_token}"

  announce "🔐 Creating/updating HF token secret..."

  # Quoted so an empty dry-run/verbose value cannot shift the positional args.
  llmdbench_execute_cmd \
    "${kcmd} delete secret ${secret_name} -n ${namespace} --ignore-not-found" \
    "${LLMDBENCH_CONTROL_DRY_RUN}" "${LLMDBENCH_CONTROL_VERBOSE}"

  # NOTE(review): the token is passed on the command line and is therefore
  # visible in the process list while the command runs.
  if ! ${kcmd} create secret generic "${secret_name}" \
    --namespace "${namespace}" \
    --from-literal="${secret_key}=${hf_token}" \
    --dry-run=client -o yaml \
    | ${kcmd} apply -n "${namespace}" -f - &>/dev/null; then
    announce "❌ Failed to create/apply secret ${secret_name}"
    exit 1
  fi

  announce "✅ HF token secret created"
}
737+
export -f create_or_update_hf_secret
738+
739+
#
740+
# vLLM Model Download Utilities
741+
#
742+
743+
#######################################
# Validate the inputs for the model cache and apply a PersistentVolumeClaim.
# Checks that the model id looks like <org>/<repo> and that the storage class
# exists, writes the PVC manifest into the work directory, then applies it.
# Globals:
#   LLMDBENCH_CONTROL_WORK_DIR, LLMDBENCH_CURRENT_STEP (read; manifest location)
#   LLMDBENCH_CONTROL_DRY_RUN, LLMDBENCH_CONTROL_VERBOSE (read; forwarded to
#   llmdbench_execute_cmd)
# Arguments:
#   $1 - kubectl-compatible command (may carry extra flags, hence unquoted use)
#   $2 - namespace in which the PVC is applied
#   $3 - model id in Hugging Face format <org>/<repo>
#   $4 - PVC name
#   $5 - requested storage size (e.g. 300Gi)
#   $6 - storage class name
# Outputs:
#   Progress and failure messages through `announce`; writes the manifest file.
# Returns:
#   Status of the apply step; exits the shell with status 1 on invalid input
#   or a missing storage class.
#######################################
validate_and_create_pvc() {
  local kcmd="$1"
  local namespace="$2"
  local download_model="$3"
  local pvc_name="$4"
  local pvc_size="$5"
  local pvc_class="$6"
  local pvc_yaml="${LLMDBENCH_CONTROL_WORK_DIR}/setup/yamls/${LLMDBENCH_CURRENT_STEP}_storage_pvc_setup.yaml"

  # The namespace ends up in "apply -n <namespace>", so it must not be empty.
  require_var "namespace" "${namespace}"
  require_var "download_model" "${download_model}"
  require_var "pvc_name" "${pvc_name}"
  require_var "pvc_size" "${pvc_size}"
  require_var "pvc_class" "${pvc_class}"

  announce "💾 Provisioning model storage…"

  if [[ "${download_model}" != */* ]]; then
    announce "❌ '${download_model}' is not in Hugging Face format <org>/<repo>"
    exit 1
  fi

  announce "🔍 Checking storage class '${pvc_class}'..."
  if ! ${kcmd} get storageclass "${pvc_class}" &>/dev/null; then
    announce "❌ StorageClass '${pvc_class}' not found"
    exit 1
  fi

  cat <<EOF >"${pvc_yaml}"
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: ${pvc_name}
spec:
  accessModes:
    - ReadWriteMany
  resources:
    requests:
      storage: ${pvc_size}
  storageClassName: ${pvc_class}
  volumeMode: Filesystem
EOF

  # Use the command handed in by the caller (not the global) so the storage
  # class check and the apply always target the same cluster/CLI.
  # Trailing "1 1 1" = silent, single attempt, fatal on failure.
  llmdbench_execute_cmd \
    "${kcmd} apply -n ${namespace} -f ${pvc_yaml}" \
    "${LLMDBENCH_CONTROL_DRY_RUN}" "${LLMDBENCH_CONTROL_VERBOSE}" 1 1 1
}
786+
export -f validate_and_create_pvc
787+
788+
#######################################
# Render and apply the "download-model" Job that pulls a Hugging Face model
# into the model-cache PVC.
# Globals:
#   LLMDBENCH_CONTROL_WORK_DIR, LLMDBENCH_CURRENT_STEP (read; manifest location)
#   LLMDBENCH_CONTROL_DRY_RUN, LLMDBENCH_CONTROL_VERBOSE (read; forwarded to
#   llmdbench_execute_cmd)
#   LLMDBENCH_VLLM_COMMON_HF_TOKEN_KEY (read; default for the secret key)
# Arguments:
#   $1 - kubectl-compatible command
#   $2 - namespace in which the Job is applied
#   $3 - name of the secret holding the Hugging Face token
#   $4 - model id to download (<org>/<repo>)
#   $5 - directory, relative to the PVC mount, that receives the model
#   $6 - PVC name
#   $7 - (optional) key inside the secret; defaults to
#        LLMDBENCH_VLLM_COMMON_HF_TOKEN_KEY, then to HF_TOKEN
# Outputs:
#   Progress messages through `announce`; writes the manifest file.
# Returns:
#   Status of the apply step; exits the shell with status 1 when a required
#   argument is empty.
#######################################
launch_download_job() {
  local kcmd="$1"
  local namespace="$2"
  local secret_name="$3"
  local download_model="$4"
  local model_path="$5"
  local pvc_name="$6"
  # The secret is created under a configurable key, so the Job must reference
  # the same key instead of a hard-coded one.
  local secret_key="${7:-${LLMDBENCH_VLLM_COMMON_HF_TOKEN_KEY:-HF_TOKEN}}"
  local job_yaml="${LLMDBENCH_CONTROL_WORK_DIR}/setup/yamls/${LLMDBENCH_CURRENT_STEP}_download_pod_job.yaml"

  require_var "namespace" "${namespace}"
  require_var "secret_name" "${secret_name}"
  require_var "download_model" "${download_model}"
  require_var "model_path" "${model_path}"
  require_var "pvc_name" "${pvc_name}"

  announce "🚀 Launching model download job..."

  # Escaped \${...} references are left for the container's shell to expand;
  # unescaped ones are filled in here. imagePullPolicy is set on the container,
  # which is where the Kubernetes API defines that field.
  cat <<EOF >"${job_yaml}"
apiVersion: batch/v1
kind: Job
metadata:
  name: download-model
spec:
  template:
    spec:
      containers:
        - name: downloader
          image: python:3.10
          imagePullPolicy: IfNotPresent
          command: ["/bin/sh", "-c"]
          args:
            - mkdir -p "\${MOUNT_PATH}/\${MODEL_PATH}" && \
              pip install huggingface_hub && \
              export PATH="\${PATH}:\${HOME}/.local/bin" && \
              huggingface-cli login --token "\${HF_TOKEN}" && \
              huggingface-cli download "\${HF_MODEL_ID}" --local-dir "\${MOUNT_PATH}/\${MODEL_PATH}"
          env:
            - name: MODEL_PATH
              value: "${model_path}"
            - name: HF_MODEL_ID
              value: "${download_model}"
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  name: ${secret_name}
                  key: ${secret_key}
            - name: HF_HOME
              value: /tmp/huggingface
            - name: HOME
              value: /tmp
            - name: MOUNT_PATH
              value: /cache
          volumeMounts:
            - name: model-cache
              mountPath: /cache
      restartPolicy: OnFailure
      volumes:
        - name: model-cache
          persistentVolumeClaim:
            claimName: ${pvc_name}
EOF

  # Use the command handed in by the caller rather than the global one.
  # Trailing "1 1 1" = silent, single attempt, fatal on failure.
  llmdbench_execute_cmd \
    "${kcmd} apply -n ${namespace} -f ${job_yaml}" \
    "${LLMDBENCH_CONTROL_DRY_RUN}" "${LLMDBENCH_CONTROL_VERBOSE}" 1 1 1
}
850+
export -f launch_download_job
851+
852+
#######################################
# Wait for the "download-model" Job: first for its pod to become Ready, then
# for the Job to complete. On failure the Job logs are dumped and the script
# exits.
# Globals:
#   LLMDBENCH_CONTROL_DRY_RUN, LLMDBENCH_CONTROL_VERBOSE (read; forwarded to
#   llmdbench_execute_cmd)
# Arguments:
#   $1 - kubectl-compatible command
#   $2 - namespace of the Job
#   $3 - completion timeout in seconds
# Outputs:
#   Progress and failure messages through `announce`.
# Returns:
#   0 once the Job completed; exits the shell with status 1 when the pod does
#   not become Ready or the Job fails / times out.
#######################################
wait_for_download_job() {
  local kcmd="$1"
  local namespace="$2"
  local timeout="$3"

  require_var "namespace" "${namespace}"
  require_var "timeout" "${timeout}"

  announce "⏳ Waiting for pod to start model download job ..."
  local pod_name
  # Lookup errors are deliberately ignored; an empty name is handled below.
  pod_name="$(${kcmd} get pod --selector=job-name=download-model -n "${namespace}" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)"

  if [[ -z "${pod_name}" ]]; then
    announce "🙀 No pod found for the job. Exiting..."
    # fatal=1: the script stops here when fetching the logs fails.
    # NOTE(review): if this command succeeds, execution continues with an
    # empty pod name and the Ready wait below reports the failure.
    llmdbench_execute_cmd \
      "${kcmd} logs job/download-model -n ${namespace}" \
      "${LLMDBENCH_CONTROL_DRY_RUN}" "${LLMDBENCH_CONTROL_VERBOSE}" 1 1 1
  fi

  # The status is tested directly in the `if`: llmdbench_execute_cmd enables
  # `set -e`, so a separate `$?` check afterwards would never be reached and
  # the diagnostics below would be skipped.
  if ! llmdbench_execute_cmd \
    "${kcmd} wait --for=condition=Ready pod/${pod_name} --timeout=60s -n ${namespace}" \
    "${LLMDBENCH_CONTROL_DRY_RUN}" "${LLMDBENCH_CONTROL_VERBOSE}"; then
    announce "🙀 Pod did not become Ready"
    # Best-effort log dump (not silent, not fatal); its status must not
    # replace the exit code below.
    llmdbench_execute_cmd \
      "${kcmd} logs job/download-model -n ${namespace}" \
      "${LLMDBENCH_CONTROL_DRY_RUN}" "${LLMDBENCH_CONTROL_VERBOSE}" 0 1 0 || true
    exit 1
  fi

  announce "⏳ Waiting up to ${timeout}s for job to complete..."
  if ! llmdbench_execute_cmd \
    "${kcmd} wait --for=condition=complete --timeout=${timeout}s job/download-model -n ${namespace}" \
    "${LLMDBENCH_CONTROL_DRY_RUN}" "${LLMDBENCH_CONTROL_VERBOSE}"; then
    announce "🙀 Download job failed or timed out"
    llmdbench_execute_cmd \
      "${kcmd} logs job/download-model -n ${namespace}" \
      "${LLMDBENCH_CONTROL_DRY_RUN}" "${LLMDBENCH_CONTROL_VERBOSE}" 0 1 0 || true
    exit 1
  fi

  announce "✅ Model downloaded"
}
888+
export -f wait_for_download_job
Lines changed: 69 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1,44 +1,70 @@
11
#!/usr/bin/env bash
2-
source ${LLMDBENCH_CONTROL_DIR}/env.sh
3-
4-
announce "🔍 Checking if \"${LLMDBENCH_VLLM_COMMON_NAMESPACE}\" is prepared."
5-
check_storage_class_and_affinity
6-
llmdbench_execute_cmd "${LLMDBENCH_CONTROL_KCMD} --namespace ${LLMDBENCH_VLLM_COMMON_NAMESPACE} delete job download-model --ignore-not-found" ${LLMDBENCH_CONTROL_DRY_RUN} ${LLMDBENCH_CONTROL_VERBOSE}
7-
8-
for model in ${LLMDBENCH_DEPLOY_MODEL_LIST//,/ }; do
9-
modelfn=$(echo ${model} | ${LLMDBENCH_CONTROL_SCMD} 's^/^___^g' )
10-
cat << EOF > $LLMDBENCH_CONTROL_WORK_DIR/setup/yamls/${LLMDBENCH_CURRENT_STEP}_prepare_namespace_${modelfn}.yaml
11-
sampleApplication:
12-
enabled: true
13-
baseConfigMapRefName: basic-gpu-with-nixl-and-redis-lookup-preset
14-
model:
15-
modelArtifactURI: pvc://$LLMDBENCH_VLLM_COMMON_PVC_NAME/models/$(model_attribute $model model)
16-
modelName: "$(model_attribute $model model)"
17-
EOF
18-
19-
llmd_opts="--namespace ${LLMDBENCH_VLLM_COMMON_NAMESPACE} --skip-infra --download-pvc-only --storage-class ${LLMDBENCH_VLLM_COMMON_PVC_STORAGE_CLASS} --storage-size ${LLMDBENCH_VLLM_COMMON_PVC_MODEL_CACHE_SIZE} --download-model $(model_attribute $model model) --download-timeout ${LLMDBENCH_VLLM_COMMON_PVC_DOWNLOAD_TIMEOUT} --values-file $LLMDBENCH_CONTROL_WORK_DIR/setup/yamls/${LLMDBENCH_CURRENT_STEP}_prepare_namespace_${modelfn}.yaml --context $LLMDBENCH_CONTROL_WORK_DIR/environment/context.ctx"
20-
announce "🚀 Calling llm-d-deployer with options \"${llmd_opts}\"..."
21-
pushd $LLMDBENCH_DEPLOYER_DIR/llm-d-deployer/quickstart &>/dev/null
22-
llmdbench_execute_cmd "cd $LLMDBENCH_DEPLOYER_DIR/llm-d-deployer/quickstart; export HF_TOKEN=$LLMDBENCH_HF_TOKEN; ./llmd-installer.sh $llmd_opts" ${LLMDBENCH_CONTROL_DRY_RUN} ${LLMDBENCH_CONTROL_VERBOSE} 0 3
23-
popd &>/dev/null
24-
announce "✅ llm-d-deployer prepared namespace"
25-
done
26-
27-
if [[ $LLMDBENCH_CONTROL_DEPLOY_IS_OPENSHIFT -eq 1 ]]; then
28-
llmdbench_execute_cmd "${LLMDBENCH_CONTROL_KCMD} \
29-
adm \
30-
policy \
31-
add-scc-to-user \
32-
anyuid \
33-
-z ${LLMDBENCH_VLLM_COMMON_SERVICE_ACCOUNT} \
34-
-n $LLMDBENCH_VLLM_COMMON_NAMESPACE" ${LLMDBENCH_CONTROL_DRY_RUN} ${LLMDBENCH_CONTROL_VERBOSE}
35-
36-
llmdbench_execute_cmd "${LLMDBENCH_CONTROL_KCMD} \
37-
adm \
38-
policy \
39-
add-scc-to-user \
40-
privileged \
41-
-z ${LLMDBENCH_VLLM_COMMON_SERVICE_ACCOUNT} \
42-
-n $LLMDBENCH_VLLM_COMMON_NAMESPACE" ${LLMDBENCH_CONTROL_DRY_RUN} ${LLMDBENCH_CONTROL_VERBOSE}
43-
fi
44-
announce "✅ Namespace \"${LLMDBENCH_VLLM_COMMON_NAMESPACE}\" prepared."
2+
source "${LLMDBENCH_CONTROL_DIR}/env.sh"
3+
4+
#######################################
# Step entry point: for every model in LLMDBENCH_DEPLOY_MODEL_LIST, make sure
# the namespace, the Hugging Face token secret and the model PVC exist, run
# the download Job and wait for it. On OpenShift the service account is also
# added to the anyuid and privileged SCCs.
# Globals (read):
#   LLMDBENCH_VLLM_COMMON_NAMESPACE, LLMDBENCH_DEPLOY_MODEL_LIST,
#   LLMDBENCH_CONTROL_KCMD, LLMDBENCH_CONTROL_DRY_RUN,
#   LLMDBENCH_CONTROL_VERBOSE, LLMDBENCH_VLLM_COMMON_PVC_NAME,
#   LLMDBENCH_VLLM_COMMON_HF_TOKEN_NAME, LLMDBENCH_VLLM_COMMON_HF_TOKEN_KEY,
#   LLMDBENCH_HF_TOKEN, LLMDBENCH_VLLM_COMMON_PVC_MODEL_CACHE_SIZE,
#   LLMDBENCH_VLLM_COMMON_PVC_STORAGE_CLASS,
#   LLMDBENCH_VLLM_COMMON_PVC_DOWNLOAD_TIMEOUT,
#   LLMDBENCH_CONTROL_DEPLOY_IS_OPENSHIFT,
#   LLMDBENCH_VLLM_COMMON_SERVICE_ACCOUNT
# Returns:
#   0 on success; exits the shell with a non-zero status when a step fails.
#######################################
main() {
  local model
  local download_model
  local model_artifact_uri
  local pvc_and_model_path
  local pvc_name
  local model_path

  announce "🔍 Checking if \"${LLMDBENCH_VLLM_COMMON_NAMESPACE}\" is prepared."
  # Tested directly: with errexit active a separate `$?` check would never run.
  if ! check_storage_class_and_affinity; then
    announce "❌ Failed to check storage class and affinity"
    exit 1
  fi

  # The list is comma separated; unquoted on purpose so it splits into words.
  for model in ${LLMDBENCH_DEPLOY_MODEL_LIST//,/ }; do
    # Remove any Job left over from a previous run or a previous model.
    llmdbench_execute_cmd \
      "${LLMDBENCH_CONTROL_KCMD} --namespace ${LLMDBENCH_VLLM_COMMON_NAMESPACE} delete job download-model --ignore-not-found" \
      "${LLMDBENCH_CONTROL_DRY_RUN}" "${LLMDBENCH_CONTROL_VERBOSE}"

    # The helper's exit status is ignored, as it was when the assignment was
    # combined with `local`; the value itself is validated right after.
    download_model="$(model_attribute "${model}" model)" || true
    require_var "download_model" "${download_model}"

    # pvc://<pvc-name>/<path inside the PVC>
    model_artifact_uri="pvc://${LLMDBENCH_VLLM_COMMON_PVC_NAME}/models/${download_model}"
    pvc_and_model_path="${model_artifact_uri#*://}"
    pvc_name="${pvc_and_model_path%%/*}"
    model_path="${pvc_and_model_path#*/}"

    create_namespace "${LLMDBENCH_CONTROL_KCMD}" "${LLMDBENCH_VLLM_COMMON_NAMESPACE}"
    create_or_update_hf_secret \
      "${LLMDBENCH_CONTROL_KCMD}" \
      "${LLMDBENCH_VLLM_COMMON_NAMESPACE}" \
      "${LLMDBENCH_VLLM_COMMON_HF_TOKEN_NAME}" \
      "${LLMDBENCH_VLLM_COMMON_HF_TOKEN_KEY}" \
      "${LLMDBENCH_HF_TOKEN}"

    validate_and_create_pvc \
      "${LLMDBENCH_CONTROL_KCMD}" \
      "${LLMDBENCH_VLLM_COMMON_NAMESPACE}" \
      "${download_model}" \
      "${pvc_name}" \
      "${LLMDBENCH_VLLM_COMMON_PVC_MODEL_CACHE_SIZE}" \
      "${LLMDBENCH_VLLM_COMMON_PVC_STORAGE_CLASS}"

    launch_download_job \
      "${LLMDBENCH_CONTROL_KCMD}" \
      "${LLMDBENCH_VLLM_COMMON_NAMESPACE}" \
      "${LLMDBENCH_VLLM_COMMON_HF_TOKEN_NAME}" \
      "${download_model}" \
      "${model_path}" \
      "${pvc_name}"

    wait_for_download_job \
      "${LLMDBENCH_CONTROL_KCMD}" \
      "${LLMDBENCH_VLLM_COMMON_NAMESPACE}" \
      "${LLMDBENCH_VLLM_COMMON_PVC_DOWNLOAD_TIMEOUT}"

    announce "✅ llm-d-deployer prepared namespace"

    if [[ "${LLMDBENCH_CONTROL_DEPLOY_IS_OPENSHIFT}" -eq 1 ]]; then
      # Trailing "1 1 1" = silent, single attempt, fatal on failure.
      llmdbench_execute_cmd \
        "${LLMDBENCH_CONTROL_KCMD} adm policy add-scc-to-user anyuid -z ${LLMDBENCH_VLLM_COMMON_SERVICE_ACCOUNT} -n ${LLMDBENCH_VLLM_COMMON_NAMESPACE}" \
        "${LLMDBENCH_CONTROL_DRY_RUN}" "${LLMDBENCH_CONTROL_VERBOSE}" 1 1 1

      llmdbench_execute_cmd \
        "${LLMDBENCH_CONTROL_KCMD} adm policy add-scc-to-user privileged -z ${LLMDBENCH_VLLM_COMMON_SERVICE_ACCOUNT} -n ${LLMDBENCH_VLLM_COMMON_NAMESPACE}" \
        "${LLMDBENCH_CONTROL_DRY_RUN}" "${LLMDBENCH_CONTROL_VERBOSE}" 1 1 1
    fi

    announce "✅ Namespace \"${LLMDBENCH_VLLM_COMMON_NAMESPACE}\" prepared."
  done

  return 0
}
69+
70+
main

0 commit comments

Comments
 (0)