@@ -56,6 +56,7 @@ export LLMDBENCH_VLLM_COMMON_PVC_NAME=${LLMDBENCH_VLLM_COMMON_PVC_NAME:-"model-p
# Defaults shared by the vLLM deployment steps: model-cache PVC settings,
# Hugging Face token secret (name and key), inference port and service FQDN
# suffix. A value already present in the environment takes precedence; the
# default is used only when the variable is unset or empty.
export LLMDBENCH_VLLM_COMMON_PVC_STORAGE_CLASS="${LLMDBENCH_VLLM_COMMON_PVC_STORAGE_CLASS:-default}"
export LLMDBENCH_VLLM_COMMON_PVC_MODEL_CACHE_SIZE="${LLMDBENCH_VLLM_COMMON_PVC_MODEL_CACHE_SIZE:-300Gi}"
export LLMDBENCH_VLLM_COMMON_PVC_DOWNLOAD_TIMEOUT="${LLMDBENCH_VLLM_COMMON_PVC_DOWNLOAD_TIMEOUT:-2400}"
export LLMDBENCH_VLLM_COMMON_HF_TOKEN_KEY="${LLMDBENCH_VLLM_COMMON_HF_TOKEN_KEY:-HF_TOKEN}"
export LLMDBENCH_VLLM_COMMON_HF_TOKEN_NAME="${LLMDBENCH_VLLM_COMMON_HF_TOKEN_NAME:-llm-d-hf-token}"
export LLMDBENCH_VLLM_COMMON_INFERENCE_PORT="${LLMDBENCH_VLLM_COMMON_INFERENCE_PORT:-8000}"
export LLMDBENCH_VLLM_COMMON_FQDN="${LLMDBENCH_VLLM_COMMON_FQDN:-.svc.cluster.local}"
@@ -474,6 +475,7 @@ function llmdbench_execute_cmd {
474475 local verbose=${3:- 0}
475476 local silent=${4:- 1}
476477 local attempts=${5:- 1}
478+ local fatal=${6:- 0}
477479 local counter=1
478480 local delay=10
479481
@@ -522,7 +524,16 @@ function llmdbench_execute_cmd {
522524 fi
523525
524526 set -euo pipefail
525- return $ecode
527+
528+ if [[ ${fatal} -eq 1 ]];
529+ then
530+ if [[ ${ecode} -ne 0 ]]
531+ then
532+ exit ${ecode}
533+ fi
534+ fi
535+
536+ return ${ecode}
526537}
527538export -f llmdbench_execute_cmd
528539
@@ -676,3 +687,202 @@ function announce {
676687 fi
677688}
678689export -f announce
690+
#######################################
# Abort the script when a required value is empty.
# Arguments:
#   $1 - name of the variable (used only in the error message)
#   $2 - value to check
# Outputs:
#   Reports the failure through announce.
# Returns:
#   0 when the value is non-empty; otherwise exits the shell with status 1.
#######################################
require_var() {
  # ${N:-} keeps the check usable under `set -u` when an argument is omitted.
  local var_name="${1:-}"
  local var_value="${2:-}"
  if [[ -z "${var_value}" ]]; then
    announce "❌ Required variable '${var_name}' is empty"
    exit 1
  fi
}
export -f require_var
700+
#######################################
# Create a Kubernetes namespace, or leave it untouched if it already exists,
# by rendering the manifest client-side and piping it into `apply`.
# Arguments:
#   $1 - kubectl-compatible command used for both pipeline stages
#   $2 - namespace to create
# Outputs:
#   Progress and failure messages through announce; the output of the apply
#   stage is discarded.
# Returns:
#   0 on success; exits the shell with status 1 when an argument is empty or
#   the create/apply pipeline fails.
#######################################
create_namespace() {
  local kcmd="${1:-}"
  local namespace="${2:-}"
  require_var "kcmd" "${kcmd}"
  require_var "namespace" "${namespace}"
  announce "📦 Creating namespace ${namespace}..."
  # ${kcmd} is left unquoted on purpose so a command carrying extra flags is
  # split into words — presumably callers rely on this; verify against caller.
  # shellcheck disable=SC2086
  if ! ${kcmd} create namespace "${namespace}" --dry-run=client -o yaml \
    | ${kcmd} apply -f - &>/dev/null; then
    announce "❌ Failed to create/apply namespace ${namespace}"
    exit 1
  fi
  announce "✅ Namespace ready"
}
export -f create_namespace
713+
#######################################
# Create (or replace) the generic secret that holds the Hugging Face token.
# Any existing secret of the same name is deleted through
# llmdbench_execute_cmd, then a fresh one is rendered client-side and applied.
# Globals:
#   LLMDBENCH_CONTROL_DRY_RUN (read), LLMDBENCH_CONTROL_VERBOSE (read)
# Arguments:
#   $1 - kubectl-compatible command
#   $2 - namespace that owns the secret
#   $3 - secret name
#   $4 - key inside the secret under which the token is stored
#   $5 - Hugging Face token value
# Outputs:
#   Progress and failure messages through announce.
# Returns:
#   0 on success; exits the shell with status 1 when an argument is empty or
#   the create/apply pipeline fails.
#######################################
create_or_update_hf_secret() {
  local kcmd="${1:-}"
  local namespace="${2:-}"
  local secret_name="${3:-}"
  local secret_key="${4:-}"
  local hf_token="${5:-}"

  require_var "kcmd" "${kcmd}"
  require_var "namespace" "${namespace}"
  require_var "secret_name" "${secret_name}"
  # An empty key would produce "--from-literal==<token>", so reject it early.
  require_var "secret_key" "${secret_key}"
  require_var "hf_token" "${hf_token}"

  announce "🔐 Creating/updating HF token secret..."

  llmdbench_execute_cmd "${kcmd} delete secret ${secret_name} -n ${namespace} --ignore-not-found" \
    "${LLMDBENCH_CONTROL_DRY_RUN}" "${LLMDBENCH_CONTROL_VERBOSE}"

  # NOTE(review): the token is passed on the command line and is therefore
  # visible in the process list while the command runs.
  # ${kcmd} is left unquoted on purpose so a command carrying extra flags is
  # split into words — presumably callers rely on this; verify against caller.
  # shellcheck disable=SC2086
  if ! ${kcmd} create secret generic "${secret_name}" \
    --namespace "${namespace}" \
    --from-literal="${secret_key}=${hf_token}" \
    --dry-run=client -o yaml \
    | ${kcmd} apply -n "${namespace}" -f - &>/dev/null; then
    announce "❌ Failed to create/apply secret ${secret_name}"
    exit 1
  fi
  announce "✅ HF token secret created"
}
export -f create_or_update_hf_secret
738+
#
# vLLM Model Download Utilities
#
742+
#######################################
# Validate the model-cache inputs and create the PersistentVolumeClaim that
# will hold the downloaded model. The PVC manifest is written to the work
# directory and applied through llmdbench_execute_cmd in fatal mode.
# Globals:
#   LLMDBENCH_CONTROL_KCMD (read, fallback when $1 is empty)
#   LLMDBENCH_CONTROL_WORK_DIR (read), LLMDBENCH_CURRENT_STEP (read)
#   LLMDBENCH_CONTROL_DRY_RUN (read), LLMDBENCH_CONTROL_VERBOSE (read)
# Arguments:
#   $1 - kubectl-compatible command
#   $2 - namespace in which the PVC is created
#   $3 - model identifier in Hugging Face "<org>/<repo>" form
#   $4 - PVC name
#   $5 - requested storage size (e.g. 300Gi)
#   $6 - storage class name; it must already exist in the cluster
# Outputs:
#   Writes <work_dir>/setup/yamls/<step>_storage_pvc_setup.yaml; progress and
#   failure messages go through announce.
# Returns:
#   Exit status of the apply step; exits the shell with status 1 when an
#   argument is empty, the model id is malformed, or the storage class is
#   missing.
#######################################
validate_and_create_pvc() {
  # The apply step used to run the global command regardless of $1; keep the
  # global as a fallback so existing callers behave the same.
  local kcmd="${1:-${LLMDBENCH_CONTROL_KCMD:-}}"
  local namespace="${2:-}"
  local download_model="${3:-}"
  local pvc_name="${4:-}"
  local pvc_size="${5:-}"
  local pvc_class="${6:-}"
  local pvc_yaml="${LLMDBENCH_CONTROL_WORK_DIR}/setup/yamls/${LLMDBENCH_CURRENT_STEP}_storage_pvc_setup.yaml"

  require_var "kcmd" "${kcmd}"
  require_var "namespace" "${namespace}"
  require_var "download_model" "${download_model}"
  require_var "pvc_name" "${pvc_name}"
  require_var "pvc_size" "${pvc_size}"
  require_var "pvc_class" "${pvc_class}"

  announce "💾 Provisioning model storage…"

  # Both the <org> and the <repo> part must be non-empty.
  if [[ "${download_model}" != ?*/?* ]]; then
    announce "❌ '${download_model}' is not in Hugging Face format <org>/<repo>"
    exit 1
  fi

  announce "🔍 Checking storage class '${pvc_class}'..."
  # ${kcmd} is left unquoted on purpose so a command carrying extra flags is
  # split into words — presumably callers rely on this; verify against caller.
  # shellcheck disable=SC2086
  if ! ${kcmd} get storageclass "${pvc_class}" &>/dev/null; then
    announce "❌ StorageClass '${pvc_class}' not found"
    exit 1
  fi

  cat << EOF > "${pvc_yaml}"
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: ${pvc_name}
spec:
  accessModes:
    - ReadWriteMany
  resources:
    requests:
      storage: ${pvc_size}
  storageClassName: ${pvc_class}
  volumeMode: Filesystem
EOF

  # Trailing "1 1 1" = silent, one attempt, fatal on failure.
  llmdbench_execute_cmd "${kcmd} apply -n ${namespace} -f ${pvc_yaml}" \
    "${LLMDBENCH_CONTROL_DRY_RUN}" "${LLMDBENCH_CONTROL_VERBOSE}" 1 1 1
}
export -f validate_and_create_pvc
787+
#######################################
# Render and apply the "download-model" Job, which installs huggingface_hub
# in a python:3.10 container and downloads the model into the PVC mounted at
# /cache.
# Globals:
#   LLMDBENCH_CONTROL_KCMD (read, fallback when $1 is empty)
#   LLMDBENCH_VLLM_COMMON_HF_TOKEN_KEY (read, default for $7)
#   LLMDBENCH_CONTROL_WORK_DIR (read), LLMDBENCH_CURRENT_STEP (read)
#   LLMDBENCH_CONTROL_DRY_RUN (read), LLMDBENCH_CONTROL_VERBOSE (read)
# Arguments:
#   $1 - kubectl-compatible command
#   $2 - namespace in which the Job runs
#   $3 - name of the secret holding the Hugging Face token
#   $4 - model identifier passed to `huggingface-cli download`
#   $5 - directory, relative to the mount point, that receives the model
#   $6 - name of the PVC mounted at /cache
#   $7 - (optional) key inside the secret; defaults to
#        LLMDBENCH_VLLM_COMMON_HF_TOKEN_KEY, then to HF_TOKEN
# Outputs:
#   Writes <work_dir>/setup/yamls/<step>_download_pod_job.yaml; progress
#   messages go through announce.
# Returns:
#   Exit status of the apply step; exits the shell with status 1 when a
#   required argument is empty.
#######################################
launch_download_job() {
  # The apply step used to run the global command regardless of $1; keep the
  # global as a fallback so existing callers behave the same.
  local kcmd="${1:-${LLMDBENCH_CONTROL_KCMD:-}}"
  local namespace="${2:-}"
  local secret_name="${3:-}"
  local download_model="${4:-}"
  local model_path="${5:-}"
  local pvc_name="${6:-}"
  # The key was previously hard-coded to HF_TOKEN although the secret key is
  # configurable; follow the configured key so the Job can read the secret.
  local secret_key="${7:-${LLMDBENCH_VLLM_COMMON_HF_TOKEN_KEY:-HF_TOKEN}}"
  local job_yaml="${LLMDBENCH_CONTROL_WORK_DIR}/setup/yamls/${LLMDBENCH_CURRENT_STEP}_download_pod_job.yaml"

  require_var "kcmd" "${kcmd}"
  require_var "namespace" "${namespace}"
  require_var "secret_name" "${secret_name}"
  require_var "download_model" "${download_model}"
  require_var "model_path" "${model_path}"
  require_var "pvc_name" "${pvc_name}"

  announce "🚀 Launching model download job..."

  # Unquoted heredoc: \${...} is written literally for the container shell,
  # and each trailing backslash joins the next line, so "args" ends up as a
  # single one-line YAML scalar. imagePullPolicy is a container-level field,
  # so it is emitted under the container rather than next to restartPolicy.
  cat << EOF > "${job_yaml}"
apiVersion: batch/v1
kind: Job
metadata:
  name: download-model
spec:
  template:
    spec:
      containers:
        - name: downloader
          image: python:3.10
          imagePullPolicy: IfNotPresent
          command: ["/bin/sh", "-c"]
          args:
            - mkdir -p "\${MOUNT_PATH}/\${MODEL_PATH}" && \
              pip install huggingface_hub && \
              export PATH="\${PATH}:\${HOME}/.local/bin" && \
              huggingface-cli login --token "\${HF_TOKEN}" && \
              huggingface-cli download "\${HF_MODEL_ID}" --local-dir "\${MOUNT_PATH}/\${MODEL_PATH}"
          env:
            - name: MODEL_PATH
              value: ${model_path}
            - name: HF_MODEL_ID
              value: ${download_model}
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  name: ${secret_name}
                  key: ${secret_key}
            - name: HF_HOME
              value: /tmp/huggingface
            - name: HOME
              value: /tmp
            - name: MOUNT_PATH
              value: /cache
          volumeMounts:
            - name: model-cache
              mountPath: /cache
      restartPolicy: OnFailure
      volumes:
        - name: model-cache
          persistentVolumeClaim:
            claimName: ${pvc_name}
EOF

  # Trailing "1 1 1" = silent, one attempt, fatal on failure.
  llmdbench_execute_cmd "${kcmd} apply -n ${namespace} -f ${job_yaml}" \
    "${LLMDBENCH_CONTROL_DRY_RUN}" "${LLMDBENCH_CONTROL_VERBOSE}" 1 1 1
}
export -f launch_download_job
851+
#######################################
# Wait for the "download-model" Job: locate its pod, wait up to 60s for the
# pod to become Ready, then wait up to <timeout> seconds for the Job to
# complete. On any failure the Job logs are requested before exiting.
# Globals:
#   LLMDBENCH_CONTROL_DRY_RUN (read), LLMDBENCH_CONTROL_VERBOSE (read)
# Arguments:
#   $1 - kubectl-compatible command
#   $2 - namespace in which the Job runs
#   $3 - completion timeout in seconds
# Outputs:
#   Progress and failure messages through announce.
# Returns:
#   0 when the Job completes; exits the shell with status 1 when an argument
#   is empty, no pod is found, the pod does not become Ready, or the Job
#   fails or times out.
#######################################
wait_for_download_job() {
  local kcmd="${1:-}"
  local namespace="${2:-}"
  local timeout="${3:-}"
  local dry_run="${LLMDBENCH_CONTROL_DRY_RUN}"
  local verbose="${LLMDBENCH_CONTROL_VERBOSE}"
  local logs_cmd="${kcmd} logs job/download-model -n ${namespace}"

  require_var "kcmd" "${kcmd}"
  require_var "namespace" "${namespace}"
  require_var "timeout" "${timeout}"

  announce "⏳ Waiting for pod to start model download job ..."
  local pod_name
  # A failed lookup (e.g. empty item list) is treated as "no pod".
  # ${kcmd} is left unquoted on purpose so a command carrying extra flags is
  # split into words — presumably callers rely on this; verify against caller.
  # shellcheck disable=SC2086
  pod_name="$(${kcmd} get pod --selector=job-name=download-model -n "${namespace}" \
    -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)"

  if [[ -z "${pod_name}" ]]; then
    announce "🙀 No pod found for the job. Exiting..."
    # Logs are best effort ("0 1 0" = not silent, one attempt, non-fatal).
    llmdbench_execute_cmd "${logs_cmd}" "${dry_run}" "${verbose}" 0 1 0 || true
    # NOTE(review): assumes LLMDBENCH_CONTROL_DRY_RUN=1 means nothing was
    # really created, so a missing pod is expected there and the flow
    # continues as before — TODO confirm.
    if [[ "${dry_run}" != "1" ]]; then
      exit 1
    fi
  fi

  # Test the command directly instead of inspecting $? afterwards: with
  # `set -e` active a failing call would abort the shell before the check.
  if ! llmdbench_execute_cmd "${kcmd} wait --for=condition=Ready pod/${pod_name} --timeout=60s -n ${namespace}" \
    "${dry_run}" "${verbose}"; then
    announce "🙀 Pod did not become Ready"
    llmdbench_execute_cmd "${logs_cmd}" "${dry_run}" "${verbose}" 0 1 0 || true
    exit 1
  fi

  announce "⏳ Waiting up to ${timeout}s for job to complete..."
  if ! llmdbench_execute_cmd "${kcmd} wait --for=condition=complete --timeout=${timeout}s job/download-model -n ${namespace}" \
    "${dry_run}" "${verbose}"; then
    announce "🙀 Download job failed or timed out"
    llmdbench_execute_cmd "${logs_cmd}" "${dry_run}" "${verbose}" 0 1 0 || true
    exit 1
  fi

  announce "✅ Model downloaded"
}
export -f wait_for_download_job
0 commit comments