Skip to content

Commit ec13ebc

Browse files
[Setup] chore: switch step 04 to python implementation (llm-d#247)
[Setup] chore: switch step 04 to python implementation (llm-d#247)

* [Setup] chore: switch step 04 to python implementation. Some additional fixes were required, in particular, auto-detection of "default" storage class. Also cleaned up further the model attribute detection. Added `GitPython` as a dependency in `install_deps.sh`. Finally, added the "wait for creation" step on `standalone`. Added python implementation for step 0.
* Added patch suggested by @kalantar
* Fixed the issue with "dry run" on step 04 in python
* Additional fixes

---------

Signed-off-by: maugustosilva <maugusto.silva@gmail.com>
1 parent 00fbb58 commit ec13ebc

File tree

12 files changed

+116
-78
lines changed

12 files changed

+116
-78
lines changed

.github/workflows/ci-pr-benchmark.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -37,7 +37,7 @@ jobs:
3737

3838
- name: Populate python deps
3939
run: |
40-
echo -e "pandas\ngrip>=4.6.0\nmatplotlib>=3.7.0\nnumpy>=1.22.0\nseaborn>=0.12.0\nkubernetes>=28.0.0" > requirements.txt
40+
echo -e "pandas\ngrip>=4.6.0\nmatplotlib>=3.7.0\nnumpy>=1.22.0\nseaborn>=0.12.0\nkubernetes>=28.0.0\npykube\nkubernetes-asyncio\nGitPython" > requirements.txt
4141
4242
- name: Install python deps
4343
uses: actions/setup-python@v5

scenarios/cicd.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
export LLMDBENCH_CONTROL_WORK_DIR=/tmp/cicd/
22
export LLMDBENCH_DEPLOY_MODEL_LIST="facebook/opt-125m"
33
export LLMDBENCH_VLLM_COMMON_NAMESPACE=llmdbenchcicd
4-
export LLMDBENCH_VLLM_COMMON_AFFINITY=nvidia.com/gpu.product:NVIDIA-A100-SXM4-80GB
4+
export LLMDBENCH_VLLM_COMMON_AFFINITY=nvidia.com/gpu.product:NVIDIA-L40S
55
export LLMDBENCH_VLLM_COMMON_PVC_STORAGE_CLASS=ocs-storagecluster-cephfs
66
export LLMDBENCH_VLLM_MODELSERVICE_RELEASE=llmdbenchcicd
77
export LLMDBENCH_VLLM_COMMON_REPLICAS=1

setup/env.sh

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -154,11 +154,11 @@ export LLMDBENCH_CONTROL_STANDUP_ALL_STEPS=${LLMDBENCH_CONTROL_STANDUP_ALL_STEPS
154154
export LLMDBENCH_CONTROL_WAIT_TIMEOUT=${LLMDBENCH_CONTROL_WAIT_TIMEOUT:-900}
155155
export LLMDBENCH_CONTROL_CHECK_CLUSTER_AUTHORIZATIONS=${LLMDBENCH_CONTROL_CHECK_CLUSTER_AUTHORIZATIONS:-0}
156156
export LLMDBENCH_CONTROL_RESOURCE_LIST=${LLMDBENCH_CONTROL_RESOURCE_LIST:-deployment,httproute,service,gateway,gatewayparameters,inferencepool,inferencemodel,cm,ing,pod,job}
157-
export LLMDBENCH_CONTROL_STEP_00_IMPLEMENTATION=${LLMDBENCH_CONTROL_STEP_00_IMPLEMENTATION:-sh}
158-
export LLMDBENCH_CONTROL_STEP_01_IMPLEMENTATION=${LLMDBENCH_CONTROL_STEP_01_IMPLEMENTATION:-sh}
157+
export LLMDBENCH_CONTROL_STEP_00_IMPLEMENTATION=${LLMDBENCH_CONTROL_STEP_00_IMPLEMENTATION:-py}
158+
export LLMDBENCH_CONTROL_STEP_01_IMPLEMENTATION=${LLMDBENCH_CONTROL_STEP_01_IMPLEMENTATION:-py}
159159
export LLMDBENCH_CONTROL_STEP_02_IMPLEMENTATION=${LLMDBENCH_CONTROL_STEP_02_IMPLEMENTATION:-sh}
160160
export LLMDBENCH_CONTROL_STEP_03_IMPLEMENTATION=${LLMDBENCH_CONTROL_STEP_03_IMPLEMENTATION:-sh}
161-
export LLMDBENCH_CONTROL_STEP_04_IMPLEMENTATION=${LLMDBENCH_CONTROL_STEP_04_IMPLEMENTATION:-sh}
161+
export LLMDBENCH_CONTROL_STEP_04_IMPLEMENTATION=${LLMDBENCH_CONTROL_STEP_04_IMPLEMENTATION:-py}
162162
export LLMDBENCH_CONTROL_STEP_05_IMPLEMENTATION=${LLMDBENCH_CONTROL_STEP_05_IMPLEMENTATION:-sh}
163163
export LLMDBENCH_CONTROL_STEP_06_IMPLEMENTATION=${LLMDBENCH_CONTROL_STEP_06_IMPLEMENTATION:-sh}
164164
export LLMDBENCH_CONTROL_STEP_07_IMPLEMENTATION=${LLMDBENCH_CONTROL_STEP_07_IMPLEMENTATION:-sh}

setup/functions.py

Lines changed: 52 additions & 32 deletions
Original file line number | Diff line number | Diff line change
@@ -6,8 +6,9 @@
66
import time
77
from pathlib import Path
88
import subprocess
9-
import inspect
9+
import inspect
1010
import pykube
11+
import hashlib
1112
from pykube.exceptions import PyKubeError
1213

1314
import yaml
@@ -21,7 +22,7 @@
2122

2223
import asyncio
2324

24-
import logging
25+
import logging
2526
logging.basicConfig(
2627
level=logging.INFO,
2728
format='%(asctime)s - %(levelname)s - %(message)s'
@@ -32,15 +33,15 @@
3233
def announce(message: str, logfile : str = None):
3334
work_dir = os.getenv("LLMDBENCH_CONTROL_WORK_DIR", '.')
3435
log_dir = os.path.join(work_dir, 'logs')
35-
36+
3637
# ensure logs dir exists
3738
os.makedirs(log_dir, exist_ok=True)
3839

3940

4041
if not logfile:
4142
cur_step = os.getenv("CURRENT_STEP_NAME", 'step')
4243
logfile = cur_step + '.log'
43-
44+
4445
logpath = os.path.join(log_dir, logfile)
4546

4647
logger.info(message)
@@ -66,10 +67,10 @@ def kube_connect(config_path : str = '~/.kube/config'):
6667
sys.exit(1)
6768

6869
return api
69-
7070

7171

72-
72+
73+
7374
def llmdbench_execute_cmd(
7475
actual_cmd: str,
7576
dry_run: bool = True,
@@ -81,11 +82,11 @@ def llmdbench_execute_cmd(
8182
) -> int:
8283
work_dir_str = os.getenv("LLMDBENCH_CONTROL_WORK_DIR", ".")
8384
log_dir = Path(work_dir_str) / "setup" / "commands"
84-
85+
8586
log_dir.mkdir(parents=True, exist_ok=True)
8687

8788
command_tstamp = int(time.time() * 1_000_000_000)
88-
89+
8990
if dry_run:
9091
msg = f"---> would have executed the command \"{actual_cmd}\""
9192
announce(msg)
@@ -105,11 +106,11 @@ def llmdbench_execute_cmd(
105106
ecode = -1
106107
last_stdout_log = None
107108
last_stderr_log = None
108-
109+
109110
for counter in range(1, attempts + 1):
110111
command_tstamp = int(time.time() * 1_000_000_000)
111-
112-
# log file paths
112+
113+
# log file paths
113114
stdout_log = log_dir / f"{command_tstamp}_stdout.log"
114115
stderr_log = log_dir / f"{command_tstamp}_stderr.log"
115116
last_stdout_log = stdout_log
@@ -128,31 +129,31 @@ def llmdbench_execute_cmd(
128129
# run with verbose
129130
announce(msg)
130131
result = subprocess.run(actual_cmd, shell=True, check=False)
131-
132+
132133
ecode = result.returncode
133134

134135
except Exception as e:
135136
announce(f"An unexpected error occurred while running the command: {e}")
136137
ecode = -1
137138

138139
if ecode == 0:
139-
break
140-
140+
break
141+
141142
if counter < attempts:
142143
announce(f"Command failed with exit code {ecode}. Retrying in {delay} seconds... ({counter}/{attempts})")
143144
time.sleep(delay)
144145

145146
if ecode != 0:
146147
announce(f"\nERROR while executing command \"{actual_cmd}\"")
147-
148+
148149
if last_stdout_log and last_stdout_log.exists():
149150
try:
150151
announce(last_stdout_log.read_text())
151152
except IOError:
152153
announce("(stdout not captured)")
153154
else:
154155
announce("(stdout not captured)")
155-
156+
156157
# print stderr log if it exists
157158
if last_stderr_log and last_stderr_log.exists():
158159
try:
@@ -206,12 +207,18 @@ def validate_and_create_pvc(
206207
if '/' not in download_model:
207208
announce(f"'{download_model}' is not in Hugging Face format <org>/<repo>")
208209
sys.exit(1)
209-
210+
210211
announce(f"🔍 Checking storage class '{pvc_class}'...")
211212
try:
212213
k8s_config.load_kube_config()
213214
storage_v1_api = k8s_client.StorageV1Api()
214-
215+
216+
if pvc_class == "default" :
217+
for x in storage_v1_api.list_storage_class().items :
218+
if x.metadata.annotations and "storageclass.kubernetes.io/is-default-class" in x.metadata.annotations :
219+
if x.metadata.annotations["storageclass.kubernetes.io/is-default-class"] == "true" :
220+
announce(f"ℹ️ Environment variable LLMDBENCH_VLLM_COMMON_PVC_STORAGE_CLASS automatically set to \"{x.metadata.name}\"")
221+
pvc_class = x.metadata.name
215222
storage_v1_api.read_storage_class(name=pvc_class)
216223
announce(f"StorageClass '{pvc_class}' found.")
217224

@@ -270,7 +277,7 @@ def launch_download_job(
270277
dry_run: bool = False,
271278
verbose: bool = False
272279
):
273-
280+
274281
work_dir_str = os.getenv("LLMDBENCH_CONTROL_WORK_DIR", ".")
275282
current_step = os.getenv("LLMDBENCH_CURRENT_STEP", "step")
276283
kcmd = os.getenv("LLMDBENCH_CONTROL_KCMD", "kubectl")
@@ -343,7 +350,7 @@ def launch_download_job(
343350
sys.exit(1)
344351

345352
delete_cmd = f"{kcmd} delete job {job_name} -n {namespace} --ignore-not-found=true"
346-
353+
347354
announce(f"--> Deleting previous job '{job_name}' (if it exists) to prevent conflicts...")
348355
llmdbench_execute_cmd(
349356
actual_cmd=delete_cmd,
@@ -362,10 +369,13 @@ def launch_download_job(
362369
)
363370

364371

365-
async def wait_for_job(job_name, namespace, timeout=7200):
372+
async def wait_for_job(job_name, namespace, timeout=7200, dry_run: bool = False):
366373
"""Wait for the job to complete"""
367374
announce(f"Waiting for job {job_name} to complete...")
368375

376+
if dry_run :
377+
return True
378+
369379
# use async config loading
370380
await k8s_async_config.load_kube_config()
371381
api_client = k8s_async_client.ApiClient()
@@ -391,7 +401,7 @@ async def wait_for_job(job_name, namespace, timeout=7200):
391401
announce(f"Evaluation job {job_name} failed")
392402
return False
393403

394-
404+
395405
except asyncio.TimeoutError:
396406
announce(f"Timeout waiting for evaluation job {job_name} after {timeout} seconds.")
397407
return False
@@ -401,29 +411,37 @@ async def wait_for_job(job_name, namespace, timeout=7200):
401411
await api_client.close()
402412

403413
def model_attribute(model: str, attribute: str) -> str:
404-
414+
415+
model, modelid = model.split(':', 1) if ':' in model else (model, model)
416+
405417
# split the model name into provider and rest
406418
provider, model_part = model.split('/', 1) if '/' in model else ("", model)
407419

420+
hash_object = hashlib.sha256()
421+
hash_object.update(modelid.encode('utf-8'))
422+
digest = hash_object.hexdigest()
423+
modelid_label = f"{provider[:8]}-{digest[:8]}-{model_part[-8:]}"
424+
408425
# create a list of components from the model part
409426
# equiv to: tr '[:upper:]' '[:lower:]' | sed -e 's^qwen^qwen-^g' -e 's^-^\n^g'
410427
model_components_str = model_part.lower().replace("qwen", "qwen-")
411428
model_components = model_components_str.split('-')
412429

413-
# get individual attributes using regex
430+
# get individual attributes using regex
414431
type_str = ""
415432
for comp in model_components:
416-
if re.search(r"nstruct|hf|chat|speech|vision", comp, re.IGNORECASE):
433+
if re.search(r"nstruct|hf|chat|speech|vision|opt", comp, re.IGNORECASE):
417434
type_str = comp
418435
break
419436

420437
parameters = ""
421438
for comp in model_components:
422439
if re.search(r"[0-9].*[bm]", comp, re.IGNORECASE):
423-
parameters = comp.replace('.', 'p')
440+
parameters = re.sub(r'^[a-z]', '', comp, count=1)
441+
parameters = parameters.replace('.', 'p')
424442
break
425-
426-
major_version = ""
443+
444+
major_version = "1"
427445
for comp in model_components:
428446
# find component that starts with a digit but is not the parameter string
429447
if comp.isdigit() or (comp and comp[0].isdigit() and not re.search(r"b|m", comp, re.IGNORECASE)):
@@ -433,19 +451,21 @@ def model_attribute(model: str, attribute: str) -> str:
433451
break
434452

435453
kind = model_components[0] if model_components else ""
436-
454+
437455
as_label = model.lower().replace('/', '-').replace('.', '-')
438-
456+
439457
# build label and clean it up
440458
label_parts = [part for part in [kind, major_version, parameters] if part]
441459
label = '-'.join(label_parts)
442460
label = re.sub(r'-+', '-', label).strip('-') # replace multiple hyphens and strip from ends
443461

444462
folder = model.lower().replace('/', '_').replace('-', '_')
445463

446-
# storing all attributes in a dictionary
464+
# storing all attributes in a dictionary
447465
attributes = {
448466
"model": model,
467+
"modelid": modelid,
468+
"modelid_label": modelid_label,
449469
"provider": provider,
450470
"type": type_str,
451471
"parameters": parameters,
@@ -458,7 +478,7 @@ def model_attribute(model: str, attribute: str) -> str:
458478

459479
# return requested attrib
460480
result = attributes.get(attribute, "")
461-
481+
462482
# The original script lowercases everything except the model attribute
463483
if attribute != "model":
464484
return result.lower()

setup/functions.sh

Lines changed: 11 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -28,19 +28,14 @@ function model_attribute {
2828
local modelid=$(echo $model | cut -d: -f2)
2929
local modelid_label="$(echo -n $modelid | cut -d '/' -f 1 | cut -c1-8)-$(echo -n $modelid | sha256sum | awk '{print $1}' | cut -c1-8)-$(echo -n $modelid | cut -d '/' -f 2 | rev | cut -c1-8 | rev)"
3030

31-
# TODO handle this in a more appropriate way
32-
# Hack to get all attributes for facebook/opt-125m
33-
case "$model" in
34-
"facebook/opt-125m") local model_hack=facebook/opt-1.0-125m-hf ;;
35-
*)
36-
model_hack=$model ;;
37-
esac
38-
39-
local modelcomponents=$(echo $model_hack | cut -d '/' -f 2 | tr '[:upper:]' '[:lower:]' | $LLMDBENCH_CONTROL_SCMD -e 's^qwen^qwen-^g' -e 's^-^\n^g')
31+
local modelcomponents=$(echo $model | cut -d '/' -f 2 | tr '[:upper:]' '[:lower:]' | $LLMDBENCH_CONTROL_SCMD -e 's^qwen^qwen-^g' -e 's^-^\n^g')
4032
local provider=$(echo $model | cut -d '/' -f 1)
41-
local type=$(echo "${modelcomponents}" | grep -Ei "nstruct|hf|chat|speech|vision")
33+
local type=$(echo "${modelcomponents}" | grep -Ei "nstruct|hf|chat|speech|vision|opt")
4234
local parameters=$(echo "${modelcomponents}" | grep -Ei "[0-9].*b|[0-9].*m" | $LLMDBENCH_CONTROL_SCMD -e 's^a^^' -e 's^\.^p^')
4335
local majorversion=$(echo "${modelcomponents}" | grep -Ei "^[0-9]" | grep -Evi "b|E" | $LLMDBENCH_CONTROL_SCMD -e "s/$parameters//g" | cut -d '.' -f 1)
36+
if [[ -z $majorversion ]]; then
37+
local majorversion=1
38+
fi
4439
local kind=$(echo "${modelcomponents}" | head -n 1 | cut -d '/' -f 1)
4540
local as_label=$(echo $model | tr '[:upper:]' '[:lower:]' | $LLMDBENCH_CONTROL_SCMD -e "s^/^-^g")
4641
local label=$(echo ${kind}-${majorversion}-${parameters} | $LLMDBENCH_CONTROL_SCMD -e 's^-$^^g' -e 's^--^^g')
@@ -729,6 +724,10 @@ function run_step {
729724
source $script_path
730725
elif [[ ${!script_implementaton} == py ]]; then
731726
python3 $script_path
727+
local ec=$?
728+
if [[ $ec -ne 0 ]]; then
729+
exit $ec
730+
fi
732731
else
733732
announce "ERROR: Unsupported script type for \"$script_path\""
734733
fi
@@ -766,6 +765,8 @@ spec:
766765
- name: harness
767766
image: $(get_image ${LLMDBENCH_IMAGE_REGISTRY} ${LLMDBENCH_IMAGE_REPO} ${LLMDBENCH_IMAGE_NAME} ${LLMDBENCH_IMAGE_TAG})
768767
imagePullPolicy: Always
768+
securityContext:
769+
runAsUser: 0
769770
command: ["sh", "-c"]
770771
args:
771772
- "${LLMDBENCH_HARNESS_EXECUTABLE}"

setup/install_deps.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -152,4 +152,4 @@ for dep in $python_deps; do
152152
done
153153
echo "---------------------------"
154154

155-
popd &>/dev/null
155+
popd &>/dev/null

setup/run.sh

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -224,18 +224,19 @@ for method in ${LLMDBENCH_DEPLOY_METHODS//,/ }; do
224224
if [[ $LLMDBENCH_CONTROL_ENVIRONMENT_TYPE_STANDALONE_ACTIVE -eq 0 && $LLMDBENCH_CONTROL_ENVIRONMENT_TYPE_MODELSERVICE_ACTIVE -eq 0 ]]; then
225225
announce "🔍 Deployment method - $LLMDBENCH_DEPLOY_METHODS - is neither \"standalone\" nor \"modelservice\". Trying to find a matching endpoint name..."
226226
export LLMDBENCH_HARNESS_STACK_TYPE=vllm-prod
227-
export LLMDBENCH_HARNESS_STACK_ENDPOINT_NAME=$(${LLMDBENCH_CONTROL_KCMD} --namespace "$LLMDBENCH_VLLM_COMMON_NAMESPACE" get service --no-headers | awk '{print $1}' | grep -x ${LLMDBENCH_DEPLOY_METHODS} || true)
227+
export LLMDBENCH_HARNESS_STACK_ENDPOINT_NAME=$(${LLMDBENCH_CONTROL_KCMD} --namespace "$LLMDBENCH_VLLM_COMMON_NAMESPACE" get service --no-headers | awk '{print $1}' | grep ${LLMDBENCH_DEPLOY_METHODS} || true)
228228
if [[ ! -z $LLMDBENCH_HARNESS_STACK_ENDPOINT_NAME ]]; then
229229
export LLMDBENCH_HARNESS_STACK_ENDPOINT_PORT=$(${LLMDBENCH_CONTROL_KCMD} --namespace "$LLMDBENCH_VLLM_COMMON_NAMESPACE" get service/$LLMDBENCH_HARNESS_STACK_ENDPOINT_NAME --no-headers -o json | jq -r '.spec.ports[0].port')
230230
else
231-
export LLMDBENCH_HARNESS_STACK_ENDPOINT_NAME=$(${LLMDBENCH_CONTROL_KCMD} --namespace "$LLMDBENCH_VLLM_COMMON_NAMESPACE" get pod --no-headers | awk '{print $1}' | grep -x ${LLMDBENCH_DEPLOY_METHODS} | head -n 1 || true)
231+
export LLMDBENCH_HARNESS_STACK_ENDPOINT_NAME=$(${LLMDBENCH_CONTROL_KCMD} --namespace "$LLMDBENCH_VLLM_COMMON_NAMESPACE" get pod --no-headers | awk '{print $1}' | grep ${LLMDBENCH_DEPLOY_METHODS} | head -n 1 || true)
232232
export LLMDBENCH_VLLM_FQDN=
233233
if [[ ! -z $LLMDBENCH_HARNESS_STACK_ENDPOINT_NAME ]]; then
234234
announce "ℹ️ Stack Endpoint name detected is \"$LLMDBENCH_HARNESS_STACK_ENDPOINT_NAME\""
235235
export LLMDBENCH_HARNESS_STACK_ENDPOINT_PORT=$(${LLMDBENCH_CONTROL_KCMD} --namespace "$LLMDBENCH_VLLM_COMMON_NAMESPACE" get pod/$LLMDBENCH_HARNESS_STACK_ENDPOINT_NAME --no-headers -o json | jq -r ".spec.containers[0].ports[0].containerPort")
236236
export LLMDBENCH_HARNESS_STACK_ENDPOINT_NAME=$(${LLMDBENCH_CONTROL_KCMD} --namespace "$LLMDBENCH_VLLM_COMMON_NAMESPACE" get pod/$LLMDBENCH_HARNESS_STACK_ENDPOINT_NAME --no-headers -o json | jq -r ".status.podIP")
237237
fi
238238
fi
239+
export LLMDBENCH_DEPLOY_CURRENT_MODEL="auto"
239240
fi
240241

241242
if [[ $LLMDBENCH_CONTROL_DRY_RUN -eq 1 ]]; then
@@ -260,8 +261,11 @@ for method in ${LLMDBENCH_DEPLOY_METHODS//,/ }; do
260261
announce "ℹ️ Stack model detected is \"mock\""
261262
else
262263
received_model_name=$(get_model_name_from_pod $LLMDBENCH_VLLM_COMMON_NAMESPACE $(get_image ${LLMDBENCH_IMAGE_REGISTRY} ${LLMDBENCH_IMAGE_REPO} ${LLMDBENCH_IMAGE_NAME} ${LLMDBENCH_IMAGE_TAG}) ${LLMDBENCH_HARNESS_STACK_ENDPOINT_URL} 80)
263-
if [[ ${received_model_name} == ${LLMDBENCH_DEPLOY_CURRENT_MODEL} ]]; then
264+
if [[ $LLMDBENCH_DEPLOY_CURRENT_MODEL == "auto" ]]; then
265+
export LLMDBENCH_DEPLOY_CURRENT_MODEL=$received_model_name
264266
announce "ℹ️ Stack model detected is \"$received_model_name\""
267+
elif [[ ${received_model_name} == ${LLMDBENCH_DEPLOY_CURRENT_MODEL} ]]; then
268+
announce "ℹ️ Stack model detected is \"$received_model_name\", matches requested \"$LLMDBENCH_DEPLOY_CURRENT_MODEL\""
265269
else
266270
announce "❌ Stack model detected is \"$received_model_name\" (instead of $LLMDBENCH_DEPLOY_CURRENT_MODEL)!"
267271
exit 1

0 commit comments

Comments (0)