66import time
77from pathlib import Path
88import subprocess
9- import inspect
9+ import inspect
1010import pykube
11+ import hashlib
1112from pykube .exceptions import PyKubeError
1213
1314import yaml
2122
2223import asyncio
2324
24- import logging
25+ import logging
2526logging .basicConfig (
2627 level = logging .INFO ,
2728 format = '%(asctime)s - %(levelname)s - %(message)s'
3233def announce (message : str , logfile : str = None ):
3334 work_dir = os .getenv ("LLMDBENCH_CONTROL_WORK_DIR" , '.' )
3435 log_dir = os .path .join (work_dir , 'logs' )
35-
36+
3637 # ensure logs dir exists
3738 os .makedirs (log_dir , exist_ok = True )
3839
3940
4041 if not logfile :
4142 cur_step = os .getenv ("CURRENT_STEP_NAME" , 'step' )
4243 logfile = cur_step + '.log'
43-
44+
4445 logpath = os .path .join (log_dir , logfile )
4546
4647 logger .info (message )
@@ -66,10 +67,10 @@ def kube_connect(config_path : str = '~/.kube/config'):
6667 sys .exit (1 )
6768
6869 return api
69-
7070
7171
72-
72+
73+
7374def llmdbench_execute_cmd (
7475 actual_cmd : str ,
7576 dry_run : bool = True ,
@@ -81,11 +82,11 @@ def llmdbench_execute_cmd(
8182) -> int :
8283 work_dir_str = os .getenv ("LLMDBENCH_CONTROL_WORK_DIR" , "." )
8384 log_dir = Path (work_dir_str ) / "setup" / "commands"
84-
85+
8586 log_dir .mkdir (parents = True , exist_ok = True )
8687
8788 command_tstamp = int (time .time () * 1_000_000_000 )
88-
89+
8990 if dry_run :
9091 msg = f"---> would have executed the command \" { actual_cmd } \" "
9192 announce (msg )
@@ -105,11 +106,11 @@ def llmdbench_execute_cmd(
105106 ecode = - 1
106107 last_stdout_log = None
107108 last_stderr_log = None
108-
109+
109110 for counter in range (1 , attempts + 1 ):
110111 command_tstamp = int (time .time () * 1_000_000_000 )
111-
112- # log file paths
112+
113+ # log file paths
113114 stdout_log = log_dir / f"{ command_tstamp } _stdout.log"
114115 stderr_log = log_dir / f"{ command_tstamp } _stderr.log"
115116 last_stdout_log = stdout_log
@@ -128,31 +129,31 @@ def llmdbench_execute_cmd(
128129 # run with verbose
129130 announce (msg )
130131 result = subprocess .run (actual_cmd , shell = True , check = False )
131-
132+
132133 ecode = result .returncode
133134
134135 except Exception as e :
135136 announce (f"An unexpected error occurred while running the command: { e } " )
136137 ecode = - 1
137138
138139 if ecode == 0 :
139- break
140-
140+ break
141+
141142 if counter < attempts :
142143 announce (f"Command failed with exit code { ecode } . Retrying in { delay } seconds... ({ counter } /{ attempts } )" )
143144 time .sleep (delay )
144145
145146 if ecode != 0 :
146147 announce (f"\n ERROR while executing command \" { actual_cmd } \" " )
147-
148+
148149 if last_stdout_log and last_stdout_log .exists ():
149150 try :
150151 announce (last_stdout_log .read_text ())
151152 except IOError :
152153 announce ("(stdout not captured)" )
153154 else :
154155 announce ("(stdout not captured)" )
155-
156+
156157 # print stderr log if it exists
157158 if last_stderr_log and last_stderr_log .exists ():
158159 try :
@@ -206,12 +207,18 @@ def validate_and_create_pvc(
206207 if '/' not in download_model :
207208 announce (f"'{ download_model } ' is not in Hugging Face format <org>/<repo>" )
208209 sys .exit (1 )
209-
210+
210211 announce (f"🔍 Checking storage class '{ pvc_class } '..." )
211212 try :
212213 k8s_config .load_kube_config ()
213214 storage_v1_api = k8s_client .StorageV1Api ()
214-
215+
216+ if pvc_class == "default" :
217+ for x in storage_v1_api .list_storage_class ().items :
218+ if x .metadata .annotations and "storageclass.kubernetes.io/is-default-class" in x .metadata .annotations :
219+ if x .metadata .annotations ["storageclass.kubernetes.io/is-default-class" ] == "true" :
220+ announce (f"ℹ️ Environment variable LLMDBENCH_VLLM_COMMON_PVC_STORAGE_CLASS automatically set to \" { x .metadata .name } \" " )
221+ pvc_class = x .metadata .name
215222 storage_v1_api .read_storage_class (name = pvc_class )
216223 announce (f"StorageClass '{ pvc_class } ' found." )
217224
@@ -270,7 +277,7 @@ def launch_download_job(
270277 dry_run : bool = False ,
271278 verbose : bool = False
272279):
273-
280+
274281 work_dir_str = os .getenv ("LLMDBENCH_CONTROL_WORK_DIR" , "." )
275282 current_step = os .getenv ("LLMDBENCH_CURRENT_STEP" , "step" )
276283 kcmd = os .getenv ("LLMDBENCH_CONTROL_KCMD" , "kubectl" )
@@ -343,7 +350,7 @@ def launch_download_job(
343350 sys .exit (1 )
344351
345352 delete_cmd = f"{ kcmd } delete job { job_name } -n { namespace } --ignore-not-found=true"
346-
353+
347354 announce (f"--> Deleting previous job '{ job_name } ' (if it exists) to prevent conflicts..." )
348355 llmdbench_execute_cmd (
349356 actual_cmd = delete_cmd ,
@@ -362,10 +369,13 @@ def launch_download_job(
362369 )
363370
364371
365- async def wait_for_job (job_name , namespace , timeout = 7200 ):
372+ async def wait_for_job (job_name , namespace , timeout = 7200 , dry_run : bool = False ):
366373 """Wait for the job to complete"""
367374 announce (f"Waiting for job { job_name } to complete..." )
368375
376+ if dry_run :
377+ return True
378+
369379 # use async config loading
370380 await k8s_async_config .load_kube_config ()
371381 api_client = k8s_async_client .ApiClient ()
@@ -391,7 +401,7 @@ async def wait_for_job(job_name, namespace, timeout=7200):
391401 announce (f"Evaluation job { job_name } failed" )
392402 return False
393403
394-
404+
395405 except asyncio .TimeoutError :
396406 announce (f"Timeout waiting for evaluation job { job_name } after { timeout } seconds." )
397407 return False
@@ -401,29 +411,37 @@ async def wait_for_job(job_name, namespace, timeout=7200):
401411 await api_client .close ()
402412
403413def model_attribute (model : str , attribute : str ) -> str :
404-
414+
415+ model , modelid = model .split (':' , 1 ) if ':' in model else (model , model )
416+
405417 # split the model name into provider and rest
406418 provider , model_part = model .split ('/' , 1 ) if '/' in model else ("" , model )
407419
420+ hash_object = hashlib .sha256 ()
421+ hash_object .update (modelid .encode ('utf-8' ))
422+ digest = hash_object .hexdigest ()
423+ modelid_label = f"{ provider [:8 ]} -{ digest [:8 ]} -{ model_part [- 8 :]} "
424+
408425 # create a list of components from the model part
409426 # equiv to: tr '[:upper:]' '[:lower:]' | sed -e 's^qwen^qwen-^g' -e 's^-^\n^g'
410427 model_components_str = model_part .lower ().replace ("qwen" , "qwen-" )
411428 model_components = model_components_str .split ('-' )
412429
413- # get individual attributes using regex
430+ # get individual attributes using regex
414431 type_str = ""
415432 for comp in model_components :
416- if re .search (r"nstruct|hf|chat|speech|vision" , comp , re .IGNORECASE ):
433+ if re .search (r"nstruct|hf|chat|speech|vision|opt " , comp , re .IGNORECASE ):
417434 type_str = comp
418435 break
419436
420437 parameters = ""
421438 for comp in model_components :
422439 if re .search (r"[0-9].*[bm]" , comp , re .IGNORECASE ):
423- parameters = comp .replace ('.' , 'p' )
440+ parameters = re .sub (r'^[a-z]' , '' , comp , count = 1 )
441+ parameters = parameters .replace ('.' , 'p' )
424442 break
425-
426- major_version = ""
443+
444+ major_version = "1 "
427445 for comp in model_components :
428446 # find component that starts with a digit but is not the parameter string
429447 if comp .isdigit () or (comp and comp [0 ].isdigit () and not re .search (r"b|m" , comp , re .IGNORECASE )):
@@ -433,19 +451,21 @@ def model_attribute(model: str, attribute: str) -> str:
433451 break
434452
435453 kind = model_components [0 ] if model_components else ""
436-
454+
437455 as_label = model .lower ().replace ('/' , '-' ).replace ('.' , '-' )
438-
456+
439457 # build label and clean it up
440458 label_parts = [part for part in [kind , major_version , parameters ] if part ]
441459 label = '-' .join (label_parts )
442460 label = re .sub (r'-+' , '-' , label ).strip ('-' ) # replace multiple hyphens and strip from ends
443461
444462 folder = model .lower ().replace ('/' , '_' ).replace ('-' , '_' )
445463
446- # storing all attributes in a dictionary
464+ # storing all attributes in a dictionary
447465 attributes = {
448466 "model" : model ,
467+ "modelid" : modelid ,
468+ "modelid_label" : modelid_label ,
449469 "provider" : provider ,
450470 "type" : type_str ,
451471 "parameters" : parameters ,
@@ -458,7 +478,7 @@ def model_attribute(model: str, attribute: str) -> str:
458478
459479 # return requested attrib
460480 result = attributes .get (attribute , "" )
461-
481+
462482 # The original script lowercases everything except the model attribute
463483 if attribute != "model" :
464484 return result .lower ()
0 commit comments