2121
2222def wrap_script_with_logging (
2323 script : str ,
24- dashboard_info : Dict [str , str ] = None ,
25- recipe_subdir : str = "esm2_native_te_mfsdp" ,
2624 all_config_json : str = "{}" ,
2725) -> str :
28- if isinstance (dashboard_info , (HydraDictConfig , ListConfig )):
29- dashboard_info = OmegaConf .to_container (dashboard_info , resolve = True )
30- if dashboard_info is None :
31- dashboard_info = {}
32-
33- # serialize after conversion
34- dashboard_json = json .dumps (dashboard_info , separators = ("," , ":" ))
35-
3626 return f"""set -euo pipefail
3727
3828# Get job name
@@ -46,8 +36,20 @@ def wrap_script_with_logging(
4636RC=$?
4737set -e
4838
39+ echo "pwd"
40+ pwd
41+
42+ echo "ls"
43+ ls
44+
45+ echo "commit in bionemo-framework"
46+ (cd bionemo-framework && git log -1 || true)
47+ # Always grab the exact commit currently checked out in the framework repo
48+ COMMIT_SHA="$(cd bionemo-framework && git rev-parse HEAD 2>/dev/null || true)"
49+ echo "Resolved framework commit: ${{COMMIT_SHA:-<none>}}"
50+
4951# Authenticate to Lepton
50- pip install -q leptonai >/dev/null 2>&1 || pip install leptonai
52+ pip install -q leptonai >/dev/null 2>&1 || pip install -q leptonai || true
5153lep login -c "$LEP_LOGIN_CREDENTIALS" || true
5254
5355# Get lepton job details
@@ -104,25 +106,104 @@ def wrap_script_with_logging(
104106 }}
105107 ' 2>/dev/null
106108)"
107-
108109JOB_INFO_JSON="$(printf '%s' "$JOB_INFO" | jq -c . 2>/dev/null || echo '{{}}')"
110+
111+ # Ingest provided config JSON
109112ALL_CONFIG_JSON='{ all_config_json } '
110- DASHBOARD_INFO_JSON='{ dashboard_json } '
113+ if echo "$ALL_CONFIG_JSON" | jq -e . >/dev/null 2>&1; then
114+ ALL_CONFIG_JSON_UPDATED="$(printf '%s' "$ALL_CONFIG_JSON" | jq -c '.')"
115+ else
116+ echo "Warning: ALL_CONFIG_JSON is not valid JSON. Using empty object."
117+ ALL_CONFIG_JSON_UPDATED='{{}}'
118+ fi
119+
120+ # Inject/overwrite the resolved framework commit (only if we actually got one)
121+ if [ -n "${{COMMIT_SHA:-}}" ]; then
122+ ALL_CONFIG_JSON_UPDATED="$(printf '%s' "$ALL_CONFIG_JSON_UPDATED" | jq -c --arg commit "$COMMIT_SHA" '.commit_sha = $commit')"
123+ fi
124+
125+ # Extract values from config (with sensible defaults)
126+ RECIPE_SUBDIR="$(printf '%s' "$ALL_CONFIG_JSON_UPDATED" | jq -r '.recipe_subdir // "esm2_native_te_mfsdp"')"
127+
128+ # ---------------------------
129+ # Collect NVIDIA SMI as JSON (no cuda_version in --query-gpu)
130+ # ---------------------------
131+ set +e
132+ NVIDIA_SMI_BIN="$(command -v nvidia-smi || echo /usr/bin/nvidia-smi)"
133+ NVIDIA_SMI_JSON="[]"
134+ for GPU_FIELDS in \
135+ 'index,uuid,name,driver_version,pci.bus_id,pstate,temperature.gpu,power.draw,power.limit,clocks.sm,clocks.mem,clocks.gr,memory.total,memory.free,memory.used,utilization.gpu,utilization.memory,compute_mode' \
136+ 'index,uuid,name,driver_version,pci.bus_id,pstate,temperature.gpu,power.draw,power.limit,clocks.current.sm,clocks.current.memory,clocks.current.graphics,memory.total,memory.free,memory.used,utilization.gpu,utilization.memory,compute_mode' \
137+ 'index,uuid,name,driver_version,pci.bus_id,memory.total,memory.free,memory.used,utilization.gpu'; do
138+ RAW_SMI="$("$NVIDIA_SMI_BIN" --query-gpu="$GPU_FIELDS" --format=csv,noheader,nounits 2>/dev/null || true)"
139+ if [ -n "$RAW_SMI" ]; then
140+ NVIDIA_SMI_JSON="$(
141+ GPU_FIELDS="$GPU_FIELDS" python3 - <<'PY' 2>/dev/null || true
142+ import os, sys, csv, json
143+ keys = [s.strip() for s in os.environ.get("GPU_FIELDS","").split(",") if s.strip()]
144+ rows = []
145+ for r in csv.reader(sys.stdin):
146+ if not r:
147+ continue
148+ vals = [x.strip() for x in r]
149+ if len(vals) < len(keys):
150+ vals += [None]*(len(keys)-len(vals))
151+ rows.append(dict(zip(keys, vals[:len(keys)])))
152+ print(json.dumps(rows))
153+ PY
154+ <<< "$RAW_SMI"
155+ )"
156+ if [ -n "$NVIDIA_SMI_JSON" ] && [ "$NVIDIA_SMI_JSON" != "[]" ]; then
157+ break
158+ fi
159+ fi
160+ done
161+
162+ RAW_APPS="$("$NVIDIA_SMI_BIN" --query-compute-apps=gpu_uuid,pid,process_name,used_memory --format=csv,noheader,nounits 2>/dev/null || true)"
163+ if [ -n "$RAW_APPS" ]; then
164+ NVIDIA_COMPUTE_APPS_JSON="$(
165+ python3 - <<'PY' 2>/dev/null || true
166+ import sys, csv, json
167+ rows=[]
168+ for r in csv.reader(sys.stdin):
169+ if not r:
170+ continue
171+ gpu_uuid = r[0].strip() if len(r)>0 else None
172+ # pid as int where possible
173+ pid = None
174+ if len(r)>1:
175+ try: pid = int(r[1].strip())
176+ except: pid = None
177+ process = r[2].strip() if len(r)>2 else None
178+ used_mem = r[3].strip() if len(r)>3 else None
179+ rows.append({{"gpu_uuid": gpu_uuid, "pid": pid, "process_name": process, "used_memory": used_mem}})
180+ print(json.dumps(rows))
181+ PY
182+ <<< "$RAW_APPS"
183+ )"
184+ else
185+ NVIDIA_COMPUTE_APPS_JSON="[]"
186+ fi
187+
188+ # Driver/CUDA at top level from -q (stable across versions)
189+ DRIVER_VERSION="$("$NVIDIA_SMI_BIN" -q 2>/dev/null | awk -F': ' '/Driver Version/ {{print $2; exit}}')"
190+ CUDA_VERSION="$("$NVIDIA_SMI_BIN" -q 2>/dev/null | awk -F': ' '/CUDA Version/ {{print $2; exit}}')"
191+ NVIDIA_DRIVER_INFO="$(jq -n --arg dv "$DRIVER_VERSION" --arg cv "$CUDA_VERSION" 'def nn($x): if ($x|length)>0 then $x else null end; {{driver_version: nn($dv), cuda_version: nn($cv)}}' 2>/dev/null || echo '{{}}')"
192+ set -e
111193
112194# Look for W&B files
113- WANDB_DIR="/workspace/bionemo-framework/recipes/{ recipe_subdir } /wandb"
195+ WANDB_DIR="/workspace/bionemo-framework/recipes/$RECIPE_SUBDIR /wandb"
114196WANDB_FOUND=0
115197WANDB_SUMMARY=""
116198WANDB_METADATA=""
117199
118200if [ -d "$WANDB_DIR" ]; then
119- # Use latest-run symlink or find most recent run
120201 if [ -L "$WANDB_DIR/latest-run" ]; then
121202 LATEST_RUN="$WANDB_DIR/latest-run"
122203 else
123204 LATEST_RUN=$(ls -td "$WANDB_DIR"/run-* "$WANDB_DIR"/offline-run-* 2>/dev/null | head -n1)
124205 fi
125-
206+
126207 if [ -n "$LATEST_RUN" ] && [ -d "$LATEST_RUN/files" ]; then
127208 if [ -f "$LATEST_RUN/files/wandb-summary.json" ]; then
128209 WANDB_SUMMARY="$LATEST_RUN/files/wandb-summary.json"
@@ -134,24 +215,28 @@ def wrap_script_with_logging(
134215
135216if [ "$WANDB_FOUND" = "1" ] && [ -n "$WANDB_SUMMARY" ]; then
136217 echo "Uploading W&B metrics to Kratos..."
137-
218+
138219 METADATA_JSON=$(cat "$WANDB_METADATA" 2>/dev/null || echo '{{}}')
139220 SUMMARY_JSON=$(cat "$WANDB_SUMMARY" 2>/dev/null || echo '{{}}')
140221
141222 COMBINED_JSON=$(jq -n \
142223 --arg m "$METADATA_JSON" \
143224 --arg s "$SUMMARY_JSON" \
144225 --argjson job_info "$JOB_INFO_JSON" \
145- --argjson dashboard_info "$DASHBOARD_INFO_JSON" \
146- --argjson all_config "$ALL_CONFIG_JSON" \
226+ --argjson all_config "$ALL_CONFIG_JSON_UPDATED" \
227+ --argjson nvidia_smi "$NVIDIA_SMI_JSON" \
228+ --argjson nvidia_compute_apps "$NVIDIA_COMPUTE_APPS_JSON" \
229+ --argjson nvidia_driver "$NVIDIA_DRIVER_INFO" \
147230 '
148231 . + {{
149232 job_name: env.LEPTON_JOB_NAME,
150233 metadata: ($m | fromjson? // {{}}),
151234 summary: ($s | fromjson? // {{}}),
152235 job_info: $job_info,
153- dashboard_info: $dashboard_info,
154- config: $all_config
236+ config: $all_config,
237+ nvidia_smi: $nvidia_smi,
238+ nvidia_compute_apps: $nvidia_compute_apps,
239+ nvidia_driver: $nvidia_driver
155240 }}
156241 ')
157242
@@ -239,8 +324,6 @@ def launch_single_job(client, cfg: DictConfig):
239324 "-c" ,
240325 wrap_script_with_logging (
241326 cfg .script ,
242- dashboard_info = cfg .dashboard_info if hasattr (cfg , 'dashboard_info' ) else None ,
243- recipe_subdir = cfg .recipe_subdir if hasattr (cfg , 'recipe_subdir' ) else "esm2_native_te_mfsdp" ,
244327 all_config_json = full_cfg_json ,
245328 ),
246329 ]
@@ -325,10 +408,14 @@ def main(cfg: DictConfig):
325408 # Create new OmegaConf object from merged dict
326409 product_cfg = OmegaConf .create (merged_dict )
327410
328- # Generate job name as recipe_subdir-model_name, replacing underscores and slashes with hyphens
329- recipe_subdir = product_cfg .recipe_subdir .replace ('_' , '-' ).replace ('/' , '-' )
330- model_name = product_dict ['model_name' ].replace ('_' , '-' ).replace ('/' , '-' )
331- product_cfg .job_name = f"{ model_name } " .lower ()
411+ # Generate job name using recipe_subdir and config value
412+ # Extract the base recipe name from recipe_subdir (e.g., "geneformer" from "geneformer_native_te_mfsdp_fp8")
413+ recipe_parts = product_cfg .recipe_subdir .split ('_' )
414+ base_recipe_name = recipe_parts [0 ] if recipe_parts else product_cfg .recipe_subdir
415+
416+ # Create job name as base_recipe_name-config (e.g., "geneformer-10m")
417+ config_name = product_dict ['config' ].replace ('_' , '-' ).replace ('/' , '-' )
418+ product_cfg .job_name = f"{ base_recipe_name } -{ config_name } " .lower ()
332419
333420 print (f"\n [{ i } /{ len (cfg .products )} ] Launching: { product_cfg .job_name } " )
334421
0 commit comments