Skip to content

Commit a04f05d

Browse files
committed
update configs
Signed-off-by: Jared Wilber <jwilber@nvidia.com>
1 parent de9b3da commit a04f05d

File tree

7 files changed

+182
-86
lines changed

7 files changed

+182
-86
lines changed

.github/workflows/convergence-tests.yml

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -41,17 +41,12 @@ jobs:
4141
python-version: "3.11"
4242
cache: "pip"
4343
cache-dependency-path: |
44-
requirements.ci.txt
45-
ci/lepton/model_convergence/scripts/requirements.txt
44+
ci/lepton/model_convergence/requirements.txt
4645
47-
- name: Install deps
46+
- name: Install dependencies
4847
run: |
4948
python -m pip install --upgrade pip
50-
pip install "hydra-core==1.3.2" "omegaconf==2.3.0" "leptonai" typer pandas
51-
python - <<'PY'
52-
import hydra, omegaconf, leptonai
53-
print("OK:", hydra.__version__, omegaconf.__version__, leptonai.__version__)
54-
PY
49+
pip install -r ci/lepton/model_convergence/requirements.txt
5550
5651
- name: Submit Lepton Jobs
5752
env:

ci/lepton/model_convergence/configs/base.yaml

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,6 @@ container:
88
environment_variables:
99
- name: WANDB_API_KEY
1010
value_from: JWILBER_WANDB_API_KEY
11-
- name: PYTHONPATH
12-
value: /workspace/bionemo2/sub-packages/bionemo-evo2/src
1311
- name: KRATOS_SSA_URL
1412
value_from: KRATOS_SSA_URL
1513
- name: KRATOS_SSA_CLIENT_ID
@@ -23,4 +21,32 @@ environment_variables:
2321
# mount_path: /BioNeMo
2422
# from_: node-nfs:lepton-shared-fs
2523

24+
wandb_init_args:
25+
group: "recipes_model_convergence"
26+
mode: "online"
27+
28+
branch: main
2629
commit_sha: ""
30+
31+
# Shared checkout script used by all recipes
32+
checkout_script: |
33+
set -euo pipefail
34+
git clone https://github.com/NVIDIA/bionemo-framework.git
35+
cd bionemo-framework
36+
if [ -n "${commit_sha}" ]; then
37+
echo "Checking out commit: ${commit_sha}"
38+
git checkout "${commit_sha}"
39+
elif [ "${branch}" != "main" ]; then
40+
echo "Checking out branch: ${branch}"
41+
git checkout "${branch}"
42+
fi
43+
cd ..
44+
cd bionemo-framework/recipes/${recipe_subdir}
45+
pip install -r requirements.txt
46+
47+
# Child configs should override this with their run-specific commands
48+
train_script: ""
49+
50+
script: |
51+
${checkout_script}
52+
${train_script}

ci/lepton/model_convergence/configs/recipes/amplify_accelerate_te_fp8.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ job_name: amplify-recipes
77
resource_shape: gpu.2xh200
88

99
dashboard_info:
10-
model: geneformer
10+
model: amplify
1111
variant: recipes
1212
repo: recipes
1313

@@ -19,7 +19,7 @@ train_cmnd: train
1919
wandb_init_args:
2020
project: "amplify_accelerate_te_fp8"
2121
group: "recipes_model_convergence"
22-
mode: "online" # need online to collect logs; if offline, must manually sync after run
22+
mode: "online"
2323

2424
stop_after_n_steps: 10
2525

@@ -30,8 +30,8 @@ trainer:
3030
products:
3131
- model_name: amplify_120M_sanity
3232
config: L0_sanity
33-
- model_name: L1-350M-partial-conv
34-
config: L1_350M_partial_conv
33+
# - model_name: L1-350M-partial-conv
34+
# config: L1_350M_partial_conv
3535

3636
script: |
3737
git clone https://github.com/NVIDIA/bionemo-framework.git
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# @package _global_
2+
defaults:
3+
- /base
4+
- _self_
5+
6+
# Base configuration
7+
recipe_subdir: geneformer_native_te_mfsdp_fp8
8+
train_cmnd: train
9+
10+
num_train_steps: 10000
11+
12+
wandb_init_args:
13+
project: "geneformer_native_te_mfsdp_fp8"
14+
15+
# Run one for each config
16+
products:
17+
- config: 10m
18+
resource_shape: gpu.4xh200
19+
- config: 106m
20+
resource_shape: gpu.4xh200
21+
- config: 4b
22+
resource_shape: gpu.8xh200
23+
24+
train_script: |
25+
torchrun ${train_cmnd}.py \
26+
--config-name ${config}.yaml \
27+
training.num_train_steps=${num_train_steps} \
28+
wandb_init_args.mode=${wandb_init_args.mode} \
29+
wandb_init_args.project=${wandb_init_args.project} \
30+
+wandb_init_args.group=${wandb_init_args.group}

ci/lepton/model_convergence/configs/recipes/geneformer_native_te_nvfsdp_fp8.yaml

Lines changed: 0 additions & 45 deletions
This file was deleted.
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
hydra-core==1.3.2
2+
omegaconf==2.3.0
3+
leptonai==0.26.1

ci/lepton/model_convergence/scripts/launch_job.py

Lines changed: 114 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -21,18 +21,8 @@
2121

2222
def wrap_script_with_logging(
2323
script: str,
24-
dashboard_info: Dict[str, str] = None,
25-
recipe_subdir: str = "esm2_native_te_mfsdp",
2624
all_config_json: str = "{}",
2725
) -> str:
28-
if isinstance(dashboard_info, (HydraDictConfig, ListConfig)):
29-
dashboard_info = OmegaConf.to_container(dashboard_info, resolve=True)
30-
if dashboard_info is None:
31-
dashboard_info = {}
32-
33-
# serialize after conversion
34-
dashboard_json = json.dumps(dashboard_info, separators=(",", ":"))
35-
3626
return f"""set -euo pipefail
3727
3828
# Get job name
@@ -46,8 +36,20 @@ def wrap_script_with_logging(
4636
RC=$?
4737
set -e
4838
39+
echo "pwd"
40+
pwd
41+
42+
echo "ls"
43+
ls
44+
45+
echo "commit in bionemo-framework"
46+
(cd bionemo-framework && git log -1 || true)
47+
# Always grab the exact commit currently checked out in the framework repo
48+
COMMIT_SHA="$(cd bionemo-framework && git rev-parse HEAD 2>/dev/null || true)"
49+
echo "Resolved framework commit: ${{COMMIT_SHA:-<none>}}"
50+
4951
# Authenticate to Lepton
50-
pip install -q leptonai >/dev/null 2>&1 || pip install leptonai
52+
pip install -q leptonai >/dev/null 2>&1 || pip install -q leptonai || true
5153
lep login -c "$LEP_LOGIN_CREDENTIALS" || true
5254
5355
# Get lepton job details
@@ -104,25 +106,104 @@ def wrap_script_with_logging(
104106
}}
105107
' 2>/dev/null
106108
)"
107-
108109
JOB_INFO_JSON="$(printf '%s' "$JOB_INFO" | jq -c . 2>/dev/null || echo '{{}}')"
110+
111+
# Ingest provided config JSON
109112
ALL_CONFIG_JSON='{all_config_json}'
110-
DASHBOARD_INFO_JSON='{dashboard_json}'
113+
if echo "$ALL_CONFIG_JSON" | jq -e . >/dev/null 2>&1; then
114+
ALL_CONFIG_JSON_UPDATED="$(printf '%s' "$ALL_CONFIG_JSON" | jq -c '.')"
115+
else
116+
echo "Warning: ALL_CONFIG_JSON is not valid JSON. Using empty object."
117+
ALL_CONFIG_JSON_UPDATED='{{}}'
118+
fi
119+
120+
# Inject/overwrite the resolved framework commit (only if we actually got one)
121+
if [ -n "${{COMMIT_SHA:-}}" ]; then
122+
ALL_CONFIG_JSON_UPDATED="$(printf '%s' "$ALL_CONFIG_JSON_UPDATED" | jq -c --arg commit "$COMMIT_SHA" '.commit_sha = $commit')"
123+
fi
124+
125+
# Extract values from config (with sensible defaults)
126+
RECIPE_SUBDIR="$(printf '%s' "$ALL_CONFIG_JSON_UPDATED" | jq -r '.recipe_subdir // "esm2_native_te_mfsdp"')"
127+
128+
# ---------------------------
129+
# Collect NVIDIA SMI as JSON (no cuda_version in --query-gpu)
130+
# ---------------------------
131+
set +e
132+
NVIDIA_SMI_BIN="$(command -v nvidia-smi || echo /usr/bin/nvidia-smi)"
133+
NVIDIA_SMI_JSON="[]"
134+
for GPU_FIELDS in \
135+
'index,uuid,name,driver_version,pci.bus_id,pstate,temperature.gpu,power.draw,power.limit,clocks.sm,clocks.mem,clocks.gr,memory.total,memory.free,memory.used,utilization.gpu,utilization.memory,compute_mode' \
136+
'index,uuid,name,driver_version,pci.bus_id,pstate,temperature.gpu,power.draw,power.limit,clocks.current.sm,clocks.current.memory,clocks.current.graphics,memory.total,memory.free,memory.used,utilization.gpu,utilization.memory,compute_mode' \
137+
'index,uuid,name,driver_version,pci.bus_id,memory.total,memory.free,memory.used,utilization.gpu'; do
138+
RAW_SMI="$("$NVIDIA_SMI_BIN" --query-gpu="$GPU_FIELDS" --format=csv,noheader,nounits 2>/dev/null || true)"
139+
if [ -n "$RAW_SMI" ]; then
140+
NVIDIA_SMI_JSON="$(
141+
GPU_FIELDS="$GPU_FIELDS" python3 - <<'PY' 2>/dev/null || true
142+
import os, sys, csv, json
143+
keys = [s.strip() for s in os.environ.get("GPU_FIELDS","").split(",") if s.strip()]
144+
rows = []
145+
for r in csv.reader(sys.stdin):
146+
if not r:
147+
continue
148+
vals = [x.strip() for x in r]
149+
if len(vals) < len(keys):
150+
vals += [None]*(len(keys)-len(vals))
151+
rows.append(dict(zip(keys, vals[:len(keys)])))
152+
print(json.dumps(rows))
153+
PY
154+
<<< "$RAW_SMI"
155+
)"
156+
if [ -n "$NVIDIA_SMI_JSON" ] && [ "$NVIDIA_SMI_JSON" != "[]" ]; then
157+
break
158+
fi
159+
fi
160+
done
161+
162+
RAW_APPS="$("$NVIDIA_SMI_BIN" --query-compute-apps=gpu_uuid,pid,process_name,used_memory --format=csv,noheader,nounits 2>/dev/null || true)"
163+
if [ -n "$RAW_APPS" ]; then
164+
NVIDIA_COMPUTE_APPS_JSON="$(
165+
python3 - <<'PY' 2>/dev/null || true
166+
import sys, csv, json
167+
rows=[]
168+
for r in csv.reader(sys.stdin):
169+
if not r:
170+
continue
171+
gpu_uuid = r[0].strip() if len(r)>0 else None
172+
# pid as int where possible
173+
pid = None
174+
if len(r)>1:
175+
try: pid = int(r[1].strip())
176+
except: pid = None
177+
process = r[2].strip() if len(r)>2 else None
178+
used_mem = r[3].strip() if len(r)>3 else None
179+
rows.append({{"gpu_uuid": gpu_uuid, "pid": pid, "process_name": process, "used_memory": used_mem}})
180+
print(json.dumps(rows))
181+
PY
182+
<<< "$RAW_APPS"
183+
)"
184+
else
185+
NVIDIA_COMPUTE_APPS_JSON="[]"
186+
fi
187+
188+
# Driver/CUDA at top level from -q (stable across versions)
189+
DRIVER_VERSION="$("$NVIDIA_SMI_BIN" -q 2>/dev/null | awk -F': ' '/Driver Version/ {{print $2; exit}}')"
190+
CUDA_VERSION="$("$NVIDIA_SMI_BIN" -q 2>/dev/null | awk -F': ' '/CUDA Version/ {{print $2; exit}}')"
191+
NVIDIA_DRIVER_INFO="$(jq -n --arg dv "$DRIVER_VERSION" --arg cv "$CUDA_VERSION" 'def nn($x): if ($x|length)>0 then $x else null end; {{driver_version: nn($dv), cuda_version: nn($cv)}}' 2>/dev/null || echo '{{}}')"
192+
set -e
111193
112194
# Look for W&B files
113-
WANDB_DIR="/workspace/bionemo-framework/recipes/{recipe_subdir}/wandb"
195+
WANDB_DIR="/workspace/bionemo-framework/recipes/$RECIPE_SUBDIR/wandb"
114196
WANDB_FOUND=0
115197
WANDB_SUMMARY=""
116198
WANDB_METADATA=""
117199
118200
if [ -d "$WANDB_DIR" ]; then
119-
# Use latest-run symlink or find most recent run
120201
if [ -L "$WANDB_DIR/latest-run" ]; then
121202
LATEST_RUN="$WANDB_DIR/latest-run"
122203
else
123204
LATEST_RUN=$(ls -td "$WANDB_DIR"/run-* "$WANDB_DIR"/offline-run-* 2>/dev/null | head -n1)
124205
fi
125-
206+
126207
if [ -n "$LATEST_RUN" ] && [ -d "$LATEST_RUN/files" ]; then
127208
if [ -f "$LATEST_RUN/files/wandb-summary.json" ]; then
128209
WANDB_SUMMARY="$LATEST_RUN/files/wandb-summary.json"
@@ -134,24 +215,28 @@ def wrap_script_with_logging(
134215
135216
if [ "$WANDB_FOUND" = "1" ] && [ -n "$WANDB_SUMMARY" ]; then
136217
echo "Uploading W&B metrics to Kratos..."
137-
218+
138219
METADATA_JSON=$(cat "$WANDB_METADATA" 2>/dev/null || echo '{{}}')
139220
SUMMARY_JSON=$(cat "$WANDB_SUMMARY" 2>/dev/null || echo '{{}}')
140221
141222
COMBINED_JSON=$(jq -n \
142223
--arg m "$METADATA_JSON" \
143224
--arg s "$SUMMARY_JSON" \
144225
--argjson job_info "$JOB_INFO_JSON" \
145-
--argjson dashboard_info "$DASHBOARD_INFO_JSON" \
146-
--argjson all_config "$ALL_CONFIG_JSON" \
226+
--argjson all_config "$ALL_CONFIG_JSON_UPDATED" \
227+
--argjson nvidia_smi "$NVIDIA_SMI_JSON" \
228+
--argjson nvidia_compute_apps "$NVIDIA_COMPUTE_APPS_JSON" \
229+
--argjson nvidia_driver "$NVIDIA_DRIVER_INFO" \
147230
'
148231
. + {{
149232
job_name: env.LEPTON_JOB_NAME,
150233
metadata: ($m | fromjson? // {{}}),
151234
summary: ($s | fromjson? // {{}}),
152235
job_info: $job_info,
153-
dashboard_info: $dashboard_info,
154-
config: $all_config
236+
config: $all_config,
237+
nvidia_smi: $nvidia_smi,
238+
nvidia_compute_apps: $nvidia_compute_apps,
239+
nvidia_driver: $nvidia_driver
155240
}}
156241
')
157242
@@ -239,8 +324,6 @@ def launch_single_job(client, cfg: DictConfig):
239324
"-c",
240325
wrap_script_with_logging(
241326
cfg.script,
242-
dashboard_info=cfg.dashboard_info if hasattr(cfg, 'dashboard_info') else None,
243-
recipe_subdir=cfg.recipe_subdir if hasattr(cfg, 'recipe_subdir') else "esm2_native_te_mfsdp",
244327
all_config_json=full_cfg_json,
245328
),
246329
]
@@ -325,10 +408,14 @@ def main(cfg: DictConfig):
325408
# Create new OmegaConf object from merged dict
326409
product_cfg = OmegaConf.create(merged_dict)
327410

328-
# Generate job name as recipe_subdir-model_name, replacing underscores and slashes with hyphens
329-
recipe_subdir = product_cfg.recipe_subdir.replace('_', '-').replace('/', '-')
330-
model_name = product_dict['model_name'].replace('_', '-').replace('/', '-')
331-
product_cfg.job_name = f"{model_name}".lower()
411+
# Generate job name using recipe_subdir and config value
412+
# Extract the base recipe name from recipe_subdir (e.g., "geneformer" from "geneformer_native_te_mfsdp_fp8")
413+
recipe_parts = product_cfg.recipe_subdir.split('_')
414+
base_recipe_name = recipe_parts[0] if recipe_parts else product_cfg.recipe_subdir
415+
416+
# Create job name as base_recipe_name-config (e.g., "geneformer-10m")
417+
config_name = product_dict['config'].replace('_', '-').replace('/', '-')
418+
product_cfg.job_name = f"{base_recipe_name}-{config_name}".lower()
332419

333420
print(f"\n[{i}/{len(cfg.products)}] Launching: {product_cfg.job_name}")
334421

0 commit comments

Comments
 (0)