change zones

kddubey · kddubey · commit 6014f9eee039 · 2026-05-09T00:57:25.000-07:00
diff --git a/bin/_startup.sh b/bin/_startup.sh
@@ -4,7 +4,10 @@
 # after SSH'ing in, run `sudo -i` first so $HOME=/root and paths line up.
 set -euo pipefail
 
-# Install uv (manages its own Python; respects .python-version in the repo).
+# GCP's metadata script runner doesn't export HOME; default it so `set -u` doesn't abort on $HOME below.
+export HOME="${HOME:-/root}"
+
+# Install uv (manages its own Python, respects .python-version in the repo).
 curl -LsSf https://astral.sh/uv/install.sh | sh
 export PATH="$HOME/.local/bin:$PATH"
 
diff --git a/src/grouping_trainer/launch.py b/src/grouping_trainer/launch.py
@@ -41,15 +41,14 @@ class GpuConfig(BaseModel):
     model_config = ConfigDict(frozen=True, extra="forbid")
 
     name: str
-    zone: str  # default zone; flex-start capacity varies across regions, so this
-    # gets overridden via the --zone flag when the default is dry.
+    zone: str = "us-central1-a"
     machine_type: str
-    accelerator: str | None  # None for *-ddp variants — accelerators are built
-    # into the machine type, so passing --accelerator is redundant/erroneous.
+    accelerator: str | None  # None for *-ddp variants because accelerators are built into the machine type
     max_run: str
     install_nvidia_driver: bool
     reservation_affinity: Literal["none", "any"]
-    wait: bool  # whether to block locally on instance creation. False adds --async.
+    wait_for_instance_creation: bool
+    is_for_training: bool
 
 
 gpu_type_to_config: dict[GpuType, GpuConfig] = {
@@ -61,7 +60,7 @@ class GpuConfig(BaseModel):
         max_run="86400s",
         install_nvidia_driver=False,
         reservation_affinity="any",
-        wait=True,  # L4s come up fast; block so errors surface promptly
+        wait_for_instance_creation=True,  # L4s come up fast. Block so errors surface promptly
     ),
     "h100": GpuConfig(
         name="grouping-trainer-h100",
@@ -71,7 +70,7 @@ class GpuConfig(BaseModel):
         max_run="86400s",
         install_nvidia_driver=True,
         reservation_affinity="none",
-        wait=False,  # flex-start can queue for up to 1h; don't block the shell
+        wait_for_instance_creation=False,  # flex-start can queue for up to 1h
     ),
     "h100-ddp": GpuConfig(
         name="grouping-trainer-h100-ddp",
@@ -81,7 +80,7 @@ class GpuConfig(BaseModel):
         max_run="172800s",
         install_nvidia_driver=True,
         reservation_affinity="none",
-        wait=False,
+        wait_for_instance_creation=False,
     ),
     "a100": GpuConfig(
         name="grouping-trainer-a100",
@@ -91,7 +90,7 @@ class GpuConfig(BaseModel):
         max_run="86400s",
         install_nvidia_driver=True,
         reservation_affinity="none",
-        wait=False,
+        wait_for_instance_creation=False,
     ),
     "a100-ddp": GpuConfig(
         name="grouping-trainer-a100-ddp",
@@ -101,7 +100,7 @@ class GpuConfig(BaseModel):
         max_run="172800s",
         install_nvidia_driver=True,
         reservation_affinity="none",
-        wait=False,
+        wait_for_instance_creation=False,
     ),
 }
 
diff --git a/train.py b/train.py
@@ -51,6 +51,8 @@ def upload_run_metadata(run_gcs_dir: str, training_config: gt.train.TrainingConf
     "jinaai/jina-embeddings-v5-text-nano-text-matching": 4,
 }
 
+TrainingGpuType = Literal[tuple(gpu_type for gpu_type in gt.launch.gpu_type_to_config.keys() if gpu_type != "l4")]
+
 
 def run(
     base_model: str = "lightonai/modernbert-embed-large",
@@ -60,7 +62,7 @@ def run(
     per_device_train_batch_size: int = 256,
     learning_rate: float = 1e-4,
     tiny_run: bool = False,
-    gpu: Literal["h100", "h100-ddp", "a100", "a100-ddp"] | None = None,
+    gpu: TrainingGpuType | None = None,
     zone: str | None = None,
 ):
     """
@@ -95,10 +97,8 @@ def run(
     if not tiny_run:
         assert run_shortname is not None, "run_shortname is required for full training runs"
 
-    # Generate run_name up front so we can log the artifact URL locally before
-    # auto-launching. On the remote, re-use the local run_name via env var so
-    # both sides log the same GCS path (rather than each generating its own
-    # timestamp).
+    # Generate run_name up front so we can log the artifact URL locally before auto-launching. On the remote, re-use the
+    # local run_name via env var so both sides log the same GCS path (rather than each generating its own timestamp).
     run_name_env = os.environ.get(_RUN_NAME_ENV_VAR)
     if run_name_env:
         run_name = run_name_env