diff --git a/tests/trainer/resources/lora.ipynb b/tests/trainer/resources/lora.ipynb index 145cd9623..370724101 100644 --- a/tests/trainer/resources/lora.ipynb +++ b/tests/trainer/resources/lora.ipynb @@ -438,11 +438,17 @@ "metadata": {}, "outputs": [], "source": [ - "from kubeflow.trainer.options.kubernetes import (\n", - " PodTemplateOverrides,\n", - " PodTemplateOverride,\n", - " PodSpecOverride,\n", - " ContainerOverride,\n", + "from kubeflow.trainer.options import (\n", + " RuntimePatch,\n", + " TrainingRuntimeSpecPatch,\n", + " JobSetTemplatePatch,\n", + " JobSetSpecPatch,\n", + " ReplicatedJobPatch,\n", + " JobTemplatePatch,\n", + " JobSpecPatch,\n", + " PodTemplatePatch,\n", + " PodSpecPatch,\n", + " ContainerPatch\n", ")\n", "\n", "cache_root = \"/opt/app-root/src/.cache/huggingface\"\n", @@ -465,22 +471,36 @@ " },\n", " ),\n", " options=[\n", - " PodTemplateOverrides(\n", - " PodTemplateOverride(\n", - " target_jobs=[\"node\"],\n", - " spec=PodSpecOverride(\n", - " volumes=[\n", - " {\"name\": \"work\", \"persistentVolumeClaim\": {\"claimName\": PVC_NAME}},\n", - " ],\n", - " containers=[\n", - " ContainerOverride(\n", - " name=\"node\",\n", - " volume_mounts=[\n", - " {\"name\": \"work\", \"mountPath\": \"/opt/app-root/src\", \"readOnly\": False},\n", - " ],\n", - " )\n", - " ],\n", - " ),\n", + " RuntimePatch(\n", + " training_runtime_spec=TrainingRuntimeSpecPatch(\n", + " template=JobSetTemplatePatch(\n", + " spec=JobSetSpecPatch(\n", + " replicated_jobs=[\n", + " ReplicatedJobPatch(\n", + " name=\"node\",\n", + " template=JobTemplatePatch(\n", + " spec=JobSpecPatch(\n", + " template=PodTemplatePatch(\n", + " spec=PodSpecPatch(\n", + " volumes=[\n", + " {\"name\": \"work\", \"persistentVolumeClaim\": {\"claimName\": PVC_NAME}},\n", + " ],\n", + " containers=[\n", + " ContainerPatch(\n", + " name=\"node\",\n", + " volume_mounts=[\n", + " {\"name\": \"work\", \"mountPath\": \"/opt/app-root/src\", \"readOnly\": False},\n", + " ],\n", + " )\n", + " ],\n", + " )\n", + " )\n", + " )\n", + " )\n", + " )\n", + " ]\n", + " )\n", + " )\n", " )\n", " )\n", " ],\n", diff --git a/tests/trainer/resources/mnist.ipynb b/tests/trainer/resources/mnist.ipynb index 7933f910d..0a5f5345b 100644 --- a/tests/trainer/resources/mnist.ipynb +++ b/tests/trainer/resources/mnist.ipynb @@ -383,7 +383,19 @@ "source": [ "import os\n", "from kubeflow.trainer import CustomTrainer\n", - "from kubeflow.trainer.options import PodTemplateOverrides, PodTemplateOverride, PodSpecOverride, ContainerOverride, Labels\n", + "from kubeflow.trainer.options import (\n", + " RuntimePatch,\n", + " TrainingRuntimeSpecPatch,\n", + " JobSetTemplatePatch,\n", + " JobSetSpecPatch,\n", + " ReplicatedJobPatch,\n", + " JobTemplatePatch,\n", + " JobSpecPatch,\n", + " PodTemplatePatch,\n", + " PodSpecPatch,\n", + " ContainerPatch,\n", + " Labels\n", + ")\n", "\n", "pvc_name = os.getenv(\"SHARED_PVC_NAME\", \"\")\n", "print(f\"[notebook] Using PVC: {pvc_name}\")\n", @@ -408,24 +420,38 @@ " ),\n", " runtime=torch_runtime,\n", " options=[\n", - " PodTemplateOverrides(\n", - " PodTemplateOverride(\n", - " target_jobs=[\"node\"],\n", - " spec=PodSpecOverride(\n", - " volumes=[\n", - " {\n", - " \"name\": \"work\",\n", - " \"persistentVolumeClaim\": {\"claimName\": pvc_name},\n", - " }\n", - " ],\n", - " containers=[\n", - " ContainerOverride(\n", - " name=\"node\",\n", - " volume_mounts=[\n", - " {\"name\": \"work\", \"mountPath\": \"/mnt/shared\", \"readOnly\": False}\n", - " ],\n", - " )\n", - " ],\n", + " RuntimePatch(\n", + " training_runtime_spec=TrainingRuntimeSpecPatch(\n", + " template=JobSetTemplatePatch(\n", + " spec=JobSetSpecPatch(\n", + " replicated_jobs=[\n", + " ReplicatedJobPatch(\n", + " name=\"node\",\n", + " template=JobTemplatePatch(\n", + " spec=JobSpecPatch(\n", + " template=PodTemplatePatch(\n", + " spec=PodSpecPatch(\n", + " volumes=[\n", + " {\n", + " \"name\": \"work\",\n", + " \"persistentVolumeClaim\": {\"claimName\": pvc_name},\n", + " }\n", + " ],\n", + " containers=[\n", + " ContainerPatch(\n", + " name=\"node\",\n", + " volume_mounts=[\n", + " {\"name\": \"work\", \"mountPath\": \"/mnt/shared\", \"readOnly\": False}\n", + " ],\n", + " )\n", + " ],\n", + " )\n", + " )\n", + " )\n", + " )\n", + " )\n", + " ]\n", + " )\n", " )\n", " )\n", " )\n", diff --git a/tests/trainer/resources/osft.ipynb b/tests/trainer/resources/osft.ipynb index fe3665666..71036315c 100644 --- a/tests/trainer/resources/osft.ipynb +++ b/tests/trainer/resources/osft.ipynb @@ -366,11 +366,17 @@ "outputs": [], "source": [ "\n", - "from kubeflow.trainer.options.kubernetes import (\n", - " PodTemplateOverrides,\n", - " PodTemplateOverride,\n", - " PodSpecOverride,\n", - " ContainerOverride,\n", + "from kubeflow.trainer.options import (\n", + " RuntimePatch,\n", + " TrainingRuntimeSpecPatch,\n", + " JobSetTemplatePatch,\n", + " JobSetSpecPatch,\n", + " ReplicatedJobPatch,\n", + " JobTemplatePatch,\n", + " JobSpecPatch,\n", + " PodTemplatePatch,\n", + " PodSpecPatch,\n", + " ContainerPatch\n", ")\n", "\n", "cache_root = \"/opt/app-root/src/.cache/huggingface\"\n", @@ -393,22 +399,36 @@ " },\n", " ),\n", " options=[\n", - " PodTemplateOverrides(\n", - " PodTemplateOverride(\n", - " target_jobs=[\"node\"],\n", - " spec=PodSpecOverride(\n", - " volumes=[\n", - " {\"name\": \"work\", \"persistentVolumeClaim\": {\"claimName\": PVC_NAME}},\n", - " ],\n", - " containers=[\n", - " ContainerOverride(\n", - " name=\"node\",\n", - " volume_mounts=[\n", - " {\"name\": \"work\", \"mountPath\": \"/opt/app-root/src\", \"readOnly\": False},\n", - " ],\n", - " )\n", - " ],\n", - " ),\n", + " RuntimePatch(\n", + " training_runtime_spec=TrainingRuntimeSpecPatch(\n", + " template=JobSetTemplatePatch(\n", + " spec=JobSetSpecPatch(\n", + " replicated_jobs=[\n", + " ReplicatedJobPatch(\n", + " name=\"node\",\n", + " template=JobTemplatePatch(\n", + " spec=JobSpecPatch(\n", + " template=PodTemplatePatch(\n", + " spec=PodSpecPatch(\n", + " volumes=[\n", + " {\"name\": \"work\", \"persistentVolumeClaim\": {\"claimName\": PVC_NAME}},\n", + " ],\n", + " containers=[\n", + " ContainerPatch(\n", + " name=\"node\",\n", + " volume_mounts=[\n", + " {\"name\": \"work\", \"mountPath\": \"/opt/app-root/src\", \"readOnly\": False},\n", + " ],\n", + " )\n", + " ],\n", + " )\n", + " )\n", + " )\n", + " )\n", + " )\n", + " ]\n", + " )\n", + " )\n", " )\n", " )\n", " ],\n", diff --git a/tests/trainer/resources/rhai_features.ipynb b/tests/trainer/resources/rhai_features.ipynb index 1538aa6cc..0211afa31 100644 --- a/tests/trainer/resources/rhai_features.ipynb +++ b/tests/trainer/resources/rhai_features.ipynb @@ -359,7 +359,18 @@ "outputs": [], "source": [ "from kubeflow.trainer.rhai.transformers import TransformersTrainer\n", - "from kubeflow.trainer.options import PodTemplateOverrides, PodTemplateOverride, PodSpecOverride, ContainerOverride\n", + "from kubeflow.trainer.options import (\n", + " RuntimePatch,\n", + " TrainingRuntimeSpecPatch,\n", + " JobSetTemplatePatch,\n", + " JobSetSpecPatch,\n", + " ReplicatedJobPatch,\n", + " JobTemplatePatch,\n", + " JobSpecPatch,\n", + " PodTemplatePatch,\n", + " PodSpecPatch,\n", + " ContainerPatch\n", + ")\n", "import os\n", "\n", "# Read feature flags from environment\n", @@ -436,7 +447,7 @@ " # (SDK bug: verification treats S3 prefix as a file)\n", " trainer_kwargs[\"verify_cloud_storage_access\"] = False\n", "\n", - "# Build volumes and volume mounts for pod template overrides\n", + "# Build volumes and volume mounts for runtime patches\n", "volumes = [\n", " {\"name\": \"workspace\", \"persistentVolumeClaim\": {\"claimName\": shared_pvc_name}},\n", "]\n", @@ -453,15 +464,29 @@ " trainer=TransformersTrainer(**trainer_kwargs),\n", " runtime=torch_runtime,\n", " options=[\n", - " PodTemplateOverrides(\n", - " PodTemplateOverride(\n", - " target_jobs=[\"node\"],\n", - " spec=PodSpecOverride(\n", - " volumes=volumes,\n", - " containers=[ContainerOverride(\n", - " name=\"node\",\n", - " volume_mounts=volume_mounts\n", - " )]\n", + " RuntimePatch(\n", + " training_runtime_spec=TrainingRuntimeSpecPatch(\n", + " template=JobSetTemplatePatch(\n", + " spec=JobSetSpecPatch(\n", + " replicated_jobs=[\n", + " ReplicatedJobPatch(\n", + " name=\"node\",\n", + " template=JobTemplatePatch(\n", + " spec=JobSpecPatch(\n", + " template=PodTemplatePatch(\n", + " spec=PodSpecPatch(\n", + " volumes=volumes,\n", + " containers=[ContainerPatch(\n", + " name=\"node\",\n", + " volume_mounts=volume_mounts\n", + " )]\n", + " )\n", + " )\n", + " )\n", + " )\n", + " )\n", + " ]\n", + " )\n", " )\n", " )\n", " )\n", diff --git a/tests/trainer/resources/rhai_features_deepspeed_stage0.ipynb b/tests/trainer/resources/rhai_features_deepspeed_stage0.ipynb index beff338a6..c32bfd1bc 100644 --- a/tests/trainer/resources/rhai_features_deepspeed_stage0.ipynb +++ b/tests/trainer/resources/rhai_features_deepspeed_stage0.ipynb @@ -374,7 +374,18 @@ "outputs": [], "source": [ "from kubeflow.trainer.rhai.transformers import TransformersTrainer\n", - "from kubeflow.trainer.options import PodTemplateOverrides, PodTemplateOverride, PodSpecOverride, ContainerOverride\n", + "from kubeflow.trainer.options import (\n", + " RuntimePatch,\n", + " TrainingRuntimeSpecPatch,\n", + " JobSetTemplatePatch,\n", + " JobSetSpecPatch,\n", + " ReplicatedJobPatch,\n", + " JobTemplatePatch,\n", + " JobSpecPatch,\n", + " PodTemplatePatch,\n", + " PodSpecPatch,\n", + " ContainerPatch\n", + ")\n", "import os\n", "\n", "# Read feature flags from environment\n", @@ -453,23 +464,37 @@ " trainer=TransformersTrainer(**trainer_kwargs),\n", " runtime=torch_runtime,\n", " options=[\n", - " PodTemplateOverrides(\n", - " PodTemplateOverride(\n", - " target_jobs=[\"node\"],\n", - " spec=PodSpecOverride(\n", - " volumes=[\n", - " {\"name\": \"workspace\", \"persistentVolumeClaim\": {\"claimName\": shared_pvc_name}},\n", - " # Add larger /dev/shm for DeepSpeed/NCCL (fixes \"No space left on device\" error)\n", - " # Note: Some SDK versions may not support sizeLimit - if validation fails, remove sizeLimit\n", - " {\"name\": \"dshm\", \"emptyDir\": {\"medium\": \"Memory\"}} # sizeLimit may cause ValidationError in some SDK versions\n", - " ],\n", - " containers=[ContainerOverride(\n", - " name=\"node\",\n", - " volume_mounts=[\n", - " {\"name\": \"workspace\", \"mountPath\": \"/workspace\"},\n", - " {\"name\": \"dshm\", \"mountPath\": \"/dev/shm\"}\n", + " RuntimePatch(\n", + " training_runtime_spec=TrainingRuntimeSpecPatch(\n", + " template=JobSetTemplatePatch(\n", + " spec=JobSetSpecPatch(\n", + " replicated_jobs=[\n", + " ReplicatedJobPatch(\n", + " name=\"node\",\n", + " template=JobTemplatePatch(\n", + " spec=JobSpecPatch(\n", + " template=PodTemplatePatch(\n", + " spec=PodSpecPatch(\n", + " volumes=[\n", + " {\"name\": \"workspace\", \"persistentVolumeClaim\": {\"claimName\": shared_pvc_name}},\n", + " # Add larger /dev/shm for DeepSpeed/NCCL (fixes \"No space left on device\" error)\n", + " # Note: Some SDK versions may not support sizeLimit - if validation fails, remove sizeLimit\n", + " {\"name\": \"dshm\", \"emptyDir\": {\"medium\": \"Memory\"}} # sizeLimit may cause ValidationError in some SDK versions\n", + " ],\n", + " containers=[ContainerPatch(\n", + " name=\"node\",\n", + " volume_mounts=[\n", + " {\"name\": \"workspace\", \"mountPath\": \"/workspace\"},\n", + " {\"name\": \"dshm\", \"mountPath\": \"/dev/shm\"}\n", + " ]\n", + " )]\n", + " )\n", + " )\n", + " )\n", + " )\n", + " )\n", " ]\n", - " )]\n", + " )\n", " )\n", " )\n", " )\n", diff --git a/tests/trainer/resources/rhai_features_fsdp_full_state.ipynb b/tests/trainer/resources/rhai_features_fsdp_full_state.ipynb index 2e8a3f153..0ea2b0ff8 100644 --- a/tests/trainer/resources/rhai_features_fsdp_full_state.ipynb +++ b/tests/trainer/resources/rhai_features_fsdp_full_state.ipynb @@ -355,7 +355,18 @@ "outputs": [], "source": [ "from kubeflow.trainer.rhai.transformers import TransformersTrainer\n", - "from kubeflow.trainer.options import PodTemplateOverrides, PodTemplateOverride, PodSpecOverride, ContainerOverride\n", + "from kubeflow.trainer.options import (\n", + " RuntimePatch,\n", + " TrainingRuntimeSpecPatch,\n", + " JobSetTemplatePatch,\n", + " JobSetSpecPatch,\n", + " ReplicatedJobPatch,\n", + " JobTemplatePatch,\n", + " JobSpecPatch,\n", + " PodTemplatePatch,\n", + " PodSpecPatch,\n", + " ContainerPatch\n", + ")\n", "import os\n", "\n", "# Read feature flags from environment\n", @@ -434,15 +445,29 @@ " trainer=TransformersTrainer(**trainer_kwargs),\n", " runtime=torch_runtime,\n", " options=[\n", - " PodTemplateOverrides(\n", - " PodTemplateOverride(\n", - " target_jobs=[\"node\"],\n", - " spec=PodSpecOverride(\n", - " volumes=[{\"name\": \"workspace\", \"persistentVolumeClaim\": {\"claimName\": shared_pvc_name}}],\n", - " containers=[ContainerOverride(\n", - " name=\"node\",\n", - " volume_mounts=[{\"name\": \"workspace\", \"mountPath\": \"/workspace\"}]\n", - " )]\n", + " RuntimePatch(\n", + " training_runtime_spec=TrainingRuntimeSpecPatch(\n", + " template=JobSetTemplatePatch(\n", + " spec=JobSetSpecPatch(\n", + " replicated_jobs=[\n", + " ReplicatedJobPatch(\n", + " name=\"node\",\n", + " template=JobTemplatePatch(\n", + " spec=JobSpecPatch(\n", + " template=PodTemplatePatch(\n", + " spec=PodSpecPatch(\n", + " volumes=[{\"name\": \"workspace\", \"persistentVolumeClaim\": {\"claimName\": shared_pvc_name}}],\n", + " containers=[ContainerPatch(\n", + " name=\"node\",\n", + " volume_mounts=[{\"name\": \"workspace\", \"mountPath\": \"/workspace\"}]\n", + " )]\n", + " )\n", + " )\n", + " )\n", + " )\n", + " )\n", + " ]\n", + " )\n", " )\n", " )\n", " )\n", diff --git a/tests/trainer/resources/rhai_features_fsdp_shared_state.ipynb b/tests/trainer/resources/rhai_features_fsdp_shared_state.ipynb index 0c9c43ecb..6b33ac33f 100644 --- a/tests/trainer/resources/rhai_features_fsdp_shared_state.ipynb +++ b/tests/trainer/resources/rhai_features_fsdp_shared_state.ipynb @@ -363,7 +363,18 @@ "outputs": [], "source": [ "from kubeflow.trainer.rhai.transformers import TransformersTrainer\n", - "from kubeflow.trainer.options import PodTemplateOverrides, PodTemplateOverride, PodSpecOverride, ContainerOverride\n", + "from kubeflow.trainer.options import (\n", + " RuntimePatch,\n", + " TrainingRuntimeSpecPatch,\n", + " JobSetTemplatePatch,\n", + " JobSetSpecPatch,\n", + " ReplicatedJobPatch,\n", + " JobTemplatePatch,\n", + " JobSpecPatch,\n", + " PodTemplatePatch,\n", + " PodSpecPatch,\n", + " ContainerPatch\n", + ")\n", "import os\n", "\n", "# Read feature flags from environment\n", @@ -442,23 +453,37 @@ " trainer=TransformersTrainer(**trainer_kwargs),\n", " runtime=torch_runtime,\n", " options=[\n", - " PodTemplateOverrides(\n", - " PodTemplateOverride(\n", - " target_jobs=[\"node\"],\n", - " spec=PodSpecOverride(\n", - " volumes=[\n", - " {\"name\": \"workspace\", \"persistentVolumeClaim\": {\"claimName\": shared_pvc_name}},\n", - " # Add larger /dev/shm for FSDP/NCCL (fixes \"No space left on device\" error)\n", - " # Note: Some SDK versions may not support sizeLimit - if validation fails, remove sizeLimit\n", - " {\"name\": \"dshm\", \"emptyDir\": {\"medium\": \"Memory\"}} # sizeLimit may cause ValidationError in some SDK versions\n", - " ],\n", - " containers=[ContainerOverride(\n", - " name=\"node\",\n", - " volume_mounts=[\n", - " {\"name\": \"workspace\", \"mountPath\": \"/workspace\"},\n", - " {\"name\": \"dshm\", \"mountPath\": \"/dev/shm\"}\n", + " RuntimePatch(\n", + " training_runtime_spec=TrainingRuntimeSpecPatch(\n", + " template=JobSetTemplatePatch(\n", + " spec=JobSetSpecPatch(\n", + " replicated_jobs=[\n", + " ReplicatedJobPatch(\n", + " name=\"node\",\n", + " template=JobTemplatePatch(\n", + " spec=JobSpecPatch(\n", + " template=PodTemplatePatch(\n", + " spec=PodSpecPatch(\n", + " volumes=[\n", + " {\"name\": \"workspace\", \"persistentVolumeClaim\": {\"claimName\": shared_pvc_name}},\n", + " # Add larger /dev/shm for FSDP/NCCL (fixes \"No space left on device\" error)\n", + " # Note: Some SDK versions may not support sizeLimit - if validation fails, remove sizeLimit\n", + " {\"name\": \"dshm\", \"emptyDir\": {\"medium\": \"Memory\"}} # sizeLimit may cause ValidationError in some SDK versions\n", + " ],\n", + " containers=[ContainerPatch(\n", + " name=\"node\",\n", + " volume_mounts=[\n", + " {\"name\": \"workspace\", \"mountPath\": \"/workspace\"},\n", + " {\"name\": \"dshm\", \"mountPath\": \"/dev/shm\"}\n", + " ]\n", + " )]\n", + " )\n", + " )\n", + " )\n", + " )\n", + " )\n", " ]\n", - " )]\n", + " )\n", " )\n", " )\n", " )\n", diff --git a/tests/trainer/resources/sft.ipynb b/tests/trainer/resources/sft.ipynb index e8bb808fc..0f67fb579 100644 --- a/tests/trainer/resources/sft.ipynb +++ b/tests/trainer/resources/sft.ipynb @@ -452,11 +452,17 @@ "metadata": {}, "outputs": [], "source": [ - "from kubeflow.trainer.options.kubernetes import (\n", - " PodTemplateOverrides,\n", - " PodTemplateOverride,\n", - " PodSpecOverride,\n", - " ContainerOverride,\n", + "from kubeflow.trainer.options import (\n", + " RuntimePatch,\n", + " TrainingRuntimeSpecPatch,\n", + " JobSetTemplatePatch,\n", + " JobSetSpecPatch,\n", + " ReplicatedJobPatch,\n", + " JobTemplatePatch,\n", + " JobSpecPatch,\n", + " PodTemplatePatch,\n", + " PodSpecPatch,\n", + " ContainerPatch\n", ")\n", "\n", "cache_root = \"/opt/app-root/src/.cache/huggingface\"\n", @@ -479,21 +485,35 @@ " },\n", " ),\n", " options=[\n", - " PodTemplateOverrides(\n", - " PodTemplateOverride(\n", - " target_jobs=[\"node\"],\n", - " spec=PodSpecOverride(\n", - " volumes=[\n", - " {\"name\": \"work\", \"persistentVolumeClaim\": {\"claimName\": PVC_NAME}},\n", - " ],\n", - " containers=[\n", - " ContainerOverride(\n", - " name=\"node\", \n", - " volume_mounts=[\n", - " {\"name\": \"work\", \"mountPath\": \"/opt/app-root/src\", \"readOnly\": False},\n", - " ],\n", - " )\n", - " ],\n", + " RuntimePatch(\n", + " training_runtime_spec=TrainingRuntimeSpecPatch(\n", + " template=JobSetTemplatePatch(\n", + " spec=JobSetSpecPatch(\n", + " replicated_jobs=[\n", + " ReplicatedJobPatch(\n", + " name=\"node\",\n", + " template=JobTemplatePatch(\n", + " spec=JobSpecPatch(\n", + " template=PodTemplatePatch(\n", + " spec=PodSpecPatch(\n", + " volumes=[\n", + " {\"name\": \"work\", \"persistentVolumeClaim\": {\"claimName\": PVC_NAME}},\n", + " ],\n", + " containers=[\n", + " ContainerPatch(\n", + " name=\"node\", \n", + " volume_mounts=[\n", + " {\"name\": \"work\", \"mountPath\": \"/opt/app-root/src\", \"readOnly\": False},\n", + " ],\n", + " )\n", + " ],\n", + " )\n", + " )\n", + " )\n", + " )\n", + " )\n", + " ]\n", + " )\n", " ),\n", " )\n", " )\n", diff --git a/tests/trainer/resources/torchrun_failure.ipynb b/tests/trainer/resources/torchrun_failure.ipynb index 8c6435632..5b47eeaf6 100644 --- a/tests/trainer/resources/torchrun_failure.ipynb +++ b/tests/trainer/resources/torchrun_failure.ipynb @@ -293,11 +293,17 @@ "# The model loads, data loads, torchrun starts the training loop, and then\n", "# the first batch allocation exceeds GPU memory.\n", "\n", - "from kubeflow.trainer.options.kubernetes import (\n", - " PodTemplateOverrides,\n", - " PodTemplateOverride,\n", - " PodSpecOverride,\n", - " ContainerOverride,\n", + "from kubeflow.trainer.options import (\n", + " RuntimePatch,\n", + " TrainingRuntimeSpecPatch,\n", + " JobSetTemplatePatch,\n", + " JobSetSpecPatch,\n", + " ReplicatedJobPatch,\n", + " JobTemplatePatch,\n", + " JobSpecPatch,\n", + " PodTemplatePatch,\n", + " PodSpecPatch,\n", + " ContainerPatch\n", ")\n", "\n", "LOCAL_MODEL_PATH = \"/opt/app-root/src/Qwen/Qwen2.5-1.5B-Instruct\"\n", @@ -336,21 +342,35 @@ " },\n", " ),\n", " options=[\n", - " PodTemplateOverrides(\n", - " PodTemplateOverride(\n", - " target_jobs=[\"node\"],\n", - " spec=PodSpecOverride(\n", - " volumes=[\n", - " {\"name\": \"work\", \"persistentVolumeClaim\": {\"claimName\": PVC_NAME}},\n", - " ],\n", - " containers=[\n", - " ContainerOverride(\n", - " name=\"node\",\n", - " volume_mounts=[\n", - " {\"name\": \"work\", \"mountPath\": \"/opt/app-root/src\", \"readOnly\": False},\n", - " ],\n", - " )\n", - " ],\n", + " RuntimePatch(\n", + " training_runtime_spec=TrainingRuntimeSpecPatch(\n", + " template=JobSetTemplatePatch(\n", + " spec=JobSetSpecPatch(\n", + " replicated_jobs=[\n", + " ReplicatedJobPatch(\n", + " name=\"node\",\n", + " template=JobTemplatePatch(\n", + " spec=JobSpecPatch(\n", + " template=PodTemplatePatch(\n", + " spec=PodSpecPatch(\n", + " volumes=[\n", + " {\"name\": \"work\", \"persistentVolumeClaim\": {\"claimName\": PVC_NAME}},\n", + " ],\n", + " containers=[\n", + " ContainerPatch(\n", + " name=\"node\",\n", + " volume_mounts=[\n", + " {\"name\": \"work\", \"mountPath\": \"/opt/app-root/src\", \"readOnly\": False},\n", + " ],\n", + " )\n", + " ],\n", + " )\n", + " )\n", + " )\n", + " )\n", + " )\n", + " ]\n", + " )\n", " ),\n", " )\n", " )\n",