From de00eed2ebb2ece39cb6c33ba87375c242ec2ddc Mon Sep 17 00:00:00 2001 From: Hari Haran Rathinakumar Date: Thu, 7 May 2026 17:45:08 +0100 Subject: [PATCH 1/3] feat: Migrate RHAI notebooks to RuntimePatches API --- tests/trainer/resources/lora.ipynb | 64 +++++++++++------- tests/trainer/resources/mnist.ipynb | 66 +++++++++++++------ tests/trainer/resources/osft.ipynb | 64 +++++++++++------- tests/trainer/resources/rhai_features.ipynb | 49 ++++++++++---- .../rhai_features_deepspeed_stage0.ipynb | 61 ++++++++++++----- .../rhai_features_fsdp_full_state.ipynb | 47 +++++++++---- .../rhai_features_fsdp_shared_state.ipynb | 61 ++++++++++++----- tests/trainer/resources/sft.ipynb | 62 +++++++++++------ .../trainer/resources/torchrun_failure.ipynb | 62 +++++++++++------ 9 files changed, 371 insertions(+), 165 deletions(-) diff --git a/tests/trainer/resources/lora.ipynb b/tests/trainer/resources/lora.ipynb index 145cd9623..4c848cc02 100644 --- a/tests/trainer/resources/lora.ipynb +++ b/tests/trainer/resources/lora.ipynb @@ -438,11 +438,17 @@ "metadata": {}, "outputs": [], "source": [ - "from kubeflow.trainer.options.kubernetes import (\n", - " PodTemplateOverrides,\n", - " PodTemplateOverride,\n", - " PodSpecOverride,\n", - " ContainerOverride,\n", + "from kubeflow.trainer.options import (\n", + " RuntimePatch,\n", + " TrainingRuntimeSpecPatch,\n", + " JobSetTemplatePatch,\n", + " JobSetSpecPatch,\n", + " ReplicatedJobPatch,\n", + " JobTemplatePatch,\n", + " JobSpecPatch,\n", + " PodTemplatePatch,\n", + " PodSpecPatch,\n", + " ContainerPatch\n", ")\n", "\n", "cache_root = \"/opt/app-root/src/.cache/huggingface\"\n", @@ -465,22 +471,36 @@ " },\n", " ),\n", " options=[\n", - " PodTemplateOverrides(\n", - " PodTemplateOverride(\n", - " target_jobs=[\"node\"],\n", - " spec=PodSpecOverride(\n", - " volumes=[\n", - " {\"name\": \"work\", \"persistentVolumeClaim\": {\"claimName\": PVC_NAME}},\n", - " ],\n", - " containers=[\n", - " ContainerOverride(\n", - " name=\"node\",\n", - " volume_mounts=[\n", - " {\"name\": \"work\", \"mountPath\": \"/opt/app-root/src\", \"readOnly\": False},\n", - " ],\n", - " )\n", - " ],\n", - " ),\n", + " RuntimePatch(\n", + " training_runtime_spec=TrainingRuntimeSpecPatch(\n", + " template=JobSetTemplatePatch(\n", + " spec=JobSetSpecPatch(\n", + " replicated_jobs=[\n", + " ReplicatedJobPatch(\n", + " name=\"node\",\n", + " template=JobTemplatePatch(\n", + " spec=JobSpecPatch(\n", + " template=PodTemplatePatch(\n", + " spec=PodSpecPatch(\n", + " volumes=[\n", + " {\"name\": \"work\", \"persistentVolumeClaim\": {\"claimName\": PVC_NAME}},\n", + " ],\n", + " containers=[\n", + " ContainerPatch(\n", + " name=\"node\",\n", + " volume_mounts=[\n", + " {\"name\": \"work\", \"mountPath\": \"/opt/app-root/src\", \"readOnly\": False},\n", + " ],\n", + " )\n", + " ],\n", + " )\n", + " )\n", + " )\n", + " )\n", + " )\n", + " ]\n", + " )\n", + " )\n", " )\n", " )\n", " ],\n", @@ -584,4 +604,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/tests/trainer/resources/mnist.ipynb b/tests/trainer/resources/mnist.ipynb index 7933f910d..b006e652e 100644 --- a/tests/trainer/resources/mnist.ipynb +++ b/tests/trainer/resources/mnist.ipynb @@ -383,7 +383,19 @@ "source": [ "import os\n", "from kubeflow.trainer import CustomTrainer\n", - "from kubeflow.trainer.options import PodTemplateOverrides, PodTemplateOverride, PodSpecOverride, ContainerOverride, Labels\n", + "from kubeflow.trainer.options import (\n", + " RuntimePatch,\n", + " TrainingRuntimeSpecPatch,\n", + " JobSetTemplatePatch,\n", + " JobSetSpecPatch,\n", + " ReplicatedJobPatch,\n", + " JobTemplatePatch,\n", + " JobSpecPatch,\n", + " PodTemplatePatch,\n", + " PodSpecPatch,\n", + " ContainerPatch,\n", + " Labels\n", + ")\n", "\n", "pvc_name = os.getenv(\"SHARED_PVC_NAME\", \"\")\n", "print(f\"[notebook] Using PVC: {pvc_name}\")\n", @@ -408,24 +420,38 @@ " ),\n", " runtime=torch_runtime,\n", " options=[\n", - " PodTemplateOverrides(\n", - " PodTemplateOverride(\n", - " target_jobs=[\"node\"],\n", - " spec=PodSpecOverride(\n", - " volumes=[\n", - " {\n", - " \"name\": \"work\",\n", - " \"persistentVolumeClaim\": {\"claimName\": pvc_name},\n", - " }\n", - " ],\n", - " containers=[\n", - " ContainerOverride(\n", - " name=\"node\",\n", - " volume_mounts=[\n", - " {\"name\": \"work\", \"mountPath\": \"/mnt/shared\", \"readOnly\": False}\n", - " ],\n", - " )\n", - " ],\n", + " RuntimePatch(\n", + " training_runtime_spec=TrainingRuntimeSpecPatch(\n", + " template=JobSetTemplatePatch(\n", + " spec=JobSetSpecPatch(\n", + " replicated_jobs=[\n", + " ReplicatedJobPatch(\n", + " name=\"node\",\n", + " template=JobTemplatePatch(\n", + " spec=JobSpecPatch(\n", + " template=PodTemplatePatch(\n", + " spec=PodSpecPatch(\n", + " volumes=[\n", + " {\n", + " \"name\": \"work\",\n", + " \"persistentVolumeClaim\": {\"claimName\": pvc_name},\n", + " }\n", + " ],\n", + " containers=[\n", + " ContainerPatch(\n", + " name=\"node\",\n", + " volume_mounts=[\n", + " {\"name\": \"work\", \"mountPath\": \"/mnt/shared\", \"readOnly\": False}\n", + " ],\n", + " )\n", + " ],\n", + " )\n", + " )\n", + " )\n", + " )\n", + " )\n", + " ]\n", + " )\n", " )\n", " )\n", " )\n", @@ -551,4 +577,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/tests/trainer/resources/osft.ipynb b/tests/trainer/resources/osft.ipynb index fe3665666..5e8d3f2b7 100644 --- a/tests/trainer/resources/osft.ipynb +++ b/tests/trainer/resources/osft.ipynb @@ -366,11 +366,17 @@ "outputs": [], "source": [ "\n", - "from kubeflow.trainer.options.kubernetes import (\n", - " PodTemplateOverrides,\n", - " PodTemplateOverride,\n", - " PodSpecOverride,\n", - " ContainerOverride,\n", + "from kubeflow.trainer.options import (\n", + " RuntimePatch,\n", + " TrainingRuntimeSpecPatch,\n", + " JobSetTemplatePatch,\n", + " JobSetSpecPatch,\n", + " ReplicatedJobPatch,\n", + " JobTemplatePatch,\n", + " JobSpecPatch,\n", + " PodTemplatePatch,\n", + " PodSpecPatch,\n", + " ContainerPatch\n", ")\n", "\n", "cache_root = \"/opt/app-root/src/.cache/huggingface\"\n", @@ -393,22 +399,36 @@ " },\n", " ),\n", " options=[\n", - " PodTemplateOverrides(\n", - " PodTemplateOverride(\n", - " target_jobs=[\"node\"],\n", - " spec=PodSpecOverride(\n", - " volumes=[\n", - " {\"name\": \"work\", \"persistentVolumeClaim\": {\"claimName\": PVC_NAME}},\n", - " ],\n", - " containers=[\n", - " ContainerOverride(\n", - " name=\"node\",\n", - " volume_mounts=[\n", - " {\"name\": \"work\", \"mountPath\": \"/opt/app-root/src\", \"readOnly\": False},\n", - " ],\n", - " )\n", - " ],\n", - " ),\n", + " RuntimePatch(\n", + " training_runtime_spec=TrainingRuntimeSpecPatch(\n", + " template=JobSetTemplatePatch(\n", + " spec=JobSetSpecPatch(\n", + " replicated_jobs=[\n", + " ReplicatedJobPatch(\n", + " name=\"node\",\n", + " template=JobTemplatePatch(\n", + " spec=JobSpecPatch(\n", + " template=PodTemplatePatch(\n", + " spec=PodSpecPatch(\n", + " volumes=[\n", + " {\"name\": \"work\", \"persistentVolumeClaim\": {\"claimName\": PVC_NAME}},\n", + " ],\n", + " containers=[\n", + " ContainerPatch(\n", + " name=\"node\",\n", + " volume_mounts=[\n", + " {\"name\": \"work\", \"mountPath\": \"/opt/app-root/src\", \"readOnly\": False},\n", + " ],\n", + " )\n", + " ],\n", + " )\n", + " )\n", + " )\n", + " )\n", + " )\n", + " ]\n", + " )\n", + " )\n", " )\n", " )\n", " ],\n", @@ -512,4 +532,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/tests/trainer/resources/rhai_features.ipynb b/tests/trainer/resources/rhai_features.ipynb index 1538aa6cc..c8371e4d5 100644 --- a/tests/trainer/resources/rhai_features.ipynb +++ b/tests/trainer/resources/rhai_features.ipynb @@ -359,7 +359,18 @@ "outputs": [], "source": [ "from kubeflow.trainer.rhai.transformers import TransformersTrainer\n", - "from kubeflow.trainer.options import PodTemplateOverrides, PodTemplateOverride, PodSpecOverride, ContainerOverride\n", + "from kubeflow.trainer.options import (\n", + " RuntimePatch,\n", + " TrainingRuntimeSpecPatch,\n", + " JobSetTemplatePatch,\n", + " JobSetSpecPatch,\n", + " ReplicatedJobPatch,\n", + " JobTemplatePatch,\n", + " JobSpecPatch,\n", + " PodTemplatePatch,\n", + " PodSpecPatch,\n", + " ContainerPatch\n", + ")\n", "import os\n", "\n", "# Read feature flags from environment\n", @@ -436,7 +447,7 @@ " # (SDK bug: verification treats S3 prefix as a file)\n", " trainer_kwargs[\"verify_cloud_storage_access\"] = False\n", "\n", - "# Build volumes and volume mounts for pod template overrides\n", + "# Build volumes and volume mounts for runtime patches\n", "volumes = [\n", " {\"name\": \"workspace\", \"persistentVolumeClaim\": {\"claimName\": shared_pvc_name}},\n", "]\n", @@ -453,15 +464,29 @@ " trainer=TransformersTrainer(**trainer_kwargs),\n", " runtime=torch_runtime,\n", " options=[\n", - " PodTemplateOverrides(\n", - " PodTemplateOverride(\n", - " target_jobs=[\"node\"],\n", - " spec=PodSpecOverride(\n", - " volumes=volumes,\n", - " containers=[ContainerOverride(\n", - " name=\"node\",\n", - " volume_mounts=volume_mounts\n", - " )]\n", + " RuntimePatch(\n", + " training_runtime_spec=TrainingRuntimeSpecPatch(\n", + " template=JobSetTemplatePatch(\n", + " spec=JobSetSpecPatch(\n", + " replicated_jobs=[\n", + " ReplicatedJobPatch(\n", + " name=\"node\",\n", + " template=JobTemplatePatch(\n", + " spec=JobSpecPatch(\n", + " template=PodTemplatePatch(\n", + " spec=PodSpecPatch(\n", + " volumes=volumes,\n", + " containers=[ContainerPatch(\n", + " name=\"node\",\n", + " volume_mounts=volume_mounts\n", + " )]\n", + " )\n", + " )\n", + " )\n", + " )\n", + " )\n", + " ]\n", + " )\n", " )\n", " )\n", " )\n", @@ -530,4 +555,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/tests/trainer/resources/rhai_features_deepspeed_stage0.ipynb b/tests/trainer/resources/rhai_features_deepspeed_stage0.ipynb index beff338a6..85dc47c3f 100644 --- a/tests/trainer/resources/rhai_features_deepspeed_stage0.ipynb +++ b/tests/trainer/resources/rhai_features_deepspeed_stage0.ipynb @@ -374,7 +374,18 @@ "outputs": [], "source": [ "from kubeflow.trainer.rhai.transformers import TransformersTrainer\n", - "from kubeflow.trainer.options import PodTemplateOverrides, PodTemplateOverride, PodSpecOverride, ContainerOverride\n", + "from kubeflow.trainer.options import (\n", + " RuntimePatch,\n", + " TrainingRuntimeSpecPatch,\n", + " JobSetTemplatePatch,\n", + " JobSetSpecPatch,\n", + " ReplicatedJobPatch,\n", + " JobTemplatePatch,\n", + " JobSpecPatch,\n", + " PodTemplatePatch,\n", + " PodSpecPatch,\n", + " ContainerPatch\n", + ")\n", "import os\n", "\n", "# Read feature flags from environment\n", @@ -453,23 +464,37 @@ " trainer=TransformersTrainer(**trainer_kwargs),\n", " runtime=torch_runtime,\n", " options=[\n", - " PodTemplateOverrides(\n", - " PodTemplateOverride(\n", - " target_jobs=[\"node\"],\n", - " spec=PodSpecOverride(\n", - " volumes=[\n", - " {\"name\": \"workspace\", \"persistentVolumeClaim\": {\"claimName\": shared_pvc_name}},\n", - " # Add larger /dev/shm for DeepSpeed/NCCL (fixes \"No space left on device\" error)\n", - " # Note: Some SDK versions may not support sizeLimit - if validation fails, remove sizeLimit\n", - " {\"name\": \"dshm\", \"emptyDir\": {\"medium\": \"Memory\"}} # sizeLimit may cause ValidationError in some SDK versions\n", - " ],\n", - " containers=[ContainerOverride(\n", - " name=\"node\",\n", - " volume_mounts=[\n", - " {\"name\": \"workspace\", \"mountPath\": \"/workspace\"},\n", - " {\"name\": \"dshm\", \"mountPath\": \"/dev/shm\"}\n", + " RuntimePatch(\n", + " training_runtime_spec=TrainingRuntimeSpecPatch(\n", + " template=JobSetTemplatePatch(\n", + " spec=JobSetSpecPatch(\n", + " replicated_jobs=[\n", + " ReplicatedJobPatch(\n", + " name=\"node\",\n", + " template=JobTemplatePatch(\n", + " spec=JobSpecPatch(\n", + " template=PodTemplatePatch(\n", + " spec=PodSpecPatch(\n", + " volumes=[\n", + " {\"name\": \"workspace\", \"persistentVolumeClaim\": {\"claimName\": shared_pvc_name}},\n", + " # Add larger /dev/shm for DeepSpeed/NCCL (fixes \"No space left on device\" error)\n", + " # Note: Some SDK versions may not support sizeLimit - if validation fails, remove sizeLimit\n", + " {\"name\": \"dshm\", \"emptyDir\": {\"medium\": \"Memory\"}} # sizeLimit may cause ValidationError in some SDK versions\n", + " ],\n", + " containers=[ContainerPatch(\n", + " name=\"node\",\n", + " volume_mounts=[\n", + " {\"name\": \"workspace\", \"mountPath\": \"/workspace\"},\n", + " {\"name\": \"dshm\", \"mountPath\": \"/dev/shm\"}\n", + " ]\n", + " )]\n", + " )\n", + " )\n", + " )\n", + " )\n", + " )\n", " ]\n", - " )]\n", + " )\n", " )\n", " )\n", " )\n", @@ -538,4 +563,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} \ No newline at end of file +} diff --git a/tests/trainer/resources/rhai_features_fsdp_full_state.ipynb b/tests/trainer/resources/rhai_features_fsdp_full_state.ipynb index 2e8a3f153..85549735e 100644 --- a/tests/trainer/resources/rhai_features_fsdp_full_state.ipynb +++ b/tests/trainer/resources/rhai_features_fsdp_full_state.ipynb @@ -355,7 +355,18 @@ "outputs": [], "source": [ "from kubeflow.trainer.rhai.transformers import TransformersTrainer\n", - "from kubeflow.trainer.options import PodTemplateOverrides, PodTemplateOverride, PodSpecOverride, ContainerOverride\n", + "from kubeflow.trainer.options import (\n", + " RuntimePatch,\n", + " TrainingRuntimeSpecPatch,\n", + " JobSetTemplatePatch,\n", + " JobSetSpecPatch,\n", + " ReplicatedJobPatch,\n", + " JobTemplatePatch,\n", + " JobSpecPatch,\n", + " PodTemplatePatch,\n", + " PodSpecPatch,\n", + " ContainerPatch\n", + ")\n", "import os\n", "\n", "# Read feature flags from environment\n", @@ -434,15 +445,29 @@ " trainer=TransformersTrainer(**trainer_kwargs),\n", " runtime=torch_runtime,\n", " options=[\n", - " PodTemplateOverrides(\n", - " PodTemplateOverride(\n", - " target_jobs=[\"node\"],\n", - " spec=PodSpecOverride(\n", - " volumes=[{\"name\": \"workspace\", \"persistentVolumeClaim\": {\"claimName\": shared_pvc_name}}],\n", - " containers=[ContainerOverride(\n", - " name=\"node\",\n", - " volume_mounts=[{\"name\": \"workspace\", \"mountPath\": \"/workspace\"}]\n", - " )]\n", + " RuntimePatch(\n", + " training_runtime_spec=TrainingRuntimeSpecPatch(\n", + " template=JobSetTemplatePatch(\n", + " spec=JobSetSpecPatch(\n", + " replicated_jobs=[\n", + " ReplicatedJobPatch(\n", + " name=\"node\",\n", + " template=JobTemplatePatch(\n", + " spec=JobSpecPatch(\n", + " template=PodTemplatePatch(\n", + " spec=PodSpecPatch(\n", + " volumes=[{\"name\": \"workspace\", \"persistentVolumeClaim\": {\"claimName\": shared_pvc_name}}],\n", + " containers=[ContainerPatch(\n", + " name=\"node\",\n", + " volume_mounts=[{\"name\": \"workspace\", \"mountPath\": \"/workspace\"}]\n", + " )]\n", + " )\n", + " )\n", + " )\n", + " )\n", + " )\n", + " ]\n", + " )\n", " )\n", " )\n", " )\n", @@ -511,4 +536,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/tests/trainer/resources/rhai_features_fsdp_shared_state.ipynb b/tests/trainer/resources/rhai_features_fsdp_shared_state.ipynb index 0c9c43ecb..5c9f6a10f 100644 --- a/tests/trainer/resources/rhai_features_fsdp_shared_state.ipynb +++ b/tests/trainer/resources/rhai_features_fsdp_shared_state.ipynb @@ -363,7 +363,18 @@ "outputs": [], "source": [ "from kubeflow.trainer.rhai.transformers import TransformersTrainer\n", - "from kubeflow.trainer.options import PodTemplateOverrides, PodTemplateOverride, PodSpecOverride, ContainerOverride\n", + "from kubeflow.trainer.options import (\n", + " RuntimePatch,\n", + " TrainingRuntimeSpecPatch,\n", + " JobSetTemplatePatch,\n", + " JobSetSpecPatch,\n", + " ReplicatedJobPatch,\n", + " JobTemplatePatch,\n", + " JobSpecPatch,\n", + " PodTemplatePatch,\n", + " PodSpecPatch,\n", + " ContainerPatch\n", + ")\n", "import os\n", "\n", "# Read feature flags from environment\n", @@ -442,23 +453,37 @@ " trainer=TransformersTrainer(**trainer_kwargs),\n", " runtime=torch_runtime,\n", " options=[\n", - " PodTemplateOverrides(\n", - " PodTemplateOverride(\n", - " target_jobs=[\"node\"],\n", - " spec=PodSpecOverride(\n", - " volumes=[\n", - " {\"name\": \"workspace\", \"persistentVolumeClaim\": {\"claimName\": shared_pvc_name}},\n", - " # Add larger /dev/shm for FSDP/NCCL (fixes \"No space left on device\" error)\n", - " # Note: Some SDK versions may not support sizeLimit - if validation fails, remove sizeLimit\n", - " {\"name\": \"dshm\", \"emptyDir\": {\"medium\": \"Memory\"}} # sizeLimit may cause ValidationError in some SDK versions\n", - " ],\n", - " containers=[ContainerOverride(\n", - " name=\"node\",\n", - " volume_mounts=[\n", - " {\"name\": \"workspace\", \"mountPath\": \"/workspace\"},\n", - " {\"name\": \"dshm\", \"mountPath\": \"/dev/shm\"}\n", + " RuntimePatch(\n", + " training_runtime_spec=TrainingRuntimeSpecPatch(\n", + " template=JobSetTemplatePatch(\n", + " spec=JobSetSpecPatch(\n", + " replicated_jobs=[\n", + " ReplicatedJobPatch(\n", + " name=\"node\",\n", + " template=JobTemplatePatch(\n", + " spec=JobSpecPatch(\n", + " template=PodTemplatePatch(\n", + " spec=PodSpecPatch(\n", + " volumes=[\n", + " {\"name\": \"workspace\", \"persistentVolumeClaim\": {\"claimName\": shared_pvc_name}},\n", + " # Add larger /dev/shm for FSDP/NCCL (fixes \"No space left on device\" error)\n", + " # Note: Some SDK versions may not support sizeLimit - if validation fails, remove sizeLimit\n", + " {\"name\": \"dshm\", \"emptyDir\": {\"medium\": \"Memory\"}} # sizeLimit may cause ValidationError in some SDK versions\n", + " ],\n", + " containers=[ContainerPatch(\n", + " name=\"node\",\n", + " volume_mounts=[\n", + " {\"name\": \"workspace\", \"mountPath\": \"/workspace\"},\n", + " {\"name\": \"dshm\", \"mountPath\": \"/dev/shm\"}\n", + " ]\n", + " )]\n", + " )\n", + " )\n", + " )\n", + " )\n", + " )\n", " ]\n", - " )]\n", + " )\n", " )\n", " )\n", " )\n", @@ -527,4 +552,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/tests/trainer/resources/sft.ipynb b/tests/trainer/resources/sft.ipynb index e8bb808fc..bf4645db6 100644 --- a/tests/trainer/resources/sft.ipynb +++ b/tests/trainer/resources/sft.ipynb @@ -452,11 +452,17 @@ "metadata": {}, "outputs": [], "source": [ - "from kubeflow.trainer.options.kubernetes import (\n", - " PodTemplateOverrides,\n", - " PodTemplateOverride,\n", - " PodSpecOverride,\n", - " ContainerOverride,\n", + "from kubeflow.trainer.options import (\n", + " RuntimePatch,\n", + " TrainingRuntimeSpecPatch,\n", + " JobSetTemplatePatch,\n", + " JobSetSpecPatch,\n", + " ReplicatedJobPatch,\n", + " JobTemplatePatch,\n", + " JobSpecPatch,\n", + " PodTemplatePatch,\n", + " PodSpecPatch,\n", + " ContainerPatch\n", ")\n", "\n", "cache_root = \"/opt/app-root/src/.cache/huggingface\"\n", @@ -479,21 +485,35 @@ " },\n", " ),\n", " options=[\n", - " PodTemplateOverrides(\n", - " PodTemplateOverride(\n", - " target_jobs=[\"node\"],\n", - " spec=PodSpecOverride(\n", - " volumes=[\n", - " {\"name\": \"work\", \"persistentVolumeClaim\": {\"claimName\": PVC_NAME}},\n", - " ],\n", - " containers=[\n", - " ContainerOverride(\n", - " name=\"node\", \n", - " volume_mounts=[\n", - " {\"name\": \"work\", \"mountPath\": \"/opt/app-root/src\", \"readOnly\": False},\n", - " ],\n", - " )\n", - " ],\n", + " RuntimePatch(\n", + " training_runtime_spec=TrainingRuntimeSpecPatch(\n", + " template=JobSetTemplatePatch(\n", + " spec=JobSetSpecPatch(\n", + " replicated_jobs=[\n", + " ReplicatedJobPatch(\n", + " name=\"node\",\n", + " template=JobTemplatePatch(\n", + " spec=JobSpecPatch(\n", + " template=PodTemplatePatch(\n", + " spec=PodSpecPatch(\n", + " volumes=[\n", + " {\"name\": \"work\", \"persistentVolumeClaim\": {\"claimName\": PVC_NAME}},\n", + " ],\n", + " containers=[\n", + " ContainerPatch(\n", + " name=\"node\", \n", + " volume_mounts=[\n", + " {\"name\": \"work\", \"mountPath\": \"/opt/app-root/src\", \"readOnly\": False},\n", + " ],\n", + " )\n", + " ],\n", + " )\n", + " )\n", + " )\n", + " )\n", + " )\n", + " ]\n", + " )\n", " ),\n", " )\n", " )\n", @@ -602,4 +622,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} diff --git a/tests/trainer/resources/torchrun_failure.ipynb b/tests/trainer/resources/torchrun_failure.ipynb index 8c6435632..c0f247218 100644 --- a/tests/trainer/resources/torchrun_failure.ipynb +++ b/tests/trainer/resources/torchrun_failure.ipynb @@ -293,11 +293,17 @@ "# The model loads, data loads, torchrun starts the training loop, and then\n", "# the first batch allocation exceeds GPU memory.\n", "\n", - "from kubeflow.trainer.options.kubernetes import (\n", - " PodTemplateOverrides,\n", - " PodTemplateOverride,\n", - " PodSpecOverride,\n", - " ContainerOverride,\n", + "from kubeflow.trainer.options import (\n", + " RuntimePatch,\n", + " TrainingRuntimeSpecPatch,\n", + " JobSetTemplatePatch,\n", + " JobSetSpecPatch,\n", + " ReplicatedJobPatch,\n", + " JobTemplatePatch,\n", + " JobSpecPatch,\n", + " PodTemplatePatch,\n", + " PodSpecPatch,\n", + " ContainerPatch\n", ")\n", "\n", "LOCAL_MODEL_PATH = \"/opt/app-root/src/Qwen/Qwen2.5-1.5B-Instruct\"\n", @@ -336,21 +342,35 @@ " },\n", " ),\n", " options=[\n", - " PodTemplateOverrides(\n", - " PodTemplateOverride(\n", - " target_jobs=[\"node\"],\n", - " spec=PodSpecOverride(\n", - " volumes=[\n", - " {\"name\": \"work\", \"persistentVolumeClaim\": {\"claimName\": PVC_NAME}},\n", - " ],\n", - " containers=[\n", - " ContainerOverride(\n", - " name=\"node\",\n", - " volume_mounts=[\n", - " {\"name\": \"work\", \"mountPath\": \"/opt/app-root/src\", \"readOnly\": False},\n", - " ],\n", - " )\n", - " ],\n", + " RuntimePatch(\n", + " training_runtime_spec=TrainingRuntimeSpecPatch(\n", + " template=JobSetTemplatePatch(\n", + " spec=JobSetSpecPatch(\n", + " replicated_jobs=[\n", + " ReplicatedJobPatch(\n", + " name=\"node\",\n", + " template=JobTemplatePatch(\n", + " spec=JobSpecPatch(\n", + " template=PodTemplatePatch(\n", + " spec=PodSpecPatch(\n", + " volumes=[\n", + " {\"name\": \"work\", \"persistentVolumeClaim\": {\"claimName\": PVC_NAME}},\n", + " ],\n", + " containers=[\n", + " ContainerPatch(\n", + " name=\"node\",\n", + " volume_mounts=[\n", + " {\"name\": \"work\", \"mountPath\": \"/opt/app-root/src\", \"readOnly\": False},\n", + " ],\n", + " )\n", + " ],\n", + " )\n", + " )\n", + " )\n", + " )\n", + " )\n", + " ]\n", + " )\n", " ),\n", " )\n", " )\n", @@ -482,4 +502,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} From d75b42367d8508f23009c195998592dd5294787e Mon Sep 17 00:00:00 2001 From: Hari Haran Rathinakumar Date: Thu, 7 May 2026 17:58:15 +0100 Subject: [PATCH 2/3] fix: remove trailing newlines from TrainingHub notebooks --- tests/trainer/resources/lora.ipynb | 2 +- tests/trainer/resources/mnist.ipynb | 2 +- tests/trainer/resources/osft.ipynb | 2 +- tests/trainer/resources/sft.ipynb | 2 +- tests/trainer/resources/torchrun_failure.ipynb | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/trainer/resources/lora.ipynb b/tests/trainer/resources/lora.ipynb index 4c848cc02..370724101 100644 --- a/tests/trainer/resources/lora.ipynb +++ b/tests/trainer/resources/lora.ipynb @@ -604,4 +604,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file diff --git a/tests/trainer/resources/mnist.ipynb b/tests/trainer/resources/mnist.ipynb index b006e652e..0a5f5345b 100644 --- a/tests/trainer/resources/mnist.ipynb +++ b/tests/trainer/resources/mnist.ipynb @@ -577,4 +577,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file diff --git a/tests/trainer/resources/osft.ipynb b/tests/trainer/resources/osft.ipynb index 5e8d3f2b7..71036315c 100644 --- a/tests/trainer/resources/osft.ipynb +++ b/tests/trainer/resources/osft.ipynb @@ -532,4 +532,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file diff --git a/tests/trainer/resources/sft.ipynb b/tests/trainer/resources/sft.ipynb index bf4645db6..0f67fb579 100644 --- a/tests/trainer/resources/sft.ipynb +++ b/tests/trainer/resources/sft.ipynb @@ -622,4 +622,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/tests/trainer/resources/torchrun_failure.ipynb b/tests/trainer/resources/torchrun_failure.ipynb index c0f247218..5b47eeaf6 100644 --- a/tests/trainer/resources/torchrun_failure.ipynb +++ b/tests/trainer/resources/torchrun_failure.ipynb @@ -502,4 +502,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file From 58f00c0502b143662234eb01ae2cd33a3bf4b2bc Mon Sep 17 00:00:00 2001 From: Hari Haran Rathinakumar Date: Thu, 7 May 2026 17:59:20 +0100 Subject: [PATCH 3/3] fix: remove trailing newlines from RHAI notebooks --- tests/trainer/resources/rhai_features.ipynb | 2 +- tests/trainer/resources/rhai_features_deepspeed_stage0.ipynb | 2 +- tests/trainer/resources/rhai_features_fsdp_full_state.ipynb | 2 +- tests/trainer/resources/rhai_features_fsdp_shared_state.ipynb | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/trainer/resources/rhai_features.ipynb b/tests/trainer/resources/rhai_features.ipynb index c8371e4d5..0211afa31 100644 --- a/tests/trainer/resources/rhai_features.ipynb +++ b/tests/trainer/resources/rhai_features.ipynb @@ -555,4 +555,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file diff --git a/tests/trainer/resources/rhai_features_deepspeed_stage0.ipynb b/tests/trainer/resources/rhai_features_deepspeed_stage0.ipynb index 85dc47c3f..c32bfd1bc 100644 --- a/tests/trainer/resources/rhai_features_deepspeed_stage0.ipynb +++ b/tests/trainer/resources/rhai_features_deepspeed_stage0.ipynb @@ -563,4 +563,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} +} \ No newline at end of file diff --git a/tests/trainer/resources/rhai_features_fsdp_full_state.ipynb b/tests/trainer/resources/rhai_features_fsdp_full_state.ipynb index 85549735e..0ea2b0ff8 100644 --- a/tests/trainer/resources/rhai_features_fsdp_full_state.ipynb +++ b/tests/trainer/resources/rhai_features_fsdp_full_state.ipynb @@ -536,4 +536,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file diff --git a/tests/trainer/resources/rhai_features_fsdp_shared_state.ipynb b/tests/trainer/resources/rhai_features_fsdp_shared_state.ipynb index 5c9f6a10f..6b33ac33f 100644 --- a/tests/trainer/resources/rhai_features_fsdp_shared_state.ipynb +++ b/tests/trainer/resources/rhai_features_fsdp_shared_state.ipynb @@ -552,4 +552,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file