Skip to content
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 42 additions & 22 deletions tests/trainer/resources/lora.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -438,11 +438,17 @@
"metadata": {},
"outputs": [],
"source": [
"from kubeflow.trainer.options.kubernetes import (\n",
" PodTemplateOverrides,\n",
" PodTemplateOverride,\n",
" PodSpecOverride,\n",
" ContainerOverride,\n",
"from kubeflow.trainer.options import (\n",
" RuntimePatch,\n",
" TrainingRuntimeSpecPatch,\n",
" JobSetTemplatePatch,\n",
" JobSetSpecPatch,\n",
" ReplicatedJobPatch,\n",
" JobTemplatePatch,\n",
" JobSpecPatch,\n",
" PodTemplatePatch,\n",
" PodSpecPatch,\n",
" ContainerPatch\n",
")\n",
"\n",
"cache_root = \"/opt/app-root/src/.cache/huggingface\"\n",
Expand All @@ -465,22 +471,36 @@
" },\n",
" ),\n",
" options=[\n",
" PodTemplateOverrides(\n",
" PodTemplateOverride(\n",
" target_jobs=[\"node\"],\n",
" spec=PodSpecOverride(\n",
" volumes=[\n",
" {\"name\": \"work\", \"persistentVolumeClaim\": {\"claimName\": PVC_NAME}},\n",
" ],\n",
" containers=[\n",
" ContainerOverride(\n",
" name=\"node\",\n",
" volume_mounts=[\n",
" {\"name\": \"work\", \"mountPath\": \"/opt/app-root/src\", \"readOnly\": False},\n",
" ],\n",
" )\n",
" ],\n",
" ),\n",
" RuntimePatch(\n",
" training_runtime_spec=TrainingRuntimeSpecPatch(\n",
" template=JobSetTemplatePatch(\n",
" spec=JobSetSpecPatch(\n",
" replicated_jobs=[\n",
" ReplicatedJobPatch(\n",
" name=\"node\",\n",
" template=JobTemplatePatch(\n",
" spec=JobSpecPatch(\n",
" template=PodTemplatePatch(\n",
" spec=PodSpecPatch(\n",
" volumes=[\n",
" {\"name\": \"work\", \"persistentVolumeClaim\": {\"claimName\": PVC_NAME}},\n",
" ],\n",
" containers=[\n",
" ContainerPatch(\n",
" name=\"node\",\n",
" volume_mounts=[\n",
" {\"name\": \"work\", \"mountPath\": \"/opt/app-root/src\", \"readOnly\": False},\n",
" ],\n",
" )\n",
" ],\n",
" )\n",
" )\n",
" )\n",
" )\n",
" )\n",
" ]\n",
" )\n",
" )\n",
" )\n",
" )\n",
" ],\n",
Expand Down Expand Up @@ -584,4 +604,4 @@
},
"nbformat": 4,
"nbformat_minor": 4
}
}
66 changes: 46 additions & 20 deletions tests/trainer/resources/mnist.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -383,7 +383,19 @@
"source": [
"import os\n",
"from kubeflow.trainer import CustomTrainer\n",
"from kubeflow.trainer.options import PodTemplateOverrides, PodTemplateOverride, PodSpecOverride, ContainerOverride, Labels\n",
"from kubeflow.trainer.options import (\n",
" RuntimePatch,\n",
" TrainingRuntimeSpecPatch,\n",
" JobSetTemplatePatch,\n",
" JobSetSpecPatch,\n",
" ReplicatedJobPatch,\n",
" JobTemplatePatch,\n",
" JobSpecPatch,\n",
" PodTemplatePatch,\n",
" PodSpecPatch,\n",
" ContainerPatch,\n",
" Labels\n",
")\n",
"\n",
"pvc_name = os.getenv(\"SHARED_PVC_NAME\", \"\")\n",
"print(f\"[notebook] Using PVC: {pvc_name}\")\n",
Expand All @@ -408,24 +420,38 @@
" ),\n",
" runtime=torch_runtime,\n",
" options=[\n",
" PodTemplateOverrides(\n",
" PodTemplateOverride(\n",
" target_jobs=[\"node\"],\n",
" spec=PodSpecOverride(\n",
" volumes=[\n",
" {\n",
" \"name\": \"work\",\n",
" \"persistentVolumeClaim\": {\"claimName\": pvc_name},\n",
" }\n",
" ],\n",
" containers=[\n",
" ContainerOverride(\n",
" name=\"node\",\n",
" volume_mounts=[\n",
" {\"name\": \"work\", \"mountPath\": \"/mnt/shared\", \"readOnly\": False}\n",
" ],\n",
" )\n",
" ],\n",
" RuntimePatch(\n",
" training_runtime_spec=TrainingRuntimeSpecPatch(\n",
" template=JobSetTemplatePatch(\n",
" spec=JobSetSpecPatch(\n",
" replicated_jobs=[\n",
" ReplicatedJobPatch(\n",
" name=\"node\",\n",
" template=JobTemplatePatch(\n",
" spec=JobSpecPatch(\n",
" template=PodTemplatePatch(\n",
" spec=PodSpecPatch(\n",
" volumes=[\n",
" {\n",
" \"name\": \"work\",\n",
" \"persistentVolumeClaim\": {\"claimName\": pvc_name},\n",

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor | ⚡ Quick win

pvc_name is used without an emptiness guard; an empty value produces an opaque Kubernetes API error.

pvc_name = os.getenv("SHARED_PVC_NAME", "") (line 400) defaults to "". The changed code at line 437 injects it directly as claimName. Other notebooks in the same repo (e.g., lora.ipynb) validate the PVC name before use.

 pvc_name = os.getenv("SHARED_PVC_NAME", "")
 print(f"[notebook] Using PVC: {pvc_name}")
+if not pvc_name:
+    raise RuntimeError("SHARED_PVC_NAME environment variable is required")
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@tests/trainer/resources/mnist.ipynb` at line 437, The PVC name pvc_name
(assigned from os.getenv("SHARED_PVC_NAME", "")) is injected unguarded into the
k8s manifest as claimName which causes an opaque API error when empty; update
the manifest assembly in mnist.ipynb to only include the
"persistentVolumeClaim": {"claimName": pvc_name} entry when pvc_name is
non-empty (truthy) or omit the entire volumeMount/volume entry otherwise—locate
the code that constructs the dict containing "persistentVolumeClaim" and wrap or
conditionally add that key based on pvc_name to mirror the validation used in
lora.ipynb.

" }\n",
" ],\n",
" containers=[\n",
" ContainerPatch(\n",
" name=\"node\",\n",
" volume_mounts=[\n",
" {\"name\": \"work\", \"mountPath\": \"/mnt/shared\", \"readOnly\": False}\n",
" ],\n",
" )\n",
" ],\n",
" )\n",
" )\n",
" )\n",
" )\n",
" )\n",
" ]\n",
" )\n",
" )\n",
" )\n",
" )\n",
Expand Down Expand Up @@ -551,4 +577,4 @@
},
"nbformat": 4,
"nbformat_minor": 4
}
}
64 changes: 42 additions & 22 deletions tests/trainer/resources/osft.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -366,11 +366,17 @@
"outputs": [],
"source": [
"\n",
"from kubeflow.trainer.options.kubernetes import (\n",
" PodTemplateOverrides,\n",
" PodTemplateOverride,\n",
" PodSpecOverride,\n",
" ContainerOverride,\n",
"from kubeflow.trainer.options import (\n",
" RuntimePatch,\n",
" TrainingRuntimeSpecPatch,\n",
" JobSetTemplatePatch,\n",
" JobSetSpecPatch,\n",
" ReplicatedJobPatch,\n",
" JobTemplatePatch,\n",
" JobSpecPatch,\n",
" PodTemplatePatch,\n",
" PodSpecPatch,\n",
" ContainerPatch\n",
")\n",
"\n",
"cache_root = \"/opt/app-root/src/.cache/huggingface\"\n",
Expand All @@ -393,22 +399,36 @@
" },\n",
" ),\n",
" options=[\n",
" PodTemplateOverrides(\n",
" PodTemplateOverride(\n",
" target_jobs=[\"node\"],\n",
" spec=PodSpecOverride(\n",
" volumes=[\n",
" {\"name\": \"work\", \"persistentVolumeClaim\": {\"claimName\": PVC_NAME}},\n",
" ],\n",
" containers=[\n",
" ContainerOverride(\n",
" name=\"node\",\n",
" volume_mounts=[\n",
" {\"name\": \"work\", \"mountPath\": \"/opt/app-root/src\", \"readOnly\": False},\n",
" ],\n",
" )\n",
" ],\n",
" ),\n",
" RuntimePatch(\n",
" training_runtime_spec=TrainingRuntimeSpecPatch(\n",
" template=JobSetTemplatePatch(\n",
" spec=JobSetSpecPatch(\n",
" replicated_jobs=[\n",
" ReplicatedJobPatch(\n",
" name=\"node\",\n",
" template=JobTemplatePatch(\n",
" spec=JobSpecPatch(\n",
" template=PodTemplatePatch(\n",
" spec=PodSpecPatch(\n",
" volumes=[\n",
" {\"name\": \"work\", \"persistentVolumeClaim\": {\"claimName\": PVC_NAME}},\n",
" ],\n",
" containers=[\n",
" ContainerPatch(\n",
" name=\"node\",\n",
" volume_mounts=[\n",
" {\"name\": \"work\", \"mountPath\": \"/opt/app-root/src\", \"readOnly\": False},\n",
" ],\n",
" )\n",
" ],\n",
" )\n",
" )\n",
" )\n",
" )\n",
" )\n",
" ]\n",
" )\n",
" )\n",
" )\n",
" )\n",
" ],\n",
Expand Down Expand Up @@ -512,4 +532,4 @@
},
"nbformat": 4,
"nbformat_minor": 4
}
}
49 changes: 37 additions & 12 deletions tests/trainer/resources/rhai_features.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -359,7 +359,18 @@
"outputs": [],
"source": [
"from kubeflow.trainer.rhai.transformers import TransformersTrainer\n",
"from kubeflow.trainer.options import PodTemplateOverrides, PodTemplateOverride, PodSpecOverride, ContainerOverride\n",
"from kubeflow.trainer.options import (\n",
" RuntimePatch,\n",
" TrainingRuntimeSpecPatch,\n",
" JobSetTemplatePatch,\n",
" JobSetSpecPatch,\n",
" ReplicatedJobPatch,\n",
" JobTemplatePatch,\n",
" JobSpecPatch,\n",
" PodTemplatePatch,\n",
" PodSpecPatch,\n",
" ContainerPatch\n",
")\n",
"import os\n",
"\n",
"# Read feature flags from environment\n",
Expand Down Expand Up @@ -436,7 +447,7 @@
" # (SDK bug: verification treats S3 prefix as a file)\n",
" trainer_kwargs[\"verify_cloud_storage_access\"] = False\n",
"\n",
"# Build volumes and volume mounts for pod template overrides\n",
"# Build volumes and volume mounts for runtime patches\n",
"volumes = [\n",
" {\"name\": \"workspace\", \"persistentVolumeClaim\": {\"claimName\": shared_pvc_name}},\n",
"]\n",
Expand All @@ -453,15 +464,29 @@
" trainer=TransformersTrainer(**trainer_kwargs),\n",
" runtime=torch_runtime,\n",
" options=[\n",
" PodTemplateOverrides(\n",
" PodTemplateOverride(\n",
" target_jobs=[\"node\"],\n",
" spec=PodSpecOverride(\n",
" volumes=volumes,\n",
" containers=[ContainerOverride(\n",
" name=\"node\",\n",
" volume_mounts=volume_mounts\n",
" )]\n",
" RuntimePatch(\n",
" training_runtime_spec=TrainingRuntimeSpecPatch(\n",
" template=JobSetTemplatePatch(\n",
" spec=JobSetSpecPatch(\n",
" replicated_jobs=[\n",
" ReplicatedJobPatch(\n",
" name=\"node\",\n",
" template=JobTemplatePatch(\n",
" spec=JobSpecPatch(\n",
" template=PodTemplatePatch(\n",
" spec=PodSpecPatch(\n",
" volumes=volumes,\n",
" containers=[ContainerPatch(\n",
" name=\"node\",\n",
" volume_mounts=volume_mounts\n",
" )]\n",
" )\n",
" )\n",
" )\n",
" )\n",
" )\n",
" ]\n",
" )\n",
" )\n",
" )\n",
" )\n",
Expand Down Expand Up @@ -530,4 +555,4 @@
},
"nbformat": 4,
"nbformat_minor": 4
}
}
Loading