
Commit 4bcadf4

feat(runtimes): add support for ClusterTrainingRuntimes in Helm chart (#3124)
* feat(runtimes): add support for ClusterTrainingRuntimes in Helm chart
* fix: remove initializerImage from user-configurable values
* chore: regenerate README with helm-docs
* fix: address Copilot review suggestions
* feat: Introduce helper to centralize image tag resolution
* refactor: nest cache image configuration and update copyright year
* chore: run make generate to sync
* feat: add TorchTune distributed runtime, image helper usage, and default runtimes enabled flag
* feat: enable runtime via a new default flag and add a comment
* refactor: Torchtune runtimes to use model specific configurations, add default enabled option
* fix: update README and fix trailing whitespace for CI
* refactor: relocate runtime template and update its configuration
* feat: add JAX distributed training support and update runtime configurations
* fix(docs): remove JAX runtime from default runtimes in README
* fix(docs): update default runtimes in README and values.yaml to include JAX
* fix(workflows): increase Papermill timeout for e2e tests in GPU cluster
* fix(notebooks): add missing newline at end of qwen2.5-1.5B-with-alpaca.ipynb

---------

Signed-off-by: khushiiagrawal <khushisaritaagrawal@gmail.com>
Signed-off-by: Khushi Agrawal <149886195+khushiiagrawal@users.noreply.github.com>
1 parent 7a14ec5 commit 4bcadf4

18 files changed: +1625 −258 lines

.github/workflows/test-e2e-gpu.yaml

Lines changed: 2 additions & 2 deletions

```diff
@@ -56,8 +56,8 @@ jobs:
       - name: Run e2e test on GPU cluster
         run: |
           mkdir -p artifacts/notebooks
-          make test-e2e-notebook NOTEBOOK_INPUT=./examples/torchtune/qwen2_5/qwen2.5-1.5B-with-alpaca.ipynb NOTEBOOK_OUTPUT=./artifacts/notebooks/${{ matrix.kubernetes-version }}_qwen2_5_with_alpaca-trainjob-yaml.ipynb TIMEOUT=600
-          make test-e2e-notebook NOTEBOOK_INPUT=./examples/jax/image-classification/mnist.ipynb NOTEBOOK_OUTPUT=./artifacts/notebooks/${{ matrix.kubernetes-version }}_jax_mnist.ipynb PAPERMILL_PARAMS="-p num_cpu 8 -p num_gpu 1 -p num_nodes 1" TIMEOUT=600
+          make test-e2e-notebook NOTEBOOK_INPUT=./examples/torchtune/qwen2_5/qwen2.5-1.5B-with-alpaca.ipynb NOTEBOOK_OUTPUT=./artifacts/notebooks/${{ matrix.kubernetes-version }}_qwen2_5_with_alpaca-trainjob-yaml.ipynb PAPERMILL_TIMEOUT=1800
+          make test-e2e-notebook NOTEBOOK_INPUT=./examples/jax/image-classification/mnist.ipynb NOTEBOOK_OUTPUT=./artifacts/notebooks/${{ matrix.kubernetes-version }}_jax_mnist.ipynb PAPERMILL_PARAMS="-p num_cpu 8 -p num_gpu 1 -p num_nodes 1" PAPERMILL_TIMEOUT=1800
 
       - name: Upload Artifacts to GitHub
         if: always()
```
charts/kubeflow-trainer/README.md

Lines changed: 89 additions & 0 deletions

````diff
@@ -31,6 +31,64 @@ Alternatively, you can install the latest version from the master branch (e.g. `
 helm install kubeflow-trainer oci://ghcr.io/kubeflow/charts/kubeflow-trainer --version 0.0.0-sha-bfccb7b
 ```
 
+### Install with ClusterTrainingRuntimes
+
+You can optionally deploy ClusterTrainingRuntimes as part of the Helm installation. Runtimes are disabled by default to keep the chart lightweight.
+
+To enable all default runtimes (torch, deepspeed, mlx, torchtune):
+
+```bash
+helm install kubeflow-trainer oci://ghcr.io/kubeflow/charts/kubeflow-trainer \
+  --version 2.1.0 \
+  --set runtimes.defaultEnabled=true
+```
+
+To enable specific runtimes:
+
+```bash
+helm install kubeflow-trainer oci://ghcr.io/kubeflow/charts/kubeflow-trainer \
+  --version 2.1.0 \
+  --set runtimes.torchDistributed.enabled=true \
+  --set runtimes.deepspeedDistributed.enabled=true
+```
+
+Or use a custom values file:
+
+```yaml
+# values.yaml
+runtimes:
+  torchDistributed:
+    enabled: true
+  deepspeedDistributed:
+    enabled: true
+  mlxDistributed:
+    enabled: true
+
+# For torch-distributed-with-cache, enable both dataCache.enabled and dataCache.runtimes.torchDistributed.enabled
+dataCache:
+  enabled: true
+  cacheImage:
+    tag: "v2.0.0"
+  runtimes:
+    torchDistributed:
+      enabled: true
+```
+
+Then install with:
+
+```bash
+helm install kubeflow-trainer oci://ghcr.io/kubeflow/charts/kubeflow-trainer \
+  --version 2.1.0 \
+  -f values.yaml
+```
+
+### Available Runtimes
+
+- **torch-distributed**: PyTorch distributed training (no custom images)
+- **torch-distributed-with-cache**: PyTorch with distributed data cache support (requires `dataCache.enabled=true`)
+- **deepspeed-distributed**: DeepSpeed distributed training with MPI
+- **mlx-distributed**: MLX distributed training with MPI
+
 ### Uninstall the chart
 
 ```shell
@@ -72,6 +130,37 @@ See [helm uninstall](https://helm.sh/docs/helm/helm_uninstall) for command docum
 | dataCache.enabled | bool | `false` | Enable/disable data cache support (LWS dependency, ClusterRole). Set to `true` to install data cache components. |
 | dataCache.lws.install | bool | `true` | Whether to install LeaderWorkerSet as a dependency. Set to `false` if LeaderWorkerSet is already installed in the cluster. |
 | dataCache.lws.fullnameOverride | string | `"lws"` | String to fully override LeaderWorkerSet release name. |
+| dataCache.cacheImage.registry | string | `"ghcr.io"` | Data cache image registry |
+| dataCache.cacheImage.repository | string | `"kubeflow/trainer/data-cache"` | Data cache image repository |
+| dataCache.cacheImage.tag | string | `""` | Data cache image tag. Defaults to chart version if empty. |
+| dataCache.runtimes.torchDistributed | object | `{"enabled":false}` | PyTorch distributed training with data cache support |
+| dataCache.runtimes.torchDistributed.enabled | bool | `false` | Enable deployment of torch-distributed-with-cache runtime |
+| runtimes | object | `{"deepspeedDistributed":{"enabled":false,"image":{"registry":"ghcr.io","repository":"kubeflow/trainer/deepspeed-runtime","tag":""}},"defaultEnabled":false,"jaxDistributed":{"enabled":false},"mlxDistributed":{"enabled":false,"image":{"registry":"ghcr.io","repository":"kubeflow/trainer/mlx-runtime","tag":""}},"torchDistributed":{"enabled":false},"torchtuneDistributed":{"image":{"registry":"ghcr.io","repository":"kubeflow/trainer/torchtune-trainer","tag":""},"llama3_2_1B":{"enabled":false},"llama3_2_3B":{"enabled":false},"qwen2_5_1_5B":{"enabled":false}}}` | ClusterTrainingRuntimes configuration These are optional runtime templates that can be deployed with the Helm chart. Each runtime provides a blueprint for different ML frameworks and configurations. |
+| runtimes.defaultEnabled | bool | `false` | Enable all default runtimes (torch, deepspeed, mlx, jax, torchtune) when set to true. Individual runtime settings will be ignored if this is enabled. |
+| runtimes.torchDistributed | object | `{"enabled":false}` | PyTorch distributed training runtime (no custom images required) |
+| runtimes.torchDistributed.enabled | bool | `false` | Enable deployment of torch-distributed runtime |
+| runtimes.deepspeedDistributed | object | `{"enabled":false,"image":{"registry":"ghcr.io","repository":"kubeflow/trainer/deepspeed-runtime","tag":""}}` | DeepSpeed distributed training runtime |
+| runtimes.deepspeedDistributed.enabled | bool | `false` | Enable deployment of deepspeed-distributed runtime |
+| runtimes.deepspeedDistributed.image.registry | string | `"ghcr.io"` | DeepSpeed runtime image registry |
+| runtimes.deepspeedDistributed.image.repository | string | `"kubeflow/trainer/deepspeed-runtime"` | DeepSpeed runtime image repository |
+| runtimes.deepspeedDistributed.image.tag | string | `""` | DeepSpeed runtime image tag. Defaults to chart version if empty. |
+| runtimes.mlxDistributed | object | `{"enabled":false,"image":{"registry":"ghcr.io","repository":"kubeflow/trainer/mlx-runtime","tag":""}}` | MLX distributed training runtime |
+| runtimes.mlxDistributed.enabled | bool | `false` | Enable deployment of mlx-distributed runtime |
+| runtimes.mlxDistributed.image.registry | string | `"ghcr.io"` | MLX runtime image registry |
+| runtimes.mlxDistributed.image.repository | string | `"kubeflow/trainer/mlx-runtime"` | MLX runtime image repository |
+| runtimes.mlxDistributed.image.tag | string | `""` | MLX runtime image tag. Defaults to chart version if empty. |
+| runtimes.jaxDistributed | object | `{"enabled":false}` | JAX distributed training runtime (no custom images required) |
+| runtimes.jaxDistributed.enabled | bool | `false` | Enable deployment of jax-distributed runtime |
+| runtimes.torchtuneDistributed | object | `{"image":{"registry":"ghcr.io","repository":"kubeflow/trainer/torchtune-trainer","tag":""},"llama3_2_1B":{"enabled":false},"llama3_2_3B":{"enabled":false},"qwen2_5_1_5B":{"enabled":false}}` | TorchTune distributed training runtime |
+| runtimes.torchtuneDistributed.image.registry | string | `"ghcr.io"` | TorchTune runtime image registry |
+| runtimes.torchtuneDistributed.image.repository | string | `"kubeflow/trainer/torchtune-trainer"` | TorchTune runtime image repository |
+| runtimes.torchtuneDistributed.image.tag | string | `""` | TorchTune runtime image tag. Defaults to chart version if empty. |
+| runtimes.torchtuneDistributed.llama3_2_1B | object | `{"enabled":false}` | Llama 3.2 1B model configuration |
+| runtimes.torchtuneDistributed.llama3_2_1B.enabled | bool | `false` | Enable deployment of Llama 3.2 1B runtime |
+| runtimes.torchtuneDistributed.llama3_2_3B | object | `{"enabled":false}` | Llama 3.2 3B model configuration |
+| runtimes.torchtuneDistributed.llama3_2_3B.enabled | bool | `false` | Enable deployment of Llama 3.2 3B runtime |
+| runtimes.torchtuneDistributed.qwen2_5_1_5B | object | `{"enabled":false}` | Qwen 2.5 1.5B model configuration |
+| runtimes.torchtuneDistributed.qwen2_5_1_5B.enabled | bool | `false` | Enable deployment of Qwen 2.5 1.5B runtime |
 
 ## Maintainers
````

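Beyond the framework-level toggles shown in the README examples, the values table also documents per-model switches for the TorchTune runtime. A minimal sketch of enabling just one model runtime via a values file, using only keys from the table above (the particular combination is illustrative, not from the chart's documented examples):

```yaml
# Hypothetical values fragment: deploy only the TorchTune Qwen 2.5 1.5B runtime.
runtimes:
  torchtuneDistributed:
    qwen2_5_1_5B:
      enabled: true
```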
charts/kubeflow-trainer/README.md.gotmpl

Lines changed: 59 additions & 1 deletion

````diff
@@ -1,5 +1,5 @@
 {{- /*
-Copyright 2025 The Kubeflow authors.
+Copyright 2026 The Kubeflow authors.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -49,6 +49,64 @@ Alternatively, you can install the latest version from the master branch (e.g. `
 helm install kubeflow-trainer oci://ghcr.io/kubeflow/charts/kubeflow-trainer --version 0.0.0-sha-bfccb7b
 ```
 
+### Install with ClusterTrainingRuntimes
+
+You can optionally deploy ClusterTrainingRuntimes as part of the Helm installation. Runtimes are disabled by default to keep the chart lightweight.
+
+To enable all default runtimes (torch, deepspeed, mlx, torchtune):
+
+```bash
+helm install kubeflow-trainer oci://ghcr.io/kubeflow/charts/kubeflow-trainer \
+  --version 2.1.0 \
+  --set runtimes.defaultEnabled=true
+```
+
+To enable specific runtimes:
+
+```bash
+helm install kubeflow-trainer oci://ghcr.io/kubeflow/charts/kubeflow-trainer \
+  --version 2.1.0 \
+  --set runtimes.torchDistributed.enabled=true \
+  --set runtimes.deepspeedDistributed.enabled=true
+```
+
+Or use a custom values file:
+
+```yaml
+# values.yaml
+runtimes:
+  torchDistributed:
+    enabled: true
+  deepspeedDistributed:
+    enabled: true
+  mlxDistributed:
+    enabled: true
+
+# For torch-distributed-with-cache, enable both dataCache.enabled and dataCache.runtimes.torchDistributed.enabled
+dataCache:
+  enabled: true
+  cacheImage:
+    tag: "v2.0.0"
+  runtimes:
+    torchDistributed:
+      enabled: true
+```
+
+Then install with:
+
+```bash
+helm install kubeflow-trainer oci://ghcr.io/kubeflow/charts/kubeflow-trainer \
+  --version 2.1.0 \
+  -f values.yaml
+```
+
+### Available Runtimes
+
+- **torch-distributed**: PyTorch distributed training (no custom images)
+- **torch-distributed-with-cache**: PyTorch with distributed data cache support (requires `dataCache.enabled=true`)
+- **deepspeed-distributed**: DeepSpeed distributed training with MPI
+- **mlx-distributed**: MLX distributed training with MPI
+
 ### Uninstall the chart
 
 ```shell
````

charts/kubeflow-trainer/templates/_helpers.tpl

Lines changed: 45 additions & 8 deletions

```diff
@@ -64,24 +64,61 @@ app.kubernetes.io/name: {{ include "trainer.name" . }}
 app.kubernetes.io/instance: {{ .Release.Name }}
 {{- end }}
 
+{{/*
+Resolve the effective image tag, using a provided tag if present or
+falling back to the default image tag derived from the chart version.
+Usage: include "trainer.resolveImageTag" (dict "tag" .Values.image.tag "context" .)
+*/}}
+{{- define "trainer.resolveImageTag" -}}
+{{- if .tag }}
+{{- .tag -}}
+{{- else -}}
+{{- include "trainer.defaultImageTag" .context -}}
+{{- end -}}
+{{- end }}
+
 {{- define "trainer.image" -}}
 {{- $imageRegistry := .Values.image.registry | default "docker.io" }}
 {{- $imageRepository := .Values.image.repository }}
-{{- $imageTag := .Values.image.tag -}}
-{{- if not $imageTag -}}
-{{- if hasPrefix "0.0.0-" .Chart.Version -}}
-{{- $imageTag = trimPrefix "0.0.0-" .Chart.Version -}}
-{{- else -}}
-{{- $imageTag = printf "v%s" .Chart.Version -}}
-{{- end -}}
-{{- end -}}
+{{- $imageTag := include "trainer.resolveImageTag" (dict "tag" .Values.image.tag "context" .) -}}
 {{- if eq $imageRegistry "docker.io" }}
 {{- printf "%s:%s" $imageRepository $imageTag }}
 {{- else }}
 {{- printf "%s/%s:%s" $imageRegistry $imageRepository $imageTag }}
 {{- end }}
 {{- end }}
 
+{{/*
+Generate the default image tag for runtimes based on chart version
+*/}}
+{{- define "trainer.defaultImageTag" -}}
+{{- if hasPrefix "0.0.0-" .Chart.Version -}}
+{{- trimPrefix "0.0.0-" .Chart.Version -}}
+{{- else -}}
+{{- printf "v%s" .Chart.Version -}}
+{{- end -}}
+{{- end }}
+
+{{/*
+Generate runtime image with registry, repository, and tag from values
+Usage: include "trainer.runtimeImage" (list .Values.runtimes.deepspeedDistributed.image .)
+*/}}
+{{- define "trainer.runtimeImage" -}}
+{{- $imageConfig := index . 0 }}
+{{- $root := index . 1 }}
+{{- $registry := $imageConfig.registry | default "ghcr.io" }}
+{{- $repository := $imageConfig.repository }}
+{{- $tag := include "trainer.resolveImageTag" (dict "tag" ($imageConfig.tag) "context" $root) -}}
+{{- if eq $registry "docker.io" }}
+{{- printf "%s:%s" $repository $tag }}
+{{- else }}
+{{- printf "%s/%s:%s" $registry $repository $tag }}
+{{- end }}
+{{- end }}
+{{/*
+Return the version of the trainer.
+If the version is 0.0.0, we assume it is a development version.
+*/}}
 {{- define "trainer.version" -}}
 {{- if hasPrefix "0.0.0-" .Chart.Version -}}
 dev
```
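The tag-resolution precedence introduced by `trainer.resolveImageTag` and `trainer.defaultImageTag` (explicit tag wins; otherwise `0.0.0-<suffix>` dev charts use `<suffix>`, and released charts use `v<version>`) can be illustrated outside Helm. A plain-shell sketch of the same logic, for illustration only (the function name is ours, not part of the chart):

```shell
# Mirror of the Helm helpers' tag resolution, re-expressed in shell.
resolve_image_tag() {
  local tag="$1" chart_version="$2"
  if [ -n "$tag" ]; then
    # An explicit .image.tag always wins.
    printf '%s\n' "$tag"
  elif [ "${chart_version#0.0.0-}" != "$chart_version" ]; then
    # Development charts are versioned 0.0.0-<suffix>; use the <suffix> part.
    printf '%s\n' "${chart_version#0.0.0-}"
  else
    # Released charts map chart version X.Y.Z to image tag vX.Y.Z.
    printf 'v%s\n' "$chart_version"
  fi
}

resolve_image_tag ""       "2.1.0"              # -> v2.1.0
resolve_image_tag ""       "0.0.0-sha-bfccb7b"  # -> sha-bfccb7b
resolve_image_tag "v2.0.0" "2.1.0"              # -> v2.0.0
```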
Lines changed: 74 additions & 0 deletions

```diff
@@ -0,0 +1,74 @@
+{{- /*
+Copyright 2026 The Kubeflow authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ -}}
+
+{{- if and .Values.dataCache.enabled .Values.dataCache.runtimes.torchDistributed.enabled }}
+apiVersion: trainer.kubeflow.org/v1alpha1
+kind: ClusterTrainingRuntime
+metadata:
+  name: torch-distributed-with-cache
+  labels:
+    trainer.kubeflow.org/framework: torch
+    {{- include "trainer.labels" . | nindent 4 }}
+spec:
+  mlPolicy:
+    numNodes: 1
+    torch:
+      numProcPerNode: auto
+  template:
+    spec:
+      replicatedJobs:
+        - name: dataset-initializer
+          replicas: 1
+          template:
+            metadata:
+              labels:
+                trainer.kubeflow.org/trainjob-ancestor-step: dataset-initializer
+            spec:
+              template:
+                spec:
+                  serviceAccountName: kubeflow-trainer-cache-initializer
+                  containers:
+                    - name: dataset-initializer
+                      image: {{ printf "ghcr.io/kubeflow/trainer/dataset-initializer:%s" (include "trainer.defaultImageTag" .) }}
+                      env:
+                        - name: CACHE_IMAGE
+                          value: {{ include "trainer.runtimeImage" (list .Values.dataCache.cacheImage .) | quote }}
+                        - name: TRAIN_JOB_NAME
+                          valueFrom:
+                            fieldRef:
+                              apiVersion: v1
+                              fieldPath: metadata.labels['jobset.sigs.k8s.io/jobset-name']
+        - name: node
+          dependsOn:
+            - name: dataset-initializer
+              status: Complete
+          template:
+            metadata:
+              labels:
+                trainer.kubeflow.org/trainjob-ancestor-step: trainer
+            spec:
+              template:
+                spec:
+                  containers:
+                    - name: node
+                      image: pytorch/pytorch:2.9.1-cuda12.8-cudnn9-runtime
+                      env:
+                        - name: TRAIN_JOB_NAME
+                          valueFrom:
+                            fieldRef:
+                              apiVersion: v1
+                              fieldPath: metadata.labels['jobset.sigs.k8s.io/jobset-name']
+{{- end }}
```
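A ClusterTrainingRuntime like this is consumed by referencing it from a TrainJob. A minimal sketch, assuming the `trainer.kubeflow.org/v1alpha1` TrainJob API's `runtimeRef` field (the job name below is hypothetical, and the chart must have been installed with `dataCache.enabled=true` and `dataCache.runtimes.torchDistributed.enabled=true` for this runtime to exist):

```yaml
# Hypothetical TrainJob referencing the runtime rendered above.
apiVersion: trainer.kubeflow.org/v1alpha1
kind: TrainJob
metadata:
  name: torch-cache-demo
spec:
  runtimeRef:
    name: torch-distributed-with-cache
```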
