From d8ecc0afa26cfb9753a8c3437d8e3542b335ccfe Mon Sep 17 00:00:00 2001 From: kramaranya Date: Thu, 20 Nov 2025 23:55:29 +0000 Subject: [PATCH 1/5] Update RHOAI manifests to include universal workbench image configuration Signed-off-by: kramaranya --- manifests/rhoai/kustomization.yaml | 14 ++++++++ manifests/rhoai/params.env | 1 + ...aining-hub-universal-cuda-imagestream.yaml | 36 +++++++++++++++++++ 3 files changed, 51 insertions(+) create mode 100644 manifests/rhoai/training-hub-universal-cuda-imagestream.yaml diff --git a/manifests/rhoai/kustomization.yaml b/manifests/rhoai/kustomization.yaml index 86ec91385c..8b6f6a6a2a 100644 --- a/manifests/rhoai/kustomization.yaml +++ b/manifests/rhoai/kustomization.yaml @@ -26,6 +26,19 @@ replacements: name: kubeflow-trainer-controller-manager fieldPaths: - spec.template.spec.containers.0.image +- source: + kind: ConfigMap + name: rhoai-config + version: v1 + fieldPath: data.odh-kubeflow-trainer-universal-workbench-image + targets: + - select: + group: image.openshift.io + version: v1 + kind: ImageStream + name: training-hub-universal-cuda + fieldPaths: + - spec.tags.0.from.name # Labels to add to all resources and selectors. labels: @@ -41,6 +54,7 @@ resources: - ../base/webhook # - ../third-party/jobset #uncomment if jobset should be bundled under kubeflow trainer controller manager - runtimes +- training-hub-universal-cuda-imagestream.yaml - kubeflow-training-roles.yaml - monitor.yaml diff --git a/manifests/rhoai/params.env b/manifests/rhoai/params.env index be4de6d00b..e4cb027fa8 100644 --- a/manifests/rhoai/params.env +++ b/manifests/rhoai/params.env @@ -1 +1,2 @@ odh-kubeflow-trainer-controller-image=quay.io/opendatahub/trainer:v2.1.0 +odh-kubeflow-trainer-universal-workbench-image=quay.io/mstoklus/training-image:runtime-image-training-hub30 diff --git a/manifests/rhoai/training-hub-universal-cuda-imagestream.yaml b/manifests/rhoai/training-hub-universal-cuda-imagestream.yaml new file mode 100644 index 0000000000..e6e3b50c2b --- /dev/null +++ b/manifests/rhoai/training-hub-universal-cuda-imagestream.yaml @@ -0,0 +1,36 @@ +apiVersion: image.openshift.io/v1 +kind: ImageStream +metadata: + name: training-hub-universal-cuda + labels: + opendatahub.io/notebook-image: "true" + annotations: + opendatahub.io/notebook-image-url: "https://github.com/opendatahub-io/trainer" + opendatahub.io/notebook-image-name: "Training Hub Universal (CUDA, Python 3.12)" + opendatahub.io/notebook-image-desc: "Universal Training Hub workbench image for CUDA-based fine-tuning and distributed training." + opendatahub.io/notebook-image-order: "1" + opendatahub.io/recommended-accelerators: '["nvidia.com/gpu"]' +spec: + lookupPolicy: + local: true + tags: + - name: latest + annotations: + opendatahub.io/notebook-software: | + [ + {"name": "CUDA", "version": "12.6"}, + {"name": "Python", "version": "v3.12"}, + {"name": "Training Hub", "version": "v0.3.0"} + ] + opendatahub.io/notebook-python-dependencies: | + [ + {"name": "JupyterLab", "version": "4.4"} + ] + openshift.io/imported-from: quay.io/mstoklus/training-image + from: + kind: DockerImage + name: odh-kubeflow-trainer-universal-workbench-image + importPolicy: + importMode: Legacy + referencePolicy: + type: Source From f8b7c92d744aa32115b0f01c6cff09114ec73835 Mon Sep 17 00:00:00 2001 From: kramaranya Date: Fri, 21 Nov 2025 09:32:34 +0000 Subject: [PATCH 2/5] Update quay repo Signed-off-by: kramaranya --- manifests/rhoai/params.env | 2 +- manifests/rhoai/training-hub-universal-cuda-imagestream.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/manifests/rhoai/params.env b/manifests/rhoai/params.env index e4cb027fa8..4970d645a1 100644 --- a/manifests/rhoai/params.env +++ b/manifests/rhoai/params.env @@ -1,2 +1,2 @@ odh-kubeflow-trainer-controller-image=quay.io/opendatahub/trainer:v2.1.0 -odh-kubeflow-trainer-universal-workbench-image=quay.io/mstoklus/training-image:runtime-image-training-hub30 +odh-kubeflow-trainer-universal-workbench-image=quay.io/opendatahub/odh-training-th03-cuda128-torch28-py312-rhel9:latest diff --git a/manifests/rhoai/training-hub-universal-cuda-imagestream.yaml b/manifests/rhoai/training-hub-universal-cuda-imagestream.yaml index e6e3b50c2b..860c3a84d1 100644 --- a/manifests/rhoai/training-hub-universal-cuda-imagestream.yaml +++ b/manifests/rhoai/training-hub-universal-cuda-imagestream.yaml @@ -26,7 +26,7 @@ spec: [ {"name": "JupyterLab", "version": "4.4"} ] - openshift.io/imported-from: quay.io/mstoklus/training-image + openshift.io/imported-from: quay.io/opendatahub/odh-training-th03-cuda128-torch28-py312-rhel9 from: kind: DockerImage name: odh-kubeflow-trainer-universal-workbench-image From e657541d16219b3e70af88cbc212a490c5e7c133 Mon Sep 17 00:00:00 2001 From: kramaranya Date: Wed, 3 Dec 2025 10:30:48 +0100 Subject: [PATCH 3/5] Add ROCm and CPU ImageStreams Signed-off-by: kramaranya --- manifests/rhoai/kustomization.yaml | 30 ++++++++++++- manifests/rhoai/params.env | 4 +- ...raining-hub-universal-cpu-imagestream.yaml | 42 ++++++++++++++++++ ...aining-hub-universal-cuda-imagestream.yaml | 29 +++++++++---- ...aining-hub-universal-rocm-imagestream.yaml | 43 +++++++++++++++++++ 5 files changed, 137 insertions(+), 11 deletions(-) create mode 100644 manifests/rhoai/training-hub-universal-cpu-imagestream.yaml create mode 100644 manifests/rhoai/training-hub-universal-rocm-imagestream.yaml diff --git a/manifests/rhoai/kustomization.yaml b/manifests/rhoai/kustomization.yaml index 8b6f6a6a2a..f174ea87c6 100644 --- a/manifests/rhoai/kustomization.yaml +++ b/manifests/rhoai/kustomization.yaml @@ -30,7 +30,7 @@ replacements: kind: ConfigMap name: rhoai-config version: v1 - fieldPath: data.odh-kubeflow-trainer-universal-workbench-image + fieldPath: data.odh-kubeflow-trainer-universal-workbench-image-cuda targets: - select: group: image.openshift.io @@ -39,6 +39,32 @@ replacements: name: training-hub-universal-cuda fieldPaths: - spec.tags.0.from.name +- source: + kind: ConfigMap + name: rhoai-config + version: v1 + fieldPath: data.odh-kubeflow-trainer-universal-workbench-image-rocm + targets: + - select: + group: image.openshift.io + version: v1 + kind: ImageStream + name: training-hub-universal-rocm + fieldPaths: + - spec.tags.0.from.name +- source: + kind: ConfigMap + name: rhoai-config + version: v1 + fieldPath: data.odh-kubeflow-trainer-universal-workbench-image-cpu + targets: + - select: + group: image.openshift.io + version: v1 + kind: ImageStream + name: training-hub-universal-cpu + fieldPaths: + - spec.tags.0.from.name # Labels to add to all resources and selectors. labels: @@ -55,6 +81,8 @@ resources: # - ../third-party/jobset #uncomment if jobset should be bundled under kubeflow trainer controller manager - runtimes - training-hub-universal-cuda-imagestream.yaml +- training-hub-universal-rocm-imagestream.yaml +- training-hub-universal-cpu-imagestream.yaml - kubeflow-training-roles.yaml - monitor.yaml diff --git a/manifests/rhoai/params.env b/manifests/rhoai/params.env index 4970d645a1..8f6f233e0b 100644 --- a/manifests/rhoai/params.env +++ b/manifests/rhoai/params.env @@ -1,2 +1,4 @@ odh-kubeflow-trainer-controller-image=quay.io/opendatahub/trainer:v2.1.0 -odh-kubeflow-trainer-universal-workbench-image=quay.io/opendatahub/odh-training-th03-cuda128-torch28-py312-rhel9:latest +odh-kubeflow-trainer-universal-workbench-image-cuda=quay.io/opendatahub/odh-training-th03-cuda128-torch28-py312-rhel9:latest +odh-kubeflow-trainer-universal-workbench-image-rocm=quay.io/mstoklus/workbench-images:py312-rocm64-torch280-3 +odh-kubeflow-trainer-universal-workbench-image-cpu=quay.io/opendatahub/odh-training-th03-cuda128-torch28-py312-rhel9:latest diff --git a/manifests/rhoai/training-hub-universal-cpu-imagestream.yaml b/manifests/rhoai/training-hub-universal-cpu-imagestream.yaml new file mode 100644 index 0000000000..6bc353e445 --- /dev/null +++ b/manifests/rhoai/training-hub-universal-cpu-imagestream.yaml @@ -0,0 +1,42 @@ +apiVersion: image.openshift.io/v1 +kind: ImageStream +metadata: + name: training-hub-universal-cpu + labels: + opendatahub.io/notebook-image: "true" + annotations: + opendatahub.io/notebook-image-url: "https://github.com/opendatahub-io/distributed-workloads/tree/main/images/universal/training" + opendatahub.io/notebook-image-name: "Training | Jupyter | PyTorch | CPU | Python" + opendatahub.io/notebook-image-desc: "Training runtime image for CPU-based fine-tuning and distributed training." + opendatahub.io/notebook-image-order: "3" +spec: + lookupPolicy: + local: true + tags: + - name: "2025.1" + annotations: + opendatahub.io/notebook-software: | + [ + {"name": "Python", "version": "v3.12"}, + {"name": "PyTorch", "version": "2.8.0"}, + {"name": "Kubeflow SDK", "version": "0.2.0"}, + {"name": "Training Hub", "version": "v0.3.0"} + ] + opendatahub.io/notebook-python-dependencies: | + [ + {"name": "jupyterlab", "version": "4.4.4"}, + {"name": "transformers", "version": "4.57.1"}, + {"name": "accelerate", "version": "1.10.0"}, + {"name": "peft", "version": "0.17.0"}, + {"name": "trl", "version": "0.21.0"}, + {"name": "numpy", "version": "2.3.5"}, + {"name": "pandas", "version": "2.3.3"}, + {"name": "matplotlib", "version": "3.10.7"}, + {"name": "scikit-learn", "version": "1.7.2"} + ] + openshift.io/imported-from: quay.io/opendatahub/odh-training-th03-cuda128-torch28-py312-rhel9 + from: + kind: DockerImage + name: odh-kubeflow-trainer-universal-workbench-image-cpu + referencePolicy: + type: Source diff --git a/manifests/rhoai/training-hub-universal-cuda-imagestream.yaml b/manifests/rhoai/training-hub-universal-cuda-imagestream.yaml index 860c3a84d1..806ef1f2d6 100644 --- a/manifests/rhoai/training-hub-universal-cuda-imagestream.yaml +++ b/manifests/rhoai/training-hub-universal-cuda-imagestream.yaml @@ -5,32 +5,43 @@ metadata: labels: opendatahub.io/notebook-image: "true" annotations: - opendatahub.io/notebook-image-url: "https://github.com/opendatahub-io/trainer" - opendatahub.io/notebook-image-name: "Training Hub Universal (CUDA, Python 3.12)" - opendatahub.io/notebook-image-desc: "Universal Training Hub workbench image for CUDA-based fine-tuning and distributed training." + opendatahub.io/notebook-image-url: "https://github.com/opendatahub-io/distributed-workloads/tree/main/images/universal/training" + opendatahub.io/notebook-image-name: "Training | Jupyter | PyTorch | CUDA | Python" + opendatahub.io/notebook-image-desc: "Training runtime image for CUDA-based fine-tuning and distributed training." opendatahub.io/notebook-image-order: "1" opendatahub.io/recommended-accelerators: '["nvidia.com/gpu"]' spec: lookupPolicy: local: true tags: - - name: latest + - name: "2025.1" annotations: opendatahub.io/notebook-software: | [ - {"name": "CUDA", "version": "12.6"}, + {"name": "CUDA", "version": "12.8"}, {"name": "Python", "version": "v3.12"}, + {"name": "PyTorch", "version": "2.8.0"}, + {"name": "Kubeflow SDK", "version": "0.2.0"} {"name": "Training Hub", "version": "v0.3.0"} ] opendatahub.io/notebook-python-dependencies: | [ - {"name": "JupyterLab", "version": "4.4"} + {"name": "jupyterlab", "version": "4.4.4"}, + {"name": "flash-attn", "version": "2.8.3"}, + {"name": "transformers", "version": "4.57.1"}, + {"name": "accelerate", "version": "1.10.0"}, + {"name": "liger-kernel", "version": "0.6.2"}, + {"name": "peft", "version": "0.17.0"}, + {"name": "triton", "version": "3.4.0"}, + {"name": "trl", "version": "0.21.0"}, + {"name": "numpy", "version": "2.3.5"}, + {"name": "pandas", "version": "2.3.3"}, + {"name": "matplotlib", "version": "3.10.7"}, + {"name": "scikit-learn", "version": "1.7.2"} ] openshift.io/imported-from: quay.io/opendatahub/odh-training-th03-cuda128-torch28-py312-rhel9 from: kind: DockerImage - name: odh-kubeflow-trainer-universal-workbench-image - importPolicy: - importMode: Legacy + name: odh-kubeflow-trainer-universal-workbench-image-cuda referencePolicy: type: Source diff --git a/manifests/rhoai/training-hub-universal-rocm-imagestream.yaml b/manifests/rhoai/training-hub-universal-rocm-imagestream.yaml new file mode 100644 index 0000000000..d738635537 --- /dev/null +++ b/manifests/rhoai/training-hub-universal-rocm-imagestream.yaml @@ -0,0 +1,43 @@ +apiVersion: image.openshift.io/v1 +kind: ImageStream +metadata: + name: training-hub-universal-rocm + labels: + opendatahub.io/notebook-image: "true" + annotations: + opendatahub.io/notebook-image-url: "https://github.com/opendatahub-io/distributed-workloads/tree/main/images/universal/training" + opendatahub.io/notebook-image-name: "Training | Jupyter | PyTorch | ROCm | Python" + opendatahub.io/notebook-image-desc: "Training runtime image for ROCm-based fine-tuning and distributed training." + opendatahub.io/notebook-image-order: "2" + opendatahub.io/recommended-accelerators: '["amd.com/gpu"]' +spec: + lookupPolicy: + local: true + tags: + - name: "2025.1" + annotations: + opendatahub.io/notebook-software: | + [ + {"name": "ROCm", "version": "6.4"}, + {"name": "Python", "version": "v3.12"}, + {"name": "PyTorch", "version": "2.8.0"}, + {"name": "Kubeflow SDK", "version": "0.2.0"} + ] + opendatahub.io/notebook-python-dependencies: | + [ + {"name": "jupyterlab", "version": "4.4.9"}, + {"name": "transformers", "version": "4.55.2"}, + {"name": "accelerate", "version": "1.10.0"}, + {"name": "liger-kernel", "version": "0.5.10"}, + {"name": "peft", "version": "0.17.0"}, + {"name": "trl", "version": "0.21.0"}, + {"name": "deepspeed", "version": "0.14.3"}, + {"name": "datasets", "version": "4.0.0"}, + {"name": "tensorboard", "version": "2.19.0"} + ] + openshift.io/imported-from: quay.io/mstoklus/workbench-images + from: + kind: DockerImage + name: odh-kubeflow-trainer-universal-workbench-image-rocm + referencePolicy: + type: Source From 9bbceeb84b0c986e82ca3fc9ae0b25ff24946756 Mon Sep 17 00:00:00 2001 From: kramaranya Date: Wed, 3 Dec 2025 13:45:20 +0100 Subject: [PATCH 4/5] Fix syntax error in CUDA Image Stream Signed-off-by: kramaranya --- manifests/rhoai/training-hub-universal-cuda-imagestream.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifests/rhoai/training-hub-universal-cuda-imagestream.yaml b/manifests/rhoai/training-hub-universal-cuda-imagestream.yaml index 806ef1f2d6..99874031b0 100644 --- a/manifests/rhoai/training-hub-universal-cuda-imagestream.yaml +++ b/manifests/rhoai/training-hub-universal-cuda-imagestream.yaml @@ -21,7 +21,7 @@ spec: {"name": "CUDA", "version": "12.8"}, {"name": "Python", "version": "v3.12"}, {"name": "PyTorch", "version": "2.8.0"}, - {"name": "Kubeflow SDK", "version": "0.2.0"} + {"name": "Kubeflow SDK", "version": "0.2.0"}, {"name": "Training Hub", "version": "v0.3.0"} ] opendatahub.io/notebook-python-dependencies: | From 54f00892817e3969c76cf667852e73bc23cb907c Mon Sep 17 00:00:00 2001 From: kramaranya Date: Fri, 5 Dec 2025 09:12:23 +0100 Subject: [PATCH 5/5] manifests: refactor params.env to handle multiple versions Signed-off-by: kramaranya --- manifests/rhoai/kustomization.yaml | 6 +++--- manifests/rhoai/params.env | 6 +++--- .../training-hub-universal-cpu-imagestream.yaml | 15 ++++++++------- .../training-hub-universal-cuda-imagestream.yaml | 4 ++-- .../training-hub-universal-rocm-imagestream.yaml | 2 +- 5 files changed, 17 insertions(+), 16 deletions(-) diff --git a/manifests/rhoai/kustomization.yaml b/manifests/rhoai/kustomization.yaml index f174ea87c6..2f798be3d0 100644 --- a/manifests/rhoai/kustomization.yaml +++ b/manifests/rhoai/kustomization.yaml @@ -30,7 +30,7 @@ replacements: kind: ConfigMap name: rhoai-config version: v1 - fieldPath: data.odh-kubeflow-trainer-universal-workbench-image-cuda + fieldPath: data.odh-kubeflow-trainer-universal-workbench-image-cuda-2025-1 targets: - select: group: image.openshift.io @@ -43,7 +43,7 @@ replacements: kind: ConfigMap name: rhoai-config version: v1 - fieldPath: data.odh-kubeflow-trainer-universal-workbench-image-rocm + fieldPath: data.odh-kubeflow-trainer-universal-workbench-image-rocm-2025-1 targets: - select: group: image.openshift.io @@ -56,7 +56,7 @@ replacements: kind: ConfigMap name: rhoai-config version: v1 - fieldPath: data.odh-kubeflow-trainer-universal-workbench-image-cpu + fieldPath: data.odh-kubeflow-trainer-universal-workbench-image-cpu-2025-1 targets: - select: group: image.openshift.io diff --git a/manifests/rhoai/params.env b/manifests/rhoai/params.env index 8f6f233e0b..f75eafd1d4 100644 --- a/manifests/rhoai/params.env +++ b/manifests/rhoai/params.env @@ -1,4 +1,4 @@ odh-kubeflow-trainer-controller-image=quay.io/opendatahub/trainer:v2.1.0 -odh-kubeflow-trainer-universal-workbench-image-cuda=quay.io/opendatahub/odh-training-th03-cuda128-torch28-py312-rhel9:latest -odh-kubeflow-trainer-universal-workbench-image-rocm=quay.io/mstoklus/workbench-images:py312-rocm64-torch280-3 -odh-kubeflow-trainer-universal-workbench-image-cpu=quay.io/opendatahub/odh-training-th03-cuda128-torch28-py312-rhel9:latest +odh-kubeflow-trainer-universal-workbench-image-cuda-2025-1=quay.io/opendatahub/odh-training-th03-cuda128-torch28-py312-rhel9:latest +odh-kubeflow-trainer-universal-workbench-image-rocm-2025-1=quay.io/mstoklus/workbench-images:py312-rocm64-torch280-3 +odh-kubeflow-trainer-universal-workbench-image-cpu-2025-1=quay.io/opendatahub/odh-training-th03-cuda128-torch28-py312-rhel9:latest-cpu diff --git a/manifests/rhoai/training-hub-universal-cpu-imagestream.yaml b/manifests/rhoai/training-hub-universal-cpu-imagestream.yaml index 6bc353e445..e5e127ffb6 100644 --- a/manifests/rhoai/training-hub-universal-cpu-imagestream.yaml +++ b/manifests/rhoai/training-hub-universal-cpu-imagestream.yaml @@ -18,25 +18,26 @@ spec: opendatahub.io/notebook-software: | [ {"name": "Python", "version": "v3.12"}, - {"name": "PyTorch", "version": "2.8.0"}, + {"name": "PyTorch", "version": "2.9.0"}, {"name": "Kubeflow SDK", "version": "0.2.0"}, - {"name": "Training Hub", "version": "v0.3.0"} + {"name": "Training Hub", "version": "v0.4.0"} ] opendatahub.io/notebook-python-dependencies: | [ - {"name": "jupyterlab", "version": "4.4.4"}, + {"name": "jupyterlab", "version": "4.4.9"}, {"name": "transformers", "version": "4.57.1"}, {"name": "accelerate", "version": "1.10.0"}, {"name": "peft", "version": "0.17.0"}, {"name": "trl", "version": "0.21.0"}, - {"name": "numpy", "version": "2.3.5"}, + {"name": "numpy", "version": "1.26.4"}, {"name": "pandas", "version": "2.3.3"}, - {"name": "matplotlib", "version": "3.10.7"}, - {"name": "scikit-learn", "version": "1.7.2"} + {"name": "matplotlib-inline", "version": "0.1.7"}, + {"name": "tensorboard", "version": "2.19.0"}, + {"name": "instructlab-training", "version": "0.12.1"} ] openshift.io/imported-from: quay.io/opendatahub/odh-training-th03-cuda128-torch28-py312-rhel9 from: kind: DockerImage - name: odh-kubeflow-trainer-universal-workbench-image-cpu + name: odh-kubeflow-trainer-universal-workbench-image-cpu-2025-1 referencePolicy: type: Source diff --git a/manifests/rhoai/training-hub-universal-cuda-imagestream.yaml b/manifests/rhoai/training-hub-universal-cuda-imagestream.yaml index 99874031b0..0756d1190b 100644 --- a/manifests/rhoai/training-hub-universal-cuda-imagestream.yaml +++ b/manifests/rhoai/training-hub-universal-cuda-imagestream.yaml @@ -22,7 +22,7 @@ spec: {"name": "Python", "version": "v3.12"}, {"name": "PyTorch", "version": "2.8.0"}, {"name": "Kubeflow SDK", "version": "0.2.0"}, - {"name": "Training Hub", "version": "v0.3.0"} + {"name": "Training Hub", "version": "v0.4.0"} ] opendatahub.io/notebook-python-dependencies: | [ @@ -42,6 +42,6 @@ spec: openshift.io/imported-from: quay.io/opendatahub/odh-training-th03-cuda128-torch28-py312-rhel9 from: kind: DockerImage - name: odh-kubeflow-trainer-universal-workbench-image-cuda + name: odh-kubeflow-trainer-universal-workbench-image-cuda-2025-1 referencePolicy: type: Source diff --git a/manifests/rhoai/training-hub-universal-rocm-imagestream.yaml b/manifests/rhoai/training-hub-universal-rocm-imagestream.yaml index d738635537..c4822f3723 100644 --- a/manifests/rhoai/training-hub-universal-rocm-imagestream.yaml +++ b/manifests/rhoai/training-hub-universal-rocm-imagestream.yaml @@ -38,6 +38,6 @@ spec: openshift.io/imported-from: quay.io/mstoklus/workbench-images from: kind: DockerImage - name: odh-kubeflow-trainer-universal-workbench-image-rocm + name: odh-kubeflow-trainer-universal-workbench-image-rocm-2025-1 referencePolicy: type: Source