# Patch summary (free text ahead of the first "diff --git" header; not part of any hunk).
#
# manifests/rhoai/kustomization.yaml
#   * adds three `replacements` entries; each copies one `data.*` key of the
#     ConfigMap `rhoai-config` into `spec.tags.0.from.name` of one ImageStream
#     (training-hub-universal-cuda / -rocm / -cpu).
#   * lists the three new ImageStream manifests under `resources`.
# manifests/rhoai/params.env
#   * adds the three image keys that the replacements above read.
# manifests/rhoai/training-hub-universal-{cpu,cuda,rocm}-imagestream.yaml
#   * new ImageStream manifests; `from.name` holds the params.env key name as
#     a placeholder for the replacement to overwrite.
#
# NOTE(review): the ROCm image is pulled from quay.io/mstoklus/workbench-images
#   while the CUDA and CPU images come from quay.io/opendatahub - confirm this
#   is the intended registry namespace.
# NOTE(review): the CPU ImageStream advertises PyTorch 2.9.0, but its image name
#   contains "torch28" and the CUDA/ROCm streams advertise 2.8.0 - confirm.
# NOTE(review): the CUDA and CPU images use the tags `latest` / `latest-cpu`;
#   presumably these move over time - consider pinning, verify with the owners.
# NOTE(review): indentation and blank context lines below were re-created from
#   the hunk line counts - confirm against the original patch before applying.
diff --git a/manifests/rhoai/kustomization.yaml b/manifests/rhoai/kustomization.yaml
index 86ec91385c..2f798be3d0 100644
--- a/manifests/rhoai/kustomization.yaml
+++ b/manifests/rhoai/kustomization.yaml
@@ -26,6 +26,45 @@ replacements:
       name: kubeflow-trainer-controller-manager
     fieldPaths:
     - spec.template.spec.containers.0.image
+- source:
+    kind: ConfigMap
+    name: rhoai-config
+    version: v1
+    fieldPath: data.odh-kubeflow-trainer-universal-workbench-image-cuda-2025-1
+  targets:
+  - select:
+      group: image.openshift.io
+      version: v1
+      kind: ImageStream
+      name: training-hub-universal-cuda
+    fieldPaths:
+    - spec.tags.0.from.name
+- source:
+    kind: ConfigMap
+    name: rhoai-config
+    version: v1
+    fieldPath: data.odh-kubeflow-trainer-universal-workbench-image-rocm-2025-1
+  targets:
+  - select:
+      group: image.openshift.io
+      version: v1
+      kind: ImageStream
+      name: training-hub-universal-rocm
+    fieldPaths:
+    - spec.tags.0.from.name
+- source:
+    kind: ConfigMap
+    name: rhoai-config
+    version: v1
+    fieldPath: data.odh-kubeflow-trainer-universal-workbench-image-cpu-2025-1
+  targets:
+  - select:
+      group: image.openshift.io
+      version: v1
+      kind: ImageStream
+      name: training-hub-universal-cpu
+    fieldPaths:
+    - spec.tags.0.from.name
 
 # Labels to add to all resources and selectors.
 labels:
@@ -41,6 +80,9 @@ resources:
 - ../base/webhook
 # - ../third-party/jobset #uncomment if jobset should be bundled under kubeflow trainer controller manager
 - runtimes
+- training-hub-universal-cuda-imagestream.yaml
+- training-hub-universal-rocm-imagestream.yaml
+- training-hub-universal-cpu-imagestream.yaml
 - kubeflow-training-roles.yaml
 - monitor.yaml
 
diff --git a/manifests/rhoai/params.env b/manifests/rhoai/params.env
index be4de6d00b..f75eafd1d4 100644
--- a/manifests/rhoai/params.env
+++ b/manifests/rhoai/params.env
@@ -1 +1,4 @@
 odh-kubeflow-trainer-controller-image=quay.io/opendatahub/trainer:v2.1.0
+odh-kubeflow-trainer-universal-workbench-image-cuda-2025-1=quay.io/opendatahub/odh-training-th03-cuda128-torch28-py312-rhel9:latest
+odh-kubeflow-trainer-universal-workbench-image-rocm-2025-1=quay.io/mstoklus/workbench-images:py312-rocm64-torch280-3
+odh-kubeflow-trainer-universal-workbench-image-cpu-2025-1=quay.io/opendatahub/odh-training-th03-cuda128-torch28-py312-rhel9:latest-cpu
diff --git a/manifests/rhoai/training-hub-universal-cpu-imagestream.yaml b/manifests/rhoai/training-hub-universal-cpu-imagestream.yaml
new file mode 100644
index 0000000000..e5e127ffb6
--- /dev/null
+++ b/manifests/rhoai/training-hub-universal-cpu-imagestream.yaml
@@ -0,0 +1,43 @@
+apiVersion: image.openshift.io/v1
+kind: ImageStream
+metadata:
+  name: training-hub-universal-cpu
+  labels:
+    opendatahub.io/notebook-image: "true"
+  annotations:
+    opendatahub.io/notebook-image-url: "https://github.com/opendatahub-io/distributed-workloads/tree/main/images/universal/training"
+    opendatahub.io/notebook-image-name: "Training | Jupyter | PyTorch | CPU | Python"
+    opendatahub.io/notebook-image-desc: "Training runtime image for CPU-based fine-tuning and distributed training."
+    opendatahub.io/notebook-image-order: "3"
+spec:
+  lookupPolicy:
+    local: true
+  tags:
+    - name: "2025.1"
+      annotations:
+        opendatahub.io/notebook-software: |
+          [
+            {"name": "Python", "version": "v3.12"},
+            {"name": "PyTorch", "version": "2.9.0"},
+            {"name": "Kubeflow SDK", "version": "0.2.0"},
+            {"name": "Training Hub", "version": "v0.4.0"}
+          ]
+        opendatahub.io/notebook-python-dependencies: |
+          [
+            {"name": "jupyterlab", "version": "4.4.9"},
+            {"name": "transformers", "version": "4.57.1"},
+            {"name": "accelerate", "version": "1.10.0"},
+            {"name": "peft", "version": "0.17.0"},
+            {"name": "trl", "version": "0.21.0"},
+            {"name": "numpy", "version": "1.26.4"},
+            {"name": "pandas", "version": "2.3.3"},
+            {"name": "matplotlib-inline", "version": "0.1.7"},
+            {"name": "tensorboard", "version": "2.19.0"},
+            {"name": "instructlab-training", "version": "0.12.1"}
+          ]
+        openshift.io/imported-from: quay.io/opendatahub/odh-training-th03-cuda128-torch28-py312-rhel9
+      from:
+        kind: DockerImage
+        name: odh-kubeflow-trainer-universal-workbench-image-cpu-2025-1
+      referencePolicy:
+        type: Source
diff --git a/manifests/rhoai/training-hub-universal-cuda-imagestream.yaml b/manifests/rhoai/training-hub-universal-cuda-imagestream.yaml
new file mode 100644
index 0000000000..0756d1190b
--- /dev/null
+++ b/manifests/rhoai/training-hub-universal-cuda-imagestream.yaml
@@ -0,0 +1,47 @@
+apiVersion: image.openshift.io/v1
+kind: ImageStream
+metadata:
+  name: training-hub-universal-cuda
+  labels:
+    opendatahub.io/notebook-image: "true"
+  annotations:
+    opendatahub.io/notebook-image-url: "https://github.com/opendatahub-io/distributed-workloads/tree/main/images/universal/training"
+    opendatahub.io/notebook-image-name: "Training | Jupyter | PyTorch | CUDA | Python"
+    opendatahub.io/notebook-image-desc: "Training runtime image for CUDA-based fine-tuning and distributed training."
+    opendatahub.io/notebook-image-order: "1"
+    opendatahub.io/recommended-accelerators: '["nvidia.com/gpu"]'
+spec:
+  lookupPolicy:
+    local: true
+  tags:
+    - name: "2025.1"
+      annotations:
+        opendatahub.io/notebook-software: |
+          [
+            {"name": "CUDA", "version": "12.8"},
+            {"name": "Python", "version": "v3.12"},
+            {"name": "PyTorch", "version": "2.8.0"},
+            {"name": "Kubeflow SDK", "version": "0.2.0"},
+            {"name": "Training Hub", "version": "v0.4.0"}
+          ]
+        opendatahub.io/notebook-python-dependencies: |
+          [
+            {"name": "jupyterlab", "version": "4.4.4"},
+            {"name": "flash-attn", "version": "2.8.3"},
+            {"name": "transformers", "version": "4.57.1"},
+            {"name": "accelerate", "version": "1.10.0"},
+            {"name": "liger-kernel", "version": "0.6.2"},
+            {"name": "peft", "version": "0.17.0"},
+            {"name": "triton", "version": "3.4.0"},
+            {"name": "trl", "version": "0.21.0"},
+            {"name": "numpy", "version": "2.3.5"},
+            {"name": "pandas", "version": "2.3.3"},
+            {"name": "matplotlib", "version": "3.10.7"},
+            {"name": "scikit-learn", "version": "1.7.2"}
+          ]
+        openshift.io/imported-from: quay.io/opendatahub/odh-training-th03-cuda128-torch28-py312-rhel9
+      from:
+        kind: DockerImage
+        name: odh-kubeflow-trainer-universal-workbench-image-cuda-2025-1
+      referencePolicy:
+        type: Source
diff --git a/manifests/rhoai/training-hub-universal-rocm-imagestream.yaml b/manifests/rhoai/training-hub-universal-rocm-imagestream.yaml
new file mode 100644
index 0000000000..c4822f3723
--- /dev/null
+++ b/manifests/rhoai/training-hub-universal-rocm-imagestream.yaml
@@ -0,0 +1,43 @@
+apiVersion: image.openshift.io/v1
+kind: ImageStream
+metadata:
+  name: training-hub-universal-rocm
+  labels:
+    opendatahub.io/notebook-image: "true"
+  annotations:
+    opendatahub.io/notebook-image-url: "https://github.com/opendatahub-io/distributed-workloads/tree/main/images/universal/training"
+    opendatahub.io/notebook-image-name: "Training | Jupyter | PyTorch | ROCm | Python"
+    opendatahub.io/notebook-image-desc: "Training runtime image for ROCm-based fine-tuning and distributed training."
+    opendatahub.io/notebook-image-order: "2"
+    opendatahub.io/recommended-accelerators: '["amd.com/gpu"]'
+spec:
+  lookupPolicy:
+    local: true
+  tags:
+    - name: "2025.1"
+      annotations:
+        opendatahub.io/notebook-software: |
+          [
+            {"name": "ROCm", "version": "6.4"},
+            {"name": "Python", "version": "v3.12"},
+            {"name": "PyTorch", "version": "2.8.0"},
+            {"name": "Kubeflow SDK", "version": "0.2.0"}
+          ]
+        opendatahub.io/notebook-python-dependencies: |
+          [
+            {"name": "jupyterlab", "version": "4.4.9"},
+            {"name": "transformers", "version": "4.55.2"},
+            {"name": "accelerate", "version": "1.10.0"},
+            {"name": "liger-kernel", "version": "0.5.10"},
+            {"name": "peft", "version": "0.17.0"},
+            {"name": "trl", "version": "0.21.0"},
+            {"name": "deepspeed", "version": "0.14.3"},
+            {"name": "datasets", "version": "4.0.0"},
+            {"name": "tensorboard", "version": "2.19.0"}
+          ]
+        openshift.io/imported-from: quay.io/mstoklus/workbench-images
+      from:
+        kind: DockerImage
+        name: odh-kubeflow-trainer-universal-workbench-image-rocm-2025-1
+      referencePolicy:
+        type: Source