Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions manifests/rhoai/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,45 @@ replacements:
name: kubeflow-trainer-controller-manager
fieldPaths:
- spec.template.spec.containers.0.image
# Workbench image wiring. Each entry below copies one image reference out of
# the rhoai-config ConfigMap and writes it into the `from.name` of the first
# tag (spec.tags.0) of the matching training-hub-universal-* ImageStream,
# overwriting the placeholder string stored there in the manifest.
# The index is positional: it relies on "2025.1" staying the first tag of each
# ImageStream.

# CUDA variant.
- source:
    kind: ConfigMap
    name: rhoai-config
    version: v1
    fieldPath: data.odh-kubeflow-trainer-universal-workbench-image-cuda-2025-1
  targets:
    - select:
        group: image.openshift.io
        version: v1
        kind: ImageStream
        name: training-hub-universal-cuda
      fieldPaths:
        - spec.tags.0.from.name

# ROCm variant.
- source:
    kind: ConfigMap
    name: rhoai-config
    version: v1
    fieldPath: data.odh-kubeflow-trainer-universal-workbench-image-rocm-2025-1
  targets:
    - select:
        group: image.openshift.io
        version: v1
        kind: ImageStream
        name: training-hub-universal-rocm
      fieldPaths:
        - spec.tags.0.from.name

# CPU variant.
- source:
    kind: ConfigMap
    name: rhoai-config
    version: v1
    fieldPath: data.odh-kubeflow-trainer-universal-workbench-image-cpu-2025-1
  targets:
    - select:
        group: image.openshift.io
        version: v1
        kind: ImageStream
        name: training-hub-universal-cpu
      fieldPaths:
        - spec.tags.0.from.name

# Labels to add to all resources and selectors.
labels:
Expand All @@ -41,6 +80,9 @@ resources:
- ../base/webhook
# - ../third-party/jobset #uncomment if jobset should be bundled under kubeflow trainer controller manager
- runtimes
- training-hub-universal-cuda-imagestream.yaml
- training-hub-universal-rocm-imagestream.yaml
- training-hub-universal-cpu-imagestream.yaml
- kubeflow-training-roles.yaml
- monitor.yaml

Expand Down
3 changes: 3 additions & 0 deletions manifests/rhoai/params.env
Original file line number Diff line number Diff line change
@@ -1 +1,4 @@
odh-kubeflow-trainer-controller-image=quay.io/opendatahub/trainer:v2.1.0
odh-kubeflow-trainer-universal-workbench-image-cuda-2025-1=quay.io/opendatahub/odh-training-th03-cuda128-torch28-py312-rhel9:latest
odh-kubeflow-trainer-universal-workbench-image-rocm-2025-1=quay.io/mstoklus/workbench-images:py312-rocm64-torch280-3
odh-kubeflow-trainer-universal-workbench-image-cpu-2025-1=quay.io/opendatahub/odh-training-th03-cuda128-torch28-py312-rhel9:latest-cpu
43 changes: 43 additions & 0 deletions manifests/rhoai/training-hub-universal-cpu-imagestream.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# ImageStream for the CPU variant of the Training Hub universal image.
# The opendatahub.io/notebook-* label and annotations carry the display
# metadata for the image: link, name, description, ordering and the lists of
# bundled software / Python packages.
apiVersion: image.openshift.io/v1
kind: ImageStream
metadata:
  name: training-hub-universal-cpu
  labels:
    opendatahub.io/notebook-image: "true"
  annotations:
    opendatahub.io/notebook-image-url: "https://github.com/opendatahub-io/distributed-workloads/tree/main/images/universal/training"
    opendatahub.io/notebook-image-name: "Training | Jupyter | PyTorch | CPU | Python"
    opendatahub.io/notebook-image-desc: "Training runtime image for CPU-based fine-tuning and distributed training."
    # Listed third, after the CUDA ("1") and ROCm ("2") variants.
    opendatahub.io/notebook-image-order: "3"
spec:
  lookupPolicy:
    local: true
  tags:
    - name: "2025.1"
      annotations:
        # NOTE(review): PyTorch is listed as 2.9.0 here, while the
        # imported-from repository name below contains "torch28" (and
        # "cuda128", for a CPU image) - confirm these versions against the
        # published image.
        # The two values below are JSON documents kept as literal block
        # scalars; do not add YAML comments inside them.
        opendatahub.io/notebook-software: |
          [
            {"name": "Python", "version": "v3.12"},
            {"name": "PyTorch", "version": "2.9.0"},
            {"name": "Kubeflow SDK", "version": "0.2.0"},
            {"name": "Training Hub", "version": "v0.4.0"}
          ]
        opendatahub.io/notebook-python-dependencies: |
          [
            {"name": "jupyterlab", "version": "4.4.9"},
            {"name": "transformers", "version": "4.57.1"},
            {"name": "accelerate", "version": "1.10.0"},
            {"name": "peft", "version": "0.17.0"},
            {"name": "trl", "version": "0.21.0"},
            {"name": "numpy", "version": "1.26.4"},
            {"name": "pandas", "version": "2.3.3"},
            {"name": "matplotlib-inline", "version": "0.1.7"},
            {"name": "tensorboard", "version": "2.19.0"},
            {"name": "instructlab-training", "version": "0.12.1"}
          ]
        openshift.io/imported-from: quay.io/opendatahub/odh-training-th03-cuda128-torch28-py312-rhel9
      from:
        kind: DockerImage
        # Placeholder, not a pullable reference: kustomization.yaml replaces
        # it with the value of the same-named key in the rhoai-config
        # ConfigMap (target path spec.tags.0.from.name).
        name: odh-kubeflow-trainer-universal-workbench-image-cpu-2025-1
      referencePolicy:
        type: Source
47 changes: 47 additions & 0 deletions manifests/rhoai/training-hub-universal-cuda-imagestream.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# ImageStream for the CUDA variant of the Training Hub universal image.
# The opendatahub.io/notebook-* label and annotations carry the display
# metadata for the image: link, name, description, ordering, recommended
# accelerator and the lists of bundled software / Python packages.
apiVersion: image.openshift.io/v1
kind: ImageStream
metadata:
  name: training-hub-universal-cuda
  labels:
    opendatahub.io/notebook-image: "true"
  annotations:
    opendatahub.io/notebook-image-url: "https://github.com/opendatahub-io/distributed-workloads/tree/main/images/universal/training"
    opendatahub.io/notebook-image-name: "Training | Jupyter | PyTorch | CUDA | Python"
    opendatahub.io/notebook-image-desc: "Training runtime image for CUDA-based fine-tuning and distributed training."
    # Listed first, ahead of the ROCm ("2") and CPU ("3") variants.
    opendatahub.io/notebook-image-order: "1"
    # JSON array encoded as a string, hence the single quotes.
    opendatahub.io/recommended-accelerators: '["nvidia.com/gpu"]'
spec:
  lookupPolicy:
    local: true
  tags:
    - name: "2025.1"
      annotations:
        # The two values below are JSON documents kept as literal block
        # scalars; do not add YAML comments inside them.
        opendatahub.io/notebook-software: |
          [
            {"name": "CUDA", "version": "12.8"},
            {"name": "Python", "version": "v3.12"},
            {"name": "PyTorch", "version": "2.8.0"},
            {"name": "Kubeflow SDK", "version": "0.2.0"},
            {"name": "Training Hub", "version": "v0.4.0"}
          ]
        opendatahub.io/notebook-python-dependencies: |
          [
            {"name": "jupyterlab", "version": "4.4.4"},
            {"name": "flash-attn", "version": "2.8.3"},
            {"name": "transformers", "version": "4.57.1"},
            {"name": "accelerate", "version": "1.10.0"},
            {"name": "liger-kernel", "version": "0.6.2"},
            {"name": "peft", "version": "0.17.0"},
            {"name": "triton", "version": "3.4.0"},
            {"name": "trl", "version": "0.21.0"},
            {"name": "numpy", "version": "2.3.5"},
            {"name": "pandas", "version": "2.3.3"},
            {"name": "matplotlib", "version": "3.10.7"},
            {"name": "scikit-learn", "version": "1.7.2"}
          ]
        # Review discussion on this annotation: a reviewer asked whether the
        # value should be fetched dynamically, since it is expected to come
        # from a registry at some point. Author's answer: the Notebooks repo
        # keeps it static; it can be made dynamic after the images move to
        # the final registry.
        openshift.io/imported-from: quay.io/opendatahub/odh-training-th03-cuda128-torch28-py312-rhel9
      from:
        kind: DockerImage
        # Placeholder, not a pullable reference: kustomization.yaml replaces
        # it with the value of the same-named key in the rhoai-config
        # ConfigMap (target path spec.tags.0.from.name).
        name: odh-kubeflow-trainer-universal-workbench-image-cuda-2025-1
      referencePolicy:
        type: Source
43 changes: 43 additions & 0 deletions manifests/rhoai/training-hub-universal-rocm-imagestream.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# ImageStream for the ROCm variant of the Training Hub universal image.
# The opendatahub.io/notebook-* label and annotations carry the display
# metadata for the image: link, name, description, ordering, recommended
# accelerator and the lists of bundled software / Python packages.
apiVersion: image.openshift.io/v1
kind: ImageStream
metadata:
  name: training-hub-universal-rocm
  labels:
    opendatahub.io/notebook-image: "true"
  annotations:
    opendatahub.io/notebook-image-url: "https://github.com/opendatahub-io/distributed-workloads/tree/main/images/universal/training"
    opendatahub.io/notebook-image-name: "Training | Jupyter | PyTorch | ROCm | Python"
    opendatahub.io/notebook-image-desc: "Training runtime image for ROCm-based fine-tuning and distributed training."
    # Listed second, between the CUDA ("1") and CPU ("3") variants.
    opendatahub.io/notebook-image-order: "2"
    # JSON array encoded as a string, hence the single quotes.
    opendatahub.io/recommended-accelerators: '["amd.com/gpu"]'
spec:
  lookupPolicy:
    local: true
  tags:
    - name: "2025.1"
      annotations:
        # NOTE(review): unlike the CUDA and CPU variants, this software list
        # has no "Training Hub" entry - confirm whether that is intentional.
        # The two values below are JSON documents kept as literal block
        # scalars; do not add YAML comments inside them.
        opendatahub.io/notebook-software: |
          [
            {"name": "ROCm", "version": "6.4"},
            {"name": "Python", "version": "v3.12"},
            {"name": "PyTorch", "version": "2.8.0"},
            {"name": "Kubeflow SDK", "version": "0.2.0"}
          ]
        opendatahub.io/notebook-python-dependencies: |
          [
            {"name": "jupyterlab", "version": "4.4.9"},
            {"name": "transformers", "version": "4.55.2"},
            {"name": "accelerate", "version": "1.10.0"},
            {"name": "liger-kernel", "version": "0.5.10"},
            {"name": "peft", "version": "0.17.0"},
            {"name": "trl", "version": "0.21.0"},
            {"name": "deepspeed", "version": "0.14.3"},
            {"name": "datasets", "version": "4.0.0"},
            {"name": "tensorboard", "version": "2.19.0"}
          ]
        # NOTE(review): this looks like a personal quay.io namespace rather
        # than the opendatahub organisation used by the CUDA and CPU
        # variants - confirm the intended long-term location.
        openshift.io/imported-from: quay.io/mstoklus/workbench-images
      from:
        kind: DockerImage
        # Placeholder, not a pullable reference: kustomization.yaml replaces
        # it with the value of the same-named key in the rhoai-config
        # ConfigMap (target path spec.tags.0.from.name).
        name: odh-kubeflow-trainer-universal-workbench-image-rocm-2025-1
      referencePolicy:
        type: Source
Loading