File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -5,3 +5,8 @@ resources:
# Runtime manifests listed under `resources:` (the key itself sits above
# this hunk). Each file must appear exactly once: listing the same manifest
# twice registers the same resource ID twice and fails the build.
- torch_cuda_251.yaml
- torch_rocm_241.yaml
- torch_rocm_251.yaml
- torch_distributed_th03_cuda128_torch28_py312.yaml
- torch_distributed.yaml
- training_hub_th03_cuda128_torch28_py312.yaml
- training_hub.yaml
---
# Cluster-scoped training runtime labelled for the `torch` framework.
apiVersion: trainer.kubeflow.org/v1alpha1
kind: ClusterTrainingRuntime
metadata:
  name: torch-distributed
  labels:
    trainer.kubeflow.org/framework: torch
spec:
  # ML policy: one node; the per-node process count is left to `auto`.
  mlPolicy:
    numNodes: 1
    torch:
      numProcPerNode: auto
  template:
    spec:
      replicatedJobs:
        # Single replicated job; its pod template is labelled as the
        # `trainer` ancestor step.
        - name: node
          template:
            metadata:
              labels:
                trainer.kubeflow.org/trainjob-ancestor-step: trainer
            spec:
              template:
                spec:
                  containers:
                    # NOTE(review): `:latest` is a floating tag; consider
                    # pinning a version or digest once one is published.
                    - name: node
                      image: quay.io/opendatahub/odh-training-th03-cuda128-torch28-py312-rhel9:latest
---
# Cluster-scoped training runtime labelled for the `torch` framework, tied to
# the th03 / CUDA 12.8 / torch 2.8 / Python 3.12 training image.
apiVersion: trainer.kubeflow.org/v1alpha1
kind: ClusterTrainingRuntime
metadata:
  # The name carries the image-variant suffix so that it does not collide
  # with the generic `torch-distributed` ClusterTrainingRuntime: two
  # cluster-scoped objects of the same kind cannot share a name.
  # NOTE(review): suffix follows the versioned manifest file name
  # (torch_distributed_th03_cuda128_torch28_py312.yaml) - confirm.
  name: torch-distributed-th03-cuda128-torch28-py312
  labels:
    trainer.kubeflow.org/framework: torch
spec:
  # ML policy: one node; the per-node process count is left to `auto`.
  mlPolicy:
    numNodes: 1
    torch:
      numProcPerNode: auto
  template:
    spec:
      replicatedJobs:
        # Single replicated job; its pod template is labelled as the
        # `trainer` ancestor step.
        - name: node
          template:
            metadata:
              labels:
                trainer.kubeflow.org/trainjob-ancestor-step: trainer
            spec:
              template:
                spec:
                  containers:
                    # NOTE(review): `:latest` is a floating tag; consider
                    # pinning a version or digest once one is published.
                    - name: node
                      image: quay.io/opendatahub/odh-training-th03-cuda128-torch28-py312-rhel9:latest
---
# Cluster-scoped training runtime labelled for the `training-hub` framework.
apiVersion: trainer.kubeflow.org/v1alpha1
kind: ClusterTrainingRuntime
metadata:
  name: training-hub
  labels:
    trainer.kubeflow.org/framework: training-hub
spec:
  # The framework label is `training-hub`, but the ML policy is configured
  # through the `torch` section: one node, per-node process count `auto`.
  mlPolicy:
    numNodes: 1
    torch:
      numProcPerNode: auto
  template:
    spec:
      replicatedJobs:
        # Single replicated job; its pod template is labelled as the
        # `trainer` ancestor step.
        - name: node
          template:
            metadata:
              labels:
                trainer.kubeflow.org/trainjob-ancestor-step: trainer
            spec:
              template:
                spec:
                  containers:
                    # NOTE(review): `:latest` is a floating tag; consider
                    # pinning a version or digest once one is published.
                    - name: node
                      image: quay.io/opendatahub/odh-training-th03-cuda128-torch28-py312-rhel9:latest
---
# Cluster-scoped training runtime labelled for the `training-hub` framework,
# named after the th03 / CUDA 12.8 / torch 2.8 / Python 3.12 image variant.
apiVersion: trainer.kubeflow.org/v1alpha1
kind: ClusterTrainingRuntime
metadata:
  # NOTE(review): the name reads `training-hub03-...`, while the manifest
  # file name uses `training_hub_th03_...`; confirm the spelling is intended
  # before anything starts referencing this runtime by name.
  name: training-hub03-cuda128-torch28-py312
  labels:
    trainer.kubeflow.org/framework: training-hub
spec:
  # The framework label is `training-hub`, but the ML policy is configured
  # through the `torch` section: one node, per-node process count `auto`.
  mlPolicy:
    numNodes: 1
    torch:
      numProcPerNode: auto
  template:
    spec:
      replicatedJobs:
        # Single replicated job; its pod template is labelled as the
        # `trainer` ancestor step.
        - name: node
          template:
            metadata:
              labels:
                trainer.kubeflow.org/trainjob-ancestor-step: trainer
            spec:
              template:
                spec:
                  containers:
                    # NOTE(review): `:latest` is a floating tag; consider
                    # pinning a version or digest once one is published.
                    - name: node
                      image: quay.io/opendatahub/odh-training-th03-cuda128-torch28-py312-rhel9:latest
You can’t perform that action at this time.
0 commit comments