redhat-na-ssa · codekow · Apr 14, 2025 · Apr 14, 2025 · Apr 17, 2025 · Apr 18, 2025
diff --git a/.pyspelling.yml b/.pyspelling.yml
@@ -45,5 +45,5 @@ matrix:
     pipeline:
       - pyspelling.filters.python:
     sources:
-      - '!venv/**|!scratch/**|**/*.py'
+      - '!venv/**|!components/dump/**|!scratch/**|**/*.py'
     default_encoding: utf-8
diff --git a/.wordlist-md b/.wordlist-md
@@ -1,3 +1,4 @@
+
 ack
 ACK
 acm
@@ -39,6 +40,7 @@ Authorino
 Autogenerated
 autoscale
 autoscaler
+autoscalers
 autoscaling
 aws
 backplane
@@ -255,6 +257,7 @@ NMState
 nodeFeatureDiscovery
 noobaa
 Noobaa
+NoSchedule
 NuGet
 nv
 nvidia
@@ -421,5 +424,6 @@ wtoctl
 wunderio
 www
 XDG
+xlarge
 yaml
 YOLO
diff --git a/.yamllint b/.yamllint
@@ -25,3 +25,4 @@ ignore:
   - "components/containers/web-terminal/src/*"
   - "components/operators/gpu-operator-certified/operator/components/console-plugin-helm/*"
   - "components/charts/*/templates/*"
+  - "components/dump/*"
diff --git a/components/dump/autoscale-gpu/README.md b/components/dump/autoscale-gpu/README.md
@@ -0,0 +1,55 @@
+# Steps
+
+Create an L4 GPU MachineSet (g6.xlarge).
+
+In the machine set template add the following:
+
+```yaml
+spec:
+  template:
+    spec:
+      taints:
+      - effect: NoSchedule
+        key: nvidia.com/gpu
+        value: ''
+      metadata:
+        labels:
+          cluster-api/accelerator: nvidia-l4
+```
+
+Create an A10 GPU MachineSet (g5.xlarge).
+
+Same as above, this time:
+
+```yaml
+spec:
+  template:
+    spec:
+      taints:
+      - effect: NoSchedule
+        key: nvidia.com/gpu
+        value: ''
+      metadata:
+        labels:
+          cluster-api/accelerator: nvidia-a10g
+```
+
+Install the NFD Operator and an NFD Instance as usual.
+
+Install the GPU Operator and a GPU Cluster Policy as usual.
+
+Create the autoscalers:
+
+```bash
+oc create -f cluster-autoscaler.yaml
+oc create -f l4-autoscaler.yaml
+oc create -f a10-autoscaler.yaml
+```
+
+Finally, deploy the sample apps:
+
+```bash
+oc new-project sandbox
+oc create -f cuda-a10.yaml
+oc create -f cuda-l4.yaml
+```
diff --git a/components/dump/autoscale-gpu/a10-autoscaler.yaml b/components/dump/autoscale-gpu/a10-autoscaler.yaml
@@ -0,0 +1,12 @@
+apiVersion: autoscaling.openshift.io/v1beta1
+kind: MachineAutoscaler
+metadata:
+  name: gpu-a10-machineset-autoscaler
+  namespace: openshift-machine-api
+spec:
+  minReplicas: 0  # lower replica bound for the target MachineSet
+  maxReplicas: 4  # upper replica bound for the target MachineSet
+  scaleTargetRef:  # the MachineSet this autoscaler resizes
+    apiVersion: machine.openshift.io/v1beta1
+    kind: MachineSet
+    name: gpu-a10-machineset
diff --git a/components/dump/autoscale-gpu/aws-gpu-machineset/README.md b/components/dump/autoscale-gpu/aws-gpu-machineset/README.md
@@ -0,0 +1,24 @@
+# aws-gpu-machineset
+
+## Purpose
+
+This component is designed to set up a MachineSet with GPUs on an AWS-based OpenShift cluster.
+
+This component triggers a job that creates a MachineSet based on your current MachineSet.
+
+This component has been tested using AWS-based OpenShift instances provisioned by demo.redhat.com.
+
+## Usage
+
+This component can be added to a base by adding the `components` section to your overlay `kustomization.yaml` file:
+
+```yaml
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+resources:
+  - ../../base
+
+components:
+  - ../../components/aws-gpu-machineset
+```
diff --git a/components/dump/autoscale-gpu/aws-gpu-machineset/job.sh b/components/dump/autoscale-gpu/aws-gpu-machineset/job.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+# shellcheck disable=SC1091
+. /scripts/ocp.sh
+
+INSTANCE_TYPE=${INSTANCE_TYPE:-g4dn.4xlarge}
+
+ocp_aws_cluster || exit 0
+ocp_aws_machineset_clone_worker g6.xlarge gpu-l4-machineset
+ocp_aws_machineset_clone_worker g5.xlarge gpu-a10-machineset
+ocp_aws_machineset_fix_storage gpu-l4-machineset 400
+ocp_aws_machineset_fix_storage gpu-a10-machineset 400
diff --git a/components/dump/autoscale-gpu/aws-gpu-machineset/job.yaml b/components/dump/autoscale-gpu/aws-gpu-machineset/job.yaml
@@ -0,0 +1,90 @@
+---
+apiVersion: v1
+kind: ServiceAccount  # identity used by the job-aws-gpu-machineset Job's pod
+metadata:
+  name: job-aws-gpu-machineset
+  namespace: nvidia-gpu-operator
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole  # permissions needed by the job-aws-gpu-machineset ServiceAccount
+metadata:
+  name: job-aws-gpu-machineset
+rules:
+  - apiGroups:  # all verbs on MachineSets
+      - machine.openshift.io
+    resources:
+      - machinesets
+    verbs:
+      - '*'
+  - apiGroups:  # all verbs on MachineAutoscalers
+      - autoscaling.openshift.io
+    resources:
+      - machineautoscalers
+    verbs:
+      - '*'
+  - apiGroups:  # core API group: read access restricted to the Secret named aws-creds
+      - ''
+    resources:
+      - secrets
+    resourceNames:
+      - aws-creds
+    verbs:
+      - get
+      - list
+  # - nonResourceURLs:
+  #     - '*'
+  #   verbs:
+  #     - '*'
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding  # binds ClusterRole job-aws-gpu-machineset to the ServiceAccount of the same name
+metadata:
+  name: job-aws-gpu-machineset
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: job-aws-gpu-machineset
+subjects:
+  - kind: ServiceAccount
+    name: job-aws-gpu-machineset
+    namespace: nvidia-gpu-operator  # must match the ServiceAccount's namespace
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  # Explicit name only: generateName has no effect once name is set.
+  name: job-aws-gpu-machineset
+  namespace: nvidia-gpu-operator
+  # annotations:
+  #   argocd.argoproj.io/hook: Sync
+  #   argocd.argoproj.io/hook-delete-policy: HookSucceeded
+spec:
+  template:
+    spec:
+      containers:
+        - name: job-aws-gpu-machineset
+          # image: image-registry.openshift-image-registry.svc:5000/openshift/tools:latest
+          image: registry.redhat.io/openshift4/ose-cli  # NOTE(review): untagged, so :latest is pulled — consider pinning
+          env:
+            - name: INSTANCE_TYPE
+              value: "g4dn.4xlarge"  # NOTE(review): job.sh passes literal instance types to its helpers — confirm this is still consumed
+            - name: NAMESPACE  # namespace the pod runs in, via the downward API
+              valueFrom:
+                fieldRef:
+                  fieldPath: metadata.namespace
+          command:
+            - /bin/bash
+            - -c
+            - /scripts/job.sh
+          volumeMounts:
+            - name: scripts
+              mountPath: /scripts
+      volumes:
+        - name: scripts
+          configMap:
+            name: job-aws-gpu-machineset
+            defaultMode: 0755  # octal rwxr-xr-x: /scripts/job.sh is executed directly, so it must be executable
+      restartPolicy: Never
+      terminationGracePeriodSeconds: 30
+      # serviceAccountName alone is sufficient; serviceAccount is its deprecated alias.
+      serviceAccountName: job-aws-gpu-machineset
diff --git a/components/dump/autoscale-gpu/aws-gpu-machineset/kustomization.yaml b/components/dump/autoscale-gpu/aws-gpu-machineset/kustomization.yaml
@@ -0,0 +1,16 @@
+apiVersion: kustomize.config.k8s.io/v1alpha1
+kind: Component  # reusable piece meant to be listed under `components:` in an overlay
+
+resources:
+  - job.yaml
+
+generatorOptions:
+  disableNameSuffixHash: true  # emit the ConfigMap under its exact name, without a content-hash suffix
+
+configMapGenerator:
+  - name: job-aws-gpu-machineset  # same name the Job's `scripts` volume references
+    namespace: nvidia-gpu-operator
+    files:  # each file becomes a key in the ConfigMap; the Job mounts them under /scripts
+      - job.sh
+      - ocp.sh
+      # - https://raw.githubusercontent.com/redhat-na-ssa/demo-ai-gitops-catalog/main/scripts/library/ocp.sh