Merge pull request #1442 from porter-dev/main

yosefmih · web-flow · commit e2ce68d51dea · 2024-11-21T17:05:06.000-05:00
Move gpu enabled jobs to prod
diff --git a/applications/job/templates/cronjob.yaml b/applications/job/templates/cronjob.yaml
@@ -169,6 +169,11 @@ spec:
                 requests:
                   cpu: {{ .Values.resources.requests.cpu }}
                   memory: {{ .Values.resources.requests.memory }}
+              {{- /* Optional GPU request. Kubernetes requires an extended-resource request
+                     to equal its limit, so keep this in step with limits.nvidiaGpu. */}}
+              {{- if .Values.resources.requests.nvidiaGpu }}
+                  nvidia.com/gpu: {{ .Values.resources.requests.nvidiaGpu }}
+              {{- end }}
                 limits:
                 {{- if .Values.resources.setCPULimits }}
                   {{- if .Values.resources.limits.cpu }}
@@ -182,6 +187,9 @@ spec:
                 {{- else }}
                   memory: {{ .Values.resources.requests.memory }}
                 {{- end }}
+                {{- with .Values.resources.limits.nvidiaGpu | default .Values.resources.requests.nvidiaGpu }}
+                  nvidia.com/gpu: {{ . }}  # defaults to the GPU request: a GPU request needs a matching limit
+                {{- end }}
             - name: sidecar
               image: ghcr.io/porter-dev/job_sidecar_container:31e471f4d
               imagePullPolicy: Always
@@ -262,6 +270,13 @@ spec:
               value: {{ $nodeGroup.id | quote }}
               effect: "NoSchedule"
           {{- end }}
+          # EKS and GKE inject the nvidia.com/gpu toleration automatically, but AKS does not.
+          # To stay cloud agnostic, add it ourselves whenever a GPU is requested.
+          {{- if .Values.resources.requests.nvidiaGpu }}
+            - key: "nvidia.com/gpu"
+              operator: "Exists"
+              effect: "NoSchedule"
+          {{- end }}
       backoffLimit: 0
       {{- if (.Values.sidecar.timeout) }}
       activeDeadlineSeconds: {{ .Values.sidecar.timeout }}
diff --git a/applications/web/templates/deployment.yaml b/applications/web/templates/deployment.yaml
@@ -462,6 +462,13 @@ spec:
           value: {{ $nodeGroup.id | quote }}
           effect: "NoSchedule"
       {{- end }}
+      # EKS and GKE inject the nvidia.com/gpu toleration automatically, but AKS does not.
+      # To stay cloud agnostic, add it ourselves whenever a GPU is requested.
+      {{- if .Values.resources.requests.nvidiaGpu }}
+        - key: "nvidia.com/gpu"
+          operator: "Exists"
+          effect: "NoSchedule"
+      {{- end }}
       {{- if .Values.topology.enabled }}
       topologySpreadConstraints: 
         - maxSkew: {{ .Values.topology.maxSkew }}
diff --git a/applications/worker/templates/deployment.yaml b/applications/worker/templates/deployment.yaml
@@ -375,6 +375,13 @@ spec:
           value: {{ $nodeGroup.id | quote }}
           effect: "NoSchedule"
       {{- end }}
+      # EKS and GKE inject the nvidia.com/gpu toleration automatically, but AKS does not.
+      # To stay cloud agnostic, add it ourselves whenever a GPU is requested.
+      {{- if .Values.resources.requests.nvidiaGpu }}
+        - key: "nvidia.com/gpu"
+          operator: "Exists"
+          effect: "NoSchedule"
+      {{- end }}
       {{- if .Values.topology.enabled }}
       topologySpreadConstraints:
         - maxSkew: {{ .Values.topology.maxSkew }}