ray-project · Aug 13, 2021 · Sep 7, 2021 · Sep 7, 2021 · Sep 7, 2021 · Sep 21, 2021
diff --git a/deploy/charts/ray/Chart.yaml b/deploy/charts/ray/Chart.yaml
@@ -4,7 +4,7 @@ description: A Helm chart for deployments of Ray on Kubernetes.
 type: application
 
 # Chart version.
-version: 0.1.0
+version: 0.1.0-24  # NOTE(review): SemVer orders the pre-release "0.1.0-24" below "0.1.0"; confirm this is intended
 
 # Ray version.
 appVersion: "latest"
diff --git a/deploy/charts/ray/templates/lb_service.yaml b/deploy/charts/ray/templates/lb_service.yaml
@@ -0,0 +1,32 @@
+# LoadBalancer Service exposing the Ray head pod's metrics, Ray Client and
+# Redis ports outside the cluster.
+apiVersion: v1
+kind: Service
+metadata:
+  name: {{ .Release.Name }}{{ .Values.clusterNameSuffix }}-head-lb
+spec:
+  # loadBalancerSourceRanges is emitted only when ranges are configured, with
+  # the list nested one level below the key.
+  # NOTE(review): with no ranges configured the Redis and Ray Client ports are
+  # not restricted by source IP at the Service level; confirm this is intended.
+  {{- with .Values.headNodeServiceAllowedRanges }}
+  loadBalancerSourceRanges:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+  ports:
+  - port: 54399
+    protocol: TCP
+    targetPort: 54399
+    name: metrics
+  - port: 10001
+    protocol: TCP
+    targetPort: 10001
+    name: client
+  - port: 6379
+    protocol: TCP
+    targetPort: 6379
+    name: redis
+  selector:
+    ray-node-type: head
+    ray-cluster-name: {{ .Release.Name }}{{ .Values.clusterNameSuffix }}
+  type: LoadBalancer
diff --git a/deploy/charts/ray/templates/operator_cluster_scoped.yaml b/deploy/charts/ray/templates/operator_cluster_scoped.yaml
@@ -53,13 +53,28 @@ spec:
         image: {{ .Values.operatorImage }}
         command: ["ray-operator"]
         env:
+        - name: TZ
+          value: UTC
         - name: AUTOSCALER_MAX_NUM_FAILURES
           value: "inf"
+        - name: RAY_CLUSTER_ADDRESS  # NOTE(review): the "ray-ray-head" service name is hard-coded; confirm it matches the deployed cluster's head service
+          value: ray-ray-head.{{ .Values.operatorNamespace }}.svc.cluster.local
+        - name: RAY_CLUSTER_PORT  # 10001 is the port raycluster.yaml marks as used by Ray Client
+          value: "10001"
         resources:
           requests:
             cpu: 1
             memory: 1Gi
           limits:
             memory: 2Gi
             cpu: 1
+        livenessProbe:  # the kubelet restarts the container when this probe fails
+          exec:
+            command:
+            - python
+            - /worker_heartbeat.py  # NOTE(review): must exist at this path in operatorImage; confirm
+          failureThreshold: 1  # a single failed probe marks the container as failed
+          initialDelaySeconds: 600  # first probe 10 minutes after container start
+          periodSeconds: 900  # probe every 15 minutes
+          timeoutSeconds: 240  # each probe may run for up to 4 minutes
 {{- end }}
diff --git a/deploy/charts/ray/templates/operator_namespaced.yaml b/deploy/charts/ray/templates/operator_namespaced.yaml
@@ -50,18 +50,33 @@ spec:
         image: {{ .Values.operatorImage }}
         command: ["ray-operator"]
         env:
+        - name: TZ
+          value: UTC
         - name: RAY_OPERATOR_POD_NAMESPACE
           valueFrom:
             fieldRef:
               fieldPath: metadata.namespace
         - name: AUTOSCALER_MAX_NUM_FAILURES
           value: "inf"
+        - name: RAY_CLUSTER_ADDRESS  # NOTE(review): the "ray-ray-head" service name is hard-coded; confirm it matches the deployed cluster's head service
+          value: ray-ray-head.{{ .Values.operatorNamespace }}.svc.cluster.local
+        - name: RAY_CLUSTER_PORT  # 10001 is the port raycluster.yaml marks as used by Ray Client
+          value: "10001"
         resources:
           requests:
             cpu: 1
             memory: 1Gi
           limits:
             memory: 2Gi
             cpu: 1
+        livenessProbe:  # the kubelet restarts the container when this probe fails
+          exec:
+            command:
+            - python
+            - /worker_heartbeat.py  # NOTE(review): must exist at this path in operatorImage; confirm
+          failureThreshold: 1  # a single failed probe marks the container as failed
+          initialDelaySeconds: 600  # first probe 10 minutes after container start
+          periodSeconds: 900  # probe every 15 minutes
+          timeoutSeconds: 240  # each probe may run for up to 4 minutes
 {{- end }}
 
diff --git a/deploy/charts/ray/templates/raycluster.yaml b/deploy/charts/ray/templates/raycluster.yaml
@@ -10,9 +10,9 @@ spec:
   # E.g., if the task requires adding more nodes then autoscaler will gradually
   # scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
   # This number should be > 0.
-  upscalingSpeed: 1.0
+  upscalingSpeed: 10.0  # scale-up chunks may be up to 10x the currently running node count
   # If a node is idle for this many minutes, it will be removed.
-  idleTimeoutMinutes: 5
+  idleTimeoutMinutes: 1  # idle nodes are removed after one minute
   # Specify the pod type for the ray head node (as configured below).
   headPodType: {{ .Values.headPodType }}
   # Specify the allowed pod types for this ray cluster and the resources they provide.
@@ -39,6 +39,11 @@ spec:
           - name: dshm
             emptyDir:
               medium: Memory
+          {{- range $volume := .persistentVolumes }}
+          - name: {{ $volume.claimName }}
+            persistentVolumeClaim:
+              claimName: {{ $volume.claimName }}
+          {{- end }}
           containers:
           - name: ray-node
             imagePullPolicy: Always
@@ -49,16 +54,43 @@ spec:
             args: ['trap : TERM INT; sleep infinity & wait;']
             ports:
             - containerPort: 6379  # Redis port
-            - containerPort: 10001  # Used by Ray Client
+            - containerPort: 10001 # Used by Ray Client
             - containerPort: 8265  # Used by Ray Dashboard
-            - containerPort: 8000 # Used by Ray Serve
-
+            - containerPort: 8000  # Used by Ray Serve
+            - containerPort: 54399 # Metrics
+            env:
+            - name: TZ
+              value: UTC
+            {{- if eq $key $.Values.headPodType }}
+            # Head pod only: allow an unlimited number of autoscaler failures
+            # so that the autoscaler does not crash unexpectedly.
+            - name: AUTOSCALER_MAX_NUM_FAILURES
+              value: "inf"
+            - name: PYTHONFAULTHANDLER  # enables Python's faulthandler tracebacks
+              value: "true"
+            {{- end }}
+            - name: RAY_BACKEND_LOG_LEVEL  # NOTE(review): presumably limits backend logs to fatal messages; confirm
+              value: fatal
+            - name: RAY_DISABLE_MEMORY_MONITOR  # NOTE(review): presumably turns off Ray's memory monitor; confirm
+              value: "1"
             # This volume allocates shared memory for Ray to use for its plasma
             # object store. If you do not provide this, Ray will fall back to
             # /tmp which cause slowdowns if is not a shared memory volume.
             volumeMounts:
             - mountPath: /dev/shm
               name: dshm
+            {{- range $volume := .persistentVolumes }}
+            {{- range $mount := $volume.mounts }}
+            - name: {{ $volume.claimName }}
+              mountPath: {{ $mount.mountPath }}
+              {{- if $mount.subPath }}
+              subPath: {{ $mount.subPath }}
+              {{- end }}
+              {{- if $mount.readOnly }}
+              readOnly: {{ $mount.readOnly }}
+              {{- end }}
+            {{- end }}
+            {{- end }}
             resources:
               requests:
                 cpu: {{ .CPU }}
@@ -76,16 +108,24 @@ spec:
                 {{- if .GPU }}
                 nvidia.com/gpu: {{ .GPU }}
                 {{- end }}
+            securityContext:
+              capabilities:
+                add:
+                - SYS_PTRACE  # NOTE(review): not gated by pod type in this hunk; presumably for attaching debuggers/profilers; confirm it is required
           {{- if .nodeSelector }}
           nodeSelector:
               {{- toYaml .nodeSelector | nindent 12 }}
           {{- end }}
+          {{- if .tolerations }}
+          tolerations:
+              {{- toYaml .tolerations | nindent 12 }}
+          {{- end }}
     {{- end }}
   # Commands to start Ray on the head node. You don't need to change this.
-  # Note dashboard-host is set to 0.0.0.0 so that Kubernetes can port forward.
+  # Note the dashboard is disabled and metrics are exported on port 54399.
   headStartRayCommands:
     - ray stop
-    - ulimit -n 65536; ray start --head --no-monitor --dashboard-host 0.0.0.0
+    - ulimit -n 65536; ray start --head --no-monitor --include-dashboard 0 --metrics-export-port=54399
   # Commands to start Ray on worker nodes. You don't need to change this.
   workerStartRayCommands:
     - ray stop

diff --git a/deploy/charts/ray/values.yaml b/deploy/charts/ray/values.yaml
@@ -6,6 +6,8 @@
 image: rayproject/ray:latest
 # headPodType is the podType used for the Ray head node (as configured below).
 headPodType: rayHeadType
+# CIDR source ranges allowed to reach the head node LoadBalancer service (rendered as loadBalancerSourceRanges).
+headNodeServiceAllowedRanges: []
 # podTypes is the list of pod configurations available for use as Ray nodes.
 podTypes:
     # The key for each podType is a user-defined string.
@@ -31,6 +33,10 @@ podTypes:
         rayResources: {}
         # Optionally, set a node selector for this podType: https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#nodeselector
         nodeSelector: {}
+        # Optionally, set tolerations (a list) for this podType: https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration
+        tolerations: []
+        # Optionally, mount PersistentVolumeClaims: a list of {claimName, mounts: [{mountPath, subPath, readOnly}]}.
+        persistentVolumes: []
     # The key for each podType is a user-defined string.
     rayWorkerType:
         # minWorkers is the minimum number of Ray workers of this pod type to keep running.
@@ -51,8 +57,12 @@ podTypes:
         # For example, rayResources: {"CPU": 0} can be used in the head podType to prevent Ray from scheduling tasks on the head.
         # See https://docs.ray.io/en/master/advanced.html#dynamic-remote-parameters for an example of usage of custom resources in a Ray task.
         rayResources: {}
+        # Optionally, set tolerations (a list) for this podType: https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration
+        tolerations: []
         # Optionally, set a node selector for this Pod type. See https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#nodeselector
         nodeSelector: {}
+        # Optionally, mount PersistentVolumeClaims: a list of {claimName, mounts: [{mountPath, subPath, readOnly}]}.
+        persistentVolumes: []
 
 
 # Operator settings: