# A unique identifier for the head node and workers of this cluster.
cluster_name: default
# The minimum number of worker nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 2
# The maximum number of worker nodes to launch in addition to the head
# node. This takes precedence over min_workers.
max_workers: 10
# The initial number of worker nodes to launch in addition to the head
# node. When the cluster is first brought up (or when it is refreshed with a
# subsequent `ray up`) this number of nodes will be started.
initial_workers: 2
# Whether or not to autoscale aggressively. If this is enabled, then whenever
# the autoscaler would start more workers, it starts at least enough to bring
# the cluster up to initial_workers.
autoscaling_mode: default
# The autoscaler will scale up the cluster to this target fraction of resource
# usage. For example, if a cluster of 10 nodes is 100% busy and
# target_utilization is 0.8, it would resize the cluster to 13. This fraction
# can be decreased to increase the aggressiveness of upscaling.
# This value must be less than 1.0 for scaling to happen.
target_utilization_fraction: 0.8
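# For concreteness, the example above works out to 10 / 0.8 = 12.5, rounded
# up to 13 nodes: the busy node count divided by the target fraction.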
# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5
# Kubernetes resources that need to be configured for the autoscaler to be
# able to manage the Ray cluster. If any of the provided resources don't
# exist, the autoscaler will attempt to create them. If this fails, you may
# not have the required permissions and will have to request them to be
# created by your cluster administrator.
provider:
  type: kubernetes
  # Exposing external IP addresses for ray pods isn't currently supported.
  use_internal_ips: true
  # Namespace to use for all resources created.
  namespace: 58jmk88t9it
# Kubernetes pod config for the head node pod.
head_node:
  apiVersion: v1
  kind: Pod
  metadata:
    # Automatically generates a name for the pod with this prefix.
    generateName: ray-head-
    # Must match the head node service selector above if a head node
    # service is required.
    labels:
      component: ray-head
  spec:
    # Change this if you altered the autoscaler_service_account above
    # or want to provide your own.
    serviceAccountName: 58jmk88t9it-writer
    # Restarting the head node automatically is not currently supported.
    # If the head node goes down, `ray up` must be run again.
    restartPolicy: Never
    # This volume allocates shared memory for Ray to use for its plasma
    # object store. If you do not provide this, Ray will fall back to
    # /tmp, which will cause slowdowns if it is not a shared memory volume.
    volumes:
      - name: dshm
        emptyDir:
          medium: Memory
    containers:
      - name: ray-node
        imagePullPolicy: Always
        # You are free (and encouraged) to use your own container image,
        # but it should have the following installed:
        #   - rsync (used for `ray rsync` commands and file mounts)
        #   - screen (used for `ray attach`)
        #   - kubectl (used by the autoscaler to manage worker pods)
        image: lchu/horovod-ray:v3
        # Do not change this command - it keeps the pod alive until it is
        # explicitly killed.
        command: ["/bin/bash", "-c", "--"]
        args: ["trap : TERM INT; sleep infinity & wait;"]
        ports:
          - containerPort: 6379   # Redis port.
          - containerPort: 6380   # Redis port.
          - containerPort: 6381   # Redis port.
          - containerPort: 12345  # Ray internal communication.
          - containerPort: 12346  # Ray internal communication.
        # This volume allocates shared memory for Ray to use for its plasma
        # object store. If you do not provide this, Ray will fall back to
        # /tmp, which will cause slowdowns if it is not a shared memory volume.
        volumeMounts:
          - mountPath: /dev/shm
            name: dshm
        resources:
          requests:
            cpu: 2
            memory: 8Gi
            ephemeral-storage: 2Gi
          limits:
            # The maximum memory that this pod is allowed to use. The
            # limit will be detected by ray and split to use 10% for
            # redis, 30% for the shared memory object store, and the
            # rest for application memory. If this limit is not set and
            # the object store size is not set manually, ray will
            # allocate a very large object store in each pod that may
            # cause problems for other pods.
            memory: 8Gi
            cpu: 2
            ephemeral-storage: 2Gi
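        # Read against the 8Gi limit above, that split works out to roughly
        # 0.8Gi for redis, 2.4Gi for the plasma object store, and about
        # 4.8Gi left for application memory.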
        env:
          # This is used in the head_start_ray_commands below so that
          # Ray can spawn the correct number of processes. Omitting this
          # may lead to degraded performance.
          - name: MY_CPU_REQUEST
            valueFrom:
              resourceFieldRef:
                resource: requests.cpu
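        # With the 2-CPU request above, MY_CPU_REQUEST should resolve to 2,
        # so the head start command further down effectively runs
        # `ray start --head --num-cpus=2 ...`.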
# Kubernetes pod config for worker node pods.
worker_nodes:
  apiVersion: v1
  kind: Pod
  metadata:
    # Automatically generates a name for the pod with this prefix.
    generateName: ray-worker-
    # Must match the worker node service selector above if a worker node
    # service is required.
    labels:
      component: ray-worker
  spec:
    serviceAccountName: default
    # Worker nodes will be managed automatically by the head node, so
    # do not change the restart policy.
    restartPolicy: Never
    # This volume allocates shared memory for Ray to use for its plasma
    # object store. If you do not provide this, Ray will fall back to
    # /tmp, which will cause slowdowns if it is not a shared memory volume.
    volumes:
      - name: dshm
        emptyDir:
          medium: Memory
    containers:
      - name: ray-node
        imagePullPolicy: Always
        # You are free (and encouraged) to use your own container image,
        # but it should have the following installed:
        #   - rsync (used for `ray rsync` commands and file mounts)
        image: lchu/horovod-ray:v3
        # Do not change this command - it keeps the pod alive until it is
        # explicitly killed.
        command: ["/bin/bash", "-c", "--"]
        args: ["trap : TERM INT; sleep infinity & wait;"]
        ports:
          - containerPort: 12345  # Ray internal communication.
          - containerPort: 12346  # Ray internal communication.
        # This volume allocates shared memory for Ray to use for its plasma
        # object store. If you do not provide this, Ray will fall back to
        # /tmp, which will cause slowdowns if it is not a shared memory volume.
        volumeMounts:
          - mountPath: /dev/shm
            name: dshm
        resources:
          requests:
            cpu: 2
            memory: 8Gi
            ephemeral-storage: 2Gi
          limits:
            # This memory limit will be detected by ray and split into
            # 30% for plasma, and 70% for workers.
            memory: 8Gi
            cpu: 2
            ephemeral-storage: 2Gi
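        # With the 8Gi limit above, that is roughly 2.4Gi for the plasma
        # object store and 5.6Gi for the Ray workers.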
        env:
          # This is used in the worker_start_ray_commands below so that
          # Ray can spawn the correct number of processes. Omitting this
          # may lead to degraded performance.
          - name: MY_CPU_REQUEST
            valueFrom:
              resourceFieldRef:
                resource: requests.cpu
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
file_mounts: {
  # "/path1/on/remote/machine": "/path1/on/local/machine",
  # "/path2/on/remote/machine": "/path2/on/local/machine",
}
# List of commands that will be run before `setup_commands`. If docker is
# enabled, these commands will run outside the container and before docker
# is setup.
initialization_commands: []
# List of shell commands to run to set up nodes.
setup_commands: []
# Custom commands that will be run on the head node after common setup.
head_setup_commands: []
# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []
# Command to start ray on the head node. You don't need to change this.
# Note that Kubernetes can only port-forward the dashboard if webui-host is
# set to 0.0.0.0 (the flag is left commented out in the command below).
head_start_ray_commands:
  - ray stop
  - ulimit -n 65536; ray start --head --num-cpus=$MY_CPU_REQUEST --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml # --webui-host 0.0.0.0
# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
  - ray stop
  - ulimit -n 65536; ray start --num-cpus=$MY_CPU_REQUEST --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
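
# ----------------------------------------------------------------------------
# Rough usage sketch (not part of the config): assuming this file is saved
# locally as ray_cluster.yaml and kubectl already points at the target
# cluster, the cluster is typically driven with the Ray autoscaler CLI:
#
#   ray up ray_cluster.yaml       # launch the head pod and start autoscaling
#   ray attach ray_cluster.yaml   # open a shell on the head node (via screen)
#   ray down ray_cluster.yaml     # tear the cluster back down
#
# If --webui-host is uncommented in head_start_ray_commands above, the
# dashboard can be reached with something like
#   kubectl -n 58jmk88t9it port-forward <ray-head-pod> 8265:8265
# where 8265 is Ray's default dashboard port in recent releases and
# <ray-head-pod> stands for the generated head pod name.
# ----------------------------------------------------------------------------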