# A unique identifier for the head node and workers of this cluster.
cluster_name: <<<CLUSTER_NAME>>>

# The minimum number of worker nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: <<<MIN_WORKERS>>>

# The maximum number of worker nodes to launch in addition to the head
# node. This takes precedence over min_workers.
max_workers: <<<MAX_WORKERS>>>

# If set, all commands are executed on all nodes inside this Docker container,
# and all the necessary ports are opened to support the Ray cluster.
# An empty string means Docker is disabled.
docker:
    image: "" # e.g., tensorflow/tensorflow:1.5.0-py3
    container_name: "" # e.g. ray_docker

# The autoscaler will scale up the cluster to this target fraction of resource
# usage. For example, if a cluster of 10 nodes is 100% busy and
# target_utilization is 0.8, it would resize the cluster to 13. This fraction
# can be decreased to increase the aggressiveness of upscaling.
# This value must be less than 1.0 for scaling to happen.
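# (In the example above, 13 = ceil(10 / 0.8): the smallest cluster size at
# which utilization would be at most the 0.8 target.)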
target_utilization_fraction: 0.8

# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5

# Cloud-provider specific configuration.
provider:
    type: aws
    region: us-west-2
    # Availability zone(s), comma-separated, that nodes may be launched in.
    # Nodes are currently spread between zones in a round-robin fashion;
    # however, this implementation detail should not be relied upon.
    availability_zone: us-west-2a,us-west-2b

# How Ray will authenticate with newly launched nodes.
auth:
    ssh_user: ubuntu
# By default Ray creates a new private keypair, but you can also use your own.
# If you do so, make sure to also set "KeyName" in the head and worker node
# configurations below.
# ssh_private_key: /path/to/your/key.pem
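# A minimal sketch of using your own key (the "my-key" name below is
# illustrative; it must name the EC2 key pair that the .pem file belongs to):
#   auth:
#       ssh_user: ubuntu
#       ssh_private_key: /path/to/your/key.pem
#   head_node:
#       KeyName: my-key
#   worker_nodes:
#       KeyName: my-key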

# Provider-specific config for the head node, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
head_node:
    InstanceType: <<<HEAD_TYPE>>>
    ImageId: ami-0d0ff0945ae093aea # Amazon Deep Learning AMI (Ubuntu) 12/12/2018

    # You can provision additional disk space with a configuration like the following:
    BlockDeviceMappings:
        - DeviceName: /dev/sda1
          Ebs:
              VolumeSize: 100

    # Additional options can be found in the boto docs.

# Provider-specific config for worker nodes, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
worker_nodes:
    InstanceType: <<<WORKER_TYPE>>>
    ImageId: ami-0d0ff0945ae093aea # Amazon Deep Learning AMI (Ubuntu) 12/12/2018

    # Run workers on spot instances by default. Comment this out to use on-demand.
    InstanceMarketOptions:
        MarketType: spot
        # Additional options can be found in the boto docs, e.g.
        #   SpotOptions:
        #       MaxPrice: MAX_HOURLY_PRICE

    # Additional options can be found in the boto docs.

# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
file_mounts: {
#    "/path1/on/remote/machine": "/path1/on/local/machine",
#    "/path2/on/remote/machine": "/path2/on/local/machine",
}

# List of shell commands to run to set up nodes.
setup_commands:
    # Put the TensorFlow conda environment's binaries first on the PATH.
    - echo 'export PATH="$HOME/anaconda3/envs/tensorflow_<<<PYTHON_VERSION>>>/bin:$PATH"' >> ~/.bashrc
    # Download and install the Ray wheel only if `ray`/`rllib` are not already present.
    - ray || wget https://s3-us-west-2.amazonaws.com/ray-wheels/latest/<<<RAY_VERSION>>>-<<<WHEEL_STR>>>-manylinux1_x86_64.whl
    - rllib || pip install -U <<<RAY_VERSION>>>-<<<WHEEL_STR>>>-manylinux1_x86_64.whl[rllib]
    - pip install -U tensorflow-gpu
    # Consider uncommenting these if you also want to run apt-get commands during setup
    # - sudo pkill -9 apt-get || true
    # - sudo pkill -9 dpkg || true
    # - sudo dpkg --configure -a

# Custom commands that will be run on the head node after common setup.
head_setup_commands:
    - pip install boto3==1.4.8 # 1.4.8 adds InstanceMarketOptions

# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []

# Commands to start Ray on the head node. You don't need to change this.
head_start_ray_commands:
    - ray stop
    # ulimit raises the open-file-descriptor limit for the shell that starts Ray.
    - ulimit -n 65536; ray start --head --redis-port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml

# Commands to start Ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
    - ray stop
    - ulimit -n 65536; ray start --redis-address=$RAY_HEAD_IP:6379 --object-manager-port=8076
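
# To launch a cluster from this config with the autoscaler CLI (assuming the
# file is saved as cluster.yaml, an illustrative filename):
#   ray up cluster.yaml     # create or update the cluster
#   ray down cluster.yaml   # tear it down when finished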