Skip to content

Commit fa99fda

Browse files
authored
Application Stress Tests (#3612)
1 parent c28e6d4 commit fa99fda

File tree

5 files changed

+269
-0
lines changed

5 files changed

+269
-0
lines changed

python/ray/autoscaler/autoscaler.py

+3
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import logging
1010
import math
1111
import os
12+
from six import string_types
1213
from six.moves import queue
1314
import subprocess
1415
import threading
@@ -633,6 +634,8 @@ def check_extraneous(config, schema):
633634
continue
634635
elif isinstance(v, type):
635636
if not isinstance(config[k], v):
637+
if v is str and isinstance(config[k], string_types):
638+
continue
636639
raise ValueError(
637640
"Config key `{}` has wrong type {}, expected {}".format(
638641
k,
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
# IMPALA stress-test config: four Atari environments trained in parallel.
# Runs on a g3.16xl head node with 5 m5.24xl workers.
# Takes roughly 10 minutes.
atari-impala:
    env:
        grid_search:
            - BreakoutNoFrameskip-v4
            - BeamRiderNoFrameskip-v4
            - QbertNoFrameskip-v4
            - SpaceInvadersNoFrameskip-v4
    run: IMPALA
    stop:
        timesteps_total: 3000000
    config:
        sample_batch_size: 50
        train_batch_size: 500
        num_workers: 128
        num_envs_per_worker: 5
        clip_rewards: True
        # Linear anneal from 5e-4 to ~0 over 20M steps (beyond the 3M stop,
        # so the effective schedule within a run stays near the start value).
        lr_schedule: [
            [0, 0.0005],
            [20000000, 0.000000000001],
        ]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
# A unique identifier for the head node and workers of this cluster.
cluster_name: <<<CLUSTER_NAME>>>

# The minimum number of worker nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: <<<MIN_WORKERS>>>

# The maximum number of worker nodes to launch in addition to the head
# node. This takes precedence over min_workers.
max_workers: <<<MAX_WORKERS>>>

# This executes all commands on all nodes in the docker container,
# and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled.
docker:
    image: ""  # e.g., tensorflow/tensorflow:1.5.0-py3
    container_name: ""  # e.g. ray_docker

# The autoscaler will scale up the cluster to this target fraction of resource
# usage. For example, if a cluster of 10 nodes is 100% busy and
# target_utilization is 0.8, it would resize the cluster to 13. This fraction
# can be decreased to increase the aggressiveness of upscaling.
# This value must be less than 1.0 for scaling to happen.
target_utilization_fraction: 0.8

# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5

# Cloud-provider specific configuration.
provider:
    type: aws
    region: us-west-2
    # Availability zone(s), comma-separated, that nodes may be launched in.
    # Nodes are currently spread between zones by a round-robin approach,
    # however this implementation detail should not be relied upon.
    availability_zone: us-west-2a,us-west-2b

# How Ray will authenticate with newly launched nodes.
auth:
    ssh_user: ubuntu
    # By default Ray creates a new private keypair, but you can also use your own.
    # If you do so, make sure to also set "KeyName" in the head and worker node
    # configurations below.
    # ssh_private_key: /path/to/your/key.pem

# Provider-specific config for the head node, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
head_node:
    InstanceType: <<<HEAD_TYPE>>>
    ImageId: ami-0d0ff0945ae093aea  # Amazon Deep Learning AMI (Ubuntu) 12/12/2018

    # You can provision additional disk space with a conf as follows
    BlockDeviceMappings:
        - DeviceName: /dev/sda1
          Ebs:
              VolumeSize: 100

    # Additional options in the boto docs.

# Provider-specific config for worker nodes, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
worker_nodes:
    InstanceType: <<<WORKER_TYPE>>>
    ImageId: ami-0d0ff0945ae093aea  # Amazon Deep Learning AMI (Ubuntu) 12/12/2018

    # Run workers on spot by default. Comment this out to use on-demand.
    InstanceMarketOptions:
        MarketType: spot
        # Additional options can be found in the boto docs, e.g.
        #   SpotOptions:
        #       MaxPrice: MAX_HOURLY_PRICE

    # Additional options in the boto docs.

# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
file_mounts: {
    # "/path1/on/remote/machine": "/path1/on/local/machine",
    # "/path2/on/remote/machine": "/path2/on/local/machine",
}

# List of shell commands to run to set up nodes.
setup_commands:
    - echo 'export PATH="$HOME/anaconda3/envs/tensorflow_<<<PYTHON_VERSION>>>/bin:$PATH"' >> ~/.bashrc
    - ray || wget https://s3-us-west-2.amazonaws.com/ray-wheels/latest/<<<RAY_VERSION>>>-<<<WHEEL_STR>>>-manylinux1_x86_64.whl
    - rllib || pip install -U <<<RAY_VERSION>>>-<<<WHEEL_STR>>>-manylinux1_x86_64.whl[rllib]
    - pip install -U tensorflow-gpu
    # Consider uncommenting these if you also want to run apt-get commands during setup
    # - sudo pkill -9 apt-get || true
    # - sudo pkill -9 dpkg || true
    # - sudo dpkg --configure -a

# Custom commands that will be run on the head node after common setup.
head_setup_commands:
    - pip install boto3==1.4.8  # 1.4.8 adds InstanceMarketOptions

# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []

# Command to start ray on the head node. You don't need to change this.
head_start_ray_commands:
    - ray stop
    - ulimit -n 65536; ray start --head --redis-port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml

# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
    - ray stop
    - ulimit -n 65536; ray start --redis-address=$RAY_HEAD_IP:6379 --object-manager-port=8076
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
#!/usr/bin/env bash
# This script runs all of the application tests.
# Currently includes an IMPALA stress test and a SGD stress test,
# on both Python 2.7 and 3.6.
# All tests use a separate cluster, and each cluster
# will be destroyed upon test completion (or failure).

# Note that if the environment variable DEBUG_MODE is detected,
# the clusters will not be automatically shut down after the test runs.

# This script will exit with code 1 if the test did not run successfully.

# Directory containing this script; all cluster YAMLs and logs live here.
ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd)
# Latest release tag is used to pick the matching wheel on S3.
RAY_VERSION=$(git describe --tags --abbrev=0)
# Timestamped log collecting one PASS/FAIL line per test run.
RESULT_FILE="$ROOT_DIR/results-$(date '+%Y-%m-%d_%H-%M-%S').log"

echo "Testing on latest version of Ray: $RAY_VERSION"
# Quote expansions so paths containing spaces don't word-split.
echo "Logging to $RESULT_FILE"
touch "$RESULT_FILE"
# This function identifies the right string for the Ray wheel.
# Arguments: $1 - python version tag ("p27" selects the CPython 2.7
#                 wide-unicode wheel; anything else selects 3.6)
# Outputs:   the wheel ABI substring on stdout
_find_wheel_str(){
    local python_version=$1
    local wheel_str=""
    # [[ ]] instead of [ ]: the original unquoted `[ $python_version == ... ]`
    # is a syntax error when the argument is empty, and `==` inside `[` is
    # a non-portable bashism.
    if [[ "$python_version" == "p27" ]]; then
        wheel_str="cp27-cp27mu"
    else
        wheel_str="cp36-cp36m"
    fi
    echo "$wheel_str"
}
34+
# Runs the IMPALA stress test on a freshly launched cluster for the given
# Python version tag ($1), appending a PASS/FAIL line to $RESULT_FILE.
# Total time is roughly 25 minutes.
# Actual test runtime is roughly 10 minutes.
test_impala(){
    local PYTHON_VERSION=$1
    # Declare and assign separately so the helper's exit status isn't
    # masked by `local`.
    local WHEEL_STR
    WHEEL_STR=$(_find_wheel_str "$PYTHON_VERSION")

    pushd "$ROOT_DIR"
    local TEST_NAME="rllib_impala_$PYTHON_VERSION"
    local CLUSTER="$TEST_NAME.yaml"
    echo "Creating IMPALA cluster YAML from template."

    # sed reads the template directly (no useless `cat |`); all expansions
    # are quoted where they name files.
    sed -e "
        s/<<<CLUSTER_NAME>>>/$TEST_NAME/;
        s/<<<RAY_VERSION>>>/$RAY_VERSION/;
        s/<<<HEAD_TYPE>>>/g3.16xlarge/;
        s/<<<WORKER_TYPE>>>/m5.24xlarge/;
        s/<<<MIN_WORKERS>>>/5/;
        s/<<<MAX_WORKERS>>>/5/;
        s/<<<PYTHON_VERSION>>>/$PYTHON_VERSION/;
        s/<<<WHEEL_STR>>>/$WHEEL_STR/;" \
        application_cluster_template.yaml > "$CLUSTER"

    echo "Try running IMPALA stress test."
    {
        RLLIB_DIR=../../python/ray/rllib/
        ray up -y "$CLUSTER" &&
        ray rsync_up "$CLUSTER" "$RLLIB_DIR/tuned_examples/" tuned_examples/ &&
        sleep 1 &&
        ray exec "$CLUSTER" "
rllib train -f tuned_examples/atari-impala-large.yaml --redis-address='localhost:6379' --queue-trials" &&
        echo "PASS: IMPALA Test for $PYTHON_VERSION" >> "$RESULT_FILE"
    } || echo "FAIL: IMPALA Test for $PYTHON_VERSION" >> "$RESULT_FILE"

    # Tear down cluster unless DEBUG_MODE asks to keep it for inspection.
    if [ "$DEBUG_MODE" = "" ]; then
        ray down -y "$CLUSTER"
        rm "$CLUSTER"
    else
        echo "Not tearing down cluster $CLUSTER"
    fi
    popd
}
77+
# Runs the distributed SGD (MNIST) stress test on a freshly launched cluster
# for the given Python version tag ($1), appending PASS/FAIL to $RESULT_FILE.
# Total runtime is about 20 minutes (if the AWS spot instance order is fulfilled).
# Actual test runtime is roughly 10 minutes.
test_sgd(){
    local PYTHON_VERSION=$1
    # Declare and assign separately so the helper's exit status isn't
    # masked by `local`.
    local WHEEL_STR
    WHEEL_STR=$(_find_wheel_str "$PYTHON_VERSION")

    pushd "$ROOT_DIR"
    local TEST_NAME="sgd_$PYTHON_VERSION"
    local CLUSTER="$TEST_NAME.yaml"
    echo "Creating SGD cluster YAML from template."

    # sed reads the template directly (no useless `cat |`); all expansions
    # are quoted where they name files.
    sed -e "
        s/<<<CLUSTER_NAME>>>/$TEST_NAME/;
        s/<<<RAY_VERSION>>>/$RAY_VERSION/;
        s/<<<HEAD_TYPE>>>/g3.16xlarge/;
        s/<<<WORKER_TYPE>>>/g3.16xlarge/;
        s/<<<MIN_WORKERS>>>/3/;
        s/<<<MAX_WORKERS>>>/3/;
        s/<<<PYTHON_VERSION>>>/$PYTHON_VERSION/;
        s/<<<WHEEL_STR>>>/$WHEEL_STR/;" \
        application_cluster_template.yaml > "$CLUSTER"

    echo "Try running SGD stress test."
    {
        SGD_DIR=$ROOT_DIR/../../python/ray/experimental/sgd/
        ray up -y "$CLUSTER" &&
        # TODO: fix submit so that args work
        ray rsync_up "$CLUSTER" "$SGD_DIR/mnist_example.py" mnist_example.py &&
        sleep 1 &&
        ray exec "$CLUSTER" "
python mnist_example.py --redis-address=localhost:6379 --num-iters=2000 --num-workers=8 --devices-per-worker=2 --gpu" &&
        echo "PASS: SGD Test for $PYTHON_VERSION" >> "$RESULT_FILE"
    } || echo "FAIL: SGD Test for $PYTHON_VERSION" >> "$RESULT_FILE"

    # Tear down cluster unless DEBUG_MODE asks to keep it for inspection.
    if [ "$DEBUG_MODE" = "" ]; then
        ray down -y "$CLUSTER"
        rm "$CLUSTER"
    else
        echo "Not tearing down cluster $CLUSTER"
    fi
    popd
}
121+
# RUN TESTS
# Each test appends one PASS/FAIL line to $RESULT_FILE.
for PYTHON_VERSION in "p27" "p36"; do
    test_impala "$PYTHON_VERSION"
    test_sgd "$PYTHON_VERSION"
done

cat "$RESULT_FILE"
# Collect FAIL lines into test.log (grep reads the file directly — no
# useless cat); a non-empty test.log means at least one test failed.
grep FAIL "$RESULT_FILE" > test.log
[ ! -s test.log ] || exit 1

test/stress_tests/run_stress_tests.sh

+1
Original file line numberDiff line numberDiff line change
@@ -33,3 +33,4 @@ pushd "$ROOT_DIR"
3333
popd
3434

3535
cat $RESULT_FILE
36+
# NOTE(review): this exits non-zero whenever the results file is non-empty.
# If PASS lines are also written to $RESULT_FILE (as in
# run_application_stress_tests.sh), successful runs would be reported as
# failures; that script greps for FAIL lines first — TODO confirm what this
# file's RESULT_FILE actually contains.
[ ! -s $RESULT_FILE ] || exit 1

0 commit comments

Comments
 (0)