raytune.py
# Hyperparameter Tuning Imports
# NOTE: activate the ray_env virtual environment before launching this script;
# the original `os.system("source ray_env/bin/activate")` call only ran in a
# throwaway subshell and had no effect on the current interpreter.
import os
import argparse
import numpy as np
import ray
import tensorflow as tf
from ray import air, tune
from ray.air.checkpoint import Checkpoint
from ray.tune import CLIReporter, Stopper
from ray.tune.schedulers import ASHAScheduler, HyperBandScheduler

from Agent.agent import PGAgent

# ray.init("auto")

class MyTrainableClass(tune.Trainable):
    def setup(self, config):
        self.config = config
        self.agent = PGAgent(config=config)
        self.episode_reward_history = []
        self.batch = 0
        self.max_batch = self.agent.max_episodes // self.agent.batch_size

    def step(self):
        # Calculate the current episodes
        episode = (self.batch + 1) * self.agent.batch_size
        # Increment the batch counter
        self.batch = self.batch + 1
        # Gather the trajectories
        episodes = self.agent.gather_trajectories()
        # Group states, actions and returns in numpy arrays
        # (make a single array of all states, actions and returns from all trajectories)
        states = np.concatenate([ep['states'] for ep in episodes])
        actions = np.concatenate([ep['actions'] for ep in episodes])
        rewards = [ep['rewards'] for ep in episodes]
        returns = np.concatenate([self.agent.compute_returns(ep_rwds) for ep_rwds in rewards])
        returns = np.array(returns, dtype=np.float32)
        id_action_pairs = np.array([[i, a] for i, a in enumerate(actions)])
        self.agent.reinforce(states, id_action_pairs, returns, episode)
        # Store collected rewards
        for ep_rwds in rewards:
            self.episode_reward_history.append(np.sum(ep_rwds))
        avg_rewards = np.mean(self.episode_reward_history[-10:])
        # with self.agent.writer.as_default():
        #     tf.summary.scalar("model/avg_rew", avg_rewards, step=episode)
        if avg_rewards >= self.agent.max_average:
            self.agent.Actor.save_weights(self.agent.agent_name + self.agent.env_name + ".h5")
        return {"mean_reward": avg_rewards, "episode": episode, "batch": self.batch}

    def reset_config(self, new_config):
        self.config.update(new_config)
        self.agent = PGAgent(config=new_config)
        return True

    def save_checkpoint(self, tmp_checkpoint_dir):
        checkpoint_path = os.path.join(tmp_checkpoint_dir, "model.h5")
        self.agent.Actor.save_weights(checkpoint_path)
        return tmp_checkpoint_dir

    def load_checkpoint(self, tmp_checkpoint_dir):
        checkpoint_path = os.path.join(tmp_checkpoint_dir, "model.h5")
        self.agent.Actor.load_weights(checkpoint_path)
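
# Note (not part of the original script): Ray Tune drives this class through the
# Trainable API: setup() runs once per trial, step() once per training iteration,
# and save_checkpoint()/load_checkpoint() around pauses and restores. A quick
# local smoke test outside of Tune could look like (hypothetical usage,
# uses the best_trial config defined in the __main__ block below):
#   trainable = MyTrainableClass(config=best_trial)
#   print(trainable.step()["mean_reward"])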

if __name__ == "__main__":
    # ray.init()
    # Search space
    best_trial = {
        "env_name": "CartPole-v1",  # Name of the environment
        "input_dim": 1,  # Input dimension of the environment (CartPole-v1 = 1)
        "agent_name": "PartitionedPQC Policy Gradient",  # Name of the agent
        # Hyperparameters for the training loop
        "max_episodes": 1200,  # Number of episodes to run
        "max_average": 500,  # Average score to reach to stop training
        "batch_size": 10,  # Number of trajectories to collect before each REINFORCE update
        # Hyperparameters for the dimensions of the PQC
        "n_partitions": 2,  # Number of partitions in the PQC
        "n_terms": 1,  # Number of terms to evaluate in the sum (run-time cost vs accuracy trade-off)
        "n_layers": 5,  # Depth of the PQC
        # "rescale_parameter": tune.grid_search([0.001, 0.1, 1]),  # Lambda rescale parameter search. WHAT SHOULD THIS BE?
        # Hyperparameters for the policy gradient algorithm
        "gamma": 1,  # Discount factor for the rewards
        "beta": 1,  # Regularization parameter for the PQC
        # Hyperparameters for the learning rates of the model
        "theta_lr": 0.002,  # Learning rate for the rotational gates (keep the same as the original paper)
        "zeta_lr": 0.0002,  # Learning rate for the zeta parameters
        "lambda_lr": 0.0002,  # Learning rate for the Adam optimizer (0.001 before)
        "x_l": False,  # Take the exponential of the lambda parameters? (this tends to make the model very unstable for some reason)
        "trainable_lambdas": True,
        "rescaling_scheme": "factoring",
        # Testing currently.
        "input_lr": 0.1,  # Learning rate for the input encodings (keep the same as the original paper)
        "obs_lr": 0.1,  # Learning rate for the Adam optimizer (keep the same as the original paper)
    }
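
    # Sketch (not part of the original script): any of the fixed values above could be
    # replaced with a Ray Tune search space to sweep over it instead, e.g.
    #   "n_layers": tune.grid_search([3, 5, 7]),
    #   "theta_lr": tune.loguniform(1e-4, 1e-2),
    # The commented-out Tuner further below would then sample and evaluate these choices.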

    # Experiment name
    exp_name = "reshuffled_inputs"

    # Define the logger
    reporter = CLIReporter(max_progress_rows=100)
    reporter.add_metric_column("mean_reward")
    reporter.add_metric_column("episode")

    class CustomStopper(Stopper):
        def __init__(self):
            self.should_stop = {}
            self.should_stop_all = False
            self.count_below_threshold = {}
            # Maps a mean-reward threshold to the episode after which it is enforced.
            self.stop_conditions = {
                500: float("inf")
            }

        def __call__(self, trial_id, result):
            if trial_id not in self.should_stop:
                self.should_stop[trial_id] = False
                self.count_below_threshold[trial_id] = {}
                for threshold in self.stop_conditions:
                    self.count_below_threshold[trial_id][threshold] = 0
            if not self.should_stop[trial_id]:
                # if result["mean_reward"] >= 500:
                #     self.should_stop[trial_id] = True
                if result["episode"] >= 1200:
                    self.should_stop[trial_id] = True
                for threshold, episode in self.stop_conditions.items():
                    if result["episode"] > episode:
                        if result["mean_reward"] < threshold:
                            self.count_below_threshold[trial_id][threshold] += 1
                        else:
                            self.count_below_threshold[trial_id][threshold] = 0
                        # Stop the trial after three consecutive results below the threshold.
                        if self.count_below_threshold[trial_id][threshold] >= 3:
                            self.should_stop[trial_id] = True
            return self.should_stop[trial_id]

        def stop_all(self):
            # Never stop the whole experiment from the stopper.
            return False

    stopper = CustomStopper()

    # Resources assigned to each trial.
    resource_group = tune.PlacementGroupFactory([{"CPU": 4, "CustomResource": 1}])
    trainable_with_resources = tune.with_resources(MyTrainableClass, resource_group)
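    # Note (assumption, not part of the original script): scheduling trials on
    # "CustomResource" only works if the Ray cluster actually advertises it, e.g.
    # when starting Ray locally:
    #   ray.init(resources={"CustomResource": 1})
    # Otherwise trials requesting it will stay pending.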

    # The tuner object
    # tuner = tune.Tuner(
    #     MyTrainableClass,
    #     run_config=air.RunConfig(
    #         name=exp_name,
    #         stop=stopper,
    #         progress_reporter=reporter,
    #         failure_config=air.FailureConfig(max_failures=2),
    #         verbose=1,
    #     ),
    #     tune_config=tune.TuneConfig(
    #         num_samples=10,
    #         mode="max",
    #         metric="mean_reward",
    #         max_concurrent_trials=1,
    #     ),
    #     param_space=best_trial,
    # )

    # Restore and resume the existing experiment rather than starting a new one.
    tuner = tune.Tuner.restore(
        "~/ray_results/reshuffled_inputs",
        MyTrainableClass,
        resume_errored=False,
        restart_errored=True,
    )
    results = tuner.fit()
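
    # Sketch (assumption, not part of the original script): inspect the resumed run
    # once fitting has finished, e.g.
    #   best = results.get_best_result(metric="mean_reward", mode="max")
    #   print(best.config)
    #   print(best.metrics["mean_reward"])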