import numpy as np
from rl.agent.dqn import DQN
from rl.util import logger


class ActorCritic(DQN):

    '''
    Actor-Critic algorithm. The actor's policy
    is adjusted in the direction that will lead to
    better actions, guided by the critic.
    Implementation adapted from
    http://www.rage.net/~greg/2016-07-05-ActorCritic-with-OpenAI-Gym.html

    Assumes one of the policies in actor_critic.py is being used
    '''
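    # Hypothetical construction sketch (not from the source); it assumes the
    # surrounding framework supplies an env_spec dict with at least the
    # 'action_dim' and 'actions' keys used below, e.g.:
    #   agent = ActorCritic(env_spec, gamma=0.99, hidden_layers=[64, 32])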

    def __init__(self, env_spec,
                 train_per_n_new_exp=1,
                 gamma=0.95, lr=0.1,
                 epi_change_lr=None,
                 batch_size=16, n_epoch=5, hidden_layers=None,
                 hidden_layers_activation='sigmoid',
                 output_layer_activation='linear',
                 auto_architecture=False,
                 num_hidden_layers=3,
                 first_hidden_layer_size=256,
                 num_initial_channels=16,
                 **kwargs):  # absorb generic param without breaking
        # import only when needed to contain side-effects
        from keras.layers.core import Dense
        from keras.models import Sequential, load_model
        self.Dense = Dense
        self.Sequential = Sequential
        self.load_model = load_model

        super(ActorCritic, self).__init__(env_spec,
                                          train_per_n_new_exp,
                                          gamma, lr,
                                          epi_change_lr,
                                          batch_size, n_epoch, hidden_layers,
                                          hidden_layers_activation,
                                          output_layer_activation,
                                          auto_architecture,
                                          num_hidden_layers,
                                          first_hidden_layer_size,
                                          num_initial_channels,
                                          **kwargs)

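    # Overrides DQN's build_model: constructs two separate networks, an actor
    # that outputs a value per action and a critic that outputs a single
    # value estimate for the input state.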
    def build_model(self):
        self.build_actor()
        self.build_critic()
        logger.info("Actor and critic models built")

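    # Actor network: hidden layers built by the parent DQN helper, plus an
    # output layer with one unit per action (env_spec['action_dim']).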
    def build_actor(self):
        actor = self.Sequential()
        super(ActorCritic, self).build_hidden_layers(actor)
        actor.add(self.Dense(self.env_spec['action_dim'],
                             init='lecun_uniform',
                             activation=self.output_layer_activation))
        logger.info("Actor summary")
        actor.summary()
        self.actor = actor

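    # Critic network: same hidden-layer construction, with a single output
    # unit estimating the value of the input state.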
    def build_critic(self):
        critic = self.Sequential()
        super(ActorCritic, self).build_hidden_layers(critic)
        critic.add(self.Dense(1,
                              init='lecun_uniform',
                              activation=self.output_layer_activation))
        logger.info("Critic summary")
        critic.summary()
        self.critic = critic

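    # Both networks use a mean-squared-error loss and the optimizer held in
    # self.optimizer (set up by the parent DQN class).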
    def compile_model(self):
        self.actor.compile(
            loss='mse',
            optimizer=self.optimizer.keras_optimizer)
        self.critic.compile(
            loss='mse',
            optimizer=self.optimizer.keras_optimizer)
        logger.info("Actor and critic compiled")

    def recompile_model(self, sys_vars):
        '''
        Option to change model optimizer settings
        Currently only used for changing the learning rate
        Compiling does not affect the model weights
        '''
        if self.epi_change_lr is not None:
            if (sys_vars['epi'] == self.epi_change_lr and
                    sys_vars['t'] == 0):
                self.lr = self.lr / 10.0
                self.optimizer.change_optim_param(**{'lr': self.lr})
                self.actor.compile(
                    loss='mse',
                    optimizer=self.optimizer.keras_optimizer)
                self.critic.compile(
                    loss='mse',
                    optimizer=self.optimizer.keras_optimizer)
                logger.info(
                    'Actor and critic models recompiled with new settings: '
                    'Learning rate: {}'.format(self.lr))

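    # Critic update: value predictions for current and next states are
    # clipped to +/- self.clip_val, one-step TD targets are formed as
    #   Q_target = r + gamma * (1 - terminal) * Q(s'),
    # and the critic is fit to them with a single batch update. The
    # difference Q(s') - Q(s) is returned as actor_delta, the learning signal
    # for the actor, and the absolute errors are passed to
    # self.memory.update() (presumably for prioritized replay).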
    def train_critic(self, minibatch):
        Q_vals = np.clip(self.critic.predict(minibatch['states']),
                         -self.clip_val, self.clip_val)
        Q_next_vals = np.clip(self.critic.predict(minibatch['next_states']),
                              -self.clip_val, self.clip_val)
        Q_targets = minibatch['rewards'] + self.gamma * \
            (1 - minibatch['terminals']) * Q_next_vals.squeeze()
        Q_targets = np.expand_dims(Q_targets, axis=1)

        actor_delta = Q_next_vals - Q_vals
        loss = self.critic.train_on_batch(minibatch['states'], Q_targets)

        errors = abs(np.sum(Q_vals - Q_targets, axis=1))
        self.memory.update(errors)
        return loss, actor_delta

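    # Actor update: for a discrete action space the target keeps the actor's
    # old outputs for actions that were not taken and sets the taken action's
    # target to actor_delta (minibatch['actions'] is assumed to be one-hot);
    # for a continuous action space actor_delta is broadcast across every
    # action dimension.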
    def train_actor(self, minibatch, actor_delta):
        old_vals = self.actor.predict(minibatch['states'])
        if self.env_spec['actions'] == 'continuous':
            A_targets = np.zeros(
                (actor_delta.shape[0], self.env_spec['action_dim']))
            for j in range(A_targets.shape[1]):
                A_targets[:, j] = actor_delta.squeeze()
        else:
            A_targets = minibatch['actions'] * actor_delta + \
                (1 - minibatch['actions']) * old_vals

        loss = self.actor.train_on_batch(minibatch['states'], A_targets)
        return loss

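    # One training epoch: sample a random minibatch from memory, update the
    # critic (which also yields actor_delta), then the actor, and return the
    # summed losses.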
    def train_an_epoch(self):
        minibatch = self.memory.rand_minibatch(self.batch_size)
        critic_loss, actor_delta = self.train_critic(minibatch)
        actor_loss = self.train_actor(minibatch, actor_delta)
        return critic_loss + actor_loss