
Commit 29bd213

Merge pull request #118 from kengz/policy-gradient
ActorCritic and DDPG
2 parents 80eff0e + 0b50a69 commit 29bd213

16 files changed: +899 -309 lines

rl/agent/actor_critic.py (+139 lines)
@@ -0,0 +1,139 @@
import numpy as np
from rl.agent.dqn import DQN
from rl.util import logger


class ActorCritic(DQN):

    '''
    Actor-Critic algorithm. The actor's policy
    is adjusted in the direction that will lead to
    better actions, guided by the critic.
    Implementation adapted from
    http://www.rage.net/~greg/2016-07-05-ActorCritic-with-OpenAI-Gym.html

    Assumes one of the policies in actor_critic.py is being used.
    '''

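    # Overview of the methods below: build_model creates two separate Keras
    # networks that reuse DQN's hidden-layer construction (an actor with one
    # output per action dimension, and a critic with a single value output).
    # train_an_epoch samples a minibatch from memory, trains the critic
    # first, then trains the actor with the delta returned by the critic step.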

    def __init__(self, env_spec,
                 train_per_n_new_exp=1,
                 gamma=0.95, lr=0.1,
                 epi_change_lr=None,
                 batch_size=16, n_epoch=5, hidden_layers=None,
                 hidden_layers_activation='sigmoid',
                 output_layer_activation='linear',
                 auto_architecture=False,
                 num_hidden_layers=3,
                 first_hidden_layer_size=256,
                 num_initial_channels=16,
                 **kwargs):  # absorb generic param without breaking
        # import only when needed to contain side-effects
        from keras.layers.core import Dense
        from keras.models import Sequential, load_model
        self.Dense = Dense
        self.Sequential = Sequential
        self.load_model = load_model

        super(ActorCritic, self).__init__(env_spec,
                                          train_per_n_new_exp,
                                          gamma, lr,
                                          epi_change_lr,
                                          batch_size, n_epoch, hidden_layers,
                                          hidden_layers_activation,
                                          output_layer_activation,
                                          auto_architecture,
                                          num_hidden_layers,
                                          first_hidden_layer_size,
                                          num_initial_channels,
                                          **kwargs)

    def build_model(self):
        self.build_actor()
        self.build_critic()
        logger.info("Actor and critic models built")

    def build_actor(self):
        actor = self.Sequential()
        super(ActorCritic, self).build_hidden_layers(actor)
        actor.add(self.Dense(self.env_spec['action_dim'],
                             init='lecun_uniform',
                             activation=self.output_layer_activation))
        logger.info("Actor summary")
        actor.summary()
        self.actor = actor

    def build_critic(self):
        critic = self.Sequential()
        super(ActorCritic, self).build_hidden_layers(critic)
        critic.add(self.Dense(1,
                              init='lecun_uniform',
                              activation=self.output_layer_activation))
        logger.info("Critic summary")
        critic.summary()
        self.critic = critic

    def compile_model(self):
        self.actor.compile(
            loss='mse',
            optimizer=self.optimizer.keras_optimizer)
        self.critic.compile(
            loss='mse',
            optimizer=self.optimizer.keras_optimizer)
        logger.info("Actor and critic compiled")

    def recompile_model(self, sys_vars):
        '''
        Option to change model optimizer settings
        Currently only used for changing the learning rate
        Compiling does not affect the model weights
        '''
        if self.epi_change_lr is not None:
            if (sys_vars['epi'] == self.epi_change_lr and
                    sys_vars['t'] == 0):
                self.lr = self.lr / 10.0
                self.optimizer.change_optim_param(**{'lr': self.lr})
                self.actor.compile(
                    loss='mse',
                    optimizer=self.optimizer.keras_optimizer)
                self.critic.compile(
                    loss='mse',
                    optimizer=self.optimizer.keras_optimizer)
                logger.info(
                    'Actor and critic models recompiled with new settings: '
                    'Learning rate: {}'.format(self.lr))

    def train_critic(self, minibatch):
        Q_vals = np.clip(self.critic.predict(minibatch['states']),
                         -self.clip_val, self.clip_val)
        Q_next_vals = np.clip(self.critic.predict(minibatch['next_states']),
                              -self.clip_val, self.clip_val)
        Q_targets = minibatch['rewards'] + self.gamma * \
            (1 - minibatch['terminals']) * Q_next_vals.squeeze()
        Q_targets = np.expand_dims(Q_targets, axis=1)

        actor_delta = Q_next_vals - Q_vals
        loss = self.critic.train_on_batch(minibatch['states'], Q_targets)

        errors = abs(np.sum(Q_vals - Q_targets, axis=1))
        self.memory.update(errors)
        return loss, actor_delta
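
    # How the two training steps fit together: train_critic above regresses
    # the critic toward the one-step TD target
    #     Q_target(s) = r + gamma * (1 - terminal) * Q(s')
    # and returns actor_delta = Q(s') - Q(s), which train_actor below uses
    # as the learning signal for adjusting the actor's outputs.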

    def train_actor(self, minibatch, actor_delta):
        old_vals = self.actor.predict(minibatch['states'])
        if self.env_spec['actions'] == 'continuous':
            A_targets = np.zeros(
                (actor_delta.shape[0], self.env_spec['action_dim']))
            for j in range(A_targets.shape[1]):
                A_targets[:, j] = actor_delta.squeeze()
        else:
            A_targets = minibatch['actions'] * actor_delta + \
                (1 - minibatch['actions']) * old_vals

        loss = self.actor.train_on_batch(minibatch['states'], A_targets)
        return loss

    def train_an_epoch(self):
        minibatch = self.memory.rand_minibatch(self.batch_size)
        critic_loss, actor_delta = self.train_critic(minibatch)
        actor_loss = self.train_actor(minibatch, actor_delta)
        return critic_loss + actor_loss
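
As a quick sanity check on the arithmetic in train_critic and train_actor, the standalone snippet below reproduces the target construction on a toy two-transition batch with one-hot discrete actions. The numbers, shapes, and the one-hot encoding are illustrative assumptions only; nothing from the rl framework is imported.

import numpy as np

# Toy batch of two transitions (illustrative values only)
actions = np.array([[1.0, 0.0], [0.0, 1.0]])    # assumed one-hot encoding
rewards = np.array([1.0, 0.0])
terminals = np.array([0.0, 1.0])
Q_vals = np.array([[0.5], [0.2]])               # stands in for critic.predict(states)
Q_next_vals = np.array([[0.8], [0.1]])          # stands in for critic.predict(next_states)
old_vals = np.array([[0.4, 0.6], [0.7, 0.3]])   # stands in for actor.predict(states)
gamma = 0.95

# Critic target, as in train_critic: r + gamma * (1 - terminal) * Q(s')
Q_targets = rewards + gamma * (1 - terminals) * Q_next_vals.squeeze()
Q_targets = np.expand_dims(Q_targets, axis=1)   # [[1.76], [0.0]]

# Signal handed from train_critic to train_actor
actor_delta = Q_next_vals - Q_vals              # [[0.3], [-0.1]]

# Discrete branch of train_actor: move the chosen action's output by
# actor_delta, leave the other outputs at the actor's current predictions
A_targets = actions * actor_delta + (1 - actions) * old_vals
# A_targets == [[0.3, 0.6], [0.7, -0.1]]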
