# train.py
import torch
import torch.nn as nn
from collections import deque
import itertools
from agent import Agent
from env import OthelloEnv, TURN_BLACK
from torch.utils.tensorboard import SummaryWriter

# Hyperparameters
GAMMA = 0.99
BATCH_SIZE = 128
BUFFER_SIZE = 15_000
MIN_REPLAY_SIZE = 5_000
EPSILON_START = 0.9
EPSILON_END = 0.02
EPSILON_DECAY = 1000
SAVE_FREQUENCY = 10_000
LR = 1e-4
TAU = 0.009
MOMENTUM = 0.15  # only used in the TensorBoard run name below; AdamW takes no momentum argument
GAME_PLAY_ITERATION = 100_000
HIDDEN_SIZE = 256
CLIP_GRAD = 200
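
# The (EPSILON_DECAY, EPSILON_START, EPSILON_END) tuple presumably parameterizes
# Agent's epsilon-greedy schedule. A minimal sketch for reference, assuming the
# usual exponential decay; the helper name is ours, and the real schedule lives
# in agent.py, not here:
import math

def epsilon_by_step(step: int) -> float:
    """Illustrative only: exponential decay from EPSILON_START to EPSILON_END."""
    return EPSILON_END + (EPSILON_START - EPSILON_END) * math.exp(-step / EPSILON_DECAY)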
# Run name encodes the hyperparameters; the optimizer below is AdamW.
writer = SummaryWriter(f'runs/agent-H3-{HIDDEN_SIZE}-AdamW-CLIP-{CLIP_GRAD}-LR-{LR:0.4f}-TAU-{TAU:0.3f}-MO-{MOMENTUM:0.2f}')
reward_buffer = deque([0.0], maxlen=1000)
games_played = 0
wins = 0
draws = 0
agent_total_moves = 0
agent_correct_moves = 0
env = OthelloEnv(TURN_BLACK)
agent = Agent(env,
              batch_size=BATCH_SIZE,
              gamma=GAMMA,
              buffer_size=BUFFER_SIZE,
              epsilon_params=(EPSILON_DECAY, EPSILON_START, EPSILON_END),
              save_frequency=SAVE_FREQUENCY,
              tau=TAU,
              hidden_size=HIDDEN_SIZE)
writer.add_graph(agent.policy_net, torch.zeros(env.observation_space.shape))
writer.flush()
optimizer = torch.optim.AdamW(agent.policy_net.parameters(), lr=LR, amsgrad=True)
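
# Warm-up: pre-fill the replay buffer with MIN_REPLAY_SIZE transitions
# (presumably from random/exploratory play) so the first minibatches are not
# drawn from a near-empty buffer.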
agent.initialize_buffer(MIN_REPLAY_SIZE)
observation = env.reset()
for step in itertools.count():
    # Epsilon-greedy action; ok_move flags whether the chosen move was valid.
    action, ok_move = agent.act(observation, step)
    agent_total_moves += 1
    if ok_move:
        agent_correct_moves += 1
    _, new_observation, reward, done = env.step(action, ok_move)
    transition = (observation, action, reward, done, new_observation)
    agent.add_transition(transition)
    observation = new_observation
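
    # calculate_quality_values presumably samples a minibatch from the replay
    # buffer and returns Q(s, a) from the policy net together with the
    # bootstrapped target, roughly r + GAMMA * max_a' Q_target(s', a') for
    # non-terminal transitions (an assumption; the exact computation lives in
    # agent.py).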
    policy_values, expected_values = agent.calculate_quality_values()
    loss = nn.functional.smooth_l1_loss(policy_values, expected_values.unsqueeze(1))

    # Gradient descent step with gradient-value clipping to keep updates stable.
    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_value_(agent.policy_net.parameters(), CLIP_GRAD)
    optimizer.step()

    # Update the target network.
    agent.update_nets(step)
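
    # update_nets is assumed to perform the standard Polyak/soft update with
    # TAU, something like this sketch (the actual implementation, and the
    # target_net/policy_net names, belong to agent.py):
    #
    #     for t_p, p_p in zip(target_net.parameters(), policy_net.parameters()):
    #         t_p.data.copy_(TAU * p_p.data + (1 - TAU) * t_p.data)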
    if done:
        score = env.agent_score()
        reward_buffer.append(score)
        if env.agent_win():
            wins += 1
        elif env.agent_draw():
            draws += 1
        observation = env.reset()
        games_played += 1
        win_ratio = 100 * wins // max(games_played, 1)
        win_draw_ratio = 100 * (wins + draws) // max(games_played, 1)
        correct_move_ratio = 100 * agent_correct_moves // max(agent_total_moves, 1)
        writer.add_scalar('win_ratio', win_ratio, games_played, double_precision=True, new_style=True)
        writer.add_scalar('win_draw_ratio', win_draw_ratio, games_played, double_precision=True, new_style=True)
        writer.add_scalar('score', score, games_played, new_style=True)
        writer.add_scalar('correct_move_ratio', correct_move_ratio, games_played, double_precision=True, new_style=True)
        if games_played == GAME_PLAY_ITERATION:
            break
    if step % 1000 == 0:
        win_ratio = 100 * wins // max(games_played, 1)
        win_draw_ratio = 100 * (wins + draws) // max(games_played, 1)
        correct_move_ratio = 100 * agent_correct_moves // max(agent_total_moves, 1)
        print(f"step: {step} correct_move_ratio: {correct_move_ratio}% "
              f"wins: {wins}/{games_played} win ratio: [{win_ratio}%:{win_draw_ratio}%]")

# Optional greedy-evaluation loop (left commented out): play with the trained
# policy and render the board, using the same 4-tuple env.step API as above.
# env = OthelloEnv(TURN_BLACK)
# observation = env.reset()
#
# while True:
#     observation_t = torch.as_tensor(observation, dtype=torch.float32)
#     action, ok_move = agent.policy_net.act(observation_t)
#     _, observation, reward, done = env.step(action, ok_move)
#     env.render()
#     if done:
#         observation = env.reset()
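
# Inspect the logged training curves with TensorBoard:
#     tensorboard --logdir runs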