from gridworld import GridWorld1
import gridrender as gui
import numpy as np
import time
import matplotlib.pyplot as plt
env = GridWorld1
################################################################################
# investigate the structure of the environment
# - env.n_states: the number of states
# - env.state2coord: converts state number to coordinates (row, col)
# - env.coord2state: converts coordinates (row, col) into state number
# - env.action_names: converts action number [0,3] into a named action
# - env.state_actions: for each state, stores the available actions
# For example
# print(env.state_actions[4]) -> [1,3]
# print(env.action_names[env.state_actions[4]]) -> ['down' 'up']
# - env.gamma: discount factor
################################################################################
print(env.state2coord)
print(env.coord2state)
print(env.state_actions[4])
print(env.action_names[env.state_actions[4]])
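# Illustrative sketch (assuming state2coord and coord2state are exact
# inverses, as the comments above suggest): round-trip every state.
for s in range(env.n_states):
    row, col = env.state2coord[s]
    assert env.coord2state[row][col] == s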
################################################################################
# Policy definition
# If you want to represent a deterministic policy you can just use the number
# of the action chosen in each state. Recall that in the terminal states only
# action 0 (right) is defined.
# In this case, you can use gui.render_policy to visualize the policy.
################################################################################
pol = [1, 2, 0, 0, 1, 1, 0, 0, 0, 0, 3]
gui.render_policy(env, pol)
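# Sketch: print the policy as named actions (assuming each pol[s] is a valid
# action number for state s, as required above).
print([env.action_names[a] for a in pol])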
################################################################################
# Try to simulate a trajectory.
# You can use env.step(s, a, render=True) to visualize the transition.
################################################################################
env.render = True
state = 0
fps = 1
for i in range(5):
    # pick a random action among those available in the current state
    action = np.random.choice(env.state_actions[state])
    nexts, reward, term = env.step(state, action)
    state = nexts
    time.sleep(1. / fps)
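# Sketch: accumulate the discounted return of one simulated trajectory, using
# the documented discount factor env.gamma (rendering disabled for speed).
env.render = False
state, ret, term, t = 0, 0.0, False, 0
while not term and t < 50:
    action = np.random.choice(env.state_actions[state])
    state, reward, term = env.step(state, action)
    ret += env.gamma ** t * reward
    t += 1
print('discounted return from state 0:', ret)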
################################################################################
# You can also visualize the q-function using render_q
################################################################################
# first get the maximum number of actions available
max_act = max(map(len, env.state_actions))
q = np.random.rand(env.n_states, max_act)
gui.render_q(env, q)
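# Sketch: extract the greedy policy w.r.t. q, restricted to the actions that
# are actually available in each state (columns of q beyond
# len(env.state_actions[s]) are padding and must be ignored).
greedy_pol = [env.state_actions[s][int(np.argmax(q[s, :len(env.state_actions[s])]))]
              for s in range(env.n_states)]
gui.render_policy(env, greedy_pol)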
################################################################################
# Work to do: Q4
################################################################################
# here are the v-function and q-function to be used for question 4
v_q4 = [0.87691855, 0.92820033, 0.98817903, 0.00000000, 0.67106071,
        -0.99447514, 0.00000000, -0.82847001, -0.87691855, -0.93358351,
        -0.99447514]
q_q4 = [[0.87691855, 0.65706417],
        [0.92820033, 0.84364237],
        [0.98817903, -0.75639924, 0.89361129],
        [0.00000000],
        [-0.62503460, 0.67106071],
        [-0.99447514, -0.70433689, 0.75620264],
        [0.00000000],
        [-0.82847001, 0.49505225],
        [-0.87691855, -0.79703229],
        [-0.93358351, -0.84424050, -0.93896668],
        [-0.89268904, -0.99447514]]
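# Sanity sketch (assuming v_q4 and q_q4 use the same state ordering): each
# state value should appear among that state's q-values, since v(s) is the
# q-value of the action chosen by the evaluated policy in s.
for s in range(len(v_q4)):
    assert any(abs(v_q4[s] - qa) < 1e-6 for qa in q_q4[s])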
################################################################################
# Work to do: Q5
################################################################################
# here is the optimal v-function for question 5
v_opt = [0.87691855, 0.92820033, 0.98817903, 0.00000000, 0.82369294,
         0.92820033, 0.00000000, 0.77818504, 0.82369294, 0.87691855,
         0.82847001]
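# One possible model-free route to the optimal values of Q5 (an illustrative
# sketch, not necessarily the intended solution): tabular Q-learning with
# epsilon-greedy exploration, using only env.step and env.gamma.
q_learn = np.zeros((env.n_states, max_act))
counts = np.zeros((env.n_states, max_act))
eps = 0.1
env.render = False
for episode in range(2000):
    state, term, t = 0, False, 0
    while not term and t < 100:
        acts = env.state_actions[state]
        if np.random.rand() < eps:
            a_idx = np.random.randint(len(acts))                # explore
        else:
            a_idx = int(np.argmax(q_learn[state, :len(acts)]))  # exploit
        nexts, reward, term = env.step(state, acts[a_idx])
        counts[state, a_idx] += 1
        alpha = 1.0 / counts[state, a_idx]                      # decaying learning rate
        target = reward
        if not term:
            target += env.gamma * q_learn[nexts, :len(env.state_actions[nexts])].max()
        q_learn[state, a_idx] += alpha * (target - q_learn[state, a_idx])
        state, t = nexts, t + 1
v_est = [q_learn[s, :len(env.state_actions[s])].max() for s in range(env.n_states)]
print('estimated optimal values:', v_est)  # compare against v_opt above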
################################################################################
# ------------------ Question N°4 ------------------ #
env.render = False
J = []
Tmax = 1000
gamma = 0.95
for N in range(1, 100):
    # Monte Carlo policy evaluation: from every initial state, run N episodes
    # under the fixed policy "take action 0 (right) when available, otherwise
    # action 3 (up)" and average the discounted returns.
    V = np.zeros(env.n_states)
    for n in range(N):
        for initial_state in range(env.n_states):
            state = initial_state
            for t in range(Tmax):
                if 0 in env.state_actions[state]:
                    action = 0
                else:
                    action = 3
                nexts, reward, term = env.step(state, action)
                V[initial_state] += gamma ** t * reward
                state = nexts
                if term:
                    break
    V /= N
    # mean gap between the Monte Carlo estimate and the exact values v_q4
    J.append((np.sum(V) - np.sum(v_q4)) / env.n_states)
    print('V estimation : ', V)
# --------------------------------------------------- #
plt.plot(J)
plt.xlabel('N (episodes per initial state)')
plt.ylabel('mean gap to v_q4')
plt.show()
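# Note (sketch): the signed mean gap stored in J can cancel across states; a
# mean absolute error is often more informative, e.g.
#   J.append(np.mean(np.abs(V - np.array(v_q4))))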