qlearning.py
# Based on a square grid-world environment (4x4) with 4 obstacle cells (L6, L7, L10, L11)
import numpy as np
import random
gamma = 0.75 # Discount factor
alpha = 0.9 # Learning rate
episodes = 1000  # Number of training iterations (one sampled transition per iteration)
# Centre [x, y, z] coordinate of each grid cell (the third component is always 0)
location_to_state_coordination = {"L1": [1.5, 1.5, 0], "L2": [1.5, 0.5, 0], "L3": [1.5, -0.5, 0], "L4": [1.5, -1.5, 0],
                                  "L5": [0.5, 1.5, 0], "L6": [0.5, 0.5, 0], "L7": [0.5, -0.5, 0], "L8": [0.5, -1.5, 0],
                                  "L9": [-0.5, 1.5, 0], "L10": [-0.5, 0.5, 0], "L11": [-0.5, -0.5, 0], "L12": [-0.5, -1.5, 0],
                                  "L13": [-1.5, 1.5, 0], "L14": [-1.5, 0.5, 0], "L15": [-1.5, -0.5, 0], "L16": [-1.5, -1.5, 0]}
# Location label -> state index (0-15)
location_to_state = {"L1": 0, "L2": 1, "L3": 2, "L4": 3, "L5": 4, "L6": 5, "L7": 6, "L8": 7, "L9": 8, "L10": 9,
                     "L11": 10, "L12": 11, "L13": 12, "L14": 13, "L15": 14, "L16": 15}
# One action per target state: action i means "move to state i"
actions = list(range(16))
# Hand-coded reward matrix: rewards[s, s'] = 1 for a move to an adjacent free
# cell (8-connected, diagonals included), -10 for a move into an obstacle cell,
# and 0 for non-adjacent pairs; obstacle rows (states 5, 6, 9, 10) are all zero.
rewards = np.array([[0, 1, 0, 0, 1, -10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                    [1, 0, 1, 0, 1, -10, -10, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                    [0, 1, 0, 1, 0, -10, -10, 1, 0, 0, 0, 0, 0, 0, 0, 0],
                    [0, 0, 1, 0, 0, 0, -10, 1, 0, 0, 0, 0, 0, 0, 0, 0],
                    [1, 1, 0, 0, 0, -10, 0, 0, 1, -10, 0, 0, 0, 0, 0, 0],
                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                    [0, 0, 1, 1, 0, 0, -10, 0, 0, 0, -10, 1, 0, 0, 0, 0],
                    [0, 0, 0, 0, 1, -10, 0, 0, 0, -10, 0, 0, 1, 1, 0, 0],
                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                    [0, 0, 0, 0, 0, 0, -10, 1, 0, 0, -10, 0, 0, 0, 1, 1],
                    [0, 0, 0, 0, 0, 0, 0, 0, 1, -10, 0, 0, 0, 1, 0, 0],
                    [0, 0, 0, 0, 0, 0, 0, 0, 1, -10, -10, 0, 1, 0, 1, 0],
                    [0, 0, 0, 0, 0, 0, 0, 0, 0, -10, -10, 1, 0, 1, 0, 1],
                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -10, 1, 0, 0, 1, 0]])
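# The matrix above follows a regular pattern, so as a sanity check it can be
# rebuilt programmatically (a sketch, not part of the original script):
obstacles = {5, 6, 9, 10}  # state indices of the 4 obstacle cells (L6, L7, L10, L11)
_check = np.zeros((16, 16), dtype=int)
for _s in range(16):
    if _s in obstacles:
        continue  # no moves out of an obstacle cell
    _r1, _c1 = divmod(_s, 4)
    for _t in range(16):
        _r2, _c2 = divmod(_t, 4)
        if max(abs(_r1 - _r2), abs(_c1 - _c2)) == 1:  # 8-connected neighbour
            _check[_s, _t] = -10 if _t in obstacles else 1
assert (_check == rewards).all()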
def optimal_route(start_location, end_location):
    """Train a Q-table for the given goal, then greedily walk from start to goal."""
    Q = np.zeros([16, 16])
    rewards_new = np.copy(rewards)
    ending_state = location_to_state[end_location]
    rewards_new[ending_state, ending_state] = 900  # large reward for staying at the goal
    # Q-learning: each iteration samples one transition and applies the update
    #   Q(s, s') += alpha * (R(s, s') + gamma * max_s'' Q(s', s'') - Q(s, s'))
    # where the "action" taken in state s is simply the choice of next state s'.
    for i in range(episodes):
        print(f"this is episode {i}")
        # Sample a starting state from the free (non-obstacle) cells
        current_state = random.choice([0, 1, 2, 3, 4, 7, 8, 11, 12, 13, 14, 15])
        print(f"current state is {current_state}")
        # Collect every state reachable from here with positive reward
        playable_states = [j for j in range(16) if rewards_new[current_state, j] > 0]
        print(f"the playable states are {playable_states}")
        next_state = np.random.choice(playable_states)
        print(f"the next state is {next_state}")
        TD = rewards_new[current_state, next_state] + gamma * np.max(Q[next_state]) \
            - Q[current_state, next_state]
        Q[current_state, next_state] += alpha * TD
    print(Q)
    # Walk greedily through the learned Q-table; this assumes training has
    # converged enough that the greedy path actually reaches the goal.
    state_to_location = {state: location for location, state in location_to_state.items()}
    route = [start_location]
    next_location = start_location
    while next_location != end_location:
        starting_state = location_to_state[start_location]
        next_state = np.argmax(Q[starting_state])
        next_location = state_to_location[next_state]
        route.append(next_location)
        start_location = next_location
    return route
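# A quick sanity check on the returned route (a sketch, not in the original):
# route = optimal_route("L1", "L16")
# assert route[0] == "L1" and route[-1] == "L16"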
def location_to_coordination(result):
    # Drop the start location (the robot is already there) and flatten the
    # remaining [x, y, z] triples into one list: [x1, y1, z1, x2, y2, z2, ...]
    points = [location_to_state_coordination[result[res + 1]] for res in range(len(result) - 1)]
    A = []
    for i in range(len(points)):
        for j in range(3):
            A.append(points[i][j])
    return A
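# Example (hypothetical route): location_to_coordination(["L1", "L5", "L9"])
# drops "L1" and returns [0.5, 1.5, 0, -0.5, 1.5, 0].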
# ------------------------------------RUN-----------------
# start_location = "L1"
# end_location = "L15"  # or prompt for it: end_location = input("please tell me the end location: ")
# points_seq = location_to_coordination(optimal_route(start_location, end_location))
# print(points_seq)
# n = 3
# Regroup the flat list into a list of [x, y, z] points: [[point1], [point2], ..., [pointn]]
# points = [points_seq[i:i + n] for i in range(0, len(points_seq), n)]
# print(points)
# result = optimal_route(start_location, end_location)
# print(result)
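# A minimal runnable driver, sketched from the commented RUN block above
# (the "L1" -> "L15" start/goal pair is the one that block used):
if __name__ == "__main__":
    route = optimal_route("L1", "L15")
    print(route)
    points_seq = location_to_coordination(route)
    # Regroup the flat [x, y, z, x, y, z, ...] list into 3-element points
    points = [points_seq[i:i + 3] for i in range(0, len(points_seq), 3)]
    print(points)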