 }
 
-map_name_to_env = { "Ocean": { "Env" : OceanEnv,
+map_name_to_env = { "Ocean Env": { "Env" : OceanEnv,
                     "model" : (transition_probability_ocean, reward_probability_ocean),
                     "is_state_done" : lambda state : state == 0,
-                    "range_values" : [-20, 5]
+                    "range_values" : [-20, 5],
+                    "image_path" : "figure/ocean_env.jpeg",
+                    "description" : "In this environment you need to reach the beach as fast as possible. \
+                    You start in the ocean and can only move in 2 directions. \
+                    The state consists of the distance to the beach and is represented by an integer between 0 and 10 \
+                    (you can't go farther than 10). The reward is -1 at each step and 0 when you reach the beach. \
+                    The episode ends when you reach the beach. \
+                    ",
                     },
 
-                    "Nim" : { "Env" : NimEnv,
+                    "Nim's Game" : { "Env" : NimEnv,
                     "model" : (transition_probability_nim, reward_probability_nim),
                     "is_state_done" : lambda state : state <= 0,
-                    "range_values" : [-2, 2]
+                    "range_values" : [-2, 2],
+                    "image_path" : "figure/nim_env.png",
+                    "description" : "In this game you start with 10 matches and can remove 1, 2 or 3 matches at each step (these are your actions). The player who removes the last match loses. You play against a random agent. The state consists of the number of matches left and is represented by an integer between 0 and n_matches=25. The reward is 1 if you win, -1 if you lose, and 0 while the game is not finished. The episode ends when the game is finished."
                     },
 
                     "n-Bandit Contextual" : { "Env" : ContextualBanditEnv,
                     "model" : (transition_probability_CB, reward_probability_CB),
                     "is_state_done" : lambda state : state == -1,
-                    "range_values" : [-1, 4]
+                    "range_values" : [-1, 4],
+                    "image_path" : "figure/bandit_env.png",
+                    "description" : "In this famous environment, a foundational problem of theoretical RL, you have a slot machine with 4 arms. Each arm will give you a reward drawn from a distribution that you don't know. It is contextual because which arm is best depends on the state. Here the expected reward is r(s,a) = (s-a-1)%4, so the optimal action in each state is pi*(s) = s.",
                     },
 
 }
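
A quick sanity check of the claim in the bandit description: with 4 arms and expected reward r(s,a) = (s-a-1) % 4, the arm that maximizes the expected reward in state s is indeed a = s. A minimal standalone check (not part of the diff):

# Verify that argmax_a (s - a - 1) % 4 == s for every state s in {0, 1, 2, 3}.
for s in range(4):
    best_action = max(range(4), key=lambda a: (s - a - 1) % 4)
    assert best_action == s
    print(f"state {s}: optimal action = {best_action}")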
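For context, a rough sketch of how one of these registry entries might be consumed downstream. DummyEnv and its reset()/step() interface are assumptions for illustration; the real OceanEnv/NimEnv/ContextualBanditEnv classes are imported elsewhere in the repo:

# DummyEnv is a hypothetical stand-in; the real Env classes and their exact
# interface are defined elsewhere in the repo.
class DummyEnv:
    def reset(self):
        self.state = 10                            # start 10 steps from the beach
        return self.state

    def step(self, action):
        self.state = max(0, self.state - 1)        # drift one step toward the beach
        reward = 0 if self.state == 0 else -1      # -1 per step, 0 on arrival
        return self.state, reward

entry = {                                          # shaped like a map_name_to_env entry
    "Env": DummyEnv,
    "is_state_done": lambda state: state == 0,
    "range_values": [-20, 5],
    "image_path": "figure/ocean_env.jpeg",
    "description": "Reach the beach as fast as possible.",
}

env = entry["Env"]()                               # instantiate the registered class
state, total_reward = env.reset(), 0
while not entry["is_state_done"](state):           # roll out until the terminal state
    state, reward = env.step(action=0)
    total_reward += reward
print(entry["description"], "| return:", total_reward)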