@@ -41,13 +41,13 @@ class TaxiEnv(Env):
4141
4242 Map:
4343
44- +---------+
45- |R: | : :G|
46- | : | : : |
47- | : : : : |
48- | | : | : |
49- |Y| : |B: |
50- +---------+
44+ +---------+
45+ |R: | : :G|
46+ | : | : : |
47+ | : : : : |
48+ | | : | : |
49+ |Y| : |B: |
50+ +---------+
5151
5252 From "Hierarchical Reinforcement Learning with the MAXQ Value Function Decomposition"
5353 by Tom Dietterich [<a href="#taxi_ref">1</a>].
@@ -68,7 +68,7 @@ class TaxiEnv(Env):
6868 locations of the passenger (including the case when the passenger is in the
6969 taxi), and 4 destination locations.
7070
71- Destination on the map are represented with the first letter of the color.
71+ Destinations on the ansi-rendered map are represented with the first letter of the color.
7272
7373 Passenger locations:
7474 - 0: Red
@@ -100,7 +100,7 @@ class TaxiEnv(Env):
100100 and 3 destinations (excluding the passenger's current location).
101101
102102 ## Rewards
103- - -1 per step unless other reward is triggered.
103+ - -1 per step unless another reward is triggered.
104104 - +20 delivering passenger.
105105 - -10 executing "pickup" and "drop-off" actions illegally.
106106
@@ -111,16 +111,16 @@ class TaxiEnv(Env):
111111 The episode ends if the following happens:
112112
113113 - Termination:
114- 1. The taxi drops off the passenger.
114+ 1. The taxi drops off the passenger.
115115
116116 - Truncation (when using the time_limit wrapper):
117- 1. The length of the episode is 200.
117+ 1. The length of the episode is 200.
118118
119119 ## Information
120120
121121 `step()` and `reset()` return a dict with the following keys:
122- - p - transition probability for the state.
123- - action_mask - if actions will cause a transition to a new state.
122+ - "prob": transition probability for the state.
123+ - "action_mask": if actions will cause a transition to a new state. This was added in v0.25.0.
124124
125125 For some cases, taking an action will have no effect on the state of the episode.
126126 In v0.25.0, ``info["action_mask"]`` contains a np.ndarray for each of the actions specifying
@@ -133,25 +133,34 @@ class TaxiEnv(Env):
133133
134134 ```python
135135 import gymnasium as gym
136- gym.make('Taxi-v3')
136+ gym.make('Taxi-v4')
137137 ```
138138
139- <a id="is_raining"></a>`is_raining=False`: If True the cab will move in intended direction with
140- probability of 80% else will move in either left or right of target direction with
141- equal probability of 10% in both directions.
139+ <a id="is_rainy"></a>`is_rainy=False`: If True the cab will move in the intended direction with probability
140+ 80%, controlled by `rainy_probability`, else in a lateral direction with equal probability.
141+ Pickup and dropoff actions remain deterministic (probability 1.0).
142142
143- <a id="fickle_passenger"></a>`fickle_passenger=False`: If true the passenger has a 30% chance of changing
144- destinations when the cab has moved one square away from the passenger's source location. Passenger fickleness
145- only happens on the first pickup and successful movement. If the passenger is dropped off at the source location
146- and picked up again, it is not triggered again.
143+ <a id="rainy_probability"></a>`rainy_probability=0.8`: When `is_rainy=True`, the probability of
144+ moving in the intended direction. Each lateral direction is given `(1 - rainy_probability) / 2`.
145+
146+ <a id="fickle_passenger"></a>`fickle_passenger=False`: If True the passenger has a 30% chance,
147+ controlled by `fickle_probability`, of changing destinations when the cab has moved one square away from the
148+ passenger's source location. Passenger fickleness only happens on the first pickup and successful movement.
149+ If the passenger is dropped off at the source location and picked up again, it isn't triggered again.
150+
151+ <a id="fickle_probability"></a>`fickle_probability=0.3`: When `fickle_passenger=True`, the probability
152+ that the passenger changes destination on the first move after pickup.
147153
148154 ## References
149155 <a id="taxi_ref"></a>[1] T. G. Dietterich, “Hierarchical Reinforcement Learning with the MAXQ Value Function Decomposition,”
150156 Journal of Artificial Intelligence Research, vol. 13, pp. 227–303, Nov. 2000, doi: 10.1613/jair.639.
151157
152158 ## Version History
153- * v3: Map Correction + Cleaner Domain Description, v0.25.0 action masking added to the reset and step information
154- - In Gymnasium `1.2.0` the `is_rainy` and `fickle_passenger` arguments were added to align with Dietterich, 2000
159+ * v4: In v1.3.0, fix `is_rainy=True` and `fickle_passenger=True` implementations
160+ - Add `rainy_probability` and `fickle_probability` arguments to tune the stochastic behaviour
161+ * v3: Map Correction + Cleaner Domain Description,
162+ - In v0.25.0 action masking added to the reset and step information
163+ - In v1.2.0 added `is_rainy` and `fickle_passenger` arguments to align with Dietterich, 2000 Section 7.1
155164 * v2: Disallow Taxi start location = goal location, Update Taxi observations in the rollout, Update Taxi reward threshold.
156165 * v1: Remove (3,2) from locs, add passidx<4 check
157166 * v0: Initial version release
@@ -217,15 +226,21 @@ def _build_dry_transitions(self, row, col, pass_idx, dest_idx, action):
217226         new_state = self.encode(new_row, new_col, new_pass_idx, dest_idx)
218227         self.P[state][action].append((1.0, new_state, reward, terminated))
219228
220-     def _calc_new_position(self, row, col, movement, offset=0):
221-         """Calculates the new position for a row and col to the movement."""
229+     def _calc_new_position(self, row, col, movement):
230+         """Calculates the new position for a row and col to the movement.
231+
232+         East/west moves are checked against the interior wall characters in
233+         ``self.desc``; north/south moves have no walls so only the grid
234+         boundary clamp applies.
235+         """
222236         dr, dc = movement
223237         new_row = max(0, min(row + dr, self.max_row))
224238         new_col = max(0, min(col + dc, self.max_col))
225-         if self.desc[1 + new_row, 2 * new_col + offset] == b":":
226-             return new_row, new_col
227-         else:  # Default to current position if not traversable
228-             return row, col
239+         if dc == 1 and self.desc[1 + new_row, 2 * new_col] != b":":
240+             return row, col  # east wall blocks
241+         if dc == -1 and self.desc[1 + new_row, 2 * new_col + 2] != b":":
242+             return row, col  # west wall blocks
243+         return new_row, new_col
229244
230245     def _build_rainy_transitions(self, row, col, pass_idx, dest_idx, action):
231246         """Computes the next action for a state (row, col, pass_idx, dest_idx) and action for `is_rainy`."""
@@ -236,24 +251,33 @@ def _build_rainy_transitions(self, row, col, pass_idx, dest_idx, action):
236251         reward = -1  # default reward when there is no pickup/dropoff
237252         terminated = False
238253
254+         # (forward, left, right) relative to each heading.
255+         # Left/right follow the standard navigation convention:
256+         #   south → left=east, right=west
257+         #   north → left=west, right=east
258+         #   east → left=north, right=south
259+         #   west → left=south, right=north
239260         moves = {
240-             0: ((1, 0), (0, -1), (0, 1)),  # Down
241-             1: ((-1, 0), (0, -1), (0, 1)),  # Up
242-             2: ((0, 1), (1, 0), (-1, 0)),  # Right
243-             3: ((0, -1), (1, 0), (-1, 0)),  # Left
261+             0: ((1, 0), (0, 1), (0, -1)),  # Down (south): left=east, right=west
262+             1: ((-1, 0), (0, -1), (0, 1)),  # Up (north): left=west, right=east
263+             2: ((0, 1), (-1, 0), (1, 0)),  # Right (east): left=north, right=south
264+             3: ((0, -1), (1, 0), (-1, 0)),  # Left (west): left=south, right=north
244265         }
245266
246-         # Check if movement is allowed
267+         # Check if the primary move is possible. When it is not (wall or grid
268+         # boundary), no lateral drift occurs either — consistent across all four
269+         # directions.
247270         if (
248-             action in {0, 1}
271+             (action == 0 and row < self.max_row)
272+             or (action == 1 and row > 0)
249273             or (action == 2 and self.desc[1 + row, 2 * col + 2] == b":")
250274             or (action == 3 and self.desc[1 + row, 2 * col] == b":")
251275         ):
252276             dr, dc = moves[action][0]
253277             new_row = max(0, min(row + dr, self.max_row))
254278             new_col = max(0, min(col + dc, self.max_col))
255279
256-             left_pos = self._calc_new_position(row, col, moves[action][1], offset=2)
280+             left_pos = self._calc_new_position(row, col, moves[action][1])
257281             right_pos = self._calc_new_position(row, col, moves[action][2])
258282         elif action == 4:  # pickup
259283             new_pass_idx, reward = self._pickup(taxi_loc, new_pass_idx, reward)
@@ -269,9 +293,15 @@ def _build_rainy_transitions(self, row, col, pass_idx, dest_idx, action):
269293                 right_pos[0], right_pos[1], new_pass_idx, dest_idx
270294             )
271295
272-             self.P[state][action].append((0.8, intended_state, -1, terminated))
273-             self.P[state][action].append((0.1, left_state, -1, terminated))
274-             self.P[state][action].append((0.1, right_state, -1, terminated))
296+             self.P[state][action].append(
297+                 (self.rainy_probability, intended_state, -1, terminated)
298+             )
299+             self.P[state][action].append(
300+                 (self._rainy_lateral_probability, left_state, -1, terminated)
301+             )
302+             self.P[state][action].append(
303+                 (self._rainy_lateral_probability, right_state, -1, terminated)
304+             )
275305         else:
276306             self.P[state][action].append((1.0, intended_state, reward, terminated))
277307
@@ -280,19 +310,26 @@ def __init__(
280310         render_mode: str | None = None,
281311         is_rainy: bool = False,
282312         fickle_passenger: bool = False,
313+         rainy_probability: float = 0.8,
314+         fickle_probability: float = 0.3,
283315     ):
284316         self.desc = np.asarray(MAP, dtype="c")
285317
286318         self.locs = locs = [(0, 0), (0, 4), (4, 0), (4, 3)]
287319         self.locs_colors = [(255, 0, 0), (0, 255, 0), (255, 255, 0), (0, 0, 255)]
288320
321+         self.rainy_probability = rainy_probability
322+         self._rainy_lateral_probability = (1.0 - rainy_probability) / 2.0
323+         self.fickle_probability = fickle_probability
324+
289325         num_states = 500
290326         num_rows = 5
291327         num_columns = 5
292328         self.max_row = num_rows - 1
293329         self.max_col = num_columns - 1
294330         self.initial_state_distrib = np.zeros(num_states)
295331         num_actions = 6
332+         # P = dict[state, dict[action, tuple[probability, next-state, reward, termination]]]
296333         self.P = {
297334             state: {action: [] for action in range(num_actions)}
298335             for state in range(num_states)
@@ -328,7 +365,7 @@ def __init__(
328365
329366         self.render_mode = render_mode
330367         self.fickle_passenger = fickle_passenger
331-         self.fickle_step = self.fickle_passenger and self.np_random.random() < 0.3
368+         self.fickle_step = False
332369
333370         # pygame utils
334371         self.window = None
@@ -356,7 +393,7 @@ def encode(self, taxi_row, taxi_col, pass_loc, dest_idx):
356393         i += dest_idx
357394         return i
358395
359-     def decode(self, i):
396+     def decode(self, i) -> tuple[int, int, int, int]:
360397         out = []
361398         out.append(i % 4)
362399         i = i // 4
@@ -366,7 +403,7 @@ def decode(self, i):
366403         i = i // 5
367404         out.append(i)
368405         assert 0 <= i < 5
369-         return reversed(out)
406+         return tuple(reversed(out))
370407
371408     def action_mask(self, state: int):
372409         """Computes an action mask for the action space using the state information."""
@@ -429,7 +466,9 @@ def reset(
429466         super().reset(seed=seed)
430467         self.s = categorical_sample(self.initial_state_distrib, self.np_random)
431468         self.lastaction = None
432-         self.fickle_step = self.fickle_passenger and self.np_random.random() < 0.3
469+         self.fickle_step = (
470+             self.fickle_passenger and self.np_random.random() < self.fickle_probability
471+         )
433472         self.taxi_orientation = 0
434473
435474         if self.render_mode == "human":