diff --git a/.gitignore b/.gitignore index 894a44c..522ac3c 100644 --- a/.gitignore +++ b/.gitignore @@ -2,7 +2,7 @@ __pycache__/ *.py[cod] *$py.class - +*.vscode/ # C extensions *.so diff --git a/MADRaS/__init__.py b/MADRaS/__init__.py index 384b487..c0d66af 100644 --- a/MADRaS/__init__.py +++ b/MADRaS/__init__.py @@ -1,9 +1,9 @@ """Env Registration.""" from gym.envs.registration import register -register( - id='Madras-v0', - entry_point='MADRaS.envs:MadrasEnv', - max_episode_steps=10000, - reward_threshold=25.0, -) +# register( +# id='Madras-v0', +# entry_point='MADRaS.envs:MadrasEnv', +# max_episode_steps=10000, +# reward_threshold=25.0, +# ) diff --git a/MADRaS/agents/generic/pid_test.py b/MADRaS/agents/generic/pid_test.py new file mode 100644 index 0000000..282530d --- /dev/null +++ b/MADRaS/agents/generic/pid_test.py @@ -0,0 +1,46 @@ +import numpy as np +import gym +from MADRaS.envs.gym_madras_v2 import MadrasEnv +import os +import sys +import logging + +logging.basicConfig(level=logging.DEBUG) + +def test_madras_pid(vel, file_name): + env = MadrasEnv() + for key, val in env.agents.items(): + print("Observation Space ", val.observation_space) + print("Obs_dim ", val.obs_dim) + print("Testing reset...") + obs = env.reset() + vel = float(vel) + a = [0.0, vel] + b = [0.1, 0.00] + c = [0.2, -0.2] + print("Initial observation: {}." + " Verify if the number of dimensions is right.".format(obs)) + for key, value in obs.items(): + print("{}: {}".format(key, len(value))) + print("Testing step...") + running_rew = 0 + speeds = [] + for t in range(300): + obs, r, done, _ = env.step({"MadrasAgent_0": a}) + #print("{}".format(obs)) + + # a = [0.0, 0.0] + running_rew += r["MadrasAgent_0"] + #print("{}: reward={}, done={}".format(t, running_rew, done)) + #logger.info("HELLO") + speeds.append(obs["MadrasAgent_0"][21]) + if (done['__all__']): + env.reset() + print(speeds) + np.save(file_name, np.array(speeds)) + os.system("pkill torcs") + + +if __name__=='__main__': + #test_madras_vanilla() + test_madras_pid(sys.argv[1], sys.argv[2]) \ No newline at end of file diff --git a/MADRaS/test_environment.py b/MADRaS/agents/generic/test_environment.py similarity index 100% rename from MADRaS/test_environment.py rename to MADRaS/agents/generic/test_environment.py diff --git a/MADRaS/agents/generic/test_environment_v2.py b/MADRaS/agents/generic/test_environment_v2.py new file mode 100644 index 0000000..fe309b2 --- /dev/null +++ b/MADRaS/agents/generic/test_environment_v2.py @@ -0,0 +1,69 @@ +import numpy as np +import gym +from MADRaS.envs.gym_madras_v2 import MadrasEnv +import os +import sys +import logging + +logging.basicConfig(filename='Telemetry.log', level=logging.DEBUG) +# logger = logging.getLogger(__name__) +# logger.setLevel(logging.DEBUG) +# fh = logging.FileHandler('Telemetry.log') +# fh.setLevel(logging.DEBUG) +# logger.addHandler(fh) + + +def test_madras_vanilla(): + env = MadrasEnv() + print("Testing reset...") + obs = env.reset() + print("Initial observation: {}." 
+ " Verify if the number of dimensions {} is right.".format(obs, len(obs))) + print("Testing step...") + a = [0.0, 1.0, -1.0] + a = [0.0, 0.2, 0.0] + b = [0.1, 0.3, 0.0] + c = [0.2, 0.4, 0.0] + for t in range(4000): + obs, r, done, _ = env.step({"MadrasAgent_0": a, "MadrasAgent_1": b, "MadrasAgent_2": c}) + if ((t+1)%150 == 0): + a = [0.0, -1.0, 1.0] + print("{}: reward={}, done={}".format(t, r, done)) + dones = [x for x in done.values()] + if np.any(dones): + env.reset() + os.system("pkill torcs") + + +def test_madras_pid(): + env = MadrasEnv() + for key, val in env.agents.items(): + print("Observation Space ", val.observation_space) + print("Obs_dim ", val.obs_dim) + print("Testing reset...") + obs = env.reset() + a = [0.0, 0.2] + b = [0.1, 0.00] + c = [0.2, -0.2] + print("Initial observation: {}." + " Verify if the number of dimensions is right.".format(obs)) + for key, value in obs.items(): + print("{}: {}".format(key, len(value))) + print("Testing step...") + running_rew = 0 + for t in range(4000): + obs, r, done, _ = env.step({"MadrasAgent_0": a, "MadrasAgent_1": b, "MadrasAgent_2": c}) + #print("{}".format(obs)) + #if ((t+1)%15 == 0): + # a = [0.0, 0.0] + running_rew += r["MadrasAgent_0"] + #print("{}: reward={}, done={}".format(t, running_rew, done)) + #logger.info("HELLO") + if (done['__all__']): + env.reset() + os.system("pkill torcs") + + +if __name__=='__main__': + test_madras_vanilla() + #test_madras_pid() \ No newline at end of file diff --git a/MADRaS/eval_rllib_agent.py b/MADRaS/agents/rllib/eval_rllib_agent.py similarity index 100% rename from MADRaS/eval_rllib_agent.py rename to MADRaS/agents/rllib/eval_rllib_agent.py diff --git a/MADRaS/rllib_helpers.py b/MADRaS/agents/rllib/rllib_helpers.py similarity index 100% rename from MADRaS/rllib_helpers.py rename to MADRaS/agents/rllib/rllib_helpers.py diff --git a/MADRaS/train_rllib_agent.py b/MADRaS/agents/rllib/train_rllib_agent.py similarity index 100% rename from MADRaS/train_rllib_agent.py rename to MADRaS/agents/rllib/train_rllib_agent.py diff --git a/MADRaS/agents/rllib/train_rllib_multi_agent.py b/MADRaS/agents/rllib/train_rllib_multi_agent.py new file mode 100644 index 0000000..187c93f --- /dev/null +++ b/MADRaS/agents/rllib/train_rllib_multi_agent.py @@ -0,0 +1,101 @@ +import ray +import gym +import argparse +from ray.rllib.agents.ppo.ppo import PPOTrainer +from ray.rllib.agents.ppo.ppo_policy import PPOTFPolicy +from ray.tune.logger import pretty_print +from ray.rllib.env.multi_agent_env import MultiAgentEnv +from MADRaS.envs.gym_madras_v2 import MadrasEnv +import logging +import numpy as np +logging.basicConfig(filename='Telemetry.log', level=logging.DEBUG) + +class MadrasRllib(MultiAgentEnv, MadrasEnv): + """ + MADRaS rllib Env wrapper. 
+ """ + def __init__(self, *args): + MadrasEnv.__init__(self) + + def reset(self): + return MadrasEnv.reset(self) + + def step(self, action_dict): + return MadrasEnv.step(self, action_dict) + +def on_episode_end(info): + episode = info["episode"] + rewards = episode.agent_rewards + total_episode = episode.total_reward + + + episode.custom_metrics["agent0/rew_2"] = rewards[('MadrasAgent_0', 'ppo_policy_0')]**2.0 + episode.custom_metrics["agent1/rew_2"] = rewards[('MadrasAgent_1', 'ppo_policy_1')]**2.0 + episode.custom_metrics["env_rew_2"] = total_episode**2.0 + +def on_sample_end(info): + print(info.keys()) + sample = info["samples"] + print(dir(sample)) + splits = sample.policy_batches['ppo_policy_0'].split_by_episode() + print(len(splits)) + for split in splits: + print("EPISODE= ",np.sum(split['rewards'])) + + + +parser = argparse.ArgumentParser() +parser.add_argument("--num-iters", type=int, default=300) + +if __name__ == "__main__": + args = parser.parse_args() + ray.init() + + env = MadrasRllib() + + obs_spaces, action_spaces = [], [] + for agent in env.agents: + obs_spaces.append(env.agents[agent].observation_space) + action_spaces.append(env.agents[agent].action_space) + + print(obs_spaces) + print(action_spaces) + policies = {"ppo_policy_{}".format(i) : (PPOTFPolicy, obs_spaces[i], action_spaces[i], {}) for i in range(env.num_agents)} + + def policy_mapping_fn(agent_id): + id = agent_id.split("_")[-1] + return "ppo_policy_{}".format(id) + + ppo_trainer = PPOTrainer( + env=MadrasRllib, + config={ + "eager": False, + "num_workers": 1, + "num_gpus": 0, + "vf_clip_param": 20, + # "sample_batch_size": 20, #set them accordingly + "train_batch_size": 500, + "callbacks": { + "on_episode_end": on_episode_end, + #"on_sample_end": on_sample_end, + }, + #"lr": 5e-6, + # "sgd_minibatch_size": 24, + "multiagent": { + "policies": policies, + "policy_mapping_fn": policy_mapping_fn, + }, + }) + + #ppo_trainer.restore("{restore path}") + + for i in range(args.num_iters): + print("== Iteration", i, "==") + + # improve the PPO policy + if (i % 10 == 0): + checkpoint = ppo_trainer.save() + print("checkpoint saved at", checkpoint) + + logging.warning("-- PPO --") + print(pretty_print(ppo_trainer.train())) diff --git a/MADRaS/envs/__init__.py b/MADRaS/envs/__init__.py index 16c5aa1..b6eacf2 100644 --- a/MADRaS/envs/__init__.py +++ b/MADRaS/envs/__init__.py @@ -1,2 +1,2 @@ """Env Import.""" -from envs.gym_madras import MadrasEnv \ No newline at end of file +from MADRaS.envs.gym_madras_v2 import MadrasEnv \ No newline at end of file diff --git a/MADRaS/envs/data/communications.yml b/MADRaS/envs/data/communications.yml new file mode 100644 index 0000000..b493eb4 --- /dev/null +++ b/MADRaS/envs/data/communications.yml @@ -0,0 +1,22 @@ +MadrasAgent_0: + buff_size: 2 + vars: + - action + - speedX + - speedY + - track + comms: + - MadrasAgent_1 + - MadrasAgent_0 + + +MadrasAgent_1: + buff_size: 2 + vars: + - action + - speedX + - speedY + - trackPos + - opponents + comms: + - MadrasAgent_0 diff --git a/MADRaS/envs/data/sim_options.yml b/MADRaS/envs/data/sim_options.yml index 12c59aa..ae0f51a 100644 --- a/MADRaS/envs/data/sim_options.yml +++ b/MADRaS/envs/data/sim_options.yml @@ -5,10 +5,10 @@ gear_change: False client_max_steps: -1 # to be interpreted as np.inf visualise: True # whether to render TORCS window no_of_visualisations: 1 # To visualize multiple training instances (under MPI) in parallel set it to more than 1 -track_len: 2843.10 # in metres. CG Track 3. 
All track lengths can be found here: http://www.berniw.org/trb/tracks/tracklist.php -max_steps: 20000 # max episode length -track_width: 10.0 -target_speed: 13.89 # metres per sec +track_len: 5784.10 # in metres. All track lengths can be found here: http://www.berniw.org/trb/tracks/tracklist.php +max_steps: 5000 #40000 #15000 #20000 # max episode length +track_width: 11.0 +target_speed: 13.89 #27.78 # metres per sec state_dim: 29 early_stop: True normalize_actions: True # all actions in [-1, 1] @@ -52,7 +52,7 @@ rewards: TurnBackwardPenalty: scale: 10.0 AngAcclPenalty: - scale: 1.0 + scale: 2.0 max_ang_accl: 2.0 # Done function @@ -64,38 +64,219 @@ dones: - OutOfTrack # Traffic agents -# traffic: -# - ConstVelTrafficAgent: -# target_speed: 50 -# target_lane_pos: -0.5 -# collision_time_window: 1 # second -# pid_settings: -# accel_pid: -# - 10.5 # a_x -# - 0.05 # a_y -# - 2.8 # a_z -# steer_pid: -# - 5.1 -# - 0.001 -# - 0.000001 -# accel_scale: 1.0 -# steer_scale: 0.1 -# - SinusoidalSpeedAgent: -# speed_amplitude: 50 -# speed_time_period: 1000 -# target_lane_pos: -0.25 -# collision_time_window: 1 # second -# pid_settings: -# accel_pid: -# - 10.5 # a_x -# - 0.05 # a_y -# - 2.8 # a_z -# steer_pid: -# - 5.1 -# - 0.001 -# - 0.000001 -# accel_scale: 1.0 -# steer_scale: 0.1 +traffic: + - ParkedAgent: + target_speed: 50 + parking_lane_pos: + low: -0.8 + high: 0.0 + parking_dist_from_start: + low: 185 + high: 200 + collision_time_window: 1.2 + pid_settings: + accel_pid: + - 10.5 # a_x + - 0.05 # a_y + - 2.8 # a_z + steer_pid: + - 5.1 + - 0.001 + - 0.000001 + accel_scale: 1.0 + steer_scale: 0.1 + - ParkedAgent: + target_speed: 50 + parking_lane_pos: + low: 0.0 + high: 0.8 + parking_dist_from_start: + low: 165 + high: 180 + collision_time_window: 1.2 + pid_settings: + accel_pid: + - 10.5 # a_x + - 0.05 # a_y + - 2.8 # a_z + steer_pid: + - 5.1 + - 0.001 + - 0.000001 + accel_scale: 1.0 + steer_scale: 0.1 + - ParkedAgent: + target_speed: 50 + parking_lane_pos: + low: -0.8 + high: 0.0 + parking_dist_from_start: + low: 145 + high: 160 + collision_time_window: 1.2 + pid_settings: + accel_pid: + - 10.5 # a_x + - 0.05 # a_y + - 2.8 # a_z + steer_pid: + - 5.1 + - 0.001 + - 0.000001 + accel_scale: 1.0 + steer_scale: 0.1 + - ParkedAgent: + target_speed: 50 + parking_lane_pos: + low: 0.0 + high: 0.8 + parking_dist_from_start: + low: 125 + high: 140 + collision_time_window: 1.2 + pid_settings: + accel_pid: + - 10.5 # a_x + - 0.05 # a_y + - 2.8 # a_z + steer_pid: + - 5.1 + - 0.001 + - 0.000001 + accel_scale: 1.0 + steer_scale: 0.1 + - ParkedAgent: + target_speed: 50 + parking_lane_pos: + low: -0.8 + high: 0.0 + parking_dist_from_start: + low: 100 + high: 120 + collision_time_window: 1.2 + pid_settings: + accel_pid: + - 10.5 # a_x + - 0.05 # a_y + - 2.8 # a_z + steer_pid: + - 5.1 + - 0.001 + - 0.000001 + accel_scale: 1.0 + steer_scale: 0.1 + - ParkedAgent: + target_speed: 50 + parking_lane_pos: + low: 0.0 + high: 0.8 + parking_dist_from_start: + low: 80 + high: 95 + collision_time_window: 1.2 + pid_settings: + accel_pid: + - 10.5 # a_x + - 0.05 # a_y + - 2.8 # a_z + steer_pid: + - 5.1 + - 0.001 + - 0.000001 + accel_scale: 1.0 + steer_scale: 0.1 + - ParkedAgent: + target_speed: 50 + parking_lane_pos: + low: -0.8 + high: 0.0 + parking_dist_from_start: + low: 50 + high: 75 + collision_time_window: 1.2 + pid_settings: + accel_pid: + - 10.5 # a_x + - 0.05 # a_y + - 2.8 # a_z + steer_pid: + - 5.1 + - 0.001 + - 0.000001 + accel_scale: 1.0 + steer_scale: 0.1 + - ParkedAgent: + target_speed: 50 + parking_lane_pos: + low: 0.0 
+ high: 0.8 + parking_dist_from_start: + low: 30 + high: 45 + collision_time_window: 1.2 + pid_settings: + accel_pid: + - 10.5 # a_x + - 0.05 # a_y + - 2.8 # a_z + steer_pid: + - 5.1 + - 0.001 + - 0.000001 + accel_scale: 1.0 + steer_scale: 0.1 + - ParkedAgent: + target_speed: 50 + parking_lane_pos: + low: -0.8 + high: 0.0 + parking_dist_from_start: + low: 15 + high: 25 + collision_time_window: 1.2 + pid_settings: + accel_pid: + - 10.5 # a_x + - 0.05 # a_y + - 2.8 # a_z + steer_pid: + - 5.1 + - 0.001 + - 0.000001 + accel_scale: 1.0 + steer_scale: 0.1 + + # - ConstVelTrafficAgent: + # target_speed: 50 + # target_lane_pos: -0.5 + # collision_time_window: 1 # second + # pid_settings: + # accel_pid: + # - 10.5 # a_x + # - 0.05 # a_y + # - 2.8 # a_z + # steer_pid: + # - 5.1 + # - 0.001 + # - 0.000001 + # accel_scale: 1.0 + # steer_scale: 0.1 + # - SinusoidalSpeedAgent: + # speed_amplitude: 50 + # speed_time_period: 1000 + # target_lane_pos: -0.25 + # collision_time_window: 1 # second + # pid_settings: + # accel_pid: + # - 10.5 # a_x + # - 0.05 # a_y + # - 2.8 # a_z + # steer_pid: + # - 5.1 + # - 0.001 + # - 0.000001 + # accel_scale: 1.0 + # steer_scale: 0.1 # - RandomLaneSwitchAgent: # target_speed: 100 # collision_time_window: 1 # second diff --git a/MADRaS/envs/data/sim_options_v2.yml b/MADRaS/envs/data/sim_options_v2.yml index 2f5f5ef..e5ebdaa 100644 --- a/MADRaS/envs/data/sim_options_v2.yml +++ b/MADRaS/envs/data/sim_options_v2.yml @@ -1,10 +1,30 @@ torcs_server_port: 60934 -visualise: True # whether to render TORCS window +visualise: False # whether to render TORCS window no_of_visualisations: 1 # To visualize multiple training instances (under MPI) in parallel set it to more than 1 -track_len: 2843.10 # in metres. CG Track 3. All track lengths can be found here: http://www.berniw.org/trb/tracks/tracklist.php -max_steps: 20000 # max episode length +track_len: 2587.54 # in metres. CG Track 3. 
All track lengths can be found here: http://www.berniw.org/trb/tracks/tracklist.php +max_steps: 1000 # max episode length track_width: 10.0 +init_wait: 25 #initial waiting period for the learning agents after the ep starts vision: False # whether to use vision input +randomize_env: False +noisy_observations: True + +server_config: + max_cars: 1 + min_traffic_cars: 0 + track_names: + - forza + - aalborg + - alpine-1 + - alpine-2 + distance_to_start: 25 + torcs_server_config_dir: /home/sohan/.torcs/config/raceman/ # This must be the value for TORCS with rendering on + scr_server_config_dir: /home/sohan/usr/local/share/games/torcs/drivers/scr_server/ + traffic_car: p406 # get full list of cars here: /home/anirban/usr/local/share/games/torcs/cars/ + learning_car: # get full list of cars here: /home/anirban/usr/local/share/games/torcs/cars/ + - car1-stock1 + - car1-stock2 + # agent config: agents: @@ -13,9 +33,11 @@ agents: throttle: True gear_change: False client_max_steps: -1 # to be interpreted as np.inf - target_speed: 13.89 # metres per sec - state_dim: 24 + target_speed: 27.78 # metres per sec + state_dim: 24 # 24 early_stop: True + add_noise_to_actions: False + action_noise_std: 0.01 normalize_actions: True # all actions in [-1, 1] # PID params pid_assist: True @@ -35,6 +57,8 @@ agents: # Observation mode observations: mode: SingleAgentSimpleLapObs + multi_flag: True + buff_size: 1 normalize: False # gym_torcs normalizes by default obs_min: angle: -3.142 @@ -51,97 +75,267 @@ agents: scale: 1.0 AvgSpeedReward: scale: 1.0 + SuccessfulOvertakeReward: + scale: 10.0 CollisionPenalty: scale: 10.0 TurnBackwardPenalty: scale: 10.0 AngAcclPenalty: - scale: 1.0 + scale: 5.0 max_ang_accl: 2.0 - + # Done function dones: - - RaceOver - - TimeOut - Collision - TurnBackward - OutOfTrack - - MadrasAgent: - vision: False # whether to use vision input - throttle: True - gear_change: False - client_max_steps: -1 # to be interpreted as np.inf - target_speed: 12.89 # metres per sec - state_dim: 24 - early_stop: True - normalize_actions: True # all actions in [-1, 1] - # PID params - pid_assist: True - pid_settings: - accel_pid: - - 10.5 # a_x - - 0.05 # a_y - - 2.8 # a_z - steer_pid: - - 5.1 - - 0.001 - - 0.000001 - accel_scale: 1.0 - steer_scale: 0.1 - pid_latency: 10 - # Observation mode - observations: - mode: SingleAgentSimpleLapObs - normalize: False # gym_torcs normalizes by default - obs_min: - angle: -3.142 - track: 0.0 - trackPos: -1.0 - obs_max: - angle: 3.142 - track: 200.0 - trackPos: 1.0 + # - MadrasAgent: + # vision: False # whether to use vision input + # throttle: True + # gear_change: False + # client_max_steps: -1 # to be interpreted as np.inf + # target_speed: 27.78 # metres per sec + # state_dim: 24 # 24 + # early_stop: True + # normalize_actions: True # all actions in [-1, 1] + # # PID params + # pid_assist: False + # pid_settings: + # accel_pid: + # - 10.5 # a_x + # - 0.05 # a_y + # - 2.8 # a_z + # steer_pid: + # - 5.1 + # - 0.001 + # - 0.000001 + # accel_scale: 1.0 + # steer_scale: 0.1 + # pid_latency: 10 - # Reward function - rewards: - ProgressReward3: - scale: 1.0 - AvgSpeedReward: - scale: 1.0 - CollisionPenalty: - scale: 10.0 - TurnBackwardPenalty: - scale: 10.0 - AngAcclPenalty: - scale: 1.0 - max_ang_accl: 2.0 + # # Observation mode + # observations: + # mode: SingleAgentSimpleLapObs + # multi_flag: True + # buff_size: 1 + # normalize: False # gym_torcs normalizes by default + # obs_min: + # angle: -3.142 + # track: 0.0 + # trackPos: -1.0 + # obs_max: + # angle: 3.142 + # track: 
200.0 + # trackPos: 1.0 - # Done function - dones: - - RaceOver - - TimeOut - - Collision - - TurnBackward - - OutOfTrack + # # Reward function + # rewards: + # ProgressReward3: + # scale: 1.0 + # AvgSpeedReward: + # scale: 1.0 + # SuccessfulOvertakeReward: + # scale: 10.0 + # CollisionPenalty: + # scale: 10.0 + # TurnBackwardPenalty: + # scale: 10.0 + # AngAcclPenalty: + # scale: 5.0 + # max_ang_accl: 2.0 + + # # Done function + # dones: + # - RaceOver + # - TimeOut + # - Collision + # - TurnBackward + # - OutOfTrack + + # - MadrasAgent: + # vision: False # whether to use vision input + # throttle: True + # gear_change: False + # client_max_steps: -1 # to be interpreted as np.inf + # target_speed: 12.89 # metres per sec + # state_dim: 24 + # early_stop: True + # normalize_actions: True # all actions in [-1, 1] + # # PID params + # pid_assist: False + # pid_settings: + # accel_pid: + # - 10.5 # a_x + # - 0.05 # a_y + # - 2.8 # a_z + # steer_pid: + # - 5.1 + # - 0.001 + # - 0.000001 + # accel_scale: 1.0 + # steer_scale: 0.1 + # pid_latency: 10 + + # # Observation mode + # observations: + # mode: SingleAgentSimpleLapObs + # multi_flag: False + # buff_size: 1 + # normalize: False # gym_torcs normalizes by default + # obs_min: + # angle: -3.142 + # track: 0.0 + # trackPos: -1.0 + # obs_max: + # angle: 3.142 + # track: 200.0 + # trackPos: 1.0 + + # # Reward function + # rewards: + # ProgressReward2: + # scale: 1.0 + # AvgSpeedReward: + # scale: 1.0 + # CollisionPenalty: + # scale: 10.0 + # TurnBackwardPenalty: + # scale: 10.0 + # AngAcclPenalty: + # scale: 1.0 + # max_ang_accl: 2.0 + # # TaskReward: + # # scale: 1.0 + + # # Done function + # dones: + # - RaceOver + # - TimeOut + # - TurnBackward + # - OutOfTrack # Traffic agents -traffic: - - ConstVelTrafficAgent: - target_speed: 30 - target_lane_pos: -0.5 - collision_time_window: 1 # second - pid_settings: - accel_pid: - - 10.5 # a_x - - 0.05 # a_y - - 2.8 # a_z - steer_pid: - - 5.1 - - 0.001 - - 0.000001 - accel_scale: 1.0 - steer_scale: 0.1 +# traffic: +# - ParkedAgent: +# track_len: 2587.54 +# target_speed: 50 +# parking_lane_pos: +# low: -0.54 +# high: -0.47 +# parking_dist_from_start: +# low: 35 +# high: 35 +# collision_time_window: 1.2 +# pid_settings: +# accel_pid: +# - 10.5 # a_x +# - 0.05 # a_y +# - 2.8 # a_z +# steer_pid: +# - 5.1 +# - 0.001 +# - 0.000001 +# accel_scale: 1.0 +# steer_scale: 0.1 +# - ParkedAgent: +# track_len: 2587.54 +# target_speed: 50 +# parking_lane_pos: +# low: 0.47 +# high: 0.54 +# parking_dist_from_start: +# low: 35 +# high: 35 +# collision_time_window: 1.2 +# pid_settings: +# accel_pid: +# - 10.5 # a_x +# - 0.05 # a_y +# - 2.8 # a_z +# steer_pid: +# - 5.1 +# - 0.001 +# - 0.000001 +# accel_scale: 1.0 +# steer_scale: 0.1 +# - ParkedAgent: +# target_speed: 80 +# parking_lane_pos: +# low: -0.6 +# high: 0.0 +# parking_dist_from_start: +# low: 50 +# high: 75 +# collision_time_window: 1.2 +# pid_settings: +# accel_pid: +# - 10.5 # a_x +# - 0.05 # a_y +# - 2.8 # a_z +# steer_pid: +# - 5.1 +# - 0.001 +# - 0.000001 +# accel_scale: 1.0 +# steer_scale: 0.1 +# - ParkedAgent: +# target_speed: 80 +# parking_lane_pos: +# low: 0.0 +# high: 0.4 +# parking_dist_from_start: +# low: 30 +# high: 45 +# collision_time_window: 1.2 +# pid_settings: +# accel_pid: +# - 10.5 # a_x +# - 0.05 # a_y +# - 2.8 # a_z +# steer_pid: +# - 5.1 +# - 0.001 +# - 0.000001 +# accel_scale: 1.0 +# steer_scale: 0.1 +# - ParkedAgent: +# target_speed: 80 +# parking_lane_pos: +# low: -0.6 +# high: 0.0 +# parking_dist_from_start: +# low: 15 +# high: 25 +# 
collision_time_window: 1.2 +# pid_settings: +# accel_pid: +# - 10.5 # a_x +# - 0.05 # a_y +# - 2.8 # a_z +# steer_pid: +# - 5.1 +# - 0.001 +# - 0.000001 +# accel_scale: 1.0 +# steer_scale: 0.1 + + # - ConstVelTrafficAgent: + # target_speed: 30 + # target_lane_pos: -0.5 + # collision_time_window: 1 # second + # pid_settings: + # accel_pid: + # - 10.5 # a_x + # - 0.05 # a_y + # - 2.8 # a_z + # steer_pid: + # - 5.1 + # - 0.001 + # - 0.000001 + # accel_scale: 1.0 + # steer_scale: 0.1 # - SinusoidalSpeedAgent: # speed_amplitude: 50 # speed_time_period: 1000 diff --git a/MADRaS/envs/done_manager.py b/MADRaS/envs/done_manager.py index 0373807..9ef60f5 100644 --- a/MADRaS/envs/done_manager.py +++ b/MADRaS/envs/done_manager.py @@ -3,7 +3,7 @@ import warnings -class DoneManager(object): +class DoneHandler(object): """Composes the done function from a given done configuration.""" def __init__(self, cfg): self.dones = {} @@ -79,7 +79,7 @@ def check_done(self, game_config, game_state): else: max_steps = game_config.max_steps if self.num_steps >= max_steps: - print("Done: Episode terminated due to timeout.") + print("Done: Episode terminated due to timeout.{}".format(self.num_steps)) self.num_steps = 0 return True else: @@ -91,19 +91,19 @@ def reset(self): class Collision(MadrasDone): def __init__(self): - self.damage = 0.0 + self.damage = 1000.0 def check_done(self, game_config, game_state): del game_config if self.damage < game_state["damage"]: print("Done: Episode terminated because agent collided.") - self.damage = 0.0 + self.damage = 1000.0 return True else: return False def reset(self): - self.damage = 0.0 + self.damage = 1000.0 class TurnBackward(MadrasDone): @@ -130,5 +130,22 @@ def check_done(self, game_config, game_state): else: return False + def reset(self): + self.num_steps = 0 + + +class Rank1(MadrasDone): + def __init__(self): + self.num_steps = 0 + + def check_done(self, game_config, game_state): + self.num_steps += 1 + if game_state["racePos"] == 1: + print("Done: Episode terminated because agent is Rank 1 after {} steps.".format(self.num_steps)) + self.num_steps = 0 + return True + else: + return False + def reset(self): self.num_steps = 0 \ No newline at end of file diff --git a/MADRaS/envs/gym_madras.py b/MADRaS/envs/gym_madras.py index 0ed74c8..d5d9ec3 100644 --- a/MADRaS/envs/gym_madras.py +++ b/MADRaS/envs/gym_madras.py @@ -102,10 +102,10 @@ def __init__(self, cfg_path=None): self.action_dim = 2 # LanePos, Velocity else: self.action_dim = 3 # Steer, Accel, Brake - self.observation_manager = om.ObservationManager(self._config.observations, + self.observation_handler = om.ObservationHandler(self._config.observations, self._config.vision) - self.reward_manager = rm.RewardManager(self._config.rewards) - self.done_manager = dm.DoneManager(self._config.dones) + self.reward_handler = rm.RewardHandler(self._config.rewards) + self.done_handler = dm.DoneHandler(self._config.dones) TorcsEnv.__init__(self, vision=self._config.vision, @@ -117,9 +117,9 @@ def __init__(self, cfg_path=None): if self._config.normalize_actions: self.action_space = gym.spaces.Box(low=-np.ones(3), high=np.ones(3)) - self.observation_space = self.observation_manager.get_observation_space() + self.observation_space = self.observation_handler.get_observation_space() - self.state_dim = self.observation_manager.get_state_dim() # No. of sensors input + self.state_dim = self.observation_handler.get_state_dim() # No. 
of sensors input self.env_name = 'Madras_Env' self.client_type = 0 # Snakeoil client type self.initial_reset = True @@ -129,7 +129,7 @@ def __init__(self, cfg_path=None): self.seed() self.start_torcs_process() if self._config.traffic: - self.traffic_manager = traffic.MadrasTrafficManager( + self.traffic_handler = traffic.MadrasTrafficHandler( self._config.torcs_server_port, 1, self._config.traffic) def seed(self, seed=None): @@ -179,7 +179,7 @@ def start_torcs_process(self): def reset(self): """Reset Method to be called at the end of each episode.""" if self._config.traffic: - self.traffic_manager.reset() + self.traffic_handler.reset() if self.initial_reset: self.wait_for_observation() @@ -198,11 +198,11 @@ def reset(self): print("Reset: Reset failed as agent started off track. Retrying...") self.distance_traversed = 0 - s_t = self.observation_manager.get_obs(self.ob, self._config) + s_t = self.observation_handler.get_obs(self.ob, self._config) if self._config.pid_assist: self.PID_controller.reset() - self.reward_manager.reset() - self.done_manager.reset() + self.reward_handler.reset() + self.done_handler.reset() print("Reset: Starting new episode") if np.any(np.asarray(self.ob.track) < 0): print("Reset produced bad track values.") @@ -250,14 +250,14 @@ def step_vanilla(self, action): "damage": self.client.S.d["damage"], "trackPos": self.client.S.d["trackPos"], "track": self.client.S.d["track"]} - reward = self.reward_manager.get_reward(self._config, game_state) + reward = self.reward_handler.get_reward(self._config, game_state) - done = self.done_manager.get_done_signal(self._config, game_state) + done = self.done_handler.get_done_signal(self._config, game_state) - next_obs = self.observation_manager.get_obs(self.ob, self._config) + next_obs = self.observation_handler.get_obs(self.ob, self._config) if done: if self._config.traffic: - self.traffic_manager.kill_all_traffic_agents() + self.traffic_handler.kill_all_traffic_agents() self.client.R.d["meta"] = True # Terminate the episode print('Terminating PID {}'.format(self.client.serverPID)) @@ -295,16 +295,16 @@ def step_pid(self, desire): "damage": self.client.S.d["damage"], "trackPos": self.client.S.d["trackPos"], "track": self.client.S.d["track"]} - reward += self.reward_manager.get_reward(self._config, game_state) + reward += self.reward_handler.get_reward(self._config, game_state) if self._config.pid_assist: self.PID_controller.update(self.ob) - done = self.done_manager.get_done_signal(self._config, game_state) + done = self.done_handler.get_done_signal(self._config, game_state) if done: self.client.R.d["meta"] = True # Terminate the episode print('Terminating PID {}'.format(self.client.serverPID)) break - next_obs = self.observation_manager.get_obs(self.ob, self._config) + next_obs = self.observation_handler.get_obs(self.ob, self._config) return next_obs, reward, done, info diff --git a/MADRaS/envs/gym_madras_v2.py b/MADRaS/envs/gym_madras_v2.py index 337bd72..8250186 100644 --- a/MADRaS/envs/gym_madras_v2.py +++ b/MADRaS/envs/gym_madras_v2.py @@ -15,25 +15,36 @@ import math from copy import deepcopy import numpy as np -import utils.snakeoil3_gym_v2 as snakeoil3 -from utils.gym_torcs_v2 import TorcsEnv -from controllers.pid import PIDController +import MADRaS.utils.snakeoil3_gym_v2 as snakeoil3 +from MADRaS.utils.gym_torcs_v2 import TorcsEnv +from MADRaS.controllers.pid import PIDController import gym from gym.utils import seeding +from gym import spaces import os import subprocess import signal import time from mpi4py import MPI 
import socket -import envs.config_parser as config_parser -import envs.reward_manager as rm -import envs.done_manager as dm -import envs.observation_manager as om -import traffic.traffic as traffic +import MADRaS.utils.config_parser as config_parser +import MADRaS.utils.torcs_server_config as torcs_config +import MADRaS.utils.reward_handler as rm +import MADRaS.utils.done_handler_v2 as dm +import MADRaS.utils.observation_handler as om +import MADRaS.utils.communications_buffer as cb +import MADRaS.utils.madras_datatypes as md +import MADRaS.traffic.traffic as traffic +from collections import OrderedDict import multiprocessing +import logging -DEFAULT_SIM_OPTIONS_FILE = "envs/data/sim_options_v2.yml" +mt = md.MadrasDatatypes() +#logger = logging.getLogger(__name__) +path_and_file = os.path.realpath(__file__) +path, file = os.path.split(path_and_file) +DEFAULT_SIM_OPTIONS_FILE = os.path.join(path, "data", "sim_options_v2.yml") +DEFAULT_COMMUNICATION_MAP = os.path.join(path, "data", "communications.yml") class MadrasAgent(TorcsEnv, gym.Env): @@ -55,17 +66,23 @@ def __init__(self, name, torcs_server_port, cfg, sim_info={}): self.action_dim = 2 # LanePos, Velocity else: self.action_dim = 3 # Steer, Accel, Brake - self.observation_manager = om.ObservationManager(self._config.observations, + self.observation_handler = om.ObservationHandler(self._config.observations, self._config.vision) if self._config.normalize_actions: - self.action_space = gym.spaces.Box(low=-np.ones(3), high=np.ones(3)) + self.action_space = gym.spaces.Box(low=-np.ones(self.action_dim), high=np.ones(self.action_dim)) - self.observation_space = self.observation_manager.get_observation_space() - self.obs_dim = self.observation_manager.get_state_dim() # No. of sensors input - self.state_dim = self.observation_manager.get_state_dim() - self.reward_manager = rm.RewardManager(self._config.rewards) - self.done_manager = dm.DoneManager(self._config.dones) + self.observation_space = self.observation_handler.get_observation_space() + self.obs_dim = self.observation_handler.get_state_dim() # No. 
of sensors input + self.state_dim = self.observation_handler.get_state_dim() + self.reward_handler = rm.RewardHandler(self._config.rewards) + self.done_handler = dm.DoneHandler(self._config.dones, self.name) self.initial_reset = True + self.step_num = 0 + self.prev_dist = 0 + self.prev_damage = 0 + self.prev_pos = np.inf + self.multi_flag = self._config.observations['multi_flag'] + self.buff_size = self._config.observations['buff_size'] def create_new_client(self): while True: @@ -85,20 +102,21 @@ def get_observation_from_server(self): raw_ob = self.client.S.d # Get the current full-observation from torcs self.ob = self.make_observation(raw_ob) - print("[{}]: Initial observation: {}".format(self.name, self.ob)) + if np.any(np.asarray(self.ob.track) < 0): print("Reset produced bad track values.") self.distance_traversed = 0 - s_t = self.observation_manager.get_obs(self.ob, self._config) + s_t = self.observation_handler.get_obs(self.ob, self._config) return s_t def complete_reset(self): if self._config.pid_assist: self.PID_controller.reset() - self.reward_manager.reset() - self.done_manager.reset() - print("Reset: Starting new episode") + self.reward_handler.reset() + self.done_handler.reset() + self.step_num = 0 + logging.info("Reset: Starting new episode") def reset_new(self, return_dict={}): @@ -127,12 +145,11 @@ def reset(self, return_dict={}): print("Reset produced bad track values.") self.distance_traversed = 0 print("Initial observation: {}".format(self.ob)) - s_t = self.observation_manager.get_obs(self.ob, self._config) + s_t = self.observation_handler.get_obs(self.ob, self._config) if self._config.pid_assist: self.PID_controller.reset() - self.reward_manager.reset() - self.done_manager.reset() - print("Reset: Starting new episode") + self.reward_handler.reset() + self.done_handler.reset() return_dict[self.name] = s_t return return_dict @@ -152,6 +169,7 @@ def wait_for_observation(self): raw_ob = self.client.S.d # Get the current full-observation from torcs self.ob = self.make_observation(raw_ob) + #self.client.respond_to_server() except: pass @@ -160,7 +178,6 @@ def step_vanilla(self, action): if self._config.normalize_actions: action[1] = (action[1] + 1) / 2.0 # acceleration back to [0, 1] action[2] = (action[2] + 1) / 2.0 # brake back to [0, 1] - r = 0.0 try: self.ob, r, done, info = TorcsEnv.step(self, 0, @@ -170,24 +187,39 @@ def step_vanilla(self, action): except Exception as e: print("Exception {} caught at port {}".format(str(e), self.torcs_server_port)) self.wait_for_observation() + # TODO(santara): step not performed... 
game_state = {"torcs_reward": r, "torcs_done": done, + "prev_distance_traversed": self.prev_dist, + "prev_racePos": self.prev_pos, + "prev_damage": self.prev_damage, "distance_traversed": self.client.S.d['distRaced'], "angle": self.client.S.d["angle"], "damage": self.client.S.d["damage"], "trackPos": self.client.S.d["trackPos"], - "track": self.client.S.d["track"]} - reward = self.reward_manager.get_reward(self._config, game_state) + "track": self.client.S.d["track"], + "racePos": self.client.S.d["racePos"], + "num_steps": self.step_num} + logging.info("[{}] Lane_pos:{} Distance:{} Speed{}".format(self.name, + self.client.S.d["trackPos"], + self.client.S.d['distRaced'], + self.ob.speedX*self.default_speed)) + + reward = self.reward_handler.get_reward(self._config, game_state) - done = self.done_manager.get_done_signal(self._config, game_state) + done = self.done_handler.get_done_signal(self._config, game_state) - next_obs = self.observation_manager.get_obs(self.ob, self._config) + next_obs, full_obs = self.observation_handler.get_obs(self.ob, self._config, full_flag=True) + info['prev_dist'] = game_state["distance_traversed"] + info['prev_damage'] = game_state["damage"] + info['prev_pos'] = game_state["racePos"] + info['full_obs'] = full_obs._asdict() return next_obs, reward, done, info - def step_pid(self, desire): + def step_pid(self, desire, e): """Execute single step with lane_pos, velocity controls.""" if self._config.normalize_actions: # [-1, 1] should correspond to [-self._config.target_speed, @@ -201,7 +233,10 @@ def step_pid(self, desire): reward = 0.0 + #import pdb; pdb.set_trace() + for PID_step in range(self._config.pid_settings['pid_latency']): + a_t = self.PID_controller.get_action(desire) try: self.ob, r, done, info = TorcsEnv.step(self, PID_step, @@ -210,33 +245,112 @@ def step_pid(self, desire): except Exception as e: print("Exception {} caught at port {}".format(str(e), self.torcs_server_port)) self.wait_for_observation() - # TODO(santara): step not performed... 
game_state = {"torcs_reward": r, "torcs_done": done, + "prev_racePos": self.prev_pos, + "prev_damage": self.prev_damage, + "prev_distance_traversed": self.prev_dist, "distance_traversed": self.client.S.d["distRaced"], "angle": self.client.S.d["angle"], "damage": self.client.S.d["damage"], "trackPos": self.client.S.d["trackPos"], - "track": self.client.S.d["track"]} - - reward += self.reward_manager.get_reward(self._config, game_state) + "track": self.client.S.d["track"], + "racePos": self.client.S.d["racePos"], + "num_steps": self.step_num} + + reward += self.reward_handler.get_reward(self._config, game_state) + self.prev_dist = game_state["distance_traversed"] + self.prev_damage = game_state["prev_damage"] + self.prev_pos = game_state["racePos"] + if self._config.pid_assist: self.PID_controller.update(self.ob) - done = self.done_manager.get_done_signal(self._config, game_state) + done = self.done_handler.get_done_signal(self._config, game_state) + info['prev_dist'] = self.prev_dist + info['prev_damage'] = game_state["damage"] + info['prev_pos'] = game_state["racePos"] if done: + e.set() + break + if e.is_set(): + print("[{}]: Stopping agent because some other agent has hit done".format(self.name)) break - next_obs = self.observation_manager.get_obs(self.ob, self._config) + next_obs, full_obs = self.observation_handler.get_obs(self.ob, self._config, full_flag=True) + + info['full_obs'] = full_obs._asdict() return next_obs, reward, done, info - def step(self, action, return_dict={}): + def step(self, action, e, return_dict={}): + if self._config.add_noise_to_actions: + noise = np.random.normal(scale=self._config.action_noise_std, size=self.action_dim) + action += noise if self._config.pid_assist: - return_dict[self.name] = self.step_pid(action) + return_dict[self.name] = self.step_pid(action, e) else: return_dict[self.name] = self.step_vanilla(action) return return_dict + def increment_(self, dist, damage, pos): + self.prev_dist = dist + self.prev_damage = damage + self.prev_pos = pos + #print("DIST_UPDATED: ", self.prev_dist) + + def increment_step(self): + if self._config.pid_assist: + self.step_num += self._config.pid_settings['pid_latency'] + else: + self.step_num += 1 + #self.increment_dist() + + def comms_init(self, info={}): + + + additional_vars_high = [] + additional_vars_low = [] + additional_dims = 0 + + for var in info["vars"]: + + if (var == 'track'): + additional_vars_high += 19*[1] + additional_vars_low += 19*[0] + additional_dims += 19 + elif (var == 'trackPos'): + additional_vars_high += [1] + additional_vars_low += [-1] + additional_dims += 1 + elif (var == 'opponents'): + additional_vars_high += 36*[1] + additional_vars_low += 36*[0] + additional_dims += 36 + elif ('speed' in var): #for handling speedX, speedY & speedZ + additional_vars_high += [np.inf] + additional_vars_low += [-np.inf] + additional_dims += 1 + elif (var == 'action'): + additional_vars_high += [1]*self.action_dim + additional_vars_low += [-1]*self.action_dim + additional_dims += self.action_dim + else: + additional_vars_high += [1] + additional_vars_low += [-1] + additional_dims += 1 + + logging.info("NAME:{}, lencomms:{}".format(self.name, len(info['comms']))) + self.obs_dim += additional_dims*len(info['comms']) + self.comms_buffer = cb.CommBuffer(self.name, self.buff_size, additional_dims*len(info['comms'])) + additional_vars_high = additional_vars_high*len(info['comms']) + additional_vars_low = additional_vars_low*len(info['comms']) + high = np.hstack((self.observation_space.high, 
additional_vars_high)) + low = np.hstack((self.observation_space.low, additional_vars_low)) + self.observation_space = spaces.Box(high=high, low=low) + logging.info("{}: {}".format(self.name, self._config.observations['buff_size'])) + + + class MadrasEnv(gym.Env): """Definition of the Gym Madras Environment.""" @@ -244,24 +358,46 @@ def __init__(self, cfg_path=DEFAULT_SIM_OPTIONS_FILE): # If `visualise` is set to False torcs simulator will run in headless mode self._config = config_parser.MadrasEnvConfig() self._config.update(config_parser.parse_yaml(cfg_path)) + self.communications_map = dict() self.torcs_proc = None self.seed() + + self.num_agents = len(self._config.agents) + self.agents = OrderedDict() + self.init_wait = self._config.init_wait + self.torcs_server_config = torcs_config.TorcsConfig( + self._config.server_config, num_learning_cars= self.num_agents, randomize=self._config.randomize_env) self.start_torcs_process() - self.agents ={} + if self._config.traffic: + self.traffic_handler = traffic.MadrasTrafficHandler( + self._config.torcs_server_port, len(self.agents), self._config.traffic) + + num_traffic_agents = len(self._config.traffic) if self._config.traffic else 0 + if self._config.agents: for i, agent in enumerate(self._config.agents): agent_name = [x for x in agent.keys()][0] + agent_cfg = agent[agent_name] name = '{}_{}'.format(agent_name, i) - torcs_server_port = self._config.torcs_server_port + i + torcs_server_port = self._config.torcs_server_port + i + self.torcs_server_config.num_traffic_cars self.agents[name] = MadrasAgent(name, torcs_server_port, agent_cfg, {"track_len": self._config.track_len, "max_steps": self._config.max_steps }) - if self._config.traffic: - self.traffic_manager = traffic.MadrasTrafficManager( - self._config.torcs_server_port, len(self.agents), self._config.traffic) - # self.action_dim = self.agents[0].action_dim # TODO(santara): Can I not have different action dims for different agents? + + if os.path.isfile(DEFAULT_COMMUNICATION_MAP): + self.communications_map = config_parser.parse_yaml(DEFAULT_COMMUNICATION_MAP) + + print("COMMUNICATIONS MAP", self.communications_map) + for comm_agent, agent_attrs in self.communications_map.items(): + self.agents[comm_agent].comms_init(agent_attrs) + + + # TODO(santara): Can I not have different action dims for different agents? 
+ self.action_space = self.agents["MadrasAgent_0"].action_space + self.observation_space = self.agents["MadrasAgent_0"].observation_space + self.initial_reset = True print("Madras agents are: ", self.agents) @@ -289,6 +425,7 @@ def test_torcs_server_port(self): udp.close() def start_torcs_process(self): + #import pdb; pdb.set_trace() if self.torcs_proc is not None: os.killpg(os.getpgid(self.torcs_proc.pid), signal.SIGKILL) time.sleep(0.5) @@ -308,9 +445,9 @@ def reset_torcs(self): def execute_torcs_launch_command(self): command = None rank = MPI.COMM_WORLD.Get_rank() - + self.torcs_server_config.generate_torcs_server_config() if rank < self._config.no_of_visualisations and self._config.visualise: - command = 'export TORCS_PORT={} && vglrun torcs -t 10000000 -nolaptime'.format( + command = 'export TORCS_PORT={} && torcs -t 10000000 -nolaptime'.format( self._config.torcs_server_port) else: command = 'export TORCS_PORT={} && torcs -t 10000000 -r ~/.torcs/config/raceman/quickrace.xml -nolaptime'.format(self._config.torcs_server_port) @@ -320,60 +457,99 @@ def execute_torcs_launch_command(self): self.torcs_proc = subprocess.Popen([command], shell=True, preexec_fn=os.setsid) print("TORCS server PID is: ", self.torcs_proc.pid) + def reset_wait(self): + stop_action = {} + for agent_name, agent in self.agents.items(): + if (agent.action_dim == 3): + stop_action[agent_name] = [0.0, -1.0, 1.0] + else: + stop_action[agent_name] = [0.0, 0.0] + + s_t = None + for _ in range(self.init_wait): + next_obs, _, _, _ = self.step(stop_action) + s_t = next_obs + + return s_t + + def reset(self): """Reset Method to be called at the end of each episode.""" + #import pdb; pdb.set_trace() if not self.initial_reset: self.reset_torcs() + # self.torcs_server_config.generate_torcs_server_config() else: self.initial_reset = False + if self._config.traffic: - self.traffic_manager.reset() - s_t = {} - # TODO(santara): fix parallel reset - # jobs = [] - # manager = multiprocessing.Manager() - # return_dict = manager.dict() - # for i, agent in enumerate(self.agents): - # p = multiprocessing.Process(target=self.agents[agent].reset_new, args=(return_dict)) - # jobs.append(p) - # p.start() - - # for proc in jobs: - # proc.join() - # print (return_dict) - - # Serial reset : DEPRECATED, remove later - # for agent in self.agents: - # s_t[agent] = self.agents[agent].reset() - - # Create clients and connect their sockets - for agent in self.agents: + self.traffic_handler.reset(self.torcs_server_config.num_traffic_cars) + + s_t = OrderedDict() + + # Create clients and connect their sockets + for i, agent in enumerate(self.agents): + self.agents[agent].torcs_server_port = self._config.torcs_server_port + i + self.torcs_server_config.num_traffic_cars self.agents[agent].create_new_client() + # Collect first observations for agent in self.agents: + self.agents[agent].prev_dist = 0.0 s_t[agent] = self.agents[agent].get_observation_from_server() + self.agents[agent].client.respond_to_server() # To elimate 10s of timeout error, responding to the server after obs + self.agents[agent].increment_(self.agents[agent].client.S.d["distRaced"], self.agents[agent].client.S.d["damage"], self.agents[agent].client.S.d["racePos"]) + # Finish reset for agent in self.agents: self.agents[agent].complete_reset() + + for comm_agent in self.communications_map.keys(): #reset buffers + self.agents[comm_agent].comms_buffer.reset() + s_t[comm_agent] = np.hstack((s_t[comm_agent], self.agents[comm_agent].comms_buffer.request())) + + s_t = self.reset_wait() + return 
s_t def step(self, action): - next_obs, reward, done, info = {}, {}, {}, {} + next_obs, reward, done, info = {}, {}, {'__all__': False}, {} manager = multiprocessing.Manager() return_dict = manager.dict() + e = multiprocessing.Event() jobs = [] - for i, agent in enumerate(self.agents): - p = multiprocessing.Process(target=self.agents[agent].step, args=(action[i], return_dict)) + for agent in self.agents.keys(): + p = multiprocessing.Process(target=self.agents[agent].step, args=(action[agent], e, return_dict)) jobs.append(p) p.start() for proc in jobs: proc.join() - print (return_dict) + + done_check = False for agent in self.agents: next_obs[agent] = return_dict[agent][0] reward[agent] = return_dict[agent][1] done[agent] = return_dict[agent][2] + if (done[agent] == True): + """ + Although rllib supports individual agent resets + but MADRaS Env as of now has to be reset even if + one of the agents hits done. + """ + done_check = True info[agent] = return_dict[agent][3] + self.agents[agent].increment_step() + self.agents[agent].increment_(info[agent]['prev_dist'], info[agent]['prev_damage'], info[agent]['prev_pos']) + + for agent, agent_info in self.communications_map.items(): + full_obs_list = [] + for agent_c in agent_info["comms"]: + full_obs_list.append((info[agent_c]["full_obs"], action[agent_c])) + self.agents[agent].comms_buffer.insert(full_obs_list, agent_info["vars"]) + + done['__all__'] = done_check + + for comm_agent in self.communications_map.keys(): + next_obs[comm_agent] = np.hstack((next_obs[comm_agent], self.agents[comm_agent].comms_buffer.request())) return next_obs, reward, done, info diff --git a/MADRaS/example_controllers/__init__.py b/MADRaS/example_controllers/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/MADRaS/example_controllers/behavior_reflex/DDPG/OU.py b/MADRaS/example_controllers/behavior_reflex/DDPG/OU.py deleted file mode 100644 index 7b58ce9..0000000 --- a/MADRaS/example_controllers/behavior_reflex/DDPG/OU.py +++ /dev/null @@ -1,7 +0,0 @@ -import random -import numpy as np - -class OU(object): - - def function(self, x, mu, theta, sigma): - return theta * (mu - x) + sigma * np.random.randn(1) \ No newline at end of file diff --git a/MADRaS/example_controllers/behavior_reflex/DDPG/ReplayBuffer.py b/MADRaS/example_controllers/behavior_reflex/DDPG/ReplayBuffer.py deleted file mode 100644 index 7fc88de..0000000 --- a/MADRaS/example_controllers/behavior_reflex/DDPG/ReplayBuffer.py +++ /dev/null @@ -1,45 +0,0 @@ -from collections import deque -import random - -class ReplayBuffer(object): - - def __init__(self, buffer_size): - self.buffer_size = buffer_size - self.num_experiences = 0 - self.buffer = deque() - self.mean_reward = 0.0 - - def getBatch(self, batch_size): - # Randomly sample batch_size examples - if self.num_experiences < batch_size: - return random.sample(self.buffer, self.num_experiences) - else: - return random.sample(self.buffer, batch_size) - - def size(self): - return self.buffer_size - - def add(self, state, action, reward, new_state, done): - experience = (state, action, reward, new_state, done) - if self.num_experiences < self.buffer_size: - self.buffer.append(experience) - self.num_experiences += 1 - else: - self.buffer.popleft() - self.buffer.append(experience) - self.mean_reward = ( (self.mean_reward * (self.num_experiences-1)) + reward) / float( self.num_experiences ) - - - def count(self): - # if buffer is full, return buffer size - # otherwise, return experience counter - return self.num_experiences - - def 
getMeanReward(self): - # if buffer is full, return buffer size - # otherwise, return experience counter - return self.mean_reward - - def erase(self): - self.buffer = deque() - self.num_experiences = 0 diff --git a/MADRaS/example_controllers/behavior_reflex/DDPG/__init__.py b/MADRaS/example_controllers/behavior_reflex/DDPG/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/MADRaS/example_controllers/behavior_reflex/DDPG/actor_network.py b/MADRaS/example_controllers/behavior_reflex/DDPG/actor_network.py deleted file mode 100644 index 8dba638..0000000 --- a/MADRaS/example_controllers/behavior_reflex/DDPG/actor_network.py +++ /dev/null @@ -1,143 +0,0 @@ -import tensorflow as tf -import numpy as np -import math - - -# Hyper Parameters -LAYER1_SIZE = 300 -LAYER2_SIZE = 400 -LEARNING_RATE = 1e-4 -TAU = 0.001 -BATCH_SIZE = 32 -class ActorNetwork: - """docstring for ActorNetwork""" - def __init__(self,sess,state_dim,action_dim): - - self.sess = sess - self.state_dim = state_dim - self.action_dim = action_dim - - # create actor network - self.state_input,self.action_output,self.net = self.create_network(state_dim,action_dim) - - # create target actor network - self.target_state_input,self.target_action_output,self.target_update,self.target_net = self.create_target_network(state_dim,action_dim,self.net) - - # define training rules - self.create_training_method() - - # self.sess.run(tf.initialize_all_variables()) - self.sess.run(tf.global_variables_initializer()) - - self.update_target() - #self.load_network() - - def create_training_method(self): - self.q_gradient_input = tf.placeholder("float",[None,self.action_dim]) - self.parameters_gradients = tf.gradients(self.action_output,self.net,-self.q_gradient_input) - '''for i, grad in enumerate(self.parameters_gradients): - if grad is not None: - self.parameters_gradients[i] = tf.clip_by_value(grad, -2.0,2.0)''' - self.optimizer = tf.train.AdamOptimizer(LEARNING_RATE).apply_gradients(zip(self.parameters_gradients,self.net)) - - def create_network(self,state_dim,action_dim): - - layer1_size = LAYER1_SIZE - layer2_size = LAYER2_SIZE - - state_input = tf.placeholder("float",[None,state_dim]) - - W1 = self.variable([state_dim,layer1_size],state_dim) - b1 = self.variable([layer1_size],state_dim) - W2 = self.variable([layer1_size,layer2_size],layer1_size) - b2 = self.variable([layer2_size],layer1_size) - #W3 = tf.Variable(tf.random_uniform([layer2_size,action_dim],-3e-3,3e-3)) - #b3 = tf.Variable(tf.random_uniform([action_dim],-3e-3,3e-3)) - - W_steer = tf.Variable(tf.random_uniform([layer2_size,1],-1e-4,1e-4)) - b_steer = tf.Variable(tf.random_uniform([1],-1e-4,1e-4)) - - W_accel = tf.Variable(tf.random_uniform([layer2_size,1],-1e-4,1e-4)) - b_accel = tf.Variable(tf.random_uniform([1],-1e-4,1e-4)) - - W_brake = tf.Variable(tf.random_uniform([layer2_size,1],-1e-4,1e-4)) - b_brake = tf.Variable(tf.random_uniform([1],-1e-4,1e-4)) - - - layer1 = tf.nn.relu(tf.matmul(state_input,W1) + b1) - layer2 = tf.nn.relu(tf.matmul(layer1,W2) + b2) - - - steer = tf.tanh(tf.matmul(layer2,W_steer) + b_steer) - accel = tf.sigmoid(tf.matmul(layer2,W_accel) + b_accel) - brake = tf.sigmoid(tf.matmul(layer2,W_brake) + b_brake) - - # action_output = tf.concat(1, [steer, accel, brake]) - action_output = tf.concat([steer, accel, brake], 1) - - return state_input,action_output,[W1,b1,W2,b2,W_steer,b_steer,W_accel,b_accel,W_brake,b_brake] - - def create_target_network(self,state_dim,action_dim,net): - state_input = tf.placeholder("float",[None,state_dim]) - ema = 
tf.train.ExponentialMovingAverage(decay=1-TAU) - target_update = ema.apply(net) - target_net = [ema.average(x) for x in net] - - layer1 = tf.nn.relu(tf.matmul(state_input,target_net[0]) + target_net[1]) - layer2 = tf.nn.relu(tf.matmul(layer1,target_net[2]) + target_net[3]) - - steer = tf.tanh(tf.matmul(layer2,target_net[4]) + target_net[5]) - accel = tf.sigmoid(tf.matmul(layer2,target_net[6]) + target_net[7]) - brake = tf.sigmoid(tf.matmul(layer2,target_net[8]) + target_net[9]) - - # action_output = tf.concat(1, [steer, accel, brake]) - action_output = tf.concat([steer, accel, brake], 1) - return state_input,action_output,target_update,target_net - - - - - def update_target(self): - self.sess.run(self.target_update) - - def train(self,q_gradient_batch,state_batch): - self.sess.run(self.optimizer,feed_dict={ - self.q_gradient_input:q_gradient_batch, - self.state_input:state_batch - }) - - def actions(self,state_batch): - return self.sess.run(self.action_output,feed_dict={ - self.state_input:state_batch - }) - - def action(self,state): - return self.sess.run(self.action_output,feed_dict={ - self.state_input:[state] - })[0] - - - def target_actions(self,state_batch): - return self.sess.run(self.target_action_output,feed_dict={ - self.target_state_input:state_batch - }) - - # f fan-in size - def variable(self,shape,f): - return tf.Variable(tf.random_uniform(shape,-1/math.sqrt(f),1/math.sqrt(f))) - ''' - def load_network(self): - self.saver = tf.train.Saver() - checkpoint = tf.train.get_checkpoint_state("saved_actor_networks") - if checkpoint and checkpoint.model_checkpoint_path: - self.saver.restore(self.sess, checkpoint.model_checkpoint_path) - print "Successfully loaded:", checkpoint.model_checkpoint_path - else: - print "Could not find old network weights" - def save_network(self,time_step): - print 'save actor-network...',time_step - self.saver.save(self.sess, 'saved_actor_networks/' + 'actor-network', global_step = time_step) - - ''' - - diff --git a/MADRaS/example_controllers/behavior_reflex/DDPG/critic_network.py b/MADRaS/example_controllers/behavior_reflex/DDPG/critic_network.py deleted file mode 100644 index 6d6c709..0000000 --- a/MADRaS/example_controllers/behavior_reflex/DDPG/critic_network.py +++ /dev/null @@ -1,135 +0,0 @@ - -import tensorflow as tf -import numpy as np -import math - - -LAYER1_SIZE = 300 -LAYER2_SIZE = 600 -LEARNING_RATE = 1e-3 -TAU = 0.001 -L2 = 0.0001 - -class CriticNetwork: - """docstring for CriticNetwork""" - def __init__(self,sess,state_dim,action_dim): - self.time_step = 0 - self.sess = sess - # create q network - self.state_input,\ - self.action_input,\ - self.q_value_output,\ - self.net = self.create_q_network(state_dim,action_dim) - - # create target q network (the same structure with q network) - self.target_state_input,\ - self.target_action_input,\ - self.target_q_value_output,\ - self.target_update = self.create_target_q_network(state_dim,action_dim,self.net) - - self.create_training_method() - - # initialization - # self.sess.run(tf.initialize_all_variables()) - self.sess.run(tf.global_variables_initializer()) - - self.update_target() - - def create_training_method(self): - # Define training optimizer - self.y_input = tf.placeholder("float",[None,1]) - weight_decay = tf.add_n([L2 * tf.nn.l2_loss(var) for var in self.net]) - self.cost = tf.reduce_mean(tf.square(self.y_input - self.q_value_output)) + weight_decay - self.optimizer = tf.train.AdamOptimizer(LEARNING_RATE).minimize(self.cost) - '''self.optimizer = tf.train.AdamOptimizer(LEARNING_RATE) - 
self.parameters_gradients = self.optimizer.compute_gradients(self.cost) - - for i, (grad,var) in enumerate(self.parameters_gradients): - if grad is not None: - self.parameters_gradients[i] = (tf.clip_by_value(grad, -5.0,5.0),var) - - self.train_op = self.optimizer.apply_gradients(self.parameters_gradients)''' - self.action_gradients = tf.gradients(self.q_value_output,self.action_input) - - def create_q_network(self,state_dim,action_dim): - # the layer size could be changed - layer1_size = LAYER1_SIZE - layer2_size = LAYER2_SIZE - - state_input = tf.placeholder("float",[None,state_dim]) - action_input = tf.placeholder("float",[None,action_dim]) - - W1 = self.variable([state_dim,layer1_size],state_dim) - b1 = self.variable([layer1_size],state_dim) - W2 = self.variable([layer1_size,layer2_size],layer1_size+action_dim) - W2_action = self.variable([action_dim,layer2_size],layer1_size+action_dim) - b2 = self.variable([layer2_size],layer1_size+action_dim) - W3 = tf.Variable(tf.random_uniform([layer2_size,1],-3e-3,3e-3)) - b3 = tf.Variable(tf.random_uniform([1],-3e-3,3e-3)) - - layer1 = tf.nn.relu(tf.matmul(state_input,W1) + b1) - layer2 = tf.nn.relu(tf.matmul(layer1,W2) + tf.matmul(action_input,W2_action) + b2) - q_value_output = tf.identity(tf.matmul(layer2,W3) + b3) - - return state_input,action_input,q_value_output,[W1,b1,W2,W2_action,b2,W3,b3] - - def create_target_q_network(self,state_dim,action_dim,net): - - state_input = tf.placeholder("float",[None,state_dim]) - action_input = tf.placeholder("float",[None,action_dim]) - - ema = tf.train.ExponentialMovingAverage(decay=1-TAU) - target_update = ema.apply(net) - target_net = [ema.average(x) for x in net] - - layer1 = tf.nn.relu(tf.matmul(state_input,target_net[0]) + target_net[1]) - layer2 = tf.nn.relu(tf.matmul(layer1,target_net[2]) + tf.matmul(action_input,target_net[3]) + target_net[4]) - q_value_output = tf.identity(tf.matmul(layer2,target_net[5]) + target_net[6]) - - return state_input,action_input,q_value_output,target_update - - def update_target(self): - self.sess.run(self.target_update) - - def train(self,y_batch,state_batch,action_batch): - self.time_step += 1 - self.sess.run(self.optimizer,feed_dict={ - self.y_input:y_batch, - self.state_input:state_batch, - self.action_input:action_batch - }) - - def gradients(self,state_batch,action_batch): - return self.sess.run(self.action_gradients,feed_dict={ - self.state_input:state_batch, - self.action_input:action_batch - })[0] - - def target_q(self,state_batch,action_batch): - return self.sess.run(self.target_q_value_output,feed_dict={ - self.target_state_input:state_batch, - self.target_action_input:action_batch - }) - - def q_value(self,state_batch,action_batch): - return self.sess.run(self.q_value_output,feed_dict={ - self.state_input:state_batch, - self.action_input:action_batch}) - - # f fan-in size - def variable(self,shape,f): - return tf.Variable(tf.random_uniform(shape,-1/math.sqrt(f),1/math.sqrt(f))) - ''' - def load_network(self): - self.saver = tf.train.Saver() - checkpoint = tf.train.get_checkpoint_state("saved_critic_networks") - if checkpoint and checkpoint.model_checkpoint_path: - self.saver.restore(self.sess, checkpoint.model_checkpoint_path) - print "Successfully loaded:", checkpoint.model_checkpoint_path - else: - print "Could not find old network weights" - - def save_network(self,time_step): - print 'save critic-network...',time_step - self.saver.save(self.sess, 'saved_critic_networks/' + 'critic-network', global_step = time_step) - ''' diff --git 
a/MADRaS/example_controllers/behavior_reflex/DDPG/ddpg.py b/MADRaS/example_controllers/behavior_reflex/DDPG/ddpg.py deleted file mode 100644 index 16e2404..0000000 --- a/MADRaS/example_controllers/behavior_reflex/DDPG/ddpg.py +++ /dev/null @@ -1,141 +0,0 @@ - -# ----------------------------------- -# Deep Deterministic Policy Gradient -# Author: Flood Sung -# Date: 2016.5.4 -# ----------------------------------- - -import gym -import tensorflow as tf -import numpy as np -from example_controllers.behavior_reflex.DDPG.OU import OU -import math, random -from example_controllers.behavior_reflex.DDPG.critic_network import CriticNetwork -from example_controllers.behavior_reflex.DDPG.actor_network import ActorNetwork -from example_controllers.behavior_reflex.DDPG.ReplayBuffer import ReplayBuffer -#from configurations import save_location - - -# Hyper Parameters: - -REPLAY_BUFFER_SIZE = 100000 -REPLAY_START_SIZE = 100 -BATCH_SIZE = 32 -GAMMA = 0.99 - - -class DDPG: - """docstring for DDPG""" - def __init__(self, env_name, state_dim,action_dim,save_location): - self.name = 'DDPG' # name for uploading results - self.env_name = env_name - # Randomly initialize actor network and critic network - # with both their target networks - self.state_dim = state_dim - self.action_dim = action_dim - self.save_location = save_location - # Ensure action bound is symmetric - self.time_step = 0 - self.sess = tf.InteractiveSession() - - self.actor_network = ActorNetwork(self.sess,self.state_dim,self.action_dim) - self.critic_network = CriticNetwork(self.sess,self.state_dim,self.action_dim) - - # initialize replay buffer - self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) - - # Initialize a random process the Ornstein-Uhlenbeck process for action exploration - self.OU = OU() - - # loading networks - self.saver = tf.train.Saver(max_to_keep=0) - checkpoint = tf.train.get_checkpoint_state(self.save_location) - if checkpoint and checkpoint.model_checkpoint_path: - self.saver.restore(self.sess, checkpoint.model_checkpoint_path) - print("Successfully loaded:", checkpoint.model_checkpoint_path) - else: - print("Could not find old network weights") - - def train(self): - #print "train step",self.time_step - # Sample a random minibatch of N transitions from replay buffer - minibatch = self.replay_buffer.getBatch(BATCH_SIZE) - state_batch = np.asarray([data[0] for data in minibatch]) - action_batch = np.asarray([data[1] for data in minibatch]) - reward_batch = np.asarray([data[2] for data in minibatch]) - next_state_batch = np.asarray([data[3] for data in minibatch]) - done_batch = np.asarray([data[4] for data in minibatch]) - - # for action_dim = 1 - action_batch = np.resize(action_batch,[BATCH_SIZE,self.action_dim]) - - # Calculate y_batch - - next_action_batch = self.actor_network.target_actions(next_state_batch) - q_value_batch = self.critic_network.target_q(next_state_batch,next_action_batch) - y_batch = [] - for i in range(len(minibatch)): - if done_batch[i]: - y_batch.append(reward_batch[i]) - else : - y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) - y_batch = np.resize(y_batch,[BATCH_SIZE,1]) - # Update critic by minimizing the loss L - self.critic_network.train(y_batch,state_batch,action_batch) - - # Update the actor policy using the sampled gradient: - action_batch_for_gradients = self.actor_network.actions(state_batch) - q_gradient_batch = self.critic_network.gradients(state_batch,action_batch_for_gradients) - - self.actor_network.train(q_gradient_batch,state_batch) - - # Update the target networks - 
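# The two update_target() calls below run the op returned by ema.apply(net) in the actor and
# critic networks, where the EMA is built with decay = 1 - TAU. TensorFlow's shadow-variable
# rule is shadow = decay * shadow + (1 - decay) * var, so with decay = 1 - TAU this is the
# standard DDPG soft target update: theta_target <- (1 - TAU) * theta_target + TAU * theta.
# A plain-Python sketch of the same rule (illustrative only; these names are not part of this diff):
def soft_target_update(target_weights, online_weights, tau=0.001):
    """Return the softly updated target weights, matching the EMA op used here."""
    return [(1.0 - tau) * t + tau * o for t, o in zip(target_weights, online_weights)]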
self.actor_network.update_target() - self.critic_network.update_target() - - def saveNetwork(self,i): - self.saver.save(self.sess, self.save_location + 'network' + str(i)+'DDPG.ckpt', global_step = self.time_step,) - - - def action(self,state): - action = self.actor_network.action(state) - action[0] = np.clip( action[0], -1 , 1 ) - action[1] = np.clip( action[1], 0 , 1 ) - action[2] = np.clip( action[2], 0 , 1 ) - #print "Action:", action - return action - - def noise_action(self,state,epsilon): - # Select action a_t according to the current policy and exploration noise - action = self.actor_network.action(state) - #print action.shape - #print "Action_No_Noise:", action - noise_t = np.zeros(self.action_dim) - noise_t[0] = epsilon * self.OU.function(action[0], 0.0 , 0.60, 0.80) - noise_t[1] = epsilon * self.OU.function(action[1], 0.5 , 1.00, 0.10) - noise_t[2] = epsilon * self.OU.function(action[2], -0.1 , 1.00, 0.05) - - if random.random() <= 0.1: - # print("********Stochastic brake***********") - noise_t[2] = epsilon * self.OU.function(action[2], 0.2 , 1.00, 0.10) - - action = action + noise_t - action[0] = np.clip(action[0], -1, 1) - action[1] = np.clip(action[1], 0 , 1) - action[2] = np.clip(action[2], 0 , 1) - - #print "Action_Noise:", action - return action - - def perceive(self,state,action,reward,next_state,done): - # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer - - if ( not (math.isnan( reward ))): - self.replay_buffer.add(state,action,reward,next_state,done) - - self.time_step = self.time_step + 1 - # Store transitions to replay start size then start training - if self.replay_buffer.count() > REPLAY_START_SIZE: - self.train() - - diff --git a/MADRaS/example_controllers/behavior_reflex/__init__.py b/MADRaS/example_controllers/behavior_reflex/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/MADRaS/example_controllers/behavior_reflex/configurations.py b/MADRaS/example_controllers/behavior_reflex/configurations.py deleted file mode 100644 index d1b0522..0000000 --- a/MADRaS/example_controllers/behavior_reflex/configurations.py +++ /dev/null @@ -1,28 +0,0 @@ -################################################################### -# -# All the configurations are done here. -# -# 1. Toggle the is_training flag to 0 to test a saved model. -# 2. epsilon_start is, as the name suggests, where the annealing epsilon starts from -# 3. 
total_explore is used as : epsilon -= 1/total_explore -# -################################################################### - -visualize_after = 5 -is_training = 1 - -total_explore = 600000.0 -max_eps = 6000 -max_steps_eps = 10000 - -wait_at_beginning = 0 -initial_wait_period = 200 # to give the other cars a headstart of these many steps - -epsilon_start = 1 -start_reward = -10000 # these need to be changed if restarting the playGame.py script - -save_location = './weights/' - -torcsPort = 3101 -configFile = '~/.torcs/config/raceman/quickrace.xml' -# configFile = '~/.torcs/config/raceman/practice.xml' diff --git a/MADRaS/example_controllers/behavior_reflex/multi_agent.py b/MADRaS/example_controllers/behavior_reflex/multi_agent.py deleted file mode 100644 index 97e288b..0000000 --- a/MADRaS/example_controllers/behavior_reflex/multi_agent.py +++ /dev/null @@ -1,24 +0,0 @@ -import threading -import multiprocessing -import numpy as np -import tensorflow as tf -import example_controllers.behavior_reflex.playGame_DDPG as playGame_DDPG -import os -from random import choice -from time import sleep -from time import time -#import pymp - -with tf.device("/cpu:0"): - num_workers = 3 #multiprocessing.cpu_count() #use this when you want to use all the cpus - print("numb of workers is" + str(num_workers)) - -with tf.Session() as sess: - worker_threads = [] -#with pymp.Parallel(4) as p: #uncomment this for parallelization of threads - for i in range(num_workers): - worker_work = lambda: (playGame_DDPG.playGame(f_diagnostics=""+str(i), train_indicator=1, port=3001+i)) - t = threading.Thread(target=(worker_work)) - t.start() - sleep(0.5) - worker_threads.append(t) diff --git a/MADRaS/example_controllers/behavior_reflex/playGame_DDPG.py b/MADRaS/example_controllers/behavior_reflex/playGame_DDPG.py deleted file mode 100644 index becefd0..0000000 --- a/MADRaS/example_controllers/behavior_reflex/playGame_DDPG.py +++ /dev/null @@ -1,210 +0,0 @@ -import sys -from utils.gym_torcs import TorcsEnv -import numpy as np -import utils.snakeoil3_gym as snakeoil3 -import collections as col -import random -import argparse -import tensorflow as tf -import timeit -import math -import sys -import os -from example_controllers.behavior_reflex.configurations import * -from example_controllers.behavior_reflex.DDPG.ddpg import * - -import gc -gc.enable() - -np.random.seed(1337) - - -def playGame(f_diagnostics, train_indicator, port=3101): # 1 means Train, 0 means simply Run - - action_dim = 3 #Steering/Acceleration/Brake - state_dim = 65 #of sensors input - - env_name = 'Torcs_Env' - weights_location = save_location + str(port) + "/" - agent = DDPG(env_name, state_dim, action_dim, weights_location) - - # Generate a Torcs environment - print("I have been asked to use port: ", port) - env = TorcsEnv(vision=False, throttle=True, gear_change=False) - ob = None - while ob is None: - try: - client = snakeoil3.Client(p=port, vision=False) # Open new UDP in vtorcs - client.MAX_STEPS = np.inf - - client.get_servers_input(0) # Get the initial input from torcs - - obs = client.S.d # Get the current full-observation from torcs - ob = env.make_observation(obs) - - s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm, ob.opponents)) - except: - pass - - EXPLORE = total_explore - episode_count = max_eps - max_steps = max_steps_eps - epsilon = epsilon_start - done = False - epsilon_steady_state = 0.01 # This is used for early stopping. 
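# Exploration schedule: as the configuration comment above notes, every training step below does
# epsilon -= 1.0 / EXPLORE and then floors it at epsilon_steady_state, i.e. a linear anneal from
# epsilon_start down to a small constant. A standalone sketch of the same schedule (illustrative
# only; the function name is not part of this diff):
def annealed_epsilon(step, epsilon_start=1.0, total_explore=600000.0, epsilon_floor=0.01):
    """Value of epsilon after `step` training steps of the per-step decrement."""
    return max(epsilon_start - step / total_explore, epsilon_floor)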
- - totalSteps = 0 - best_reward = -100000 - running_avg_reward = 0. - - - print("TORCS Experiment Start.") - for i in range(episode_count): - - early_stop = 1 - total_reward = 0. - info = {'termination_cause':0} - distance_traversed = 0. - speed_array=[] - trackPos_array=[] - - print('\n\nStarting new episode...\n') - - for step in range(max_steps): - - # Take noisy actions during training - if (train_indicator): - epsilon -= 1.0 / EXPLORE - epsilon = max(epsilon, epsilon_steady_state) - a_t = agent.noise_action(s_t,epsilon) - else: - a_t = agent.action(s_t) # [steer, accel, brake] - try: - ob, r_t, done, info = env.step(step, client, a_t, early_stop) - if done: - break - analyse_info(info, printing=False) - - s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm, ob.opponents)) - distance_traversed += ob.speedX*np.cos(ob.angle) #Assuming 1 step = 1 second - speed_array.append(ob.speedX*np.cos(ob.angle)) - trackPos_array.append(ob.trackPos) - - #Checking for nan rewards: - if (math.isnan( r_t )): - r_t = 0.0 - for bad_r in range( 50 ): - print( 'Bad Reward Found' ) - break - - # Add to replay buffer only if training - if (train_indicator): - agent.perceive(s_t,a_t,r_t,s_t1,done) # Add experience to replay buffer - - except Exception as e: - print("Exception caught after training, at port " + str(i) + str(e) ) - ob = None - while ob is None: - try: - client = snakeoil3.Client(p=port, vision=False) # Open new UDP in vtorcs - client.MAX_STEPS = np.inf - client.get_servers_input(0) # Get the initial input from torcs - obs = client.S.d # Get the current full-observation from torcs - ob = env.make_observation(obs) - except: - pass - continue - total_reward += r_t - s_t = s_t1 - - # Displaying progress every 15 steps. 
- if ( (np.mod(step,15)==0) ): - print("Episode", i, "Step", step, "Epsilon", epsilon , "Action", a_t, "Reward", r_t ) - - totalSteps += 1 - if done: - break - - if ((train_indicator ==1 )): - if(i%300 == 0): - agent.saveNetwork(i) - - running_avg_reward = running_average(running_avg_reward, i+1, total_reward) - - - print("TOTAL REWARD @ " + str(i) +"-th Episode : Num_Steps= " + str(step) + "; Max_steps= " + str(max_steps) +"; Reward= " + str(total_reward) +"; Running average reward= " + str(running_avg_reward)) - print("Total Step: " + str(totalSteps)) - print("") - - print(info) - try: - if 'termination_cause' in info.keys() and info['termination_cause']=='hardReset': - print('Hard reset by some agent') - ob, client = env.reset(client=client, relaunch=True) - else: - ob, client = env.reset(client=client, relaunch=True) - except Exception as e: - print("Exception caught after episode end, at port " + str(i) + str(e) ) - ob = None - while ob is None: - try: - client = snakeoil3.Client(p=port, vision=False) # Open new UDP in vtorcs - client.MAX_STEPS = np.inf - client.get_servers_input(0) # Get the initial input from torcs - obs = client.S.d # Get the current full-observation from torcs - ob = env.make_observation(obs) - except: - print("Another exception caught while handling exception, at port " + str(i) + str(e) ) - - - s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm, ob.opponents)) - - env.end() # This is for shutting down TORCS - print("Finish.") - -def document_episode(episode_no, distance_traversed, speed_array, trackPos_array, info, running_avg_reward, f_diagnostics): - """ - Note down a tuple of diagnostic values for each episode - (episode_no, distance_traversed, mean(speed_array), std(speed_array), mean(trackPos_array), std(trackPos_array), info[termination_cause], running_avg_reward) - """ - f_diagnostics.write(str(episode_no)+",") - f_diagnostics.write(str(distance_traversed)+",") - f_diagnostics.write(str(np.mean(speed_array))+",") - f_diagnostics.write(str(np.std(speed_array))+",") - f_diagnostics.write(str(np.mean(trackPos_array))+",") - f_diagnostics.write(str(np.std(trackPos_array))+",") - f_diagnostics.write(str(info['termination_cause'])+",") - f_diagnostics.write(str(running_avg_reward)+"\n") - - -def running_average(prev_avg, num_episodes, new_val): - total = prev_avg*(num_episodes-1) - total += new_val - return np.float(total/num_episodes) - -def analyse_info(info, printing=True): - simulation_state = ['Normal', 'Terminated as car is OUT OF TRACK', 'Terminated as car has SMALL PROGRESS', 'Terminated as car has TURNED BACKWARDS'] - if printing and info['termination_cause']!=0: - print(simulation_state[info['termination_cause']]) - -if __name__ == "__main__": - - try: - port = int(sys.argv[1]) - except Exception as e: - # raise e - print("Usage : python %s " % (sys.argv[0])) - sys.exit() - - print('is_training : ' + str(is_training)) - print('Starting best_reward : ' + str(start_reward)) - print( total_explore ) - print( max_eps ) - print( max_steps_eps ) - print( epsilon_start ) - print('config_file : ' + str(configFile)) - - # f_diagnostics = open('output_logs/diagnostics_for_window_' + sys.argv[1]+'_with_fixed_episode_length', 'w') #Add date and time to file name - f_diagnostics = "" - playGame(f_diagnostics, train_indicator=0, port=port) - # f_diagnostics.close() diff --git a/MADRaS/example_controllers/pid/DDPG/OU.py b/MADRaS/example_controllers/pid/DDPG/OU.py deleted file mode 100644 index 
4488514..0000000 --- a/MADRaS/example_controllers/pid/DDPG/OU.py +++ /dev/null @@ -1,23 +0,0 @@ -"""Noise Function.""" -import numpy as np - - -class OU(object): - """Ornstein Uhlenbeck Process.""" - - def function(self, x, mu, theta, sigma): - """The stochastic equation.""" - return theta * (mu - x) + sigma * np.random.randn(1) - - -if __name__ == "__main__": - NUM_SAMPLES = 1000 - x = np.linspace(0, 10, NUM_SAMPLES) - noise = OU() - dx = np.asarray([noise.function(x_i, 5, 0.15, 0.1) - for x_i in x]).reshape((NUM_SAMPLES,)) - y = x + dx - import matplotlib.pyplot as plt - plt.plot(x, y, c='r') - plt.plot(x, x) - plt.show() diff --git a/MADRaS/example_controllers/pid/DDPG/ReplayBuffer.py b/MADRaS/example_controllers/pid/DDPG/ReplayBuffer.py deleted file mode 100644 index b8cc65e..0000000 --- a/MADRaS/example_controllers/pid/DDPG/ReplayBuffer.py +++ /dev/null @@ -1,75 +0,0 @@ -"""Prioritized experience replay buffer.""" -import random -from queue import PriorityQueue -from utils.madras_datatypes import Madras - -madras = Madras() - - -class experience(object): - """Experience class format.""" - - def __init__(self, state, action, reward, next_state, done, TD): - """Init Method""" - self.state = state - self.action = action - self.reward = reward - self.next_state = next_state - self.done = done - self.TD = TD - - def __gt__(self, other): - """Redifining >.""" - return (self.TD > other.TD) - - def __lt__(self, other): - """Redifining <.""" - return (self.TD < other.TD) - - def __eq__(self, other): - """Redifining ==.""" - return (self.TD == other.TD) - -class ReplayBuffer(object): - """Prioritized Experiance replay.""" - - def __init__(self, buffer_size): - """Init Method.""" - self.buffer_size = buffer_size - self.num_experiences = 0 - self.buffer = PriorityQueue() - self.mean_reward = 0.0 - - def getBatch(self, batch_size): - """Sampling.""" - if self.num_experiences < batch_size: - return random.sample(list(self.buffer.queue), self.num_experiences) - else: - return random.sample(list(self.buffer.queue), batch_size) - - def size(self): - """Get size.""" - return self.buffer_size - - def add(self, state, action, reward, new_state, done, TD): - """Add Experience.""" - exp = experience(state, action, reward, new_state, done, TD) - if self.num_experiences < self.buffer_size: - self.buffer.put(exp) - self.num_experiences += 1 - else: - self.buffer.get() - self.buffer.put(exp) - self.mean_reward = ((self.mean_reward * (self.num_experiences - 1)) + - reward) / madras.floatX(self.num_experiences) - - def count(self): - """If buffer is full. - - Return buffer size otherwise, return experience counter. 
- """ - return self.num_experiences - - def getMeanReward(self): - """Get Mean Reward.""" - return self.mean_reward diff --git a/MADRaS/example_controllers/pid/DDPG/__init__.py b/MADRaS/example_controllers/pid/DDPG/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/MADRaS/example_controllers/pid/DDPG/actor_network.py b/MADRaS/example_controllers/pid/DDPG/actor_network.py deleted file mode 100644 index d5e7d99..0000000 --- a/MADRaS/example_controllers/pid/DDPG/actor_network.py +++ /dev/null @@ -1,136 +0,0 @@ -"""Actor-Network.""" - -import tensorflow as tf - -# Hyper Parameters -LAYER1_SIZE = 300 -LAYER2_SIZE = 400 -LEARNING_RATE = 1e-4 -TAU = 0.001 -BATCH_SIZE = 32 - - -class ActorNetwork: - """docstring for ActorNetwork.""" - - def __init__(self, sess, state_dim, action_dim): - """Init Method.""" - self.sess = sess - self.state_dim = state_dim - self.action_dim = action_dim - # create actor network - self.state_input, self.action_output, self.net =\ - self.create_network(state_dim, action_dim) - - # create target actor network - self.target_state_input, self.target_action_output, self.target_update, self.target_net =\ - self.create_target_network(state_dim, action_dim, self.net) - - # define training rules - self.create_training_method() - - # self.sess.run(tf.initialize_all_variables()) - self.sess.run(tf.global_variables_initializer()) - - self.update_target() - # self.load_network() - - def create_training_method(self): - """Trainer.""" - self.q_gradient_input = tf.placeholder("float", [None, - self.action_dim]) - self.parameters_gradients = tf.gradients(self.action_output, - self.net, - -self.q_gradient_input) - self.optimizer = tf.train.AdamOptimizer(LEARNING_RATE).apply_gradients( - list(zip(self.parameters_gradients, self.net))) - - def create_network(self, state_dim, action_dim): - """Network Creater.""" - layer1_size = LAYER1_SIZE - layer2_size = LAYER2_SIZE - - state_input = tf.placeholder("float", [None, state_dim]) - regularization = tf.contrib.layers.l2_regularizer(scale=0.01) - - # ###################GET_VARIABLES################## - W1 = tf.get_variable("W1", shape=[state_dim, layer1_size], - regularizer=regularization) - b1 = tf.get_variable("b1", shape=[layer1_size], - regularizer=regularization, - initializer=tf.zeros_initializer) - W2 = tf.get_variable("W2", shape=[layer1_size, layer2_size], - regularizer=regularization) - b2 = tf.get_variable("b2", shape=[layer2_size], - regularizer=regularization, - initializer=tf.zeros_initializer) - - layer1 = tf.nn.relu(tf.matmul(state_input, W1) + b1) - layer1_norm = tf.layers.batch_normalization(layer1) - layer2 = tf.nn.relu(tf.matmul(layer1_norm, W2) + b2) - layer2_norm = tf.layers.batch_normalization(layer2) - - W_trackpos = tf.get_variable("W_trackpos", shape=[layer2_size, 1], - regularizer=regularization) - b_trackpos = tf.get_variable("b_trackpos", shape=[1], - regularizer=regularization, - initializer=tf.zeros_initializer) - W_vel = tf.get_variable("W_vel", shape=[layer2_size, 1], - regularizer=regularization) - b_vel = tf.get_variable("b_vel", shape=[1], - regularizer=regularization, - initializer=tf.zeros_initializer) - - trackpos = tf.nn.relu(tf.matmul(layer2_norm, W_trackpos) + b_trackpos) - trackpos_res = trackpos + tf.gather(state_input, indices=[20], axis=1) - - velocity = tf.nn.relu(tf.matmul(layer2_norm, W_vel) + b_vel) - velocity_res = velocity + tf.gather(state_input, indices=[21], axis=1) - action_output = tf.concat([trackpos_res, velocity_res], 1) - - return state_input, action_output, 
[W1, b1, W2, b2, - W_trackpos, b_trackpos, - W_vel, b_vel] - - def create_target_network(self, state_dim, action_dim, net): - """Target Network Creater.""" - state_input = tf.placeholder("float", [None, state_dim]) - ema = tf.train.ExponentialMovingAverage(decay=1 - TAU) - target_update = ema.apply(net) - target_net = [ema.average(x) for x in net] - - layer1 = tf.nn.relu(tf.matmul(state_input, target_net[0]) + target_net[1]) - layer2 = tf.nn.relu(tf.matmul(layer1, target_net[2]) + target_net[3]) - - trackpos = tf.nn.relu(tf.matmul(layer2, target_net[4]) + target_net[5]) - trackpos_res = trackpos + tf.gather(state_input, indices=[20], axis=1) - velocity = tf.nn.relu(tf.matmul(layer2, target_net[6]) + target_net[7]) - velocity_res = velocity + tf.gather(state_input, indices=[21], axis=1) - - action_output = tf.concat([trackpos_res, velocity_res], 1) - return state_input, action_output, target_update, target_net - - def update_target(self): - """Target update.""" - self.sess.run(self.target_update) - - def train(self, q_gradient_batch, state_batch): - """Train Function.""" - self.sess.run(self.optimizer, feed_dict={ - self.q_gradient_input: q_gradient_batch, - self.state_input: state_batch}) - - def actions(self, state_batch): - """Get Network Output.""" - return self.sess.run(self.action_output, feed_dict={ - self.state_input: state_batch}) - - def action(self, state): - """Get output for a single obs.""" - return self.sess.run(self.action_output, feed_dict={ - self.state_input: [state]})[0] - - def target_actions(self, state_batch): - """Get Target Network Outputs.""" - return self.sess.run(self.target_action_output, feed_dict={ - self.target_state_input: state_batch}) diff --git a/MADRaS/example_controllers/pid/DDPG/critic_network.py b/MADRaS/example_controllers/pid/DDPG/critic_network.py deleted file mode 100644 index 33ea328..0000000 --- a/MADRaS/example_controllers/pid/DDPG/critic_network.py +++ /dev/null @@ -1,133 +0,0 @@ -"""Critic-Network.""" - -import tensorflow as tf - -LAYER1_SIZE = 300 -LAYER2_SIZE = 600 -LEARNING_RATE = 1e-3 -TAU = 0.001 -L2 = 0.0001 - - -class CriticNetwork: - """docstring for CriticNetwork.""" - - def __init__(self, sess, state_dim, action_dim): - """Init Method.""" - self.time_step = 0 - self.sess = sess - # create q network - self.state_input, self.action_input, self.q_value_output, self.net =\ - self.create_q_network(state_dim, action_dim) - - # create target q network (the same structure with q network) - self.target_state_input, self.target_action_input, self.target_q_value_output, self.target_update =\ - self.create_target_q_network(state_dim, action_dim, self.net) - - self.create_training_method() - - # initialization - # self.sess.run(tf.initialize_all_variables()) - self.sess.run(tf.global_variables_initializer()) - - self.update_target() - - def create_training_method(self): - """Define training optimizer.""" - self.y_input = tf.placeholder("float", [None, 1]) - weight_decay = tf.add_n([L2 * tf.nn.l2_loss(var) for var in self.net]) - self.TD_error = tf.reduce_mean(tf.square(self.y_input - self.q_value_output)) - self.cost = self.TD_error + weight_decay - self.optimizer = tf.train.AdamOptimizer(LEARNING_RATE).minimize(self.cost) - self.action_gradients = tf.gradients(self.q_value_output, self.action_input) - - def create_q_network(self, state_dim, action_dim): - """Q Network.""" - layer1_size = LAYER1_SIZE - layer2_size = LAYER2_SIZE - - state_input = tf.placeholder("float", [None, state_dim]) - action_input = tf.placeholder("float", [None, 
action_dim]) - regularization = tf.contrib.layers.l2_regularizer(scale=0.01) - - W1 = tf.get_variable("W1_c", shape=[state_dim, layer1_size], - regularizer=regularization) - b1 = tf.get_variable("b1_c", shape=[layer1_size], - regularizer=regularization) - W2 = tf.get_variable("W2_c", shape=[layer1_size, layer2_size], - regularizer=regularization) - W2_action = tf.get_variable("W2_action", shape=[action_dim, layer2_size], - regularizer=regularization) - b2 = tf.get_variable("b2_c", shape=[layer2_size], - regularizer=regularization) - W3 = tf.get_variable("W3_c", shape=[layer2_size, 1], - regularizer=regularization) - b3 = tf.get_variable("b3_c", shape=[1], regularizer=regularization) - - layer1 = tf.nn.relu(tf.matmul(state_input, W1) + b1) - layer1_norm = tf.layers.batch_normalization(layer1) - layer2 = tf.nn.relu(tf.matmul(layer1_norm, W2) + - tf.matmul(action_input, W2_action) + b2) - layer2_norm = tf.layers.batch_normalization(layer2) - q_value_output = tf.identity(tf.matmul(layer2_norm, W3) + b3) - - return state_input, action_input, q_value_output, [W1, b1, W2, - W2_action, - b2, W3, b3] - - def create_target_q_network(self, state_dim, action_dim, net): - """Target Q Network.""" - state_input = tf.placeholder("float", [None, state_dim]) - action_input = tf.placeholder("float", [None, action_dim]) - - ema = tf.train.ExponentialMovingAverage(decay=1 - TAU) - target_update = ema.apply(net) - target_net = [ema.average(x) for x in net] - - layer1 = tf.nn.relu(tf.matmul(state_input, target_net[0]) + - target_net[1]) - layer1_norm = tf.layers.batch_normalization(layer1) - layer2 = tf.nn.relu(tf.matmul(layer1_norm, target_net[2]) + - tf.matmul(action_input, target_net[3]) + - target_net[4]) - layer2_norm = tf.layers.batch_normalization(layer2) - q_value_output = tf.identity(tf.matmul(layer2_norm, target_net[5]) + - target_net[6]) - - return state_input,action_input,q_value_output,target_update - - def update_target(self): - """Update Target Network.""" - self.sess.run(self.target_update) - - def train(self, y_batch, state_batch, action_batch): - """Train Step.""" - self.time_step += 1 - self.sess.run(self.optimizer, feed_dict={ - self.y_input: y_batch, - self.state_input: state_batch, - self.action_input: action_batch}) - - def gradients(self, state_batch, action_batch): - """Get Network Gradients.""" - return self.sess.run(self.action_gradients, feed_dict={ - self.state_input: state_batch, - self.action_input: action_batch})[0] - - def target_q(self, state_batch, action_batch): - """Output of the Target Q Network.""" - return self.sess.run(self.target_q_value_output, feed_dict={ - self.target_state_input: state_batch, - self.target_action_input: action_batch}) - - def q_value(self, state_batch, action_batch): - """Get Q-Value.""" - return self.sess.run(self.q_value_output, feed_dict={ - self.state_input: state_batch, - self.action_input: action_batch}) - - def td_error(self, state, action, y): - """TD Error Calculation.""" - return self.sess.run(self.TD_error, feed_dict={self.y_input: y, - self.state_input: state, - self.action_input: action}) diff --git a/MADRaS/example_controllers/pid/DDPG/ddpg.py b/MADRaS/example_controllers/pid/DDPG/ddpg.py deleted file mode 100644 index e0ae64e..0000000 --- a/MADRaS/example_controllers/pid/DDPG/ddpg.py +++ /dev/null @@ -1,173 +0,0 @@ -"""Deep Deterministic Policy Gradient Algorithm.""" -import math -import random -import numpy as np -from copy import deepcopy -import tensorflow as tf - -from example_controllers.pid.DDPG.OU import OU -from 
example_controllers.pid.DDPG.critic_network import CriticNetwork -from example_controllers.pid.DDPG.actor_network import ActorNetwork -from example_controllers.pid.DDPG.ReplayBuffer import ReplayBuffer -from utils.display_utils import * - -# Hyper Parameters: - -REPLAY_BUFFER_SIZE = 100000 -REPLAY_START_SIZE = 100 -BATCH_SIZE = 32 -GAMMA = 0.99 - - -class DDPG: - """Class DDPG.""" - - def __init__(self, env_name, state_dim, action_dim, save_location, reward_threshold, thresh_coin_toss): - """Init Method.""" - self.name = 'DDPG' # name for uploading results - self.env_name = env_name - # Randomly initialize actor network and critic network - # with both their target networks - self.state_dim = state_dim - self.action_dim = action_dim - self.save_location = save_location - - # Ensure action bound is symmetric - self.time_step = 0 - self.sess = tf.InteractiveSession() - - self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim) - self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim) - - # initialize replay buffer - self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) - - # Initialize a random process the Ornstein-Uhlenbeck process noise - self.OU = OU() - self.visualize = [] - - self.episode_printer = StructuredPrinter() - self.step_printer = StructuredPrinter() - - # loading networks - self.saver = tf.train.Saver() - checkpoint = tf.train.get_checkpoint_state(save_location) - if checkpoint and checkpoint.model_checkpoint_path: - self.saver.restore(self.sess, checkpoint.model_checkpoint_path) - print("Successfully loaded:", checkpoint.model_checkpoint_path) - else: - print("Could not find old network weights") - self.reward_threshold = reward_threshold - self.thresh_coin_toss = thresh_coin_toss - - def train(self): - """Train Method.""" - # Sample a random minibatch of N transitions from replay buffer - minibatch = self.replay_buffer.getBatch(BATCH_SIZE) - state_batch = np.asarray([data.state for data in minibatch]) - action_batch = np.asarray([data.action for data in minibatch]) - reward_batch = np.asarray([data.reward for data in minibatch]) - next_state_batch = np.asarray([data.next_state for data in minibatch]) - done_batch = np.asarray([data.done for data in minibatch]) - - # for action_dim = 1 - action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim]) - - # Calculate y_batch - - next_action_batch = self.actor_network.target_actions(next_state_batch) - q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch) - y_batch = [] - for i in range(len(minibatch)): - if done_batch[i]: - y_batch.append(reward_batch[i]) - else: - y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) - y_batch = np.resize(y_batch, [BATCH_SIZE, 1]) - # Update critic by minimizing the loss L - self.critic_network.train(y_batch, state_batch, action_batch) - - # Update the actor policy using the sampled gradient: - action_batch_for_gradients = self.actor_network.actions(state_batch) - q_gradient_batch = self.critic_network.gradients(state_batch, action_batch_for_gradients) - self.actor_network.train(q_gradient_batch, state_batch) - # Update the target networks - self.actor_network.update_target() - self.critic_network.update_target() - - def saveNetwork(self, i): - """Saver.""" - self.saver.save(self.sess, self.save_location + - self.env_name + 'network' + str(i) + - 'DDPG.ckpt', global_step=self.time_step) - - def action(self, state): - """Compute action.""" - action = self.actor_network.action(state) - return action - - def 
noise_action(self, state, epsilon): - """Select action a_t according to the current policy and noise.""" - action = self.actor_network.action(state) - temp_action = deepcopy(action) - self.step_printer.data["pi_TrackPos"] = action[0] - self.step_printer.data["pi_Velocity"] = action[1] - noise_t = np.zeros(self.action_dim) - noise_t[0] = epsilon * self.OU.function(action[0], 0.0, 0.60, 0.80) - noise_t[1] = epsilon * self.OU.function(action[1], 0.5, 1.00, 0.10) - action = action + noise_t - action[0] = np.clip(action[0], -1, 1) - action[1] = np.clip(action[1], 0, 1) - self.visualize.append((temp_action[1], - self.critic_network.q_value( - state.reshape((1, self.state_dim)), - action.reshape((1, self.action_dim))))) - return action, self.step_printer - - def perceive(self, temp_buffer, traj_reward): - """Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer.""" - if (traj_reward > self.reward_threshold) or\ - (self.replay_buffer.num_experiences == 0): - self.episode_printer.data["Replay_Buffer"] = "GOOD-Added" - # print("Adding GOOD trajectory with reward %0.2f"%traj_reward) - for sample in temp_buffer: - if (not (math.isnan(sample[2]))): - next_action = self.actor_network.target_actions( - sample[3].reshape((1, self.state_dim))) - q_value = self.critic_network.target_q( - sample[3].reshape((1, self.state_dim)), - next_action.reshape((1, self.action_dim))) - y = sample[2] if sample[4] else sample[2] + GAMMA * q_value - TD = self.critic_network.td_error( - sample[0].reshape((1, self.state_dim)), - sample[1].reshape((1, self.action_dim)), - np.resize(y, [1, 1])) - self.replay_buffer.add(sample[0], sample[1], - sample[2], sample[3], sample[4], TD) - else: - if random.uniform(0, 1) < self.thresh_coin_toss: - self.episode_printer.data["Replay_Buffer"] = "BAD-Added" - # print("Adding LUCKY BAD trajectory with reward%0.2f"%traj_reward) - for sample in temp_buffer: - if (not (math.isnan(sample[2]))): - next_action = self.actor_network.target_actions( - sample[3].reshape((1, self.state_dim))) - q_value = self.critic_network.target_q( - sample[3].reshape((1, self.state_dim)), - next_action.reshape((1, self.action_dim))) - y = sample[2] if sample[4] else sample[2] + GAMMA * q_value - TD = self.critic_network.td_error( - sample[0].reshape((1, self.state_dim)), - sample[1].reshape((1, self.action_dim)), - np.resize(y, [1, 1])) - self.replay_buffer.add(sample[0], sample[1], - sample[2], sample[3], sample[4], TD) - else: - self.episode_printer.data["Replay_Buffer"] = "BAD-Rejected" - # print("REJECTING BAD trajectory with reward %0.2f"%traj_reward) - - self.time_step = self.time_step + 1 - # Store transitions to replay start size then start training - if self.replay_buffer.count() > REPLAY_START_SIZE: - self.train() - return self.episode_printer diff --git a/MADRaS/example_controllers/pid/__init__.py b/MADRaS/example_controllers/pid/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/MADRaS/example_controllers/pid/configurations.yml b/MADRaS/example_controllers/pid/configurations.yml deleted file mode 100644 index 161bdd4..0000000 --- a/MADRaS/example_controllers/pid/configurations.yml +++ /dev/null @@ -1,20 +0,0 @@ -agent: - total_explore: 6000000.0 - max_eps: 400 - max_steps_eps: 5000 - epsilon_start: 0.4 - delta: 0.2 - start_reward: -10000.0 - reward_threshold: -200 - thresh_coin_toss: 0.3 - speed_buffer_size: 100 - velocity_limit: 100.0 - acceleration_limit: 0.5 - -configs: - save_location: "save_network_checkpoints/" - f_diagnostics: "./logger.log" - torcsPort: 3001 - configFile: 
"~/.torcs/config/raceman/quickrace.xml" - fig_save_dir: "./figures_heatmaps" - is_training: 1 diff --git a/MADRaS/example_controllers/pid/multi_agent.py b/MADRaS/example_controllers/pid/multi_agent.py deleted file mode 100644 index 26def08..0000000 --- a/MADRaS/example_controllers/pid/multi_agent.py +++ /dev/null @@ -1,19 +0,0 @@ -import threading -import multiprocessing -import tensorflow as tf -import example_controllers.pid.playGame_DDPG_pid as playGame_DDPG -from time import sleep, time - -with tf.device("/cpu:0"): - num_workers = 3 # multiprocessing.cpu_count() #use this when you want to use all the cpus - print("numb of workers is" + str(num_workers)) - -with tf.Session() as sess: - worker_threads = [] -#with pymp.Parallel(4) as p: #uncomment this for parallelization of threads - for i in range(num_workers): - worker_work = lambda: (playGame_DDPG.playGame(f_diagnostics=""+str(i), train_indicator=1, port=3001+i)) - t = threading.Thread(target=(worker_work)) - t.start() - sleep(0.5) - worker_threads.append(t) diff --git a/MADRaS/example_controllers/pid/playGame_DDPG_pid.py b/MADRaS/example_controllers/pid/playGame_DDPG_pid.py deleted file mode 100644 index f58c8c0..0000000 --- a/MADRaS/example_controllers/pid/playGame_DDPG_pid.py +++ /dev/null @@ -1,267 +0,0 @@ -""" -DDPG Training. - -PID Control = True -""" -import sys -import os -import gc -import yaml -from copy import deepcopy -import numpy as np -import matplotlib.pyplot as plt - -from utils.madras_datatypes import Madras -from utils.gym_madras import MadrasEnv -from utils.display_utils import * -from example_controllers.pid.DDPG.ddpg import * - - -np.random.seed(1337) -gc.enable() -figure = plt.figure() -madras = Madras() - -with open("example_controllers/pid/configurations.yml", "r") as ymlfile: - cfg = yaml.load(ymlfile) - -# config parameters are printed in main - - -def visualize_action_value(list_Q, fig, ep_no): - """Heatmap Plotter.""" - actions_lane = [] - actions_vel = [] - Q_values = [] - for act, q in list_Q: - actions_lane.append(act[0]) - actions_vel.append(act[1]) - Q_values.append(q[0]) - actions_vel = np.asarray(actions_vel) - actions_lane = np.asarray(actions_lane) - Q_values = np.asarray(Q_values) - plot_heatmap(actions_lane, actions_vel, Q_values) - # plt.subplot(211) - # plt.clf() - # plt.pause(1) - # plt.scatter(actions_lane, Q_values, alpha=0.5) - # plt.title("Lane_Pos") - # plt.subplot(212) - # plt.scatter(actions_vel, Q_values, alpha=0.5) - # plt.title("Velocity") - plt.savefig(os.path.join(cfg['configs']['fig_save_dir'], - "plot_" + str(ep_no) + ".png")) - # plt.show(block=False) - # plt.pause(1) - plt.clf() - # plt.pause(1) - # fig.clear() - - -def playGame(f_diagnostics, train_indicator, port=3101): - """Training Method.""" - EXPLORE = cfg['agent']['total_explore'] - episode_count = cfg['agent']['max_eps'] - max_steps = cfg['agent']['max_steps_eps'] - EPSILON = cfg['agent']['epsilon_start'] - DELTA = cfg['agent']['delta'] - done = False - epsilon_steady_state = 0.01 # This is used for early stopping. - running_average_distance = 0. - max_distance = 0. - best_reward = cfg['agent']['start_reward'] - running_avg_reward = 0. 
- weights_location = cfg['configs']['save_location'] + str(port) + "/" - episode_printer = StructuredPrinter(mode="episode") - step_printer = StructuredPrinter(mode="step") - # Set up a Torcs environment - print(("TORCS has been asked to use port: ", port)) - env = MadrasEnv(vision=False, throttle=True, gear_change=False, - port=port, pid_assist=True) - - agent = DDPG(env.env_name, env.state_dim, env.action_dim, weights_location, - cfg['agent']['reward_threshold'], cfg['agent']['thresh_coin_toss']) - - s_t = env.reset() - print("\n\n-----------------Experiment Begins-----------------------\n\n") - - for i in range(episode_count): - - save_indicator = 1 - temp_distance = 0. - total_reward = 0. - info = {'termination_cause': 0} - distance_traversed = 0. - speed_array = [] - trackPos_array = [] - dmg = 0.0 - dmg_S = 0.0 - discount = 0.1 - CRASH_COUNTER = 0 - SPEED_BUFFER_SIZE = cfg['agent']['speed_buffer_size'] - count_stop = 0 - temp_replay_buffer = [] - - for step in range(max_steps): - dst = env.distance_traversed - - if (train_indicator): - - step_printer.data["Dist_Raced"] = dst - if dst < running_average_distance: - epsilon = EPSILON - DELTA - step_printer.data["eps"] = epsilon - step_printer.explore_mode = -1 - elif dst > max_distance: - epsilon = EPSILON + DELTA - step_printer.data["eps"] = epsilon - step_printer.explore_mode = 1 - else: - epsilon = deepcopy(EPSILON) - step_printer.data["eps"] = epsilon - step_printer.explore_mode = 0 - EPSILON -= 1.0 / EXPLORE - EPSILON = max(EPSILON, epsilon_steady_state) - d_t, step_info = agent.noise_action(s_t, epsilon) - for k in list(step_info.data.keys()): - step_printer.data[k] = step_info.data[k] - # Giving the car an inital kick - if step <= 50: - d_t[1] = abs(d_t[1]) + 0.1 - - else: # Testing phase - d_t = agent.action(s_t) - try: - s_t1, r_t, done, info = env.step(d_t) - distance_traversed += env.distance_traversed - speed_array.append(env.ob.speedX * np.cos(env.ob.angle)) - trackPos_array.append(env.ob.trackPos) - # Add to replay buffer only if training - if train_indicator: - temp_replay_buffer.append([s_t, d_t, r_t, s_t1, done]) - total_reward += r_t - s_t = s_t1 - dmg_S = 0.1 * dmg_S + (env.ob.damage - dmg) * 10 - if not dmg == env.ob.damage: - CRASH_COUNTER = CRASH_COUNTER + 1 - dmg = env.ob.damage - step_printer.data["Desired_Trackpos"] = d_t[0] - step_printer.data["Desired_Velocity"] = d_t[1] - step_printer.data["Reward"] = r_t - step_printer.print_step(step) - - except: - pass - - if done: - temp_distance = dst - if temp_distance > max_distance: - max_distance = temp_distance - break - - if train_indicator: - episode_info = agent.perceive(temp_replay_buffer, total_reward) - # Train agent - - for k in list(episode_info.data.keys()): - episode_printer.data[k] = episode_info.data[k] - - temp_replay_buffer = [] - - # Saving the best model yet - if save_indicator == 1 and train_indicator == 1: - if best_reward < total_reward: - if(best_reward < total_reward): - best_reward = total_reward - agent.saveNetwork(port) - - running_avg_reward = running_average(running_avg_reward, - i + 1, total_reward) - running_average_distance = running_average(running_average_distance, - i + 1, temp_distance) - temp_distance = 0. 
- - episode_printer.data["Total_Steps"] = step - episode_printer.data["Agv_Speed"] = np.mean(speed_array) - episode_printer.data["Avg_TrackPos"] = np.mean(trackPos_array) - episode_printer.data["Dist_Traversed"] = distance_traversed - episode_printer.data["Traj_Reward"] = total_reward - episode_printer.data["Run_Avg_Traj_Reward"] = running_avg_reward - episode_printer.data["Run_Avg_Dist_Trav"] = running_average_distance - episode_printer.data["Max_Dist_Trav"] = max_distance - episode_printer.data["Replay_Buffer_Size"] = agent.replay_buffer.num_experiences - - episode_printer.print_episode(i) - ''' - agent.visualize = [] - for data in agent.replay_buffer.getBatch(500):#use getBatch(500) - here to reduce latency - action = np.asarray(data.action) - state = np.asarray(data.state) - agent.visualize.append((action,agent.critic_network.q_value(state.reshape((1,agent.state_dim)),action.reshape((1,agent.action_dim))))) - visualize_action_value(agent.visualize,figure,i) - agent.visualize = [] - ''' - s_t = env.reset(info) - """ document_episode(i, distance_traversed, speed_array, trackPos_array, - info, running_avg_reward, f_diagnostics) - """ - - env.end() # This is for shutting down TORCS-engine - plt.close(figure) - print("Finished.") - - -def document_episode(episode_no, distance_traversed, speed_array, - trackPos_array, info, running_avg_reward, f_diagnostics): - """ - Note down a tuple of diagnostic values for each episode. - - (episode_no, distance_traversed, mean(speed_array), std(speed_array), - mean(trackPos_array), std(trackPos_array), info[termination_cause], - running_avg_reward) - """ - f_diagnostics.write(str(episode_no) + ",") - f_diagnostics.write(str(distance_traversed) + ",") - f_diagnostics.write(str(np.mean(speed_array)) + ",") - f_diagnostics.write(str(np.std(speed_array)) + ",") - f_diagnostics.write(str(np.mean(trackPos_array)) + ",") - f_diagnostics.write(str(np.std(trackPos_array)) + ",") - f_diagnostics.write(str(running_avg_reward) + "\n") - - -def running_average(prev_avg, num_episodes, new_val): - """Running average compute.""" - total = prev_avg * (num_episodes - 1) - total += new_val - return madras.floatX(total / num_episodes) - -def analyse_info(info, printing=True): - """Print Helper Function""" - simulation_state = ['Normal', 'Terminated as car is OUT OF TRACK', - 'Terminated as car has SMALL PROGRESS', - 'Terminated as car has TURNED BACKWARDS'] - if printing and info['termination_cause'] != 0: - print((simulation_state[info['termination_cause']])) - -if __name__ == "__main__": - - try: - port = int(sys.argv[1]) - except Exception as e: - print("Usage : python %s " % (sys.argv[0])) - sys.exit() - - print('is_training : ' + str(cfg['configs']['is_training'])) - print('Starting best_reward : ' + str(cfg['agent']['start_reward'])) - print((cfg['agent']['total_explore'])) - print((cfg['agent']['max_eps'])) - print((cfg['agent']['max_steps_eps'])) - print((cfg['agent']['epsilon_start'])) - print('config_file : ' + cfg['configs']['configFile']) - f_diagnostics = open("logger_17June.log", 'w') - f_diagnostics.write("EPISODE,distance_traversed,mean_speed,std_speed,\ - mean_track,std_track,reward_runn\n") - playGame(f_diagnostics, train_indicator=cfg['configs']['is_training'], - port=port) - f_diagnostics.close() diff --git a/MADRaS/test_environment_v2.py b/MADRaS/test_environment_v2.py deleted file mode 100644 index 8ab1369..0000000 --- a/MADRaS/test_environment_v2.py +++ /dev/null @@ -1,42 +0,0 @@ -import numpy as np -import gym -from envs.gym_madras_v2 import 
MadrasEnv -import os - - -def test_madras_vanilla(): - env = MadrasEnv() - print("Testing reset...") - obs = env.reset() - print("Initial observation: {}." - " Verify if the number of dimensions {} is right.".format(obs, len(obs))) - print("Testing step...") - for t in range(20000): - obs, r, done, _ = env.step([[0.0, 1.0, -1.0]]) - print("{}: reward={}, done={}".format(t, r, done)) - dones = [x for x in done.values()] - if np.all(dones): - env.reset() - os.system("pkill torcs") - - -def test_madras_pid(): - env = MadrasEnv() - print("Testing reset...") - obs = env.reset() - print("Initial observation: {}." - " Verify if the number of dimensions {} is 29.".format(obs, len(obs))) - print("Testing step...") - for t in range(2000): - obs, r, done, _ = env.step([[0.0, 1.0], - [0.0, 1.0]]) - print("{}: reward={}, done={}".format(t, r, done)) - dones = [x for x in done.values()] - if np.all(dones): - env.reset() - os.system("pkill torcs") - - -if __name__=='__main__': - # test_madras_vanilla() - test_madras_pid() \ No newline at end of file diff --git a/MADRaS/traffic/traffic.py b/MADRaS/traffic/traffic.py index abc68bd..befeddb 100644 --- a/MADRaS/traffic/traffic.py +++ b/MADRaS/traffic/traffic.py @@ -1,35 +1,39 @@ import numpy as np -from utils.gym_torcs import TorcsEnv -import utils.snakeoil3_gym as snakeoil3 -from controllers.pid import PIDController -import utils.madras_datatypes as md +from MADRaS.utils.gym_torcs_v2 import TorcsEnv +import MADRaS.utils.snakeoil3_gym as snakeoil3 +from MADRaS.controllers.pid import PIDController from multiprocessing import Process +from collections import OrderedDict +from time import time -MadrasDatatypes = md.MadrasDatatypes() -class MadrasTrafficManager(object): +class MadrasTrafficHandler(object): """Creates the traffic agents for a given training configuration.""" def __init__(self, torcs_server_port, num_learning_agents, cfg): - self.traffic_agents = {} + self.traffic_agents = OrderedDict() self.traffic_processes = [] + self.num_episodes_of_training = 0 for i, agent in enumerate(cfg): agent_type = [x for x in agent.keys()][0] # TODO(santara): find a better way of extracting key from a dictionary with a single entry agent_config = agent[agent_type] agent_name = '{}_{}'.format(agent_type, i) - agent_port = torcs_server_port + i + num_learning_agents + agent_port = torcs_server_port + i try: exec("self.traffic_agents['{}'] = {}({}, {}, '{}')".format( agent_name, agent_type, agent_port, agent_config, agent_name)) except Exception as e: - raise ValueError("Unknown traffic class {} or incomplete specification.\n Original error: {}".format( + raise ValueError("Unknown traffic class {} or " + "incomplete specification.\n Original error: {}".format( agent_type, e)) def get_traffic_agents(self): return self.traffic_agents - def flag_off_traffic(self): - for agent in self.traffic_agents.values(): - self.traffic_processes.append(Process(target=agent.flag_off)) + def flag_off_traffic(self, num_cars_to_reset): + self.num_episodes_of_training += 1 + for i, agent in enumerate(self.traffic_agents.values()): + if i < num_cars_to_reset: + self.traffic_processes.append(Process(target=agent.flag_off, args=(i+self.num_episodes_of_training,))) for traffic_process in self.traffic_processes: traffic_process.start() @@ -38,9 +42,10 @@ def kill_all_traffic_agents(self): traffic_process.terminate() self.traffic_processes = [] - def reset(self): + def reset(self, num_cars_to_reset): self.kill_all_traffic_agents() - self.flag_off_traffic() + 
self.flag_off_traffic(num_cars_to_reset) + class MadrasTrafficAgent(object): @@ -53,7 +58,6 @@ def __init__(self, port, cfg, name='MadrasTraffic'): self.env = TorcsEnv(vision=(cfg["vision"] if "vision" in cfg else False), throttle=(cfg["throttle"] if "throttle" in cfg else True), gear_change=(cfg["gear_change"] if "gear_change" in cfg else False), - visualise=(self.cfg["visualise"] if "visualise" in self.cfg else False), name=self.name ) self.PID_controller = PIDController(cfg["pid_settings"]) @@ -84,15 +88,17 @@ def wait_for_observation(self): def get_action(self): raise NotImplementedError("Successor classes must implement this method") - def flag_off(self, sleep=0): + def flag_off(self, random_seed=0): + del random_seed self.wait_for_observation() + self.client.respond_to_server() print("[{}]: My server is at {}".format(self.name, self.client.serverPID)) self.is_alive = True while True: if self.is_alive: action = self.get_action() try: - self.ob, _, done, info = self.env.step(0, self.client, action) + self.ob, _, done, _ = self.env.step(0, self.client, action) except Exception as e: print("Exception {} caught by {} traffic agent at port {}".format( @@ -209,3 +215,50 @@ def get_action(self): self.brake = 1 self.stopped_for -= 1 return np.asarray([self.steer, self.accel, self.brake]) + + +class ParkedAgent(MadrasTrafficAgent): + def __init__(self, port, cfg, name): + super(ParkedAgent, self).__init__(port, cfg, name) + self.target_speed = cfg["target_speed"]/self.env.default_speed + self.track_len = cfg["track_len"] + + def flag_off(self, random_seed=0): + self.time = time() + np.random.seed(random_seed) + parking_lane_pos_range = (self.cfg["parking_lane_pos"]["high"] - + self.cfg["parking_lane_pos"]["low"]) + self.parking_lane_pos = (self.cfg["parking_lane_pos"]["low"] + + np.random.random()*parking_lane_pos_range) + parking_dist_range = (self.cfg["parking_dist_from_start"]["high"] - + self.cfg["parking_dist_from_start"]["low"]) + self.parking_dist_from_start = (self.cfg["parking_dist_from_start"]["low"] + + np.random.random()*parking_dist_range) + self.behind_finish_line = True + self.prev_dist_from_start = self.track_len + self.parked = False + self.init_flag = True + super(ParkedAgent, self).flag_off() + + def get_action(self): + self.distance_from_start = self.ob.distFromStart + + if (self.behind_finish_line): + self.distance_from_start -= self.track_len + + if (abs(self.distance_from_start) >= 0.95*self.track_len): + self.behind_finish_line = False + + #print("START DIST:{} TRACK LEN: {}".format(self.distance_from_start, self.track_len)) + if not self.behind_finish_line and self.distance_from_start >= self.parking_dist_from_start: + self.steer, self.accel, self.brake = 0.0, 0.0, 1.0 + if not self.parked: + print("{} parked at lanepos: {}, distFromStart: {} after time {} sec".format( + self.name, self.parking_lane_pos, self.parking_dist_from_start, time()-self.time)) + self.parked = True + else: + action = self.PID_controller.get_action([self.parking_lane_pos, self.target_speed]) + self.steer, self.accel, self.brake = action[0], action[1], action[2] + self.avoid_impending_collision() + self.prev_dist_from_start = self.distance_from_start + return np.asarray([self.steer, self.accel, self.brake]) \ No newline at end of file diff --git a/MADRaS/utils/communications_buffer.py b/MADRaS/utils/communications_buffer.py new file mode 100644 index 0000000..d8b0540 --- /dev/null +++ b/MADRaS/utils/communications_buffer.py @@ -0,0 +1,56 @@ +from queue import Queue +from copy import deepcopy 
+import numpy as np +import MADRaS.utils.madras_datatypes as md + +mt = md.MadrasDatatypes() + +class CommBuffer: + """ + To hold the last k actions of taken + by an agent. + """ + + def __init__(self, agent_id, size, buffer_dim): + self._agent_id = agent_id + self._size = size + self._buffer_dim = buffer_dim + self._buffer = Queue(maxsize=self._size) + self._curr_size = 0 + + def insert(self, full_obs, var_list): + + if (self._curr_size < self._size): + self._curr_size += 1 + self._buffer.put(self.parse_buffer_items(full_obs, var_list)) + else: + _ = self._buffer.get() + self._buffer.put(self.parse_buffer_items(full_obs, var_list)) + + def request(self): + temp_queue = Queue(maxsize=self._size) + ret = np.zeros((self._size*self._buffer_dim,), dtype=mt.floatX) + for i in range(self._curr_size): + agent_var = self._buffer.get() + temp_queue.put(agent_var) + ret[i*self._buffer_dim: (i+1)*self._buffer_dim] = agent_var + self._buffer = temp_queue + return ret + + def parse_buffer_items(self, full_obs_list, var_list): + buffer_array = [] + for full_obs_act in full_obs_list: + full_obs, action = full_obs_act + for var in var_list: + if var == 'action': + buffer_array.append(action) + else: + val = full_obs[var] + buffer_array.append(val) + buffer_array = np.hstack(buffer_array) + return buffer_array + + def reset(self): + del self._buffer + self._buffer = Queue(maxsize=self._size) + self._curr_size = 0 diff --git a/MADRaS/envs/config_parser.py b/MADRaS/utils/config_parser.py similarity index 84% rename from MADRaS/envs/config_parser.py rename to MADRaS/utils/config_parser.py index 1ad6a44..2ba7cb5 100644 --- a/MADRaS/envs/config_parser.py +++ b/MADRaS/utils/config_parser.py @@ -15,6 +15,10 @@ def __init__(self): self.no_of_visualisations = 1 self.track_len = 7014.6 self.max_steps = 20000 + self.server_config = {} + self.noisy_observations = False + self.track_limits = {'low': -1, 'high': 1} + self.randomoze_env = False self.agents = {} self.traffic = [] @@ -28,7 +32,8 @@ def update(self, cfg_dict): if cfg_dict is None: return direct_attributes = ['torcs_server_port', 'visualise', 'vision', 'no_of_visualizations', 'track_len', - 'max_steps', 'agents', 'traffic'] + 'max_steps', 'agents', 'traffic', 'init_wait', 'server_config', 'track_limits', + 'randomize_env'] for key in direct_attributes: if key in cfg_dict: exec("self.{} = {}".format(key, cfg_dict[key])) @@ -50,6 +55,8 @@ def __init__(self): self.normalize_actions = False self.early_stop = True self.observations = None + self.action_noise_std = 0.001 + self.add_noise_to_actions = False self.rewards = {} self.dones = {} @@ -64,7 +71,7 @@ def update(self, cfg_dict): return direct_attributes = ['track_len', 'max_steps', 'vision', 'throttle', 'gear_change', 'pid_assist', 'pid_latency', 'target_speed', 'early_stop', 'normalize_actions', 'observations', 'rewards', 'dones', - 'pid_settings'] + 'pid_settings', 'action_noise_std', 'add_noise_to_actions'] for key in direct_attributes: if key in cfg_dict: exec("self.{} = {}".format(key, cfg_dict[key])) diff --git a/MADRaS/utils/data/car_config.template b/MADRaS/utils/data/car_config.template new file mode 100644 index 0000000..870f839 --- /dev/null +++ b/MADRaS/utils/data/car_config.template @@ -0,0 +1,4 @@ +
+ [car_config.template XML body not preserved]
\ No newline at end of file diff --git a/MADRaS/utils/data/car_dynamics.py b/MADRaS/utils/data/car_dynamics.py new file mode 100644 index 0000000..83dd8c7 --- /dev/null +++ b/MADRaS/utils/data/car_dynamics.py @@ -0,0 +1,95 @@ +dynamics_data = { + "155-DTM": [[0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000, 11000, 12000, 13000, 14000, 15000, 16000, 17000, 18000, 19000, 20000], + [750.0, 363.5057471264368, 408.80079284902195, 435.3448275862069, 462.6436781609196, 462.35795454545456, 471.264367816092, 451.24350555006237, 401.02675179762593, 352.4077041741376, 330.06878830032144, 750.0, 750.0, 750.0, 750.0, 711.6161949594086, 667.1401827744456, 627.8966426112429, 593.0134957995072, 561.8022591784805, 533.7121462195564]], + "acura-nsx-sz": [[0, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 5500, 6000, 6500, 7000, 7500, 8000, 8500, 9000, 9500, 10000], + [280, 300, 350, 390, 430, 430, 450, 450, 460, 460, 460, 440, 420, 390, 360, 350, 340, 320, 300, 280, 250]], + "baja-bug": [[0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000, 11000, 12000, 13000, 14000, 15000, 16000, 17000, 18000, 19000, 20000], + [2000.0, 150.60536398467436, 160.51027785254712, 177.2462587670076, 177.3406511327918, 156.87826830977735, 109.13270838941034, 75.0287370177196, 42.145593869731805, 1186.8182037348374, 1068.1363833613536, 971.0330757830488, 890.1136528011281, 821.643371816426, 762.9545595438242, 712.0909222409025, 667.5852396008461, 628.3155196243257, 593.4091018674187, 562.1770438743966, 534.0681916806768]], + "buggy": [[0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000, 11000, 12000, 13000, 14000, 15000, 16000, 17000, 18000, 19000, 20000], + [2000.0, 193.25583777291413, 254.64298624195746, 345.58690989979937, 384.2380774543822, 363.7756946313678, 297.8413499794324, 229.6334072360509, 181.8878473156839, 1186.8182037348374, 1068.1363833613536, 971.0330757830488, 890.1136528011281, 821.643371816426, 762.9545595438242, 712.0909222409025, 667.5852396008461, 628.3155196243257, 593.4091018674187, 562.1770438743966, 534.0681916806768]], + "car1-ow1": [[0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000, 11000, 12000, 13000, 14000, 15000, 16000, 17000, 18000, 19000, 20000], + [100, 100, 120, 140, 160, 180, 220, 260, 300, 340, 340, 340.0, 340.0, 345.0, 350.0, 355.0, 360.0, 360.0, 360.0, 330.0, 300.0]], + "car1-stock1": [[0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000], + [100, 200, 333, 420, 520, 640, 670, 710, 690, 663, 590]], + "car1-stock2": [[0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000], + [100, 290, 390, 480, 560, 640, 650, 640, 600, 560, 500]], + "car1-trb1": [[0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000], + [100.0, 160.0, 190.0, 280.0, 350, 405.0, 443.0, 465.0, 483.0, 415.0, 360.0]], + "car1-trb3": [[0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000], + [100.0, 160.0, 190.0, 240.0, 300.0, 350, 405, 435, 430, 370, 293]], + "car2-trb1": [[0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000], + [100.0, 220.0, 310.0, 370.0, 420.0, 460.0, 490.0, 500.0, 485.0, 428.0, 310.0]], + "car3-trb1": [[0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000], + [90.0, 230.0, 350.0, 450.0, 530.0, 520.0, 500.0, 475.0, 450.0, 415.0, 369.0]], + "car4-trb1": [[0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000], + [80.0, 190.0, 230.0, 330.0, 405.0, 475.0, 505.0, 515.0, 480.0, 410.0, 330.0]], + "car5-trb1": [[0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000], + [97.0, 
222.0, 325.0, 470.0, 560.0, 555.0, 545.0, 511.0, 471.0, 410.0, 320.0]], + "car6-trb1": [[0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000], + [91.0, 287.0, 377.0, 455.0, 493.0, 517.0, 523.0, 502.0, 473.0, 347.98, 185.10]], + "car7-trb1": [[0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000], + [91.0, 287.0, 377.0, 455.0, 492.0, 517.0, 501.0, 492.0, 453.0, 333.98, 185.10]], + "car8-trb1": [[0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000], + [97.0, 222.0, 325.0, 470.0, 560.0, 555.0, 545.0, 521.0, 471.0, 410.0, 320.0]], + "kc-2000gt": [[0.000000, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 5500, 6000, 6500, 7000, 7500, 8000, 8500, 9000, 9500, 10000], + [0, 140, 150, 150, 155, 160, 165, 168, 172, 175, 177, 172, 167, 161, 152, 142, 132, 130, 0.000000, 0.000000, 0.000000]], + "kc-5300gt": [[0, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 5500, 6000, 6500, 7000], + [0, 110, 170, 260, 350, 430, 475, 488, 480, 470, 460, 445, 415, 380, 335]], + "kc-110": [[0, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 5500, 6000, 6500, 7000, 7500, 8000, 8500, 9000, 9500, 10000], + [0, 60, 75, 87, 98, 109, 115, 118, 120, 122, 121.6, 121.2, 120.8, 120.4, 120, 107, 95, 80, 64, 0, 0]], + "p406": [[0, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 5500, 6000, 6500, 7000], + [20, 60, 110, 110, 140, 167, 195, 220, 240, 260, 255, 250, 220, 200, 160]], + "pw-imprezawrc": [[0, 1024, 2048, 3072, 4096, 5120, 5888, 7168, 8192, 9216, 10240], + [100, 317, 339, 355, 415, 452, 482, 438, 364, 210, 147]], + "pw-focuswrc": [[0, 1024, 2048, 3072, 4096, 5120, 5888, 7168, 8192, 9216, 10240], + [100, 217, 339, 365, 461, 466, 472, 428, 354, 200, 147]], + "pw-evoviwrc": [[0, 1024, 2048, 3072, 4096, 5120, 5888, 7168, 8192, 9216, 10000], + [0, 230, 345, 395, 461, 471, 472, 458, 354, 200, 147]], + "pw-206wrc": [[0, 1024, 2048, 3072, 4096, 5120, 5888, 7168, 8192, 9216, 10000], + [100, 247, 369, 395, 466, 472, 467, 458, 384, 230, 177]], + "pw-corollawrc": [[0, 1024, 2048, 3072, 4096, 5120, 5888, 7168, 8192, 9216, 10000], + [100, 227, 310, 375, 471, 472, 462, 438, 364, 210, 157]], + "pw-306wrc": [[0, 1024, 2048, 3072, 4096, 5120, 5888, 7168, 8192, 9216, 10000], + [100, 247, 369, 395, 456, 462, 458, 445, 384, 230, 177]] +} + +import matplotlib.pyplot as plt + +def generate_plots(): + for car in dynamics_data.keys(): + rpm = dynamics_data[car][0] + torque = dynamics_data[car][1] + plt.figure() + plt.plot(rpm, torque) + plt.title(car) + plt.xlabel('rpm') + plt.ylabel('torque (N.m)') + plt.savefig(car+'.png') + + +cars_to_compare = [ + "car1-stock1", + "car1-stock2", + "155-DTM", + "car3-trb1", + "kc-2000gt", + "buggy", + "baja-bug" +] + + +def plot_to_compare(): + plt.figure() + for car in cars_to_compare: + rpm = dynamics_data[car][0] + torque = dynamics_data[car][1] + plt.plot(rpm, torque) + plt.title("Engine responses compared (Train)") + plt.xlabel('rpm') + plt.ylabel('torque (N.m)') + plt.legend(cars_to_compare) + plt.savefig('engine_responses_train.png') + +if __name__=='__main__': + # generate_plots() + plot_to_compare() \ No newline at end of file diff --git a/MADRaS/utils/data/quickrace.template b/MADRaS/utils/data/quickrace.template new file mode 100644 index 0000000..db49804 --- /dev/null +++ b/MADRaS/utils/data/quickrace.template @@ -0,0 +1,78 @@ + + + + + +
+ [quickrace.xml template body, 78 lines: the XML markup was stripped during extraction and only the {car_config} placeholder survives here. From its use in torcs_server_config.py further down in this diff, the template is rendered with {track_name}, {track_category} and {distance_to_start} in addition to {car_config}.]
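For orientation (not part of the diff): the template above is consumed by TorcsConfig.generate_torcs_server_config, added further down in this diff. A minimal sketch of that substitution, assuming illustrative file paths and a hypothetical num_cars value:

# Sketch of how quickrace.template is rendered, based on
# TorcsConfig.generate_torcs_server_config below; paths and num_cars are illustrative.
with open("MADRaS/utils/data/quickrace.template") as f:
    quickrace_template = f.read()
with open("MADRaS/utils/data/car_config.template") as f:  # per-car template, not shown in this diff
    car_config_template = f.read()

num_cars = 3  # traffic cars + learning agents
car_config = "\n".join(car_config_template.format(section_no=i + 1, car_no=i)
                       for i in range(num_cars))

quickrace_xml = quickrace_template.format(
    track_name="g-track-1",      # any entry of ROAD_TRACK_NAMES / DIRT_TRACK_NAMES
    track_category="road",
    distance_to_start=0,
    car_config=car_config,
)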
\ No newline at end of file diff --git a/MADRaS/utils/data/scr_server_config.template b/MADRaS/utils/data/scr_server_config.template new file mode 100644 index 0000000..2e3e47a --- /dev/null +++ b/MADRaS/utils/data/scr_server_config.template @@ -0,0 +1,129 @@ + + + + + + + + + + +
+ [scr_server.xml driver-config template body, 129 lines: the XML markup was stripped during extraction and nothing of it survives here. From its use in torcs_server_config.py further down in this diff, it is rendered with ten positional car-name slots, one per scr_server robot.]
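Similarly (not part of the diff), scr_server_config.template is filled with one positional car-name slot per robot; a short sketch under the same assumptions, with illustrative car counts and names:

# Sketch of how scr_server_config.template is filled (see
# generate_torcs_server_config below); counts and car names are illustrative.
MAX_NUM_CARS = 10  # ten scr_server robots in the TORCS GUI
with open("MADRaS/utils/data/scr_server_config.template") as f:
    scr_server_config_template = f.read()

traffic_car, learning_cars = "car1-trb1", ["car1-stock1"]
num_traffic_cars = 2
car_name_list = ([traffic_car] * num_traffic_cars
                 + learning_cars
                 + [traffic_car] * (MAX_NUM_CARS - num_traffic_cars - len(learning_cars)))
scr_server_xml = scr_server_config_template.format(*car_name_list)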
diff --git a/MADRaS/utils/data/track_details.py b/MADRaS/utils/data/track_details.py new file mode 100644 index 0000000..bb04d22 --- /dev/null +++ b/MADRaS/utils/data/track_details.py @@ -0,0 +1,63 @@ +track_lengths = { + "aalborg": 2587.54, + "alpine-1": 6355.65, + "alpine-2": 3773.57, + "brondehach": 3919.31, + "g-track-1": 2057.56, + "g-track-2": 3185.83, + "g-track-3": 2843.10, + "corkscrew": 3608.45, + "eroad": 3260.43, + "e-track-1": 3243.64, + "e-track-2": 5380.50, + "e-track-3": 4208.37, + "e-track-4": 7041.68, + "e-track-6": 4441.29, + "forza": 5784.10, + "ole-road-1": 6282.81, + "ruudskogen": 3274.20, + "spring": 22129.77, + "street-1": 3823.05, + "wheel-1": 4328.54, + "wheel-2": 6205.46, + "dirt-1": 1072.93, + "dirt-2": 1760.94, + "dirt-3": 2205.93, + "dirt-4": 3260.43, + "dirt-5": 1072.93, + "dirt-6": 3147.46, + "mixed-1": 1014.22, + "mixed-2": 1412.90 +} + +track_widths = { + "aalborg": 10.0, + "alpine-1": 12.0, + "alpine-2": 10.0, + "brondehach": 13.0, + "g-track-1": 15.0, + "g-track-2": 15.0, + "g-track-3": 10.0, + "corkscrew": 12.0, + "eroad": 16.0, + "e-track-1": 15.0, + "e-track-2": 12.0, + "e-track-3": 12.0, + "e-track-4": 15.0, + "e-track-6": 13.0, + "forza": 11.0, + "ole-road-1": 10.0, + "ruudskogen": 11.0, + "spring": 12.0, + "street-1": 14.0, + "wheel-1": 14.0, + "wheel-2": 12.0, + "dirt-1": 10.0, + "dirt-2": 10.0, + "dirt-3": 10.0, + "dirt-4": 16.0, + "dirt-5": 10.0, + "dirt-6": 15.0, + "mixed-1": 10.0, + "mixed-2": 10.0 +} \ No newline at end of file diff --git a/MADRaS/utils/done_handler_v2.py b/MADRaS/utils/done_handler_v2.py new file mode 100644 index 0000000..e53f2d6 --- /dev/null +++ b/MADRaS/utils/done_handler_v2.py @@ -0,0 +1,161 @@ +import numpy as np +import math +import warnings + + +class DoneHandler(object): + """Composes the done function from a given done configuration.""" + def __init__(self, cfg, agent_id): + self.agent_id = agent_id + self.dones = {} + for key in cfg: + try: + exec("self.dones['{}'] = {}()".format(key, key)) + except: + raise ValueError("Unknown done class {}".format(key)) + + if not self.dones: + warnings.warn("No done function specified. Setting TorcsDone " + "as done.") + self.dones['TorcsDone'] = TorcsDone() + + def get_done_signal(self, game_config, game_state): + done_signals = [] + + for key, done_function in self.dones.items(): + done_val = done_function.check_done(game_config, game_state) + done_signals.append(done_val) + if done_val: + if hasattr(done_function, "num_steps"): + done_function.reason = done_function.reason.format(done_function.num_steps) + out = "[{}] {}".format(self.agent_id, done_function.reason) + print(out) + + signal = np.any(done_signals) + return signal + + def reset(self): + for done_function in self.dones.values(): + done_function.reset() + + +class MadrasDone(object): + """Base class of MADRaS done function classes. 
+ Any new done class must inherit this class and implement + the following methods: + - [required] check_done(game_config, game_state) + - [optional] reset() + """ + def __init__(self, reason): + self.reason = reason + pass + + def check_done(self, game_config, game_state): + del game_config, game_state + raise NotImplementedError("Successor class must implement this method.") + + def reset(self): + pass + + +class TorcsDone(MadrasDone): + """Vanilla done function provided by TORCS.""" + + def __init__(self): + MadrasDone.__init__(self, "Done: Torcs Done") + + def check_done(self, game_config, game_state): + del game_config + if not math.isnan(game_state["torcs_done"]): + return game_state["torcs_done"] + else: + return True + + +class RaceOver(MadrasDone): + """Terminates episode when the agent has finishes one lap.""" + def __init__(self): + MadrasDone.__init__(self, "Done: Race over!") + + def check_done(self, game_config, game_state): + if game_state["distance_traversed"] >= game_config.track_len: + return True + else: + return False + + +class TimeOut(MadrasDone): + + def __init__(self): + MadrasDone.__init__(self, "Done: Episode terminated due to timeout.") + + def check_done(self, game_config, game_state): + self.num_steps = game_state["num_steps"] + if not game_config.max_steps: + max_steps = int(game_config.track_len / game_config.target_speed * 50) + else: + max_steps = game_config.max_steps + if self.num_steps >= max_steps: + return True + else: + return False + + +class Collision(MadrasDone): + + def __init__(self): + self.damage = 0.0 + MadrasDone.__init__(self, "Done: Episode terminated because agent collided.") + + def check_done(self, game_config, game_state): + del game_config + if self.damage < game_state["damage"]: + self.damage = 0.0 + return True + else: + return False + + def reset(self): + self.damage = 0.0 + + +class TurnBackward(MadrasDone): + + def __init__(self): + MadrasDone.__init__(self, "Done: Episode terminated because agent turned backward.") + + def check_done(self, game_config, game_state): + del game_config + if np.cos(game_state["angle"]) < 0: + return True + else: + return False + + +class OutOfTrack(MadrasDone): + + def __init__(self): + MadrasDone.__init__(self, "Done: Episode terminated because agent went out of track" + " after {} steps.") + + def check_done(self, game_config, game_state): + self.num_steps = game_state["num_steps"] + if (game_state["trackPos"] < -1 or game_state["trackPos"] > 1 + or np.any(np.asarray(game_state["track"]) < 0)): + return True + else: + return False + + +class Rank1(MadrasDone): + + def __init__(self): + MadrasDone.__init__(self, "Done: Episode terminated because agent is Rank 1" + " after {} steps.") + + def check_done(self, game_config, game_state): + self.num_steps = game_state["num_steps"] + if game_state["racePos"] == 1: + return True + else: + return False \ No newline at end of file diff --git a/MADRaS/utils/gym_torcs.py b/MADRaS/utils/gym_torcs.py index af0d3bc..f0cd129 100644 --- a/MADRaS/utils/gym_torcs.py +++ b/MADRaS/utils/gym_torcs.py @@ -13,10 +13,10 @@ from gym import spaces import numpy as np import copy -import utils.snakeoil3_gym as snakeoil3 -import utils.madras_datatypes as md +import MADRaS.utils.snakeoil3_gym as snakeoil3 +import MADRaS.utils.madras_datatypes as md -madras = md.MadrasDatatypes() +mt = md.MadrasDatatypes() class TorcsEnv: @@ -40,8 +40,8 @@ def __init__(self, vision=False, throttle=False, gear_change=False, obs_dim=29, if throttle is False: # Throttle is generally True 
self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(1,)) else: - high = np.array([1., 1., 1.], dtype=madras.floatX) - low = np.array([-1., 0., 0.], dtype=madras.floatX) + high = np.array([1., 1., 1.], dtype=mt.floatX) + low = np.array([-1., 0., 0.], dtype=mt.floatX) self.action_space = spaces.Box(low=low, high=high) # steer,accel,brake if vision is False: # Vision has to be set True if you need the images from the simulator @@ -49,8 +49,8 @@ def __init__(self, vision=False, throttle=False, gear_change=False, obs_dim=29, low = -high self.observation_space = spaces.Box(low, high) else: - high = np.array([1., np.inf, np.inf, np.inf, 1., np.inf, 1., np.inf, 255], dtype=madras.floatX) - low = np.array([0., -np.inf, -np.inf, -np.inf, 0., -np.inf, 0., -np.inf, 0], dtype=madras.floatX) + high = np.array([1., np.inf, np.inf, np.inf, 1., np.inf, 1., np.inf, 255], dtype=mt.floatX) + low = np.array([0., -np.inf, -np.inf, -np.inf, 0., -np.inf, 0., -np.inf, 0], dtype=mt.floatX) self.observation_space = spaces.Box(low=low, high=high) @@ -251,19 +251,21 @@ def make_observation(self, raw_obs): 'rpm', 'track', 'trackPos', - 'wheelSpinVel'] + 'wheelSpinVel', + 'distFromStart'] Observation = col.namedtuple('Observation', names) - return Observation(focus=np.array(raw_obs['focus'], dtype=madras.floatX)/200., - speedX=np.array(raw_obs['speedX'], dtype=madras.floatX)/self.default_speed, - speedY=np.array(raw_obs['speedY'], dtype=madras.floatX)/self.default_speed, - speedZ=np.array(raw_obs['speedZ'], dtype=madras.floatX)/self.default_speed, - angle=np.array(raw_obs['angle'], dtype=madras.floatX)/3.1416, - damage=np.array(raw_obs['damage'], dtype=madras.floatX), - opponents=np.array(raw_obs['opponents'], dtype=madras.floatX)/200., - rpm=np.array(raw_obs['rpm'], dtype=madras.floatX)/10000, - track=np.array(raw_obs['track'], dtype=madras.floatX)/200., - trackPos=np.array(raw_obs['trackPos'], dtype=madras.floatX)/1., - wheelSpinVel=np.array(raw_obs['wheelSpinVel'], dtype=madras.floatX)) + return Observation(focus=np.array(raw_obs['focus'], dtype=mt.floatX)/200., + speedX=np.array(raw_obs['speedX'], dtype=mt.floatX)/self.default_speed, + speedY=np.array(raw_obs['speedY'], dtype=mt.floatX)/self.default_speed, + speedZ=np.array(raw_obs['speedZ'], dtype=mt.floatX)/self.default_speed, + angle=np.array(raw_obs['angle'], dtype=mt.floatX)/3.1416, + damage=np.array(raw_obs['damage'], dtype=mt.floatX), + opponents=np.array(raw_obs['opponents'], dtype=mt.floatX)/200., + rpm=np.array(raw_obs['rpm'], dtype=mt.floatX)/10000, + track=np.array(raw_obs['track'], dtype=mt.floatX)/200., + trackPos=np.array(raw_obs['trackPos'], dtype=mt.floatX)/1., + wheelSpinVel=np.array(raw_obs['wheelSpinVel'], dtype=mt.floatX), + distFromStart=np.array(raw_obs['distFromStart'], dtype=mt.floatX)) else: names = ['focus', 'speedX', 'speedY', 'speedZ', 'angle', @@ -278,13 +280,13 @@ def make_observation(self, raw_obs): # Get RGB from observation image_rgb = self.obs_vision_to_image_rgb(raw_obs[names[8]]) - return Observation(focus=np.array(raw_obs['focus'], dtype=madras.floatX)/200., - speedX=np.array(raw_obs['speedX'], dtype=madras.floatX)/self.default_speed, - speedY=np.array(raw_obs['speedY'], dtype=madras.floatX)/self.default_speed, - speedZ=np.array(raw_obs['speedZ'], dtype=madras.floatX)/self.default_speed, - opponents=np.array(raw_obs['opponents'], dtype=madras.floatX)/200., - rpm=np.array(raw_obs['rpm'], dtype=madras.floatX), - track=np.array(raw_obs['track'], dtype=madras.floatX)/200., - trackPos=np.array(raw_obs['trackPos'], 
dtype=madras.floatX)/1., - wheelSpinVel=np.array(raw_obs['wheelSpinVel'], dtype=madras.floatX), + return Observation(focus=np.array(raw_obs['focus'], dtype=mt.floatX)/200., + speedX=np.array(raw_obs['speedX'], dtype=mt.floatX)/self.default_speed, + speedY=np.array(raw_obs['speedY'], dtype=mt.floatX)/self.default_speed, + speedZ=np.array(raw_obs['speedZ'], dtype=mt.floatX)/self.default_speed, + opponents=np.array(raw_obs['opponents'], dtype=mt.floatX)/200., + rpm=np.array(raw_obs['rpm'], dtype=mt.floatX), + track=np.array(raw_obs['track'], dtype=mt.floatX)/200., + trackPos=np.array(raw_obs['trackPos'], dtype=mt.floatX)/1., + wheelSpinVel=np.array(raw_obs['wheelSpinVel'], dtype=mt.floatX), img=image_rgb) diff --git a/MADRaS/utils/gym_torcs_v2.py b/MADRaS/utils/gym_torcs_v2.py index c255504..2d89eec 100644 --- a/MADRaS/utils/gym_torcs_v2.py +++ b/MADRaS/utils/gym_torcs_v2.py @@ -13,10 +13,10 @@ from gym import spaces import numpy as np import copy -import utils.snakeoil3_gym_v2 as snakeoil3 -import utils.madras_datatypes as md +import MADRaS.utils.snakeoil3_gym_v2 as snakeoil3 +import MADRaS.utils.madras_datatypes as md -madras = md.MadrasDatatypes() +mt = md.MadrasDatatypes() class TorcsEnv: @@ -38,8 +38,8 @@ def __init__(self, vision=False, throttle=False, gear_change=False, obs_dim=29, if throttle is False: # Throttle is generally True self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(1,)) else: - high = np.array([1., 1., 1.], dtype=madras.floatX) - low = np.array([-1., 0., 0.], dtype=madras.floatX) + high = np.array([1., 1., 1.], dtype=mt.floatX) + low = np.array([-1., 0., 0.], dtype=mt.floatX) self.action_space = spaces.Box(low=low, high=high) # steer,accel,brake if vision is False: # Vision has to be set True if you need the images from the simulator @@ -47,8 +47,8 @@ def __init__(self, vision=False, throttle=False, gear_change=False, obs_dim=29, low = -high self.observation_space = spaces.Box(low, high) else: - high = np.array([1., np.inf, np.inf, np.inf, 1., np.inf, 1., np.inf, 255], dtype=madras.floatX) - low = np.array([0., -np.inf, -np.inf, -np.inf, 0., -np.inf, 0., -np.inf, 0], dtype=madras.floatX) + high = np.array([1., np.inf, np.inf, np.inf, 1., np.inf, 1., np.inf, 255], dtype=mt.floatX) + low = np.array([0., -np.inf, -np.inf, -np.inf, 0., -np.inf, 0., -np.inf, 0], dtype=mt.floatX) self.observation_space = spaces.Box(low=low, high=high) @@ -59,7 +59,6 @@ def terminate(self): def step(self, step, client, u, early_stop=1): - print("In STEP of {}: My server is {}".format(self.name, client.serverPID)) reward = 0 this_action = self.agent_to_torcs(u) @@ -110,7 +109,6 @@ def step(self, step, client, u, early_stop=1): # Save the previous full-obs from torcs for the reward calculation obs_pre = copy.deepcopy(client.S.d) - # One-Step Dynamics Update ################################# # Apply the Agent's action into torcs client.respond_to_server() @@ -119,9 +117,10 @@ def step(self, step, client, u, early_stop=1): code = client.get_servers_input(step) if code==-1: + obs_pre_ret = self.make_observation(obs_pre) client.R.d['meta'] = True print('Terminating because server stopped responding') - return obs_pre, 0, client.R.d['meta'], {'termination_cause':'hardReset'} + return obs_pre_ret, 0, client.R.d['meta'], {'termination_cause':'hardReset'} # Get the current full-observation from torcs obs = client.S.d @@ -225,19 +224,21 @@ def make_observation(self, raw_obs): 'rpm', 'track', 'trackPos', - 'wheelSpinVel'] + 'wheelSpinVel', + 'distFromStart'] Observation = 
col.namedtuple('Observation', names) - return Observation(focus=np.array(raw_obs['focus'], dtype=madras.floatX)/200., - speedX=np.array(raw_obs['speedX'], dtype=madras.floatX)/self.default_speed, - speedY=np.array(raw_obs['speedY'], dtype=madras.floatX)/self.default_speed, - speedZ=np.array(raw_obs['speedZ'], dtype=madras.floatX)/self.default_speed, - angle=np.array(raw_obs['angle'], dtype=madras.floatX)/3.1416, - damage=np.array(raw_obs['damage'], dtype=madras.floatX), - opponents=np.array(raw_obs['opponents'], dtype=madras.floatX)/200., - rpm=np.array(raw_obs['rpm'], dtype=madras.floatX)/10000, - track=np.array(raw_obs['track'], dtype=madras.floatX)/200., - trackPos=np.array(raw_obs['trackPos'], dtype=madras.floatX)/1., - wheelSpinVel=np.array(raw_obs['wheelSpinVel'], dtype=madras.floatX)) + return Observation(focus=np.array(raw_obs['focus'], dtype=mt.floatX)/200., + speedX=np.array(raw_obs['speedX'], dtype=mt.floatX)/self.default_speed, + speedY=np.array(raw_obs['speedY'], dtype=mt.floatX)/self.default_speed, + speedZ=np.array(raw_obs['speedZ'], dtype=mt.floatX)/self.default_speed, + angle=np.array(raw_obs['angle'], dtype=mt.floatX)/3.1416, + damage=np.array(raw_obs['damage'], dtype=mt.floatX), + opponents=np.array(raw_obs['opponents'], dtype=mt.floatX)/200., + rpm=np.array(raw_obs['rpm'], dtype=mt.floatX)/10000, + track=np.array(raw_obs['track'], dtype=mt.floatX)/200., + trackPos=np.array(raw_obs['trackPos'], dtype=mt.floatX)/1., + wheelSpinVel=np.array(raw_obs['wheelSpinVel'], dtype=mt.floatX), + distFromStart=np.array(raw_obs['distFromStart'], dtype=mt.floatX)) else: names = ['focus', 'speedX', 'speedY', 'speedZ', 'angle', @@ -252,13 +253,13 @@ def make_observation(self, raw_obs): # Get RGB from observation image_rgb = self.obs_vision_to_image_rgb(raw_obs[names[8]]) - return Observation(focus=np.array(raw_obs['focus'], dtype=madras.floatX)/200., - speedX=np.array(raw_obs['speedX'], dtype=madras.floatX)/self.default_speed, - speedY=np.array(raw_obs['speedY'], dtype=madras.floatX)/self.default_speed, - speedZ=np.array(raw_obs['speedZ'], dtype=madras.floatX)/self.default_speed, - opponents=np.array(raw_obs['opponents'], dtype=madras.floatX)/200., - rpm=np.array(raw_obs['rpm'], dtype=madras.floatX), - track=np.array(raw_obs['track'], dtype=madras.floatX)/200., - trackPos=np.array(raw_obs['trackPos'], dtype=madras.floatX)/1., - wheelSpinVel=np.array(raw_obs['wheelSpinVel'], dtype=madras.floatX), + return Observation(focus=np.array(raw_obs['focus'], dtype=mt.floatX)/200., + speedX=np.array(raw_obs['speedX'], dtype=mt.floatX)/self.default_speed, + speedY=np.array(raw_obs['speedY'], dtype=mt.floatX)/self.default_speed, + speedZ=np.array(raw_obs['speedZ'], dtype=mt.floatX)/self.default_speed, + opponents=np.array(raw_obs['opponents'], dtype=mt.floatX)/200., + rpm=np.array(raw_obs['rpm'], dtype=mt.floatX), + track=np.array(raw_obs['track'], dtype=mt.floatX)/200., + trackPos=np.array(raw_obs['trackPos'], dtype=mt.floatX)/1., + wheelSpinVel=np.array(raw_obs['wheelSpinVel'], dtype=mt.floatX), img=image_rgb) diff --git a/MADRaS/envs/observation_manager.py b/MADRaS/utils/observation_handler.py similarity index 72% rename from MADRaS/envs/observation_manager.py rename to MADRaS/utils/observation_handler.py index 810b181..d212c3f 100644 --- a/MADRaS/envs/observation_manager.py +++ b/MADRaS/utils/observation_handler.py @@ -1,10 +1,10 @@ import numpy as np from gym import spaces -import utils.madras_datatypes as md +import MADRaS.utils.madras_datatypes as md -MadrasDatatypes = md.MadrasDatatypes() +mt 
= md.MadrasDatatypes() -class ObservationManager(object): +class ObservationHandler(object): """Composes the observation vector for a given observation mode.""" def __init__(self, cfg, vision=False): self.cfg = cfg @@ -15,10 +15,13 @@ def __init__(self, cfg, vision=False): except: raise ValueError("Unrecognized observation mode {}".format(cfg["mode"])) - def get_obs(self, full_obs, game_config): + def get_obs(self, full_obs, game_config, full_flag=False): if self.cfg["normalize"]: full_obs = self.normalize_obs(full_obs, game_config) - return self.obs.get_obs(full_obs) + if not full_flag: + return self.obs.get_obs(full_obs) + else: + return self.obs.get_obs(full_obs), full_obs def min_max_normalize(self, var, min, max): offset = (min + max) / 2.0 @@ -82,10 +85,10 @@ def observation_space(self, vision): else: high = np.array([1., np.inf, np.inf, np.inf, 1., np.inf, 1., np.inf, 255], - dtype=MadrasDatatypes.floatX) + dtype=mt.floatX) low = np.array([0., -np.inf, -np.inf, -np.inf, 0., -np.inf, 0., -np.inf, 0], - dtype=MadrasDatatypes.floatX) + dtype=mt.floatX) observation_space = spaces.Box(low=low, high=high) return observation_space @@ -110,9 +113,38 @@ def observation_space(self, vision): if vision: raise NotImplementedError("Vision inputs not supported yet.") high = np.asarray([1] + 19*[1] + [np.inf] + 3*[np.inf], - dtype=MadrasDatatypes.floatX) + dtype=mt.floatX) low = np.asarray([-1] + 19*[0] + [-np.inf] + 3*[-np.inf], - dtype=MadrasDatatypes.floatX) + dtype=mt.floatX) + + observation_space = spaces.Box(low=low, high=high) + + return observation_space + + +class SingleAgentInTrafficObs(MadrasObs): + def get_obs(self, full_obs): + track = [(x if x > 0 else 0) for x in full_obs.track] + obs = np.hstack((full_obs.angle, + track, + full_obs.trackPos, + full_obs.speedX, + full_obs.speedY, + full_obs.speedZ, + full_obs.opponents)) + return obs + + @property + def observation_dim(self): + return 60 + + def observation_space(self, vision): + if vision: + raise NotImplementedError("Vision inputs not supported yet.") + high = np.asarray([1] + 19*[1] + [np.inf] + 3*[np.inf] + 36*[1], + dtype=mt.floatX) + low = np.asarray([-1] + 19*[0] + [-np.inf] + 3*[-np.inf] + 36*[0], + dtype=mt.floatX) observation_space = spaces.Box(low=low, high=high) diff --git a/MADRaS/envs/reward_manager.py b/MADRaS/utils/reward_handler.py similarity index 73% rename from MADRaS/envs/reward_manager.py rename to MADRaS/utils/reward_handler.py index 7ef3684..0b243b7 100644 --- a/MADRaS/envs/reward_manager.py +++ b/MADRaS/utils/reward_handler.py @@ -4,7 +4,7 @@ from copy import deepcopy -class RewardManager(object): +class RewardHandler(object): """Composes the reward function from a given reward configuration.""" def __init__(self, cfg): self.rewards = {} @@ -79,7 +79,7 @@ def reset(self): class ProgressReward2(ProgressReward): def compute_reward(self, game_config, game_state): target_speed = game_config.target_speed / 50 # m/step - progress = game_state["distance_traversed"] - self.prev_dist + progress = game_state["distance_traversed"] - game_state["prev_distance_traversed"] reward = self.cfg["scale"] * np.min([1.0, progress/target_speed]) self.prev_dist = deepcopy(game_state["distance_traversed"]) return reward @@ -87,9 +87,14 @@ def compute_reward(self, game_config, game_state): class ProgressReward3(ProgressReward): def compute_reward(self, game_config, game_state): - target_speed = game_config.target_speed / 50 # m/step - progress = game_state["distance_traversed"] - self.prev_dist + #print("CURRENT {} PREV {} DIFF 
{}".format(game_state["distance_traversed"], self.prev_dist, game_state["distance_traversed"] - self.prev_dist)) + target_speed = game_config.target_speed / 50.0 # m/step + progress = game_state["distance_traversed"] - game_state["prev_distance_traversed"] reward = self.cfg["scale"] * (progress/target_speed) + + #if (reward > 0.6): + # print("CURRENT {} PREV {} DIFF {}".format(game_state["distance_traversed"], game_state["prev_distance_traversed"], game_state["distance_traversed"] - game_state["prev_distance_traversed"])) + # print("PROGRESS {} rew {}".format(progress, progress/target_speed)) self.prev_dist = deepcopy(game_state["distance_traversed"]) return reward @@ -117,7 +122,7 @@ def __init__(self, cfg): def compute_reward(self, game_config, game_state): del game_config reward = 0.0 - if self.damage < game_state["damage"]: + if game_state["prev_damage"] < game_state["damage"]: reward = -self.cfg["scale"] return reward @@ -153,4 +158,37 @@ def compute_reward(self, game_config, game_state): else: reward = 0 self.prev_angles = self.prev_angles[1:] + return reward + + +class SuccessfulOvertakeReward(MadrasReward): + def __init__(self, cfg): + self.rank = np.inf + super(SuccessfulOvertakeReward, self).__init__(cfg) + + def reset(self): + self.rank = np.inf + + def compute_reward(self, game_config, game_state): + reward = 0.0 + if math.isinf(game_state["racePos"]): # very fist step + return reward + if game_state["racePos"] < game_state["prev_racePos"]: + #self.rank = game_state["racePos"] + reward = self.cfg["scale"] + return reward + +class TaskReward(MadrasReward): + def __init__(self, cfg): + super(TaskReward, self).__init__(cfg) + self.got_reward = False + + def reset(self): + self.got_reward = False + + def compute_reward(self, game_config, game_state): + reward = 0 + if ((game_state["racePos"] <= 2) and (not self.got_reward)): + reward = self.cfg["scale"] * 100 + self.got_reward = False return reward \ No newline at end of file diff --git a/MADRaS/utils/snakeoil3_gym_v2.py b/MADRaS/utils/snakeoil3_gym_v2.py index fb74027..06bc42b 100644 --- a/MADRaS/utils/snakeoil3_gym_v2.py +++ b/MADRaS/utils/snakeoil3_gym_v2.py @@ -205,6 +205,7 @@ def setup_connection(self): sockdata, addr = self.so.recvfrom(data_size) sockdata = sockdata.decode('utf-8') except socket.error as emsg: + #print("[{}]: SocketData: {}".format(self.name, sockdata)) print("[{}]: SocketError: {}".format(self.name, emsg)) print("{} Waiting for server on {}............".format(self.name, self.port)) print("[{}]: Count Down : {}".format(self.name, n_fail)) @@ -275,18 +276,18 @@ def get_servers_input(self, step): sockdata, addr = self.so.recvfrom(data_size) sockdata = sockdata.decode('utf-8') except socket.error as emsg: + print("[{}]: SocketData: {}".format(self.name, sockdata)) print('[{}]: SocketError: {}'.format(self.name, emsg)) print("{} Waiting for server data on {}..............".format(self.name, self.port)) print("[{}]: Server count down : {}".format(self.name, n_fail)) - if n_fail < 0: - self.shutdown() - return -1 - n_fail = n_fail_org + # if n_fail < 0: + # self.shutdown() + # return -1 + # n_fail = n_fail_org n_fail -= 1 - print("[{}]: My server PID is {}".format(self.name, self.serverPID)) if '***identified***' in sockdata: print("{} Client connected on {}..............".format(self.name, self.port)) continue @@ -381,7 +382,7 @@ def fancyout(self): # 'gear', 'distRaced', 'distFromStart', - # 'racePos', + 'racePos', 'opponents', 'wheelSpinVel', 'z', diff --git a/MADRaS/utils/torcs_server_config.py 
b/MADRaS/utils/torcs_server_config.py new file mode 100644 index 0000000..19e1d0e --- /dev/null +++ b/MADRaS/utils/torcs_server_config.py @@ -0,0 +1,129 @@ +import numpy as np +import random +import os +import logging +import MADRaS.utils.data.track_details as track_details +logger = logging.getLogger(__name__) + +path_and_file = os.path.realpath(__file__) +path, _ = os.path.split(path_and_file) + +QUICKRACE_TEMPLATE_PATH = os.path.join(path, "data", "quickrace.template") +CAR_CONFIG_TEMPATE_PATH = os.path.join(path, "data", "car_config.template") +SCR_SERVER_CONFIG_TEMPLATE_PATH = os.path.join(path, "data", "scr_server_config.template") + +MAX_NUM_CARS = 10 # There are 10 scr-servers in the TORCS GUI + +DIRT_TRACK_NAMES = [ + "dirt-1", + "dirt-2", + "dirt-3", + "dirt-4", + "dirt-5", + "dirt-6", + "mixed-1", + "mixed-2" +] + +ROAD_TRACK_NAMES = [ + "aalborg", + "alpine-1", + "alpine-2", + "brondehach", + "g-track-1", + "g-track-2", + "g-track-3", + "corkscrew", + "eroad", + # "e-track-1", # Segmentation fault from torcs: no track observations after about 1000 steps + "e-track-2", + "e-track-3", + "e-track-4", + "e-track-6", + "forza", + "ole-road-1", + "ruudskogen", + "spring", + "street-1", + "wheel-1", + "wheel-2" +] + + + +class TorcsConfig(object): + def __init__(self, cfg, num_learning_cars, randomize=False): + self.max_cars = cfg["max_cars"] if "max_cars" in cfg else MAX_NUM_CARS + self.min_traffic_cars = cfg["min_traffic_cars"] if "min_traffic_cars" in cfg else 0 + self.track_category = cfg["track_category"] if "track_category" in cfg else "road" + self.num_learning_cars = num_learning_cars + self.track_names = cfg["track_names"] if "track_names" in cfg else ROAD_TRACK_NAMES + self.distance_to_start = cfg["distance_to_start"] if "distance_to_start" in cfg else 0 + self.torcs_server_config_dir = (cfg["torcs_server_config_dir"] if "torcs_server_config_dir" in cfg + else "/home/sohan/.torcs/config/raceman/") + self.scr_server_config_dir = (cfg["scr_server_config_dir"] if "scr_server_config_dir" in cfg + else "/home/sohan/usr/local/share/games/torcs/drivers/scr_server/") + with open(QUICKRACE_TEMPLATE_PATH, 'r') as f: + self.quickrace_template = f.read() + with open(CAR_CONFIG_TEMPATE_PATH, 'r') as f: + self.car_config_template = f.read() + with open(SCR_SERVER_CONFIG_TEMPLATE_PATH, 'r') as f: + self.scr_server_config_template = f.read() + self.randomize = randomize + self.quickrace_xml_path = os.path.join(self.torcs_server_config_dir, "quickrace.xml") + self.scr_server_xml_path = os.path.join(self.scr_server_config_dir, "scr_server.xml") + self.traffic_car_type = cfg['traffic_car'] if 'traffic_car' in cfg else 'car1-trb1' + self.learning_car_types = cfg['learning_car'] + self.track_length = 0 + self.track_width = 0 + + def get_num_traffic_cars(self): + if not self.randomize: + return self.max_cars-self.num_learning_cars + else: + if (self.min_traffic_cars == (self.max_cars-self.num_learning_cars)): + return 0 + num_traffic_cars = np.random.randint(low=self.min_traffic_cars, high=self.max_cars-self.num_learning_cars+1) + return num_traffic_cars + + def get_track_name(self): + if not self.randomize: + track_name = self.track_names[0] + else: + track_name = random.sample(self.track_names, 1)[0] + logging.info("-------------------------CURRENT TRACK:{}------------------------".format(track_name)) + self.track_length = track_details.track_lengths[track_name] + self.track_width = track_details.track_widths[track_name] + return track_name + + def get_learning_car_type(self): + if not 
self.randomize: + learning_car_types = self.learning_car_types[:self.num_learning_cars] + else: + learning_car_types = random.sample(self.learning_car_types, self.num_learning_cars) + for i, car in enumerate(learning_car_types): + logging.info("-------------------------LEARNING CAR {}:{}------------------------".format(i, car)) + return learning_car_types + + def generate_torcs_server_config(self): + self.num_traffic_cars = self.get_num_traffic_cars() + logging.info("-----------------------Num. Traffic Cars:{}-----------------------".format(self.num_traffic_cars)) + + car_config = "\n".join(self.car_config_template.format( + **{"section_no": i+1, "car_no": i}) for i in range( + self.num_traffic_cars + self.num_learning_cars)) + context = { + "track_name": self.get_track_name(), + "track_category": self.track_category, + "distance_to_start": self.distance_to_start, + "car_config": car_config + } + torcs_server_config = self.quickrace_template.format(**context) + with open(self.quickrace_xml_path, "w") as f: + f.write(torcs_server_config) + self.learning_car_types = self.get_learning_car_type() + car_name_list = ([self.traffic_car_type]*self.num_traffic_cars + self.learning_car_types + + [self.traffic_car_type]*(MAX_NUM_CARS-self.num_traffic_cars-self.num_learning_cars)) + scr_server_config = self.scr_server_config_template.format(*car_name_list) + with open(self.scr_server_xml_path, "w") as f: + f.write(scr_server_config) \ No newline at end of file diff --git a/setup.py b/setup.py index 751a2d3..0c66f0a 100644 --- a/setup.py +++ b/setup.py @@ -2,10 +2,10 @@ from setuptools import setup setup(name='MADRaS', - version='0.1', + version='0.2', description='Multi Agent Driving Simulator', install_requires=['gym', 'pyyaml', 'pTable', 'matplotlib', - 'tensorflow']) + 'mpi4py'])
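A closing note on the done_handler_v2.py module added above: DoneHandler treats each key of its configuration as the name of a MadrasDone subclass and instantiates it with no arguments, so a new termination condition only needs a subclass implementing check_done (and, optionally, reset) that is defined in, or imported into, done_handler_v2.py so the name lookup can find it. A minimal sketch follows; the class name and threshold are hypothetical, not part of the diff:

# Hypothetical done condition in the style of done_handler_v2.py.
# (Place this in, or import it into, MADRaS/utils/done_handler_v2.py.)
class MaxDamageDone(MadrasDone):
    """Ends the episode once the car's damage reading crosses a threshold."""

    def __init__(self):
        self.threshold = 100.0  # illustrative; DoneHandler calls the constructor with no arguments
        MadrasDone.__init__(self, "Done: Episode terminated because damage exceeded "
                                  "{}.".format(self.threshold))

    def check_done(self, game_config, game_state):
        del game_config  # unused, mirroring the other done classes
        return game_state["damage"] > self.threshold

Enabling it would then be a matter of adding a "MaxDamageDone" key to the done configuration that DoneHandler receives.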