Parallel-Hydrone-DRL/agent.py at main · alikolling/Parallel-Hydrone-DRL · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
#! /usr/bin/env python3

import rospy
from utils.utils import OUNoise, empty_torch_queue, test_goals
from collections import deque
import gym_hydrone
import numpy as np
import torch
import time
import gym
import os


class Agent(object):
    """Actor worker that plays episodes and feeds N-step transitions to a replay queue.

    Each agent creates its own gym environment inside ``run``, selects actions
    with the shared ``policy``, accumulates N-step discounted returns in a local
    deque and, when it is an ``"exploration"`` agent, pushes the resulting
    transitions to the replay queue and periodically pulls fresh weights from
    the learner.

    Args:
        config: Dict of settings. Keys read by this class: ``max_ep_length``,
            ``num_episode_save``, ``n_step_return``, ``discount_rate``,
            ``update_agent_ep``, ``action_dim``, ``device``, ``env_name``,
            ``env_stage``, ``test``, ``test_trials``, ``model``, ``num_agents``,
            ``num_episodes``, ``save_reward_threshold``, ``dense_size`` and
            ``replay_memory_prioritized``.
        policy: Actor object exposing ``get_action``, ``parameters`` and
            ``state_dict``.
        global_episode: Shared counter exposing ``get_lock()`` and ``.value``.
        global_step: Shared counter exposing ``get_lock()`` and ``.value``.
        n_agent: Index of this agent; also offsets the ROS master port and the
            slot used in the shared ``logs`` array.
        agent_type: ``"exploration"`` or ``"exploitation"``.
        log_dir: Root directory under which checkpoints are written.
    """

    # Upper bound applied to every observation entry before it reaches the policy.
    STATE_CLIP_MAX = 2.5

    def __init__(self, config, policy, global_episode, global_step, n_agent=0, agent_type='exploration', log_dir=''):
        print(f"Initializing agent {n_agent}...")
        self.config = config
        self.action_low = [-0.25, -0.25, -0.25]
        self.action_high = [0.25, 0.25, 0.25]
        self.n_agent = n_agent
        self.agent_type = agent_type
        self.max_steps = config['max_ep_length']  # maximum number of steps per episode
        self.num_episode_save = config['num_episode_save']
        self.global_episode = global_episode
        self.global_step = global_step
        self.local_episode = 0
        self.log_dir = log_dir
        # number of future steps to collect experiences for N-step returns
        self.n_step_returns = config['n_step_return']
        self.discount_rate = config['discount_rate']  # Discount rate (gamma) for future rewards
        # agent gets latest parameters from learner every update_agent_ep episodes
        self.update_agent_ep = config['update_agent_ep']

        # Initialise deque buffer to store experiences for N-step returns
        self.exp_buffer = deque()

        # Exploration noise added to the actions of exploration agents
        self.ou_noise = OUNoise(dim=config['action_dim'], low=self.action_low, high=self.action_high)
        self.ou_noise.reset()

        self.actor = policy
        print("Started agent", n_agent, "using", config['device'])

    def update_actor_learner(self, learner_w_queue, training_on):
        """Copy the newest learner weights into the local actor, if any are queued.

        Does nothing when training has been switched off or when no weights can
        be fetched from the queue without blocking.

        Args:
            learner_w_queue: Queue whose items are iterables of parameter values,
                ordered like ``self.actor.parameters()``.
            training_on: Shared flag exposing ``.value``.
        """
        if not training_on.value:
            return
        try:
            source = learner_w_queue.get_nowait()
        except Exception:
            # Best effort: an empty (or already closed) queue just means there is
            # nothing to load. KeyboardInterrupt/SystemExit still propagate.
            return
        for target_param, source_param in zip(self.actor.parameters(), source):
            # as_tensor accepts arrays, lists and tensors without the
            # copy-construct warning that torch.tensor() emits for tensors.
            weights = torch.as_tensor(source_param).float()
            target_param.data.copy_(weights)
        del source

    def _clip_state(self, state):
        """Clamp, in place, every entry of ``state`` to at most ``STATE_CLIP_MAX``."""
        for idx, value in enumerate(state):
            if value > self.STATE_CLIP_MAX:
                state[idx] = self.STATE_CLIP_MAX

    def _select_action(self, state, num_steps):
        """Query the policy for an action and clip it to the action bounds.

        ``PDSRL``/``SAC`` policies receive a tensor on ``config['device']`` plus
        an ``exploitation`` flag. Every other model receives a numpy array, and
        exploration agents additionally pass the result through the OU noise
        process (``num_steps`` is forwarded to it).
        """
        if self.config['model'] in ('PDSRL', 'SAC'):
            action = self.actor.get_action(torch.Tensor(state).to(self.config['device']),
                                           exploitation=self.agent_type == "exploitation")
            action = action.detach().cpu().numpy().flatten()
        else:
            action = self.actor.get_action(np.array(state))
            if self.agent_type == "exploration":
                action = action.squeeze(0)
                action = self.ou_noise.get_action(action, num_steps)
            else:
                action = action.detach().cpu().numpy().flatten()
        for idx, (low, high) in enumerate(zip(self.action_low, self.action_high)):
            action[idx] = np.clip(action[idx], low, high)
        return action

    def _emit_transition(self, replay_queue, next_state, done):
        """Pop the oldest buffered step and send its N-step transition to the replay queue.

        The oldest ``(state, action, reward)`` is removed from ``exp_buffer`` and
        its reward is extended with the discounted rewards of the steps still
        buffered. The transition is
        ``[state, action, discounted_reward, next_state, done, gamma]`` where
        ``gamma`` is ``discount_rate`` raised to the number of rewards folded in.

        The step is always popped so the buffer stays bounded, but only
        exploration agents enqueue the transition, and a failed put is dropped.
        """
        state_0, action_0, discounted_reward = self.exp_buffer.popleft()
        gamma = self.discount_rate
        for (_, _, reward_i) in self.exp_buffer:
            discounted_reward += reward_i * gamma
            gamma *= self.discount_rate
        # Only exploration agents fill the replay buffer
        if self.agent_type == "exploration":
            try:
                replay_queue.put_nowait([state_0, action_0, discounted_reward, next_state, done, gamma])
            except Exception:
                # Best effort: drop the sample when the queue is full rather than
                # stalling the environment loop.
                pass

    def run(self, training_on, replay_queue, learner_w_queue, logs):
        """Run episodes until training stops (or the test trials are exhausted).

        Args:
            training_on: Shared flag exposing ``.value``; the training loop runs
                while it is truthy.
            replay_queue: Queue receiving N-step transitions from exploration
                agents.
            learner_w_queue: Queue from which updated actor weights are pulled.
            logs: Shared indexable exposing ``get_lock()``. In test mode slots
                0-2 hold episode reward, timing and episode number and slots 3-5
                the position returned by ``env.get_position()``; in training mode
                slots ``6 + 3 * n_agent`` onwards hold reward, timing and episode.
        """
        time.sleep(1)
        os.environ['ROS_MASTER_URI'] = "http://localhost:{}/".format(11311 + self.n_agent)
        rospy.init_node(self.config['env_name'].replace('-', '_') + "_w{}".format(self.n_agent))
        testing = bool(self.config['test'])
        goal = [test_goals(self.local_episode)] if testing else None
        env = gym.make(self.config['env_name'], env_stage=self.config['env_stage'], observation_mode=0, continuous=True, goal_list=goal)
        time.sleep(1)

        best_reward = -float("inf")
        # NOTE(review): local_episode starts at 0 and the bound is inclusive, so
        # test mode runs test_trials + 1 episodes — confirm that is intended.
        while (self.local_episode <= self.config['test_trials']) if testing else training_on.value:
            episode_reward = 0
            num_steps = 0
            self.local_episode += 1
            ep_start_time = time.time()
            goal = None
            if testing:
                goal = [test_goals(self.local_episode)]
                print("New Goal:", goal)
            state = env.reset(new_random_goals=not testing, goal=goal)
            if not testing:
                self.exp_buffer.clear()
                self.ou_noise.reset()
            done = False
            while not done:
                self._clip_state(state)
                action = self._select_action(state, num_steps)

                next_state, reward, done, info = env.step(action)
                episode_reward += reward

                if not testing:
                    self.exp_buffer.append((state, action, reward))

                    # We need at least N steps in the experience buffer before we can compute Bellman
                    # rewards and add an N-step experience to replay memory
                    if len(self.exp_buffer) >= self.n_step_returns:
                        self._emit_transition(replay_queue, next_state, done)

                state = next_state

                if done or num_steps == self.max_steps:
                    # add rest of experiences remaining in buffer
                    if not testing:
                        while self.exp_buffer:
                            self._emit_transition(replay_queue, next_state, done)
                    break

                num_steps += 1
                with self.global_step.get_lock():
                    self.global_step.value += 1

                if testing:
                    position = env.get_position()  # First three position components, used for the test charts
                    logs[3] = position[0]
                    logs[4] = position[1]
                    logs[5] = position[2]

            with self.global_episode.get_lock():
                self.global_episode.value += 1

            # Log metrics
            episode_timing = time.time() - ep_start_time
            print(f"Agent: [{self.n_agent}/{self.config['num_agents'] - 1}] Episode: [{self.local_episode}/"
                  f"{self.config['test_trials'] if self.config['test'] else self.config['num_episodes']}] Reward: "
                  f"[{episode_reward}/200] Step: {self.global_step.value} Episode Timing: {round(episode_timing, 2)}s")
            aux = 6 + self.n_agent * 3
            with logs.get_lock():
                if not testing:
                    logs[aux] = episode_reward
                    logs[aux+1] = episode_timing
                    logs[aux+2] = self.local_episode
                else:
                    logs[0] = episode_reward
                    logs[1] = episode_timing
                    logs[2] = self.local_episode

            # Saving agent
            if not testing:
                reward_outperformed = episode_reward - best_reward > self.config["save_reward_threshold"]
                time_to_save = self.local_episode % self.num_episode_save == 0

                # Only exploration agent number 1 writes checkpoints.
                if self.agent_type == "exploration" and self.n_agent == 1 and (time_to_save or reward_outperformed):
                    if episode_reward > best_reward:
                        best_reward = episode_reward
                    # NOTE(review): ':4f' is a minimum width of 4 with default precision;
                    # '.4f' may have been intended — left unchanged to keep checkpoint names stable.
                    self.save(f"local_episode_{self.local_episode}_reward_{best_reward:4f}")

                if self.agent_type == "exploration" and self.local_episode % self.update_agent_ep == 0:
                    self.update_actor_learner(learner_w_queue, training_on)

        if not testing:
            empty_torch_queue(replay_queue)
        print(f"Agent {self.n_agent} done.")

    def save(self, checkpoint_name):
        """Write the actor's ``state_dict`` to ``<process_dir>/<checkpoint_name>.pt``.

        ``process_dir`` lives under ``log_dir`` and is named after the model,
        dense size, number of agents, environment stage and whether prioritized
        replay is enabled. The directory is created when missing.
        """
        process_dir = f"{self.log_dir}/{self.config['model']}_{self.config['dense_size']}_A{self.config['num_agents']}_S{self.config['env_stage']}_{'P' if self.config['replay_memory_prioritized'] else 'N'}"
        # exist_ok avoids the check-then-create race between parallel processes.
        os.makedirs(process_dir, exist_ok=True)
        model_fn = f"{process_dir}/{checkpoint_name}.pt"
        torch.save(self.actor.state_dict(), model_fn)