raytune.py
# Hyperparameter Tuning Imports
# NOTE: activate the ray_env virtual environment before launching this script;
# the original `os.system("source ray_env/bin/activate")` call only ran in a
# throwaway subshell and had no effect on the current interpreter.
import os
import argparse
import numpy as np
import ray
import tensorflow as tf
from ray import air, tune
from ray.air.checkpoint import Checkpoint
from ray.tune import CLIReporter, Stopper
from ray.tune.schedulers import ASHAScheduler, HyperBandScheduler

from Agent.agent import PGAgent

# ray.init("auto")

class MyTrainableClass(tune.Trainable):
    def setup(self, config):
        self.config = config
        self.agent = PGAgent(config=config)
        self.episode_reward_history = []
        self.batch = 0
        self.max_batch = self.agent.max_episodes // self.agent.batch_size

    def step(self):
        # Calculate the current episodes
        episode = (self.batch + 1) * self.agent.batch_size
        # Increment the batch counter
        self.batch = self.batch + 1
        # Gather the trajectories
        episodes = self.agent.gather_trajectories()
        # Group states, actions and returns in numpy arrays
        # (make a single array of all states, actions and returns from all trajectories)
        states = np.concatenate([ep['states'] for ep in episodes])
        actions = np.concatenate([ep['actions'] for ep in episodes])
        rewards = [ep['rewards'] for ep in episodes]
        returns = np.concatenate([self.agent.compute_returns(ep_rwds) for ep_rwds in rewards])
        returns = np.array(returns, dtype=np.float32)
        id_action_pairs = np.array([[i, a] for i, a in enumerate(actions)])
        self.agent.reinforce(states, id_action_pairs, returns, episode)
        # Store collected rewards
        for ep_rwds in rewards:
            self.episode_reward_history.append(np.sum(ep_rwds))
        avg_rewards = np.mean(self.episode_reward_history[-10:])
        # with self.agent.writer.as_default():
        #     tf.summary.scalar("model/avg_rew", avg_rewards, step=episode)
        if avg_rewards >= self.agent.max_average:
            self.agent.Actor.save_weights(self.agent.agent_name + self.agent.env_name + ".h5")
        return {"mean_reward": avg_rewards, "episode": episode, "batch": self.batch}

    def reset_config(self, new_config):
        self.config.update(new_config)
        self.agent = PGAgent(config=new_config)
        return True

    def save_checkpoint(self, tmp_checkpoint_dir):
        checkpoint_path = os.path.join(tmp_checkpoint_dir, "model.h5")
        self.agent.Actor.save_weights(checkpoint_path)
        return tmp_checkpoint_dir

    def load_checkpoint(self, tmp_checkpoint_dir):
        checkpoint_path = os.path.join(tmp_checkpoint_dir, "model.h5")
        self.agent.Actor.load_weights(checkpoint_path)
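
# Note (not part of the original script): Ray Tune drives this class through the
# Trainable API: setup() runs once per trial, step() once per training iteration,
# and save_checkpoint()/load_checkpoint() around pauses and restores. A quick
# local smoke test outside of Tune could look like (hypothetical usage,
# uses the best_trial config defined in the __main__ block below):
#   trainable = MyTrainableClass(config=best_trial)
#   print(trainable.step()["mean_reward"])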

if __name__ == "__main__":
    # ray.init()
    # Search space
    best_trial = {
        "env_name": "CartPole-v1",  # Name of the environment
        "input_dim": 1,  # Input dimension of the environment (CartPole-v1 = 1)
        "agent_name": "PartitionedPQC Policy Gradient",  # Name of the agent
        # Hyperparameters for the training loop
        "max_episodes": 1200,  # Number of episodes to run
        "max_average": 500,  # Average score to reach to stop training
        "batch_size": 10,  # Number of trajectories to collect before each REINFORCE update
        # Hyperparameters for the dimensions of the PQC
        "n_partitions": 2,  # Number of partitions in the PQC
        "n_terms": 1,  # Number of terms to evaluate in the sum (run-time cost vs accuracy trade-off)
        "n_layers": 5,  # Depth of the PQC
        # "rescale_parameter": tune.grid_search([0.001, 0.1, 1]),  # Lambda rescale parameter search. WHAT SHOULD THIS BE?
        # Hyperparameters for the policy gradient algorithm
        "gamma": 1,  # Discount factor for the rewards
        "beta": 1,  # Regularization parameter for the PQC
        # Hyperparameters for the learning rates of the model
        "theta_lr": 0.002,  # Learning rate for the rotational gates (keep the same as the original paper)
        "zeta_lr": 0.0002,  # Learning rate for the zeta parameters
        "lambda_lr": 0.0002,  # Learning rate for the Adam optimizer (0.001 before)
        "x_l": False,  # Take the exponential of the lambda parameters? (this tends to make the model very unstable for some reason)
        "trainable_lambdas": True,
        "rescaling_scheme": "factoring",
        # Testing currently.
        "input_lr": 0.1,  # Learning rate for the input encodings (keep the same as the original paper)
        "obs_lr": 0.1,  # Learning rate for the Adam optimizer (keep the same as the original paper)
    }
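
    # Sketch (not part of the original script): any of the fixed values above could be
    # replaced with a Ray Tune search space to sweep over it instead, e.g.
    #   "n_layers": tune.grid_search([3, 5, 7]),
    #   "theta_lr": tune.loguniform(1e-4, 1e-2),
    # The commented-out Tuner further below would then sample and evaluate these choices.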

    # Experiment name
    exp_name = "reshuffled_inputs"

    # Define the logger
    reporter = CLIReporter(max_progress_rows=100)
    reporter.add_metric_column("mean_reward")
    reporter.add_metric_column("episode")

    class CustomStopper(Stopper):
        def __init__(self):
            self.should_stop = {}
            self.should_stop_all = False
            self.count_below_threshold = {}
            # Maps a mean-reward threshold to the episode after which it is enforced.
            self.stop_conditions = {
                500: float("inf")
            }

        def __call__(self, trial_id, result):
            if trial_id not in self.should_stop:
                self.should_stop[trial_id] = False
                self.count_below_threshold[trial_id] = {}
                for threshold in self.stop_conditions:
                    self.count_below_threshold[trial_id][threshold] = 0
            if not self.should_stop[trial_id]:
                # if result["mean_reward"] >= 500:
                #     self.should_stop[trial_id] = True
                if result["episode"] >= 1200:
                    self.should_stop[trial_id] = True
                for threshold, episode in self.stop_conditions.items():
                    if result["episode"] > episode:
                        if result["mean_reward"] < threshold:
                            self.count_below_threshold[trial_id][threshold] += 1
                        else:
                            self.count_below_threshold[trial_id][threshold] = 0
                        # Stop the trial after three consecutive results below the threshold.
                        if self.count_below_threshold[trial_id][threshold] >= 3:
                            self.should_stop[trial_id] = True
            return self.should_stop[trial_id]

        def stop_all(self):
            # Never stop the whole experiment from the stopper.
            return False

    stopper = CustomStopper()

    # Resources assigned to each trial.
    resource_group = tune.PlacementGroupFactory([{"CPU": 4, "CustomResource": 1}])
    trainable_with_resources = tune.with_resources(MyTrainableClass, resource_group)
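    # Note (assumption, not part of the original script): scheduling trials on
    # "CustomResource" only works if the Ray cluster actually advertises it, e.g.
    # when starting Ray locally:
    #   ray.init(resources={"CustomResource": 1})
    # Otherwise trials requesting it will stay pending.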

    # The tuner object
    # tuner = tune.Tuner(
    #     MyTrainableClass,
    #     run_config=air.RunConfig(
    #         name=exp_name,
    #         stop=stopper,
    #         progress_reporter=reporter,
    #         failure_config=air.FailureConfig(max_failures=2),
    #         verbose=1,
    #     ),
    #     tune_config=tune.TuneConfig(
    #         num_samples=10,
    #         mode="max",
    #         metric="mean_reward",
    #         max_concurrent_trials=1,
    #     ),
    #     param_space=best_trial,
    # )

    # Restore and resume the existing experiment rather than starting a new one.
    tuner = tune.Tuner.restore(
        "~/ray_results/reshuffled_inputs",
        MyTrainableClass,
        resume_errored=False,
        restart_errored=True,
    )
    results = tuner.fit()
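
    # Sketch (assumption, not part of the original script): inspect the resumed run
    # once fitting has finished, e.g.
    #   best = results.get_best_result(metric="mean_reward", mode="max")
    #   print(best.config)
    #   print(best.metrics["mean_reward"])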