From 70bc864156e12327eef979c964528e185ded4aca Mon Sep 17 00:00:00 2001
From: muupan
Date: Wed, 26 May 2021 23:55:11 +0900
Subject: [PATCH 1/3] Add a test of ACER with fixed covariance, which fails for now

---
 tests/agents_tests/test_acer.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/tests/agents_tests/test_acer.py b/tests/agents_tests/test_acer.py
index 25fbebc2d..9a0aacf40 100644
--- a/tests/agents_tests/test_acer.py
+++ b/tests/agents_tests/test_acer.py
@@ -15,7 +15,11 @@
 from pfrl.experiments.evaluator import run_evaluation_episodes
 from pfrl.experiments.train_agent_async import train_agent_async
 from pfrl.nn import ConcatObsAndAction
-from pfrl.policies import GaussianHeadWithDiagonalCovariance, SoftmaxCategoricalHead
+from pfrl.policies import (
+    GaussianHeadWithDiagonalCovariance,
+    GaussianHeadWithFixedCovariance,
+    SoftmaxCategoricalHead,
+)
 from pfrl.q_functions import DiscreteActionValueHead
 from pfrl.replay_buffers import EpisodicReplayBuffer
 
@@ -263,6 +267,15 @@ def test_compute_loss_with_kl_constraint_gaussian():
     _test_compute_loss_with_kl_constraint(policy)
 
 
+def test_compute_loss_with_kl_constraint_gaussian_with_fixed_covariance():
+    action_size = 3
+    policy = nn.Sequential(
+        nn.Linear(1, action_size),
+        GaussianHeadWithFixedCovariance(),
+    )
+    _test_compute_loss_with_kl_constraint(policy)
+
+
 def test_compute_loss_with_kl_constraint_softmax():
     n_actions = 3
     policy = nn.Sequential(

From 711bff6b7943974c9ce6c34955ef93291133eb44 Mon Sep 17 00:00:00 2001
From: muupan
Date: Wed, 26 May 2021 23:57:07 +0900
Subject: [PATCH 2/3] Support fixed covariance by filtering out non-learnable params

Loss function used in the test is changed because maximizing the entropy
has no effect for Gaussian with fixed covariance
---
 pfrl/agents/acer.py             | 9 ++++++++-
 tests/agents_tests/test_acer.py | 8 +++++---
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/pfrl/agents/acer.py b/pfrl/agents/acer.py
index 47ef07d5d..1f80997e3 100644
--- a/pfrl/agents/acer.py
+++ b/pfrl/agents/acer.py
@@ -170,12 +170,19 @@ def evaluator(action):
 
 
 def get_params_of_distribution(distrib):
+    """Returns learnable parameters of a given distribution."""
     if isinstance(distrib, torch.distributions.Independent):
         return get_params_of_distribution(distrib.base_dist)
     elif isinstance(distrib, torch.distributions.Categorical):
+        assert distrib._param.requires_grad
         return (distrib._param,)
     elif isinstance(distrib, torch.distributions.Normal):
-        return distrib.loc, distrib.scale
+        # Either loc or scale must be learnable
+        params = tuple(
+            param for param in [distrib.loc, distrib.scale] if param.requires_grad
+        )
+        assert len(params) > 0
+        return params
     else:
         raise NotImplementedError("{} is not supported by ACER".format(type(distrib)))
 
diff --git a/tests/agents_tests/test_acer.py b/tests/agents_tests/test_acer.py
index 9a0aacf40..8f9db360a 100644
--- a/tests/agents_tests/test_acer.py
+++ b/tests/agents_tests/test_acer.py
@@ -295,11 +295,13 @@ def _test_compute_loss_with_kl_constraint(base_policy):
     with torch.no_grad():
         # Compute KL divergence against the original distribution
         base_distrib = base_policy(x)
+        some_action = base_distrib.sample()
 
     def base_loss_func(distrib):
-        # Any loss that tends to increase KL divergence should be ok
-        kl = torch.distributions.kl_divergence(base_distrib, distrib)
-        return -(kl + distrib.entropy())
+        # Any loss that tends to increase KL divergence should be ok.
+        # Here I choose to minimize the log probability of some fixed action.
+        # The loss is clipped to avoid NaN.
+        return torch.max(distrib.log_prob(some_action), torch.as_tensor(-20.))
 
     def compute_kl_after_update(loss_func, n=100):
         policy = copy.deepcopy(base_policy)

From 5e99719f2dc381e1effb3d1543f1f44a90aadd94 Mon Sep 17 00:00:00 2001
From: muupan
Date: Thu, 27 May 2021 00:36:12 +0900
Subject: [PATCH 3/3] Black

---
 tests/agents_tests/test_acer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/agents_tests/test_acer.py b/tests/agents_tests/test_acer.py
index 8f9db360a..721290042 100644
--- a/tests/agents_tests/test_acer.py
+++ b/tests/agents_tests/test_acer.py
@@ -301,7 +301,7 @@ def base_loss_func(distrib):
         # Any loss that tends to increase KL divergence should be ok.
         # Here I choose to minimize the log probability of some fixed action.
         # The loss is clipped to avoid NaN.
-        return torch.max(distrib.log_prob(some_action), torch.as_tensor(-20.))
+        return torch.max(distrib.log_prob(some_action), torch.as_tensor(-20.0))
 
     def compute_kl_after_update(loss_func, n=100):
         policy = copy.deepcopy(base_policy)
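Not part of the patch: a minimal standalone PyTorch sketch, outside PFRL, of why the test loss had to change. It builds a distribution of roughly the shape GaussianHeadWithFixedCovariance is expected to produce (learnable mean, fixed scale) and shows that the entropy carries no gradient while the clipped log probability of a fixed action still does.

import torch
from torch import distributions

# Gaussian with a learnable mean and a fixed (non-learnable) covariance.
loc = torch.zeros(3, requires_grad=True)
scale = torch.ones(3)
distrib = distributions.Independent(distributions.Normal(loc, scale), 1)

# The entropy of a Gaussian depends only on its scale, so with a fixed
# covariance there is nothing for an entropy bonus to optimize.
print(distrib.entropy().requires_grad)  # False

# The log probability of a fixed action still depends on the learnable mean,
# so minimizing it (clipped, as in the patch, to avoid NaN) moves the
# distribution and therefore increases the KL divergence from the original.
some_action = distrib.sample()
loss = torch.max(distrib.log_prob(some_action), torch.as_tensor(-20.0))
loss.backward()
print(loc.grad)  # gradient flows to the learnable mean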