From 07766eabcba439ca2f2821c3d593662e47b99ebc Mon Sep 17 00:00:00 2001 From: Gaiejj <524339208@qq.com> Date: Thu, 22 Dec 2022 15:45:41 +0800 Subject: [PATCH 01/39] refactor: change on_policy structure --- .../on_policy/{ => base}/natural_pg.py | 2 +- .../on_policy/{ => base}/policy_gradient.py | 4 +- omnisafe/algorithms/on_policy/base/ppo.py | 145 ++++++++++++++++++ .../algorithms/on_policy/{ => base}/trpo.py | 2 +- .../on_policy/{ => first_order}/focops.py | 2 +- .../on_policy/{ => naive_lagrange}/npg_lag.py | 2 +- .../on_policy/{ => naive_lagrange}/pdo.py | 2 +- .../on_policy/{ => naive_lagrange}/ppo_lag.py | 5 +- .../{ => naive_lagrange}/trpo_lag.py | 2 +- .../on_policy/{ => pid_lagrange}/cppo_pid.py | 6 +- omnisafe/algorithms/on_policy/ppo.py | 65 -------- .../on_policy/{ => second_order}/cpo.py | 2 +- .../on_policy/{ => second_order}/pcpo.py | 2 +- 13 files changed, 161 insertions(+), 80 deletions(-) rename omnisafe/algorithms/on_policy/{ => base}/natural_pg.py (98%) rename omnisafe/algorithms/on_policy/{ => base}/policy_gradient.py (99%) create mode 100644 omnisafe/algorithms/on_policy/base/ppo.py rename omnisafe/algorithms/on_policy/{ => base}/trpo.py (99%) rename omnisafe/algorithms/on_policy/{ => first_order}/focops.py (98%) rename omnisafe/algorithms/on_policy/{ => naive_lagrange}/npg_lag.py (98%) rename omnisafe/algorithms/on_policy/{ => naive_lagrange}/pdo.py (97%) rename omnisafe/algorithms/on_policy/{ => naive_lagrange}/ppo_lag.py (96%) rename omnisafe/algorithms/on_policy/{ => naive_lagrange}/trpo_lag.py (98%) rename omnisafe/algorithms/on_policy/{ => pid_lagrange}/cppo_pid.py (94%) delete mode 100644 omnisafe/algorithms/on_policy/ppo.py rename omnisafe/algorithms/on_policy/{ => second_order}/cpo.py (99%) rename omnisafe/algorithms/on_policy/{ => second_order}/pcpo.py (99%) diff --git a/omnisafe/algorithms/on_policy/natural_pg.py b/omnisafe/algorithms/on_policy/base/natural_pg.py similarity index 98% rename from omnisafe/algorithms/on_policy/natural_pg.py rename to omnisafe/algorithms/on_policy/base/natural_pg.py index 4b86d0ee2..6201ac307 100644 --- a/omnisafe/algorithms/on_policy/natural_pg.py +++ b/omnisafe/algorithms/on_policy/base/natural_pg.py @@ -17,7 +17,7 @@ import torch from omnisafe.algorithms import registry -from omnisafe.algorithms.on_policy.policy_gradient import PolicyGradient +from omnisafe.algorithms.on_policy.base.policy_gradient import PolicyGradient from omnisafe.utils import distributed_utils from omnisafe.utils.tools import ( conjugate_gradients, diff --git a/omnisafe/algorithms/on_policy/policy_gradient.py b/omnisafe/algorithms/on_policy/base/policy_gradient.py similarity index 99% rename from omnisafe/algorithms/on_policy/policy_gradient.py rename to omnisafe/algorithms/on_policy/base/policy_gradient.py index 0c84eb44b..1a4400119 100644 --- a/omnisafe/algorithms/on_policy/policy_gradient.py +++ b/omnisafe/algorithms/on_policy/base/policy_gradient.py @@ -57,9 +57,11 @@ def __init__( cfgs: (default: :const:`None`) This is a dictionary of the algorithm hyper-parameters. 
""" - self.env = wrapper_registry.get(wrapper_type)(env_id) self.algo = algo self.cfgs = deepcopy(cfgs) + self.env = wrapper_registry.get(wrapper_type)( + env_id, cfgs=self.cfgs._asdict().get('env_cfgs') + ) assert self.cfgs.steps_per_epoch % distributed_utils.num_procs() == 0 self.local_steps_per_epoch = cfgs.steps_per_epoch // distributed_utils.num_procs() diff --git a/omnisafe/algorithms/on_policy/base/ppo.py b/omnisafe/algorithms/on_policy/base/ppo.py new file mode 100644 index 000000000..56ddde25e --- /dev/null +++ b/omnisafe/algorithms/on_policy/base/ppo.py @@ -0,0 +1,145 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementation of the PPO algorithm.""" + +import torch + +from omnisafe.algorithms import registry +from omnisafe.algorithms.on_policy.base.policy_gradient import PolicyGradient +from omnisafe.utils import distributed_utils + + +@registry.register +class PPO(PolicyGradient): + """The Proximal Policy Optimization Algorithms (PPO) Algorithm. + + References: + Paper Name: Proximal Policy Optimization Algorithms. + Paper author: John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, Oleg Klimov. 
+ Paper URL: https://arxiv.org/pdf/1707.06347.pdf + """ + + # pylint: disable-next=too-many-arguments + def __init__( + self, + env_id, + cfgs, + algo='ppo', + wrapper_type: str = 'OnPolicyEnvWrapper', + ): + """Initialize PPO.""" + self.clip = cfgs.clip + super().__init__( + env_id=env_id, + cfgs=cfgs, + algo=algo, + wrapper_type=wrapper_type, + ) + + def compute_loss_pi(self, data: dict): + """Compute policy loss.""" + dist, _log_p = self.actor_critic.actor(data['obs'], data['act']) + # Importance ratio + ratio = torch.exp(_log_p - data['log_p']) + ratio_clip = torch.clamp(ratio, 1 - self.clip, 1 + self.clip) + loss_pi = -(torch.min(ratio * data['adv'], ratio_clip * data['adv'])).mean() + loss_pi += self.cfgs.entropy_coef * dist.entropy().mean() + + # Useful extra info + approx_kl = (0.5 * (dist.mean - data['act']) ** 2 / dist.stddev**2).mean().item() + ent = dist.entropy().mean().item() + pi_info = dict(kl=approx_kl, ent=ent, ratio=ratio_clip.mean().item()) + + return loss_pi, pi_info + + def slice_data(self, data) -> dict: + """slice data for mini batch update""" + + slice_data = [] + obs = data['obs'] + act = data['act'] + target_v = data['target_v'] + log_p = data['log_p'] + adv = data['adv'] + discounted_ret = data['discounted_ret'] + cost_adv = data['cost_adv'] + target_v = data['target_v'] + batch_size = self.cfgs.batch_size + for i in range(int(len(obs) / batch_size)): + slice_data.append( + { + 'obs': obs[i * batch_size : (i + 1) * batch_size], + 'act': act[i * batch_size : (i + 1) * batch_size], + 'target_v': target_v[i * batch_size : (i + 1) * batch_size], + 'log_p': log_p[i * batch_size : (i + 1) * batch_size], + 'adv': adv[i * batch_size : (i + 1) * batch_size], + 'discounted_ret': discounted_ret[i * batch_size : (i + 1) * batch_size], + 'cost_adv': cost_adv[i * batch_size : (i + 1) * batch_size], + } + ) + + return slice_data + + def update_policy_net(self, data) -> None: + """update policy network""" + + # Slice data for mini batch update + slice_data = self.slice_data(data) + + # Get prob. 
distribution before updates: used to measure KL distance + with torch.no_grad(): + self.p_dist = self.actor_critic.actor(slice_data[0]['obs']) + + # Get loss and info values before update + pi_l_old, _ = self.compute_loss_pi(data=slice_data[0]) + loss_pi_before = pi_l_old.item() + + # Train policy with multiple steps of gradient descent + for i in range(self.cfgs.actor_iters): + for batch_data in slice_data: + self.actor_optimizer.zero_grad() + loss_pi, pi_info = self.compute_loss_pi(data=batch_data) + loss_pi.backward() + # Apply L2 norm + if self.cfgs.use_max_grad_norm: + torch.nn.utils.clip_grad_norm_( + self.actor_critic.actor.parameters(), self.cfgs.max_grad_norm + ) + + # Average grads across MPI processes + distributed_utils.mpi_avg_grads(self.actor_critic.actor.net) + self.actor_optimizer.step() + + q_dist = self.actor_critic.actor(batch_data['obs']) + torch_kl = torch.distributions.kl.kl_divergence(self.p_dist, q_dist).mean().item() + + if self.cfgs.kl_early_stopping: + # Average KL for consistent early stopping across processes + if distributed_utils.mpi_avg(torch_kl) > self.cfgs.target_kl: + self.logger.log(f'Reached ES criterion after {i+1} steps.') + break + + # Track when policy iteration is stopped; Log changes from update + self.logger.store( + **{ + 'Loss/Loss_pi': loss_pi.item(), + 'Loss/Delta_loss_pi': loss_pi.item() - loss_pi_before, + 'Train/StopIter': i + 1, + 'Values/Adv': data['adv'].numpy(), + 'Train/Entropy': pi_info['ent'], + 'Train/KL': torch_kl, + 'Train/PolicyRatio': pi_info['ratio'], + } + ) diff --git a/omnisafe/algorithms/on_policy/trpo.py b/omnisafe/algorithms/on_policy/base/trpo.py similarity index 99% rename from omnisafe/algorithms/on_policy/trpo.py rename to omnisafe/algorithms/on_policy/base/trpo.py index 630a7dcd8..f4fde9504 100644 --- a/omnisafe/algorithms/on_policy/trpo.py +++ b/omnisafe/algorithms/on_policy/base/trpo.py @@ -17,7 +17,7 @@ import torch from omnisafe.algorithms import registry -from omnisafe.algorithms.on_policy.natural_pg import NaturalPG +from omnisafe.algorithms.on_policy.base.natural_pg import NaturalPG from omnisafe.utils import distributed_utils from omnisafe.utils.tools import ( conjugate_gradients, diff --git a/omnisafe/algorithms/on_policy/focops.py b/omnisafe/algorithms/on_policy/first_order/focops.py similarity index 98% rename from omnisafe/algorithms/on_policy/focops.py rename to omnisafe/algorithms/on_policy/first_order/focops.py index 1b80e1962..cd19b4b37 100644 --- a/omnisafe/algorithms/on_policy/focops.py +++ b/omnisafe/algorithms/on_policy/first_order/focops.py @@ -17,7 +17,7 @@ import torch from omnisafe.algorithms import registry -from omnisafe.algorithms.on_policy.policy_gradient import PolicyGradient +from omnisafe.algorithms.on_policy.base.policy_gradient import PolicyGradient from omnisafe.common.lagrange import Lagrange from omnisafe.utils import distributed_utils diff --git a/omnisafe/algorithms/on_policy/npg_lag.py b/omnisafe/algorithms/on_policy/naive_lagrange/npg_lag.py similarity index 98% rename from omnisafe/algorithms/on_policy/npg_lag.py rename to omnisafe/algorithms/on_policy/naive_lagrange/npg_lag.py index aa10d0b8a..2876edb7a 100644 --- a/omnisafe/algorithms/on_policy/npg_lag.py +++ b/omnisafe/algorithms/on_policy/naive_lagrange/npg_lag.py @@ -17,7 +17,7 @@ import torch from omnisafe.algorithms import registry -from omnisafe.algorithms.on_policy.natural_pg import NaturalPG +from omnisafe.algorithms.on_policy.base.natural_pg import NaturalPG from omnisafe.common.lagrange import Lagrange diff --git 
a/omnisafe/algorithms/on_policy/pdo.py b/omnisafe/algorithms/on_policy/naive_lagrange/pdo.py similarity index 97% rename from omnisafe/algorithms/on_policy/pdo.py rename to omnisafe/algorithms/on_policy/naive_lagrange/pdo.py index 633dbebe7..2446f13b0 100644 --- a/omnisafe/algorithms/on_policy/pdo.py +++ b/omnisafe/algorithms/on_policy/naive_lagrange/pdo.py @@ -17,7 +17,7 @@ import torch from omnisafe.algorithms import registry -from omnisafe.algorithms.on_policy.policy_gradient import PolicyGradient +from omnisafe.algorithms.on_policy.base.policy_gradient import PolicyGradient from omnisafe.common.lagrange import Lagrange diff --git a/omnisafe/algorithms/on_policy/ppo_lag.py b/omnisafe/algorithms/on_policy/naive_lagrange/ppo_lag.py similarity index 96% rename from omnisafe/algorithms/on_policy/ppo_lag.py rename to omnisafe/algorithms/on_policy/naive_lagrange/ppo_lag.py index 5ce205555..954af7d9f 100644 --- a/omnisafe/algorithms/on_policy/ppo_lag.py +++ b/omnisafe/algorithms/on_policy/naive_lagrange/ppo_lag.py @@ -17,7 +17,7 @@ import torch from omnisafe.algorithms import registry -from omnisafe.algorithms.on_policy.policy_gradient import PolicyGradient +from omnisafe.algorithms.on_policy.base.policy_gradient import PolicyGradient from omnisafe.common.lagrange import Lagrange @@ -38,11 +38,10 @@ def __init__( env_id, cfgs, algo='PPO-Lag', - clip=0.2, wrapper_type: str = 'OnPolicyEnvWrapper', ): """Initialize PPO-Lag algorithm.""" - self.clip = clip + self.clip = cfgs.clip PolicyGradient.__init__( self, env_id=env_id, diff --git a/omnisafe/algorithms/on_policy/trpo_lag.py b/omnisafe/algorithms/on_policy/naive_lagrange/trpo_lag.py similarity index 98% rename from omnisafe/algorithms/on_policy/trpo_lag.py rename to omnisafe/algorithms/on_policy/naive_lagrange/trpo_lag.py index e3282863f..93c9630e1 100644 --- a/omnisafe/algorithms/on_policy/trpo_lag.py +++ b/omnisafe/algorithms/on_policy/naive_lagrange/trpo_lag.py @@ -17,7 +17,7 @@ import torch from omnisafe.algorithms import registry -from omnisafe.algorithms.on_policy.trpo import TRPO +from omnisafe.algorithms.on_policy.base.trpo import TRPO from omnisafe.common.lagrange import Lagrange diff --git a/omnisafe/algorithms/on_policy/cppo_pid.py b/omnisafe/algorithms/on_policy/pid_lagrange/cppo_pid.py similarity index 94% rename from omnisafe/algorithms/on_policy/cppo_pid.py rename to omnisafe/algorithms/on_policy/pid_lagrange/cppo_pid.py index 85581a066..10afd7920 100644 --- a/omnisafe/algorithms/on_policy/cppo_pid.py +++ b/omnisafe/algorithms/on_policy/pid_lagrange/cppo_pid.py @@ -17,7 +17,7 @@ import torch from omnisafe.algorithms import registry -from omnisafe.algorithms.on_policy.policy_gradient import PolicyGradient +from omnisafe.algorithms.on_policy.base.policy_gradient import PolicyGradient from omnisafe.common.pid_lagrange import PIDLagrangian @@ -28,7 +28,7 @@ class CPPOPid(PolicyGradient, PIDLagrangian): References: Paper Name: Responsive Safety in Reinforcement Learning by PID Lagrangian Methods. Paper author: Joshua Achiam, David Held, Aviv Tamar, Pieter Abbeel. 
- Paper URL: https://arxiv.org/abs/1705.10528 + Paper URL: https://arxiv.org/abs/2007.03964 """ @@ -50,7 +50,7 @@ def __init__( PIDLagrangian.__init__(self, **self.cfgs.PID_cfgs._asdict()) self.clip = self.cfgs.clip - self.cost_limit = self.cfgs.cost_limit + self.cost_limit = self.cfgs.PID_cfgs.cost_limit def algorithm_specific_logs(self): super().algorithm_specific_logs() diff --git a/omnisafe/algorithms/on_policy/ppo.py b/omnisafe/algorithms/on_policy/ppo.py deleted file mode 100644 index e24335344..000000000 --- a/omnisafe/algorithms/on_policy/ppo.py +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright 2022 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Implementation of the PPO algorithm.""" - -import torch - -from omnisafe.algorithms import registry -from omnisafe.algorithms.on_policy.policy_gradient import PolicyGradient - - -@registry.register -class PPO(PolicyGradient): - """The Proximal Policy Optimization Algorithms (PPO) Algorithm. - - References: - Paper Name: Proximal Policy Optimization Algorithms. - Paper author: John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, Oleg Klimov. - Paper URL: https://arxiv.org/pdf/1707.06347.pdf - """ - - # pylint: disable-next=too-many-arguments - def __init__( - self, - env_id, - cfgs, - algo='ppo', - clip=0.2, - wrapper_type: str = 'OnPolicyEnvWrapper', - ): - """Initialize PPO.""" - self.clip = clip - super().__init__( - env_id=env_id, - cfgs=cfgs, - algo=algo, - wrapper_type=wrapper_type, - ) - - def compute_loss_pi(self, data: dict): - """Compute policy loss.""" - dist, _log_p = self.actor_critic.actor(data['obs'], data['act']) - # Importance ratio - ratio = torch.exp(_log_p - data['log_p']) - ratio_clip = torch.clamp(ratio, 1 - self.clip, 1 + self.clip) - loss_pi = -(torch.min(ratio * data['adv'], ratio_clip * data['adv'])).mean() - loss_pi += self.cfgs.entropy_coef * dist.entropy().mean() - - # Useful extra info - approx_kl = (0.5 * (dist.mean - data['act']) ** 2 / dist.stddev**2).mean().item() - ent = dist.entropy().mean().item() - pi_info = dict(kl=approx_kl, ent=ent, ratio=ratio_clip.mean().item()) - - return loss_pi, pi_info diff --git a/omnisafe/algorithms/on_policy/cpo.py b/omnisafe/algorithms/on_policy/second_order/cpo.py similarity index 99% rename from omnisafe/algorithms/on_policy/cpo.py rename to omnisafe/algorithms/on_policy/second_order/cpo.py index 207bd0abc..dee873cc6 100644 --- a/omnisafe/algorithms/on_policy/cpo.py +++ b/omnisafe/algorithms/on_policy/second_order/cpo.py @@ -18,7 +18,7 @@ import torch from omnisafe.algorithms import registry -from omnisafe.algorithms.on_policy.trpo import TRPO +from omnisafe.algorithms.on_policy.base.trpo import TRPO from omnisafe.utils import distributed_utils from omnisafe.utils.tools import ( conjugate_gradients, diff --git a/omnisafe/algorithms/on_policy/pcpo.py b/omnisafe/algorithms/on_policy/second_order/pcpo.py similarity index 99% rename from 
omnisafe/algorithms/on_policy/pcpo.py rename to omnisafe/algorithms/on_policy/second_order/pcpo.py index edef0fdf4..0e5104502 100644 --- a/omnisafe/algorithms/on_policy/pcpo.py +++ b/omnisafe/algorithms/on_policy/second_order/pcpo.py @@ -17,7 +17,7 @@ import torch from omnisafe.algorithms import registry -from omnisafe.algorithms.on_policy.trpo import TRPO +from omnisafe.algorithms.on_policy.base.trpo import TRPO from omnisafe.utils import distributed_utils from omnisafe.utils.tools import ( conjugate_gradients, From 2bb1e22d377d8492f10ef9ddc2dc8f14995d8d61 Mon Sep 17 00:00:00 2001 From: Gaiejj <524339208@qq.com> Date: Thu, 22 Dec 2022 15:50:58 +0800 Subject: [PATCH 02/39] feat: add Saute algorithm --- .../on_policy/saute/ppo_lag_saute.py | 50 ++++++++++++++++++ .../algorithms/on_policy/saute/ppo_saute.py | 52 +++++++++++++++++++ 2 files changed, 102 insertions(+) create mode 100644 omnisafe/algorithms/on_policy/saute/ppo_lag_saute.py create mode 100644 omnisafe/algorithms/on_policy/saute/ppo_saute.py diff --git a/omnisafe/algorithms/on_policy/saute/ppo_lag_saute.py b/omnisafe/algorithms/on_policy/saute/ppo_lag_saute.py new file mode 100644 index 000000000..8a12d24c4 --- /dev/null +++ b/omnisafe/algorithms/on_policy/saute/ppo_lag_saute.py @@ -0,0 +1,50 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementation of the Saute algorithm.""" + +from omnisafe.algorithms import registry +from omnisafe.algorithms.on_policy.naive_lagrange.ppo_lag import PPOLag + + +@registry.register +class PPOLagSaute(PPOLag): + """Saute algorithm implemented by PPOLag. + + References: + Paper Name: Saute RL: Almost Surely Safe Reinforcement Learning Using State Augmentation. + Paper author: Aivar Sootla, Alexander I. Cowen-Rivers, Taher Jafferjee, Ziyan Wang, + David Mguni, Jun Wang, Haitham Bou-Ammar. + Paper URL: https://arxiv.org/abs/2202.06558 + """ + + # pylint: disable-next=too-many-arguments + def __init__( + self, + env_id, + cfgs, + algo='ppo_lag_saute', + wrapper_type: str = 'SauteEnvWrapper', + ) -> None: + r"""Initialize PPOLagSaute.""" + super().__init__( + env_id=env_id, + cfgs=cfgs, + algo=algo, + wrapper_type=wrapper_type, + ) + + def algorithm_specific_logs(self): + super().algorithm_specific_logs() + self.logger.log_tabular('Metrics/EpBudget') diff --git a/omnisafe/algorithms/on_policy/saute/ppo_saute.py b/omnisafe/algorithms/on_policy/saute/ppo_saute.py new file mode 100644 index 000000000..8a6198c9f --- /dev/null +++ b/omnisafe/algorithms/on_policy/saute/ppo_saute.py @@ -0,0 +1,52 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementation of the Saute algorithm.""" + +from omnisafe.algorithms import registry +from omnisafe.algorithms.on_policy.base.ppo import PPO + + +@registry.register +class PPOSaute(PPO): + """Saute algorithm implemented by PPO. + + References: + Paper Name: Saute RL: Almost Surely Safe Reinforcement Learning Using State Augmentation. + Paper author: Aivar Sootla, Alexander I. Cowen-Rivers, Taher Jafferjee, Ziyan Wang, + David Mguni, Jun Wang, Haitham Bou-Ammar. + Paper URL: https://arxiv.org/abs/2202.06558 + """ + + # pylint: disable-next=too-many-arguments + def __init__( + self, + env_id, + cfgs, + algo='ppo_saute', + clip=0.2, + wrapper_type: str = 'SauteEnvWrapper', + ) -> None: + r"""Initialize PPOSaute.""" + self.clip = clip + super().__init__( + env_id=env_id, + cfgs=cfgs, + algo=algo, + wrapper_type=wrapper_type, + ) + + def algorithm_specific_logs(self): + super().algorithm_specific_logs() + self.logger.log_tabular('Metrics/EpBudget') From 462cdc3ae91ddf945f51496295cfe0cc58f30b52 Mon Sep 17 00:00:00 2001 From: Gaiejj <524339208@qq.com> Date: Thu, 22 Dec 2022 15:55:09 +0800 Subject: [PATCH 03/39] feat: add saute wrapper --- omnisafe/configs/on-policy/PPOLagSaute.yaml | 126 ++++++++++++ omnisafe/configs/on-policy/PPOSaute.yaml | 116 +++++++++++ omnisafe/wrappers/on_policy_wrapper.py | 15 +- omnisafe/wrappers/saute_wrapper.py | 210 ++++++++++++++++++++ 4 files changed, 465 insertions(+), 2 deletions(-) create mode 100644 omnisafe/configs/on-policy/PPOLagSaute.yaml create mode 100644 omnisafe/configs/on-policy/PPOSaute.yaml create mode 100644 omnisafe/wrappers/saute_wrapper.py diff --git a/omnisafe/configs/on-policy/PPOLagSaute.yaml b/omnisafe/configs/on-policy/PPOLagSaute.yaml new file mode 100644 index 000000000..c956352f6 --- /dev/null +++ b/omnisafe/configs/on-policy/PPOLagSaute.yaml @@ -0,0 +1,126 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +defaults: + # --------------------------------------Basic Configurations----------------------------------- # + ## -----------------------------Basic configurations for base class PG------------------------ ## + # The random seed + seed: 0 + # Number of epochs + epochs: 500 + # Number of steps per epoch + steps_per_epoch: 30000 + # Number of update iteration for Actor network + actor_iters: 80 + # Number of update iteration for Critic network + critic_iters: 40 + # Check if all models own the same parameter values every `check_freq` epoch + check_freq: 25 + # Save model to disk every `check_freq` epochs + save_freq: 100 + # Entropy coefficient for PPO loss + entropy_coef: 0.01 + # The max length of per epoch + max_ep_len: 1000 + # The size of mini batch + num_mini_batches: 16 + # The learning rate of Actor network + actor_lr: 0.0003 + # The learning rate of Critic network + critic_lr: 0.001 + # The Address for saving training process data + data_dir: "./runs" + ## ---------------------------Basic configurations for derived class PPOLag------------------- ## + # The thereshold for KL early stopping + target_kl: 0.01 + # The size of batch for policy update + batch_size: 2000 + # The clip range for PPO loss + clip: 0.2 + + # ---------------------------------------Optional Configuration-------------------------------- # + ## -----------------------------------Configuration For Cost Critic--------------------------- ## + # Whether to use cost critic + use_cost: True + # Cost discounted factor + cost_gamma: 1.0 + # Whther to use linear decay of learning rate + linear_lr_decay: False + # Whether to use exploration noise anneal + exploration_noise_anneal: True + # Whther to use reward penalty + reward_penalty: False + # Whether to use KL early stopping + kl_early_stopping: False + # Whether to use max gradient norm + use_max_grad_norm: False + # The thereshold of max gradient norm + max_grad_norm: 0.5 + # Whether to use reward scaling + scale_rewards: False + # Whether to use standardized observation + standardized_obs: False + ## ---------------------------------------Configuration For Model----------------------------- ## + model_cfgs: + # Whether to share the weight of Actor network with Critic network + shared_weights: False + # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
+ weight_initialization_mode: "kaiming_uniform" + # Configuration of Actor and Critic network + ac_kwargs: + # Configuration of Actor network + pi: + # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" + actor_type: gaussian_annealing + # Size of hidden layers + hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" + activation: tanh + val: + # Size of hidden layers + hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" + activation: tanh + ## --------------------------------------Configuration For Buffer----------------------------- ## + buffer_cfgs: + # Reward discounted factor + gamma: 0.99 + # Parameters used to estimate future rewards in GAE + lam: 0.95 + # Parameters used to estimate future costs in GAE + lam_c: 0.95 + # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" + adv_estimation_method: gae + # Whether to use standardized reward + standardized_reward: True + # Whether to use standardized cost + standardized_cost: True + ## Configuration For Env_Wrapper + env_cfgs: + unsafe_reward: -0.1 + # ``safety_budget`` in saute is actually the same as ``cost_limmit``. + safety_budget: 25 + saute_gamma: 0.9997 + scale_safety_budget: True + ## ----------------------------------Configuration For Lagrangian multiplier---------------------- ## + lagrange_cfgs: + # Tolerance of constraint violation + cost_limit: 25.0 + # Initial value of lagrangian multiplier + lagrangian_multiplier_init: 0.001 + # Learning rate of lagrangian multiplier + lambda_lr: 0.035 + # Type of lagrangian optimizer + lambda_optimizer: "Adam" diff --git a/omnisafe/configs/on-policy/PPOSaute.yaml b/omnisafe/configs/on-policy/PPOSaute.yaml new file mode 100644 index 000000000..be7ef49ce --- /dev/null +++ b/omnisafe/configs/on-policy/PPOSaute.yaml @@ -0,0 +1,116 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +defaults: + # --------------------------------------Basic Configurations----------------------------------- # + ## -----------------------------Basic configurations for base class PG------------------------ ## + # The random seed + seed: 0 + # Number of epochs + epochs: 500 + # Number of steps per epoch + steps_per_epoch: 30000 + # Number of update iteration for Actor network + actor_iters: 80 + # Number of update iteration for Critic network + critic_iters: 40 + # Check if all models own the same parameter values every `check_freq` epoch + check_freq: 25 + # Save model to disk every `check_freq` epochs + save_freq: 100 + # Entropy coefficient for PPO loss + entropy_coef: 0.01 + # The max length of per epoch + max_ep_len: 1000 + # The size of mini batch + num_mini_batches: 16 + # The learning rate of Actor network + actor_lr: 0.0003 + # The learning rate of Critic network + critic_lr: 0.001 + # The Address for saving training process data + data_dir: "./runs" + ## ---------------------------Basic configurations for derived class PPOLag------------------- ## + # The thereshold for KL early stopping + target_kl: 0.01 + # The size of batch for policy update + batch_size: 2000 + # The clip range for PPO loss + clip: 0.2 + + # ---------------------------------------Optional Configuration-------------------------------- # + ## -----------------------------------Configuration For Cost Critic--------------------------- ## + # Whether to use cost critic + use_cost: True + # Cost discounted factor + cost_gamma: 1.0 + # Whther to use linear decay of learning rate + linear_lr_decay: False + # Whether to use exploration noise anneal + exploration_noise_anneal: True + # Whther to use reward penalty + reward_penalty: False + # Whether to use KL early stopping + kl_early_stopping: False + # Whether to use max gradient norm + use_max_grad_norm: False + # The thereshold of max gradient norm + max_grad_norm: 0.5 + # Whether to use reward scaling + scale_rewards: False + # Whether to use standardized observation + standardized_obs: False + ## ---------------------------------------Configuration For Model----------------------------- ## + model_cfgs: + # Whether to share the weight of Actor network with Critic network + shared_weights: False + # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
+ weight_initialization_mode: "kaiming_uniform" + # Configuration of Actor and Critic network + ac_kwargs: + # Configuration of Actor network + pi: + # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" + actor_type: gaussian_annealing + # Size of hidden layers + hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" + activation: tanh + val: + # Size of hidden layers + hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" + activation: tanh + ## --------------------------------------Configuration For Buffer----------------------------- ## + buffer_cfgs: + # Reward discounted factor + gamma: 0.99 + # Parameters used to estimate future rewards in GAE + lam: 0.95 + # Parameters used to estimate future costs in GAE + lam_c: 0.95 + # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" + adv_estimation_method: gae + # Whether to use standardized reward + standardized_reward: True + # Whether to use standardized cost + standardized_cost: True + ## Configuration For Env_Wrapper + env_cfgs: + unsafe_reward: -0.1 + # ``safety_budget`` in saute is actually the same as ``cost_limmit``. + safety_budget: 25 + saute_gamma: 0.9997 + scale_safety_budget: True diff --git a/omnisafe/wrappers/on_policy_wrapper.py b/omnisafe/wrappers/on_policy_wrapper.py index 8284c8d2c..93d6ab89f 100644 --- a/omnisafe/wrappers/on_policy_wrapper.py +++ b/omnisafe/wrappers/on_policy_wrapper.py @@ -14,6 +14,10 @@ # ============================================================================== """env_wrapper""" +import collections +from copy import deepcopy +from typing import Optional + import safety_gymnasium import torch @@ -24,9 +28,16 @@ class OnPolicyEnvWrapper: # pylint: disable=too-many-instance-attributes """env_wrapper""" - def __init__(self, env_id, render_mode=None): - # check env_id is str + def __init__(self, env_id, cfgs: Optional[collections.namedtuple] = None, render_mode=None): + r"""Initialize environment wrapper. + + Args: + env_id (str): environment id. + cfgs (collections.namedtuple): configs. + render_mode (str): render mode. + """ self.env = safety_gymnasium.make(env_id, render_mode=render_mode) + self.cfgs = deepcopy(cfgs) self.env_id = env_id self.render_mode = render_mode self.metadata = self.env.metadata diff --git a/omnisafe/wrappers/saute_wrapper.py b/omnisafe/wrappers/saute_wrapper.py new file mode 100644 index 000000000..25628ce91 --- /dev/null +++ b/omnisafe/wrappers/saute_wrapper.py @@ -0,0 +1,210 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""saute env_wrapper""" + +import numpy as np +import torch +from gymnasium import spaces + +from omnisafe.wrappers.on_policy_wrapper import OnPolicyEnvWrapper +from omnisafe.wrappers.wrapper_registry import WRAPPER_REGISTRY + + +@WRAPPER_REGISTRY.register +class SauteEnvWrapper(OnPolicyEnvWrapper): + r"""SauteEnvWrapper.""" + + def __init__( + self, + env_id, + cfgs, + render_mode=None, + ) -> None: + r"""Initialize SauteEnvWrapper. + + Args: + env_id (str): environment id + cfgs (dict): configuration dictionary + render_mode (str): render mode + + """ + super().__init__(env_id, render_mode) + + self.unsafe_reward = cfgs.unsafe_reward + self.saute_gamma = cfgs.saute_gamma + if cfgs.scale_safety_budget: + self.safety_budget = ( + cfgs.safety_budget + * (1 - self.saute_gamma**self.max_ep_len) + / (1 - self.saute_gamma) + / np.float32(self.max_ep_len) + ) + else: + self.safety_budget = cfgs.safety_budget + self.safety_obs = 1.0 + high = np.array(np.hstack([self.env.observation_space.high, np.inf]), dtype=np.float32) + low = np.array(np.hstack([self.env.observation_space.low, np.inf]), dtype=np.float32) + self.observation_space = spaces.Box(high=high, low=low) + + def augment_obs(self, obs: np.array, safety_obs: np.array): + r"""Augmenting the obs with the safety obs. + + Args: + obs (np.array): observation + safety_obs (np.array): safety observation + + Returns: + augmented_obs (np.array): augmented observation + """ + augmented_obs = np.hstack([obs, safety_obs]) + return augmented_obs + + def safety_step(self, cost: np.ndarray) -> np.ndarray: + r"""Update the normalized safety obs. + + Args: + cost (np.array): cost + + Returns: + safety_obs (np.array): normalized safety observation + """ + self.safety_obs -= cost / self.safety_budget + self.safety_obs /= self.saute_gamma + return self.safety_obs + + def safety_reward(self, reward: np.ndarray, next_safety_obs: np.ndarray) -> np.ndarray: + r"""Update the reward. + + Args: + reward (np.array): reward + next_safety_obs (np.array): next safety observation + + Returns: + reward (np.array): updated reward + """ + reward = reward * (next_safety_obs > 0) + self.unsafe_reward * (next_safety_obs <= 0) + return reward + + def reset(self, seed=None): + r"""reset environment + + Args: + seed (int): seed for environment reset + + Returns: + self.curr_o (np.array): current observation + info (dict): environment info + """ + self.curr_o, info = self.env.reset(seed=seed) + self.safety_obs = 1.0 + self.curr_o = self.augment_obs(self.curr_o, self.safety_obs) + return self.curr_o, info + + def step(self, action): + r"""Step environment. + + Args: + action (np.array): action + + Returns: + augmented_obs (np.array): augmented observation + reward (np.array): reward + cost (np.array): cost + terminated (bool): whether the episode is terminated + truncated (bool): whether the episode is truncated + info (dict): environment info + """ + next_obs, reward, cost, terminated, truncated, info = self.env.step(action) + next_safety_obs = self.safety_step(cost) + info['true_reward'] = reward + info['safety_obs'] = next_safety_obs + reward = self.safety_reward(reward, next_safety_obs) + augmented_obs = self.augment_obs(next_obs, next_safety_obs) + + return augmented_obs, reward, cost, terminated, truncated, info + + # pylint: disable-next=too-many-locals + def roll_out(self, agent, buf, logger): + r"""Collect data and store to experience buffer. 
+ + Args: + agent (Agent): agent + buf (Buffer): buffer + logger (Logger): logger + + Returns: + ep_ret (float): episode return + ep_costs (float): episode costs + ep_len (int): episode length + ep_budget (float): episode budget + """ + obs, _ = self.reset() + ep_ret, ep_costs, ep_len, ep_budget = 0.0, 0.0, 0, 0.0 + for step_i in range(self.local_steps_per_epoch): + action, value, cost_value, logp = agent.step(torch.as_tensor(obs, dtype=torch.float32)) + next_obs, reward, cost, done, truncated, info = self.step(action) + ep_ret += info['true_reward'] + ep_costs += (self.cost_gamma**ep_len) * cost + ep_len += 1 + ep_budget += self.safety_obs + + # Save and log + # Notes: + # - raw observations are stored to buffer (later transformed) + # - reward scaling is performed in buffer + buf.store( + obs=obs, + act=action, + rew=reward, + val=value, + logp=logp, + cost=cost, + cost_val=cost_value, + ) + + # Store values for statistic purpose + if self.use_cost: + logger.store(**{'Values/V': value, 'Values/C': cost_value}) + else: + logger.store(**{'Values/V': value}) + + # Update observation + obs = next_obs + + timeout = ep_len == self.max_ep_len + terminal = done or timeout or truncated + epoch_ended = step_i == self.local_steps_per_epoch - 1 + + if terminal or epoch_ended: + if timeout or epoch_ended: + _, value, cost_value, _ = agent(torch.as_tensor(obs, dtype=torch.float32)) + else: + value, cost_value = 0.0, 0.0 + + # Automatically compute GAE in buffer + buf.finish_path(value, cost_value, penalty_param=float(self.penalty_param)) + + # Only save EpRet / EpLen if trajectory finished + if terminal: + logger.store( + **{ + 'Metrics/EpRet': ep_ret, + 'Metrics/EpLen': ep_len, + 'Metrics/EpCost': ep_costs, + 'Metrics/EpBudget': ep_budget, + } + ) + ep_ret, ep_costs, ep_len, ep_budget = 0.0, 0.0, 0, 0.0 + obs, _ = self.reset() From 8b201e3563dd7b89e9ba5e6f2a411f0035369511 Mon Sep 17 00:00:00 2001 From: Gaiejj <524339208@qq.com> Date: Thu, 22 Dec 2022 16:11:16 +0800 Subject: [PATCH 04/39] feat: add new algorithms --- docs/source/spelling_wordlist.txt | 48 +- examples/train_policy.py | 13 +- omnisafe/algorithms/__init__.py | 72 ++- omnisafe/algorithms/off_policy/ddpg.py | 96 ++-- omnisafe/algorithms/off_policy/ddpg_lag.py | 114 ++++ omnisafe/algorithms/off_policy/sac.py | 137 +++++ omnisafe/algorithms/off_policy/sac_lag.py | 151 ++++++ omnisafe/algorithms/off_policy/sddpg.py | 189 +++++++ omnisafe/algorithms/off_policy/td3.py | 82 +++ omnisafe/algorithms/off_policy/td3_lag.py | 120 +++++ .../on_policy/{ => first_order}/cup.py | 5 +- .../on_policy/pid_lagrange/trpo_pid.py | 93 ++++ .../on_policy/simmer/ppo_lag_simmer_pid.py | 52 ++ .../on_policy/simmer/ppo_lag_simmer_q.py | 52 ++ .../on_policy/simmer/ppo_simmer_pid.py | 51 ++ .../on_policy/simmer/ppo_simmer_q.py | 50 ++ omnisafe/configs/off-policy/CVPO.yaml | 127 +++++ omnisafe/configs/off-policy/DDPG.yaml | 69 ++- omnisafe/configs/off-policy/DDPGLag.yaml | 114 ++++ omnisafe/configs/off-policy/SAC.yaml | 108 ++++ omnisafe/configs/off-policy/SACLag.yaml | 117 ++++ omnisafe/configs/off-policy/SDDPG.yaml | 114 ++++ omnisafe/configs/off-policy/TD3.yaml | 102 ++++ omnisafe/configs/off-policy/TD3Lag.yaml | 112 ++++ omnisafe/configs/on-policy/CPPOPid.yaml | 106 ++-- omnisafe/configs/on-policy/CUP.yaml | 4 +- omnisafe/configs/on-policy/NPGLag.yaml | 79 ++- omnisafe/configs/on-policy/NaturalPG.yaml | 71 ++- omnisafe/configs/on-policy/PDO.yaml | 75 ++- omnisafe/configs/on-policy/PPO.yaml | 73 ++- omnisafe/configs/on-policy/PPOLag.yaml | 6 +- 
.../configs/on-policy/PPOLagSimmerPid.yaml | 143 +++++ omnisafe/configs/on-policy/PPOLagSimmerQ.yaml | 145 +++++ omnisafe/configs/on-policy/PPOSimmerPid.yaml | 135 +++++ omnisafe/configs/on-policy/PPOSimmerQ.yaml | 137 +++++ .../configs/on-policy/PolicyGradient.yaml | 66 ++- omnisafe/configs/on-policy/TRPO.yaml | 73 ++- omnisafe/configs/on-policy/TRPOLag.yaml | 2 +- omnisafe/configs/on-policy/TRPOPid.yaml | 137 +++++ omnisafe/models/actor/actor_builder.py | 28 + .../models/actor/gaussian_annealing_actor.py | 2 +- .../models/actor/gaussian_stdnet_actor.py | 9 +- omnisafe/models/actor/mlp_actor.py | 28 +- omnisafe/models/actor_q_critic.py | 53 +- omnisafe/models/constraint_actor_q_critic.py | 12 +- omnisafe/models/critic/q_critic.py | 35 +- omnisafe/utils/algo_utils.py | 51 ++ omnisafe/wrappers/__init__.py | 3 + omnisafe/wrappers/off_policy_wrapper.py | 5 +- omnisafe/wrappers/simmer_wrapper.py | 508 ++++++++++++++++++ tests/test_policy.py | 32 +- 51 files changed, 3938 insertions(+), 268 deletions(-) create mode 100644 omnisafe/algorithms/off_policy/ddpg_lag.py create mode 100644 omnisafe/algorithms/off_policy/sac.py create mode 100644 omnisafe/algorithms/off_policy/sac_lag.py create mode 100644 omnisafe/algorithms/off_policy/sddpg.py create mode 100644 omnisafe/algorithms/off_policy/td3.py create mode 100644 omnisafe/algorithms/off_policy/td3_lag.py rename omnisafe/algorithms/on_policy/{ => first_order}/cup.py (98%) create mode 100644 omnisafe/algorithms/on_policy/pid_lagrange/trpo_pid.py create mode 100644 omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_pid.py create mode 100644 omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_q.py create mode 100644 omnisafe/algorithms/on_policy/simmer/ppo_simmer_pid.py create mode 100644 omnisafe/algorithms/on_policy/simmer/ppo_simmer_q.py create mode 100644 omnisafe/configs/off-policy/CVPO.yaml create mode 100644 omnisafe/configs/off-policy/DDPGLag.yaml create mode 100644 omnisafe/configs/off-policy/SAC.yaml create mode 100644 omnisafe/configs/off-policy/SACLag.yaml create mode 100644 omnisafe/configs/off-policy/SDDPG.yaml create mode 100644 omnisafe/configs/off-policy/TD3.yaml create mode 100644 omnisafe/configs/off-policy/TD3Lag.yaml create mode 100644 omnisafe/configs/on-policy/PPOLagSimmerPid.yaml create mode 100644 omnisafe/configs/on-policy/PPOLagSimmerQ.yaml create mode 100644 omnisafe/configs/on-policy/PPOSimmerPid.yaml create mode 100644 omnisafe/configs/on-policy/PPOSimmerQ.yaml create mode 100644 omnisafe/configs/on-policy/TRPOPid.yaml create mode 100644 omnisafe/wrappers/simmer_wrapper.py diff --git a/docs/source/spelling_wordlist.txt b/docs/source/spelling_wordlist.txt index 1d609b912..a0b1256a7 100644 --- a/docs/source/spelling_wordlist.txt +++ b/docs/source/spelling_wordlist.txt @@ -10,7 +10,7 @@ pragma fmt func sys -bool +ol len str iter @@ -165,7 +165,6 @@ xmax ymin ymax vel -pos quaternion Quaternions Jacobian @@ -182,3 +181,48 @@ Binbin Zhou Pengfei Yaodong +buf +Aivar +Sootla +Alexander +Cowen +Taher +Jafferjee +Ziyan +Wang +David +Mguni +Jun +Haitham +u +Ammar +Sun +Ziping +Xu +Meng +Fang +Zhenghao +Peng +Jiadong +Guo +Bo +Dai +lei +bool +MDP +Bolei +Bou +Hao +Tuomas +Haarnoja +Aurick +Meger +Herke +Fujimoto +Lyapunov +Yinlam +Ofir +Nachum +Aleksandra +Duenez +Ghavamzadeh diff --git a/examples/train_policy.py b/examples/train_policy.py index 675d51455..a699ac0fa 100644 --- a/examples/train_policy.py +++ b/examples/train_policy.py @@ -25,8 +25,17 @@ '--algo', type=str, default='PPOLag', - help='Choose from: {PolicyGradient, 
PPO, PPOLag, NaturalPG,' - ' TRPO, TRPOLag, PDO, NPGLag, CPO, PCPO, FOCOPS, CPPOPid,CUP', + help='Choose from: ' + 'On Policy:' + 'PolicyGradient, NaturalPG, TRPO, PPO,' + 'PDO, NPGLag, TRPOLag, PPOLag, CPO, PCPO, FOCOPS, CUP,' + 'CPPOPid, TRPOPid,' + 'PPOSaute, PPOSimmer, PPOSimmerPid, PPOSimmerQ, PPOEarlyTerminated,' + 'PPOLagSaute, PPOLagSimmerPid, PPOLagSimmerQ, PPOLagEarlyTerminated,' + 'Off Policy:' + 'DDPG, TD3, SAC,' + 'DDPGLag, TD3Lag, SACLag,' + 'SDDPG', ) parser.add_argument( '--env-id', diff --git a/omnisafe/algorithms/__init__.py b/omnisafe/algorithms/__init__.py index 13f758cac..a1a296477 100644 --- a/omnisafe/algorithms/__init__.py +++ b/omnisafe/algorithms/__init__.py @@ -13,43 +13,57 @@ # limitations under the License. # ============================================================================== """Safe Reinforcement Learning algorithms.""" - -# Off Policy Safe -from omnisafe.algorithms.off_policy.ddpg import DDPG - -# On Policy Safe -from omnisafe.algorithms.on_policy.cpo import CPO -from omnisafe.algorithms.on_policy.cppo_pid import CPPOPid -from omnisafe.algorithms.on_policy.cup import CUP -from omnisafe.algorithms.on_policy.focops import FOCOPS -from omnisafe.algorithms.on_policy.natural_pg import NaturalPG -from omnisafe.algorithms.on_policy.npg_lag import NPGLag -from omnisafe.algorithms.on_policy.pcpo import PCPO -from omnisafe.algorithms.on_policy.pdo import PDO -from omnisafe.algorithms.on_policy.policy_gradient import PolicyGradient -from omnisafe.algorithms.on_policy.ppo import PPO -from omnisafe.algorithms.on_policy.ppo_lag import PPOLag -from omnisafe.algorithms.on_policy.trpo import TRPO -from omnisafe.algorithms.on_policy.trpo_lag import TRPOLag +from omnisafe.algorithms.on_policy.base.natural_pg import NaturalPG +from omnisafe.algorithms.on_policy.base.policy_gradient import PolicyGradient +from omnisafe.algorithms.on_policy.base.ppo import PPO +from omnisafe.algorithms.on_policy.base.trpo import TRPO +from omnisafe.algorithms.on_policy.first_order.cup import CUP +from omnisafe.algorithms.on_policy.first_order.focops import FOCOPS +from omnisafe.algorithms.on_policy.naive_lagrange.npg_lag import NPGLag +from omnisafe.algorithms.on_policy.naive_lagrange.pdo import PDO +from omnisafe.algorithms.on_policy.naive_lagrange.ppo_lag import PPOLag +from omnisafe.algorithms.on_policy.naive_lagrange.trpo_lag import TRPOLag +from omnisafe.algorithms.on_policy.pid_lagrange.cppo_pid import CPPOPid +from omnisafe.algorithms.on_policy.saute.ppo_lag_saute import PPOLagSaute +from omnisafe.algorithms.on_policy.saute.ppo_saute import PPOSaute +from omnisafe.algorithms.on_policy.second_order.cpo import CPO +from omnisafe.algorithms.on_policy.second_order.pcpo import PCPO algo_type = { - 'off-policy': ['DDPG'], + 'off-policy': [ + 'DDPG', + 'DDPGLag', + 'TD3', + 'TD3Lag', + 'SAC', + 'SACLag', + 'SDDPG', + 'CVPO', + ], 'on-policy': [ - 'CPO', - 'FOCOPS', - 'CPPOPid', - 'FOCOPS', - 'NaturalPG', - 'NPGLag', - 'PCPO', - 'PDO', 'PolicyGradient', - 'PPO', - 'PPOLag', + 'NaturalPG', 'TRPO', + 'PPO', + 'PDO', + 'NPGLag', 'TRPOLag', + 'PPOLag', + 'CPPOPid', + 'TRPOPid', + 'FOCOPS', 'CUP', + 'CPO', + 'PCPO', + 'PPOSimmerPid', + 'PPOSimmerQ', + 'PPOLagSimmerQ', + 'PPOLagSimmerPid', + 'PPOSaute', + 'PPOLagSaute', + 'PPOEarlyTerminated', + 'PPOLagEarlyTerminated', ], 'model-based': ['MBPPOLag', 'SafeLoop'], } diff --git a/omnisafe/algorithms/off_policy/ddpg.py b/omnisafe/algorithms/off_policy/ddpg.py index 58ab15f12..6d066e0f4 100644 --- a/omnisafe/algorithms/off_policy/ddpg.py +++ 
b/omnisafe/algorithms/off_policy/ddpg.py @@ -48,7 +48,14 @@ def __init__( algo: str = 'DDPG', wrapper_type: str = 'OffPolicyEnvWrapper', ): - """Initialize DDPG.""" + r"""Initialize DDPG. + + Args: + env_id (str): Environment ID. + cfgs (dict): Configuration dictionary. + algo (str): Algorithm name. + wrapper_type (str): Wrapper type. + """ self.env = wrapper_registry.get(wrapper_type)( env_id, use_cost=cfgs.use_cost, @@ -139,7 +146,8 @@ def __init__( self.logger.log('Start with training.') def set_learning_rate_scheduler(self): - """Set up learning rate scheduler.""" + r"""Set up learning rate scheduler.""" + scheduler = None if self.cfgs.linear_lr_decay: # Linear anneal @@ -152,9 +160,8 @@ def linear_anneal(epoch): return scheduler def _init_mpi(self): - """ - Initialize MPI specifics - """ + r"""Initialize MPI specifics.""" + if distributed_utils.num_procs() > 1: # Avoid slowdowns from PyTorch + MPI combo distributed_utils.setup_torch_for_mpi() @@ -179,10 +186,6 @@ def _ac_training_setup(self): param.requires_grad = False for param in self.ac_targ.cost_critic.parameters(): param.requires_grad = False - if self.algo in ['SAC', 'TD3', 'SACLag', 'TD3Lag']: - # Freeze target networks with respect to optimizer (only update via polyak averaging) - for param in self.ac_targ.critic_.parameters(): - param.requires_grad = False def check_distributed_parameters(self): """ @@ -198,21 +201,27 @@ def check_distributed_parameters(self): assert np.allclose(global_min, global_max), f'{key} not synced.' def compute_loss_pi(self, data: dict): - """ + r""" computing pi/actor loss + Args: + data (dict): data dictionary + Returns: torch.Tensor """ action, _ = self.actor_critic.actor.predict(data['obs'], deterministic=True) - loss_pi = self.actor_critic.critic(data['obs'], action) + loss_pi = self.actor_critic.critic(data['obs'], action)[0] pi_info = {} return -loss_pi.mean(), pi_info def compute_loss_v(self, data): - """ + r""" computing value loss + Args: + data (dict): data dictionary + Returns: torch.Tensor """ @@ -223,22 +232,25 @@ def compute_loss_v(self, data): data['obs_next'], data['done'], ) - q_value = self.actor_critic.critic(obs, act) + q_value = self.actor_critic.critic(obs, act)[0] # Bellman backup for Q function with torch.no_grad(): - act_targ, _ = self.ac_targ.actor.predict(obs, deterministic=True) - q_targ = self.ac_targ.critic(obs_next, act_targ) + act_targ = self.ac_targ.actor.predict(obs, deterministic=True, need_log_prob=False) + q_targ = self.ac_targ.critic(obs_next, act_targ)[0] backup = rew + self.cfgs.gamma * (1 - done) * q_targ # MSE loss against Bellman backup loss_q = ((q_value - backup) ** 2).mean() # Useful info for logging - q_info = dict(Q1Vals=q_value.detach().numpy()) + q_info = dict(QVals=q_value.detach().numpy()) return loss_q, q_info def compute_loss_c(self, data): - """ + r""" computing cost loss + Args: + data (dict): data dictionary + Returns: torch.Tensor """ @@ -249,12 +261,12 @@ def compute_loss_c(self, data): data['obs_next'], data['done'], ) - cost_q_value = self.actor_critic.cost_critic(obs, act) + cost_q_value = self.actor_critic.cost_critic(obs, act)[0] # Bellman backup for Q function with torch.no_grad(): - action, _ = self.ac_targ.pi.predict(obs_next, deterministic=True) - qc_targ = self.ac_targ.c(obs_next, action) + action, _ = self.ac_targ.actor.predict(obs_next, deterministic=True) + qc_targ = self.ac_targ.cost_critic(obs_next, action)[0] backup = cost + self.cfgs.gamma * (1 - done) * qc_targ # MSE loss against Bellman backup loss_qc = ((cost_q_value - 
backup) ** 2).mean() @@ -264,7 +276,7 @@ def compute_loss_c(self, data): return loss_qc, qc_info def learn(self): - """ + r""" This is main function for algorithm update, divided into the following steps: (1). self.rollout: collect interactive data from environment (2). self.update: perform actor/critic updates @@ -314,7 +326,11 @@ def learn(self): return self.actor_critic def update(self, data): - """update""" + r"""Update + + Args: + data (dict): data dictionary + """ # First run one gradient descent step for Q. self.update_value_net(data) if self.cfgs.use_cost: @@ -327,7 +343,7 @@ def update(self, data): for param in self.actor_critic.critic.parameters(): param.requires_grad = False - # Next run one gradient descent step for pi. + # Next run one gradient descent step for actor. self.update_policy_net(data) # Unfreeze Q-network so you can optimize it at next DDPG step. @@ -342,7 +358,7 @@ def update(self, data): self.polyak_update_target() def polyak_update_target(self): - """polyak update target network""" + r"""polyak update target network""" with torch.no_grad(): for param, param_targ in zip(self.actor_critic.parameters(), self.ac_targ.parameters()): # Notes: We use an in-place operations "mul_", "add_" to update target @@ -351,7 +367,11 @@ def polyak_update_target(self): param_targ.data.add_((1 - self.cfgs.polyak) * param.data) def update_policy_net(self, data) -> None: - """update policy network""" + r"""update policy network + + Args: + data (dict): data dictionary + """ # Train policy with one steps of gradient descent self.actor_optimizer.zero_grad() loss_pi, _ = self.compute_loss_pi(data) @@ -360,16 +380,24 @@ def update_policy_net(self, data) -> None: self.logger.store(**{'Loss/Pi': loss_pi.item()}) def update_value_net(self, data: dict) -> None: - """update value network""" + r"""update value network + + Args: + data (dict): data dictionary + """ # Train value critic with one steps of gradient descent self.critic_optimizer.zero_grad() loss_q, q_info = self.compute_loss_v(data) loss_q.backward() self.critic_optimizer.step() - self.logger.store(**{'Loss/Value': loss_q.item(), 'Q1Vals': q_info['Q1Vals']}) + self.logger.store(**{'Loss/Value': loss_q.item(), 'QVals': q_info['QVals']}) def update_cost_net(self, data): - """update cost network""" + r"""update cost network + + Args: + data (dict): data dictionary + """ # Train cost critic with one steps of gradient descent self.cost_critic_optimizer.zero_grad() loss_qc, qc_info = self.compute_loss_c(data) @@ -378,7 +406,7 @@ def update_cost_net(self, data): self.logger.store(**{'Loss/Cost': loss_qc.item(), 'QCosts': qc_info['QCosts']}) def test_agent(self): - """test agent""" + r"""Test agent""" for _ in range(self.num_test_episodes): # self.env.set_rollout_cfgs(deterministic=True, rand_a=False) self.env.roll_out( @@ -391,7 +419,7 @@ def test_agent(self): ) def log(self, epoch, total_steps): - """Log info about epoch""" + r"""Log info about epoch""" fps = self.cfgs.steps_per_epoch / (time.time() - self.epoch_time) # Step the actor learning rate scheduler if provided if self.scheduler and self.cfgs.linear_lr_decay: @@ -402,13 +430,13 @@ def log(self, epoch, total_steps): self.logger.log_tabular('Epoch', epoch) self.logger.log_tabular('Metrics/EpRet') - self.logger.log_tabular('Metrics/EpCosts') + self.logger.log_tabular('Metrics/EpCost') self.logger.log_tabular('Metrics/EpLen') self.logger.log_tabular('Test/EpRet') - self.logger.log_tabular('Test/EpCosts') + self.logger.log_tabular('Test/EpCost') self.logger.log_tabular('Test/EpLen') 
self.logger.log_tabular('Values/V', min_and_max=True) - self.logger.log_tabular('Q1Vals') + self.logger.log_tabular('QVals') if self.cfgs.use_cost: self.logger.log_tabular('Values/C', min_and_max=True) self.logger.log_tabular('QCosts') @@ -424,7 +452,7 @@ def log(self, epoch, total_steps): self.logger.log_tabular('Misc/RewScaleMean', reward_scale_mean) self.logger.log_tabular('Misc/RewScaleStddev', reward_scale_stddev) if self.cfgs.exploration_noise_anneal: - noise_std = np.exp(self.actor_critic.pi.log_std[0].item()) + noise_std = np.exp(self.actor_critic.actor.log_std[0].item()) self.logger.log_tabular('Misc/ExplorationNoiseStd', noise_std) self.algorithm_specific_logs() self.logger.log_tabular('TotalEnvSteps', total_steps) diff --git a/omnisafe/algorithms/off_policy/ddpg_lag.py b/omnisafe/algorithms/off_policy/ddpg_lag.py new file mode 100644 index 000000000..32bc1842b --- /dev/null +++ b/omnisafe/algorithms/off_policy/ddpg_lag.py @@ -0,0 +1,114 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementation of the DDPGLag algorithm.""" + + +from omnisafe.algorithms import registry +from omnisafe.algorithms.off_policy.ddpg import DDPG +from omnisafe.common.lagrange import Lagrange + + +@registry.register +class DDPGLag(DDPG, Lagrange): # pylint: disable=too-many-instance-attributes + r"""The Lagrange version of DDPG Algorithm. + + References: + Paper Name: Continuous control with deep reinforcement learning. + Paper author: Timothy P. Lillicrap, Jonathan J. Hunt, Alexander Pritzel, Nicolas Heess, + Tom Erez, Yuval Tassa, David Silver, Daan Wierstra. 
+ Paper URL: https://arxiv.org/abs/1509.02971 + + """ + + def __init__( + self, + env_id: str, + cfgs=None, + algo: str = 'DDPG-Lag', + wrapper_type: str = 'OffPolicyEnvWrapper', + ): + """Initialize DDPG.""" + DDPG.__init__( + self, + env_id=env_id, + cfgs=cfgs, + algo=algo, + wrapper_type=wrapper_type, + ) + + Lagrange.__init__( + self, + cost_limit=self.cfgs.lagrange_cfgs.cost_limit, + lagrangian_multiplier_init=self.cfgs.lagrange_cfgs.lagrangian_multiplier_init, + lambda_lr=self.cfgs.lagrange_cfgs.lambda_lr, + lambda_optimizer=self.cfgs.lagrange_cfgs.lambda_optimizer, + ) + + def algorithm_specific_logs(self): + r"""Use this method to collect log information.""" + super().algorithm_specific_logs() + self.logger.log_tabular('Metrics/LagrangeMultiplier', self.lagrangian_multiplier.item()) + + def compute_loss_pi(self, data: dict): + r""" + computing pi/actor loss + + Args: + data (dict): data from replay buffer + + Returns: + torch.Tensor + """ + action = self.actor_critic.actor.predict( + data['obs'], deterministic=True, need_log_prob=False + ) + loss_pi = self.actor_critic.critic(data['obs'], action)[0] + penalty = self.lambda_range_projection(self.lagrangian_multiplier).item() + loss_pi -= ( + self.lagrangian_multiplier * self.actor_critic.cost_critic(data['obs'], action)[0] + ) + loss_pi /= 1 + penalty + pi_info = {} + return -loss_pi.mean(), pi_info + + def update(self, data): + r"""update""" + Jc = data['cost'].sum().item() + self.update_lagrange_multiplier(Jc) + # First run one gradient descent step for Q. + self.update_value_net(data) + if self.cfgs.use_cost: + self.update_cost_net(data) + for param in self.actor_critic.cost_critic.parameters(): + param.requires_grad = False + + # Freeze Q-network so you don't waste computational effort + # computing gradients for it during the policy learning step. + for param in self.actor_critic.critic.parameters(): + param.requires_grad = False + + # Next run one gradient descent step for pi. + self.update_policy_net(data) + + # Unfreeze Q-network so you can optimize it at next DDPG step. + for param in self.actor_critic.critic.parameters(): + param.requires_grad = True + + if self.cfgs.use_cost: + for param in self.actor_critic.cost_critic.parameters(): + param.requires_grad = True + + # Finally, update target networks by polyak averaging. + self.polyak_update_target() diff --git a/omnisafe/algorithms/off_policy/sac.py b/omnisafe/algorithms/off_policy/sac.py new file mode 100644 index 000000000..fae6793e7 --- /dev/null +++ b/omnisafe/algorithms/off_policy/sac.py @@ -0,0 +1,137 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
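`compute_loss_pi` above maximizes Q(s, pi(s)) - lambda * Qc(s, pi(s)) and rescales by 1 / (1 + lambda), so the effective step size does not blow up as the multiplier grows. A tensor-level sketch of that objective with made-up critic outputs (the softplus projection is only an assumption about what `lambda_range_projection` does):

import torch
import torch.nn.functional as F

# hypothetical critic outputs for a small batch of state-action pairs
q_values = torch.tensor([1.00, 2.00, 0.50])   # reward critic Q(s, pi(s))
qc_values = torch.tensor([0.20, 0.80, 0.10])  # cost critic  Qc(s, pi(s))
lagrangian_multiplier = torch.tensor(0.05)

penalty = F.softplus(lagrangian_multiplier).item()  # assumed projection to a non-negative scalar
loss_pi = -(q_values - lagrangian_multiplier * qc_values).mean() / (1 + penalty)

The same penalized-and-rescaled actor loss reappears below in SACLag and TD3Lag; only the reward term changes.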
+# ============================================================================== +"""Implementation of the DDPG algorithm.""" + + +import torch + +from omnisafe.algorithms import registry +from omnisafe.algorithms.off_policy.ddpg import DDPG + + +@registry.register +class SAC(DDPG): # pylint: disable=too-many-instance-attributes + r"""Implementation of the SAC algorithm. + + References: + Paper Name: Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor + Paper author: Tuomas Haarnoja, Aurick Zhou, Pieter Abbeel, Sergey Levine + Paper URL: https://arxiv.org/abs/1801.01290 + + """ + + def __init__( + self, + env_id: str, + cfgs=None, + algo: str = 'SAC', + wrapper_type: str = 'OffPolicyEnvWrapper', + ): + r"""Initialize SAC.""" + super().__init__( + env_id=env_id, + cfgs=cfgs, + algo=algo, + wrapper_type=wrapper_type, + ) + self.alpha = cfgs.alpha + self.alpha_gamma = cfgs.alpha_gamma + + # pylint: disable=too-many-locals + def compute_loss_v(self, data): + r""" + Computing value loss + + Args: + data (dict): data from replay buffer + + Returns: + torch.Tensor + """ + obs, act, rew, obs_next, done = ( + data['obs'], + data['act'], + data['rew'], + data['obs_next'], + data['done'], + ) + q_value_list = self.actor_critic.critic(obs, act) + # Bellman backup for Q function + with torch.no_grad(): + act_targ, logp_a_next = self.ac_targ.actor.predict( + obs, deterministic=False, need_log_prob=True + ) + q_targ = torch.min(torch.vstack(self.ac_targ.critic(obs_next, act_targ)), dim=0).values + backup = rew + self.cfgs.gamma * (1 - done) * (q_targ - self.alpha * logp_a_next) + # MSE loss against Bellman backup + loss_q = [] + q_values = [] + for q_value in q_value_list: + loss_q.append(torch.mean((q_value - backup) ** 2)) + q_values.append(torch.mean(q_value)) + + # Useful info for logging + q_info = dict(QVals=sum(q_values).detach().numpy()) + return sum(loss_q), q_info + + def compute_loss_pi(self, data: dict): + r""" + Computing pi/actor loss + + Args: + data (dict): data from replay buffer + + Returns: + torch.Tensor + """ + action, logp_a = self.actor_critic.actor.predict( + data['obs'], deterministic=True, need_log_prob=True + ) + loss_pi = self.actor_critic.critic(data['obs'], action)[0] - self.alpha * logp_a + pi_info = {'LogPi': logp_a.detach().numpy()} + return -loss_pi.mean(), pi_info + + def update(self, data): + r"""Update""" + # First run one gradient descent step for Q. + self.update_value_net(data) + if self.cfgs.use_cost: + self.update_cost_net(data) + for param in self.actor_critic.cost_critic.parameters(): + param.requires_grad = False + + # Freeze Q-network so you don't waste computational effort + # computing gradients for it during the policy learning step. + for param in self.actor_critic.critic.parameters(): + param.requires_grad = False + + # Next run one gradient descent step for actor. + self.update_policy_net(data) + + # Unfreeze Q-network so you can optimize it at next DDPG step. + for param in self.actor_critic.critic.parameters(): + param.requires_grad = True + + if self.cfgs.use_cost: + for param in self.actor_critic.cost_critic.parameters(): + param.requires_grad = True + + # Finally, update target networks by polyak averaging. 
+ self.polyak_update_target() + self.alpha_discount() + + def alpha_discount(self): + r"""Alpha discount.""" + self.alpha *= self.alpha_gamma diff --git a/omnisafe/algorithms/off_policy/sac_lag.py b/omnisafe/algorithms/off_policy/sac_lag.py new file mode 100644 index 000000000..cf783777a --- /dev/null +++ b/omnisafe/algorithms/off_policy/sac_lag.py @@ -0,0 +1,151 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementation of the SACLag algorithm.""" + +import torch + +from omnisafe.algorithms import registry +from omnisafe.algorithms.off_policy.sac import SAC +from omnisafe.common.lagrange import Lagrange + + +@registry.register +class SACLag(SAC, Lagrange): # pylint: disable=too-many-instance-attributes + r"""The Lagrange version of SAC algorithm. + + References: + Paper Name: Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor + Paper author: Tuomas Haarnoja, Aurick Zhou, Pieter Abbeel, Sergey Levine + Paper URL: https://arxiv.org/abs/1801.01290 + + """ + + def __init__( + self, + env_id: str, + cfgs=None, + algo: str = 'SAC-Lag', + wrapper_type: str = 'OffPolicyEnvWrapper', + ): + r"""Initialize SACLag. + + Args: + env_id (str): environment id + cfgs (dict): configuration + algo (str): algorithm name + wrapper_type (str): environment wrapper type + """ + SAC.__init__( + self, + env_id=env_id, + cfgs=cfgs, + algo=algo, + wrapper_type=wrapper_type, + ) + + Lagrange.__init__( + self, + cost_limit=self.cfgs.lagrange_cfgs.cost_limit, + lagrangian_multiplier_init=self.cfgs.lagrange_cfgs.lagrangian_multiplier_init, + lambda_lr=self.cfgs.lagrange_cfgs.lambda_lr, + lambda_optimizer=self.cfgs.lagrange_cfgs.lambda_optimizer, + ) + + def algorithm_specific_logs(self): + r""" + Use this method to collect log information. 
+ """ + super().algorithm_specific_logs() + self.logger.log_tabular('Metrics/LagrangeMultiplier', self.lagrangian_multiplier.item()) + + def compute_loss_pi(self, data: dict): + r""" + Computing pi/actor loss + + Returns: + torch.Tensor + """ + action, logp_a = self.actor_critic.actor.predict( + data['obs'], deterministic=True, need_log_prob=True + ) + loss_pi = self.actor_critic.critic(data['obs'], action)[0] - self.alpha * logp_a + penalty = self.lambda_range_projection(self.lagrangian_multiplier).item() + loss_pi -= ( + self.lagrangian_multiplier * self.actor_critic.cost_critic(data['obs'], action)[0] + ) + loss_pi /= 1 + penalty + pi_info = {} + return -loss_pi.mean(), pi_info + + def compute_loss_c(self, data): + r""" + computing cost loss + + Returns: + torch.Tensor + """ + obs, act, cost, obs_next, done = ( + data['obs'], + data['act'], + data['rew'], + data['obs_next'], + data['done'], + ) + cost_q_value = self.actor_critic.cost_critic(obs, act)[0] + + # Bellman backup for Q function + with torch.no_grad(): + act_targ, logp_a_next = self.ac_targ.actor.predict( + obs_next, deterministic=False, need_log_prob=True + ) + qc_targ = self.ac_targ.cost_critic(obs_next, act_targ)[0] + backup = cost + self.cfgs.gamma * (1 - done) * (qc_targ - self.alpha * logp_a_next) + # MSE loss against Bellman backup + loss_qc = ((cost_q_value - backup) ** 2).mean() + # Useful info for logging + qc_info = dict(QCosts=cost_q_value.detach().numpy()) + + return loss_qc, qc_info + + def update(self, data): + r"""update""" + Jc = data['cost'].sum().item() + self.update_lagrange_multiplier(Jc) + # First run one gradient descent step for Q. + self.update_value_net(data) + if self.cfgs.use_cost: + self.update_cost_net(data) + for param in self.actor_critic.cost_critic.parameters(): + param.requires_grad = False + + # Freeze Q-network so you don't waste computational effort + # computing gradients for it during the policy learning step. + for param in self.actor_critic.critic.parameters(): + param.requires_grad = False + + # Next run one gradient descent step for pi. + self.update_policy_net(data) + + # Unfreeze Q-network so you can optimize it at next SAC step. + for param in self.actor_critic.critic.parameters(): + param.requires_grad = True + + if self.cfgs.use_cost: + for param in self.actor_critic.cost_critic.parameters(): + param.requires_grad = True + + # Finally, update target networks by polyak averaging. + self.polyak_update_target() + self.alpha_discount() diff --git a/omnisafe/algorithms/off_policy/sddpg.py b/omnisafe/algorithms/off_policy/sddpg.py new file mode 100644 index 000000000..c0af2c89a --- /dev/null +++ b/omnisafe/algorithms/off_policy/sddpg.py @@ -0,0 +1,189 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Implementation of the SDDPG algorithm.""" + + +import torch + +from omnisafe.algorithms import registry +from omnisafe.algorithms.off_policy.ddpg import DDPG +from omnisafe.utils import distributed_utils +from omnisafe.utils.tools import ( + conjugate_gradients, + get_flat_gradients_from, + get_flat_params_from, + set_param_values_to_model, +) + + +@registry.register +class SDDPG(DDPG): # pylint: disable=too-many-instance-attributes,invalid-name + r"""Implementation of SDDPG Algorithm. + + References: + Paper Name: Lyapunov-based Safe Policy Optimization for Continuous Control. + Paper author: Yinlam Chow, Ofir Nachum, Aleksandra Faust, Edgar Duenez-Guzman, Mohammad Ghavamzadeh. + Paper URL: https://arxiv.org/abs/1901.10031 + + """ + + def __init__( + self, + env_id: str, + cfgs=None, + algo: str = 'SDDPG', + wrapper_type: str = 'OffPolicyEnvWrapper', + ): + r"""Initialize SDDPG. + + Args: + env_id (str): environment id + cfgs (dict): configurations + algo (str): algorithm name + wrapper_type (str): environment wrapper type + """ + super().__init__( + env_id=env_id, + cfgs=cfgs, + algo=algo, + wrapper_type=wrapper_type, + ) + self.beta = cfgs.beta + self.cg_damping = cfgs.cg_damping + self.cg_iters = cfgs.cg_iters + self.fvp_obs = None + self.target_kl = cfgs.target_kl + self.gamma = cfgs.gamma + self.d_init = cfgs.d_init + + def update(self, data): + r"""Update + + Args: + data (dict): data dictionary + """ + # First run one gradient descent step for Q. + self.fvp_obs = data['obs'][::4] + self.update_value_net(data) + if self.cfgs.use_cost: + self.update_cost_net(data) + for param in self.actor_critic.cost_critic.parameters(): + param.requires_grad = False + + # Freeze Q-network so you don't waste computational effort + # computing gradients for it during the policy learning step. + for param in self.actor_critic.critic.parameters(): + param.requires_grad = False + + # Next run one gradient descent step for actor. + self.update_policy_net(data) + + # Unfreeze Q-network so you can optimize it at next DDPG step. + for param in self.actor_critic.critic.parameters(): + param.requires_grad = True + + if self.cfgs.use_cost: + for param in self.actor_critic.cost_critic.parameters(): + param.requires_grad = True + + # Finally, update target networks by polyak averaging. + self.polyak_update_target() + + def Fvp(self, params): + r""" + Build the Hessian-vector product based on an approximation of the KL-divergence. + For details see John Schulman's PhD thesis (pp. 
40) http://joschu.net/docs/thesis.pdf + + Args: + params (torch.Tensor): parameters + + Returns: + flat_grad_grad_kl (torch.Tensor): flat gradient of gradient of KL + """ + self.actor_critic.actor.net.zero_grad() + q_dist = self.actor_critic.actor.get_distribution(self.fvp_obs) + with torch.no_grad(): + p_dist = self.actor_critic.actor.get_distribution(self.fvp_obs) + kl = torch.distributions.kl.kl_divergence(p_dist, q_dist).mean() + + grads = torch.autograd.grad(kl, self.actor_critic.actor.net.parameters(), create_graph=True) + flat_grad_kl = torch.cat([grad.view(-1) for grad in grads]) + + kl_p = (flat_grad_kl * params).sum() + grads = torch.autograd.grad( + kl_p, self.actor_critic.actor.net.parameters(), retain_graph=False + ) + # contiguous indicating, if the memory is contiguously stored or not + flat_grad_grad_kl = torch.cat([grad.contiguous().view(-1) for grad in grads]) + distributed_utils.mpi_avg_torch_tensor(flat_grad_grad_kl) + return flat_grad_grad_kl + params * self.cg_damping + + def compute_loss_cost_performance(self, data): + r"""Compute loss of cost performance + + Args: + data (dict): data dictionary + + Returns: + loss (torch.Tensor): loss of cost performance + """ + # Compute loss + action, _ = self.actor_critic.actor.predict(data['obs'], deterministic=True) + loss_pi = self.actor_critic.cost_critic(data['obs'], action)[0] + pi_info = {} + return loss_pi.mean(), pi_info + + # pylint: disable=invalid-name,too-many-arguments,too-many-locals + def update_policy_net(self, data) -> None: + r"""update policy network + + Args: + data (dict): data dictionary + """ + # Train policy with one steps of gradient descent + theta_old = get_flat_params_from(self.actor_critic.actor.net) + + self.actor_optimizer.zero_grad() + loss_pi, _ = self.compute_loss_pi(data) + loss_pi.backward() + + g_flat = get_flat_gradients_from(self.actor_critic.actor.net) + g_flat *= -1 + + x = conjugate_gradients(self.Fvp, g_flat, self.cg_iters) + assert torch.isfinite(x).all() + + eps = 1.0e-8 + xHx = torch.dot(x, self.Fvp(x)) + + alpha = torch.sqrt(2 * self.target_kl / (xHx + eps)) + + self.actor_optimizer.zero_grad() + loss_cost, _ = self.compute_loss_cost_performance(data) + loss_cost.backward() + + b_flat = get_flat_gradients_from(self.actor_critic.actor.net) + d = conjugate_gradients(self.Fvp, b_flat, self.cg_iters) + dHd = torch.dot(d, self.Fvp(d)) + sHd = torch.dot(d, self.Fvp(d)) + + epsilon = (1 - self.gamma) * (self.d_init - loss_cost) + lambda_star = (-self.beta * epsilon - sHd) / (dHd + eps) + + final_step_dir = -alpha / self.beta * (self.Fvp(x) - lambda_star * self.Fvp(d)) + new_theta = theta_old + final_step_dir + set_param_values_to_model(self.actor_critic.actor.net, new_theta) + + self.logger.store(**{'Loss/Pi': loss_pi.item()}) diff --git a/omnisafe/algorithms/off_policy/td3.py b/omnisafe/algorithms/off_policy/td3.py new file mode 100644 index 000000000..f93e480e6 --- /dev/null +++ b/omnisafe/algorithms/off_policy/td3.py @@ -0,0 +1,82 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
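`update_policy_net` above never materializes the KL Hessian: `Fvp` supplies Hessian-vector products, and `conjugate_gradients` solves H x = g from those products alone. A minimal matrix-free sketch of the classic CG loop (an illustration, not the `omnisafe.utils.tools` implementation):

import torch

def conjugate_gradients_sketch(mvp, b, n_iters=10, eps=1e-8):
    """Solve A x = b when A is only available through the matrix-vector product `mvp`."""
    x = torch.zeros_like(b)
    r = b.clone()  # residual b - A x, with x = 0 initially
    p = b.clone()  # current search direction
    rdotr = torch.dot(r, r)
    for _ in range(n_iters):
        Ap = mvp(p)
        alpha = rdotr / (torch.dot(p, Ap) + eps)
        x += alpha * p
        r -= alpha * Ap
        new_rdotr = torch.dot(r, r)
        p = r + (new_rdotr / (rdotr + eps)) * p
        rdotr = new_rdotr
    return x

# toy symmetric positive-definite matrix standing in for the damped KL Hessian
A = torch.tensor([[2.0, 0.5], [0.5, 1.0]])
g = torch.tensor([1.0, -1.0])
x = conjugate_gradients_sketch(lambda v: A @ v, g)  # x approximately solves A x = g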
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementation of the DDPG algorithm.""" + + +import torch + +from omnisafe.algorithms import registry +from omnisafe.algorithms.off_policy.ddpg import DDPG + + +@registry.register +class TD3(DDPG): # pylint: disable=too-many-instance-attributes + r"""Implementation of TD3 Algorithm. + + References: + Paper Name: Addressing Function Approximation Error in Actor-Critic Methods. + Paper author: Scott Fujimoto, Herke van Hoof, David Meger. + Paper URL: https://arxiv.org/abs/1802.09477 + + """ + + def __init__( + self, + env_id: str, + cfgs=None, + algo: str = 'TD3', + wrapper_type: str = 'OffPolicyEnvWrapper', + ): + """Initialize DDPG.""" + super().__init__( + env_id=env_id, + cfgs=cfgs, + algo=algo, + wrapper_type=wrapper_type, + ) + + def compute_loss_v(self, data): + r""" + computing value loss + + Args: + data (dict): data from replay buffer + + Returns: + torch.Tensor + """ + obs, act, rew, obs_next, done = ( + data['obs'], + data['act'], + data['rew'], + data['obs_next'], + data['done'], + ) + q_value_list = self.actor_critic.critic(obs, act) + # Bellman backup for Q function + with torch.no_grad(): + act_targ = self.ac_targ.actor.predict(obs, deterministic=False, need_log_prob=False) + q_targ = torch.min(torch.vstack(self.ac_targ.critic(obs_next, act_targ)), dim=0).values + backup = rew + self.cfgs.gamma * (1 - done) * q_targ + # MSE loss against Bellman backup + loss_q = [] + q_values = [] + for q_value in q_value_list: + loss_q.append(torch.mean((q_value - backup) ** 2)) + q_values.append(torch.mean(q_value)) + + # Useful info for logging + q_info = dict(QVals=sum(q_values).detach().numpy()) + return sum(loss_q), q_info diff --git a/omnisafe/algorithms/off_policy/td3_lag.py b/omnisafe/algorithms/off_policy/td3_lag.py new file mode 100644 index 000000000..b4a4b6289 --- /dev/null +++ b/omnisafe/algorithms/off_policy/td3_lag.py @@ -0,0 +1,120 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementation of the TD3Lag algorithm.""" + + +from omnisafe.algorithms import registry +from omnisafe.algorithms.off_policy.td3 import TD3 +from omnisafe.common.lagrange import Lagrange + + +@registry.register +class TD3Lag(TD3, Lagrange): # pylint: disable=too-many-instance-attributes + r"""The Lagrange version of TD3 Algorithm. + + References: + Paper Name: Addressing Function Approximation Error in Actor-Critic Methods. + Paper author: Scott Fujimoto, Herke van Hoof, David Meger. + Paper URL: https://arxiv.org/abs/1802.09477 + + """ + + def __init__( + self, + env_id: str, + cfgs=None, + algo: str = 'TD3-Lag', + wrapper_type: str = 'OffPolicyEnvWrapper', + ): + r"""Initialize TD3. 
+ + Args: + env_id (str): environment id + cfgs (dict): configurations + algo (str): algorithm name + wrapper_type (str): environment wrapper type + """ + TD3.__init__( + self, + env_id=env_id, + cfgs=cfgs, + algo=algo, + wrapper_type=wrapper_type, + ) + + Lagrange.__init__( + self, + cost_limit=self.cfgs.lagrange_cfgs.cost_limit, + lagrangian_multiplier_init=self.cfgs.lagrange_cfgs.lagrangian_multiplier_init, + lambda_lr=self.cfgs.lagrange_cfgs.lambda_lr, + lambda_optimizer=self.cfgs.lagrange_cfgs.lambda_optimizer, + ) + + def algorithm_specific_logs(self): + r"""Use this method to collect log information.""" + super().algorithm_specific_logs() + self.logger.log_tabular('Metrics/LagrangeMultiplier', self.lagrangian_multiplier.item()) + + def compute_loss_pi(self, data: dict): + r""" + computing pi/actor loss + + Args: + data (dict): data + + Returns: + torch.Tensor + """ + action = self.actor_critic.actor.predict( + data['obs'], deterministic=True, need_log_prob=False + ) + loss_pi = self.actor_critic.critic(data['obs'], action)[0] + penalty = self.lambda_range_projection(self.lagrangian_multiplier).item() + loss_pi -= ( + self.lagrangian_multiplier * self.actor_critic.cost_critic(data['obs'], action)[0] + ) + loss_pi /= 1 + penalty + pi_info = {} + return -loss_pi.mean(), pi_info + + def update(self, data): + r"""update""" + Jc = data['cost'].sum().item() + self.update_lagrange_multiplier(Jc) + # First run one gradient descent step for Q. + self.update_value_net(data) + if self.cfgs.use_cost: + self.update_cost_net(data) + for param in self.actor_critic.cost_critic.parameters(): + param.requires_grad = False + + # Freeze Q-network so you don't waste computational effort + # computing gradients for it during the policy learning step. + for param in self.actor_critic.critic.parameters(): + param.requires_grad = False + + # Next run one gradient descent step for pi. + self.update_policy_net(data) + + # Unfreeze Q-network so you can optimize it at next TD3 step. + for param in self.actor_critic.critic.parameters(): + param.requires_grad = True + + if self.cfgs.use_cost: + for param in self.actor_critic.cost_critic.parameters(): + param.requires_grad = True + + # Finally, update target networks by polyak averaging. 
+ self.polyak_update_target() diff --git a/omnisafe/algorithms/on_policy/cup.py b/omnisafe/algorithms/on_policy/first_order/cup.py similarity index 98% rename from omnisafe/algorithms/on_policy/cup.py rename to omnisafe/algorithms/on_policy/first_order/cup.py index bbaa32541..099e3b6ad 100644 --- a/omnisafe/algorithms/on_policy/cup.py +++ b/omnisafe/algorithms/on_policy/first_order/cup.py @@ -17,7 +17,7 @@ import torch from omnisafe.algorithms import registry -from omnisafe.algorithms.on_policy.policy_gradient import PolicyGradient +from omnisafe.algorithms.on_policy.base.policy_gradient import PolicyGradient from omnisafe.common.lagrange import Lagrange from omnisafe.utils import distributed_utils @@ -56,6 +56,7 @@ def __init__( lagrangian_multiplier_init=self.cfgs.lagrange_cfgs.lagrangian_multiplier_init, lambda_lr=self.cfgs.lagrange_cfgs.lambda_lr, lambda_optimizer=self.cfgs.lagrange_cfgs.lambda_optimizer, + lagrangian_upper_bound=self.cfgs.lagrange_cfgs.lagrangian_upper_bound, ) self.lam = self.cfgs.lam self.eta = self.cfgs.eta @@ -65,7 +66,7 @@ def __init__( def algorithm_specific_logs(self): super().algorithm_specific_logs() - self.logger.log_tabular('Metrics/LagrangeMultiplier', self.lagrangian_multiplier) + self.logger.log_tabular('Metrics/LagrangeMultiplier', self.lagrangian_multiplier.item()) self.logger.log_tabular('Train/MaxRatio', self.max_ratio) self.logger.log_tabular('Train/MinRatio', self.min_ratio) diff --git a/omnisafe/algorithms/on_policy/pid_lagrange/trpo_pid.py b/omnisafe/algorithms/on_policy/pid_lagrange/trpo_pid.py new file mode 100644 index 000000000..5ebaf5e1b --- /dev/null +++ b/omnisafe/algorithms/on_policy/pid_lagrange/trpo_pid.py @@ -0,0 +1,93 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementation of the TRPO Pid-Lagrange algorithm.""" + +import torch + +from omnisafe.algorithms import registry +from omnisafe.algorithms.on_policy.base.trpo import TRPO +from omnisafe.common.pid_lagrange import PIDLagrangian + + +@registry.register +class TRPOPid(TRPO, PIDLagrangian): + """The Responsive Safety in Reinforcement Learning by PID Lagrangian Methods. + + References: + Paper Name: Responsive Safety in Reinforcement Learning by PID Lagrangian Methods. + Paper author: Joshua Achiam, David Held, Aviv Tamar, Pieter Abbeel. 
+ Paper URL: https://arxiv.org/abs/2007.03964 + + """ + + def __init__( + self, + env_id, + cfgs, + algo: str = 'TRPO-PID', + wrapper_type: str = 'OnPolicyEnvWrapper', + ): + + TRPO.__init__( + self, + env_id=env_id, + cfgs=cfgs, + algo=algo, + wrapper_type=wrapper_type, + ) + PIDLagrangian.__init__(self, **self.cfgs.PID_cfgs._asdict()) + self.cost_limit = self.cfgs.cost_limit + + def algorithm_specific_logs(self): + super().algorithm_specific_logs() + self.logger.log_tabular('Metrics/LagrangeMultiplier', self.cost_penalty) + self.logger.log_tabular('PID/pid_Kp', self.pid_kp) + self.logger.log_tabular('PID/pid_Ki', self.pid_ki) + self.logger.log_tabular('PID/pid_Kd', self.pid_kd) + + def compute_loss_pi(self, data: dict): + """compute loss for policy""" + dist, _log_p = self.actor_critic.actor(data['obs'], data['act']) + ratio = torch.exp(_log_p - data['log_p']) + + # Compute loss via ratio and advantage + loss_pi = -(ratio * data['adv']).mean() + loss_pi -= self.cfgs.entropy_coef * dist.entropy().mean() + + penalty = self.cost_penalty + loss_pi += penalty * (ratio * data['cost_adv']).mean() + loss_pi /= 1 + penalty + + # Useful extra info + approx_kl = 0.5 * (data['log_p'] - _log_p).mean().item() + ent = dist.entropy().mean().item() + pi_info = dict(kl=approx_kl, ent=ent, ratio=ratio.mean().item()) + + return loss_pi, pi_info + + def update(self): + """update policy""" + raw_data, data = self.buf.pre_process_data() + # sub-sampling accelerates calculations + self.fvp_obs = data['obs'][::4] + # Note that logger already uses MPI statistics across all processes.. + ep_costs = self.logger.get_stats('Metrics/EpCost')[0] + # First update Lagrange multiplier parameter + self.pid_update(ep_costs) + # now update policy and value network + self.update_policy_net(data=data) + self.update_value_net(data=data) + self.update_cost_net(data=data) + return raw_data, data diff --git a/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_pid.py b/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_pid.py new file mode 100644 index 000000000..6abfa72e7 --- /dev/null +++ b/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_pid.py @@ -0,0 +1,52 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementation of the Pid Simmer algorithm by PPOLag.""" + + +from omnisafe.algorithms import registry +from omnisafe.algorithms.on_policy.naive_lagrange.ppo_lag import PPOLag + + +@registry.register +class PPOLagSimmerPid(PPOLag): + """Simmer algorithm (PID version) implemented by PPOLag. + + References: + Paper Name: Effects of Safety State Augmentation on Safe Exploration. + Paper author: Aivar Sootla, Alexander I. Cowen-Rivers, Jun Wang, Haitham Bou Ammar. 
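`pid_update(ep_costs)` above replaces the single-gain dual ascent of the naive Lagrangian methods with a PID controller on the constraint violation, which damps the oscillation of the cost penalty. A rough standalone sketch of that idea (the gains, anti-windup, and clipping here are illustrative assumptions, not the `omnisafe.common.pid_lagrange` implementation):

def pid_penalty(ep_cost, cost_limit, state, kp=0.1, ki=0.01, kd=0.01):
    """One PID step on the violation e = Jc - d; returns the new cost penalty."""
    error = ep_cost - cost_limit
    state['integral'] = max(0.0, state['integral'] + error)  # anti-windup: integral stays non-negative
    derivative = max(0.0, error - state['prev_error'])       # react only to rising cost
    state['prev_error'] = error
    return max(0.0, kp * error + ki * state['integral'] + kd * derivative)

state = {'integral': 0.0, 'prev_error': 0.0}
for ep_cost in (40.0, 34.0, 27.0, 23.0):  # hypothetical per-epoch episode costs
    cost_penalty = pid_penalty(ep_cost, cost_limit=25.0, state=state)

The proportional term reacts to the current violation, the integral term accumulates persistent violations, and the derivative term pushes back while cost is still rising.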
+ Paper URL: https://arxiv.org/abs/2206.02675 + """ + + # pylint: disable-next=too-many-arguments + def __init__( + self, + env_id, + cfgs, + algo='ppo_lag_simmer_pid', + wrapper_type: str = 'SimmerEnvWrapper', + ): + r"""Initialize PPOLagSimmerPid algorithm.""" + super().__init__( + env_id=env_id, + cfgs=cfgs, + algo=algo, + wrapper_type=wrapper_type, + ) + + def algorithm_specific_logs(self): + r"""Log the algorithm specific metrics.""" + super().algorithm_specific_logs() + self.logger.log_tabular('Metrics/EpBudget') + self.logger.log_tabular('Metrics/SafetyBudget') diff --git a/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_q.py b/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_q.py new file mode 100644 index 000000000..e92ab1bf9 --- /dev/null +++ b/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_q.py @@ -0,0 +1,52 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementation of the Q Simmer algorithm by PPOLag.""" + + +from omnisafe.algorithms import registry +from omnisafe.algorithms.on_policy.naive_lagrange.ppo_lag import PPOLag + + +@registry.register +class PPOLagSimmerQ(PPOLag): + """Simmer algorithm (Q version) implemented by PPOLag. + + References: + Paper Name: Effects of Safety State Augmentation on Safe Exploration. + Paper author: Aivar Sootla, Alexander I. Cowen-Rivers, Jun Wang, Haitham Bou Ammar. + Paper URL: https://arxiv.org/abs/2206.02675 + """ + + # pylint: disable-next=too-many-arguments + def __init__( + self, + env_id, + cfgs, + algo='ppo_lag_simmer_q', + wrapper_type: str = 'SimmerEnvWrapper', + ): + """Initialize PPOLagSimmerQ algorithm.""" + super().__init__( + env_id=env_id, + cfgs=cfgs, + algo=algo, + wrapper_type=wrapper_type, + ) + + def algorithm_specific_logs(self): + r"""Log the algorithm specific metrics.""" + super().algorithm_specific_logs() + self.logger.log_tabular('Metrics/EpBudget') + self.logger.log_tabular('Metrics/SafetyBudget') diff --git a/omnisafe/algorithms/on_policy/simmer/ppo_simmer_pid.py b/omnisafe/algorithms/on_policy/simmer/ppo_simmer_pid.py new file mode 100644 index 000000000..848a21994 --- /dev/null +++ b/omnisafe/algorithms/on_policy/simmer/ppo_simmer_pid.py @@ -0,0 +1,51 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Implementation of the Pid Simmer algorithm by PPOLag.""" + +from omnisafe.algorithms import registry +from omnisafe.algorithms.on_policy.base.ppo import PPO + + +@registry.register +class PPOSimmerPid(PPO): + """Simmer algorithm (PID version) implemented by PPO. + + References: + Paper Name: Effects of Safety State Augmentation on Safe Exploration. + Paper author: Aivar Sootla, Alexander I. Cowen-Rivers, Jun Wang, Haitham Bou Ammar. + Paper URL: https://arxiv.org/abs/2206.02675 + """ + + # pylint: disable-next=too-many-arguments + def __init__( + self, + env_id, + cfgs, + algo='ppo_simmer_pid', + wrapper_type: str = 'SimmerEnvWrapper', + ) -> None: + r"""Initialize PPOSimmerPid.""" + super().__init__( + env_id=env_id, + cfgs=cfgs, + algo=algo, + wrapper_type=wrapper_type, + ) + + def algorithm_specific_logs(self): + r"""Log the algorithm specific metrics.""" + super().algorithm_specific_logs() + self.logger.log_tabular('Metrics/EpBudget') + self.logger.log_tabular('Metrics/SafetyBudget') diff --git a/omnisafe/algorithms/on_policy/simmer/ppo_simmer_q.py b/omnisafe/algorithms/on_policy/simmer/ppo_simmer_q.py new file mode 100644 index 000000000..11f40c892 --- /dev/null +++ b/omnisafe/algorithms/on_policy/simmer/ppo_simmer_q.py @@ -0,0 +1,50 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementation of the Q Simmer algorithm by PPOLag.""" + +from omnisafe.algorithms import registry +from omnisafe.algorithms.on_policy.base.ppo import PPO + + +@registry.register +class PPOSimmerQ(PPO): + r"""Simmer algorithm (Q version) implemented by PPO. + + References: + Paper Name: Effects of Safety State Augmentation on Safe Exploration. + Paper author: Aivar Sootla, Alexander I. Cowen-Rivers, Jun Wang, Haitham Bou Ammar. + Paper URL: https://arxiv.org/abs/2206.02675 + """ + + # pylint: disable-next=too-many-arguments + def __init__( + self, + env_id, + cfgs, + algo='ppo_simmer_q', + wrapper_type: str = 'SimmerEnvWrapper', + ) -> None: + r"""Initialize PPOSimmerQ.""" + super().__init__( + env_id=env_id, + cfgs=cfgs, + algo=algo, + wrapper_type=wrapper_type, + ) + + def algorithm_specific_logs(self): + super().algorithm_specific_logs() + self.logger.log_tabular('Metrics/EpBudget') + self.logger.log_tabular('Metrics/SafetyBudget') diff --git a/omnisafe/configs/off-policy/CVPO.yaml b/omnisafe/configs/off-policy/CVPO.yaml new file mode 100644 index 000000000..17cd052c0 --- /dev/null +++ b/omnisafe/configs/off-policy/CVPO.yaml @@ -0,0 +1,127 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +defaults: + # --------------------------------------Basic Configurations----------------------------------- # + ## -----------------------------Basic configurations for base class DDPG---------------------- ## + # The random seed + seed: 0 + # Number of epochs + epochs: 500 + # Number of steps per epoch + steps_per_epoch: 6000 + # Update after `update_after` steps + update_after: 1000 + # Update every `update_every` steps + update_every: 50 + # Check if all models own the same parameter values every `check_freq` epoch + check_freq: 25 + # Save model to disk every `check_freq` epochs + save_freq: 10 + # The max length of per epoch + max_ep_len: 1000 + # The number of test episodes + num_test_episodes: 10 + # The learning rate of Actor network + actor_lr: 0.0003 + # The learning rate of Critic network + critic_lr: 0.001 + # The soft update coefficient + polyak: 0.999 + # The discount factor of GAE + gamma: 0.99 + # Actor perdorm random action before `start_steps` steps + start_steps: 10000 + # The Address for saving training process data + data_dir: "./runs" + +## ----------------------------------Basic configurations for base class CVPO------------------- ## + kl_mean_constraint: 0.01 + kl_var_constraint: 0.0001 + kl_constraint: 0.01 + alpha_mean_scale: 1.0 + alpha_var_scale: 100.0 + alpha_scale: 10.0 + alpha_mean_max: 0.1 + alpha_var_max: 10.0 + alpha_max: 1.0 + sample_action_num: 64 + mstep_iteration_num: 5 + dual_constraint: 0.1 + cost_limit: 25 + cost_start: 50 + cost_end: 25 + decay_epoch: 100 + use_cost_decay: True + # ---------------------------------------Optional Configuration-------------------------------- # + ## -----------------------------------Configuration For Cost Critic--------------------------- ## + # Whether to use cost critic + use_cost: True + # Cost discounted factor + cost_gamma: 1.0 + # Whther to use linear decay of learning rate + linear_lr_decay: False + # Whether to use exploration noise anneal + exploration_noise_anneal: False + # Whther to use reward penalty + reward_penalty: False + # Whether to use KL early stopping + kl_early_stopping: False + # Whether to use max gradient norm + use_max_grad_norm: False + # The thereshold of max gradient norm + max_grad_norm: 0.5 + # Whether to use reward scaling + scale_rewards: False + # Whether to use standardized observation + standardized_obs: True + ## ---------------------------------------Configuration For Model----------------------------- ## + model_cfgs: + # Whether to share the weight of Actor network with Critic network + shared_weights: False + # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
+ weight_initialization_mode: "kaiming_uniform" + # Configuration of Actor and Critic network + ac_kwargs: + # Configuration of Actor network + pi: + # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" + actor_type: "cholesky" + cov_min: 1e-4 + mu_clamp_min: -5 + mu_clamp_max: 5 + cov_clamp_min: -5 + cov_clamp_max: 20 + + # The standard deviation of Gaussian noise + act_noise: 0.1 + # Size of hidden layers + hidden_sizes: [400, 300] + # Activation function + activation: relu + # Configuration of Critic network + val: + # Number of critic networks + num_critics: 1 + # Size of hidden layers + hidden_sizes: [400, 300] + # Activation function + activation: relu + ## --------------------------------------Configuration For Buffer----------------------------- ## + replay_buffer_cfgs: + # The size of replay buffer + size: 50000 + # The size of batch + batch_size: 256 diff --git a/omnisafe/configs/off-policy/DDPG.yaml b/omnisafe/configs/off-policy/DDPG.yaml index 48c76ae4c..116ba7f95 100644 --- a/omnisafe/configs/off-policy/DDPG.yaml +++ b/omnisafe/configs/off-policy/DDPG.yaml @@ -14,50 +14,91 @@ # ============================================================================== defaults: - # Basic Configurations - epochs: 100 - steps_per_epoch: 4000 + # --------------------------------------Basic Configurations----------------------------------- # + ## -----------------------------Basic configurations for base class DDPG---------------------- ## + # The random seed + seed: 0 + # Number of epochs + epochs: 500 + # Number of steps per epoch + steps_per_epoch: 6000 + # Update after `update_after` steps update_after: 1000 + # Update every `update_every` steps update_every: 50 + # Check if all models own the same parameter values every `check_freq` epoch check_freq: 25 + # Save model to disk every `check_freq` epochs save_freq: 10 + # The max length of per epoch max_ep_len: 1000 + # The number of test episodes num_test_episodes: 10 - actor_lr: 0.001 + # The learning rate of Actor network + actor_lr: 0.0003 + # The learning rate of Critic network critic_lr: 0.001 + # The soft update coefficient polyak: 0.999 + # The discount factor of GAE gamma: 0.99 + # Actor perdorm random action before `start_steps` steps start_steps: 10000 + # The Address for saving training process data data_dir: "./runs" - seed: 0 - Env_cfgs: - start_step: 1000 - update_every: 100 - # Optional Configuration - ## Whether to use cost critic - use_cost: False + + # ---------------------------------------Optional Configuration-------------------------------- # + ## -----------------------------------Configuration For Cost Critic--------------------------- ## + # Whether to use cost critic + use_cost: True + # Cost discounted factor + cost_gamma: 1.0 + # Whther to use linear decay of learning rate linear_lr_decay: False + # Whether to use exploration noise anneal exploration_noise_anneal: False + # Whther to use reward penalty reward_penalty: False + # Whether to use KL early stopping + kl_early_stopping: False + # Whether to use max gradient norm use_max_grad_norm: False + # The thereshold of max gradient norm max_grad_norm: 0.5 + # Whether to use reward scaling scale_rewards: False + # Whether to use standardized observation standardized_obs: True - + ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: - pi_type: "dire" + # Whether to share the weight of Actor network with Critic network shared_weights: 
False + # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". weight_initialization_mode: "kaiming_uniform" + # Configuration of Actor and Critic network ac_kwargs: + # Configuration of Actor network pi: + # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" + actor_type: "dire" + # The standard deviation of Gaussian noise act_noise: 0.1 + # Size of hidden layers hidden_sizes: [400, 300] + # Activation function activation: relu + # Configuration of Critic network val: + # Number of critic networks + num_critics: 1 + # Size of hidden layers hidden_sizes: [400, 300] + # Activation function activation: relu - ## Configuration For Buffer + ## --------------------------------------Configuration For Buffer----------------------------- ## replay_buffer_cfgs: + # The size of replay buffer size: 50000 + # The size of batch batch_size: 256 diff --git a/omnisafe/configs/off-policy/DDPGLag.yaml b/omnisafe/configs/off-policy/DDPGLag.yaml new file mode 100644 index 000000000..cb6f94a0c --- /dev/null +++ b/omnisafe/configs/off-policy/DDPGLag.yaml @@ -0,0 +1,114 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
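Every `defaults:` block in these YAML files ends up as the attribute-style `cfgs` object the algorithms read (`self.cfgs.steps_per_epoch`, `self.cfgs.lagrange_cfgs.cost_limit`, `cfgs._asdict()`). The `_asdict()` call seen earlier in the patch suggests a namedtuple-like container; the loader below is only a sketch of that pattern, not the actual OmniSafe config machinery:

from collections import namedtuple

import yaml  # PyYAML

def dict_to_namedtuple(name, d):
    """Recursively turn a nested dict into namedtuples so values are reachable as attributes."""
    fields = {
        key: dict_to_namedtuple(key, value) if isinstance(value, dict) else value
        for key, value in d.items()
    }
    return namedtuple(name, fields.keys())(**fields)

with open('omnisafe/configs/off-policy/DDPGLag.yaml', encoding='utf-8') as file:
    cfgs = dict_to_namedtuple('Config', yaml.safe_load(file)['defaults'])

print(cfgs.steps_per_epoch, cfgs.lagrange_cfgs.cost_limit)  # attribute access, like self.cfgs above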
+# ============================================================================== + +defaults: + # --------------------------------------Basic Configurations----------------------------------- # + ## ----------------------------Basic configurations for base class DDPG----------------------- ## + # The random seed + seed: 0 + # Number of epochs + epochs: 500 + # Number of steps per epoch + steps_per_epoch: 6000 + # Update after `update_after` steps + update_after: 1000 + # Update every `update_every` steps + update_every: 50 + # Check if all models own the same parameter values every `check_freq` epoch + check_freq: 25 + # Save model to disk every `check_freq` epochs + save_freq: 10 + # The max length of per epoch + max_ep_len: 1000 + # The number of test episodes + num_test_episodes: 10 + # The learning rate of Actor network + actor_lr: 0.0003 + # The learning rate of Critic network + critic_lr: 0.001 + # The soft update coefficient + polyak: 0.999 + # The discount factor of GAE + gamma: 0.99 + # Actor perdorm random action before `start_steps` steps + start_steps: 10000 + # The Address for saving training process data + data_dir: "./runs" + + + # ---------------------------------------Optional Configuration-------------------------------- # + ## -----------------------------------Configuration For Cost Critic--------------------------- ## + # Whether to use cost critic + use_cost: True + # Cost discounted factor + cost_gamma: 1.0 + # Whther to use linear decay of learning rate + linear_lr_decay: False + # Whether to use exploration noise anneal + exploration_noise_anneal: False + # Whther to use reward penalty + reward_penalty: False + # Whether to use KL early stopping + kl_early_stopping: False + # Whether to use max gradient norm + use_max_grad_norm: False + # The thereshold of max gradient norm + max_grad_norm: 0.5 + # Whether to use reward scaling + scale_rewards: False + # Whether to use standardized observation + standardized_obs: True + ## ---------------------------------------Configuration For Model----------------------------- ## + model_cfgs: + # Whether to share the weight of Actor network with Critic network + shared_weights: False + # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
+ weight_initialization_mode: "kaiming_uniform" + # Configuration of Actor and Critic network + ac_kwargs: + # Configuration of Actor network + pi: + # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" + actor_type: "dire" + # The standard deviation of Gaussian noise + act_noise: 0.1 + # Size of hidden layers + hidden_sizes: [400, 300] + # Activation function + activation: relu + # Configuration of Critic network + val: + # Number of critic networks + num_critics: 1 + # Size of hidden layers + hidden_sizes: [400, 300] + # Activation function + activation: relu + ## --------------------------------------Configuration For Buffer------------------------------- ## + replay_buffer_cfgs: + # The size of replay buffer + size: 50000 + # The size of batch + batch_size: 256 +## ----------------------------------Configuration For Lagrangian multiplier---------------------- ## + lagrange_cfgs: + # Tolerance of constraint violation + cost_limit: 25.0 + # Initial value of lagrangian multiplier + lagrangian_multiplier_init: 0.001 + # Learning rate of lagrangian multiplier + lambda_lr: 0.01 + # Type of lagrangian optimizer + lambda_optimizer: "Adam" diff --git a/omnisafe/configs/off-policy/SAC.yaml b/omnisafe/configs/off-policy/SAC.yaml new file mode 100644 index 000000000..8beafad08 --- /dev/null +++ b/omnisafe/configs/off-policy/SAC.yaml @@ -0,0 +1,108 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
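The `lagrange_cfgs` block above carries exactly the four arguments that `Lagrange.__init__` receives in ddpg_lag.py, sac_lag.py, and td3_lag.py. As a rough sketch of what such a helper usually does with them (a learnable multiplier driven by its own optimizer; this is an illustration, not the `omnisafe.common.lagrange.Lagrange` implementation):

import torch

class MinimalLagrange:
    """Sketch of a learnable Lagrange multiplier updated from the observed episode cost."""

    def __init__(self, cost_limit=25.0, lagrangian_multiplier_init=0.001, lambda_lr=0.01):
        self.cost_limit = cost_limit
        self.lagrangian_multiplier = torch.nn.Parameter(torch.as_tensor(lagrangian_multiplier_init))
        self.lambda_optimizer = torch.optim.Adam([self.lagrangian_multiplier], lr=lambda_lr)

    def update_lagrange_multiplier(self, mean_ep_cost):
        # Gradient ascent on lambda * (Jc - d): lambda grows while the constraint is violated.
        self.lambda_optimizer.zero_grad()
        lambda_loss = -self.lagrangian_multiplier * (mean_ep_cost - self.cost_limit)
        lambda_loss.backward()
        self.lambda_optimizer.step()
        self.lagrangian_multiplier.data.clamp_(min=0.0)  # keep the multiplier non-negative

lagrange = MinimalLagrange()
lagrange.update_lagrange_multiplier(mean_ep_cost=40.0)  # hypothetical over-budget epoch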
+# ============================================================================== + +defaults: + # --------------------------------------Basic Configurations----------------------------------- # + ## ----------------------------Basic configurations for base class DDPG----------------------- ## + # The random seed + seed: 0 + # Number of epochs + epochs: 500 + # Number of steps per epoch + steps_per_epoch: 6000 + # Update after `update_after` steps + update_after: 1000 + # Update every `update_every` steps + update_every: 50 + # Check if all models own the same parameter values every `check_freq` epoch + check_freq: 25 + # Save model to disk every `check_freq` epochs + save_freq: 10 + # The max length of per epoch + max_ep_len: 1000 + # The number of test episodes + num_test_episodes: 10 + # The learning rate of Actor network + actor_lr: 0.0003 + # The learning rate of Critic network + critic_lr: 0.001 + # The soft update coefficient + polyak: 0.999 + # The discount factor of GAE + gamma: 0.99 + # Actor perdorm random action before `start_steps` steps + start_steps: 10000 + # The Address for saving training process data + data_dir: "./runs" + ## ---------------------------Basic configurations for derived class SAC---------------------- ## + # The entropy coefficient + alpha: 0.2 + # The learning rate of Alpha + alpha_gamma: 0.99 + + # ---------------------------------------Optional Configuration-------------------------------- # + ## -----------------------------------Configuration For Cost Critic--------------------------- ## + # Whether to use cost critic + use_cost: False + # Cost discounted factor + cost_gamma: 1.0 + # Whther to use linear decay of learning rate + linear_lr_decay: False + # Whether to use exploration noise anneal + exploration_noise_anneal: False + # Whther to use reward penalty + reward_penalty: False + # Whether to use KL early stopping + kl_early_stopping: False + # Whether to use max gradient norm + use_max_grad_norm: False + # The thereshold of max gradient norm + max_grad_norm: 0.5 + # Whether to use reward scaling + scale_rewards: False + # Whether to use standardized observation + standardized_obs: True + ## ---------------------------------------Configuration For Model----------------------------- ## + model_cfgs: + # Whether to share the weight of Actor network with Critic network + shared_weights: False + # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
+ weight_initialization_mode: "kaiming_uniform" + # Configuration of Actor and Critic network + ac_kwargs: + # Configuration of Actor network + pi: + # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" + actor_type: "gaussian_stdnet" + # The standard deviation of Gaussian noise + act_noise: 0.1 + # Size of hidden layers + hidden_sizes: [400, 300] + # Activation function + activation: relu + # Configuration of Critic network + val: + # Number of critic networks + num_critics: 2 + # Size of hidden layers + hidden_sizes: [400, 300] + # Activation function + activation: relu + ## --------------------------------------Configuration For Buffer----------------------------- ## + replay_buffer_cfgs: + # The size of replay buffer + size: 50000 + # The size of batch + batch_size: 256 diff --git a/omnisafe/configs/off-policy/SACLag.yaml b/omnisafe/configs/off-policy/SACLag.yaml new file mode 100644 index 000000000..42f21b52c --- /dev/null +++ b/omnisafe/configs/off-policy/SACLag.yaml @@ -0,0 +1,117 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +defaults: + # --------------------------------------Basic Configurations----------------------------------- # + ## ----------------------------Basic configurations for base class DDPG----------------------- ## + # The random seed + seed: 0 + # Number of epochs + epochs: 500 + # Number of steps per epoch + steps_per_epoch: 6000 + # Update after `update_after` steps + update_after: 1000 + # Update every `update_every` steps + update_every: 50 + # Check if all models own the same parameter values every `check_freq` epoch + check_freq: 25 + # Save model to disk every `check_freq` epochs + save_freq: 10 + # The max length of per epoch + max_ep_len: 1000 + # The number of test episodes + num_test_episodes: 10 + # The learning rate of Actor network + actor_lr: 0.0003 + # The learning rate of Critic network + critic_lr: 0.001 + # The soft update coefficient + polyak: 0.999 + # The discount factor of GAE + gamma: 0.99 + # Actor perdorm random action before `start_steps` steps + start_steps: 10000 + # The Address for saving training process data + data_dir: "./runs" + ## ---------------------------Basic configurations for derived class SAC---------------------- ## + # The entropy coefficient + alpha: 0.2 + # The learning rate of Alpha + alpha_gamma: 0.99 + # ---------------------------------------Optional Configuration-------------------------------- # + ## -----------------------------------Configuration For Cost Critic--------------------------- ## + # Whether to use cost critic + use_cost: True + # Cost discounted factor + cost_gamma: 1.0 + # Whther to use linear decay of learning rate + linear_lr_decay: False + # Whether to use exploration noise anneal + exploration_noise_anneal: False + # Whther to use reward penalty + reward_penalty: False + # Whether to 
use KL early stopping + kl_early_stopping: False + # Whether to use max gradient norm + use_max_grad_norm: False + # The thereshold of max gradient norm + max_grad_norm: 0.5 + # Whether to use reward scaling + scale_rewards: False + # Whether to use standardized observation + standardized_obs: True + ## ---------------------------------------Configuration For Model----------------------------- ## + model_cfgs: + # Whether to share the weight of Actor network with Critic network + shared_weights: False + # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". + weight_initialization_mode: "kaiming_uniform" + # Configuration of Actor and Critic network + ac_kwargs: + # Configuration of Actor network + pi: + # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" + actor_type: "gaussian_stdnet" + # The standard deviation of Gaussian noise + act_noise: 0.1 + # Size of hidden layers + hidden_sizes: [400, 300] + # Activation function + activation: relu + # Configuration of Critic network + val: + # Number of critic networks + num_critics: 2 + # Size of hidden layers + hidden_sizes: [400, 300] + # Activation function + activation: relu + ## --------------------------------------Configuration For Buffer----------------------------- ## + replay_buffer_cfgs: + # The size of replay buffer + size: 50000 + # The size of batch + batch_size: 256 +## ----------------------------------Configuration For Lagrangian multiplier---------------------- ## + lagrange_cfgs: + # Tolerance of constraint violation + cost_limit: 25.0 + # Initial value of lagrangian multiplier + lagrangian_multiplier_init: 0.001 + # Learning rate of lagrangian multiplier + lambda_lr: 0.01 + # Type of lagrangian optimizer + lambda_optimizer: "Adam" diff --git a/omnisafe/configs/off-policy/SDDPG.yaml b/omnisafe/configs/off-policy/SDDPG.yaml new file mode 100644 index 000000000..45878f10f --- /dev/null +++ b/omnisafe/configs/off-policy/SDDPG.yaml @@ -0,0 +1,114 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +defaults: + # --------------------------------------Basic Configurations----------------------------------- # + ## -----------------------------Basic configurations for base class PG------------------------ ## + # The random seed + seed: 0 + # Number of epochs + epochs: 500 + # Number of steps per epoch + steps_per_epoch: 6000 + # Update after `update_after` steps + update_after: 1000 + # Update every `update_every` steps + update_every: 200 + # Check if all models own the same parameter values every `check_freq` epoch + check_freq: 25 + # Save model to disk every `check_freq` epochs + save_freq: 10 + # The max length of per epoch + max_ep_len: 1000 + # The number of test episodes + num_test_episodes: 10 + # The learning rate of Actor network + actor_lr: 0.0003 + # The learning rate of Critic network + critic_lr: 0.001 + # The soft update coefficient + polyak: 0.999 + # The discount factor of GAE + gamma: 0.99 + # Actor perdorm random action before `start_steps` steps + start_steps: 10000 + # The Address for saving training process data + data_dir: "./runs" + + ## ---------------------------Basic configurations for derived class SDDPG-------------------- ## + # The normalize coefficient + beta: 1.5 + # The discontinuous coefficient for conjugate gradient + cg_damping: 0.1 + # The max iteration for conjugate gradient + cg_iters: 10 + # The constraint for KL divergence + target_kl: 0.01 + # Hypperparameter for SDDPG + d_init: 5 + # ---------------------------------------Optional Configuration-------------------------------- # + ## -----------------------------------Configuration For Cost Critic--------------------------- ## + # Whether to use cost critic + use_cost: True + # Cost discounted factor + cost_gamma: 1.0 + # Whther to use linear decay of learning rate + linear_lr_decay: False + # Whether to use exploration noise anneal + exploration_noise_anneal: False + # Whther to use reward penalty + reward_penalty: False + # Whether to use KL early stopping + kl_early_stopping: False + # Whether to use max gradient norm + use_max_grad_norm: False + # The thereshold of max gradient norm + max_grad_norm: 0.5 + # Whether to use reward scaling + scale_rewards: False + # Whether to use standardized observation + standardized_obs: True + ## ---------------------------------------Configuration For Model----------------------------- ## + model_cfgs: + # Whether to share the weight of Actor network with Critic network + shared_weights: False + # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
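The cg_iters and cg_damping entries in the SDDPG configuration above govern a conjugate-gradient solve of a damped curvature system, which is how the second-order step direction is obtained without forming the matrix explicitly. A self-contained sketch of that solver; fisher_vector_product stands in for whatever Hessian- or Fisher-vector product the algorithm supplies and is an assumed callable, not omnisafe's API:

import torch


def conjugate_gradient(fisher_vector_product, g, cg_iters=10, cg_damping=0.1, eps=1e-8):
    """Approximately solve (F + cg_damping * I) x = g for x."""
    x = torch.zeros_like(g)
    r = g.clone()  # residual; x starts at zero, so r = g initially
    p = g.clone()  # search direction
    rdotr = torch.dot(r, r)
    for _ in range(cg_iters):
        fvp = fisher_vector_product(p) + cg_damping * p
        alpha = rdotr / (torch.dot(p, fvp) + eps)
        x += alpha * p
        r -= alpha * fvp
        new_rdotr = torch.dot(r, r)
        p = r + (new_rdotr / (rdotr + eps)) * p
        rdotr = new_rdotr
    return x

Larger cg_damping makes the system better conditioned at the price of a more conservative step; cg_iters trades accuracy of the solve against wall-clock time.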
+ weight_initialization_mode: "kaiming_uniform" + # Configuration of Actor and Critic network + ac_kwargs: + # Configuration of Actor network + pi: + # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" + actor_type: "dire" + # The standard deviation of Gaussian noise + act_noise: 0.1 + # Size of hidden layers + hidden_sizes: [400, 300] + # Activation function + activation: relu + # Configuration of Critic network + val: + # Number of critic networks + num_critics: 1 + # Size of hidden layers + hidden_sizes: [400, 300] + # Activation function + activation: relu + ## --------------------------------------Configuration For Buffer----------------------------- ## + replay_buffer_cfgs: + # The size of replay buffer + size: 50000 + # The size of batch + batch_size: 256 diff --git a/omnisafe/configs/off-policy/TD3.yaml b/omnisafe/configs/off-policy/TD3.yaml new file mode 100644 index 000000000..90ce7a0ae --- /dev/null +++ b/omnisafe/configs/off-policy/TD3.yaml @@ -0,0 +1,102 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +defaults: + # --------------------------------------Basic Configurations----------------------------------- # + ## ----------------------------Basic configurations for base class DDPG----------------------- ## + # The random seed + seed: 0 + # Number of epochs + epochs: 500 + # Number of steps per epoch + steps_per_epoch: 6000 + # Update after `update_after` steps + update_after: 1000 + # Update every `update_every` steps + update_every: 50 + # Check if all models own the same parameter values every `check_freq` epoch + check_freq: 25 + # Save model to disk every `check_freq` epochs + save_freq: 10 + # The max length of per epoch + max_ep_len: 1000 + # The number of test episodes + num_test_episodes: 10 + # The learning rate of Actor network + actor_lr: 0.0003 + # The learning rate of Critic network + critic_lr: 0.001 + # The soft update coefficient + polyak: 0.999 + # The discount factor of GAE + gamma: 0.99 + # Actor perdorm random action before `start_steps` steps + start_steps: 10000 + # The Address for saving training process data + data_dir: "./runs" + # ---------------------------------------Optional Configuration-------------------------------- # + ## -----------------------------------Configuration For Cost Critic--------------------------- ## + # Whether to use cost critic + use_cost: False + # Cost discounted factor + cost_gamma: 1.0 + # Whther to use linear decay of learning rate + linear_lr_decay: False + # Whether to use exploration noise anneal + exploration_noise_anneal: False + # Whther to use reward penalty + reward_penalty: False + # Whether to use KL early stopping + kl_early_stopping: False + # Whether to use max gradient norm + use_max_grad_norm: False + # The thereshold of max gradient norm + max_grad_norm: 0.5 + # Whether to use reward scaling + 
scale_rewards: False + # Whether to use standardized observation + standardized_obs: True + ## ---------------------------------------Configuration For Model----------------------------- ## + model_cfgs: + # Whether to share the weight of Actor network with Critic network + shared_weights: False + # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". + weight_initialization_mode: "kaiming_uniform" + # Configuration of Actor and Critic network + ac_kwargs: + # Configuration of Actor network + pi: + # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" + actor_type: "dire" + # The standard deviation of Gaussian noise + act_noise: 0.1 + # Size of hidden layers + hidden_sizes: [400, 300] + # Activation function + activation: relu + # Configuration of Critic network + val: + # Number of critic networks + num_critics: 2 + # Size of hidden layers + hidden_sizes: [400, 300] + # Activation function + activation: relu + ## --------------------------------------Configuration For Buffer----------------------------- ## + replay_buffer_cfgs: + # The size of replay buffer + size: 50000 + # The size of batch + batch_size: 256 diff --git a/omnisafe/configs/off-policy/TD3Lag.yaml b/omnisafe/configs/off-policy/TD3Lag.yaml new file mode 100644 index 000000000..49ae779fe --- /dev/null +++ b/omnisafe/configs/off-policy/TD3Lag.yaml @@ -0,0 +1,112 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
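Each of these off-policy configs sets polyak: 0.999, the coefficient of the soft update that makes the target networks trail the online networks. A minimal sketch, under the assumption that polyak is the fraction of the old target that is kept (which a value of 0.999 implies):

import torch


@torch.no_grad()
def polyak_update(net: torch.nn.Module, target_net: torch.nn.Module, polyak: float = 0.999) -> None:
    # target <- polyak * target + (1 - polyak) * online, applied parameter-wise.
    for param, target_param in zip(net.parameters(), target_net.parameters()):
        target_param.data.mul_(polyak)
        target_param.data.add_((1.0 - polyak) * param.data)

Run after every gradient step (or every update_every steps), this keeps the TD targets moving slowly enough for stable bootstrapping.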
+# ============================================================================== + +defaults: + # --------------------------------------Basic Configurations----------------------------------- # + ## ----------------------------Basic configurations for base class DDPG----------------------- ## + # The random seed + seed: 0 + # Number of epochs + epochs: 500 + # Number of steps per epoch + steps_per_epoch: 6000 + # Update after `update_after` steps + update_after: 1000 + # Update every `update_every` steps + update_every: 50 + # Check if all models own the same parameter values every `check_freq` epoch + check_freq: 25 + # Save model to disk every `check_freq` epochs + save_freq: 10 + # The max length of per epoch + max_ep_len: 1000 + # The number of test episodes + num_test_episodes: 10 + # The learning rate of Actor network + actor_lr: 0.0003 + # The learning rate of Critic network + critic_lr: 0.001 + # The soft update coefficient + polyak: 0.999 + # The discount factor of GAE + gamma: 0.99 + # Actor perdorm random action before `start_steps` steps + start_steps: 10000 + # The Address for saving training process data + data_dir: "./runs" + # ---------------------------------------Optional Configuration-------------------------------- # + ## -----------------------------------Configuration For Cost Critic--------------------------- ## + # Whether to use cost critic + use_cost: True + # Cost discounted factor + cost_gamma: 1.0 + # Whther to use linear decay of learning rate + linear_lr_decay: False + # Whether to use exploration noise anneal + exploration_noise_anneal: False + # Whther to use reward penalty + reward_penalty: False + # Whether to use KL early stopping + kl_early_stopping: False + # Whether to use max gradient norm + use_max_grad_norm: False + # The thereshold of max gradient norm + max_grad_norm: 0.5 + # Whether to use reward scaling + scale_rewards: False + # Whether to use standardized observation + standardized_obs: True + ## ---------------------------------------Configuration For Model----------------------------- ## + model_cfgs: + # Whether to share the weight of Actor network with Critic network + shared_weights: False + # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
+ weight_initialization_mode: "kaiming_uniform" + # Configuration of Actor and Critic network + ac_kwargs: + # Configuration of Actor network + pi: + # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" + actor_type: "dire" + # The standard deviation of Gaussian noise + act_noise: 0.1 + # Size of hidden layers + hidden_sizes: [400, 300] + # Activation function + activation: relu + # Configuration of Critic network + val: + # Number of critic networks + num_critics: 2 + # Size of hidden layers + hidden_sizes: [400, 300] + # Activation function + activation: relu + ## --------------------------------------Configuration For Buffer----------------------------- ## + replay_buffer_cfgs: + # The size of replay buffer + size: 50000 + # The size of batch + batch_size: 256 +## ----------------------------------Configuration For Lagrangian multiplier---------------------- ## + lagrange_cfgs: + # Tolerance of constraint violation + cost_limit: 25.0 + # Initial value of lagrangian multiplier + lagrangian_multiplier_init: 0.001 + # Learning rate of lagrangian multiplier + lambda_lr: 0.01 + # Type of lagrangian optimizer + lambda_optimizer: "Adam" diff --git a/omnisafe/configs/on-policy/CPPOPid.yaml b/omnisafe/configs/on-policy/CPPOPid.yaml index 7ceebef06..618d5fe00 100644 --- a/omnisafe/configs/on-policy/CPPOPid.yaml +++ b/omnisafe/configs/on-policy/CPPOPid.yaml @@ -14,79 +14,121 @@ # ============================================================================== defaults: - # Basic Configurations - ## Basic configurations for base class PG + # --------------------------------------Basic Configurations----------------------------------- # + ## -----------------------------Basic configurations for base class PG------------------------ ## + # The random seed + seed: 0 + # Number of epochs epochs: 500 + # Number of steps per epoch steps_per_epoch: 30000 + # Number of update iteration for Actor network actor_iters: 80 + # Number of update iteration for Critic network critic_iters: 40 + # Check if all models own the same parameter values every `check_freq` epoch check_freq: 25 + # Save model to disk every `check_freq` epochs save_freq: 100 + # Entropy coefficient for PPO loss entropy_coef: 0.01 + # The max length of per epoch max_ep_len: 1000 + # The size of mini batch num_mini_batches: 16 + # The learning rate of Actor network actor_lr: 0.0003 + # The learning rate of Critic network critic_lr: 0.001 - use_cost: True - cost_gamma: 1.0 - target_kl: 0.01 + # The Address for saving training process data data_dir: "./runs" - seed: 0 - ## Basic configurations for derived class CPPO_PID - cost_limit: 25.0 - clip: 0. 
- pid_Kp: 0.01 - pid_Ki: 0.01 - pid_Kd: 0.01 - lagrangian_multiplier_init: 0.001 - pid_d_delay: 10 - pid_delta_p_ema_alpha: 0.95 # 0 for hard update, 1 for no update - pid_delta_d_ema_alpha: 0.95 - sum_norm: True # L = (J_r - lam * J_c) / (1 + lam); lam <= 0 - diff_norm: False # L = (1 - lam) * J_r - lam * J_c; 0 <= lam <= 1 - penalty_max: 100 # only used if sum_norm=diff_norm=False + ## -------------------------Basic configurations for derived class CPPOPid-------------------- ## + # The thereshold for KL early stopping + target_kl: 0.01 + # The size of batch for policy update + batch_size: 2000 + # The clip range for PPO loss + clip: 0.2 - # Optional Configuration - ## Whether to use cost critic - use_cost_critic: True + # ---------------------------------------Optional Configuration-------------------------------- # + ## -----------------------------------Configuration For Cost Critic--------------------------- ## + # Whether to use cost critic + use_cost: True + # Cost discounted factor + cost_gamma: 1.0 + # Whther to use linear decay of learning rate linear_lr_decay: False + # Whether to use exploration noise anneal exploration_noise_anneal: True + # Whther to use reward penalty reward_penalty: False + # Whether to use KL early stopping kl_early_stopping: True + # Whether to use max gradient norm use_max_grad_norm: False + # The thereshold of max gradient norm max_grad_norm: 0.5 + # Whether to use reward scaling scale_rewards: False - standardized_obs: False - ## Configuration For Mode + # Whether to use standardized observation + standardized_obs: True + ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: + # Whether to share the weight of Actor network with Critic network shared_weights: False + # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
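CPPOPid replaces the learned Lagrange multiplier with a PID controller driven by the constraint error; the PID_cfgs block a few lines below supplies its gains, EMA smoothing factors, and bounds. A rough sketch of that mechanism, in which the exact error signal, the clamping, and the omitted pid_d_delay buffer are simplifying assumptions rather than a transcription of the repository's code:

class PIDLagrangian:
    """Illustrative PID-controlled penalty; not omnisafe's implementation."""

    def __init__(self, pid_kp=0.01, pid_ki=0.01, pid_kd=0.01,
                 pid_delta_p_ema_alpha=0.95, pid_delta_d_ema_alpha=0.95,
                 cost_limit=25.0, penalty_max=100.0):
        self.kp, self.ki, self.kd = pid_kp, pid_ki, pid_kd
        self.alpha_p = pid_delta_p_ema_alpha  # 0 -> hard update, 1 -> no update
        self.alpha_d = pid_delta_d_ema_alpha
        self.cost_limit = cost_limit
        self.penalty_max = penalty_max
        self.integral = 0.0
        self.error_ema = 0.0  # smoothed proportional error
        self.cost_ema = 0.0   # smoothed cost, used for the derivative term
        self.penalty = 0.0

    def update(self, ep_cost: float) -> float:
        error = ep_cost - self.cost_limit
        # Integral term, clamped at zero so long stretches of satisfied
        # constraint cannot push the penalty negative.
        self.integral = max(self.integral + self.ki * error, 0.0)
        # EMA-smoothed proportional error.
        self.error_ema = self.alpha_p * self.error_ema + (1.0 - self.alpha_p) * error
        # Derivative term reacts only to increases of the smoothed cost.
        derivative = max(ep_cost - self.cost_ema, 0.0)
        self.cost_ema = self.alpha_d * self.cost_ema + (1.0 - self.alpha_d) * ep_cost
        raw = self.kp * self.error_ema + self.integral + self.kd * derivative
        self.penalty = min(max(raw, 0.0), self.penalty_max)
        return self.penalty

The sum_norm and diff_norm flags then decide how this penalty enters the objective, e.g. L = (J_r - lam * J_c) / (1 + lam) when sum_norm is enabled, with penalty_max only applying when both flags are off.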
weight_initialization_mode: "kaiming_uniform" + # Configuration of Actor and Critic network ac_kwargs: + # Configuration of Actor network pi: + # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" actor_type: gaussian_annealing + # Size of hidden layers hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" activation: tanh val: + # Size of hidden layers hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" activation: tanh - ## Configuration For Buffer + ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: + # Reward discounted factor gamma: 0.99 + # Parameters used to estimate future rewards in GAE lam: 0.95 + # Parameters used to estimate future costs in GAE lam_c: 0.95 + # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" adv_estimation_method: gae - standardized_reward: False - standardized_cost: False - reward_penalty: False + # Whether to use standardized reward + standardized_reward: True + # Whether to use standardized cost + standardized_cost: True + + ## --------------------------------------Configuration For PID--------------------------------- ## PID_cfgs: + # KP for PID pid_kp: 0.01 + # KI for PID pid_ki: 0.01 + # KD for PID pid_kd: 0.01 + # The init value of lagrangian multiplier lagrangian_multiplier_init: 0.001 + # The delay rate of KD pid_d_delay: 10 - pid_delta_p_ema_alpha: 0.95 # 0 for hard update, 1 for no update + # 0 for hard update, 1 for no update + pid_delta_p_ema_alpha: 0.95 + # The same as above pid_delta_d_ema_alpha: 0.95 - sum_norm: True # L = (J_r - lam * J_c) / (1 + lam); lam <= 0 - diff_norm: False # L = (1 - lam) * J_r - lam * J_c; 0 <= lam <= 1 - penalty_max: 100 # only used if sum_norm=diff_norm=False + # L = (J_r - lam * J_c) / (1 + lam); lam <= 0 + sum_norm: True + # L = (1 - lam) * J_r - lam * J_c; 0 <= lam <= 1 + diff_norm: False + # Only used if sum_norm=diff_norm=False + penalty_max: 100 + # Tolerance of violation cost_limit: 25.0 diff --git a/omnisafe/configs/on-policy/CUP.yaml b/omnisafe/configs/on-policy/CUP.yaml index 3991f7914..5cd9c3486 100644 --- a/omnisafe/configs/on-policy/CUP.yaml +++ b/omnisafe/configs/on-policy/CUP.yaml @@ -43,7 +43,7 @@ defaults: # The Address for saving training process data data_dir: "./runs" - ## ---------------------------Basic configurations for derived class FOCOPS------------------- ## + ## ----------------------------Basic configurations for derived class CUP-------------------- ## # The thereshold for KL early stopping target_kl: 0.01 # Tolerance of constraint violation @@ -127,3 +127,5 @@ defaults: lambda_lr: 0.035 # Type of lagrangian optimizer lambda_optimizer: "Adam" + # The upper bound of lagrange multiplier + lagrangian_upper_bound: 2.0 diff --git a/omnisafe/configs/on-policy/NPGLag.yaml b/omnisafe/configs/on-policy/NPGLag.yaml index bef481360..c1540f9ab 100644 --- a/omnisafe/configs/on-policy/NPGLag.yaml +++ b/omnisafe/configs/on-policy/NPGLag.yaml @@ -14,63 +14,110 @@ # ============================================================================== defaults: - # Basic Configurations - ## Basic configurations for base class PG + # --------------------------------------Basic Configurations----------------------------------- # + ## -----------------------------Basic configurations for base class 
PG------------------------ ## + # The random seed + seed: 0 + # Number of epochs epochs: 500 - steps_per_epoch: 20000 - actor_iters: 40 - critic_iters: 20 + # Number of steps per epoch + steps_per_epoch: 30000 + # Number of update iterations for Actor network + actor_iters: 80 + # Number of update iterations for Critic network + critic_iters: 40 + # Check if all models own the same parameter values every `check_freq` epoch check_freq: 25 + # Save model to disk every `save_freq` epochs save_freq: 100 + # Entropy coefficient for PPO loss entropy_coef: 0.01 + # The max length of an episode max_ep_len: 1000 + # The size of mini batch num_mini_batches: 16 + # The learning rate of Actor network actor_lr: 0.0003 + # The learning rate of Critic network critic_lr: 0.001 + # The address for saving training process data data_dir: "./runs" - seed: 0 - ## Basic configurations for derived class PDO_Lag + ## --------------------------Basic configurations for derived class NPGLag-------------------- ## + # The threshold for KL early stopping target_kl: 0.01 + # Tolerance of constraint violation + cost_limit: 25 + # Damping value for conjugate gradient cg_damping: 0.1 + # Number of conjugate gradient iterations cg_iters: 10 + # Subsampled observation fvp_obs: None - # Optional Configuration - ## Whether to use cost critic + # ---------------------------------------Optional Configuration-------------------------------- # + ## -----------------------------------Configuration For Cost Critic--------------------------- ## + # Whether to use cost critic use_cost: True + # Cost discounted factor cost_gamma: 1.0 - linear_lr_decay: True + # Whether to use linear decay of learning rate + linear_lr_decay: False + # Whether to use exploration noise anneal exploration_noise_anneal: True + # Whether to use reward penalty reward_penalty: False + # Whether to use KL early stopping kl_early_stopping: True - use_max_grad_norm: True + # Whether to use max gradient norm + use_max_grad_norm: False + # The threshold of max gradient norm max_grad_norm: 0.5 + # Whether to use reward scaling scale_rewards: False + # Whether to use standardized observation standardized_obs: True - ## Configuration For Mode + ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: + # Whether to share the weight of Actor network with Critic network shared_weights: False + # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal".
weight_initialization_mode: "kaiming_uniform" + # Configuration of Actor and Critic network ac_kwargs: + # Configuration of Actor network pi: + # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" actor_type: gaussian_annealing + # Size of hidden layers hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" activation: tanh val: + # Size of hidden layers hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" activation: tanh - ## Configuration For Buffer + ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: + # Reward discounted factor gamma: 0.99 + # Parameters used to estimate future rewards in GAE lam: 0.95 + # Parameters used to estimate future costs in GAE lam_c: 0.95 + # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" adv_estimation_method: gae + # Whether to use standardized reward standardized_reward: True + # Whether to use standardized cost standardized_cost: True - reward_penalty: False - ## Configuration For Lagrange + ## --------------------------------Configuration For Lagrangian multiplier-------------------- ## lagrange_cfgs: + # Tolerance of constraint violation cost_limit: 25.0 - lagrangian_multiplier_init: 0.1 + # Initial value of lagrangian multiplier + lagrangian_multiplier_init: 0.001 + # Learning rate of lagrangian multiplier lambda_lr: 0.035 + # Type of lagrangian optimizer lambda_optimizer: "Adam" diff --git a/omnisafe/configs/on-policy/NaturalPG.yaml b/omnisafe/configs/on-policy/NaturalPG.yaml index 1f81e5f7e..180a7a594 100644 --- a/omnisafe/configs/on-policy/NaturalPG.yaml +++ b/omnisafe/configs/on-policy/NaturalPG.yaml @@ -14,57 +14,100 @@ # ============================================================================== defaults: - # Basic Configurations - ## Basic configurations for base class PG + # --------------------------------------Basic Configurations----------------------------------- # + ## -----------------------------Basic configurations for base class PG------------------------ ## + # The random seed + seed: 0 + # Number of epochs epochs: 500 - steps_per_epoch: 20000 + # Number of steps per epoch + steps_per_epoch: 30000 + # Number of update iteration for Actor network actor_iters: 80 + # Number of update iteration for Critic network critic_iters: 40 + # Check if all models own the same parameter values every `check_freq` epoch check_freq: 25 + # Save model to disk every `check_freq` epochs save_freq: 100 + # Entropy coefficient for PPO loss entropy_coef: 0.01 + # The max length of per epoch max_ep_len: 1000 + # The size of mini batch num_mini_batches: 16 + # The learning rate of Actor network actor_lr: 0.0003 + # The learning rate of Critic network critic_lr: 0.001 + # The Address for saving training process data data_dir: "./runs" - seed: 0 - ## Basic configurations for derived class Natural PG + ## --------------------------Basic configurations for derived class NaturalPG----------------- ## + # The thereshold for KL early stopping target_kl: 0.01 + # Tolerance of constraint violation + cost_limit: 25 + # Damping value for conjugate gradient cg_damping: 0.1 + # Number of conjugate gradient iterations cg_iters: 10 + # Subsampled observation fvp_obs: None - # Optional Configuration - ## Whether to use cost critic - use_cost: False + # 
---------------------------------------Optional Configuration-------------------------------- # + ## -----------------------------------Configuration For Cost Critic--------------------------- ## + # Whether to use cost critic + use_cost: True + # Cost discounted factor cost_gamma: 1.0 + # Whther to use linear decay of learning rate linear_lr_decay: False + # Whether to use exploration noise anneal exploration_noise_anneal: True + # Whther to use reward penalty reward_penalty: False - kl_early_stopping: False + # Whether to use KL early stopping + kl_early_stopping: True + # Whether to use max gradient norm use_max_grad_norm: False + # The thereshold of max gradient norm max_grad_norm: 0.5 + # Whether to use reward scaling scale_rewards: False + # Whether to use standardized observation standardized_obs: True - ## Configuration For Mode + ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: + # Whether to share the weight of Actor network with Critic network shared_weights: False + # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". weight_initialization_mode: "kaiming_uniform" + # Configuration of Actor and Critic network ac_kwargs: + # Configuration of Actor network pi: + # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" actor_type: gaussian_annealing + # Size of hidden layers hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" activation: tanh val: + # Size of hidden layers hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" activation: tanh - ## Configuration For Buffer + ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: + # Reward discounted factor gamma: 0.99 + # Parameters used to estimate future rewards in GAE lam: 0.95 + # Parameters used to estimate future costs in GAE lam_c: 0.95 + # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" adv_estimation_method: gae - standardized_reward: False - standardized_cost: False - reward_penalty: False + # Whether to use standardized reward + standardized_reward: True + # Whether to use standardized cost + standardized_cost: True diff --git a/omnisafe/configs/on-policy/PDO.yaml b/omnisafe/configs/on-policy/PDO.yaml index aa1ecce82..917981a2d 100644 --- a/omnisafe/configs/on-policy/PDO.yaml +++ b/omnisafe/configs/on-policy/PDO.yaml @@ -14,63 +14,110 @@ # ============================================================================== defaults: - # Basic Configurations - ## Basic configurations for base class PG + # --------------------------------------Basic Configurations----------------------------------- # + ## -----------------------------Basic configurations for base class PG------------------------ ## + # The random seed + seed: 0 + # Number of epochs epochs: 500 - steps_per_epoch: 20000 + # Number of steps per epoch + steps_per_epoch: 30000 + # Number of update iteration for Actor network actor_iters: 80 + # Number of update iteration for Critic network critic_iters: 40 + # Check if all models own the same parameter values every `check_freq` epoch check_freq: 25 + # Save model to disk every `check_freq` epochs save_freq: 100 + # Entropy coefficient for PPO loss entropy_coef: 0.01 + # The max length of per epoch 
max_ep_len: 1000 + # The size of mini batch num_mini_batches: 16 + # The learning rate of Actor network actor_lr: 0.0003 + # The learning rate of Critic network critic_lr: 0.001 + # The address for saving training process data data_dir: "./runs" - seed: 0 - ## Basic configurations for derived class PDO + ## --------------------------Basic configurations for derived class PDO----------------------- ## + # The threshold for KL early stopping target_kl: 0.01 + # Tolerance of constraint violation + cost_limit: 25 + # Damping value for conjugate gradient cg_damping: 0.1 + # Number of conjugate gradient iterations cg_iters: 10 + # Subsampled observation fvp_obs: None - # Optional Configuration - ## Whether to use cost critic + # ---------------------------------------Optional Configuration-------------------------------- # + ## -----------------------------------Configuration For Cost Critic--------------------------- ## + # Whether to use cost critic use_cost: True + # Cost discounted factor cost_gamma: 1.0 - linear_lr_decay: True + # Whether to use linear decay of learning rate + linear_lr_decay: False + # Whether to use exploration noise anneal exploration_noise_anneal: True + # Whether to use reward penalty reward_penalty: False + # Whether to use KL early stopping kl_early_stopping: True - use_max_grad_norm: True + # Whether to use max gradient norm + use_max_grad_norm: False + # The threshold of max gradient norm max_grad_norm: 0.5 + # Whether to use reward scaling scale_rewards: False + # Whether to use standardized observation standardized_obs: True - ## Configuration For Mode + ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: + # Whether to share the weight of Actor network with Critic network shared_weights: False + # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal".
weight_initialization_mode: "kaiming_uniform" + # Configuration of Actor and Critic network ac_kwargs: + # Configuration of Actor network pi: + # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" actor_type: gaussian_annealing + # Size of hidden layers hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" activation: tanh val: + # Size of hidden layers hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" activation: tanh - ## Configuration For Buffer + ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: + # Reward discounted factor gamma: 0.99 + # Parameters used to estimate future rewards in GAE lam: 0.95 + # Parameters used to estimate future costs in GAE lam_c: 0.95 + # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" adv_estimation_method: gae - standardized_reward: False + # Whether to use standardized reward + standardized_reward: True + # Whether to use standardized cost standardized_cost: True - reward_penalty: False - ## Configuration For Lagrange + ## ----------------------------------Configuration For Lagrangian multiplier---------------------- ## lagrange_cfgs: + # Tolerance of constraint violation cost_limit: 25.0 + # Initial value of lagrangian multiplier lagrangian_multiplier_init: 0.001 + # Learning rate of lagrangian multiplier lambda_lr: 0.035 + # Type of lagrangian optimizer lambda_optimizer: "Adam" diff --git a/omnisafe/configs/on-policy/PPO.yaml b/omnisafe/configs/on-policy/PPO.yaml index 56a113edb..41452dc01 100644 --- a/omnisafe/configs/on-policy/PPO.yaml +++ b/omnisafe/configs/on-policy/PPO.yaml @@ -14,53 +14,96 @@ # ============================================================================== defaults: - # Basic Configurations + # --------------------------------------Basic Configurations----------------------------------- # + ## -----------------------------Basic configurations for base class PG------------------------ ## + # The random seed + seed: 0 + # Number of epochs epochs: 500 + # Number of steps per epoch steps_per_epoch: 30000 + # Number of update iteration for Actor network actor_iters: 80 + # Number of update iteration for Critic network critic_iters: 40 + # Check if all models own the same parameter values every `check_freq` epoch check_freq: 25 + # Save model to disk every `check_freq` epochs save_freq: 100 + # Entropy coefficient for PPO loss entropy_coef: 0.01 + # The max length of per epoch max_ep_len: 1000 + # The size of mini batch num_mini_batches: 16 + # The learning rate of Actor network actor_lr: 0.0003 + # The learning rate of Critic network critic_lr: 0.001 - target_kl: 0.01 + # The Address for saving training process data data_dir: "./runs" - seed: 0 + ## ---------------------------Basic configurations for derived class PPO---------------------- ## + # The thereshold for KL early stopping + target_kl: 0.01 + # The size of batch for policy update + batch_size: 2000 + # The clip range for PPO loss + clip: 0.2 - # Optional Configuration - ## Whether to use cost critic - use_cost: False + # ---------------------------------------Optional Configuration-------------------------------- # + ## -----------------------------------Configuration For Cost Critic--------------------------- ## + # Whether to use cost critic + use_cost: True + # Cost 
discounted factor cost_gamma: 1.0 - standardized_obs: True - exploration_noise_anneal: True - kl_early_stopping: True + # Whther to use linear decay of learning rate linear_lr_decay: False + # Whether to use exploration noise anneal + exploration_noise_anneal: True + # Whther to use reward penalty reward_penalty: False + # Whether to use KL early stopping + kl_early_stopping: False + # Whether to use max gradient norm use_max_grad_norm: False + # The thereshold of max gradient norm max_grad_norm: 0.5 + # Whether to use reward scaling scale_rewards: False - - ## Configuration For Mode + # Whether to use standardized observation + standardized_obs: False + ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: + # Whether to share the weight of Actor network with Critic network shared_weights: False + # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". weight_initialization_mode: "kaiming_uniform" + # Configuration of Actor and Critic network ac_kwargs: + # Configuration of Actor network pi: + # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" actor_type: gaussian_annealing + # Size of hidden layers hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" activation: tanh val: + # Size of hidden layers hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" activation: tanh - ## Configuration For Buffer + ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: + # Reward discounted factor gamma: 0.99 + # Parameters used to estimate future rewards in GAE lam: 0.95 + # Parameters used to estimate future costs in GAE lam_c: 0.95 + # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" adv_estimation_method: gae - standardized_reward: False - standardized_cost: False - reward_penalty: False + # Whether to use standardized reward + standardized_reward: True + # Whether to use standardized cost + standardized_cost: True diff --git a/omnisafe/configs/on-policy/PPOLag.yaml b/omnisafe/configs/on-policy/PPOLag.yaml index 3f6327ca2..87bb2b7d6 100644 --- a/omnisafe/configs/on-policy/PPOLag.yaml +++ b/omnisafe/configs/on-policy/PPOLag.yaml @@ -42,9 +42,13 @@ defaults: critic_lr: 0.001 # The Address for saving training process data data_dir: "./runs" - ## ---------------------------Basic configurations for derived class PPOLag------------------- ## + ## ---------------------------Basic configurations for derived class PPO---------------------- ## # The thereshold for KL early stopping target_kl: 0.01 + # The size of batch for policy update + batch_size: 2000 + # The clip range for PPO loss + clip: 0.2 # ---------------------------------------Optional Configuration-------------------------------- # ## -----------------------------------Configuration For Cost Critic--------------------------- ## diff --git a/omnisafe/configs/on-policy/PPOLagSimmerPid.yaml b/omnisafe/configs/on-policy/PPOLagSimmerPid.yaml new file mode 100644 index 000000000..2b4910a33 --- /dev/null +++ b/omnisafe/configs/on-policy/PPOLagSimmerPid.yaml @@ -0,0 +1,143 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. 
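The clip: 0.2 entry introduced for PPO.yaml and PPOLag.yaml above is the epsilon of PPO's clipped surrogate objective. For reference, a generic formulation of that loss (a sketch, not the repository's exact loss code):

import torch


def ppo_clip_loss(log_prob: torch.Tensor, old_log_prob: torch.Tensor,
                  advantage: torch.Tensor, clip: float = 0.2) -> torch.Tensor:
    # Probability ratio pi_new(a|s) / pi_old(a|s), computed in log space.
    ratio = torch.exp(log_prob - old_log_prob)
    surrogate = ratio * advantage
    clipped = torch.clamp(ratio, 1.0 - clip, 1.0 + clip) * advantage
    # Negated because optimizers minimize; PPO maximizes the pessimistic surrogate.
    return -torch.min(surrogate, clipped).mean()

target_kl acts as a second guard: when kl_early_stopping is enabled, the actor update loop stops once the measured KL divergence from the old policy exceeds it.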
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +defaults: + # --------------------------------------Basic Configurations----------------------------------- # + ## -----------------------------Basic configurations for base class PG------------------------ ## + # The random seed + seed: 0 + # Number of epochs + epochs: 500 + # Number of steps per epoch + steps_per_epoch: 30000 + # Number of update iteration for Actor network + actor_iters: 80 + # Number of update iteration for Critic network + critic_iters: 40 + # Check if all models own the same parameter values every `check_freq` epoch + check_freq: 25 + # Save model to disk every `check_freq` epochs + save_freq: 100 + # Entropy coefficient for PPO loss + entropy_coef: 0.01 + # The max length of per epoch + max_ep_len: 1000 + # The size of mini batch + num_mini_batches: 16 + # The learning rate of Actor network + actor_lr: 0.0003 + # The learning rate of Critic network + critic_lr: 0.001 + # The Address for saving training process data + data_dir: "./runs" + ## ---------------------------Basic configurations for derived class PPOLag------------------- ## + # The thereshold for KL early stopping + target_kl: 0.01 + # The clip range for PPO loss + clip: 0.2 + + # ---------------------------------------Optional Configuration-------------------------------- # + ## -----------------------------------Configuration For Cost Critic--------------------------- ## + # Whether to use cost critic + use_cost: True + # Cost discounted factor + cost_gamma: 1.0 + # Whther to use linear decay of learning rate + linear_lr_decay: False + # Whether to use exploration noise anneal + exploration_noise_anneal: True + # Whther to use reward penalty + reward_penalty: False + # Whether to use KL early stopping + kl_early_stopping: True + # Whether to use max gradient norm + use_max_grad_norm: False + # The thereshold of max gradient norm + max_grad_norm: 0.5 + # Whether to use reward scaling + scale_rewards: False + # Whether to use standardized observation + standardized_obs: True + ## ---------------------------------------Configuration For Model----------------------------- ## + model_cfgs: + # Whether to share the weight of Actor network with Critic network + shared_weights: False + # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
+ weight_initialization_mode: "kaiming_uniform" + # Configuration of Actor and Critic network + ac_kwargs: + # Configuration of Actor network + pi: + # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" + actor_type: gaussian_annealing + # Size of hidden layers + hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" + activation: tanh + val: + # Size of hidden layers + hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" + activation: tanh + ## --------------------------------------Configuration For Buffer----------------------------- ## + buffer_cfgs: + # Reward discounted factor + gamma: 0.99 + # Parameters used to estimate future rewards in GAE + lam: 0.95 + # Parameters used to estimate future costs in GAE + lam_c: 0.95 + # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" + adv_estimation_method: gae + # Whether to use standardized reward + standardized_reward: True + # Whether to use standardized cost + standardized_cost: True + ## ----------------------------------Configuration For Lagrangian multiplier---------------------- ## + lagrange_cfgs: + # Tolerance of constraint violation + cost_limit: 25.0 + # Initial value of lagrangian multiplier + lagrangian_multiplier_init: 0.001 + # Learning rate of lagrangian multiplier + lambda_lr: 0.035 + # Type of lagrangian optimizer + lambda_optimizer: "Adam" + ## Configuration For Env_Wrapper + env_cfgs: + # The reward when the state is unsafe + unsafe_reward: -0.05 + # The lower bound of safety budget + lower_budget: 15 + # The upper bound of safety budget + upper_budget: 25 + # The dicounted factor + simmer_gamma: 0.999 + # Whether to scale the safety budget + scale_safety_budget: True + # Type of Simmer Controller + simmer_controller: 'PID' + # Configuration of Simmer Controller + controller_cfgs: + # Kp for PID + pid_kp: 0.1 + # Ki for PID + pid_ki: 0.01 + # Kd for PID + pid_kd: 0.01 + # The step size for PID + step_size: 2 + # Lowpass filter coefficient + tau: 0.95 diff --git a/omnisafe/configs/on-policy/PPOLagSimmerQ.yaml b/omnisafe/configs/on-policy/PPOLagSimmerQ.yaml new file mode 100644 index 000000000..cdbcf2190 --- /dev/null +++ b/omnisafe/configs/on-policy/PPOLagSimmerQ.yaml @@ -0,0 +1,145 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
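The env_cfgs added for PPOLagSimmerPid above wrap the environment with a "Simmer" controller that anneals the safety budget between lower_budget and upper_budget using the PID gains in controller_cfgs. A rough sketch of such a controller; the choice of error signal, the low-pass filtering with tau, and the clipping are illustrative assumptions rather than a transcription of omnisafe's wrapper:

class SimmerPIDController:
    """Illustrative budget controller; not omnisafe's implementation."""

    def __init__(self, pid_kp=0.1, pid_ki=0.01, pid_kd=0.01, step_size=2.0, tau=0.95,
                 lower_budget=15.0, upper_budget=25.0):
        self.kp, self.ki, self.kd = pid_kp, pid_ki, pid_kd
        self.step_size = step_size
        self.tau = tau  # low-pass filter coefficient
        self.lower, self.upper = lower_budget, upper_budget
        self.integral = 0.0
        self.filtered_error = 0.0
        self.prev_error = 0.0

    def act(self, safety_budget: float, observed_cost: float) -> float:
        """Return the safety budget to use for the next epoch."""
        error = safety_budget - observed_cost
        # Smooth the error before integrating and differentiating it.
        self.filtered_error = self.tau * self.filtered_error + (1.0 - self.tau) * error
        self.integral += self.filtered_error
        derivative = self.filtered_error - self.prev_error
        self.prev_error = self.filtered_error
        adjustment = self.kp * error + self.ki * self.integral + self.kd * derivative
        new_budget = safety_budget + self.step_size * adjustment
        return min(max(new_budget, self.lower), self.upper)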
+# ============================================================================== + +defaults: + # --------------------------------------Basic Configurations----------------------------------- # + ## -----------------------------Basic configurations for base class PG------------------------ ## + # The random seed + seed: 0 + # Number of epochs + epochs: 500 + # Number of steps per epoch + steps_per_epoch: 30000 + # Number of update iteration for Actor network + actor_iters: 80 + # Number of update iteration for Critic network + critic_iters: 40 + # Check if all models own the same parameter values every `check_freq` epoch + check_freq: 25 + # Save model to disk every `check_freq` epochs + save_freq: 100 + # Entropy coefficient for PPO loss + entropy_coef: 0.01 + # The max length of per epoch + max_ep_len: 1000 + # The size of mini batch + num_mini_batches: 16 + # The learning rate of Actor network + actor_lr: 0.0003 + # The learning rate of Critic network + critic_lr: 0.001 + # The Address for saving training process data + data_dir: "./runs" + ## ---------------------------Basic configurations for derived class PPOLag------------------- ## + # The thereshold for KL early stopping + target_kl: 0.01 + # The clip range for PPO loss + clip: 0.2 + + # ---------------------------------------Optional Configuration-------------------------------- # + ## -----------------------------------Configuration For Cost Critic--------------------------- ## + # Whether to use cost critic + use_cost: True + # Cost discounted factor + cost_gamma: 1.0 + # Whther to use linear decay of learning rate + linear_lr_decay: False + # Whether to use exploration noise anneal + exploration_noise_anneal: True + # Whther to use reward penalty + reward_penalty: False + # Whether to use KL early stopping + kl_early_stopping: True + # Whether to use max gradient norm + use_max_grad_norm: False + # The thereshold of max gradient norm + max_grad_norm: 0.5 + # Whether to use reward scaling + scale_rewards: False + # Whether to use standardized observation + standardized_obs: True + ## ---------------------------------------Configuration For Model----------------------------- ## + model_cfgs: + # Whether to share the weight of Actor network with Critic network + shared_weights: False + # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
+ weight_initialization_mode: "kaiming_uniform" + # Configuration of Actor and Critic network + ac_kwargs: + # Configuration of Actor network + pi: + # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" + actor_type: gaussian_annealing + # Size of hidden layers + hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" + activation: tanh + val: + # Size of hidden layers + hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" + activation: tanh + ## --------------------------------------Configuration For Buffer----------------------------- ## + buffer_cfgs: + # Reward discounted factor + gamma: 0.99 + # Parameters used to estimate future rewards in GAE + lam: 0.95 + # Parameters used to estimate future costs in GAE + lam_c: 0.95 + # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" + adv_estimation_method: gae + # Whether to use standardized reward + standardized_reward: True + # Whether to use standardized cost + standardized_cost: True + ## ----------------------------------Configuration For Lagrangian multiplier---------------------- ## + lagrange_cfgs: + # Tolerance of constraint violation + cost_limit: 25.0 + # Initial value of lagrangian multiplier + lagrangian_multiplier_init: 0.001 + # Learning rate of lagrangian multiplier + lambda_lr: 0.035 + # Type of lagrangian optimizer + lambda_optimizer: "Adam" + ## Configuration For Env_Wrapper + env_cfgs: + # The reward when the state is unsafe + unsafe_reward: -0.05 + # The lower bound of safety budget + lower_budget: 15 + # The upper bound of safety budget + upper_budget: 25 + # The dicounted factor + simmer_gamma: 0.999 + # Whether to scale the safety budget + scale_safety_budget: True + # Type of Simmer Controller + simmer_controller: 'Q' + # Configurations for controller + controller_cfgs: + # The dim of state space + state_dim: 5 + # The dim of action space + act_dim: 3 + # The theshold of safety budget + threshold: 2 + # The learning rate of Q network + q_lr: 0.1 + # The hyperparameter of episilon greedy + epsilon: 0.8 + # Lowpass filter coefficient + tau: 0.95 diff --git a/omnisafe/configs/on-policy/PPOSimmerPid.yaml b/omnisafe/configs/on-policy/PPOSimmerPid.yaml new file mode 100644 index 000000000..469b7340e --- /dev/null +++ b/omnisafe/configs/on-policy/PPOSimmerPid.yaml @@ -0,0 +1,135 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
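The Q-type controller_cfgs above (state_dim: 5, act_dim: 3, q_lr, epsilon, tau) suggest a small tabular Q-learner that chooses among a few discrete budget adjustments. A sketch under two explicit assumptions, namely that the three actions mean lower/keep/raise and that epsilon is the probability of acting greedily; both are guesses about the interface, not omnisafe's actual controller:

import numpy as np


class SimmerQController:
    """Illustrative epsilon-greedy tabular controller."""

    ACTIONS = (-1, 0, +1)  # act_dim: 3 -> lower / keep / raise the budget

    def __init__(self, state_dim=5, act_dim=3, q_lr=0.1, epsilon=0.8, gamma=0.99):
        self.q_table = np.zeros((state_dim, act_dim))
        self.q_lr = q_lr
        self.epsilon = epsilon  # assumed: probability of exploiting the Q-table
        self.gamma = gamma      # assumed discount for the controller itself

    def select_action(self, state: int) -> int:
        # Epsilon-greedy: exploit with probability epsilon, otherwise explore.
        if np.random.rand() < self.epsilon:
            return int(np.argmax(self.q_table[state]))
        return int(np.random.randint(self.q_table.shape[1]))

    def update(self, state: int, action: int, reward: float, next_state: int) -> None:
        # One-step Q-learning update with learning rate q_lr.
        td_target = reward + self.gamma * np.max(self.q_table[next_state])
        td_error = td_target - self.q_table[state, action]
        self.q_table[state, action] += self.q_lr * td_error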
+# ============================================================================== + +defaults: + # --------------------------------------Basic Configurations----------------------------------- # + ## -----------------------------Basic configurations for base class PG------------------------ ## + # The random seed + seed: 0 + # Number of epochs + epochs: 500 + # Number of steps per epoch + steps_per_epoch: 30000 + # Number of update iteration for Actor network + actor_iters: 80 + # Number of update iteration for Critic network + critic_iters: 40 + # Check if all models own the same parameter values every `check_freq` epoch + check_freq: 25 + # Save model to disk every `check_freq` epochs + save_freq: 100 + # Entropy coefficient for PPO loss + entropy_coef: 0.01 + # The max length of per epoch + max_ep_len: 1000 + # The size of mini batch + num_mini_batches: 16 + # The learning rate of Actor network + actor_lr: 0.0003 + # The learning rate of Critic network + critic_lr: 0.001 + # The Address for saving training process data + data_dir: "./runs" + ## ---------------------------Basic configurations for derived class PPOLag------------------- ## + # The thereshold for KL early stopping + target_kl: 0.01 + # The size of batch for policy update + batch_size: 2000 + # The clip range for PPO loss + clip: 0.2 + + # ---------------------------------------Optional Configuration-------------------------------- # + ## -----------------------------------Configuration For Cost Critic--------------------------- ## + # Whether to use cost critic + use_cost: True + # Cost discounted factor + cost_gamma: 1.0 + # Whther to use linear decay of learning rate + linear_lr_decay: False + # Whether to use exploration noise anneal + exploration_noise_anneal: True + # Whther to use reward penalty + reward_penalty: False + # Whether to use KL early stopping + kl_early_stopping: False + # Whether to use max gradient norm + use_max_grad_norm: False + # The thereshold of max gradient norm + max_grad_norm: 0.5 + # Whether to use reward scaling + scale_rewards: False + # Whether to use standardized observation + standardized_obs: False + ## ---------------------------------------Configuration For Model----------------------------- ## + model_cfgs: + # Whether to share the weight of Actor network with Critic network + shared_weights: False + # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
+ weight_initialization_mode: "kaiming_uniform" + # Configuration of Actor and Critic network + ac_kwargs: + # Configuration of Actor network + pi: + # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" + actor_type: gaussian_annealing + # Size of hidden layers + hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" + activation: tanh + val: + # Size of hidden layers + hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" + activation: tanh + ## --------------------------------------Configuration For Buffer----------------------------- ## + buffer_cfgs: + # Reward discounted factor + gamma: 0.99 + # Parameters used to estimate future rewards in GAE + lam: 0.95 + # Parameters used to estimate future costs in GAE + lam_c: 0.95 + # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" + adv_estimation_method: gae + # Whether to use standardized reward + standardized_reward: True + # Whether to use standardized cost + standardized_cost: True + ## Configuration For Env_Wrapper + env_cfgs: + # The reward when the state is unsafe + unsafe_reward: -0.5 + # The lower bound of safety budget + lower_budget: 15 + # The upper bound of safety budget + upper_budget: 25 + # The dicounted factor + simmer_gamma: 0.9997 + # Whether to scale the safety budget + scale_safety_budget: True + # Type of Simmer Controller + simmer_controller: 'PID' + # Configuration of Simmer Controller + controller_cfgs: + # Kp for PID + pid_kp: 0.1 + # Ki for PID + pid_ki: 0.01 + # Kd for PID + pid_kd: 0.01 + # The step size for PID + step_size: 3 + # Lowpass filter coefficient + tau: 0.05 diff --git a/omnisafe/configs/on-policy/PPOSimmerQ.yaml b/omnisafe/configs/on-policy/PPOSimmerQ.yaml new file mode 100644 index 000000000..6b4d4fe44 --- /dev/null +++ b/omnisafe/configs/on-policy/PPOSimmerQ.yaml @@ -0,0 +1,137 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
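Every on-policy config in this patch fills buffer_cfgs with gamma, lam, lam_c and adv_estimation_method: gae. For reference, a compact GAE(lambda) computation over a single finished trajectory; running the same recursion with lam_c in place of lam yields the cost advantages. This is a standalone NumPy sketch, not the repository's buffer code, and it ignores episode truncation handling:

import numpy as np


def gae_advantages(rewards, values, last_value, gamma=0.99, lam=0.95):
    """rewards: (T,), values: (T,) critic estimates, last_value: bootstrap V(s_T)."""
    values = np.append(np.asarray(values, dtype=np.float64), last_value)
    advantages = np.zeros(len(rewards), dtype=np.float64)
    last_gae = 0.0
    for t in reversed(range(len(rewards))):
        # One-step TD residual, then the exponentially weighted recursion.
        delta = rewards[t] + gamma * values[t + 1] - values[t]
        last_gae = delta + gamma * lam * last_gae
        advantages[t] = last_gae
    # Value targets recovered as advantage + V(s_t).
    return advantages, advantages + values[:-1]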
+# ============================================================================== + +defaults: + # --------------------------------------Basic Configurations----------------------------------- # + ## -----------------------------Basic configurations for base class PG------------------------ ## + # The random seed + seed: 0 + # Number of epochs + epochs: 500 + # Number of steps per epoch + steps_per_epoch: 30000 + # Number of update iteration for Actor network + actor_iters: 80 + # Number of update iteration for Critic network + critic_iters: 40 + # Check if all models own the same parameter values every `check_freq` epoch + check_freq: 25 + # Save model to disk every `check_freq` epochs + save_freq: 100 + # Entropy coefficient for PPO loss + entropy_coef: 0.01 + # The max length of per epoch + max_ep_len: 1000 + # The size of mini batch + num_mini_batches: 16 + # The learning rate of Actor network + actor_lr: 0.0003 + # The learning rate of Critic network + critic_lr: 0.001 + # The Address for saving training process data + data_dir: "./runs" + ## ---------------------------Basic configurations for derived class PPOLag------------------- ## + # The thereshold for KL early stopping + target_kl: 0.01 + # The size of batch for policy update + batch_size: 2000 + # The clip range for PPO loss + clip: 0.2 + + # ---------------------------------------Optional Configuration-------------------------------- # + ## -----------------------------------Configuration For Cost Critic--------------------------- ## + # Whether to use cost critic + use_cost: True + # Cost discounted factor + cost_gamma: 1.0 + # Whther to use linear decay of learning rate + linear_lr_decay: False + # Whether to use exploration noise anneal + exploration_noise_anneal: True + # Whther to use reward penalty + reward_penalty: False + # Whether to use KL early stopping + kl_early_stopping: False + # Whether to use max gradient norm + use_max_grad_norm: False + # The thereshold of max gradient norm + max_grad_norm: 0.5 + # Whether to use reward scaling + scale_rewards: False + # Whether to use standardized observation + standardized_obs: False + ## ---------------------------------------Configuration For Model----------------------------- ## + model_cfgs: + # Whether to share the weight of Actor network with Critic network + shared_weights: False + # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
+ weight_initialization_mode: "kaiming_uniform" + # Configuration of Actor and Critic network + ac_kwargs: + # Configuration of Actor network + pi: + # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" + actor_type: gaussian_annealing + # Size of hidden layers + hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" + activation: tanh + val: + # Size of hidden layers + hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" + activation: tanh + ## --------------------------------------Configuration For Buffer----------------------------- ## + buffer_cfgs: + # Reward discounted factor + gamma: 0.99 + # Parameters used to estimate future rewards in GAE + lam: 0.95 + # Parameters used to estimate future costs in GAE + lam_c: 0.95 + # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" + adv_estimation_method: gae + # Whether to use standardized reward + standardized_reward: True + # Whether to use standardized cost + standardized_cost: True + ## Configuration For Env_Wrapper + env_cfgs: + # The reward when the state is unsafe + unsafe_reward: -0.1 + # The lower bound of safety budget + lower_budget: 15 + # The upper bound of safety budget + upper_budget: 25 + # The dicounted factor + simmer_gamma: 0.9997 + # Whether to scale the safety budget + scale_safety_budget: False + # Type of Simmer Controller + simmer_controller: 'Q' + # Configurations for controller + controller_cfgs: + # The dim of state space + state_dim: 5 + # The dim of action space + act_dim: 3 + # The theshold of safety budget + threshold: 2 + # The learning rate of Q network + q_lr: 0.1 + # The hyperparameter of episilon greedy + epsilon: 0.8 + # Lowpass filter coefficient + tau: 0.95 diff --git a/omnisafe/configs/on-policy/PolicyGradient.yaml b/omnisafe/configs/on-policy/PolicyGradient.yaml index 033c62086..4ee16278c 100644 --- a/omnisafe/configs/on-policy/PolicyGradient.yaml +++ b/omnisafe/configs/on-policy/PolicyGradient.yaml @@ -14,53 +14,89 @@ # ============================================================================== defaults: - # Basic Configurations + # --------------------------------------Basic Configurations----------------------------------- # + ## -----------------------------Basic configurations for base class PG------------------------ ## + # The random seed + seed: 0 + # Number of epochs epochs: 500 + # Number of steps per epoch steps_per_epoch: 30000 + # Number of update iteration for Actor network actor_iters: 80 + # Number of update iteration for Critic network critic_iters: 40 + # Check if all models own the same parameter values every `check_freq` epoch check_freq: 25 + # Save model to disk every `check_freq` epochs save_freq: 100 + # Entropy coefficient for PPO loss entropy_coef: 0.01 + # The max length of per epoch max_ep_len: 1000 - num_mini_batches: 32 + # The size of mini batch + num_mini_batches: 16 + # The learning rate of Actor network actor_lr: 0.0003 + # The learning rate of Critic network critic_lr: 0.001 - target_kl: 0.01 + # The Address for saving training process data data_dir: "./runs" - seed: 0 - # Optional Configuration - ## Whether to use cost critic - use_cost: False + # ---------------------------------------Optional Configuration-------------------------------- # + ## -----------------------------------Configuration For Cost 
Critic--------------------------- ## + # Whether to use cost critic + use_cost: True + # Cost discounted factor cost_gamma: 1.0 - use_cost_critic: False + # Whther to use linear decay of learning rate linear_lr_decay: False - exploration_noise_anneal: False + # Whether to use exploration noise anneal + exploration_noise_anneal: True + # Whther to use reward penalty reward_penalty: False + # Whether to use KL early stopping kl_early_stopping: False + # Whether to use max gradient norm use_max_grad_norm: False + # The thereshold of max gradient norm max_grad_norm: 0.5 + # Whether to use reward scaling scale_rewards: False - standardized_obs: True - ## Configuration For Mode + # Whether to use standardized observation + standardized_obs: False + ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: + # Whether to share the weight of Actor network with Critic network shared_weights: False + # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". weight_initialization_mode: "kaiming_uniform" + # Configuration of Actor and Critic network ac_kwargs: + # Configuration of Actor network pi: + # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" actor_type: gaussian_annealing + # Size of hidden layers hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" activation: tanh val: + # Size of hidden layers hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" activation: tanh - ## Configuration For Buffer + ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: + # Reward discounted factor gamma: 0.99 + # Parameters used to estimate future rewards in GAE lam: 0.95 + # Parameters used to estimate future costs in GAE lam_c: 0.95 + # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" adv_estimation_method: gae - standardized_reward: False - standardized_cost: False - reward_penalty: False + # Whether to use standardized reward + standardized_reward: True + # Whether to use standardized cost + standardized_cost: True diff --git a/omnisafe/configs/on-policy/TRPO.yaml b/omnisafe/configs/on-policy/TRPO.yaml index f0f4176fb..3e52373bd 100644 --- a/omnisafe/configs/on-policy/TRPO.yaml +++ b/omnisafe/configs/on-policy/TRPO.yaml @@ -14,57 +14,100 @@ # ============================================================================== defaults: - # Basic Configurations - ## Basic configurations for base class PG + # --------------------------------------Basic Configurations----------------------------------- # + ## -----------------------------Basic configurations for base class PG------------------------ ## + # The random seed + seed: 0 + # Number of epochs epochs: 500 + # Number of steps per epoch steps_per_epoch: 30000 + # Number of update iteration for Actor network actor_iters: 80 + # Number of update iteration for Critic network critic_iters: 40 + # Check if all models own the same parameter values every `check_freq` epoch check_freq: 25 + # Save model to disk every `check_freq` epochs save_freq: 100 + # Entropy coefficient for PPO loss entropy_coef: 0.01 + # The max length of per epoch max_ep_len: 1000 + # The size of mini batch num_mini_batches: 16 + # The learning rate of Actor network actor_lr: 0.0003 + 
# The learning rate of Critic network critic_lr: 0.001 + # The Address for saving training process data data_dir: "./runs" - seed: 0 - ## Basic configurations for derived class Natural PG + ## -----------------------------Basic configurations for derived class CPO--------------------- ## + # The thereshold for KL early stopping target_kl: 0.01 + # Tolerance of constraint violation + cost_limit: 25 + # Damping value for conjugate gradient cg_damping: 0.1 + # Number of conjugate gradient iterations cg_iters: 10 + # Subsampled observation fvp_obs: None - # Optional Configuration - ## Whether to use cost critic - use_cost: False + # ---------------------------------------Optional Configuration-------------------------------- # + ## -----------------------------------Configuration For Cost Critic--------------------------- ## + # Whether to use cost critic + use_cost: True + # Cost discounted factor cost_gamma: 1.0 - exploration_noise_anneal: True - standardized_obs: True + # Whther to use linear decay of learning rate linear_lr_decay: False + # Whether to use exploration noise anneal + exploration_noise_anneal: True + # Whther to use reward penalty reward_penalty: False - kl_early_stopping: False + # Whether to use KL early stopping + kl_early_stopping: True + # Whether to use max gradient norm use_max_grad_norm: False + # The thereshold of max gradient norm max_grad_norm: 0.5 + # Whether to use reward scaling scale_rewards: False - ## Configuration For Mode + # Whether to use standardized observation + standardized_obs: False + ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: + # Whether to share the weight of Actor network with Critic network shared_weights: False + # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
weight_initialization_mode: "kaiming_uniform" + # Configuration of Actor and Critic network ac_kwargs: + # Configuration of Actor network pi: + # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" actor_type: gaussian_annealing + # Size of hidden layers hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" activation: tanh val: + # Size of hidden layers hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" activation: tanh - ## Configuration For Buffer + ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: + # Reward discounted factor gamma: 0.99 + # Parameters used to estimate future rewards in GAE lam: 0.95 + # Parameters used to estimate future costs in GAE lam_c: 0.95 + # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" adv_estimation_method: gae - standardized_reward: False - standardized_cost: False - reward_penalty: False + # Whether to use standardized reward + standardized_reward: True + # Whether to use standardized cost + standardized_cost: True diff --git a/omnisafe/configs/on-policy/TRPOLag.yaml b/omnisafe/configs/on-policy/TRPOLag.yaml index 0e438436f..9ba653926 100644 --- a/omnisafe/configs/on-policy/TRPOLag.yaml +++ b/omnisafe/configs/on-policy/TRPOLag.yaml @@ -75,7 +75,7 @@ defaults: # Whether to use reward scaling scale_rewards: False # Whether to use standardized observation - standardized_obs: True + standardized_obs: False ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: # Whether to share the weight of Actor network with Critic network diff --git a/omnisafe/configs/on-policy/TRPOPid.yaml b/omnisafe/configs/on-policy/TRPOPid.yaml new file mode 100644 index 000000000..87ca151ed --- /dev/null +++ b/omnisafe/configs/on-policy/TRPOPid.yaml @@ -0,0 +1,137 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ==============================================================================
+
+defaults:
+  # --------------------------------------Basic Configurations----------------------------------- #
+  ## -----------------------------Basic configurations for base class PG------------------------ ##
+  # The random seed
+  seed: 0
+  # Number of epochs
+  epochs: 500
+  # Number of steps per epoch
+  steps_per_epoch: 30000
+  # Number of update iterations for Actor network
+  actor_iters: 80
+  # Number of update iterations for Critic network
+  critic_iters: 40
+  # Check if all models own the same parameter values every `check_freq` epochs
+  check_freq: 25
+  # Save model to disk every `save_freq` epochs
+  save_freq: 100
+  # Entropy coefficient for the policy loss
+  entropy_coef: 0.01
+  # The maximum length of an episode
+  max_ep_len: 1000
+  # The number of mini batches
+  num_mini_batches: 16
+  # The learning rate of Actor network
+  actor_lr: 0.0003
+  # The learning rate of Critic network
+  critic_lr: 0.001
+  # The directory for saving training process data
+  data_dir: "./runs"
+  ## -----------------------------Basic configurations for derived class TRPOPid----------------- ##
+  # The threshold for KL early stopping
+  target_kl: 0.01
+  # Tolerance of constraint violation
+  cost_limit: 25
+  # Damping value for conjugate gradient
+  cg_damping: 0.1
+  # Number of conjugate gradient iterations
+  cg_iters: 10
+  # Subsampled observations for computing the Fisher-vector product
+  fvp_obs: None
+
+  # ---------------------------------------Optional Configuration-------------------------------- #
+  ## -----------------------------------Configuration For Cost Critic--------------------------- ##
+  # Whether to use cost critic
+  use_cost: True
+  # Cost discount factor
+  cost_gamma: 1.0
+  # Whether to use linear decay of learning rate
+  linear_lr_decay: False
+  # Whether to anneal the exploration noise
+  exploration_noise_anneal: True
+  # Whether to use reward penalty
+  reward_penalty: False
+  # Whether to use KL early stopping
+  kl_early_stopping: True
+  # Whether to use max gradient norm
+  use_max_grad_norm: False
+  # The threshold of max gradient norm
+  max_grad_norm: 0.5
+  # Whether to use reward scaling
+  scale_rewards: False
+  # Whether to use standardized observations
+  standardized_obs: False
+  ## ---------------------------------------Configuration For Model----------------------------- ##
+  model_cfgs:
+    # Whether to share the weights of Actor network with Critic network
+    shared_weights: False
+    # The mode to initialize the network weights, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal".
+ weight_initialization_mode: "kaiming_uniform" + # Configuration of Actor and Critic network + ac_kwargs: + # Configuration of Actor network + pi: + # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" + actor_type: gaussian_annealing + # Size of hidden layers + hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" + activation: tanh + val: + # Size of hidden layers + hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" + activation: tanh + ## --------------------------------------Configuration For Buffer----------------------------- ## + buffer_cfgs: + # Reward discounted factor + gamma: 0.99 + # Parameters used to estimate future rewards in GAE + lam: 0.95 + # Parameters used to estimate future costs in GAE + lam_c: 0.95 + # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" + adv_estimation_method: gae + # Whether to use standardized reward + standardized_reward: True + # Whether to use standardized cost + standardized_cost: True + ## --------------------------------------Configuration For PID--------------------------------- ## + PID_cfgs: + # KP for PID + pid_kp: 0.01 + # KI for PID + pid_ki: 0.01 + # KD for PID + pid_kd: 0.01 + # The init value of lagrangian multiplier + lagrangian_multiplier_init: 0.001 + # The delay rate of KD + pid_d_delay: 10 + # 0 for hard update, 1 for no update + pid_delta_p_ema_alpha: 0.95 + # The same as above + pid_delta_d_ema_alpha: 0.95 + # L = (J_r - lam * J_c) / (1 + lam); lam <= 0 + sum_norm: True + # L = (1 - lam) * J_r - lam * J_c; 0 <= lam <= 1 + diff_norm: False + # Only used if sum_norm=diff_norm=False + penalty_max: 100 + # Tolerance of violation + cost_limit: 25.0 diff --git a/omnisafe/models/actor/actor_builder.py b/omnisafe/models/actor/actor_builder.py index 8f325f7b2..f12be7eed 100644 --- a/omnisafe/models/actor/actor_builder.py +++ b/omnisafe/models/actor/actor_builder.py @@ -14,12 +14,16 @@ # ============================================================================== """Implementation of ActorBuilder.""" +from typing import Optional + import torch.nn as nn from omnisafe.models.actor.categorical_actor import CategoricalActor +from omnisafe.models.actor.cholesky_actor import MLPCholeskyActor from omnisafe.models.actor.gaussian_annealing_actor import GaussianAnnealingActor from omnisafe.models.actor.gaussian_learning_actor import GaussianLearningActor from omnisafe.models.actor.gaussian_stdnet_actor import GaussianStdNetActor +from omnisafe.models.actor.mlp_actor import MLPActor from omnisafe.utils.model_utils import Activation, InitFunction @@ -36,6 +40,7 @@ def __init__( activation: Activation = 'relu', weight_initialization_mode: InitFunction = 'xavier_uniform', shared: nn.Module = None, + act_noise: Optional[float] = None, ) -> None: self.obs_dim = obs_dim self.act_dim = act_dim @@ -43,6 +48,7 @@ def __init__( self.activation = activation self.weight_initialization_mode = weight_initialization_mode self.shared = shared + self.act_noise = act_noise def build_actor(self, actor_type: str, **kwargs): """Build actor network.""" @@ -86,4 +92,26 @@ def build_actor(self, actor_type: str, **kwargs): shared=self.shared, **kwargs, ) + if actor_type == 'dire': + return MLPActor( + obs_dim=self.obs_dim, + act_dim=self.act_dim, + act_noise=self.act_noise, + hidden_sizes=self.hidden_sizes, + 
activation=self.activation, + weight_initialization_mode=self.weight_initialization_mode, + shared=self.shared, + **kwargs, + ) + if actor_type == 'cholesky': + return MLPCholeskyActor( + obs_dim=self.obs_dim, + act_dim=self.act_dim, + hidden_sizes=self.hidden_sizes, + activation=self.activation, + weight_initialization_mode=self.weight_initialization_mode, + shared=self.shared, + **kwargs, + ) + raise NotImplementedError(f'Actor type {actor_type} is not implemented.') diff --git a/omnisafe/models/actor/gaussian_annealing_actor.py b/omnisafe/models/actor/gaussian_annealing_actor.py index 3bdf51014..065abc801 100644 --- a/omnisafe/models/actor/gaussian_annealing_actor.py +++ b/omnisafe/models/actor/gaussian_annealing_actor.py @@ -69,7 +69,7 @@ def _distribution(self, obs): mean = self.net(obs) return Normal(mean, self._std) - def predict(self, obs, deterministic=False, need_log_prob=False): + def predict(self, obs, deterministic=False, need_log_prob=True): dist = self._distribution(obs) if deterministic: out = dist.mean diff --git a/omnisafe/models/actor/gaussian_stdnet_actor.py b/omnisafe/models/actor/gaussian_stdnet_actor.py index 5ae43116b..2c67f2052 100644 --- a/omnisafe/models/actor/gaussian_stdnet_actor.py +++ b/omnisafe/models/actor/gaussian_stdnet_actor.py @@ -30,9 +30,9 @@ def __init__( self, obs_dim, act_dim, - act_min: torch.Tensor, act_max: torch.Tensor, - hidden_sizes, + act_min: torch.Tensor, + hidden_sizes: list, activation, weight_initialization_mode, shared=None, @@ -94,12 +94,13 @@ def predict(self, obs, deterministic=False, need_log_prob=False): action = torch.tanh(out) action = self.act_min + (action + 1) * 0.5 * (self.act_max - self.act_min) + action = torch.clamp(action, self.act_min, self.act_max) if need_log_prob: log_prob = dist.log_prob(out).sum(axis=-1) log_prob -= torch.log(1.00001 - torch.tanh(out) ** 2).sum(axis=-1) - return out, log_prob - return out + return action.to(torch.float32), log_prob + return action.to(torch.float32) def forward(self, obs, act=None): dist = self._distribution(obs) diff --git a/omnisafe/models/actor/mlp_actor.py b/omnisafe/models/actor/mlp_actor.py index ccd8a3bf2..616e3d55e 100644 --- a/omnisafe/models/actor/mlp_actor.py +++ b/omnisafe/models/actor/mlp_actor.py @@ -32,15 +32,18 @@ def __init__( obs_dim: int, act_dim: int, act_noise, - act_limit, + act_max, + act_min, hidden_sizes: list, activation: Activation, weight_initialization_mode: InitFunction = 'xavier_uniform', shared: nn.Module = None, ): super().__init__(obs_dim, act_dim, hidden_sizes, activation) - self.act_limit = act_limit + self.act_max = act_max + self.act_min = act_min self.act_noise = act_noise + self._std = 0.5 * torch.ones(self.act_dim, dtype=torch.float32) if shared is not None: # use shared layers action_head = build_mlp_network( @@ -62,15 +65,24 @@ def _distribution(self, obs): mean = self.net(obs) return Normal(mean, self._std) + def get_distribution(self, obs): + """Get the distribution of actor.""" + return self._distribution(obs) + def forward(self, obs, act=None): - """forward""" + """Forward""" # Return output from network scaled to action space limits. 
- return self.act_limit * self.net(obs) + return self.act_max * self.net(obs) - def predict(self, obs, deterministic=False, need_log_prob=False): + def predict(self, obs, deterministic=False, need_log_prob=True): if deterministic: - action = self.act_limit * self.net(obs) + action = self.act_max * self.net(obs) else: - action = self.act_limit * self.net(obs) + action = self.act_max * self.net(obs) action += self.act_noise * np.random.randn(self.act_dim) - return action.to(torch.float32), torch.tensor(1, dtype=torch.float32) + + action = torch.clamp(action, self.act_min, self.act_max) + if need_log_prob: + return action.to(torch.float32), torch.tensor(1, dtype=torch.float32) + + return action.to(torch.float32) diff --git a/omnisafe/models/actor_q_critic.py b/omnisafe/models/actor_q_critic.py index 66dc66a83..8e6d49708 100644 --- a/omnisafe/models/actor_q_critic.py +++ b/omnisafe/models/actor_q_critic.py @@ -17,8 +17,8 @@ import numpy as np import torch import torch.nn as nn -from gymnasium.spaces import Box +from omnisafe.models.actor import ActorBuilder from omnisafe.models.actor.mlp_actor import MLPActor from omnisafe.models.critic.q_critic import QCritic from omnisafe.utils.model_utils import build_mlp_network @@ -45,15 +45,10 @@ def __init__( self.obs_shape = observation_space.shape self.obs_oms = OnlineMeanStd(shape=self.obs_shape) if standardized_obs else None self.act_dim = action_space.shape[0] - self.act_limit = action_space.high[0] + self.act_max = torch.as_tensor(action_space.high) + self.act_min = torch.as_tensor(action_space.low) self.ac_kwargs = model_cfgs.ac_kwargs # build policy and value functions - if isinstance(action_space, Box): - if model_cfgs.pi_type == 'dire': - actor_fn = MLPActor - act_dim = action_space.shape[0] - else: - raise ValueError self.obs_dim = observation_space.shape[0] @@ -71,31 +66,42 @@ def __init__( else: shared = None - self.actor = actor_fn( + actor_builder = ActorBuilder( obs_dim=self.obs_dim, - act_dim=act_dim, + act_dim=self.act_dim, act_noise=model_cfgs.ac_kwargs.pi.act_noise, - act_limit=self.act_limit, hidden_sizes=model_cfgs.ac_kwargs.pi.hidden_sizes, activation=model_cfgs.ac_kwargs.pi.activation, weight_initialization_mode=weight_initialization_mode, shared=shared, ) + + if self.ac_kwargs.pi.actor_type == 'cholesky': + self.actor = actor_builder.build_actor( + self.ac_kwargs.pi.actor_type, + act_max=self.act_max, + act_min=self.act_min, + cov_min=self.ac_kwargs.pi.cov_min, + mu_clamp_min=self.ac_kwargs.pi.mu_clamp_min, + mu_clamp_max=self.ac_kwargs.pi.mu_clamp_max, + cov_clamp_min=self.ac_kwargs.pi.cov_clamp_min, + cov_clamp_max=self.ac_kwargs.pi.cov_clamp_max, + ) + else: + self.actor = actor_builder.build_actor( + self.ac_kwargs.pi.actor_type, + act_max=self.act_max, + act_min=self.act_min, + ) + self.critic = QCritic( self.obs_dim, - act_dim, - hidden_sizes=model_cfgs.ac_kwargs.val.hidden_sizes, - activation=model_cfgs.ac_kwargs.val.activation, - weight_initialization_mode=weight_initialization_mode, - shared=shared, - ) - self.critic_ = QCritic( - self.obs_dim, - act_dim, + self.act_dim, hidden_sizes=model_cfgs.ac_kwargs.val.hidden_sizes, activation=model_cfgs.ac_kwargs.val.activation, weight_initialization_mode=weight_initialization_mode, shared=shared, + num_critics=model_cfgs.ac_kwargs.val.num_critics, ) def step(self, obs, deterministic=False): @@ -120,7 +126,8 @@ def step(self, obs, deterministic=False): else: action, logp_a = self.pi.predict(obs, determinstic=deterministic) value = self.v(obs, action) - action = 
np.clip(action.numpy(), -self.act_limit, self.act_limit) + action = action.to(torch.float32) + action = np.clip(action.numpy(), self.act_min, self.act_max) return action, value.numpy(), logp_a.numpy() @@ -130,8 +137,8 @@ def anneal_exploration(self, frac): frac: progress of epochs, i.e. current epoch / total epochs e.g. 10 / 100 = 0.1 """ - if hasattr(self.pi, 'set_log_std'): - self.pi.set_log_std(1 - frac) + if hasattr(self.actor, 'set_log_std'): + self.actor.set_log_std(1 - frac) def forward(self, obs, act): """Compute the value of a given state-action pair.""" diff --git a/omnisafe/models/constraint_actor_q_critic.py b/omnisafe/models/constraint_actor_q_critic.py index e6116866e..049d636bf 100644 --- a/omnisafe/models/constraint_actor_q_critic.py +++ b/omnisafe/models/constraint_actor_q_critic.py @@ -14,7 +14,6 @@ # ============================================================================== """Implementation of ConstraintActorQCritic.""" -import numpy as np import torch from omnisafe.models.actor_q_critic import ActorQCritic @@ -67,9 +66,10 @@ def step(self, obs, deterministic=False): # Note: Update RMS in Algorithm.running_statistics() method # self.obs_oms.update(obs) if self.training else None obs = self.obs_oms(obs) - action, logp_a = self.actor.predict(obs, deterministic=deterministic) - value = self.critic(obs, action) - cost_value = self.cost_critic(obs, action) - action = np.clip(action.numpy(), -self.act_limit, self.act_limit) + action, logp_a = self.actor.predict( + obs, deterministic=deterministic, need_log_prob=True + ) + value = self.critic(obs, action)[0] + cost_value = self.cost_critic(obs, action)[0] - return action, value.numpy(), cost_value.numpy(), logp_a.numpy() + return action.numpy(), value.numpy(), cost_value.numpy(), logp_a.numpy() diff --git a/omnisafe/models/critic/q_critic.py b/omnisafe/models/critic/q_critic.py index 2b4db763e..904df3701 100644 --- a/omnisafe/models/critic/q_critic.py +++ b/omnisafe/models/critic/q_critic.py @@ -34,6 +34,7 @@ def __init__( activation: Activation = 'relu', weight_initialization_mode: InitFunction = 'xavier_uniform', shared: nn.Module = None, + num_critics: int = 1, ) -> None: """Initialize.""" Critic.__init__( @@ -45,17 +46,22 @@ def __init__( weight_initialization_mode=weight_initialization_mode, shared=shared, ) - self.obs_encoder = build_mlp_network( - [obs_dim, hidden_sizes[0]], - activation=activation, - output_activation=activation, - weight_initialization_mode=weight_initialization_mode, - ) - self.net = build_mlp_network( - [hidden_sizes[0] + act_dim] + hidden_sizes[1:] + [1], - activation=activation, - weight_initialization_mode=weight_initialization_mode, - ) + self.critic_list = [] + for idx in range(num_critics): + obs_encoder = build_mlp_network( + [obs_dim, hidden_sizes[0]], + activation=activation, + output_activation=activation, + weight_initialization_mode=weight_initialization_mode, + ) + net = build_mlp_network( + [hidden_sizes[0] + act_dim] + hidden_sizes[1:] + [1], + activation=activation, + weight_initialization_mode=weight_initialization_mode, + ) + critic = nn.Sequential(obs_encoder, net) + self.critic_list.append(critic) + self.add_module(f'critic_{idx}', critic) def forward( self, @@ -63,5 +69,8 @@ def forward( act: Optional[torch.Tensor] = None, ): """Forward.""" - obs = self.obs_encoder(obs) - return torch.squeeze(self.net(torch.cat([obs, act], dim=-1)), -1) + res = [] + for critic in self.critic_list: + encodered_obs = critic[0](obs) + res.append(torch.squeeze(critic[1](torch.cat([encodered_obs, 
act], dim=-1)), -1)) + return res diff --git a/omnisafe/utils/algo_utils.py b/omnisafe/utils/algo_utils.py index 4c10341c5..cb944bb8b 100644 --- a/omnisafe/utils/algo_utils.py +++ b/omnisafe/utils/algo_utils.py @@ -13,3 +13,54 @@ # limitations under the License. # ============================================================================== """Implementation of the algo utils.""" +import torch + + +def bt(m: torch.tensor): + return m.transpose(dim0=-2, dim1=-1) + + +def btr(m: torch.tensor): + return m.diagonal(dim1=-2, dim2=-1).sum(-1) + + +def safe_inverse(A, det): + indices = torch.where(det <= 1e-6) + # pseudoinverse + if len(indices[0]) > 0: + return torch.linalg.pinv(A) + return A.inverse() + + +def gaussian_kl(μi, μ, Ai, A): + """ + decoupled KL between two multivariate gaussian distribution + C_μ = KL(f(x|μi,Σi)||f(x|μ,Σi)) + C_Σ = KL(f(x|μi,Σi)||f(x|μi,Σ)) + :param μi: (B, n) + :param μ: (B, n) + :param Ai: (B, n, n) + :param A: (B, n, n) + :return: C_μ, C_Σ: scalar + mean and covariance terms of the KL + :return: mean of determinanats of Σi, Σ + ref : https://stanford.edu/~jduchi/projects/general_notes.pdf page.13 + """ + n = A.size(-1) + μi = μi.unsqueeze(-1) # (B, n, 1) + μ = μ.unsqueeze(-1) # (B, n, 1) + Σi = Ai @ bt(Ai) # (B, n, n) + Σ = A @ bt(A) # (B, n, n) + Σi_det = Σi.det() # (B,) + Σ_det = Σ.det() # (B,) + Σi_inv = safe_inverse(Σi, Σi_det) # (B, n, n) + Σ_inv = safe_inverse(Σ, Σ_det) # (B, n, n) + # determinant can be minus due to numerical calculation error + # https://github.com/daisatojp/mpo/issues/11 + Σi_det = torch.clamp_min(Σi_det, 1e-6) + Σ_det = torch.clamp_min(Σ_det, 1e-6) + inner_μ = ((μ - μi).transpose(-2, -1) @ Σi_inv @ (μ - μi)).squeeze() # (B,) + inner_Σ = torch.log(Σ_det / Σi_det) - n + btr(Σ_inv @ Σi) # (B,) + C_μ = 0.5 * torch.mean(inner_μ) + C_Σ = 0.5 * torch.mean(inner_Σ) + return C_μ, C_Σ, torch.mean(Σi_det), torch.mean(Σ_det) diff --git a/omnisafe/wrappers/__init__.py b/omnisafe/wrappers/__init__.py index 9eb8141a6..482ce94af 100644 --- a/omnisafe/wrappers/__init__.py +++ b/omnisafe/wrappers/__init__.py @@ -14,5 +14,8 @@ # ============================================================================== """Environment wrappers.""" +from omnisafe.wrappers.early_terminated_wrapper import EarlyTerminatedEnvWrapper from omnisafe.wrappers.off_policy_wrapper import OffPolicyEnvWrapper from omnisafe.wrappers.on_policy_wrapper import OnPolicyEnvWrapper +from omnisafe.wrappers.saute_wrapper import SauteEnvWrapper +from omnisafe.wrappers.simmer_wrapper import SimmerEnvWrapper diff --git a/omnisafe/wrappers/off_policy_wrapper.py b/omnisafe/wrappers/off_policy_wrapper.py index ee42c0f54..83710981f 100644 --- a/omnisafe/wrappers/off_policy_wrapper.py +++ b/omnisafe/wrappers/off_policy_wrapper.py @@ -53,7 +53,6 @@ def __init__( # self.deterministic = False self.local_steps_per_epoch = None self.cost_gamma = None - self.use_cost = None self.penalty_param = None def make(self): @@ -130,7 +129,7 @@ def roll_out( **{ 'Metrics/EpRet': ep_ret, 'Metrics/EpLen': ep_len, - 'Metrics/EpCosts': ep_cost, + 'Metrics/EpCost': ep_cost, } ) self.curr_o, _ = self.env.reset(seed=self.seed) @@ -142,7 +141,7 @@ def roll_out( **{ 'Test/EpRet': ep_ret, 'Test/EpLen': ep_len, - 'Test/EpCosts': ep_cost, + 'Test/EpCost': ep_cost, } ) self.curr_o, _ = self.env.reset(seed=self.seed) diff --git a/omnisafe/wrappers/simmer_wrapper.py b/omnisafe/wrappers/simmer_wrapper.py new file mode 100644 index 000000000..8d1a6fb35 --- /dev/null +++ b/omnisafe/wrappers/simmer_wrapper.py @@ -0,0 +1,508 @@ +# 
Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY pid_kiND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""env_wrapper""" + +import copy + +import numpy as np +import torch +from gymnasium import spaces + +from omnisafe.wrappers.on_policy_wrapper import OnPolicyEnvWrapper +from omnisafe.wrappers.wrapper_registry import WRAPPER_REGISTRY + + +class PidController: # pylint: disable=too-many-instance-attributes + """Using PID controller to control the safety budget in Simmer environment.""" + + def __init__( + self, + cfgs, + safety_budget: float = 25.0, + lower_budget: float = 1.0, + upper_budget: float = 25.0, + ) -> None: + r"""Initialize the PID controller. + + Args: + cfgs (CfgNode): Configurations. + safety_budget (float): The initial safety budget. + lower_budget (float): The lower bound of safety budget. + upper_budget (float): The upper bound of safety budget. + """ + # PID parameters. + self.pid_kp = cfgs.pid_kp + self.pid_ki = cfgs.pid_ki + self.pid_kd = cfgs.pid_kd + + # Low pass filter. + self.tau = cfgs.tau + + # Initialize the PID controller. + self.error = 0.0 + self.error_i = 0.0 + self.prev_action = 0 + self.prev_raw_action = 0 + self.step_size = cfgs.step_size + + # Set the initial safety budget. + self.safety_budget = safety_budget + self.lower_budget = lower_budget + self.upper_budget = upper_budget + + def compute_raw_action(self, obs: float): + r"""Compute the raw action based on current obs. + + Args: + obs (float): The current observation. + + Returns: + float: The raw action. + """ + + # Low pass filter. + error_p = self.tau * self.error + (1 - self.tau) * (self.safety_budget - obs) + self.error_i += self.error + error_d = self.pid_kd * (self.prev_action - self.prev_raw_action) + + # Compute PID error. + curr_raw_action = self.pid_kp * error_p + self.pid_ki * self.error_i + self.pid_kd * error_d + return curr_raw_action + + def act(self, obs: float): + r"""Compute the safety budget based on the observation ``Jc``. + + Args: + obs (float): The current observation. + + Returns: + float: The safety budget. + """ + curr_raw_action = self.compute_raw_action(obs) + + # Clip the raw action. + curr_action = np.clip(curr_raw_action, -self.step_size, self.step_size) + self.prev_action = curr_action + self.prev_raw_action = curr_raw_action + raw_budget = self.safety_budget + curr_action + + # Clip the safety budget. + self.safety_budget = np.clip(raw_budget, self.lower_budget, self.upper_budget) + + return self.safety_budget + + +class QController: # pylint: disable=too-many-instance-attributes + """Using Q-learning to control the safety budget in Simmer environment.""" + + def __init__( + self, + cfgs, + safety_budget: float = 25.0, + lower_budget: float = 1.0, + upper_budget: float = 25.0, + ) -> None: + r""" " + Initialize the Q-learning controller. + + Args: + cfgs (CfgNode): The config file. + safety_budget (float): The initial safety budget. 
+ lower_budget (float): The lower bound of the safety budget. + upper_budget (float): The upper bound of the safety budget. + """ + + # Set the initial safety budget. + self.lower_budget = lower_budget + self.upper_budget = upper_budget + + # Initialize the Q-learning controller. + self.state_dim = cfgs.state_dim + self.act_dim = cfgs.act_dim + self.q_function = np.zeros((cfgs.state_dim, cfgs.act_dim)) + self.state_space = np.linspace(self.lower_budget, self.upper_budget, cfgs.state_dim) + self.action_space = np.linspace(-1, 1, cfgs.act_dim, dtype=int) + self.state = safety_budget + self.init_idx = np.argwhere(self.state_space == self.state) + self.action = 0 + self.step(self.action) + + # Set the Q-learning parameters. + self.tau = cfgs.tau + self.threshold = cfgs.threshold + self.q_lr = cfgs.q_lr + + # Use epsilon greedy to explore the environment. + self.epsilon = cfgs.epsilon + + # Initialize the observation (Cost value per epoch) buffer. + self.prev_obs = copy.copy(self.state) + self.filtered_obs_buffer = [] + self.filtered_obs = 0 + + def get_state_idx(self, state: float): + r"""Get the state index. + + Args: + state (float): The current state. + + Returns: + int: The state index.""" + state_idx = np.argwhere(self.state_space == state)[0][0] + return state_idx + + def get_action_idx(self, action: float): + r"""Get the action index. + + Args: + action (float): The current action. + + Returns: + int: The action index. + """ + action_idx = np.argwhere(self.action_space == action) + return action_idx + + def get_random_action(self): + r"""Get the random action. + + Returns: + float: The random action. + """ + action_idx = np.random.randint(0, self.act_dim) + return self.action_space[action_idx] + + def get_greedy_action(self, state: float): + r"""Get the greedy action. + + Args: + state (float): The current state(``cost_limit``). + + Returns: + float: The greedy action.""" + state_idx = self.get_state_idx(state) + action_idx = np.argmax(self.q_function[state_idx, :]) + action = self.action_space[action_idx] + return action + + def update_q_function(self, state: float, action: float, reward: float, next_state: float): + r"""Update the Q function using the Bellman equation. + + Args: + state (float): The current state. + action (float): The current action. + reward (float): The reward. + next_state (float): The next state. + """ + state_idx = self.get_state_idx(state) + action_idx = self.get_action_idx(action) + next_state_idx = self.get_state_idx(next_state) + self.q_function[state_idx, action_idx] = (1 - self.q_lr) * self.q_function[ + state_idx, action_idx + ] + self.q_lr * (reward + self.tau * np.max(self.q_function[next_state_idx, :])) + + def step(self, action: float): + r"""Step the environment. + + Args: + action (float): The current action. + """ + state_idx = self.get_state_idx(self.state) + state_idx = np.clip(state_idx + action, 0, self.state_dim - 1, dtype=int) + self.state = self.state_space[state_idx] + return self.state + + def reward(self, state: float, action: float, obs: float): + r"""Get the reward function based on whether the observation is within the threshold. + + Args: + state (float): The current state. + action (float): The current action. + obs (float): The observation. + + Returns: + float: The reward. 
+ """ + action_idx = self.get_action_idx(action) + if int(self.threshold > obs - state and obs - state > -self.threshold): + reward = np.array([-1, 1, 0.5])[action_idx] + elif int(obs - state <= -self.threshold): + reward = np.array([-1, 0, 2])[action_idx] + elif int(obs - state >= self.threshold): + reward = np.array([2, -1, -1])[action_idx] + return reward[0] + + def act(self, obs: float): + r"""Return the safety budget based on the observation. + + Args: + obs (float): The observation. + + Returns: + float: The safety budget. + """ + prev_obs = self.filtered_obs + self.filtered_obs = self.tau * prev_obs + (1 - self.tau) * obs + self.filtered_obs_buffer.append(self.filtered_obs) + state = self.state + + # Use epsilon greedy to explore the environment + epsilon = np.random.random() + if epsilon > self.epsilon: + action = self.get_random_action() + else: + action = self.get_greedy_action(state) + reward = self.reward(state, action, self.filtered_obs) + next_state = self.step(action) + safety_budget = next_state + + # Update the Q function + self.update_q_function(state, action, reward, next_state) + return safety_budget + + +@WRAPPER_REGISTRY.register +class SimmerEnvWrapper(OnPolicyEnvWrapper): # pylint: disable=too-many-instance-attributes + """Wrapper for the Simmer environment.""" + + def __init__( + self, + env_id, + cfgs, + render_mode=None, + ) -> None: + r"""Initialize the Simmer environment wrapper. + + Args: + env_id (str): The environment id. + cfgs (Config): The configuration. + render_mode (str): The render mode. + """ + super().__init__(env_id, render_mode) + + self.unsafe_reward = cfgs.unsafe_reward + self.simmer_gamma = cfgs.simmer_gamma + if cfgs.scale_safety_budget: + self.safety_budget = ( + cfgs.lower_budget + * (1 - self.simmer_gamma**self.max_ep_len) + / (1 - self.simmer_gamma) + / np.float32(self.max_ep_len) + ) + self.lower_budget = ( + cfgs.lower_budget + * (1 - self.simmer_gamma**self.max_ep_len) + / (1 - self.simmer_gamma) + / np.float32(self.max_ep_len) + ) + self.upper_budget = ( + cfgs.upper_budget + * (1 - self.simmer_gamma**self.max_ep_len) + / (1 - self.simmer_gamma) + / np.float32(self.max_ep_len) + ) + else: + self.safety_budget = cfgs.lower_budget + self.lower_budget = cfgs.lower_budget + self.upper_budget = cfgs.upper_budget + self.rel_safety_budget = self.safety_budget / self.upper_budget + self.safety_obs = self.rel_safety_budget + high = np.array(np.hstack([self.env.observation_space.high, np.inf]), dtype=np.float32) + low = np.array(np.hstack([self.env.observation_space.low, np.inf]), dtype=np.float32) + self.observation_space = spaces.Box(high=high, low=low) + if cfgs.simmer_controller == 'PID': + self.controller = PidController( + cfgs.controller_cfgs, + safety_budget=self.safety_budget, + lower_budget=self.lower_budget, + upper_budget=self.upper_budget, + ) + elif cfgs.simmer_controller == 'Q': + self.controller = QController( + cfgs.controller_cfgs, + safety_budget=self.safety_budget, + lower_budget=self.lower_budget, + upper_budget=self.upper_budget, + ) + else: + raise NotImplementedError( + f'Controller type {cfgs.simmer_controller} is not implemented.' + ) + + def augment_obs(self, obs: np.array, safety_obs: np.array): + r"""Augmenting the obs with the safety obs, if needed. + + Args: + obs (np.array): The observation. + safety_obs (np.array): The safety observation. + + Returns: + np.array: The augmented observation. 
+ """ + augmented_obs = np.hstack([obs, safety_obs]) + return augmented_obs + + def safety_step(self, cost: np.ndarray) -> np.ndarray: + r"""Update the normalized safety obs. + + Args: + cost (np.ndarray): The cost. + + Returns: + np.ndarray: The normalized safety obs. + """ + self.safety_obs -= cost / self.upper_budget + self.safety_obs /= self.simmer_gamma + return self.safety_obs + + def safety_reward(self, reward: np.ndarray, next_safety_obs: np.ndarray) -> np.ndarray: + r"""Update the reward based on the safety obs. + + Args: + reward (np.ndarray): The reward. + next_safety_obs (np.ndarray): The next safety obs. + + Returns: + np.ndarray: The updated reward. + """ + reward = reward * (next_safety_obs > 0) + self.unsafe_reward * (next_safety_obs <= 0) + return reward + + def reset(self, seed=None): + r"""reset environment + + Args: + seed (int): The seed. + + Returns: + np.array: The augmented observation. + dict: The info. + """ + self.curr_o, info = self.env.reset(seed=seed) + self.rel_safety_budget = self.safety_budget / self.upper_budget + self.safety_obs = self.rel_safety_budget + self.curr_o = self.augment_obs(self.curr_o, self.safety_obs) + return self.curr_o, info + + def step(self, action): + r"""step environment + + Args: + action (np.array): The action. + + Returns: + np.array: The augmented observation. + np.array: The reward. + np.array: The cost. + bool: The terminated flag. + bool: The truncated flag. + dict: The info. + """ + next_obs, reward, cost, terminated, truncated, info = self.env.step(action) + next_safety_obs = self.safety_step(cost) + info['true_reward'] = reward + info['safety_obs'] = next_safety_obs + reward = self.safety_reward(reward, next_safety_obs) + augmented_obs = self.augment_obs(next_obs, next_safety_obs) + + return augmented_obs, reward, cost, terminated, truncated, info + + def set_budget(self, Jc): + r"""Set the safety budget. + + Args: + Jc (np.array): The safety budget. + + Returns: + np.array: The safety budget. + """ + self.safety_budget = self.controller.act(Jc) + + # pylint: disable-next=too-many-locals + def roll_out(self, agent, buf, logger): + r"""collect data and store to experience buffer. + + Args: + agent (Agent): The agent. + buf (Buffer): The buffer. + logger (Logger): The logger. + + Returns: + float: The episode return. + float: The episode cost. + int: The episode length. + float: The episode budget. 
+ """ + obs, _ = self.reset() + ep_ret, ep_costs, ep_len, ep_budget = 0.0, 0.0, 0, 0.0 + for step_i in range(self.local_steps_per_epoch): + action, value, cost_value, logp = agent.step(torch.as_tensor(obs, dtype=torch.float32)) + next_obs, reward, cost, done, truncated, info = self.step(action) + ep_ret += info['true_reward'] + ep_costs += (self.cost_gamma**ep_len) * cost + ep_len += 1 + ep_budget += self.safety_obs + + # Save and log + # Notes: + # - raw observations are stored to buffer (later transformed) + # - reward scaling is performed in buffer + buf.store( + obs=obs, + act=action, + rew=reward, + val=value, + logp=logp, + cost=cost, + cost_val=cost_value, + ) + + # Store values for statistic purpose + if self.use_cost: + logger.store(**{'Values/V': value, 'Values/C': cost_value}) + else: + logger.store(**{'Values/V': value}) + + # Update observation + obs = next_obs + + timeout = ep_len == self.max_ep_len + terminal = done or timeout or truncated + epoch_ended = step_i == self.local_steps_per_epoch - 1 + + if terminal or epoch_ended: + if timeout or epoch_ended: + _, value, cost_value, _ = agent(torch.as_tensor(obs, dtype=torch.float32)) + else: + value, cost_value = 0.0, 0.0 + + # Automatically compute GAE in buffer + buf.finish_path(value, cost_value, penalty_param=float(self.penalty_param)) + + # Only save EpRet / EpLen if trajectory finished + if terminal: + logger.store( + **{ + 'Metrics/EpRet': ep_ret, + 'Metrics/EpLen': ep_len, + 'Metrics/EpCost': ep_costs, + 'Metrics/EpBudget': ep_budget, + 'Metrics/SafetyBudget': self.safety_budget, + } + ) + ep_ret, ep_costs, ep_len, ep_budget = 0.0, 0.0, 0, 0.0 + obs, _ = self.reset() + # Update safety budget after each epoch. + self.set_budget(logger.get_stats('Metrics/EpCost')[0]) diff --git a/tests/test_policy.py b/tests/test_policy.py index 1d27864fb..3a6fbe1db 100644 --- a/tests/test_policy.py +++ b/tests/test_policy.py @@ -19,7 +19,7 @@ @helpers.parametrize( - algo=[ + on_policy_algo=[ 'PolicyGradient', 'PPO', 'PPOLag', @@ -33,11 +33,37 @@ 'FOCOPS', 'CPPOPid', 'CUP', + 'PPOSaute', + 'PPOSimmerPid', + 'PPOSimmerQ', + 'PPOEarlyTerminated', + 'PPOLagSaute', + 'PPOLagSimmerPid', + 'PPOLagSimmerQ', + 'PPOLagEarlyTerminated', ] ) -def test_on_policy(algo): +def test_on_policy(on_policy_algo): """Test algorithms""" env_id = 'SafetyPointGoal1-v0' custom_cfgs = {'epochs': 1, 'steps_per_epoch': 2000, 'pi_iters': 1, 'critic_iters': 1} - agent = omnisafe.Agent(algo, env_id, custom_cfgs=custom_cfgs, parallel=1) + agent = omnisafe.Agent(on_policy_algo, env_id, custom_cfgs=custom_cfgs, parallel=1) + agent.learn() + + +@helpers.parametrize( + off_policy_algo=[ + 'DDPG', + 'TD3', + 'SAC', + 'DDPGLag', + 'TD3Lag', + 'SACLag', + ] +) +def test_off_policy(off_policy_algo): + """Test algorithms""" + env_id = 'SafetyPointGoal1-v0' + custom_cfgs = {'epochs': 1, 'steps_per_epoch': 2000, 'pi_iters': 1, 'critic_iters': 1} + agent = omnisafe.Agent(off_policy_algo, env_id, custom_cfgs=custom_cfgs, parallel=1) agent.learn() From f8d99f00a3e4c91124c2acde1e1d1b65b57306c3 Mon Sep 17 00:00:00 2001 From: Gaiejj <524339208@qq.com> Date: Thu, 22 Dec 2022 16:18:35 +0800 Subject: [PATCH 05/39] add new algorithms --- omnisafe/algorithms/__init__.py | 17 ++- .../early_terminated/ppo_early_terminated.py | 45 +++++++ .../ppo_lag_early_terminated.py | 45 +++++++ .../configs/on-policy/PPOEarlyTerminated.yaml | 109 ++++++++++++++++ .../on-policy/PPOLagEarlyTerminated.yaml | 119 ++++++++++++++++++ omnisafe/wrappers/early_terminated_wrapper.py | 87 +++++++++++++ 
tests/test_policy.py | 1 + 7 files changed, 422 insertions(+), 1 deletion(-) create mode 100644 omnisafe/algorithms/on_policy/early_terminated/ppo_early_terminated.py create mode 100644 omnisafe/algorithms/on_policy/early_terminated/ppo_lag_early_terminated.py create mode 100644 omnisafe/configs/on-policy/PPOEarlyTerminated.yaml create mode 100644 omnisafe/configs/on-policy/PPOLagEarlyTerminated.yaml create mode 100644 omnisafe/wrappers/early_terminated_wrapper.py diff --git a/omnisafe/algorithms/__init__.py b/omnisafe/algorithms/__init__.py index a1a296477..65e3c876e 100644 --- a/omnisafe/algorithms/__init__.py +++ b/omnisafe/algorithms/__init__.py @@ -13,10 +13,21 @@ # limitations under the License. # ============================================================================== """Safe Reinforcement Learning algorithms.""" +from omnisafe.algorithms.off_policy.ddpg import DDPG +from omnisafe.algorithms.off_policy.ddpg_lag import DDPGLag +from omnisafe.algorithms.off_policy.sac import SAC +from omnisafe.algorithms.off_policy.sac_lag import SACLag +from omnisafe.algorithms.off_policy.sddpg import SDDPG +from omnisafe.algorithms.off_policy.td3 import TD3 +from omnisafe.algorithms.off_policy.td3_lag import TD3Lag from omnisafe.algorithms.on_policy.base.natural_pg import NaturalPG from omnisafe.algorithms.on_policy.base.policy_gradient import PolicyGradient from omnisafe.algorithms.on_policy.base.ppo import PPO from omnisafe.algorithms.on_policy.base.trpo import TRPO +from omnisafe.algorithms.on_policy.early_terminated.ppo_early_terminated import PPOEarlyTerminated +from omnisafe.algorithms.on_policy.early_terminated.ppo_lag_early_terminated import ( + PPOLagEarlyTerminated, +) from omnisafe.algorithms.on_policy.first_order.cup import CUP from omnisafe.algorithms.on_policy.first_order.focops import FOCOPS from omnisafe.algorithms.on_policy.naive_lagrange.npg_lag import NPGLag @@ -24,10 +35,15 @@ from omnisafe.algorithms.on_policy.naive_lagrange.ppo_lag import PPOLag from omnisafe.algorithms.on_policy.naive_lagrange.trpo_lag import TRPOLag from omnisafe.algorithms.on_policy.pid_lagrange.cppo_pid import CPPOPid +from omnisafe.algorithms.on_policy.pid_lagrange.trpo_pid import TRPOPid from omnisafe.algorithms.on_policy.saute.ppo_lag_saute import PPOLagSaute from omnisafe.algorithms.on_policy.saute.ppo_saute import PPOSaute from omnisafe.algorithms.on_policy.second_order.cpo import CPO from omnisafe.algorithms.on_policy.second_order.pcpo import PCPO +from omnisafe.algorithms.on_policy.simmer.ppo_lag_simmer_pid import PPOLagSimmerPid +from omnisafe.algorithms.on_policy.simmer.ppo_lag_simmer_q import PPOLagSimmerQ +from omnisafe.algorithms.on_policy.simmer.ppo_simmer_pid import PPOSimmerPid +from omnisafe.algorithms.on_policy.simmer.ppo_simmer_q import PPOSimmerQ algo_type = { @@ -39,7 +55,6 @@ 'SAC', 'SACLag', 'SDDPG', - 'CVPO', ], 'on-policy': [ 'PolicyGradient', diff --git a/omnisafe/algorithms/on_policy/early_terminated/ppo_early_terminated.py b/omnisafe/algorithms/on_policy/early_terminated/ppo_early_terminated.py new file mode 100644 index 000000000..aff4a014d --- /dev/null +++ b/omnisafe/algorithms/on_policy/early_terminated/ppo_early_terminated.py @@ -0,0 +1,45 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementation of the Early terminated algorithm by PPO.""" + +from omnisafe.algorithms import registry +from omnisafe.algorithms.on_policy.base.ppo import PPO + + +@registry.register +class PPOEarlyTerminated(PPO): + """Early terminated algorithm implemented by PPO. + + References: + Paper Name: Safe Exploration by Solving Early Terminated MDP + Paper author: Hao Sun, Ziping Xu, Meng Fang, Zhenghao Peng, Jiadong Guo, Bo Dai, Bolei Zhou + Paper URL: https://arxiv.org/abs/2107.04200 + """ + + # pylint: disable-next=too-many-arguments + def __init__( + self, + env_id, + cfgs, + algo='ppo_early_terminated', + wrapper_type: str = 'EarlyTerminatedEnvWrapper', + ) -> None: + r"""Initialize PPO_Earyly_Terminated.""" + super().__init__( + env_id=env_id, + cfgs=cfgs, + algo=algo, + wrapper_type=wrapper_type, + ) diff --git a/omnisafe/algorithms/on_policy/early_terminated/ppo_lag_early_terminated.py b/omnisafe/algorithms/on_policy/early_terminated/ppo_lag_early_terminated.py new file mode 100644 index 000000000..7b72846d0 --- /dev/null +++ b/omnisafe/algorithms/on_policy/early_terminated/ppo_lag_early_terminated.py @@ -0,0 +1,45 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementation of the Early terminated algorithm by PPOLag.""" + +from omnisafe.algorithms import registry +from omnisafe.algorithms.on_policy.naive_lagrange.ppo_lag import PPOLag + + +@registry.register +class PPOLagEarlyTerminated(PPOLag): + """Early terminated algorithm implemented by PPOLag. + + References: + Paper Name: Safe Exploration by Solving Early Terminated MDP + Paper author: Hao Sun, Ziping Xu, Meng Fang, Zhenghao Peng, Jiadong Guo, Bo Dai, Bolei Zhou + Paper URL: https://arxiv.org/abs/2107.04200 + """ + + # pylint: disable-next=too-many-arguments + def __init__( + self, + env_id, + cfgs, + algo='ppo_lag_early_terminated', + wrapper_type: str = 'EarlyTerminatedEnvWrapper', + ) -> None: + r"""Initialize PPO_Lag_Earyly_Terminated.""" + super().__init__( + env_id=env_id, + cfgs=cfgs, + algo=algo, + wrapper_type=wrapper_type, + ) diff --git a/omnisafe/configs/on-policy/PPOEarlyTerminated.yaml b/omnisafe/configs/on-policy/PPOEarlyTerminated.yaml new file mode 100644 index 000000000..2a80bd88b --- /dev/null +++ b/omnisafe/configs/on-policy/PPOEarlyTerminated.yaml @@ -0,0 +1,109 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+defaults:
+  # --------------------------------------Basic Configurations----------------------------------- #
+  ## -----------------------------Basic configurations for base class PG------------------------ ##
+  # The random seed
+  seed: 0
+  # Number of epochs
+  epochs: 500
+  # Number of steps per epoch
+  steps_per_epoch: 30000
+  # Number of update iterations for Actor network
+  actor_iters: 80
+  # Number of update iterations for Critic network
+  critic_iters: 40
+  # Check if all models own the same parameter values every `check_freq` epochs
+  check_freq: 25
+  # Save model to disk every `save_freq` epochs
+  save_freq: 100
+  # Entropy coefficient for PPO loss
+  entropy_coef: 0.01
+  # The maximum length of an episode
+  max_ep_len: 1000
+  # The number of mini batches
+  num_mini_batches: 16
+  # The learning rate of Actor network
+  actor_lr: 0.0003
+  # The learning rate of Critic network
+  critic_lr: 0.001
+  # The directory for saving training process data
+  data_dir: "./runs"
+  ## ---------------------------Basic configurations for derived class PPO---------------------- ##
+  # The threshold for KL early stopping
+  target_kl: 0.01
+  # The batch size for each policy update
+  batch_size: 2000
+  # The clip range for PPO loss
+  clip: 0.2
+
+  # ---------------------------------------Optional Configuration-------------------------------- #
+  ## -----------------------------------Configuration For Cost Critic--------------------------- ##
+  # Whether to use cost critic
+  use_cost: True
+  # Cost discount factor
+  cost_gamma: 1.0
+  # Whether to use linear decay of learning rate
+  linear_lr_decay: False
+  # Whether to anneal the exploration noise
+  exploration_noise_anneal: True
+  # Whether to use reward penalty
+  reward_penalty: False
+  # Whether to use KL early stopping
+  kl_early_stopping: False
+  # Whether to use max gradient norm
+  use_max_grad_norm: False
+  # The threshold of max gradient norm
+  max_grad_norm: 0.5
+  # Whether to use reward scaling
+  scale_rewards: False
+  # Whether to use standardized observations
+  standardized_obs: False
+  ## ---------------------------------------Configuration For Model----------------------------- ##
+  model_cfgs:
+    # Whether to share the weights of Actor network with Critic network
+    shared_weights: False
+    # The mode to initialize the network weights, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal".
+ weight_initialization_mode: "kaiming_uniform" + # Configuration of Actor and Critic network + ac_kwargs: + # Configuration of Actor network + pi: + # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" + actor_type: gaussian_annealing + # Size of hidden layers + hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" + activation: tanh + val: + # Size of hidden layers + hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" + activation: tanh + ## --------------------------------------Configuration For Buffer----------------------------- ## + buffer_cfgs: + # Reward discounted factor + gamma: 0.99 + # Parameters used to estimate future rewards in GAE + lam: 0.95 + # Parameters used to estimate future costs in GAE + lam_c: 0.95 + # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" + adv_estimation_method: gae + # Whether to use standardized reward + standardized_reward: True + # Whether to use standardized cost + standardized_cost: True diff --git a/omnisafe/configs/on-policy/PPOLagEarlyTerminated.yaml b/omnisafe/configs/on-policy/PPOLagEarlyTerminated.yaml new file mode 100644 index 000000000..853fa1938 --- /dev/null +++ b/omnisafe/configs/on-policy/PPOLagEarlyTerminated.yaml @@ -0,0 +1,119 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ==============================================================================
+
+defaults:
+  # --------------------------------------Basic Configurations----------------------------------- #
+  ## -----------------------------Basic configurations for base class PG------------------------ ##
+  # The random seed
+  seed: 0
+  # Number of epochs
+  epochs: 500
+  # Number of steps per epoch
+  steps_per_epoch: 30000
+  # Number of update iterations for Actor network
+  actor_iters: 80
+  # Number of update iterations for Critic network
+  critic_iters: 40
+  # Check if all models own the same parameter values every `check_freq` epochs
+  check_freq: 25
+  # Save model to disk every `save_freq` epochs
+  save_freq: 100
+  # Entropy coefficient for PPO loss
+  entropy_coef: 0.01
+  # The maximum length of an episode
+  max_ep_len: 1000
+  # The number of mini batches
+  num_mini_batches: 16
+  # The learning rate of Actor network
+  actor_lr: 0.0003
+  # The learning rate of Critic network
+  critic_lr: 0.001
+  # The directory for saving training process data
+  data_dir: "./runs"
+  ## ---------------------------Basic configurations for derived class PPOLag------------------- ##
+  # The threshold for KL early stopping
+  target_kl: 0.01
+  # The batch size for each policy update
+  batch_size: 2000
+  # The clip range for PPO loss
+  clip: 0.2
+
+  # ---------------------------------------Optional Configuration-------------------------------- #
+  ## -----------------------------------Configuration For Cost Critic--------------------------- ##
+  # Whether to use cost critic
+  use_cost: True
+  # Cost discount factor
+  cost_gamma: 1.0
+  # Whether to use linear decay of learning rate
+  linear_lr_decay: False
+  # Whether to anneal the exploration noise
+  exploration_noise_anneal: True
+  # Whether to use reward penalty
+  reward_penalty: False
+  # Whether to use KL early stopping
+  kl_early_stopping: False
+  # Whether to use max gradient norm
+  use_max_grad_norm: False
+  # The threshold of max gradient norm
+  max_grad_norm: 0.5
+  # Whether to use reward scaling
+  scale_rewards: False
+  # Whether to use standardized observations
+  standardized_obs: False
+  ## ---------------------------------------Configuration For Model----------------------------- ##
+  model_cfgs:
+    # Whether to share the weights of Actor network with Critic network
+    shared_weights: False
+    # The mode to initialize the network weights, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal".
+    weight_initialization_mode: "kaiming_uniform"
+    # Configuration of Actor and Critic network
+    ac_kwargs:
+      # Configuration of Actor network
+      pi:
+        # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor"
+        actor_type: gaussian_annealing
+        # Size of hidden layers
+        hidden_sizes: [64, 64]
+        # Type of activation function, choosing from "tanh", "relu", "sigmoid", "identity", "softplus"
+        activation: tanh
+      val:
+        # Size of hidden layers
+        hidden_sizes: [64, 64]
+        # Type of activation function, choosing from "tanh", "relu", "sigmoid", "identity", "softplus"
+        activation: tanh
+  ## --------------------------------------Configuration For Buffer----------------------------- ##
+  buffer_cfgs:
+    # Reward discounted factor
+    gamma: 0.99
+    # Parameters used to estimate future rewards in GAE
+    lam: 0.95
+    # Parameters used to estimate future costs in GAE
+    lam_c: 0.95
+    # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace"
+    adv_estimation_method: gae
+    # Whether to use standardized reward
+    standardized_reward: True
+    # Whether to use standardized cost
+    standardized_cost: True
+  ## ----------------------------------Configuration For Lagrangian multiplier---------------------- ##
+  lagrange_cfgs:
+    # Tolerance of constraint violation
+    cost_limit: 25.0
+    # Initial value of lagrangian multiplier
+    lagrangian_multiplier_init: 0.001
+    # Learning rate of lagrangian multiplier
+    lambda_lr: 0.035
+    # Type of lagrangian optimizer
+    lambda_optimizer: "Adam"
diff --git a/omnisafe/wrappers/early_terminated_wrapper.py b/omnisafe/wrappers/early_terminated_wrapper.py
new file mode 100644
index 000000000..78fb726dc
--- /dev/null
+++ b/omnisafe/wrappers/early_terminated_wrapper.py
@@ -0,0 +1,87 @@
+# Copyright 2022 OmniSafe Team. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Early terminated wrapper."""
+
+import torch
+
+from omnisafe.wrappers.on_policy_wrapper import OnPolicyEnvWrapper
+from omnisafe.wrappers.wrapper_registry import WRAPPER_REGISTRY
+
+
+@WRAPPER_REGISTRY.register
+class EarlyTerminatedEnvWrapper(OnPolicyEnvWrapper):  # pylint: disable=too-many-instance-attributes
+    r"""EarlyTerminatedEnvWrapper."""
+
+    # pylint: disable-next=too-many-locals
+    def roll_out(self, agent, buf, logger):
+        r"""Collect data and store to experience buffer.
+ Terminated when the episode is done or the episode length is larger than max_ep_len + or cost is unequal to 0.""" + obs, _ = self.env.reset() + ep_ret, ep_costs, ep_len = 0.0, 0.0, 0 + for step_i in range(self.local_steps_per_epoch): + action, value, cost_value, logp = agent.step(torch.as_tensor(obs, dtype=torch.float32)) + next_obs, reward, cost, done, truncated, _ = self.step(action) + ep_ret += reward + ep_costs += (self.cost_gamma**ep_len) * cost + ep_len += 1 + + # Save and log + # Notes: + # - raw observations are stored to buffer (later transformed) + # - reward scaling is performed in buffer + buf.store( + obs=obs, + act=action, + rew=reward, + val=value, + logp=logp, + cost=cost, + cost_val=cost_value, + ) + + # Store values for statistic purpose + if self.use_cost: + logger.store(**{'Values/V': value, 'Values/C': cost_value}) + else: + logger.store(**{'Values/V': value}) + + # Update observation + obs = next_obs + + timeout = ep_len == self.max_ep_len + terminal = done or timeout or truncated or cost + epoch_ended = step_i == self.local_steps_per_epoch - 1 + + if terminal or epoch_ended: + if timeout or epoch_ended: + _, value, cost_value, _ = agent(torch.as_tensor(obs, dtype=torch.float32)) + else: + value, cost_value = 0.0, 0.0 + + # Automatically compute GAE in buffer + buf.finish_path(value, cost_value, penalty_param=float(self.penalty_param)) + + # Only save EpRet / EpLen if trajectory finished + if terminal: + logger.store( + **{ + 'Metrics/EpRet': ep_ret, + 'Metrics/EpLen': ep_len, + 'Metrics/EpCost': ep_costs, + } + ) + ep_ret, ep_costs, ep_len = 0.0, 0.0, 0 + obs, _ = self.env.reset() diff --git a/tests/test_policy.py b/tests/test_policy.py index 3a6fbe1db..bede24b9b 100644 --- a/tests/test_policy.py +++ b/tests/test_policy.py @@ -59,6 +59,7 @@ def test_on_policy(on_policy_algo): 'DDPGLag', 'TD3Lag', 'SACLag', + 'SDDPG', ] ) def test_off_policy(off_policy_algo): From 63a6275138823bb0d7d7da885207995314b6a462 Mon Sep 17 00:00:00 2001 From: Gaiejj <524339208@qq.com> Date: Thu, 22 Dec 2022 16:25:29 +0800 Subject: [PATCH 06/39] add new algorithms --- omnisafe/configs/off-policy/CVPO.yaml | 127 ------------------------- omnisafe/models/actor/actor_builder.py | 1 - omnisafe/utils/algo_utils.py | 51 ---------- 3 files changed, 179 deletions(-) delete mode 100644 omnisafe/configs/off-policy/CVPO.yaml diff --git a/omnisafe/configs/off-policy/CVPO.yaml b/omnisafe/configs/off-policy/CVPO.yaml deleted file mode 100644 index 17cd052c0..000000000 --- a/omnisafe/configs/off-policy/CVPO.yaml +++ /dev/null @@ -1,127 +0,0 @@ -# Copyright 2022 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -defaults: - # --------------------------------------Basic Configurations----------------------------------- # - ## -----------------------------Basic configurations for base class DDPG---------------------- ## - # The random seed - seed: 0 - # Number of epochs - epochs: 500 - # Number of steps per epoch - steps_per_epoch: 6000 - # Update after `update_after` steps - update_after: 1000 - # Update every `update_every` steps - update_every: 50 - # Check if all models own the same parameter values every `check_freq` epoch - check_freq: 25 - # Save model to disk every `check_freq` epochs - save_freq: 10 - # The max length of per epoch - max_ep_len: 1000 - # The number of test episodes - num_test_episodes: 10 - # The learning rate of Actor network - actor_lr: 0.0003 - # The learning rate of Critic network - critic_lr: 0.001 - # The soft update coefficient - polyak: 0.999 - # The discount factor of GAE - gamma: 0.99 - # Actor perdorm random action before `start_steps` steps - start_steps: 10000 - # The Address for saving training process data - data_dir: "./runs" - -## ----------------------------------Basic configurations for base class CVPO------------------- ## - kl_mean_constraint: 0.01 - kl_var_constraint: 0.0001 - kl_constraint: 0.01 - alpha_mean_scale: 1.0 - alpha_var_scale: 100.0 - alpha_scale: 10.0 - alpha_mean_max: 0.1 - alpha_var_max: 10.0 - alpha_max: 1.0 - sample_action_num: 64 - mstep_iteration_num: 5 - dual_constraint: 0.1 - cost_limit: 25 - cost_start: 50 - cost_end: 25 - decay_epoch: 100 - use_cost_decay: True - # ---------------------------------------Optional Configuration-------------------------------- # - ## -----------------------------------Configuration For Cost Critic--------------------------- ## - # Whether to use cost critic - use_cost: True - # Cost discounted factor - cost_gamma: 1.0 - # Whther to use linear decay of learning rate - linear_lr_decay: False - # Whether to use exploration noise anneal - exploration_noise_anneal: False - # Whther to use reward penalty - reward_penalty: False - # Whether to use KL early stopping - kl_early_stopping: False - # Whether to use max gradient norm - use_max_grad_norm: False - # The thereshold of max gradient norm - max_grad_norm: 0.5 - # Whether to use reward scaling - scale_rewards: False - # Whether to use standardized observation - standardized_obs: True - ## ---------------------------------------Configuration For Model----------------------------- ## - model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False - # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
- weight_initialization_mode: "kaiming_uniform" - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: "cholesky" - cov_min: 1e-4 - mu_clamp_min: -5 - mu_clamp_max: 5 - cov_clamp_min: -5 - cov_clamp_max: 20 - - # The standard deviation of Gaussian noise - act_noise: 0.1 - # Size of hidden layers - hidden_sizes: [400, 300] - # Activation function - activation: relu - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [400, 300] - # Activation function - activation: relu - ## --------------------------------------Configuration For Buffer----------------------------- ## - replay_buffer_cfgs: - # The size of replay buffer - size: 50000 - # The size of batch - batch_size: 256 diff --git a/omnisafe/models/actor/actor_builder.py b/omnisafe/models/actor/actor_builder.py index f12be7eed..760b50080 100644 --- a/omnisafe/models/actor/actor_builder.py +++ b/omnisafe/models/actor/actor_builder.py @@ -110,7 +110,6 @@ def build_actor(self, actor_type: str, **kwargs): hidden_sizes=self.hidden_sizes, activation=self.activation, weight_initialization_mode=self.weight_initialization_mode, - shared=self.shared, **kwargs, ) diff --git a/omnisafe/utils/algo_utils.py b/omnisafe/utils/algo_utils.py index cb944bb8b..4c10341c5 100644 --- a/omnisafe/utils/algo_utils.py +++ b/omnisafe/utils/algo_utils.py @@ -13,54 +13,3 @@ # limitations under the License. # ============================================================================== """Implementation of the algo utils.""" -import torch - - -def bt(m: torch.tensor): - return m.transpose(dim0=-2, dim1=-1) - - -def btr(m: torch.tensor): - return m.diagonal(dim1=-2, dim2=-1).sum(-1) - - -def safe_inverse(A, det): - indices = torch.where(det <= 1e-6) - # pseudoinverse - if len(indices[0]) > 0: - return torch.linalg.pinv(A) - return A.inverse() - - -def gaussian_kl(μi, μ, Ai, A): - """ - decoupled KL between two multivariate gaussian distribution - C_μ = KL(f(x|μi,Σi)||f(x|μ,Σi)) - C_Σ = KL(f(x|μi,Σi)||f(x|μi,Σ)) - :param μi: (B, n) - :param μ: (B, n) - :param Ai: (B, n, n) - :param A: (B, n, n) - :return: C_μ, C_Σ: scalar - mean and covariance terms of the KL - :return: mean of determinanats of Σi, Σ - ref : https://stanford.edu/~jduchi/projects/general_notes.pdf page.13 - """ - n = A.size(-1) - μi = μi.unsqueeze(-1) # (B, n, 1) - μ = μ.unsqueeze(-1) # (B, n, 1) - Σi = Ai @ bt(Ai) # (B, n, n) - Σ = A @ bt(A) # (B, n, n) - Σi_det = Σi.det() # (B,) - Σ_det = Σ.det() # (B,) - Σi_inv = safe_inverse(Σi, Σi_det) # (B, n, n) - Σ_inv = safe_inverse(Σ, Σ_det) # (B, n, n) - # determinant can be minus due to numerical calculation error - # https://github.com/daisatojp/mpo/issues/11 - Σi_det = torch.clamp_min(Σi_det, 1e-6) - Σ_det = torch.clamp_min(Σ_det, 1e-6) - inner_μ = ((μ - μi).transpose(-2, -1) @ Σi_inv @ (μ - μi)).squeeze() # (B,) - inner_Σ = torch.log(Σ_det / Σi_det) - n + btr(Σ_inv @ Σi) # (B,) - C_μ = 0.5 * torch.mean(inner_μ) - C_Σ = 0.5 * torch.mean(inner_Σ) - return C_μ, C_Σ, torch.mean(Σi_det), torch.mean(Σ_det) From c6b4b321b6eaa8187da17f3fa219c36bef877286 Mon Sep 17 00:00:00 2001 From: Gaiejj <524339208@qq.com> Date: Thu, 22 Dec 2022 16:32:29 +0800 Subject: [PATCH 07/39] docs: update README.md --- README.md | 92 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 47 insertions(+), 45 
deletions(-) diff --git a/README.md b/README.md index ab9d85a05..c295dbdee 100644 --- a/README.md +++ b/README.md @@ -22,24 +22,26 @@ The simulation environment around OmniSafe and a series of reliable algorithm im ### Table of Contents -- [Overview](#overview) -- [Implemented Algorithms](#implemented-algorithms) - - [Published in 2022](#published-in-2022) - - [List of Algorithms](#list-of-algorithms) -- [SafeRL Environments](#saferl-environments) - - [Safety Gymnasium](#safety-gymnasium) - - [Vision-base Safe RL](#vision-base-safe-rl) - - [Environment Usage](#environment-usage) -- [Installation](#installation) - - [Prerequisites](#prerequisites) - - [Install from source](#install-from-source) - - [Examples](#examples) -- [Getting Started](#getting-started) - - [1. Run Agent from preset yaml file](#1-run-agent-from-preset-yaml-file) - - [2. Run Agent from custom config dict](#2-run-agent-from-custom-config-dict) - - [3. Run Agent from custom terminal config](#3-run-agent-from-custom-terminal-config) -- [The OmniSafe Team](#the-omnisafe-team) -- [License](#license) +- [OmniSafe](#omnisafe) + - [Table of Contents ](#table-of-contents---) + - [Overview](#overview) + - [Implemented Algorithms](#implemented-algorithms) + - [Published **in 2022**](#published-in-2022) + - [List of Algorithms](#list-of-algorithms) + - [SafeRL Environments](#saferl-environments) + - [Safety Gymnasium](#safety-gymnasium) + - [Vision-base Safe RL](#vision-base-safe-rl) + - [Environment Usage](#environment-usage) + - [Installation](#installation) + - [Prerequisites](#prerequisites) + - [Install from source](#install-from-source) + - [Examples](#examples) + - [Getting Started](#getting-started) + - [1. Run Agent from preset yaml file](#1-run-agent-from-preset-yaml-file) + - [2. Run Agent from custom config dict](#2-run-agent-from-custom-config-dict) + - [3. Run Agent from custom terminal config](#3-run-agent-from-custom-terminal-config) + - [The OmniSafe Team](#the-omnisafe-team) + - [License](#license) -------------------------------------------------------------------------------- @@ -53,7 +55,7 @@ Here we provide a table for comparison of **OmniSafe's algorithm core** and exis | [safe-control-gym](https://github.com/utiasDSL/safe-control-gym)
![GitHub last commit](https://img.shields.io/github/last-commit/utiasDSL/safe-control-gym?label=last%20update) | PyTorch | PyBullet | 5**(2)** | | :x: | :x: | | Velocity-Constraints**(3)** | N/A | N/A | N/A | N/A | :x: | :x: | | [mujoco-circle](https://github.com/ymzhang01/mujoco-circle)
![GitHub last commit](https://img.shields.io/github/last-commit/ymzhang01/mujoco-circle?label=last%20update) | PyTorch | N/A | 0 | N/A | :x: | :x: | -| OmniSafe
![GitHub last commit](https://img.shields.io/github/last-commit/PKU-MARL/omnisafe?label=last%20update) | PyTorch | **MuJoCo 2.3.0+** | **25+** | `torch.distributed` | :heavy_check_mark: | :heavy_check_mark: | +| OmniSafe
![GitHub last commit](https://img.shields.io/github/last-commit/PKU-MARL/omnisafe?label=last%20update) | PyTorch | **MuJoCo 2.3.0+** | **25+** | `torch.distributed` | ✅ | ✅ | (1): Maintenance (expect bug fixes and minor updates), the last commit is 19 Nov 2021. Safety Gym depends on `mujoco-py` 2.0.2.7, which was updated on Oct 12, 2019.
(2): We only count the safe's algorithm.
@@ -68,14 +70,14 @@ The supported interface algorithms currently include: ### Published **in 2022** -- 😃 **[AAAI 2023]** Augmented Proximal Policy Optimization for Safe Reinforcement Learning (APPO) **The original author of the paper contributed code** -- 😃 **[NeurIPS 2022]** [Constrained Update Projection Approach to Safe Policy Optimization (CUP)](https://arxiv.org/abs/2209.07089) **The original author of the paper contributed code** -- 😞 **Under Test**[NeurIPS 2022] [Effects of Safety State Augmentation on +- ✅ **[AAAI 2023]** Augmented Proximal Policy Optimization for Safe Reinforcement Learning (APPO) **The original author of the paper contributed code** +- ✅ **[NeurIPS 2022]** [Constrained Update Projection Approach to Safe Policy Optimization (CUP)](https://arxiv.org/abs/2209.07089) **The original author of the paper contributed code** +- **Under Test**[NeurIPS 2022] [Effects of Safety State Augmentation on Safe Exploration (Swimmer)](https://arxiv.org/abs/2206.02675) -- 😃 **[NeurIPS 2022]** [Model-based Safe Deep Reinforcement Learning via a Constrained Proximal Policy Optimization Algorithm](https://arxiv.org/abs/2210.07573) -- 😞 **Under Test**[ICML 2022] [Sauté RL: Almost Surely Safe Reinforcement Learning Using State Augmentation (SauteRL)](https://arxiv.org/abs/2202.06558) -- 😞 **Under Test**[ICML 2022] [Constrained Variational Policy Optimization for Safe Reinforcement Learning (CVPO)](https://arxiv.org/abs/2201.11927) -- 😃 **[IJCAI 2022]** [Penalized Proximal Policy Optimization for Safe Reinforcement Learning](https://arxiv.org/abs/2205.11814) **The original author of the paper contributed code** +- ✅ **[NeurIPS 2022]** [Model-based Safe Deep Reinforcement Learning via a Constrained Proximal Policy Optimization Algorithm](https://arxiv.org/abs/2210.07573) +- **Under Test**[ICML 2022] [Sauté RL: Almost Surely Safe Reinforcement Learning Using State Augmentation (SauteRL)](https://arxiv.org/abs/2202.06558) +- **Under Test**[ICML 2022] [Constrained Variational Policy Optimization for Safe Reinforcement Learning (CVPO)](https://arxiv.org/abs/2201.11927) +- ✅ **[IJCAI 2022]** [Penalized Proximal Policy Optimization for Safe Reinforcement Learning](https://arxiv.org/abs/2205.11814) **The original author of the paper contributed code** - **[ICLR 2022]** [Constrained Policy Optimization via Bayesian World Models (LAMBDA)](https://arxiv.org/abs/2201.09802) - **[AAAI 2022]** [Conservative and Adaptive Penalty for Model-Based Safe Reinforcement Learning (CAP)](https://arxiv.org/abs/2112.07701) @@ -83,41 +85,41 @@ Safe Exploration (Swimmer)](https://arxiv.org/abs/2206.02675) ### List of Algorithms > On Policy Safe -- :heavy_check_mark:[The Lagrange version of PPO (PPO-Lag)](https://cdn.openai.com/safexp-short.pdf) -- :heavy_check_mark:[The Lagrange version of TRPO (TRPO-Lag)](https://cdn.openai.com/safexp-short.pdf) -- :heavy_check_mark:[ICML 2017][Constrained Policy Optimization (CPO)](https://proceedings.mlr.press/v70/achiam17a) -- :heavy_check_mark:[ICLR 2019][Reward Constrained Policy Optimization (RCPO)](https://openreview.net/forum?id=SkfrvsA9FX) -- :heavy_check_mark:[ICML 2020][Responsive Safety in Reinforcement Learning by PID Lagrangian Methods (PID-Lag)](https://arxiv.org/abs/2007.03964) -- :heavy_check_mark:[NeurIPS 2020][First Order Constrained Optimization in Policy Space (FOCOPS)](https://arxiv.org/abs/2002.06506) -- :heavy_check_mark:[AAAI 2020][IPO: Interior-point Policy Optimization under Constraints (IPO)](https://arxiv.org/abs/1910.09615) -- :heavy_check_mark:[ICLR 
2020][Projection-Based Constrained Policy Optimization (PCPO)](https://openreview.net/forum?id=rke3TJrtPS) -- :heavy_check_mark:[ICML 2021][CRPO: A New Approach for Safe Reinforcement Learning with Convergence Guarantee](https://arxiv.org/abs/2011.05869) +- ✅[The Lagrange version of PPO (PPO-Lag)](https://cdn.openai.com/safexp-short.pdf) +- ✅[The Lagrange version of TRPO (TRPO-Lag)](https://cdn.openai.com/safexp-short.pdf) +- ✅[ICML 2017][Constrained Policy Optimization (CPO)](https://proceedings.mlr.press/v70/achiam17a) +- ✅[ICLR 2019][Reward Constrained Policy Optimization (RCPO)](https://openreview.net/forum?id=SkfrvsA9FX) +- ✅[ICML 2020][Responsive Safety in Reinforcement Learning by PID Lagrangian Methods (PID-Lag)](https://arxiv.org/abs/2007.03964) +- ✅[NeurIPS 2020][First Order Constrained Optimization in Policy Space (FOCOPS)](https://arxiv.org/abs/2002.06506) +- ✅[AAAI 2020][IPO: Interior-point Policy Optimization under Constraints (IPO)](https://arxiv.org/abs/1910.09615) +- ✅[ICLR 2020][Projection-Based Constrained Policy Optimization (PCPO)](https://openreview.net/forum?id=rke3TJrtPS) +- ✅[ICML 2021][CRPO: A New Approach for Safe Reinforcement Learning with Convergence Guarantee](https://arxiv.org/abs/2011.05869) > Off Policy Safe -- :heavy_check_mark:The Lagrange version of TD3 (TD3-Lag) -- :heavy_check_mark:The Lagrange version of DDPG (DDPG-Lag) -- :heavy_check_mark:The Lagrange version of SAC (SAC-Lag) -- :heavy_check_mark:[ICML 2019][Lyapunov-based Safe Policy Optimization for Continuous Control (SDDPG)](https://arxiv.org/abs/1901.10031) -- :heavy_check_mark:[ICML 2019][Lyapunov-based Safe Policy Optimization for Continuous Control (SDDPG-modular)](https://arxiv.org/abs/1901.10031) +- ✅The Lagrange version of TD3 (TD3-Lag) +- ✅The Lagrange version of DDPG (DDPG-Lag) +- ✅The Lagrange version of SAC (SAC-Lag) +- ✅[ICML 2019][Lyapunov-based Safe Policy Optimization for Continuous Control (SDDPG)](https://arxiv.org/abs/1901.10031) +- ✅[ICML 2019][Lyapunov-based Safe Policy Optimization for Continuous Control (SDDPG-modular)](https://arxiv.org/abs/1901.10031) - [ICML 2022] [Constrained Variational Policy Optimization for Safe Reinforcement Learning (CVPO)](https://arxiv.org/abs/2201.11927) > Model Base Safe - [NeurIPS 2021][Safe Reinforcement Learning by Imagining the Near Future (SMBPO)](https://arxiv.org/abs/2202.07789) -- :heavy_check_mark:[CoRL 2021 Oral][Learning Off-Policy with Online Planning (SafeLoop)](https://arxiv.org/abs/2008.10066) -- :heavy_check_mark:[AAAI 2022][Conservative and Adaptive Penalty for Model-Based Safe Reinforcement Learning (CAP)](https://arxiv.org/abs/2112.07701) +- ✅[CoRL 2021 Oral][Learning Off-Policy with Online Planning (SafeLoop)](https://arxiv.org/abs/2008.10066) +- ✅[AAAI 2022][Conservative and Adaptive Penalty for Model-Based Safe Reinforcement Learning (CAP)](https://arxiv.org/abs/2112.07701) - [NeurIPS 2022][Model-based Safe Deep Reinforcement Learning via a Constrained Proximal Policy Optimization Algorithm](https://arxiv.org/abs/2210.07573) - [ICLR 2022] [Constrained Policy Optimization via Bayesian World Models (LAMBDA)](https://arxiv.org/abs/2201.09802) > Offline Safe -- :heavy_check_mark:[The Lagrange version of BCQ (BCQ-Lag)](https://arxiv.org/abs/1812.02900) -- :heavy_check_mark:[The Constrained version of CRR (C-CRR)](https://proceedings.neurips.cc/paper/2020/hash/588cb956d6bbe67078f29f8de420a13d-Abstract.html) +- ✅[The Lagrange version of BCQ (BCQ-Lag)](https://arxiv.org/abs/1812.02900) +- ✅[The Constrained version of CRR 
(C-CRR)](https://proceedings.neurips.cc/paper/2020/hash/588cb956d6bbe67078f29f8de420a13d-Abstract.html) - [AAAI 2022] [Constraints Penalized Q-learning for Safe Offline Reinforcement Learning CPQ](https://arxiv.org/abs/2107.09003) - [ICLR 2022 spotlight] [COptiDICE: Offline Constrained Reinforcement Learning via Stationary Distribution Correction Estimation](https://arxiv.org/abs/2204.08957?context=cs.AI) - [ICML 2022][Constrained Offline Policy Optimization (COPO)](https://proceedings.mlr.press/v162/polosky22a.html) > Other -- :heavy_check_mark:[Safe Exploration in Continuous Action Spaces (Safety Layer)](https://arxiv.org/abs/1801.08757) +- ✅[Safe Exploration in Continuous Action Spaces (Safety Layer)](https://arxiv.org/abs/1801.08757) - [RA-L 2021] [Recovery RL: Safe Reinforcement Learning with Learned Recovery Zones](https://arxiv.org/abs/2010.15920) - [ICML 2022] [Sauté RL: Almost Surely Safe Reinforcement Learning Using State Augmentation (SauteRL)](https://arxiv.org/abs/2202.06558) - [NeurIPS 2022] [Effects of Safety State Augmentation on From 24d656a100d56adcad357602ab790e8537ad744f Mon Sep 17 00:00:00 2001 From: Gaiejj <524339208@qq.com> Date: Thu, 22 Dec 2022 16:50:43 +0800 Subject: [PATCH 08/39] refactor: correct comments --- omnisafe/algorithms/off_policy/ddpg.py | 31 +++++++------------ omnisafe/algorithms/off_policy/ddpg_lag.py | 5 ++- omnisafe/algorithms/off_policy/sac.py | 6 ++-- omnisafe/algorithms/off_policy/sac_lag.py | 12 +++---- omnisafe/algorithms/off_policy/sddpg.py | 4 +-- omnisafe/algorithms/off_policy/td3_lag.py | 5 ++- .../early_terminated/ppo_early_terminated.py | 2 +- .../ppo_lag_early_terminated.py | 2 +- .../on_policy/saute/ppo_lag_saute.py | 2 +- .../algorithms/on_policy/saute/ppo_saute.py | 2 +- .../on_policy/simmer/ppo_lag_simmer_pid.py | 2 +- .../on_policy/simmer/ppo_lag_simmer_q.py | 2 +- .../on_policy/simmer/ppo_simmer_pid.py | 2 +- 13 files changed, 31 insertions(+), 46 deletions(-) diff --git a/omnisafe/algorithms/off_policy/ddpg.py b/omnisafe/algorithms/off_policy/ddpg.py index 6d066e0f4..a8af6c20b 100644 --- a/omnisafe/algorithms/off_policy/ddpg.py +++ b/omnisafe/algorithms/off_policy/ddpg.py @@ -172,12 +172,10 @@ def _init_mpi(self): self.logger.log(f'Done! (took {time.time()-start:0.3f} sec.)') def algorithm_specific_logs(self): - """ - Use this method to collect log information. - """ + r"""Use this method to collect log information.""" def _ac_training_setup(self): - """Set up target network for off_policy training.""" + r"""Set up target network for off_policy training.""" self.ac_targ = deepcopy(self.actor_critic) # Freeze target networks with respect to optimizer (only update via polyak averaging) for param in self.ac_targ.actor.parameters(): @@ -188,9 +186,7 @@ def _ac_training_setup(self): param.requires_grad = False def check_distributed_parameters(self): - """ - Check if parameters are synchronized across all processes. - """ + r"""Check if parameters are synchronized across all processes.""" if distributed_utils.num_procs() > 1: self.logger.log('Check if distributed parameters are synchronous..') modules = {'Policy': self.actor_critic.actor.net, 'Value': self.actor_critic.critic.net} @@ -201,8 +197,7 @@ def check_distributed_parameters(self): assert np.allclose(global_min, global_max), f'{key} not synced.' 
def compute_loss_pi(self, data: dict): - r""" - computing pi/actor loss + r"""Computing pi/actor loss Args: data (dict): data dictionary @@ -216,8 +211,7 @@ def compute_loss_pi(self, data: dict): return -loss_pi.mean(), pi_info def compute_loss_v(self, data): - r""" - computing value loss + r"""Computing value loss Args: data (dict): data dictionary @@ -245,8 +239,7 @@ def compute_loss_v(self, data): return loss_q, q_info def compute_loss_c(self, data): - r""" - computing cost loss + r"""Computing cost loss Args: data (dict): data dictionary @@ -358,7 +351,7 @@ def update(self, data): self.polyak_update_target() def polyak_update_target(self): - r"""polyak update target network""" + r"""Polyak update target network.""" with torch.no_grad(): for param, param_targ in zip(self.actor_critic.parameters(), self.ac_targ.parameters()): # Notes: We use an in-place operations "mul_", "add_" to update target @@ -367,7 +360,7 @@ def polyak_update_target(self): param_targ.data.add_((1 - self.cfgs.polyak) * param.data) def update_policy_net(self, data) -> None: - r"""update policy network + r"""Update policy network. Args: data (dict): data dictionary @@ -380,7 +373,7 @@ def update_policy_net(self, data) -> None: self.logger.store(**{'Loss/Pi': loss_pi.item()}) def update_value_net(self, data: dict) -> None: - r"""update value network + r"""Update value network. Args: data (dict): data dictionary @@ -393,7 +386,7 @@ def update_value_net(self, data: dict) -> None: self.logger.store(**{'Loss/Value': loss_q.item(), 'QVals': q_info['QVals']}) def update_cost_net(self, data): - r"""update cost network + r"""Update cost network. Args: data (dict): data dictionary @@ -406,7 +399,7 @@ def update_cost_net(self, data): self.logger.store(**{'Loss/Cost': loss_qc.item(), 'QCosts': qc_info['QCosts']}) def test_agent(self): - r"""Test agent""" + r"""Test agent.""" for _ in range(self.num_test_episodes): # self.env.set_rollout_cfgs(deterministic=True, rand_a=False) self.env.roll_out( @@ -419,7 +412,7 @@ def test_agent(self): ) def log(self, epoch, total_steps): - r"""Log info about epoch""" + r"""Log info about epoch.""" fps = self.cfgs.steps_per_epoch / (time.time() - self.epoch_time) # Step the actor learning rate scheduler if provided if self.scheduler and self.cfgs.linear_lr_decay: diff --git a/omnisafe/algorithms/off_policy/ddpg_lag.py b/omnisafe/algorithms/off_policy/ddpg_lag.py index 32bc1842b..03329ce85 100644 --- a/omnisafe/algorithms/off_policy/ddpg_lag.py +++ b/omnisafe/algorithms/off_policy/ddpg_lag.py @@ -62,8 +62,7 @@ def algorithm_specific_logs(self): self.logger.log_tabular('Metrics/LagrangeMultiplier', self.lagrangian_multiplier.item()) def compute_loss_pi(self, data: dict): - r""" - computing pi/actor loss + r"""Computing pi/actor loss Args: data (dict): data from replay buffer @@ -84,7 +83,7 @@ def compute_loss_pi(self, data: dict): return -loss_pi.mean(), pi_info def update(self, data): - r"""update""" + r"""Update.""" Jc = data['cost'].sum().item() self.update_lagrange_multiplier(Jc) # First run one gradient descent step for Q. diff --git a/omnisafe/algorithms/off_policy/sac.py b/omnisafe/algorithms/off_policy/sac.py index fae6793e7..365029866 100644 --- a/omnisafe/algorithms/off_policy/sac.py +++ b/omnisafe/algorithms/off_policy/sac.py @@ -51,8 +51,7 @@ def __init__( # pylint: disable=too-many-locals def compute_loss_v(self, data): - r""" - Computing value loss + r"""Computing value loss. 
Args: data (dict): data from replay buffer @@ -87,8 +86,7 @@ def compute_loss_v(self, data): return sum(loss_q), q_info def compute_loss_pi(self, data: dict): - r""" - Computing pi/actor loss + r"""Computing pi/actor loss. Args: data (dict): data from replay buffer diff --git a/omnisafe/algorithms/off_policy/sac_lag.py b/omnisafe/algorithms/off_policy/sac_lag.py index cf783777a..6fda58841 100644 --- a/omnisafe/algorithms/off_policy/sac_lag.py +++ b/omnisafe/algorithms/off_policy/sac_lag.py @@ -64,15 +64,12 @@ def __init__( ) def algorithm_specific_logs(self): - r""" - Use this method to collect log information. - """ + r"""Use this method to collect log information.""" super().algorithm_specific_logs() self.logger.log_tabular('Metrics/LagrangeMultiplier', self.lagrangian_multiplier.item()) def compute_loss_pi(self, data: dict): - r""" - Computing pi/actor loss + r"""Computing pi/actor loss. Returns: torch.Tensor @@ -90,8 +87,7 @@ def compute_loss_pi(self, data: dict): return -loss_pi.mean(), pi_info def compute_loss_c(self, data): - r""" - computing cost loss + r"""Computing cost loss Returns: torch.Tensor @@ -120,7 +116,7 @@ def compute_loss_c(self, data): return loss_qc, qc_info def update(self, data): - r"""update""" + r"""Update.""" Jc = data['cost'].sum().item() self.update_lagrange_multiplier(Jc) # First run one gradient descent step for Q. diff --git a/omnisafe/algorithms/off_policy/sddpg.py b/omnisafe/algorithms/off_policy/sddpg.py index c0af2c89a..2bda7d163 100644 --- a/omnisafe/algorithms/off_policy/sddpg.py +++ b/omnisafe/algorithms/off_policy/sddpg.py @@ -69,7 +69,7 @@ def __init__( self.d_init = cfgs.d_init def update(self, data): - r"""Update + r"""Update. Args: data (dict): data dictionary @@ -147,7 +147,7 @@ def compute_loss_cost_performance(self, data): # pylint: disable=invalid-name,too-many-arguments,too-many-locals def update_policy_net(self, data) -> None: - r"""update policy network + r"""Update policy network. Args: data (dict): data dictionary diff --git a/omnisafe/algorithms/off_policy/td3_lag.py b/omnisafe/algorithms/off_policy/td3_lag.py index b4a4b6289..60a42e201 100644 --- a/omnisafe/algorithms/off_policy/td3_lag.py +++ b/omnisafe/algorithms/off_policy/td3_lag.py @@ -68,8 +68,7 @@ def algorithm_specific_logs(self): self.logger.log_tabular('Metrics/LagrangeMultiplier', self.lagrangian_multiplier.item()) def compute_loss_pi(self, data: dict): - r""" - computing pi/actor loss + r"""Computing pi/actor loss Args: data (dict): data @@ -90,7 +89,7 @@ def compute_loss_pi(self, data: dict): return -loss_pi.mean(), pi_info def update(self, data): - r"""update""" + r"""Update.""" Jc = data['cost'].sum().item() self.update_lagrange_multiplier(Jc) # First run one gradient descent step for Q. diff --git a/omnisafe/algorithms/on_policy/early_terminated/ppo_early_terminated.py b/omnisafe/algorithms/on_policy/early_terminated/ppo_early_terminated.py index aff4a014d..0777ff876 100644 --- a/omnisafe/algorithms/on_policy/early_terminated/ppo_early_terminated.py +++ b/omnisafe/algorithms/on_policy/early_terminated/ppo_early_terminated.py @@ -20,7 +20,7 @@ @registry.register class PPOEarlyTerminated(PPO): - """Early terminated algorithm implemented by PPO. + r"""Early terminated algorithm implemented by PPO. 
References: Paper Name: Safe Exploration by Solving Early Terminated MDP diff --git a/omnisafe/algorithms/on_policy/early_terminated/ppo_lag_early_terminated.py b/omnisafe/algorithms/on_policy/early_terminated/ppo_lag_early_terminated.py index 7b72846d0..2ac6f9d73 100644 --- a/omnisafe/algorithms/on_policy/early_terminated/ppo_lag_early_terminated.py +++ b/omnisafe/algorithms/on_policy/early_terminated/ppo_lag_early_terminated.py @@ -20,7 +20,7 @@ @registry.register class PPOLagEarlyTerminated(PPOLag): - """Early terminated algorithm implemented by PPOLag. + r"""Early terminated algorithm implemented by PPOLag. References: Paper Name: Safe Exploration by Solving Early Terminated MDP diff --git a/omnisafe/algorithms/on_policy/saute/ppo_lag_saute.py b/omnisafe/algorithms/on_policy/saute/ppo_lag_saute.py index 8a12d24c4..559c5017a 100644 --- a/omnisafe/algorithms/on_policy/saute/ppo_lag_saute.py +++ b/omnisafe/algorithms/on_policy/saute/ppo_lag_saute.py @@ -20,7 +20,7 @@ @registry.register class PPOLagSaute(PPOLag): - """Saute algorithm implemented by PPOLag. + r"""Saute algorithm implemented by PPOLag. References: Paper Name: Saute RL: Almost Surely Safe Reinforcement Learning Using State Augmentation. diff --git a/omnisafe/algorithms/on_policy/saute/ppo_saute.py b/omnisafe/algorithms/on_policy/saute/ppo_saute.py index 8a6198c9f..e8c16afab 100644 --- a/omnisafe/algorithms/on_policy/saute/ppo_saute.py +++ b/omnisafe/algorithms/on_policy/saute/ppo_saute.py @@ -20,7 +20,7 @@ @registry.register class PPOSaute(PPO): - """Saute algorithm implemented by PPO. + r"""Saute algorithm implemented by PPO. References: Paper Name: Saute RL: Almost Surely Safe Reinforcement Learning Using State Augmentation. diff --git a/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_pid.py b/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_pid.py index 6abfa72e7..d7a4abeb8 100644 --- a/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_pid.py +++ b/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_pid.py @@ -21,7 +21,7 @@ @registry.register class PPOLagSimmerPid(PPOLag): - """Simmer algorithm (PID version) implemented by PPOLag. + r"""Simmer algorithm (PID version) implemented by PPOLag. References: Paper Name: Effects of Safety State Augmentation on Safe Exploration. diff --git a/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_q.py b/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_q.py index e92ab1bf9..dbbd532be 100644 --- a/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_q.py +++ b/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_q.py @@ -21,7 +21,7 @@ @registry.register class PPOLagSimmerQ(PPOLag): - """Simmer algorithm (Q version) implemented by PPOLag. + r"""Simmer algorithm (Q version) implemented by PPOLag. References: Paper Name: Effects of Safety State Augmentation on Safe Exploration. diff --git a/omnisafe/algorithms/on_policy/simmer/ppo_simmer_pid.py b/omnisafe/algorithms/on_policy/simmer/ppo_simmer_pid.py index 848a21994..40a0a430d 100644 --- a/omnisafe/algorithms/on_policy/simmer/ppo_simmer_pid.py +++ b/omnisafe/algorithms/on_policy/simmer/ppo_simmer_pid.py @@ -20,7 +20,7 @@ @registry.register class PPOSimmerPid(PPO): - """Simmer algorithm (PID version) implemented by PPO. + r"""Simmer algorithm (PID version) implemented by PPO. References: Paper Name: Effects of Safety State Augmentation on Safe Exploration. 
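For context, each of the variants touched by these docstring refactors (the Saute, Simmer, and early-terminated PPO classes, as well as the Lagrangian off-policy algorithms) is registered under its class name and launched through the top-level `omnisafe.Agent` wrapper re-exported in `omnisafe/__init__.py`. Below is a minimal sketch of such a run, assuming the wrapper accepts the algorithm name and environment id positionally; the exact `AlgoWrapper` signature is not part of this patch, and `examples/train_policy.py` remains the authoritative entry point.

```python
# Hedged sketch only: the AlgoWrapper call signature is assumed, not taken from this patch.
import omnisafe

env_id = 'SafetyPointGoal1-v0'  # default test environment in examples/train_policy.py
agent = omnisafe.Agent('PPOLagEarlyTerminated', env_id, parallel=1)
agent.learn()  # trains with the preset yaml configuration for the chosen algorithm
```

Note that `algo_wrapper.py` asserts `parallel == 1` for off-policy algorithms, so only the on-policy variants should be launched with more than one process.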
From d328045d5776fbb6313159c8a32200e67f47180b Mon Sep 17 00:00:00 2001 From: Gaiejj <524339208@qq.com> Date: Thu, 22 Dec 2022 17:03:40 +0800 Subject: [PATCH 09/39] refactor: correct comments --- omnisafe/algorithms/off_policy/ddpg.py | 28 +++++++++++----------- omnisafe/algorithms/off_policy/ddpg_lag.py | 6 ++--- omnisafe/algorithms/off_policy/sac.py | 8 +++---- omnisafe/algorithms/off_policy/sddpg.py | 20 ++++++++-------- omnisafe/algorithms/off_policy/td3.py | 6 ++--- omnisafe/algorithms/off_policy/td3_lag.py | 14 +++++------ 6 files changed, 41 insertions(+), 41 deletions(-) diff --git a/omnisafe/algorithms/off_policy/ddpg.py b/omnisafe/algorithms/off_policy/ddpg.py index a8af6c20b..f17cd7eeb 100644 --- a/omnisafe/algorithms/off_policy/ddpg.py +++ b/omnisafe/algorithms/off_policy/ddpg.py @@ -197,13 +197,13 @@ def check_distributed_parameters(self): assert np.allclose(global_min, global_max), f'{key} not synced.' def compute_loss_pi(self, data: dict): - r"""Computing pi/actor loss + r"""Computing pi/actor loss. Args: - data (dict): data dictionary + data (dict): data dictionary. Returns: - torch.Tensor + torch.Tensor. """ action, _ = self.actor_critic.actor.predict(data['obs'], deterministic=True) loss_pi = self.actor_critic.critic(data['obs'], action)[0] @@ -211,13 +211,13 @@ def compute_loss_pi(self, data: dict): return -loss_pi.mean(), pi_info def compute_loss_v(self, data): - r"""Computing value loss + r"""Computing value loss. Args: - data (dict): data dictionary + data (dict): data dictionary. Returns: - torch.Tensor + torch.Tensor. """ obs, act, rew, obs_next, done = ( data['obs'], @@ -239,13 +239,13 @@ def compute_loss_v(self, data): return loss_q, q_info def compute_loss_c(self, data): - r"""Computing cost loss + r"""Computing cost loss. Args: - data (dict): data dictionary + data (dict): data dictionary. Returns: - torch.Tensor + torch.Tensor. """ obs, act, cost, obs_next, done = ( data['obs'], @@ -276,7 +276,7 @@ def learn(self): (3). log epoch/update information for visualization and terminal log print. Returns: - model and environment + model and environment. """ for steps in range(0, self.local_steps_per_epoch * self.epochs, self.update_every): @@ -319,10 +319,10 @@ def learn(self): return self.actor_critic def update(self, data): - r"""Update + r"""Update. Args: - data (dict): data dictionary + data (dict): data dictionary. """ # First run one gradient descent step for Q. self.update_value_net(data) @@ -363,7 +363,7 @@ def update_policy_net(self, data) -> None: r"""Update policy network. Args: - data (dict): data dictionary + data (dict): data dictionary. """ # Train policy with one steps of gradient descent self.actor_optimizer.zero_grad() @@ -389,7 +389,7 @@ def update_cost_net(self, data): r"""Update cost network. Args: - data (dict): data dictionary + data (dict): data dictionary. """ # Train cost critic with one steps of gradient descent self.cost_critic_optimizer.zero_grad() diff --git a/omnisafe/algorithms/off_policy/ddpg_lag.py b/omnisafe/algorithms/off_policy/ddpg_lag.py index 03329ce85..773d44354 100644 --- a/omnisafe/algorithms/off_policy/ddpg_lag.py +++ b/omnisafe/algorithms/off_policy/ddpg_lag.py @@ -62,13 +62,13 @@ def algorithm_specific_logs(self): self.logger.log_tabular('Metrics/LagrangeMultiplier', self.lagrangian_multiplier.item()) def compute_loss_pi(self, data: dict): - r"""Computing pi/actor loss + r"""Computing pi/actor loss. Args: - data (dict): data from replay buffer + data (dict): data from replay buffer. Returns: - torch.Tensor + torch.Tensor. 
""" action = self.actor_critic.actor.predict( data['obs'], deterministic=True, need_log_prob=False diff --git a/omnisafe/algorithms/off_policy/sac.py b/omnisafe/algorithms/off_policy/sac.py index 365029866..55af5cbf9 100644 --- a/omnisafe/algorithms/off_policy/sac.py +++ b/omnisafe/algorithms/off_policy/sac.py @@ -54,10 +54,10 @@ def compute_loss_v(self, data): r"""Computing value loss. Args: - data (dict): data from replay buffer + data (dict): data from replay buffer. Returns: - torch.Tensor + torch.Tensor. """ obs, act, rew, obs_next, done = ( data['obs'], @@ -89,10 +89,10 @@ def compute_loss_pi(self, data: dict): r"""Computing pi/actor loss. Args: - data (dict): data from replay buffer + data (dict): data from replay buffer. Returns: - torch.Tensor + torch.Tensor. """ action, logp_a = self.actor_critic.actor.predict( data['obs'], deterministic=True, need_log_prob=True diff --git a/omnisafe/algorithms/off_policy/sddpg.py b/omnisafe/algorithms/off_policy/sddpg.py index 2bda7d163..4c8398f04 100644 --- a/omnisafe/algorithms/off_policy/sddpg.py +++ b/omnisafe/algorithms/off_policy/sddpg.py @@ -49,10 +49,10 @@ def __init__( r"""Initialize SDDPG. Args: - env_id (str): environment id - cfgs (dict): configurations - algo (str): algorithm name - wrapper_type (str): environment wrapper type + env_id (str): environment id. + cfgs (dict): configurations. + algo (str): algorithm name. + wrapper_type (str): environment wrapper type. """ super().__init__( env_id=env_id, @@ -107,10 +107,10 @@ def Fvp(self, params): For details see John Schulman's PhD thesis (pp. 40) http://joschu.net/docs/thesis.pdf Args: - params (torch.Tensor): parameters + params (torch.Tensor): parameters. Returns: - flat_grad_grad_kl (torch.Tensor): flat gradient of gradient of KL + flat_grad_grad_kl (torch.Tensor): flat gradient of gradient of KL. """ self.actor_critic.actor.net.zero_grad() q_dist = self.actor_critic.actor.get_distribution(self.fvp_obs) @@ -131,13 +131,13 @@ def Fvp(self, params): return flat_grad_grad_kl + params * self.cg_damping def compute_loss_cost_performance(self, data): - r"""Compute loss of cost performance + r"""Compute loss of cost performance. Args: - data (dict): data dictionary + data (dict): data dictionary. Returns: - loss (torch.Tensor): loss of cost performance + loss (torch.Tensor): loss of cost performance. """ # Compute loss action, _ = self.actor_critic.actor.predict(data['obs'], deterministic=True) @@ -150,7 +150,7 @@ def update_policy_net(self, data) -> None: r"""Update policy network. Args: - data (dict): data dictionary + data (dict): data dictionary. """ # Train policy with one steps of gradient descent theta_old = get_flat_params_from(self.actor_critic.actor.net) diff --git a/omnisafe/algorithms/off_policy/td3.py b/omnisafe/algorithms/off_policy/td3.py index f93e480e6..19fd864db 100644 --- a/omnisafe/algorithms/off_policy/td3.py +++ b/omnisafe/algorithms/off_policy/td3.py @@ -49,13 +49,13 @@ def __init__( def compute_loss_v(self, data): r""" - computing value loss + computing value loss. Args: - data (dict): data from replay buffer + data (dict): data from replay buffer. Returns: - torch.Tensor + torch.Tensor. """ obs, act, rew, obs_next, done = ( data['obs'], diff --git a/omnisafe/algorithms/off_policy/td3_lag.py b/omnisafe/algorithms/off_policy/td3_lag.py index 60a42e201..2207cc0bf 100644 --- a/omnisafe/algorithms/off_policy/td3_lag.py +++ b/omnisafe/algorithms/off_policy/td3_lag.py @@ -41,10 +41,10 @@ def __init__( r"""Initialize TD3. 
Args: - env_id (str): environment id - cfgs (dict): configurations - algo (str): algorithm name - wrapper_type (str): environment wrapper type + env_id (str): environment id. + cfgs (dict): configurations. + algo (str): algorithm name. + wrapper_type (str): environment wrapper type. """ TD3.__init__( self, @@ -68,13 +68,13 @@ def algorithm_specific_logs(self): self.logger.log_tabular('Metrics/LagrangeMultiplier', self.lagrangian_multiplier.item()) def compute_loss_pi(self, data: dict): - r"""Computing pi/actor loss + r"""Computing pi/actor loss. Args: - data (dict): data + data (dict): data. Returns: - torch.Tensor + torch.Tensor. """ action = self.actor_critic.actor.predict( data['obs'], deterministic=True, need_log_prob=False From c2756ce0d3d79356826e2ac2dc1eb35d3c978586 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Thu, 22 Dec 2022 17:07:04 +0800 Subject: [PATCH 10/39] docs: update README.md --- README.md | 130 +++++++++++++++++++++++++++--------------------------- 1 file changed, 65 insertions(+), 65 deletions(-) diff --git a/README.md b/README.md index c295dbdee..0200ec20d 100644 --- a/README.md +++ b/README.md @@ -22,26 +22,24 @@ The simulation environment around OmniSafe and a series of reliable algorithm im ### Table of Contents -- [OmniSafe](#omnisafe) - - [Table of Contents ](#table-of-contents---) - - [Overview](#overview) - - [Implemented Algorithms](#implemented-algorithms) - - [Published **in 2022**](#published-in-2022) - - [List of Algorithms](#list-of-algorithms) - - [SafeRL Environments](#saferl-environments) - - [Safety Gymnasium](#safety-gymnasium) - - [Vision-base Safe RL](#vision-base-safe-rl) - - [Environment Usage](#environment-usage) - - [Installation](#installation) - - [Prerequisites](#prerequisites) - - [Install from source](#install-from-source) - - [Examples](#examples) - - [Getting Started](#getting-started) - - [1. Run Agent from preset yaml file](#1-run-agent-from-preset-yaml-file) - - [2. Run Agent from custom config dict](#2-run-agent-from-custom-config-dict) - - [3. Run Agent from custom terminal config](#3-run-agent-from-custom-terminal-config) - - [The OmniSafe Team](#the-omnisafe-team) - - [License](#license) +- [Overview](#overview) +- [Implemented Algorithms](#implemented-algorithms) + - [Published **in 2022**](#published-in-2022) + - [List of Algorithms](#list-of-algorithms) +- [SafeRL Environments](#saferl-environments) + - [Safety Gymnasium](#safety-gymnasium) + - [Vision-base Safe RL](#vision-base-safe-rl) + - [Environment Usage](#environment-usage) +- [Installation](#installation) + - [Prerequisites](#prerequisites) + - [Install from source](#install-from-source) + - [Examples](#examples) +- [Getting Started](#getting-started) + - [1. Run Agent from preset yaml file](#1-run-agent-from-preset-yaml-file) + - [2. Run Agent from custom config dict](#2-run-agent-from-custom-config-dict) + - [3. Run Agent from custom terminal config](#3-run-agent-from-custom-terminal-config) +- [The OmniSafe Team](#the-omnisafe-team) +- [License](#license) -------------------------------------------------------------------------------- @@ -51,11 +49,11 @@ Here we provide a table for comparison of **OmniSafe's algorithm core** and exis | SafeRL
Platform | Backend | Engine | # Safe Algo. | Parallel
CPU/GPU | New Gym API**(4)** | Vision Input | | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :-----: | :---------------------------: | ------------------- | :-------------------: | :---------------------------: | :-----------------: | -| [Safety-Gym](https://github.com/openai/safety-gym)
![GitHub last commit](https://img.shields.io/github/last-commit/openai/safety-gym?label=last%20update) | TF1 | `mujoco-py`**(1)** | 3 | CPU Only (`mpi4py`) | :x: | minimally supported | -| [safe-control-gym](https://github.com/utiasDSL/safe-control-gym)
![GitHub last commit](https://img.shields.io/github/last-commit/utiasDSL/safe-control-gym?label=last%20update) | PyTorch | PyBullet | 5**(2)** | | :x: | :x: | -| Velocity-Constraints**(3)** | N/A | N/A | N/A | N/A | :x: | :x: | -| [mujoco-circle](https://github.com/ymzhang01/mujoco-circle)
![GitHub last commit](https://img.shields.io/github/last-commit/ymzhang01/mujoco-circle?label=last%20update) | PyTorch | N/A | 0 | N/A | :x: | :x: | -| OmniSafe
![GitHub last commit](https://img.shields.io/github/last-commit/PKU-MARL/omnisafe?label=last%20update) | PyTorch | **MuJoCo 2.3.0+** | **25+** | `torch.distributed` | ✅ | ✅ | +| [Safety-Gym](https://github.com/openai/safety-gym)
![GitHub last commit](https://img.shields.io/github/last-commit/openai/safety-gym?label=last%20update) | TF1 | `mujoco-py`**(1)** | 3 | CPU Only (`mpi4py`) | ❌ | minimally supported | +| [safe-control-gym](https://github.com/utiasDSL/safe-control-gym)
![GitHub last commit](https://img.shields.io/github/last-commit/utiasDSL/safe-control-gym?label=last%20update) | PyTorch | PyBullet | 5**(2)** | | ❌ | ❌ | +| Velocity-Constraints**(3)** | N/A | N/A | N/A | N/A | ❌ | ❌ | +| [mujoco-circle](https://github.com/ymzhang01/mujoco-circle)
![GitHub last commit](https://img.shields.io/github/last-commit/ymzhang01/mujoco-circle?label=last%20update) | PyTorch | N/A | 0 | N/A | ❌ | ❌ | +| OmniSafe
![GitHub last commit](https://img.shields.io/github/last-commit/PKU-MARL/omnisafe?label=last%20update) | PyTorch | **MuJoCo 2.3.0+** | **25+** | `torch.distributed` | ✅ | ✅ | (1): Maintenance (expect bug fixes and minor updates), the last commit is 19 Nov 2021. Safety Gym depends on `mujoco-py` 2.0.2.7, which was updated on Oct 12, 2019.
(2): We only count the safe's algorithm.
@@ -70,61 +68,63 @@ The supported interface algorithms currently include: ### Published **in 2022** -- ✅ **[AAAI 2023]** Augmented Proximal Policy Optimization for Safe Reinforcement Learning (APPO) **The original author of the paper contributed code** -- ✅ **[NeurIPS 2022]** [Constrained Update Projection Approach to Safe Policy Optimization (CUP)](https://arxiv.org/abs/2209.07089) **The original author of the paper contributed code** -- **Under Test**[NeurIPS 2022] [Effects of Safety State Augmentation on +- [X] **[AAAI 2023]** Augmented Proximal Policy Optimization for Safe Reinforcement Learning (APPO) **The original author of the paper contributed code** +- [X] **[NeurIPS 2022]** [Constrained Update Projection Approach to Safe Policy Optimization (CUP)](https://arxiv.org/abs/2209.07089) **The original author of the paper contributed code** +- [ ] **[NeurIPS 2022]** (Under Testing) [Effects of Safety State Augmentation on Safe Exploration (Swimmer)](https://arxiv.org/abs/2206.02675) -- ✅ **[NeurIPS 2022]** [Model-based Safe Deep Reinforcement Learning via a Constrained Proximal Policy Optimization Algorithm](https://arxiv.org/abs/2210.07573) -- **Under Test**[ICML 2022] [Sauté RL: Almost Surely Safe Reinforcement Learning Using State Augmentation (SauteRL)](https://arxiv.org/abs/2202.06558) -- **Under Test**[ICML 2022] [Constrained Variational Policy Optimization for Safe Reinforcement Learning (CVPO)](https://arxiv.org/abs/2201.11927) -- ✅ **[IJCAI 2022]** [Penalized Proximal Policy Optimization for Safe Reinforcement Learning](https://arxiv.org/abs/2205.11814) **The original author of the paper contributed code** -- **[ICLR 2022]** [Constrained Policy Optimization via Bayesian World Models (LAMBDA)](https://arxiv.org/abs/2201.09802) -- **[AAAI 2022]** [Conservative and Adaptive Penalty for Model-Based Safe Reinforcement Learning (CAP)](https://arxiv.org/abs/2112.07701) - +- [X] **[NeurIPS 2022]** [Model-based Safe Deep Reinforcement Learning via a Constrained Proximal Policy Optimization Algorithm](https://arxiv.org/abs/2210.07573) +- [ ] **[ICML 2022]** (Under Testing) [Sauté RL: Almost Surely Safe Reinforcement Learning Using State Augmentation (SauteRL)](https://arxiv.org/abs/2202.06558) +- [ ] **[ICML 2022]** (Under Testing) [Constrained Variational Policy Optimization for Safe Reinforcement Learning (CVPO)](https://arxiv.org/abs/2201.11927) +- [X] **[IJCAI 2022]** [Penalized Proximal Policy Optimization for Safe Reinforcement Learning](https://arxiv.org/abs/2205.11814) **The original author of the paper contributed code** +- [ ] **[ICLR 2022]** [Constrained Policy Optimization via Bayesian World Models (LAMBDA)](https://arxiv.org/abs/2201.09802) +- [ ] **[AAAI 2022]** [Conservative and Adaptive Penalty for Model-Based Safe Reinforcement Learning (CAP)](https://arxiv.org/abs/2112.07701) ### List of Algorithms > On Policy Safe -- ✅[The Lagrange version of PPO (PPO-Lag)](https://cdn.openai.com/safexp-short.pdf) -- ✅[The Lagrange version of TRPO (TRPO-Lag)](https://cdn.openai.com/safexp-short.pdf) -- ✅[ICML 2017][Constrained Policy Optimization (CPO)](https://proceedings.mlr.press/v70/achiam17a) -- ✅[ICLR 2019][Reward Constrained Policy Optimization (RCPO)](https://openreview.net/forum?id=SkfrvsA9FX) -- ✅[ICML 2020][Responsive Safety in Reinforcement Learning by PID Lagrangian Methods (PID-Lag)](https://arxiv.org/abs/2007.03964) -- ✅[NeurIPS 2020][First Order Constrained Optimization in Policy Space (FOCOPS)](https://arxiv.org/abs/2002.06506) -- ✅[AAAI 2020][IPO: Interior-point 
Policy Optimization under Constraints (IPO)](https://arxiv.org/abs/1910.09615) -- ✅[ICLR 2020][Projection-Based Constrained Policy Optimization (PCPO)](https://openreview.net/forum?id=rke3TJrtPS) -- ✅[ICML 2021][CRPO: A New Approach for Safe Reinforcement Learning with Convergence Guarantee](https://arxiv.org/abs/2011.05869) + +- [X] [The Lagrange version of PPO (PPO-Lag)](https://cdn.openai.com/safexp-short.pdf) +- [X] [The Lagrange version of TRPO (TRPO-Lag)](https://cdn.openai.com/safexp-short.pdf) +- [X] **[ICML 2017]** [Constrained Policy Optimization (CPO)](https://proceedings.mlr.press/v70/achiam17a) +- [X] **[ICLR 2019]** [Reward Constrained Policy Optimization (RCPO)](https://openreview.net/forum?id=SkfrvsA9FX) +- [X] **[ICML 2020]** [Responsive Safety in Reinforcement Learning by PID Lagrangian Methods (PID-Lag)](https://arxiv.org/abs/2007.03964) +- [X] **[NeurIPS 2020]** [First Order Constrained Optimization in Policy Space (FOCOPS)](https://arxiv.org/abs/2002.06506) +- [X] **[AAAI 2020]** [IPO: Interior-point Policy Optimization under Constraints (IPO)](https://arxiv.org/abs/1910.09615) +- [X] **[ICLR 2020]** [Projection-Based Constrained Policy Optimization (PCPO)](https://openreview.net/forum?id=rke3TJrtPS) +- [X] **[ICML 2021]** [CRPO: A New Approach for Safe Reinforcement Learning with Convergence Guarantee](https://arxiv.org/abs/2011.05869) > Off Policy Safe -- ✅The Lagrange version of TD3 (TD3-Lag) -- ✅The Lagrange version of DDPG (DDPG-Lag) -- ✅The Lagrange version of SAC (SAC-Lag) -- ✅[ICML 2019][Lyapunov-based Safe Policy Optimization for Continuous Control (SDDPG)](https://arxiv.org/abs/1901.10031) -- ✅[ICML 2019][Lyapunov-based Safe Policy Optimization for Continuous Control (SDDPG-modular)](https://arxiv.org/abs/1901.10031) -- [ICML 2022] [Constrained Variational Policy Optimization for Safe Reinforcement Learning (CVPO)](https://arxiv.org/abs/2201.11927) + +- [X] The Lagrange version of TD3 (TD3-Lag) +- [X] The Lagrange version of DDPG (DDPG-Lag) +- [X] The Lagrange version of SAC (SAC-Lag) +- [X] **[ICML 2019]** [Lyapunov-based Safe Policy Optimization for Continuous Control (SDDPG)](https://arxiv.org/abs/1901.10031) +- [X] **[ICML 2019]** [Lyapunov-based Safe Policy Optimization for Continuous Control (SDDPG-modular)](https://arxiv.org/abs/1901.10031) +- [ ] **[ICML 2022]** [Constrained Variational Policy Optimization for Safe Reinforcement Learning (CVPO)](https://arxiv.org/abs/2201.11927) > Model Base Safe -- [NeurIPS 2021][Safe Reinforcement Learning by Imagining the Near Future (SMBPO)](https://arxiv.org/abs/2202.07789) -- ✅[CoRL 2021 Oral][Learning Off-Policy with Online Planning (SafeLoop)](https://arxiv.org/abs/2008.10066) -- ✅[AAAI 2022][Conservative and Adaptive Penalty for Model-Based Safe Reinforcement Learning (CAP)](https://arxiv.org/abs/2112.07701) -- [NeurIPS 2022][Model-based Safe Deep Reinforcement Learning via a Constrained Proximal Policy Optimization Algorithm](https://arxiv.org/abs/2210.07573) -- [ICLR 2022] [Constrained Policy Optimization via Bayesian World Models (LAMBDA)](https://arxiv.org/abs/2201.09802) +- [ ] **[NeurIPS 2021]** [Safe Reinforcement Learning by Imagining the Near Future (SMBPO)](https://arxiv.org/abs/2202.07789) +- [X] **[CoRL 2021 (Oral)]** [Learning Off-Policy with Online Planning (SafeLoop)](https://arxiv.org/abs/2008.10066) +- [X] **[AAAI 2022]** [Conservative and Adaptive Penalty for Model-Based Safe Reinforcement Learning (CAP)](https://arxiv.org/abs/2112.07701) +- [ ] **[NeurIPS 2022]** [Model-based Safe Deep 
Reinforcement Learning via a Constrained Proximal Policy Optimization Algorithm](https://arxiv.org/abs/2210.07573) +- [ ] **[ICLR 2022]** [Constrained Policy Optimization via Bayesian World Models (LAMBDA)](https://arxiv.org/abs/2201.09802) > Offline Safe -- ✅[The Lagrange version of BCQ (BCQ-Lag)](https://arxiv.org/abs/1812.02900) -- ✅[The Constrained version of CRR (C-CRR)](https://proceedings.neurips.cc/paper/2020/hash/588cb956d6bbe67078f29f8de420a13d-Abstract.html) -- [AAAI 2022] [Constraints Penalized Q-learning for Safe Offline Reinforcement Learning CPQ](https://arxiv.org/abs/2107.09003) -- [ICLR 2022 spotlight] [COptiDICE: Offline Constrained Reinforcement Learning via Stationary Distribution Correction Estimation](https://arxiv.org/abs/2204.08957?context=cs.AI) -- [ICML 2022][Constrained Offline Policy Optimization (COPO)](https://proceedings.mlr.press/v162/polosky22a.html) + +- [X] [The Lagrange version of BCQ (BCQ-Lag)](https://arxiv.org/abs/1812.02900) +- [X] [The Constrained version of CRR (C-CRR)](https://proceedings.neurips.cc/paper/2020/hash/588cb956d6bbe67078f29f8de420a13d-Abstract.html) +- [ ] **[AAAI 2022]** [Constraints Penalized Q-learning for Safe Offline Reinforcement Learning CPQ](https://arxiv.org/abs/2107.09003) +- [ ] **[ICLR 2022 (Spotlight)]** [COptiDICE: Offline Constrained Reinforcement Learning via Stationary Distribution Correction Estimation](https://arxiv.org/abs/2204.08957?context=cs.AI) +- [ ] **[ICML 2022]** [Constrained Offline Policy Optimization (COPO)](https://proceedings.mlr.press/v162/polosky22a.html) > Other -- ✅[Safe Exploration in Continuous Action Spaces (Safety Layer)](https://arxiv.org/abs/1801.08757) -- [RA-L 2021] [Recovery RL: Safe Reinforcement Learning with Learned Recovery Zones](https://arxiv.org/abs/2010.15920) -- [ICML 2022] [Sauté RL: Almost Surely Safe Reinforcement Learning Using State Augmentation (SauteRL)](https://arxiv.org/abs/2202.06558) -- [NeurIPS 2022] [Effects of Safety State Augmentation on -Safe Exploration](https://arxiv.org/abs/2206.02675) +- [X] [Safe Exploration in Continuous Action Spaces (Safety Layer)](https://arxiv.org/abs/1801.08757) +- [ ] **[RA-L 2021]** [Recovery RL: Safe Reinforcement Learning with Learned Recovery Zones](https://arxiv.org/abs/2010.15920) +- [ ] **[ICML 2022]** [Sauté RL: Almost Surely Safe Reinforcement Learning Using State Augmentation (SauteRL)](https://arxiv.org/abs/2202.06558) +- [ ] **[NeurIPS 2022]** [Effects of Safety State Augmentation on +Safe Exploration](https://arxiv.org/abs/2206.02675) -------------------------------------------------------------------------------- From cce970c373ef5150dda19e0d864f4c40642cf6af Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Thu, 22 Dec 2022 17:11:37 +0800 Subject: [PATCH 11/39] chore(algorithms): rerender `__init__.py` --- examples/train_policy.py | 21 ++++++++---------- omnisafe/__init__.py | 1 + omnisafe/algorithms/__init__.py | 33 +++++++++++++++++++++++------ omnisafe/algorithms/algo_wrapper.py | 11 ++++------ 4 files changed, 41 insertions(+), 25 deletions(-) diff --git a/examples/train_policy.py b/examples/train_policy.py index a699ac0fa..cee365da1 100644 --- a/examples/train_policy.py +++ b/examples/train_policy.py @@ -24,27 +24,24 @@ parser.add_argument( '--algo', type=str, + metavar='ALGO', default='PPOLag', - help='Choose from: ' - 'On Policy:' - 'PolicyGradient, NaturalPG, TRPO, PPO,' - 'PDO, NPGLag, TRPOLag, PPOLag, CPO, PCPO, FOCOPS, CUP,' - 'CPPOPid, TRPOPid,' - 'PPOSaute, PPOSimmer, PPOSimmerPid, PPOSimmerQ, 
PPOEarlyTerminated,' - 'PPOLagSaute, PPOLagSimmerPid, PPOLagSimmerQ, PPOLagEarlyTerminated,' - 'Off Policy:' - 'DDPG, TD3, SAC,' - 'DDPGLag, TD3Lag, SACLag,' - 'SDDPG', + help='Algorithm to train', + choices=omnisafe.ALGORITHMS['all'], ) parser.add_argument( '--env-id', type=str, + metavar='ENV', default='SafetyPointGoal1-v0', help='The name of test environment', ) parser.add_argument( - '--parallel', default=1, type=int, help='Number of paralleled progress for calculations.' + '--parallel', + default=1, + type=int, + metavar='N', + help='Number of paralleled progress for calculations.', ) args, unparsed_args = parser.parse_known_args() keys = [k[2:] for k in unparsed_args[0::2]] diff --git a/omnisafe/__init__.py b/omnisafe/__init__.py index 77f8e63e0..35d79b41a 100644 --- a/omnisafe/__init__.py +++ b/omnisafe/__init__.py @@ -14,6 +14,7 @@ # ============================================================================== """OmniSafe: A comprehensive and reliable benchmark for safe reinforcement learning.""" +from omnisafe.algorithms import ALGORITHMS from omnisafe.algorithms.algo_wrapper import AlgoWrapper as Agent # from omnisafe.algorithms.env_wrapper import EnvWrapper as Env diff --git a/omnisafe/algorithms/__init__.py b/omnisafe/algorithms/__init__.py index 65e3c876e..898b6d348 100644 --- a/omnisafe/algorithms/__init__.py +++ b/omnisafe/algorithms/__init__.py @@ -13,6 +13,11 @@ # limitations under the License. # ============================================================================== """Safe Reinforcement Learning algorithms.""" + +import itertools +from types import MappingProxyType + +# Off-Policy Safe from omnisafe.algorithms.off_policy.ddpg import DDPG from omnisafe.algorithms.off_policy.ddpg_lag import DDPGLag from omnisafe.algorithms.off_policy.sac import SAC @@ -20,6 +25,8 @@ from omnisafe.algorithms.off_policy.sddpg import SDDPG from omnisafe.algorithms.off_policy.td3 import TD3 from omnisafe.algorithms.off_policy.td3_lag import TD3Lag + +# On-Policy Safe from omnisafe.algorithms.on_policy.base.natural_pg import NaturalPG from omnisafe.algorithms.on_policy.base.policy_gradient import PolicyGradient from omnisafe.algorithms.on_policy.base.ppo import PPO @@ -46,8 +53,8 @@ from omnisafe.algorithms.on_policy.simmer.ppo_simmer_q import PPOSimmerQ -algo_type = { - 'off-policy': [ +ALGORITHMS = { + 'off-policy': ( 'DDPG', 'DDPGLag', 'TD3', @@ -55,8 +62,8 @@ 'SAC', 'SACLag', 'SDDPG', - ], - 'on-policy': [ + ), + 'on-policy': ( 'PolicyGradient', 'NaturalPG', 'TRPO', @@ -79,6 +86,20 @@ 'PPOLagSaute', 'PPOEarlyTerminated', 'PPOLagEarlyTerminated', - ], - 'model-based': ['MBPPOLag', 'SafeLoop'], + ), + 'model-based': ( + 'MBPPOLag', + 'SafeLoop', + ), +} + +ALGORITHM2TYPE = { + algo: algo_type for algo_type, algorithms in ALGORITHMS.items() for algo in algorithms } + +__all__ = ALGORITHMS['all'] = list(itertools.chain.from_iterable(ALGORITHMS.values())) + +assert len(ALGORITHM2TYPE) == len(__all__), 'Duplicate algorithm names found.' 
+ +ALGORITHMS = MappingProxyType(ALGORITHMS) # make this immutable +ALGORITHM2TYPE = MappingProxyType(ALGORITHM2TYPE) # make this immutable diff --git a/omnisafe/algorithms/algo_wrapper.py b/omnisafe/algorithms/algo_wrapper.py index def42e212..85ee6d9ec 100644 --- a/omnisafe/algorithms/algo_wrapper.py +++ b/omnisafe/algorithms/algo_wrapper.py @@ -19,7 +19,7 @@ import psutil -from omnisafe.algorithms import algo_type, registry +from omnisafe.algorithms import ALGORITHM2TYPE, registry from omnisafe.utils import distributed_utils from omnisafe.utils.config_utils import check_all_configs, recursive_update from omnisafe.utils.tools import get_default_kwargs_yaml @@ -46,13 +46,10 @@ def _init_checks(self): assert ( isinstance(self.custom_cfgs, dict) or self.custom_cfgs is None ), 'custom_cfgs must be a dict!' - for key, value in algo_type.items(): - if self.algo in value: - self.algo_type = key - break - if algo_type is None or algo_type == '': + self.algo_type = ALGORITHM2TYPE.get(self.algo, None) + if self.algo_type is None or self.algo_type == '': raise ValueError(f'{self.algo} is not supported!') - if algo_type == 'off-policy': + if self.algo_type == 'off-policy': assert self.parallel == 1, 'off-policy only support parallel==1!' def learn(self): From 4e12cbd077d800cda763216187821834a5411063 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Thu, 22 Dec 2022 17:13:02 +0800 Subject: [PATCH 12/39] docs: update dictionary --- docs/source/spelling_wordlist.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/spelling_wordlist.txt b/docs/source/spelling_wordlist.txt index a0b1256a7..552064409 100644 --- a/docs/source/spelling_wordlist.txt +++ b/docs/source/spelling_wordlist.txt @@ -10,7 +10,7 @@ pragma fmt func sys -ol +bool len str iter @@ -165,6 +165,7 @@ xmax ymin ymax vel +pos quaternion Quaternions Jacobian @@ -194,7 +195,6 @@ David Mguni Jun Haitham -u Ammar Sun Ziping From a3d1695f857e7c702ef4334fc6f34e3cca744a4d Mon Sep 17 00:00:00 2001 From: Gaiejj <524339208@qq.com> Date: Thu, 22 Dec 2022 17:12:38 +0800 Subject: [PATCH 13/39] refactor: reformat the comments --- omnisafe/algorithms/off_policy/td3.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/omnisafe/algorithms/off_policy/td3.py b/omnisafe/algorithms/off_policy/td3.py index 19fd864db..58d28a25b 100644 --- a/omnisafe/algorithms/off_policy/td3.py +++ b/omnisafe/algorithms/off_policy/td3.py @@ -49,7 +49,7 @@ def __init__( def compute_loss_v(self, data): r""" - computing value loss. + Computing value loss. Args: data (dict): data from replay buffer. From bd9d75ba2bff4ee505f49aa4a8b13d5f9cd353f3 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Thu, 22 Dec 2022 17:21:28 +0800 Subject: [PATCH 14/39] chore(algorithms): make registration immutable --- omnisafe/algorithms/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/omnisafe/algorithms/__init__.py b/omnisafe/algorithms/__init__.py index 898b6d348..c733055ba 100644 --- a/omnisafe/algorithms/__init__.py +++ b/omnisafe/algorithms/__init__.py @@ -97,7 +97,7 @@ algo: algo_type for algo_type, algorithms in ALGORITHMS.items() for algo in algorithms } -__all__ = ALGORITHMS['all'] = list(itertools.chain.from_iterable(ALGORITHMS.values())) +__all__ = ALGORITHMS['all'] = tuple(itertools.chain.from_iterable(ALGORITHMS.values())) assert len(ALGORITHM2TYPE) == len(__all__), 'Duplicate algorithm names found.' 
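PATCH 11 and PATCH 14 together replace the old mutable `algo_type` dict with an immutable registry: `ALGORITHMS` maps each family ('off-policy', 'on-policy', 'model-based', plus the derived 'all' entry) to a tuple of algorithm names, `ALGORITHM2TYPE` inverts that mapping for `AlgoWrapper`, and both are wrapped in `MappingProxyType`. A minimal usage sketch, assuming a checkout with these two patches applied (the 'model-free'/'MyAlgo' key below is hypothetical, used only to show that writes are rejected):

    from omnisafe.algorithms import ALGORITHM2TYPE, ALGORITHMS

    print(ALGORITHMS['on-policy'])        # tuple of on-policy names, e.g. 'PPOLag'
    print(ALGORITHM2TYPE.get('PPOLag'))   # 'on-policy'
    print('PPOLag' in ALGORITHMS['all'])  # True; the same sequence feeds argparse `choices`

    try:
        ALGORITHMS['model-free'] = ('MyAlgo',)   # hypothetical key; the proxy is read-only
    except TypeError as exc:
        print(f'registry is immutable: {exc}')

This lookup is also what `AlgoWrapper._init_checks` now relies on: an unknown name returns `None` from `ALGORITHM2TYPE.get` and raises `ValueError`, whereas the old guard tested the module-level `algo_type` dict rather than the resolved type.
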
From d4b3317e3a9405e688cbbce52d528ecb7c339060 Mon Sep 17 00:00:00 2001 From: Gaiejj <524339208@qq.com> Date: Thu, 22 Dec 2022 17:21:37 +0800 Subject: [PATCH 15/39] feat: add __init__.py --- omnisafe/algorithms/off_policy/__init__.py | 15 +++++++++++++++ omnisafe/algorithms/on_policy/__init__.py | 1 + omnisafe/algorithms/on_policy/base/__init__.py | 15 +++++++++++++++ .../on_policy/early_terminated/__init__.py | 15 +++++++++++++++ .../algorithms/on_policy/first_order/__init__.py | 15 +++++++++++++++ .../on_policy/naive_lagrange/__init__.py | 15 +++++++++++++++ .../algorithms/on_policy/pid_lagrange/__init__.py | 15 +++++++++++++++ omnisafe/algorithms/on_policy/saute/__init__.py | 15 +++++++++++++++ .../algorithms/on_policy/second_order/__init__.py | 15 +++++++++++++++ omnisafe/algorithms/on_policy/simmer/__init__.py | 15 +++++++++++++++ 10 files changed, 136 insertions(+) create mode 100644 omnisafe/algorithms/off_policy/__init__.py create mode 100644 omnisafe/algorithms/on_policy/base/__init__.py create mode 100644 omnisafe/algorithms/on_policy/early_terminated/__init__.py create mode 100644 omnisafe/algorithms/on_policy/first_order/__init__.py create mode 100644 omnisafe/algorithms/on_policy/naive_lagrange/__init__.py create mode 100644 omnisafe/algorithms/on_policy/pid_lagrange/__init__.py create mode 100644 omnisafe/algorithms/on_policy/saute/__init__.py create mode 100644 omnisafe/algorithms/on_policy/second_order/__init__.py create mode 100644 omnisafe/algorithms/on_policy/simmer/__init__.py diff --git a/omnisafe/algorithms/off_policy/__init__.py b/omnisafe/algorithms/off_policy/__init__.py new file mode 100644 index 000000000..f96f861ff --- /dev/null +++ b/omnisafe/algorithms/off_policy/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Off-policy algorithms.""" diff --git a/omnisafe/algorithms/on_policy/__init__.py b/omnisafe/algorithms/on_policy/__init__.py index 756435c9c..b71633f91 100644 --- a/omnisafe/algorithms/on_policy/__init__.py +++ b/omnisafe/algorithms/on_policy/__init__.py @@ -12,3 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== +"""On-policy algorithms.""" diff --git a/omnisafe/algorithms/on_policy/base/__init__.py b/omnisafe/algorithms/on_policy/base/__init__.py new file mode 100644 index 000000000..434672651 --- /dev/null +++ b/omnisafe/algorithms/on_policy/base/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Basic Reinforcement Learning algorithms.""" diff --git a/omnisafe/algorithms/on_policy/early_terminated/__init__.py b/omnisafe/algorithms/on_policy/early_terminated/__init__.py new file mode 100644 index 000000000..aa270fe80 --- /dev/null +++ b/omnisafe/algorithms/on_policy/early_terminated/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Early terminated algorithms.""" diff --git a/omnisafe/algorithms/on_policy/first_order/__init__.py b/omnisafe/algorithms/on_policy/first_order/__init__.py new file mode 100644 index 000000000..7ff8122ea --- /dev/null +++ b/omnisafe/algorithms/on_policy/first_order/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""The first order algorithms.""" diff --git a/omnisafe/algorithms/on_policy/naive_lagrange/__init__.py b/omnisafe/algorithms/on_policy/naive_lagrange/__init__.py new file mode 100644 index 000000000..e575cd4c6 --- /dev/null +++ b/omnisafe/algorithms/on_policy/naive_lagrange/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Naive Lagrange algorithms.""" diff --git a/omnisafe/algorithms/on_policy/pid_lagrange/__init__.py b/omnisafe/algorithms/on_policy/pid_lagrange/__init__.py new file mode 100644 index 000000000..25592db8a --- /dev/null +++ b/omnisafe/algorithms/on_policy/pid_lagrange/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""PID Lagrange algorithms.""" diff --git a/omnisafe/algorithms/on_policy/saute/__init__.py b/omnisafe/algorithms/on_policy/saute/__init__.py new file mode 100644 index 000000000..6dab3e35c --- /dev/null +++ b/omnisafe/algorithms/on_policy/saute/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Saute algorithms.""" diff --git a/omnisafe/algorithms/on_policy/second_order/__init__.py b/omnisafe/algorithms/on_policy/second_order/__init__.py new file mode 100644 index 000000000..db903c4c2 --- /dev/null +++ b/omnisafe/algorithms/on_policy/second_order/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Second order algorithms.""" diff --git a/omnisafe/algorithms/on_policy/simmer/__init__.py b/omnisafe/algorithms/on_policy/simmer/__init__.py new file mode 100644 index 000000000..1a8a5d794 --- /dev/null +++ b/omnisafe/algorithms/on_policy/simmer/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Simmer algorithms.""" From 90a2ba8be7431ec8a02566f92e6bcf2715597ef3 Mon Sep 17 00:00:00 2001 From: Gaiejj <524339208@qq.com> Date: Thu, 22 Dec 2022 17:32:12 +0800 Subject: [PATCH 16/39] refactor: reformat the comments --- omnisafe/algorithms/off_policy/sac.py | 2 +- omnisafe/algorithms/off_policy/sac_lag.py | 4 ++-- omnisafe/algorithms/off_policy/sddpg.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/omnisafe/algorithms/off_policy/sac.py b/omnisafe/algorithms/off_policy/sac.py index 55af5cbf9..072484115 100644 --- a/omnisafe/algorithms/off_policy/sac.py +++ b/omnisafe/algorithms/off_policy/sac.py @@ -102,7 +102,7 @@ def compute_loss_pi(self, data: dict): return -loss_pi.mean(), pi_info def update(self, data): - r"""Update""" + r"""Update.""" # First run one gradient descent step for Q. self.update_value_net(data) if self.cfgs.use_cost: diff --git a/omnisafe/algorithms/off_policy/sac_lag.py b/omnisafe/algorithms/off_policy/sac_lag.py index 6fda58841..12ff4e0f5 100644 --- a/omnisafe/algorithms/off_policy/sac_lag.py +++ b/omnisafe/algorithms/off_policy/sac_lag.py @@ -87,10 +87,10 @@ def compute_loss_pi(self, data: dict): return -loss_pi.mean(), pi_info def compute_loss_c(self, data): - r"""Computing cost loss + r"""Computing cost loss. Returns: - torch.Tensor + torch.Tensor. """ obs, act, cost, obs_next, done = ( data['obs'], diff --git a/omnisafe/algorithms/off_policy/sddpg.py b/omnisafe/algorithms/off_policy/sddpg.py index 4c8398f04..a9013efab 100644 --- a/omnisafe/algorithms/off_policy/sddpg.py +++ b/omnisafe/algorithms/off_policy/sddpg.py @@ -72,7 +72,7 @@ def update(self, data): r"""Update. Args: - data (dict): data dictionary + data (dict): data dictionary. """ # First run one gradient descent step for Q. self.fvp_obs = data['obs'][::4] From 41fad8fd22cdb6d705b9e9e741707953d95c0e39 Mon Sep 17 00:00:00 2001 From: Gaiejj <524339208@qq.com> Date: Thu, 22 Dec 2022 17:41:56 +0800 Subject: [PATCH 17/39] refactor: reformat the comments --- omnisafe/wrappers/saute_wrapper.py | 60 ++++++++++++++--------------- omnisafe/wrappers/simmer_wrapper.py | 16 ++++---- 2 files changed, 39 insertions(+), 37 deletions(-) diff --git a/omnisafe/wrappers/saute_wrapper.py b/omnisafe/wrappers/saute_wrapper.py index 25628ce91..3de560e4d 100644 --- a/omnisafe/wrappers/saute_wrapper.py +++ b/omnisafe/wrappers/saute_wrapper.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""saute env_wrapper""" +"""Environment wrapper for saute algorithms.""" import numpy as np import torch @@ -35,9 +35,9 @@ def __init__( r"""Initialize SauteEnvWrapper. Args: - env_id (str): environment id - cfgs (dict): configuration dictionary - render_mode (str): render mode + env_id (str): environment id. + cfgs (dict): configuration dictionary. + render_mode (str): render mode. 
""" super().__init__(env_id, render_mode) @@ -62,11 +62,11 @@ def augment_obs(self, obs: np.array, safety_obs: np.array): r"""Augmenting the obs with the safety obs. Args: - obs (np.array): observation - safety_obs (np.array): safety observation + obs (np.array): observation. + safety_obs (np.array): safety observation. Returns: - augmented_obs (np.array): augmented observation + augmented_obs (np.array): augmented observation. """ augmented_obs = np.hstack([obs, safety_obs]) return augmented_obs @@ -75,10 +75,10 @@ def safety_step(self, cost: np.ndarray) -> np.ndarray: r"""Update the normalized safety obs. Args: - cost (np.array): cost + cost (np.array): cost. Returns: - safety_obs (np.array): normalized safety observation + safety_obs (np.array): normalized safety observation. """ self.safety_obs -= cost / self.safety_budget self.safety_obs /= self.saute_gamma @@ -88,24 +88,24 @@ def safety_reward(self, reward: np.ndarray, next_safety_obs: np.ndarray) -> np.n r"""Update the reward. Args: - reward (np.array): reward - next_safety_obs (np.array): next safety observation + reward (np.array): reward. + next_safety_obs (np.array): next safety observation. Returns: - reward (np.array): updated reward + reward (np.array): updated reward. """ reward = reward * (next_safety_obs > 0) + self.unsafe_reward * (next_safety_obs <= 0) return reward def reset(self, seed=None): - r"""reset environment + r"""Reset environment. Args: - seed (int): seed for environment reset + seed (int): seed for environment reset. Returns: - self.curr_o (np.array): current observation - info (dict): environment info + self.curr_o (np.array): current observation. + info (dict): environment info. """ self.curr_o, info = self.env.reset(seed=seed) self.safety_obs = 1.0 @@ -116,15 +116,15 @@ def step(self, action): r"""Step environment. Args: - action (np.array): action + action (np.array): action. Returns: - augmented_obs (np.array): augmented observation - reward (np.array): reward - cost (np.array): cost - terminated (bool): whether the episode is terminated - truncated (bool): whether the episode is truncated - info (dict): environment info + augmented_obs (np.array): augmented observation. + reward (np.array): reward. + cost (np.array): cost. + terminated (bool): whether the episode is terminated. + truncated (bool): whether the episode is truncated. + info (dict): environment info. """ next_obs, reward, cost, terminated, truncated, info = self.env.step(action) next_safety_obs = self.safety_step(cost) @@ -140,15 +140,15 @@ def roll_out(self, agent, buf, logger): r"""Collect data and store to experience buffer. Args: - agent (Agent): agent - buf (Buffer): buffer - logger (Logger): logger + agent (Agent): agent. + buf (Buffer): buffer. + logger (Logger): logger. Returns: - ep_ret (float): episode return - ep_costs (float): episode costs - ep_len (int): episode length - ep_budget (float): episode budget + ep_ret (float): episode return. + ep_costs (float): episode costs. + ep_len (int): episode length. + ep_budget (float): episode budget. """ obs, _ = self.reset() ep_ret, ep_costs, ep_len, ep_budget = 0.0, 0.0, 0, 0.0 diff --git a/omnisafe/wrappers/simmer_wrapper.py b/omnisafe/wrappers/simmer_wrapper.py index 8d1a6fb35..308205d6a 100644 --- a/omnisafe/wrappers/simmer_wrapper.py +++ b/omnisafe/wrappers/simmer_wrapper.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -"""env_wrapper""" +"""Environment wrapper for Simmer algorithm.""" import copy @@ -25,7 +25,7 @@ class PidController: # pylint: disable=too-many-instance-attributes - """Using PID controller to control the safety budget in Simmer environment.""" + r"""Using PID controller to control the safety budget in Simmer environment.""" def __init__( self, @@ -159,7 +159,8 @@ def get_state_idx(self, state: float): state (float): The current state. Returns: - int: The state index.""" + int: The state index. + """ state_idx = np.argwhere(self.state_space == state)[0][0] return state_idx @@ -191,7 +192,8 @@ def get_greedy_action(self, state: float): state (float): The current state(``cost_limit``). Returns: - float: The greedy action.""" + float: The greedy action. + """ state_idx = self.get_state_idx(state) action_idx = np.argmax(self.q_function[state_idx, :]) action = self.action_space[action_idx] @@ -381,7 +383,7 @@ def safety_reward(self, reward: np.ndarray, next_safety_obs: np.ndarray) -> np.n return reward def reset(self, seed=None): - r"""reset environment + r"""Reset environment. Args: seed (int): The seed. @@ -397,7 +399,7 @@ def reset(self, seed=None): return self.curr_o, info def step(self, action): - r"""step environment + r"""Step environment. Args: action (np.array): The action. @@ -432,7 +434,7 @@ def set_budget(self, Jc): # pylint: disable-next=too-many-locals def roll_out(self, agent, buf, logger): - r"""collect data and store to experience buffer. + r"""Collect data and store to experience buffer. Args: agent (Agent): The agent. From 6f3a0d9286edd7e82ab24180d85eb68f20da7bf2 Mon Sep 17 00:00:00 2001 From: Gaiejj <524339208@qq.com> Date: Thu, 22 Dec 2022 18:50:28 +0800 Subject: [PATCH 18/39] refactor: reformat the comments --- omnisafe/algorithms/off_policy/sac_lag.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/omnisafe/algorithms/off_policy/sac_lag.py b/omnisafe/algorithms/off_policy/sac_lag.py index 12ff4e0f5..06eda3cae 100644 --- a/omnisafe/algorithms/off_policy/sac_lag.py +++ b/omnisafe/algorithms/off_policy/sac_lag.py @@ -42,10 +42,10 @@ def __init__( r"""Initialize SACLag. Args: - env_id (str): environment id - cfgs (dict): configuration - algo (str): algorithm name - wrapper_type (str): environment wrapper type + env_id (str): environment id. + cfgs (dict): configuration. + algo (str): algorithm name. + wrapper_type (str): environment wrapper type. """ SAC.__init__( self, @@ -72,7 +72,7 @@ def compute_loss_pi(self, data: dict): r"""Computing pi/actor loss. Returns: - torch.Tensor + torch.Tensor. 
""" action, logp_a = self.actor_critic.actor.predict( data['obs'], deterministic=True, need_log_prob=True From 42f856cd28e0295aa8beeb75e07b217472cfadf7 Mon Sep 17 00:00:00 2001 From: Gaiejj <524339208@qq.com> Date: Thu, 22 Dec 2022 19:13:05 +0800 Subject: [PATCH 19/39] refactor: reformat the comments --- omnisafe/algorithms/off_policy/ddpg.py | 34 +++++++-------- omnisafe/algorithms/off_policy/ddpg_lag.py | 8 ++-- omnisafe/algorithms/off_policy/sac.py | 12 +++--- omnisafe/algorithms/off_policy/sac_lag.py | 12 +++--- omnisafe/algorithms/off_policy/td3.py | 4 +- omnisafe/algorithms/off_policy/td3_lag.py | 10 ++--- omnisafe/wrappers/early_terminated_wrapper.py | 6 +-- omnisafe/wrappers/env_wrapper.py | 4 +- omnisafe/wrappers/off_policy_wrapper.py | 4 +- omnisafe/wrappers/on_policy_wrapper.py | 6 +-- omnisafe/wrappers/saute_wrapper.py | 16 +++---- omnisafe/wrappers/simmer_wrapper.py | 42 +++++++++---------- 12 files changed, 79 insertions(+), 79 deletions(-) diff --git a/omnisafe/algorithms/off_policy/ddpg.py b/omnisafe/algorithms/off_policy/ddpg.py index f17cd7eeb..e289d0855 100644 --- a/omnisafe/algorithms/off_policy/ddpg.py +++ b/omnisafe/algorithms/off_policy/ddpg.py @@ -48,7 +48,7 @@ def __init__( algo: str = 'DDPG', wrapper_type: str = 'OffPolicyEnvWrapper', ): - r"""Initialize DDPG. + """Initialize DDPG. Args: env_id (str): Environment ID. @@ -146,7 +146,7 @@ def __init__( self.logger.log('Start with training.') def set_learning_rate_scheduler(self): - r"""Set up learning rate scheduler.""" + """Set up learning rate scheduler.""" scheduler = None if self.cfgs.linear_lr_decay: @@ -160,7 +160,7 @@ def linear_anneal(epoch): return scheduler def _init_mpi(self): - r"""Initialize MPI specifics.""" + """Initialize MPI specifics.""" if distributed_utils.num_procs() > 1: # Avoid slowdowns from PyTorch + MPI combo @@ -172,10 +172,10 @@ def _init_mpi(self): self.logger.log(f'Done! (took {time.time()-start:0.3f} sec.)') def algorithm_specific_logs(self): - r"""Use this method to collect log information.""" + """Use this method to collect log information.""" def _ac_training_setup(self): - r"""Set up target network for off_policy training.""" + """Set up target network for off_policy training.""" self.ac_targ = deepcopy(self.actor_critic) # Freeze target networks with respect to optimizer (only update via polyak averaging) for param in self.ac_targ.actor.parameters(): @@ -186,7 +186,7 @@ def _ac_training_setup(self): param.requires_grad = False def check_distributed_parameters(self): - r"""Check if parameters are synchronized across all processes.""" + """Check if parameters are synchronized across all processes.""" if distributed_utils.num_procs() > 1: self.logger.log('Check if distributed parameters are synchronous..') modules = {'Policy': self.actor_critic.actor.net, 'Value': self.actor_critic.critic.net} @@ -197,7 +197,7 @@ def check_distributed_parameters(self): assert np.allclose(global_min, global_max), f'{key} not synced.' def compute_loss_pi(self, data: dict): - r"""Computing pi/actor loss. + """Computing pi/actor loss. Args: data (dict): data dictionary. @@ -211,7 +211,7 @@ def compute_loss_pi(self, data: dict): return -loss_pi.mean(), pi_info def compute_loss_v(self, data): - r"""Computing value loss. + """Computing value loss. Args: data (dict): data dictionary. @@ -239,7 +239,7 @@ def compute_loss_v(self, data): return loss_q, q_info def compute_loss_c(self, data): - r"""Computing cost loss. + """Computing cost loss. Args: data (dict): data dictionary. 
@@ -269,7 +269,7 @@ def compute_loss_c(self, data): return loss_qc, qc_info def learn(self): - r""" + """ This is main function for algorithm update, divided into the following steps: (1). self.rollout: collect interactive data from environment (2). self.update: perform actor/critic updates @@ -319,7 +319,7 @@ def learn(self): return self.actor_critic def update(self, data): - r"""Update. + """Update. Args: data (dict): data dictionary. @@ -351,7 +351,7 @@ def update(self, data): self.polyak_update_target() def polyak_update_target(self): - r"""Polyak update target network.""" + """Polyak update target network.""" with torch.no_grad(): for param, param_targ in zip(self.actor_critic.parameters(), self.ac_targ.parameters()): # Notes: We use an in-place operations "mul_", "add_" to update target @@ -360,7 +360,7 @@ def polyak_update_target(self): param_targ.data.add_((1 - self.cfgs.polyak) * param.data) def update_policy_net(self, data) -> None: - r"""Update policy network. + """Update policy network. Args: data (dict): data dictionary. @@ -373,7 +373,7 @@ def update_policy_net(self, data) -> None: self.logger.store(**{'Loss/Pi': loss_pi.item()}) def update_value_net(self, data: dict) -> None: - r"""Update value network. + """Update value network. Args: data (dict): data dictionary @@ -386,7 +386,7 @@ def update_value_net(self, data: dict) -> None: self.logger.store(**{'Loss/Value': loss_q.item(), 'QVals': q_info['QVals']}) def update_cost_net(self, data): - r"""Update cost network. + """Update cost network. Args: data (dict): data dictionary. @@ -399,7 +399,7 @@ def update_cost_net(self, data): self.logger.store(**{'Loss/Cost': loss_qc.item(), 'QCosts': qc_info['QCosts']}) def test_agent(self): - r"""Test agent.""" + """Test agent.""" for _ in range(self.num_test_episodes): # self.env.set_rollout_cfgs(deterministic=True, rand_a=False) self.env.roll_out( @@ -412,7 +412,7 @@ def test_agent(self): ) def log(self, epoch, total_steps): - r"""Log info about epoch.""" + """Log info about epoch.""" fps = self.cfgs.steps_per_epoch / (time.time() - self.epoch_time) # Step the actor learning rate scheduler if provided if self.scheduler and self.cfgs.linear_lr_decay: diff --git a/omnisafe/algorithms/off_policy/ddpg_lag.py b/omnisafe/algorithms/off_policy/ddpg_lag.py index 773d44354..db390cc75 100644 --- a/omnisafe/algorithms/off_policy/ddpg_lag.py +++ b/omnisafe/algorithms/off_policy/ddpg_lag.py @@ -22,7 +22,7 @@ @registry.register class DDPGLag(DDPG, Lagrange): # pylint: disable=too-many-instance-attributes - r"""The Lagrange version of DDPG Algorithm. + """The Lagrange version of DDPG Algorithm. References: Paper Name: Continuous control with deep reinforcement learning. @@ -57,12 +57,12 @@ def __init__( ) def algorithm_specific_logs(self): - r"""Use this method to collect log information.""" + """Use this method to collect log information.""" super().algorithm_specific_logs() self.logger.log_tabular('Metrics/LagrangeMultiplier', self.lagrangian_multiplier.item()) def compute_loss_pi(self, data: dict): - r"""Computing pi/actor loss. + """Computing pi/actor loss. Args: data (dict): data from replay buffer. @@ -83,7 +83,7 @@ def compute_loss_pi(self, data: dict): return -loss_pi.mean(), pi_info def update(self, data): - r"""Update.""" + """Update.""" Jc = data['cost'].sum().item() self.update_lagrange_multiplier(Jc) # First run one gradient descent step for Q. 
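The `r"""` prefixes dropped throughout this commit are behavior-preserving here: a raw-string prefix only changes how backslash escapes are read, and the docstring text visible in these hunks contains no backslashes. A quick illustrative check in plain Python (standard language behavior, nothing OmniSafe-specific assumed):

    plain = """Update."""
    raw = r"""Update."""
    assert plain == raw  # no backslashes, so dropping the r prefix is a no-op

    assert r"""\alpha""" != """\alpha"""  # \a is an escape, so here the prefix does matter

Docstrings that embed backslash markup (e.g. Sphinx inline math such as `\alpha`) are the case where the `r` prefix still needs to stay.
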
diff --git a/omnisafe/algorithms/off_policy/sac.py b/omnisafe/algorithms/off_policy/sac.py index 072484115..89f22a597 100644 --- a/omnisafe/algorithms/off_policy/sac.py +++ b/omnisafe/algorithms/off_policy/sac.py @@ -23,7 +23,7 @@ @registry.register class SAC(DDPG): # pylint: disable=too-many-instance-attributes - r"""Implementation of the SAC algorithm. + """Implementation of the SAC algorithm. References: Paper Name: Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor @@ -39,7 +39,7 @@ def __init__( algo: str = 'SAC', wrapper_type: str = 'OffPolicyEnvWrapper', ): - r"""Initialize SAC.""" + """Initialize SAC.""" super().__init__( env_id=env_id, cfgs=cfgs, @@ -51,7 +51,7 @@ def __init__( # pylint: disable=too-many-locals def compute_loss_v(self, data): - r"""Computing value loss. + """Computing value loss. Args: data (dict): data from replay buffer. @@ -86,7 +86,7 @@ def compute_loss_v(self, data): return sum(loss_q), q_info def compute_loss_pi(self, data: dict): - r"""Computing pi/actor loss. + """Computing pi/actor loss. Args: data (dict): data from replay buffer. @@ -102,7 +102,7 @@ def compute_loss_pi(self, data: dict): return -loss_pi.mean(), pi_info def update(self, data): - r"""Update.""" + """Update.""" # First run one gradient descent step for Q. self.update_value_net(data) if self.cfgs.use_cost: @@ -131,5 +131,5 @@ def update(self, data): self.alpha_discount() def alpha_discount(self): - r"""Alpha discount.""" + """Alpha discount.""" self.alpha *= self.alpha_gamma diff --git a/omnisafe/algorithms/off_policy/sac_lag.py b/omnisafe/algorithms/off_policy/sac_lag.py index 06eda3cae..21f835975 100644 --- a/omnisafe/algorithms/off_policy/sac_lag.py +++ b/omnisafe/algorithms/off_policy/sac_lag.py @@ -23,7 +23,7 @@ @registry.register class SACLag(SAC, Lagrange): # pylint: disable=too-many-instance-attributes - r"""The Lagrange version of SAC algorithm. + """The Lagrange version of SAC algorithm. References: Paper Name: Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor @@ -39,7 +39,7 @@ def __init__( algo: str = 'SAC-Lag', wrapper_type: str = 'OffPolicyEnvWrapper', ): - r"""Initialize SACLag. + """Initialize SACLag. Args: env_id (str): environment id. @@ -64,12 +64,12 @@ def __init__( ) def algorithm_specific_logs(self): - r"""Use this method to collect log information.""" + """Use this method to collect log information.""" super().algorithm_specific_logs() self.logger.log_tabular('Metrics/LagrangeMultiplier', self.lagrangian_multiplier.item()) def compute_loss_pi(self, data: dict): - r"""Computing pi/actor loss. + """Computing pi/actor loss. Returns: torch.Tensor. @@ -87,7 +87,7 @@ def compute_loss_pi(self, data: dict): return -loss_pi.mean(), pi_info def compute_loss_c(self, data): - r"""Computing cost loss. + """Computing cost loss. Returns: torch.Tensor. @@ -116,7 +116,7 @@ def compute_loss_c(self, data): return loss_qc, qc_info def update(self, data): - r"""Update.""" + """Update.""" Jc = data['cost'].sum().item() self.update_lagrange_multiplier(Jc) # First run one gradient descent step for Q. diff --git a/omnisafe/algorithms/off_policy/td3.py b/omnisafe/algorithms/off_policy/td3.py index 58d28a25b..ed7f432ce 100644 --- a/omnisafe/algorithms/off_policy/td3.py +++ b/omnisafe/algorithms/off_policy/td3.py @@ -23,7 +23,7 @@ @registry.register class TD3(DDPG): # pylint: disable=too-many-instance-attributes - r"""Implementation of TD3 Algorithm. + """Implementation of TD3 Algorithm. 
References: Paper Name: Addressing Function Approximation Error in Actor-Critic Methods. @@ -48,7 +48,7 @@ def __init__( ) def compute_loss_v(self, data): - r""" + """ Computing value loss. Args: diff --git a/omnisafe/algorithms/off_policy/td3_lag.py b/omnisafe/algorithms/off_policy/td3_lag.py index 2207cc0bf..a4b05a81b 100644 --- a/omnisafe/algorithms/off_policy/td3_lag.py +++ b/omnisafe/algorithms/off_policy/td3_lag.py @@ -22,7 +22,7 @@ @registry.register class TD3Lag(TD3, Lagrange): # pylint: disable=too-many-instance-attributes - r"""The Lagrange version of TD3 Algorithm. + """The Lagrange version of TD3 Algorithm. References: Paper Name: Addressing Function Approximation Error in Actor-Critic Methods. @@ -38,7 +38,7 @@ def __init__( algo: str = 'TD3-Lag', wrapper_type: str = 'OffPolicyEnvWrapper', ): - r"""Initialize TD3. + """Initialize TD3. Args: env_id (str): environment id. @@ -63,12 +63,12 @@ def __init__( ) def algorithm_specific_logs(self): - r"""Use this method to collect log information.""" + """Use this method to collect log information.""" super().algorithm_specific_logs() self.logger.log_tabular('Metrics/LagrangeMultiplier', self.lagrangian_multiplier.item()) def compute_loss_pi(self, data: dict): - r"""Computing pi/actor loss. + """Computing pi/actor loss. Args: data (dict): data. @@ -89,7 +89,7 @@ def compute_loss_pi(self, data: dict): return -loss_pi.mean(), pi_info def update(self, data): - r"""Update.""" + """Update.""" Jc = data['cost'].sum().item() self.update_lagrange_multiplier(Jc) # First run one gradient descent step for Q. diff --git a/omnisafe/wrappers/early_terminated_wrapper.py b/omnisafe/wrappers/early_terminated_wrapper.py index 78fb726dc..78cd443d8 100644 --- a/omnisafe/wrappers/early_terminated_wrapper.py +++ b/omnisafe/wrappers/early_terminated_wrapper.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Early terminated wrapper""" +"""Early terminated wrappe""" import torch @@ -22,11 +22,11 @@ @WRAPPER_REGISTRY.register class EarlyTerminatedEnvWrapper(OnPolicyEnvWrapper): # pylint: disable=too-many-instance-attributes - r"""EarlyTerminatedEnvWrapper.""" + """EarlyTerminatedEnvWrapper.""" # pylint: disable-next=too-many-locals def roll_out(self, agent, buf, logger): - r"""Collect data and store to experience buffer. + """Collect data and store to experience buffer. Terminated when the episode is done or the episode length is larger than max_ep_len or cost is unequal to 0.""" obs, _ = self.env.reset() diff --git a/omnisafe/wrappers/env_wrapper.py b/omnisafe/wrappers/env_wrapper.py index 8282f04e3..957a61e51 100644 --- a/omnisafe/wrappers/env_wrapper.py +++ b/omnisafe/wrappers/env_wrapper.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -"""env_wrapper""" +"""Environment wrapper.""" import safety_gymnasium import torch @@ -20,7 +20,7 @@ # pylint: disable-next=too-many-instance-attributes class EnvWrapper: - """env_wrapper""" + """Environment wrapper.""" def __init__(self, env_id, render_mode=None): # check env_id is str diff --git a/omnisafe/wrappers/off_policy_wrapper.py b/omnisafe/wrappers/off_policy_wrapper.py index 83710981f..d3a18c810 100644 --- a/omnisafe/wrappers/off_policy_wrapper.py +++ b/omnisafe/wrappers/off_policy_wrapper.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""env_wrapper""" +"""Environment wrapper for off-policy algorithms.""" import safety_gymnasium import torch @@ -23,7 +23,7 @@ # pylint: disable=too-many-instance-attributes @WRAPPER_REGISTRY.register class OffPolicyEnvWrapper: - """OffPolicyEnvWrapper""" + """OffPolicyEnvWrapperr""" def __init__( self, diff --git a/omnisafe/wrappers/on_policy_wrapper.py b/omnisafe/wrappers/on_policy_wrapper.py index 93d6ab89f..da52e5728 100644 --- a/omnisafe/wrappers/on_policy_wrapper.py +++ b/omnisafe/wrappers/on_policy_wrapper.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""env_wrapper""" +"""Enviroment wrapper for on-policy algorithms.""" import collections from copy import deepcopy @@ -26,10 +26,10 @@ @WRAPPER_REGISTRY.register class OnPolicyEnvWrapper: # pylint: disable=too-many-instance-attributes - """env_wrapper""" + """env_wrapper.""" def __init__(self, env_id, cfgs: Optional[collections.namedtuple] = None, render_mode=None): - r"""Initialize environment wrapper. + """Initialize environment wrapper. Args: env_id (str): environment id. diff --git a/omnisafe/wrappers/saute_wrapper.py b/omnisafe/wrappers/saute_wrapper.py index 3de560e4d..17bbba46e 100644 --- a/omnisafe/wrappers/saute_wrapper.py +++ b/omnisafe/wrappers/saute_wrapper.py @@ -24,7 +24,7 @@ @WRAPPER_REGISTRY.register class SauteEnvWrapper(OnPolicyEnvWrapper): - r"""SauteEnvWrapper.""" + """SauteEnvWrapper.""" def __init__( self, @@ -32,7 +32,7 @@ def __init__( cfgs, render_mode=None, ) -> None: - r"""Initialize SauteEnvWrapper. + """Initialize SauteEnvWrapper. Args: env_id (str): environment id. @@ -59,7 +59,7 @@ def __init__( self.observation_space = spaces.Box(high=high, low=low) def augment_obs(self, obs: np.array, safety_obs: np.array): - r"""Augmenting the obs with the safety obs. + """Augmenting the obs with the safety obs. Args: obs (np.array): observation. @@ -72,7 +72,7 @@ def augment_obs(self, obs: np.array, safety_obs: np.array): return augmented_obs def safety_step(self, cost: np.ndarray) -> np.ndarray: - r"""Update the normalized safety obs. + """Update the normalized safety obs. Args: cost (np.array): cost. @@ -85,7 +85,7 @@ def safety_step(self, cost: np.ndarray) -> np.ndarray: return self.safety_obs def safety_reward(self, reward: np.ndarray, next_safety_obs: np.ndarray) -> np.ndarray: - r"""Update the reward. + """Update the reward. Args: reward (np.array): reward. @@ -98,7 +98,7 @@ def safety_reward(self, reward: np.ndarray, next_safety_obs: np.ndarray) -> np.n return reward def reset(self, seed=None): - r"""Reset environment. + """Reset environment. 
Args: seed (int): seed for environment reset. @@ -113,7 +113,7 @@ def reset(self, seed=None): return self.curr_o, info def step(self, action): - r"""Step environment. + """Step environment. Args: action (np.array): action. @@ -137,7 +137,7 @@ def step(self, action): # pylint: disable-next=too-many-locals def roll_out(self, agent, buf, logger): - r"""Collect data and store to experience buffer. + """Collect data and store to experience buffer. Args: agent (Agent): agent. diff --git a/omnisafe/wrappers/simmer_wrapper.py b/omnisafe/wrappers/simmer_wrapper.py index 308205d6a..8d1f2ca46 100644 --- a/omnisafe/wrappers/simmer_wrapper.py +++ b/omnisafe/wrappers/simmer_wrapper.py @@ -25,7 +25,7 @@ class PidController: # pylint: disable=too-many-instance-attributes - r"""Using PID controller to control the safety budget in Simmer environment.""" + """Using PID controller to control the safety budget in Simmer environment.""" def __init__( self, @@ -34,7 +34,7 @@ def __init__( lower_budget: float = 1.0, upper_budget: float = 25.0, ) -> None: - r"""Initialize the PID controller. + """Initialize the PID controller. Args: cfgs (CfgNode): Configurations. @@ -63,7 +63,7 @@ def __init__( self.upper_budget = upper_budget def compute_raw_action(self, obs: float): - r"""Compute the raw action based on current obs. + """Compute the raw action based on current obs. Args: obs (float): The current observation. @@ -82,7 +82,7 @@ def compute_raw_action(self, obs: float): return curr_raw_action def act(self, obs: float): - r"""Compute the safety budget based on the observation ``Jc``. + """Compute the safety budget based on the observation ``Jc``. Args: obs (float): The current observation. @@ -114,7 +114,7 @@ def __init__( lower_budget: float = 1.0, upper_budget: float = 25.0, ) -> None: - r""" " + """ " Initialize the Q-learning controller. Args: @@ -153,7 +153,7 @@ def __init__( self.filtered_obs = 0 def get_state_idx(self, state: float): - r"""Get the state index. + """Get the state index. Args: state (float): The current state. @@ -165,7 +165,7 @@ def get_state_idx(self, state: float): return state_idx def get_action_idx(self, action: float): - r"""Get the action index. + """Get the action index. Args: action (float): The current action. @@ -177,7 +177,7 @@ def get_action_idx(self, action: float): return action_idx def get_random_action(self): - r"""Get the random action. + """Get the random action. Returns: float: The random action. @@ -186,7 +186,7 @@ def get_random_action(self): return self.action_space[action_idx] def get_greedy_action(self, state: float): - r"""Get the greedy action. + """Get the greedy action. Args: state (float): The current state(``cost_limit``). @@ -200,7 +200,7 @@ def get_greedy_action(self, state: float): return action def update_q_function(self, state: float, action: float, reward: float, next_state: float): - r"""Update the Q function using the Bellman equation. + """Update the Q function using the Bellman equation. Args: state (float): The current state. @@ -216,7 +216,7 @@ def update_q_function(self, state: float, action: float, reward: float, next_sta ] + self.q_lr * (reward + self.tau * np.max(self.q_function[next_state_idx, :])) def step(self, action: float): - r"""Step the environment. + """Step the environment. Args: action (float): The current action. @@ -227,7 +227,7 @@ def step(self, action: float): return self.state def reward(self, state: float, action: float, obs: float): - r"""Get the reward function based on whether the observation is within the threshold. 
+ """Get the reward function based on whether the observation is within the threshold. Args: state (float): The current state. @@ -247,7 +247,7 @@ def reward(self, state: float, action: float, obs: float): return reward[0] def act(self, obs: float): - r"""Return the safety budget based on the observation. + """Return the safety budget based on the observation. Args: obs (float): The observation. @@ -285,7 +285,7 @@ def __init__( cfgs, render_mode=None, ) -> None: - r"""Initialize the Simmer environment wrapper. + """Initialize the Simmer environment wrapper. Args: env_id (str): The environment id. @@ -344,7 +344,7 @@ def __init__( ) def augment_obs(self, obs: np.array, safety_obs: np.array): - r"""Augmenting the obs with the safety obs, if needed. + """Augmenting the obs with the safety obs, if needed. Args: obs (np.array): The observation. @@ -357,7 +357,7 @@ def augment_obs(self, obs: np.array, safety_obs: np.array): return augmented_obs def safety_step(self, cost: np.ndarray) -> np.ndarray: - r"""Update the normalized safety obs. + """Update the normalized safety obs. Args: cost (np.ndarray): The cost. @@ -370,7 +370,7 @@ def safety_step(self, cost: np.ndarray) -> np.ndarray: return self.safety_obs def safety_reward(self, reward: np.ndarray, next_safety_obs: np.ndarray) -> np.ndarray: - r"""Update the reward based on the safety obs. + """Update the reward based on the safety obs. Args: reward (np.ndarray): The reward. @@ -383,7 +383,7 @@ def safety_reward(self, reward: np.ndarray, next_safety_obs: np.ndarray) -> np.n return reward def reset(self, seed=None): - r"""Reset environment. + """Reset environment. Args: seed (int): The seed. @@ -399,7 +399,7 @@ def reset(self, seed=None): return self.curr_o, info def step(self, action): - r"""Step environment. + """Step environment. Args: action (np.array): The action. @@ -422,7 +422,7 @@ def step(self, action): return augmented_obs, reward, cost, terminated, truncated, info def set_budget(self, Jc): - r"""Set the safety budget. + """Set the safety budget. Args: Jc (np.array): The safety budget. @@ -434,7 +434,7 @@ def set_budget(self, Jc): # pylint: disable-next=too-many-locals def roll_out(self, agent, buf, logger): - r"""Collect data and store to experience buffer. + """Collect data and store to experience buffer. Args: agent (Agent): The agent. From dbec13e7f379ddc5fdc91350980b402d0f619e7f Mon Sep 17 00:00:00 2001 From: Gaiejj <524339208@qq.com> Date: Thu, 22 Dec 2022 19:18:47 +0800 Subject: [PATCH 20/39] refactor: reformat the comments --- omnisafe/wrappers/early_terminated_wrapper.py | 2 +- omnisafe/wrappers/on_policy_wrapper.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/omnisafe/wrappers/early_terminated_wrapper.py b/omnisafe/wrappers/early_terminated_wrapper.py index 78cd443d8..7f9b00d93 100644 --- a/omnisafe/wrappers/early_terminated_wrapper.py +++ b/omnisafe/wrappers/early_terminated_wrapper.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Early terminated wrappe""" +"""Early terminated wrapper.""" import torch diff --git a/omnisafe/wrappers/on_policy_wrapper.py b/omnisafe/wrappers/on_policy_wrapper.py index da52e5728..30df2ab9d 100644 --- a/omnisafe/wrappers/on_policy_wrapper.py +++ b/omnisafe/wrappers/on_policy_wrapper.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -"""Enviroment wrapper for on-policy algorithms.""" +"""Environment wrapper for on-policy algorithms.""" import collections from copy import deepcopy From 034f8c3ce5e9794078d14369b0d1d1ad2d8168cc Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Thu, 22 Dec 2022 19:18:49 +0800 Subject: [PATCH 21/39] docs: update dictionary --- docs/source/spelling_wordlist.txt | 3 --- 1 file changed, 3 deletions(-) diff --git a/docs/source/spelling_wordlist.txt b/docs/source/spelling_wordlist.txt index 552064409..f37c8c43b 100644 --- a/docs/source/spelling_wordlist.txt +++ b/docs/source/spelling_wordlist.txt @@ -191,7 +191,6 @@ Taher Jafferjee Ziyan Wang -David Mguni Jun Haitham @@ -206,9 +205,7 @@ Peng Jiadong Guo Bo -Dai lei -bool MDP Bolei Bou From 35b3f23168b78b3aa205b3c6d05e0652d9a30c04 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Thu, 22 Dec 2022 19:38:36 +0800 Subject: [PATCH 22/39] chore(algorithms): rerender `__init__.py` --- omnisafe/__init__.py | 1 + omnisafe/algorithms/__init__.py | 89 ++++++------------- omnisafe/algorithms/off_policy/__init__.py | 19 ++++ omnisafe/algorithms/on_policy/__init__.py | 36 ++++++++ .../algorithms/on_policy/base/__init__.py | 13 +++ .../on_policy/early_terminated/__init__.py | 11 +++ .../on_policy/first_order/__init__.py | 9 ++ .../on_policy/naive_lagrange/__init__.py | 13 +++ .../on_policy/pid_lagrange/__init__.py | 9 ++ .../algorithms/on_policy/saute/__init__.py | 9 ++ .../on_policy/second_order/__init__.py | 9 ++ .../algorithms/on_policy/simmer/__init__.py | 13 +++ 12 files changed, 169 insertions(+), 62 deletions(-) diff --git a/omnisafe/__init__.py b/omnisafe/__init__.py index 35d79b41a..f252acd93 100644 --- a/omnisafe/__init__.py +++ b/omnisafe/__init__.py @@ -14,6 +14,7 @@ # ============================================================================== """OmniSafe: A comprehensive and reliable benchmark for safe reinforcement learning.""" +from omnisafe import algorithms from omnisafe.algorithms import ALGORITHMS from omnisafe.algorithms.algo_wrapper import AlgoWrapper as Agent diff --git a/omnisafe/algorithms/__init__.py b/omnisafe/algorithms/__init__.py index c733055ba..497b58bbd 100644 --- a/omnisafe/algorithms/__init__.py +++ b/omnisafe/algorithms/__init__.py @@ -17,76 +17,41 @@ import itertools from types import MappingProxyType +from omnisafe.algorithms import off_policy, on_policy + # Off-Policy Safe -from omnisafe.algorithms.off_policy.ddpg import DDPG -from omnisafe.algorithms.off_policy.ddpg_lag import DDPGLag -from omnisafe.algorithms.off_policy.sac import SAC -from omnisafe.algorithms.off_policy.sac_lag import SACLag -from omnisafe.algorithms.off_policy.sddpg import SDDPG -from omnisafe.algorithms.off_policy.td3 import TD3 -from omnisafe.algorithms.off_policy.td3_lag import TD3Lag +from omnisafe.algorithms.off_policy import DDPG, SAC, SDDPG, TD3, DDPGLag, SACLag, TD3Lag # On-Policy Safe -from omnisafe.algorithms.on_policy.base.natural_pg import NaturalPG -from omnisafe.algorithms.on_policy.base.policy_gradient import PolicyGradient -from omnisafe.algorithms.on_policy.base.ppo import PPO -from omnisafe.algorithms.on_policy.base.trpo import TRPO -from omnisafe.algorithms.on_policy.early_terminated.ppo_early_terminated import PPOEarlyTerminated -from omnisafe.algorithms.on_policy.early_terminated.ppo_lag_early_terminated import ( +from omnisafe.algorithms.on_policy import ( + CPO, + CUP, + FOCOPS, + PCPO, + PDO, + PPO, + TRPO, + CPPOPid, + NaturalPG, + NPGLag, + 
PolicyGradient, + PPOEarlyTerminated, + PPOLag, PPOLagEarlyTerminated, + PPOLagSaute, + PPOLagSimmerPid, + PPOLagSimmerQ, + PPOSaute, + PPOSimmerPid, + PPOSimmerQ, + TRPOLag, + TRPOPid, ) -from omnisafe.algorithms.on_policy.first_order.cup import CUP -from omnisafe.algorithms.on_policy.first_order.focops import FOCOPS -from omnisafe.algorithms.on_policy.naive_lagrange.npg_lag import NPGLag -from omnisafe.algorithms.on_policy.naive_lagrange.pdo import PDO -from omnisafe.algorithms.on_policy.naive_lagrange.ppo_lag import PPOLag -from omnisafe.algorithms.on_policy.naive_lagrange.trpo_lag import TRPOLag -from omnisafe.algorithms.on_policy.pid_lagrange.cppo_pid import CPPOPid -from omnisafe.algorithms.on_policy.pid_lagrange.trpo_pid import TRPOPid -from omnisafe.algorithms.on_policy.saute.ppo_lag_saute import PPOLagSaute -from omnisafe.algorithms.on_policy.saute.ppo_saute import PPOSaute -from omnisafe.algorithms.on_policy.second_order.cpo import CPO -from omnisafe.algorithms.on_policy.second_order.pcpo import PCPO -from omnisafe.algorithms.on_policy.simmer.ppo_lag_simmer_pid import PPOLagSimmerPid -from omnisafe.algorithms.on_policy.simmer.ppo_lag_simmer_q import PPOLagSimmerQ -from omnisafe.algorithms.on_policy.simmer.ppo_simmer_pid import PPOSimmerPid -from omnisafe.algorithms.on_policy.simmer.ppo_simmer_q import PPOSimmerQ ALGORITHMS = { - 'off-policy': ( - 'DDPG', - 'DDPGLag', - 'TD3', - 'TD3Lag', - 'SAC', - 'SACLag', - 'SDDPG', - ), - 'on-policy': ( - 'PolicyGradient', - 'NaturalPG', - 'TRPO', - 'PPO', - 'PDO', - 'NPGLag', - 'TRPOLag', - 'PPOLag', - 'CPPOPid', - 'TRPOPid', - 'FOCOPS', - 'CUP', - 'CPO', - 'PCPO', - 'PPOSimmerPid', - 'PPOSimmerQ', - 'PPOLagSimmerQ', - 'PPOLagSimmerPid', - 'PPOSaute', - 'PPOLagSaute', - 'PPOEarlyTerminated', - 'PPOLagEarlyTerminated', - ), + 'off-policy': tuple(off_policy.__all__), + 'on-policy': tuple(on_policy.__all__), 'model-based': ( 'MBPPOLag', 'SafeLoop', diff --git a/omnisafe/algorithms/off_policy/__init__.py b/omnisafe/algorithms/off_policy/__init__.py index f96f861ff..aa4ea363b 100644 --- a/omnisafe/algorithms/off_policy/__init__.py +++ b/omnisafe/algorithms/off_policy/__init__.py @@ -13,3 +13,22 @@ # limitations under the License. # ============================================================================== """Off-policy algorithms.""" + +from omnisafe.algorithms.off_policy.ddpg import DDPG +from omnisafe.algorithms.off_policy.ddpg_lag import DDPGLag +from omnisafe.algorithms.off_policy.sac import SAC +from omnisafe.algorithms.off_policy.sac_lag import SACLag +from omnisafe.algorithms.off_policy.sddpg import SDDPG +from omnisafe.algorithms.off_policy.td3 import TD3 +from omnisafe.algorithms.off_policy.td3_lag import TD3Lag + + +__all__ = [ + 'DDPG', + 'DDPGLag', + 'SAC', + 'SACLag', + 'SDDPG', + 'TD3', + 'TD3Lag', +] diff --git a/omnisafe/algorithms/on_policy/__init__.py b/omnisafe/algorithms/on_policy/__init__.py index b71633f91..c7a781f09 100644 --- a/omnisafe/algorithms/on_policy/__init__.py +++ b/omnisafe/algorithms/on_policy/__init__.py @@ -13,3 +13,39 @@ # limitations under the License. 
# ============================================================================== """On-policy algorithms.""" + +from omnisafe.algorithms.on_policy import ( + base, + early_terminated, + first_order, + naive_lagrange, + pid_lagrange, + saute, + second_order, + simmer, +) +from omnisafe.algorithms.on_policy.base import PPO, TRPO, NaturalPG, PolicyGradient +from omnisafe.algorithms.on_policy.early_terminated import PPOEarlyTerminated, PPOLagEarlyTerminated +from omnisafe.algorithms.on_policy.first_order import CUP, FOCOPS +from omnisafe.algorithms.on_policy.naive_lagrange import PDO, NPGLag, PPOLag, TRPOLag +from omnisafe.algorithms.on_policy.pid_lagrange import CPPOPid, TRPOPid +from omnisafe.algorithms.on_policy.saute import PPOLagSaute, PPOSaute +from omnisafe.algorithms.on_policy.second_order import CPO, PCPO +from omnisafe.algorithms.on_policy.simmer import ( + PPOLagSimmerPid, + PPOLagSimmerQ, + PPOSimmerPid, + PPOSimmerQ, +) + + +__all__ = [ + *base.__all__, + *early_terminated.__all__, + *first_order.__all__, + *naive_lagrange.__all__, + *pid_lagrange.__all__, + *saute.__all__, + *second_order.__all__, + *simmer.__all__, +] diff --git a/omnisafe/algorithms/on_policy/base/__init__.py b/omnisafe/algorithms/on_policy/base/__init__.py index 434672651..0b8e240be 100644 --- a/omnisafe/algorithms/on_policy/base/__init__.py +++ b/omnisafe/algorithms/on_policy/base/__init__.py @@ -13,3 +13,16 @@ # limitations under the License. # ============================================================================== """Basic Reinforcement Learning algorithms.""" + +from omnisafe.algorithms.on_policy.base.natural_pg import NaturalPG +from omnisafe.algorithms.on_policy.base.policy_gradient import PolicyGradient +from omnisafe.algorithms.on_policy.base.ppo import PPO +from omnisafe.algorithms.on_policy.base.trpo import TRPO + + +__all__ = [ + 'NaturalPG', + 'PolicyGradient', + 'PPO', + 'TRPO', +] diff --git a/omnisafe/algorithms/on_policy/early_terminated/__init__.py b/omnisafe/algorithms/on_policy/early_terminated/__init__.py index aa270fe80..457ca0b3e 100644 --- a/omnisafe/algorithms/on_policy/early_terminated/__init__.py +++ b/omnisafe/algorithms/on_policy/early_terminated/__init__.py @@ -13,3 +13,14 @@ # limitations under the License. # ============================================================================== """Early terminated algorithms.""" + +from omnisafe.algorithms.on_policy.early_terminated.ppo_early_terminated import PPOEarlyTerminated +from omnisafe.algorithms.on_policy.early_terminated.ppo_lag_early_terminated import ( + PPOLagEarlyTerminated, +) + + +__all__ = [ + 'PPOEarlyTerminated', + 'PPOLagEarlyTerminated', +] diff --git a/omnisafe/algorithms/on_policy/first_order/__init__.py b/omnisafe/algorithms/on_policy/first_order/__init__.py index 7ff8122ea..630eedaa6 100644 --- a/omnisafe/algorithms/on_policy/first_order/__init__.py +++ b/omnisafe/algorithms/on_policy/first_order/__init__.py @@ -13,3 +13,12 @@ # limitations under the License. 
# ============================================================================== """The first order algorithms.""" + +from omnisafe.algorithms.on_policy.first_order.cup import CUP +from omnisafe.algorithms.on_policy.first_order.focops import FOCOPS + + +__all__ = [ + 'CUP', + 'FOCOPS', +] diff --git a/omnisafe/algorithms/on_policy/naive_lagrange/__init__.py b/omnisafe/algorithms/on_policy/naive_lagrange/__init__.py index e575cd4c6..018b41197 100644 --- a/omnisafe/algorithms/on_policy/naive_lagrange/__init__.py +++ b/omnisafe/algorithms/on_policy/naive_lagrange/__init__.py @@ -13,3 +13,16 @@ # limitations under the License. # ============================================================================== """Naive Lagrange algorithms.""" + +from omnisafe.algorithms.on_policy.naive_lagrange.npg_lag import NPGLag +from omnisafe.algorithms.on_policy.naive_lagrange.pdo import PDO +from omnisafe.algorithms.on_policy.naive_lagrange.ppo_lag import PPOLag +from omnisafe.algorithms.on_policy.naive_lagrange.trpo_lag import TRPOLag + + +__all__ = [ + 'NPGLag', + 'PDO', + 'PPOLag', + 'TRPOLag', +] diff --git a/omnisafe/algorithms/on_policy/pid_lagrange/__init__.py b/omnisafe/algorithms/on_policy/pid_lagrange/__init__.py index 25592db8a..aef2373d6 100644 --- a/omnisafe/algorithms/on_policy/pid_lagrange/__init__.py +++ b/omnisafe/algorithms/on_policy/pid_lagrange/__init__.py @@ -13,3 +13,12 @@ # limitations under the License. # ============================================================================== """PID Lagrange algorithms.""" + +from omnisafe.algorithms.on_policy.pid_lagrange.cppo_pid import CPPOPid +from omnisafe.algorithms.on_policy.pid_lagrange.trpo_pid import TRPOPid + + +__all__ = [ + 'CPPOPid', + 'TRPOPid', +] diff --git a/omnisafe/algorithms/on_policy/saute/__init__.py b/omnisafe/algorithms/on_policy/saute/__init__.py index 6dab3e35c..65e0a5087 100644 --- a/omnisafe/algorithms/on_policy/saute/__init__.py +++ b/omnisafe/algorithms/on_policy/saute/__init__.py @@ -13,3 +13,12 @@ # limitations under the License. # ============================================================================== """Saute algorithms.""" + +from omnisafe.algorithms.on_policy.saute.ppo_lag_saute import PPOLagSaute +from omnisafe.algorithms.on_policy.saute.ppo_saute import PPOSaute + + +__all__ = [ + 'PPOLagSaute', + 'PPOSaute', +] diff --git a/omnisafe/algorithms/on_policy/second_order/__init__.py b/omnisafe/algorithms/on_policy/second_order/__init__.py index db903c4c2..ac04d723b 100644 --- a/omnisafe/algorithms/on_policy/second_order/__init__.py +++ b/omnisafe/algorithms/on_policy/second_order/__init__.py @@ -13,3 +13,12 @@ # limitations under the License. # ============================================================================== """Second order algorithms.""" + +from omnisafe.algorithms.on_policy.second_order.cpo import CPO +from omnisafe.algorithms.on_policy.second_order.pcpo import PCPO + + +__all__ = [ + 'CPO', + 'PCPO', +] diff --git a/omnisafe/algorithms/on_policy/simmer/__init__.py b/omnisafe/algorithms/on_policy/simmer/__init__.py index 1a8a5d794..55fddc516 100644 --- a/omnisafe/algorithms/on_policy/simmer/__init__.py +++ b/omnisafe/algorithms/on_policy/simmer/__init__.py @@ -13,3 +13,16 @@ # limitations under the License. 
# ============================================================================== """Simmer algorithms.""" + +from omnisafe.algorithms.on_policy.simmer.ppo_lag_simmer_pid import PPOLagSimmerPid +from omnisafe.algorithms.on_policy.simmer.ppo_lag_simmer_q import PPOLagSimmerQ +from omnisafe.algorithms.on_policy.simmer.ppo_simmer_pid import PPOSimmerPid +from omnisafe.algorithms.on_policy.simmer.ppo_simmer_q import PPOSimmerQ + + +__all__ = [ + 'PPOLagSimmerPid', + 'PPOLagSimmerQ', + 'PPOSimmerPid', + 'PPOSimmerQ', +] From 1ee26d0d21d831f562cc67b7ba9ae72a8c53022a Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Thu, 22 Dec 2022 19:45:18 +0800 Subject: [PATCH 23/39] chore(algorithms): remove module references --- omnisafe/algorithms/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/omnisafe/algorithms/__init__.py b/omnisafe/algorithms/__init__.py index 497b58bbd..23829c7d7 100644 --- a/omnisafe/algorithms/__init__.py +++ b/omnisafe/algorithms/__init__.py @@ -68,3 +68,5 @@ ALGORITHMS = MappingProxyType(ALGORITHMS) # make this immutable ALGORITHM2TYPE = MappingProxyType(ALGORITHM2TYPE) # make this immutable + +del itertools, MappingProxyType From e7f11cdd1bb6ffa3c3d6de88366941c05e513565 Mon Sep 17 00:00:00 2001 From: Gaiejj <524339208@qq.com> Date: Thu, 22 Dec 2022 21:12:26 +0800 Subject: [PATCH 24/39] fix: reformat the comments --- omnisafe/algorithms/off_policy/ddpg_lag.py | 1 - omnisafe/algorithms/off_policy/sac.py | 2 +- omnisafe/algorithms/off_policy/sddpg.py | 1 - omnisafe/algorithms/off_policy/td3.py | 3 +-- omnisafe/algorithms/off_policy/td3_lag.py | 1 - omnisafe/algorithms/on_policy/first_order/__init__.py | 2 +- omnisafe/algorithms/on_policy/pid_lagrange/trpo_pid.py | 2 +- omnisafe/algorithms/on_policy/second_order/__init__.py | 2 +- omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_pid.py | 2 +- omnisafe/algorithms/on_policy/simmer/ppo_simmer_pid.py | 2 +- 10 files changed, 7 insertions(+), 11 deletions(-) diff --git a/omnisafe/algorithms/off_policy/ddpg_lag.py b/omnisafe/algorithms/off_policy/ddpg_lag.py index db390cc75..22cec4bf9 100644 --- a/omnisafe/algorithms/off_policy/ddpg_lag.py +++ b/omnisafe/algorithms/off_policy/ddpg_lag.py @@ -14,7 +14,6 @@ # ============================================================================== """Implementation of the DDPGLag algorithm.""" - from omnisafe.algorithms import registry from omnisafe.algorithms.off_policy.ddpg import DDPG from omnisafe.common.lagrange import Lagrange diff --git a/omnisafe/algorithms/off_policy/sac.py b/omnisafe/algorithms/off_policy/sac.py index 89f22a597..1ba8f7798 100644 --- a/omnisafe/algorithms/off_policy/sac.py +++ b/omnisafe/algorithms/off_policy/sac.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -"""Implementation of the DDPG algorithm.""" +"""Implementation of the SAC algorithm.""" import torch diff --git a/omnisafe/algorithms/off_policy/sddpg.py b/omnisafe/algorithms/off_policy/sddpg.py index a9013efab..d7c2260a8 100644 --- a/omnisafe/algorithms/off_policy/sddpg.py +++ b/omnisafe/algorithms/off_policy/sddpg.py @@ -14,7 +14,6 @@ # ============================================================================== """Implementation of the SDDPG algorithm.""" - import torch from omnisafe.algorithms import registry diff --git a/omnisafe/algorithms/off_policy/td3.py b/omnisafe/algorithms/off_policy/td3.py index ed7f432ce..d815f4217 100644 --- a/omnisafe/algorithms/off_policy/td3.py +++ b/omnisafe/algorithms/off_policy/td3.py @@ -12,8 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of the DDPG algorithm.""" - +"""Implementation of the TD3 algorithm.""" import torch diff --git a/omnisafe/algorithms/off_policy/td3_lag.py b/omnisafe/algorithms/off_policy/td3_lag.py index a4b05a81b..8912872aa 100644 --- a/omnisafe/algorithms/off_policy/td3_lag.py +++ b/omnisafe/algorithms/off_policy/td3_lag.py @@ -14,7 +14,6 @@ # ============================================================================== """Implementation of the TD3Lag algorithm.""" - from omnisafe.algorithms import registry from omnisafe.algorithms.off_policy.td3 import TD3 from omnisafe.common.lagrange import Lagrange diff --git a/omnisafe/algorithms/on_policy/first_order/__init__.py b/omnisafe/algorithms/on_policy/first_order/__init__.py index 630eedaa6..635735694 100644 --- a/omnisafe/algorithms/on_policy/first_order/__init__.py +++ b/omnisafe/algorithms/on_policy/first_order/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""The first order algorithms.""" +"""FIrst-order algorithms.""" from omnisafe.algorithms.on_policy.first_order.cup import CUP from omnisafe.algorithms.on_policy.first_order.focops import FOCOPS diff --git a/omnisafe/algorithms/on_policy/pid_lagrange/trpo_pid.py b/omnisafe/algorithms/on_policy/pid_lagrange/trpo_pid.py index 5ebaf5e1b..b46364f07 100644 --- a/omnisafe/algorithms/on_policy/pid_lagrange/trpo_pid.py +++ b/omnisafe/algorithms/on_policy/pid_lagrange/trpo_pid.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of the TRPO Pid-Lagrange algorithm.""" +"""Implementation of the TRPO PID-Lagrange algorithm.""" import torch diff --git a/omnisafe/algorithms/on_policy/second_order/__init__.py b/omnisafe/algorithms/on_policy/second_order/__init__.py index ac04d723b..236f34808 100644 --- a/omnisafe/algorithms/on_policy/second_order/__init__.py +++ b/omnisafe/algorithms/on_policy/second_order/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -"""Second order algorithms.""" +"""Second-order algorithms.""" from omnisafe.algorithms.on_policy.second_order.cpo import CPO from omnisafe.algorithms.on_policy.second_order.pcpo import PCPO diff --git a/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_pid.py b/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_pid.py index d7a4abeb8..3fa8bec39 100644 --- a/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_pid.py +++ b/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_pid.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of the Pid Simmer algorithm by PPOLag.""" +"""Implementation of the PID Simmer algorithm by PPOLag.""" from omnisafe.algorithms import registry diff --git a/omnisafe/algorithms/on_policy/simmer/ppo_simmer_pid.py b/omnisafe/algorithms/on_policy/simmer/ppo_simmer_pid.py index 40a0a430d..14a7de4ba 100644 --- a/omnisafe/algorithms/on_policy/simmer/ppo_simmer_pid.py +++ b/omnisafe/algorithms/on_policy/simmer/ppo_simmer_pid.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of the Pid Simmer algorithm by PPOLag.""" +"""Implementation of the PID Simmer algorithm by PPOLag.""" from omnisafe.algorithms import registry from omnisafe.algorithms.on_policy.base.ppo import PPO From 2fda913d166e1249a2beb0132216078d50d02e57 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Thu, 22 Dec 2022 21:22:39 +0800 Subject: [PATCH 25/39] docs: update README.md --- README.md | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 0200ec20d..793df1e96 100644 --- a/README.md +++ b/README.md @@ -24,8 +24,13 @@ The simulation environment around OmniSafe and a series of reliable algorithm im - [Overview](#overview) - [Implemented Algorithms](#implemented-algorithms) - - [Published **in 2022**](#published-in-2022) + - [Newly Published in 2022](#newly-published-in-2022) - [List of Algorithms](#list-of-algorithms) + - [On-Policy Safe](#on-policy-safe) + - [Off-Policy Safe](#off-policy-safe) + - [Model-Based Safe](#model-based-safe) + - [Offline Safe](#offline-safe) + - [Others](#others) - [SafeRL Environments](#saferl-environments) - [Safety Gymnasium](#safety-gymnasium) - [Vision-base Safe RL](#vision-base-safe-rl) @@ -66,7 +71,7 @@ Here we provide a table for comparison of **OmniSafe's algorithm core** and exis The supported interface algorithms currently include: -### Published **in 2022** +### Newly Published in 2022 - [X] **[AAAI 2023]** Augmented Proximal Policy Optimization for Safe Reinforcement Learning (APPO) **The original author of the paper contributed code** - [X] **[NeurIPS 2022]** [Constrained Update Projection Approach to Safe Policy Optimization (CUP)](https://arxiv.org/abs/2209.07089) **The original author of the paper contributed code** @@ -81,7 +86,7 @@ Safe Exploration (Swimmer)](https://arxiv.org/abs/2206.02675) ### List of Algorithms -> On Policy Safe +#### On-Policy Safe - [X] [The Lagrange version of PPO (PPO-Lag)](https://cdn.openai.com/safexp-short.pdf) - [X] [The Lagrange version of TRPO (TRPO-Lag)](https://cdn.openai.com/safexp-short.pdf) @@ -93,7 +98,7 @@ Safe Exploration 
(Swimmer)](https://arxiv.org/abs/2206.02675) - [X] **[ICLR 2020]** [Projection-Based Constrained Policy Optimization (PCPO)](https://openreview.net/forum?id=rke3TJrtPS) - [X] **[ICML 2021]** [CRPO: A New Approach for Safe Reinforcement Learning with Convergence Guarantee](https://arxiv.org/abs/2011.05869) -> Off Policy Safe +#### Off-Policy Safe - [X] The Lagrange version of TD3 (TD3-Lag) - [X] The Lagrange version of DDPG (DDPG-Lag) @@ -102,7 +107,7 @@ Safe Exploration (Swimmer)](https://arxiv.org/abs/2206.02675) - [X] **[ICML 2019]** [Lyapunov-based Safe Policy Optimization for Continuous Control (SDDPG-modular)](https://arxiv.org/abs/1901.10031) - [ ] **[ICML 2022]** [Constrained Variational Policy Optimization for Safe Reinforcement Learning (CVPO)](https://arxiv.org/abs/2201.11927) -> Model Base Safe +#### Model-Based Safe - [ ] **[NeurIPS 2021]** [Safe Reinforcement Learning by Imagining the Near Future (SMBPO)](https://arxiv.org/abs/2202.07789) - [X] **[CoRL 2021 (Oral)]** [Learning Off-Policy with Online Planning (SafeLoop)](https://arxiv.org/abs/2008.10066) @@ -110,7 +115,7 @@ Safe Exploration (Swimmer)](https://arxiv.org/abs/2206.02675) - [ ] **[NeurIPS 2022]** [Model-based Safe Deep Reinforcement Learning via a Constrained Proximal Policy Optimization Algorithm](https://arxiv.org/abs/2210.07573) - [ ] **[ICLR 2022]** [Constrained Policy Optimization via Bayesian World Models (LAMBDA)](https://arxiv.org/abs/2201.09802) -> Offline Safe +#### Offline Safe - [X] [The Lagrange version of BCQ (BCQ-Lag)](https://arxiv.org/abs/1812.02900) - [X] [The Constrained version of CRR (C-CRR)](https://proceedings.neurips.cc/paper/2020/hash/588cb956d6bbe67078f29f8de420a13d-Abstract.html) @@ -118,7 +123,7 @@ Safe Exploration (Swimmer)](https://arxiv.org/abs/2206.02675) - [ ] **[ICLR 2022 (Spotlight)]** [COptiDICE: Offline Constrained Reinforcement Learning via Stationary Distribution Correction Estimation](https://arxiv.org/abs/2204.08957?context=cs.AI) - [ ] **[ICML 2022]** [Constrained Offline Policy Optimization (COPO)](https://proceedings.mlr.press/v162/polosky22a.html) -> Other +#### Others - [X] [Safe Exploration in Continuous Action Spaces (Safety Layer)](https://arxiv.org/abs/1801.08757) - [ ] **[RA-L 2021]** [Recovery RL: Safe Reinforcement Learning with Learned Recovery Zones](https://arxiv.org/abs/2010.15920) From 660519e95bd40e6ed5b71bac8389f0fec6080160 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Thu, 22 Dec 2022 22:09:12 +0800 Subject: [PATCH 26/39] docs: update docstrings --- omnisafe/algorithms/off_policy/ddpg.py | 11 +++++------ omnisafe/algorithms/off_policy/ddpg_lag.py | 13 ++++++------- omnisafe/algorithms/off_policy/sac.py | 10 ++++------ omnisafe/algorithms/off_policy/sac_lag.py | 9 ++++----- omnisafe/algorithms/off_policy/sddpg.py | 10 +++++----- omnisafe/algorithms/off_policy/td3.py | 9 ++++----- omnisafe/algorithms/off_policy/td3_lag.py | 11 +++++------ omnisafe/algorithms/on_policy/base/natural_pg.py | 7 +++---- .../algorithms/on_policy/base/policy_gradient.py | 7 +++---- omnisafe/algorithms/on_policy/base/ppo.py | 8 ++++---- omnisafe/algorithms/on_policy/base/trpo.py | 8 ++++---- .../early_terminated/ppo_early_terminated.py | 10 +++++----- .../early_terminated/ppo_lag_early_terminated.py | 10 +++++----- .../algorithms/on_policy/first_order/__init__.py | 2 +- omnisafe/algorithms/on_policy/first_order/cup.py | 10 +++++----- omnisafe/algorithms/on_policy/first_order/focops.py | 7 +++---- .../algorithms/on_policy/naive_lagrange/npg_lag.py | 7 +++---- 
omnisafe/algorithms/on_policy/naive_lagrange/pdo.py | 5 ++--- .../algorithms/on_policy/naive_lagrange/ppo_lag.py | 11 +++++------ .../algorithms/on_policy/naive_lagrange/trpo_lag.py | 10 +++++----- .../algorithms/on_policy/pid_lagrange/cppo_pid.py | 11 +++++------ .../algorithms/on_policy/pid_lagrange/trpo_pid.py | 11 +++++------ .../algorithms/on_policy/saute/ppo_lag_saute.py | 12 ++++++------ omnisafe/algorithms/on_policy/saute/ppo_saute.py | 10 +++++----- omnisafe/algorithms/on_policy/second_order/cpo.py | 9 ++++----- omnisafe/algorithms/on_policy/second_order/pcpo.py | 7 +++---- .../on_policy/simmer/ppo_lag_simmer_pid.py | 11 +++++------ .../algorithms/on_policy/simmer/ppo_lag_simmer_q.py | 11 +++++------ .../algorithms/on_policy/simmer/ppo_simmer_pid.py | 10 +++++----- .../algorithms/on_policy/simmer/ppo_simmer_q.py | 10 +++++----- omnisafe/algorithms/registry.py | 6 +++--- 31 files changed, 132 insertions(+), 151 deletions(-) diff --git a/omnisafe/algorithms/off_policy/ddpg.py b/omnisafe/algorithms/off_policy/ddpg.py index e289d0855..774563d99 100644 --- a/omnisafe/algorithms/off_policy/ddpg.py +++ b/omnisafe/algorithms/off_policy/ddpg.py @@ -31,14 +31,13 @@ @registry.register class DDPG: # pylint: disable=too-many-instance-attributes - """Continuous control with deep reinforcement learning (DDPG) Algorithm. + """The Deep Deterministic Policy Gradient (DDPG) algorithm. References: - Paper Name: Continuous control with deep reinforcement learning. - Paper author: Timothy P. Lillicrap, Jonathan J. Hunt, Alexander Pritzel, Nicolas Heess, - Tom Erez, Yuval Tassa, David Silver, Daan Wierstra. - Paper URL: https://arxiv.org/abs/1509.02971 - + Title: Continuous control with deep reinforcement learning + Authors: Timothy P. Lillicrap, Jonathan J. Hunt, Alexander Pritzel, Nicolas Heess, Tom Erez, + Yuval Tassa, David Silver, Daan Wierstra. + URL: https://arxiv.org/abs/1509.02971 """ def __init__( diff --git a/omnisafe/algorithms/off_policy/ddpg_lag.py b/omnisafe/algorithms/off_policy/ddpg_lag.py index 22cec4bf9..01a9ea301 100644 --- a/omnisafe/algorithms/off_policy/ddpg_lag.py +++ b/omnisafe/algorithms/off_policy/ddpg_lag.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of the DDPGLag algorithm.""" +"""Implementation of the Lagrange version of the DDPG algorithm.""" from omnisafe.algorithms import registry from omnisafe.algorithms.off_policy.ddpg import DDPG @@ -21,14 +21,13 @@ @registry.register class DDPGLag(DDPG, Lagrange): # pylint: disable=too-many-instance-attributes - """The Lagrange version of DDPG Algorithm. + """The Lagrange version of the DDPG Algorithm. References: - Paper Name: Continuous control with deep reinforcement learning. - Paper author: Timothy P. Lillicrap, Jonathan J. Hunt, Alexander Pritzel, Nicolas Heess, - Tom Erez, Yuval Tassa, David Silver, Daan Wierstra. - Paper URL: https://arxiv.org/abs/1509.02971 - + Title: Continuous control with deep reinforcement learning + Authors: Timothy P. Lillicrap, Jonathan J. Hunt, Alexander Pritzel, Nicolas Heess, Tom Erez, + Yuval Tassa, David Silver, Daan Wierstra. 
+ URL: https://arxiv.org/abs/1509.02971 """ def __init__( diff --git a/omnisafe/algorithms/off_policy/sac.py b/omnisafe/algorithms/off_policy/sac.py index 1ba8f7798..30c6f8d49 100644 --- a/omnisafe/algorithms/off_policy/sac.py +++ b/omnisafe/algorithms/off_policy/sac.py @@ -14,7 +14,6 @@ # ============================================================================== """Implementation of the SAC algorithm.""" - import torch from omnisafe.algorithms import registry @@ -23,13 +22,12 @@ @registry.register class SAC(DDPG): # pylint: disable=too-many-instance-attributes - """Implementation of the SAC algorithm. + """The Soft Actor-Critic (SAC) algorithm. References: - Paper Name: Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor - Paper author: Tuomas Haarnoja, Aurick Zhou, Pieter Abbeel, Sergey Levine - Paper URL: https://arxiv.org/abs/1801.01290 - + Title: Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor + Authors: Tuomas Haarnoja, Aurick Zhou, Pieter Abbeel, Sergey Levine. + URL: https://arxiv.org/abs/1801.01290 """ def __init__( diff --git a/omnisafe/algorithms/off_policy/sac_lag.py b/omnisafe/algorithms/off_policy/sac_lag.py index 21f835975..ce2d57a90 100644 --- a/omnisafe/algorithms/off_policy/sac_lag.py +++ b/omnisafe/algorithms/off_policy/sac_lag.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of the SACLag algorithm.""" +"""Implementation of the Lagrange version of the SAC algorithm.""" import torch @@ -26,10 +26,9 @@ class SACLag(SAC, Lagrange): # pylint: disable=too-many-instance-attributes """The Lagrange version of SAC algorithm. References: - Paper Name: Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor - Paper author: Tuomas Haarnoja, Aurick Zhou, Pieter Abbeel, Sergey Levine - Paper URL: https://arxiv.org/abs/1801.01290 - + Title: Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor + Authors: Tuomas Haarnoja, Aurick Zhou, Pieter Abbeel, Sergey Levine. + URL: https://arxiv.org/abs/1801.01290 """ def __init__( diff --git a/omnisafe/algorithms/off_policy/sddpg.py b/omnisafe/algorithms/off_policy/sddpg.py index d7c2260a8..76388d442 100644 --- a/omnisafe/algorithms/off_policy/sddpg.py +++ b/omnisafe/algorithms/off_policy/sddpg.py @@ -29,13 +29,13 @@ @registry.register class SDDPG(DDPG): # pylint: disable=too-many-instance-attributes,invalid-name - r"""Implementation of SDDPG Algorithm. + r"""The SDDPG algorithm. References: - Paper Name: Lyapunov-based Safe Policy Optimization for Continuous Control. - Paper author: Yinlam Chow, Ofir Nachum, Aleksandra Faust, Edgar Duenez-Guzman, Mohammad Ghavamzadeh. - Paper URL: https://arxiv.org/abs/1901.10031 - + Title: Lyapunov-based Safe Policy Optimization for Continuous Control + Authors: Yinlam Chow, Ofir Nachum, Aleksandra Faust, Edgar Duenez-Guzman, + Mohammad Ghavamzadeh. + URL: https://arxiv.org/abs/1901.10031 """ def __init__( diff --git a/omnisafe/algorithms/off_policy/td3.py b/omnisafe/algorithms/off_policy/td3.py index d815f4217..75c0b433c 100644 --- a/omnisafe/algorithms/off_policy/td3.py +++ b/omnisafe/algorithms/off_policy/td3.py @@ -22,13 +22,12 @@ @registry.register class TD3(DDPG): # pylint: disable=too-many-instance-attributes - """Implementation of TD3 Algorithm. 
+ """The Twin Delayed DDPG (TD3) algorithm. References: - Paper Name: Addressing Function Approximation Error in Actor-Critic Methods. - Paper author: Scott Fujimoto, Herke van Hoof, David Meger. - Paper URL: https://arxiv.org/abs/1802.09477 - + Title: Addressing Function Approximation Error in Actor-Critic Methods + Authors: Scott Fujimoto, Herke van Hoof, David Meger. + URL: https://arxiv.org/abs/1802.09477 """ def __init__( diff --git a/omnisafe/algorithms/off_policy/td3_lag.py b/omnisafe/algorithms/off_policy/td3_lag.py index 8912872aa..1620eb81b 100644 --- a/omnisafe/algorithms/off_policy/td3_lag.py +++ b/omnisafe/algorithms/off_policy/td3_lag.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of the TD3Lag algorithm.""" +"""Implementation of the Lagrange version of the TD3 algorithm.""" from omnisafe.algorithms import registry from omnisafe.algorithms.off_policy.td3 import TD3 @@ -21,13 +21,12 @@ @registry.register class TD3Lag(TD3, Lagrange): # pylint: disable=too-many-instance-attributes - """The Lagrange version of TD3 Algorithm. + """The Lagrange version of the TD3 algorithm References: - Paper Name: Addressing Function Approximation Error in Actor-Critic Methods. - Paper author: Scott Fujimoto, Herke van Hoof, David Meger. - Paper URL: https://arxiv.org/abs/1802.09477 - + Title: Addressing Function Approximation Error in Actor-Critic Methods + Authors: Scott Fujimoto, Herke van Hoof, David Meger. + URL: https://arxiv.org/abs/1802.09477 """ def __init__( diff --git a/omnisafe/algorithms/on_policy/base/natural_pg.py b/omnisafe/algorithms/on_policy/base/natural_pg.py index 6201ac307..7d7f02035 100644 --- a/omnisafe/algorithms/on_policy/base/natural_pg.py +++ b/omnisafe/algorithms/on_policy/base/natural_pg.py @@ -32,10 +32,9 @@ class NaturalPG(PolicyGradient): """The Natural Policy Gradient algorithm. References: - Paper Name: A Natural Policy Gradient. - Paper author: Sham Kakade. - Paper URL: https://proceedings.neurips.cc/paper/2001/file/4b86abe48d358ecf194c56c69108433e-Paper.pdf - + Title: A Natural Policy Gradient + Author: Sham Kakade. + URL: https://proceedings.neurips.cc/paper/2001/file/4b86abe48d358ecf194c56c69108433e-Paper.pdf """ def __init__( diff --git a/omnisafe/algorithms/on_policy/base/policy_gradient.py b/omnisafe/algorithms/on_policy/base/policy_gradient.py index 1a4400119..b9367a618 100644 --- a/omnisafe/algorithms/on_policy/base/policy_gradient.py +++ b/omnisafe/algorithms/on_policy/base/policy_gradient.py @@ -34,10 +34,9 @@ class PolicyGradient: # pylint: disable=too-many-instance-attributes """The Policy Gradient algorithm. References: - Paper Name: Policy Gradient Methods for Reinforcement Learning with Function Approximation - Paper Author: Richard S. Sutton, David McAllester, Satinder Singh, Yishay Mansour - Paper URL: https://proceedings.neurips.cc/paper/1999/file/464d828b85b0bed98e80ade0a5c43b0f-Paper.pdf - + Title: Policy Gradient Methods for Reinforcement Learning with Function Approximation + Authors: Richard S. Sutton, David McAllester, Satinder Singh, Yishay Mansour. 
+ URL: https://proceedings.neurips.cc/paper/1999/file/464d828b85b0bed98e80ade0a5c43b0f-Paper.pdf """ # pylint: disable-next=too-many-locals diff --git a/omnisafe/algorithms/on_policy/base/ppo.py b/omnisafe/algorithms/on_policy/base/ppo.py index 56ddde25e..78e42b5b0 100644 --- a/omnisafe/algorithms/on_policy/base/ppo.py +++ b/omnisafe/algorithms/on_policy/base/ppo.py @@ -23,12 +23,12 @@ @registry.register class PPO(PolicyGradient): - """The Proximal Policy Optimization Algorithms (PPO) Algorithm. + """The Proximal Policy Optimization (PPO) algorithm. References: - Paper Name: Proximal Policy Optimization Algorithms. - Paper author: John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, Oleg Klimov. - Paper URL: https://arxiv.org/pdf/1707.06347.pdf + Title: Proximal Policy Optimization Algorithms + Authors: John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, Oleg Klimov. + URL: https://arxiv.org/abs/1707.06347 """ # pylint: disable-next=too-many-arguments diff --git a/omnisafe/algorithms/on_policy/base/trpo.py b/omnisafe/algorithms/on_policy/base/trpo.py index f4fde9504..9e12ef6c7 100644 --- a/omnisafe/algorithms/on_policy/base/trpo.py +++ b/omnisafe/algorithms/on_policy/base/trpo.py @@ -29,12 +29,12 @@ @registry.register class TRPO(NaturalPG): - """The Trust Region Policy Optimization (TRPO) Algorithm. + """The Trust Region Policy Optimization (TRPO) algorithm. References: - Paper Name: Trust Region Policy Optimization. - Paper author: John Schulman, Sergey Levine, Philipp Moritz, Michael I. Jordan, Pieter Abbeel. - Paper URL: https://arxiv.org/abs/1502.05477 + Title: Trust Region Policy Optimization + Authors: John Schulman, Sergey Levine, Philipp Moritz, Michael I. Jordan, Pieter Abbeel. + URL: https://arxiv.org/abs/1502.05477 """ def __init__( diff --git a/omnisafe/algorithms/on_policy/early_terminated/ppo_early_terminated.py b/omnisafe/algorithms/on_policy/early_terminated/ppo_early_terminated.py index 0777ff876..5debd33e4 100644 --- a/omnisafe/algorithms/on_policy/early_terminated/ppo_early_terminated.py +++ b/omnisafe/algorithms/on_policy/early_terminated/ppo_early_terminated.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of the Early terminated algorithm by PPO.""" +"""Implementation of the early terminated algorithm using PPO.""" from omnisafe.algorithms import registry from omnisafe.algorithms.on_policy.base.ppo import PPO @@ -20,12 +20,12 @@ @registry.register class PPOEarlyTerminated(PPO): - r"""Early terminated algorithm implemented by PPO. + r"""The early terminated algorithm implemented with PPO. References: - Paper Name: Safe Exploration by Solving Early Terminated MDP - Paper author: Hao Sun, Ziping Xu, Meng Fang, Zhenghao Peng, Jiadong Guo, Bo Dai, Bolei Zhou - Paper URL: https://arxiv.org/abs/2107.04200 + Title: Safe Exploration by Solving Early Terminated MDP + Authors: Hao Sun, Ziping Xu, Meng Fang, Zhenghao Peng, Jiadong Guo, Bo Dai, Bolei Zhou. 
+ URL: https://arxiv.org/abs/2107.04200 """ # pylint: disable-next=too-many-arguments diff --git a/omnisafe/algorithms/on_policy/early_terminated/ppo_lag_early_terminated.py b/omnisafe/algorithms/on_policy/early_terminated/ppo_lag_early_terminated.py index 2ac6f9d73..19b5be987 100644 --- a/omnisafe/algorithms/on_policy/early_terminated/ppo_lag_early_terminated.py +++ b/omnisafe/algorithms/on_policy/early_terminated/ppo_lag_early_terminated.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of the Early terminated algorithm by PPOLag.""" +"""Implementation of the Lagrange version of the early terminated algorithm using PPOLag.""" from omnisafe.algorithms import registry from omnisafe.algorithms.on_policy.naive_lagrange.ppo_lag import PPOLag @@ -20,12 +20,12 @@ @registry.register class PPOLagEarlyTerminated(PPOLag): - r"""Early terminated algorithm implemented by PPOLag. + r"""The Lagrange version of the early terminated algorithm implemented with PPOLag. References: - Paper Name: Safe Exploration by Solving Early Terminated MDP - Paper author: Hao Sun, Ziping Xu, Meng Fang, Zhenghao Peng, Jiadong Guo, Bo Dai, Bolei Zhou - Paper URL: https://arxiv.org/abs/2107.04200 + Title: Safe Exploration by Solving Early Terminated MDP + Authors: Hao Sun, Ziping Xu, Meng Fang, Zhenghao Peng, Jiadong Guo, Bo Dai, Bolei Zhou. + URL: https://arxiv.org/abs/2107.04200 """ # pylint: disable-next=too-many-arguments diff --git a/omnisafe/algorithms/on_policy/first_order/__init__.py b/omnisafe/algorithms/on_policy/first_order/__init__.py index 635735694..3c0bde0ea 100644 --- a/omnisafe/algorithms/on_policy/first_order/__init__.py +++ b/omnisafe/algorithms/on_policy/first_order/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""FIrst-order algorithms.""" +"""First-order algorithms.""" from omnisafe.algorithms.on_policy.first_order.cup import CUP from omnisafe.algorithms.on_policy.first_order.focops import FOCOPS diff --git a/omnisafe/algorithms/on_policy/first_order/cup.py b/omnisafe/algorithms/on_policy/first_order/cup.py index 099e3b6ad..48bd6324e 100644 --- a/omnisafe/algorithms/on_policy/first_order/cup.py +++ b/omnisafe/algorithms/on_policy/first_order/cup.py @@ -24,13 +24,13 @@ @registry.register class CUP(PolicyGradient, Lagrange): - """The Constrained Update Projection Approach to Safe Policy Optimization. + """The Constrained Update Projection (CUP) Approach to Safe Policy Optimization. References: - Paper Name: Constrained Update Projection Approach to Safe Policy Optimization. - Paper author: Long Yang, Jiaming Ji, Juntao Dai, Linrui Zhang, Binbin Zhou, Pengfei Li, Yaodong Yang, Gang Pan. - Paper URL: https://arxiv.org/abs/2209.07089 - + Title: Constrained Update Projection Approach to Safe Policy Optimization + Authors: Long Yang, Jiaming Ji, Juntao Dai, Linrui Zhang, Binbin Zhou, Pengfei Li, + Yaodong Yang, Gang Pan. 
+ URL: https://arxiv.org/abs/2209.07089 """ def __init__( diff --git a/omnisafe/algorithms/on_policy/first_order/focops.py b/omnisafe/algorithms/on_policy/first_order/focops.py index cd19b4b37..3f2533731 100644 --- a/omnisafe/algorithms/on_policy/first_order/focops.py +++ b/omnisafe/algorithms/on_policy/first_order/focops.py @@ -27,10 +27,9 @@ class FOCOPS(PolicyGradient, Lagrange): """The First Order Constrained Optimization in Policy Space (FOCOPS) algorithm. References: - Paper Name: First Order Constrained Optimization in Policy Space. - Paper author: Yiming Zhang, Quan Vuong, Keith W. Ross. - Paper URL: https://arxiv.org/abs/2002.06506 - + Title: First Order Constrained Optimization in Policy Space + Authors: Yiming Zhang, Quan Vuong, Keith W. Ross. + URL: https://arxiv.org/abs/2002.06506 """ def __init__( diff --git a/omnisafe/algorithms/on_policy/naive_lagrange/npg_lag.py b/omnisafe/algorithms/on_policy/naive_lagrange/npg_lag.py index 2876edb7a..88acacc04 100644 --- a/omnisafe/algorithms/on_policy/naive_lagrange/npg_lag.py +++ b/omnisafe/algorithms/on_policy/naive_lagrange/npg_lag.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of the Lagrange version of Natural Policy Gradient algorithm.""" +"""Implementation of the Lagrange version of the Natural Policy Gradient algorithm.""" import torch @@ -23,10 +23,9 @@ @registry.register class NPGLag(NaturalPG, Lagrange): - """The Lagrange version of Natural Policy Gradient algorithm. - - A simple combination of Lagrange method and Natural Policy Gradient algorithm. + """The Lagrange version of the Natural Policy Gradient algorithm. + A simple combination of the Lagrange method and the Natural Policy Gradient algorithm. """ def __init__( diff --git a/omnisafe/algorithms/on_policy/naive_lagrange/pdo.py b/omnisafe/algorithms/on_policy/naive_lagrange/pdo.py index 2446f13b0..bebfc5a25 100644 --- a/omnisafe/algorithms/on_policy/naive_lagrange/pdo.py +++ b/omnisafe/algorithms/on_policy/naive_lagrange/pdo.py @@ -23,10 +23,9 @@ @registry.register class PDO(PolicyGradient, Lagrange): - """The Lagrange version of Policy Gradient algorithm. - - A simple combination of Lagrange method and Policy Gradient algorithm. + """The Lagrange version of the Policy Gradient algorithm. + A simple combination of the Lagrange method and the Policy Gradient algorithm. """ def __init__( diff --git a/omnisafe/algorithms/on_policy/naive_lagrange/ppo_lag.py b/omnisafe/algorithms/on_policy/naive_lagrange/ppo_lag.py index 954af7d9f..b82045edb 100644 --- a/omnisafe/algorithms/on_policy/naive_lagrange/ppo_lag.py +++ b/omnisafe/algorithms/on_policy/naive_lagrange/ppo_lag.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of the Lagrange version of PPO algorithm.""" +"""Implementation of the Lagrange version of the PPO algorithm.""" import torch @@ -23,13 +23,12 @@ @registry.register class PPOLag(PolicyGradient, Lagrange): - """The Lagrange version of PPO algorithm. + """The Lagrange version of the PPO algorithm. References: - Paper Name: Benchmarking Safe Exploration in Deep Reinforcement Learning. 
- Paper author: Alex Ray, Joshua Achiam, Dario Amodei - Paper URL: https://cdn.openai.com/safexp-short.pdf - + Title: Benchmarking Safe Exploration in Deep Reinforcement Learning + Authors: Alex Ray, Joshua Achiam, Dario Amodei. + URL: https://cdn.openai.com/safexp-short.pdf """ # pylint: disable-next=too-many-arguments diff --git a/omnisafe/algorithms/on_policy/naive_lagrange/trpo_lag.py b/omnisafe/algorithms/on_policy/naive_lagrange/trpo_lag.py index 93c9630e1..6733ce40f 100644 --- a/omnisafe/algorithms/on_policy/naive_lagrange/trpo_lag.py +++ b/omnisafe/algorithms/on_policy/naive_lagrange/trpo_lag.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of the Lagrange version of TRPO algorithm.""" +"""Implementation of the Lagrange version of the TRPO algorithm.""" import torch @@ -23,12 +23,12 @@ @registry.register class TRPOLag(TRPO, Lagrange): - """The Lagrange version of TRPO algorithm. + """The Lagrange version of the TRPO algorithm. References: - Paper Name: Benchmarking Safe Exploration in Deep Reinforcement Learning. - Paper author: Alex Ray, Joshua Achiam, Dario Amodei - Paper URL: https://cdn.openai.com/safexp-short.pdf + Title: Benchmarking Safe Exploration in Deep Reinforcement Learning + Authors: Alex Ray, Joshua Achiam, Dario Amodei. + URL: https://cdn.openai.com/safexp-short.pdf """ diff --git a/omnisafe/algorithms/on_policy/pid_lagrange/cppo_pid.py b/omnisafe/algorithms/on_policy/pid_lagrange/cppo_pid.py index 10afd7920..f47bf87db 100644 --- a/omnisafe/algorithms/on_policy/pid_lagrange/cppo_pid.py +++ b/omnisafe/algorithms/on_policy/pid_lagrange/cppo_pid.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of the CPPO Pid-Lagrange algorithm.""" +"""Implementation of the PID-Lagrange version of the CPPO algorithm.""" import torch @@ -23,13 +23,12 @@ @registry.register class CPPOPid(PolicyGradient, PIDLagrangian): - """The Responsive Safety in Reinforcement Learning by PID Lagrangian Methods. + """The PID-Lagrange version of the CPPO algorithm. References: - Paper Name: Responsive Safety in Reinforcement Learning by PID Lagrangian Methods. - Paper author: Joshua Achiam, David Held, Aviv Tamar, Pieter Abbeel. - Paper URL: https://arxiv.org/abs/2007.03964 - + Title: Responsive Safety in Reinforcement Learning by PID Lagrangian Methods + Authors: Joshua Achiam, David Held, Aviv Tamar, Pieter Abbeel. + URL: https://arxiv.org/abs/2007.03964 """ def __init__( diff --git a/omnisafe/algorithms/on_policy/pid_lagrange/trpo_pid.py b/omnisafe/algorithms/on_policy/pid_lagrange/trpo_pid.py index b46364f07..d92a10c7b 100644 --- a/omnisafe/algorithms/on_policy/pid_lagrange/trpo_pid.py +++ b/omnisafe/algorithms/on_policy/pid_lagrange/trpo_pid.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of the TRPO PID-Lagrange algorithm.""" +"""Implementation of the PID-Lagrange version of the TRPO algorithm.""" import torch @@ -23,13 +23,12 @@ @registry.register class TRPOPid(TRPO, PIDLagrangian): - """The Responsive Safety in Reinforcement Learning by PID Lagrangian Methods. 
+ """The PID-Lagrange version of the TRPO algorithm. References: - Paper Name: Responsive Safety in Reinforcement Learning by PID Lagrangian Methods. - Paper author: Joshua Achiam, David Held, Aviv Tamar, Pieter Abbeel. - Paper URL: https://arxiv.org/abs/2007.03964 - + Title: Responsive Safety in Reinforcement Learning by PID Lagrangian Methods + Authors: Joshua Achiam, David Held, Aviv Tamar, Pieter Abbeel. + URL: https://arxiv.org/abs/2007.03964 """ def __init__( diff --git a/omnisafe/algorithms/on_policy/saute/ppo_lag_saute.py b/omnisafe/algorithms/on_policy/saute/ppo_lag_saute.py index 559c5017a..6021ef5bb 100644 --- a/omnisafe/algorithms/on_policy/saute/ppo_lag_saute.py +++ b/omnisafe/algorithms/on_policy/saute/ppo_lag_saute.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of the Saute algorithm.""" +"""Implementation of the Lagrange version of the Saute algorithm using PPOLag.""" from omnisafe.algorithms import registry from omnisafe.algorithms.on_policy.naive_lagrange.ppo_lag import PPOLag @@ -20,13 +20,13 @@ @registry.register class PPOLagSaute(PPOLag): - r"""Saute algorithm implemented by PPOLag. + r"""The Saute algorithm implemented with PPOLag. References: - Paper Name: Saute RL: Almost Surely Safe Reinforcement Learning Using State Augmentation. - Paper author: Aivar Sootla, Alexander I. Cowen-Rivers, Taher Jafferjee, Ziyan Wang, - David Mguni, Jun Wang, Haitham Bou-Ammar. - Paper URL: https://arxiv.org/abs/2202.06558 + Title: Saute RL: Almost Surely Safe Reinforcement Learning Using State Augmentation + Authors: Aivar Sootla, Alexander I. Cowen-Rivers, Taher Jafferjee, Ziyan Wang, David Mguni, + Jun Wang, Haitham Bou-Ammar. + URL: https://arxiv.org/abs/2202.06558 """ # pylint: disable-next=too-many-arguments diff --git a/omnisafe/algorithms/on_policy/saute/ppo_saute.py b/omnisafe/algorithms/on_policy/saute/ppo_saute.py index e8c16afab..be7727a38 100644 --- a/omnisafe/algorithms/on_policy/saute/ppo_saute.py +++ b/omnisafe/algorithms/on_policy/saute/ppo_saute.py @@ -20,13 +20,13 @@ @registry.register class PPOSaute(PPO): - r"""Saute algorithm implemented by PPO. + r"""The Saute algorithm implemented with PPO. References: - Paper Name: Saute RL: Almost Surely Safe Reinforcement Learning Using State Augmentation. - Paper author: Aivar Sootla, Alexander I. Cowen-Rivers, Taher Jafferjee, Ziyan Wang, - David Mguni, Jun Wang, Haitham Bou-Ammar. - Paper URL: https://arxiv.org/abs/2202.06558 + Title: Saute RL: Almost Surely Safe Reinforcement Learning Using State Augmentation + Authors: Aivar Sootla, Alexander I. Cowen-Rivers, Taher Jafferjee, Ziyan Wang, David Mguni, + Jun Wang, Haitham Bou-Ammar. + URL: https://arxiv.org/abs/2202.06558 """ # pylint: disable-next=too-many-arguments diff --git a/omnisafe/algorithms/on_policy/second_order/cpo.py b/omnisafe/algorithms/on_policy/second_order/cpo.py index dee873cc6..e590672ff 100644 --- a/omnisafe/algorithms/on_policy/second_order/cpo.py +++ b/omnisafe/algorithms/on_policy/second_order/cpo.py @@ -30,13 +30,12 @@ @registry.register class CPO(TRPO): - """The Constrained Policy Optimization (CPO) Algorithm. + """The Constrained Policy Optimization (CPO) algorithm. References: - Paper Name: Constrained Policy Optimization. - Paper author: Joshua Achiam, David Held, Aviv Tamar, Pieter Abbeel. 
- Paper URL: https://arxiv.org/abs/1705.10528 - + Title: Constrained Policy Optimization + Authors: Joshua Achiam, David Held, Aviv Tamar, Pieter Abbeel. + URL: https://arxiv.org/abs/1705.10528 """ def __init__( diff --git a/omnisafe/algorithms/on_policy/second_order/pcpo.py b/omnisafe/algorithms/on_policy/second_order/pcpo.py index 0e5104502..c6bb29048 100644 --- a/omnisafe/algorithms/on_policy/second_order/pcpo.py +++ b/omnisafe/algorithms/on_policy/second_order/pcpo.py @@ -32,10 +32,9 @@ class PCPO(TRPO): """The Projection-Based Constrained Policy Optimization (PCPO) algorithm. References: - Paper name: Projection-Based Constrained Policy Optimization. - Paper author: Tsung-Yen Yang, Justinian Rosca, Karthik Narasimhan, Peter J. Ramadge - Paper URL: https://arxiv.org/abs/2010.03152 - + Title: Projection-Based Constrained Policy Optimization + Authors: Tsung-Yen Yang, Justinian Rosca, Karthik Narasimhan, Peter J. Ramadge. + URL: https://arxiv.org/abs/2010.03152 """ def __init__( diff --git a/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_pid.py b/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_pid.py index 3fa8bec39..32739e28a 100644 --- a/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_pid.py +++ b/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_pid.py @@ -12,8 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of the PID Simmer algorithm by PPOLag.""" - +"""Implementation of the PID version of the Simmer algorithm using PPOLag.""" from omnisafe.algorithms import registry from omnisafe.algorithms.on_policy.naive_lagrange.ppo_lag import PPOLag @@ -21,12 +20,12 @@ @registry.register class PPOLagSimmerPid(PPOLag): - r"""Simmer algorithm (PID version) implemented by PPOLag. + r"""The PID version of the Simmer algorithm implemented with PPOLag. References: - Paper Name: Effects of Safety State Augmentation on Safe Exploration. - Paper author: Aivar Sootla, Alexander I. Cowen-Rivers, Jun Wang, Haitham Bou Ammar. - Paper URL: https://arxiv.org/abs/2206.02675 + Title: Effects of Safety State Augmentation on Safe Exploration + Authors: Aivar Sootla, Alexander I. Cowen-Rivers, Jun Wang, Haitham Bou Ammar. + URL: https://arxiv.org/abs/2206.02675 """ # pylint: disable-next=too-many-arguments diff --git a/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_q.py b/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_q.py index dbbd532be..e09cb0126 100644 --- a/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_q.py +++ b/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_q.py @@ -12,8 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of the Q Simmer algorithm by PPOLag.""" - +"""Implementation of the Q Simmer algorithm using PPOLag.""" from omnisafe.algorithms import registry from omnisafe.algorithms.on_policy.naive_lagrange.ppo_lag import PPOLag @@ -21,12 +20,12 @@ @registry.register class PPOLagSimmerQ(PPOLag): - r"""Simmer algorithm (Q version) implemented by PPOLag. + r"""The Q Simmer algorithm implemented with PPOLag. References: - Paper Name: Effects of Safety State Augmentation on Safe Exploration. - Paper author: Aivar Sootla, Alexander I. Cowen-Rivers, Jun Wang, Haitham Bou Ammar. 
- Paper URL: https://arxiv.org/abs/2206.02675 + Title: Effects of Safety State Augmentation on Safe Exploration + Authors: Aivar Sootla, Alexander I. Cowen-Rivers, Jun Wang, Haitham Bou Ammar. + URL: https://arxiv.org/abs/2206.02675 """ # pylint: disable-next=too-many-arguments diff --git a/omnisafe/algorithms/on_policy/simmer/ppo_simmer_pid.py b/omnisafe/algorithms/on_policy/simmer/ppo_simmer_pid.py index 14a7de4ba..11205c504 100644 --- a/omnisafe/algorithms/on_policy/simmer/ppo_simmer_pid.py +++ b/omnisafe/algorithms/on_policy/simmer/ppo_simmer_pid.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of the PID Simmer algorithm by PPOLag.""" +"""Implementation of the PID version of the Simmer algorithm using PPO.""" from omnisafe.algorithms import registry from omnisafe.algorithms.on_policy.base.ppo import PPO @@ -20,12 +20,12 @@ @registry.register class PPOSimmerPid(PPO): - r"""Simmer algorithm (PID version) implemented by PPO. + r"""The PID version of the Simmer algorithm implemented with PPO. References: - Paper Name: Effects of Safety State Augmentation on Safe Exploration. - Paper author: Aivar Sootla, Alexander I. Cowen-Rivers, Jun Wang, Haitham Bou Ammar. - Paper URL: https://arxiv.org/abs/2206.02675 + Title: Effects of Safety State Augmentation on Safe Exploration + Authors: Aivar Sootla, Alexander I. Cowen-Rivers, Jun Wang, Haitham Bou Ammar. + URL: https://arxiv.org/abs/2206.02675 """ # pylint: disable-next=too-many-arguments diff --git a/omnisafe/algorithms/on_policy/simmer/ppo_simmer_q.py b/omnisafe/algorithms/on_policy/simmer/ppo_simmer_q.py index 11f40c892..17d86ad3c 100644 --- a/omnisafe/algorithms/on_policy/simmer/ppo_simmer_q.py +++ b/omnisafe/algorithms/on_policy/simmer/ppo_simmer_q.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of the Q Simmer algorithm by PPOLag.""" +"""Implementation of the Q Simmer algorithm using PPO.""" from omnisafe.algorithms import registry from omnisafe.algorithms.on_policy.base.ppo import PPO @@ -20,12 +20,12 @@ @registry.register class PPOSimmerQ(PPO): - r"""Simmer algorithm (Q version) implemented by PPO. + r"""The Q Simmer algorithm implemented with PPO. References: - Paper Name: Effects of Safety State Augmentation on Safe Exploration. - Paper author: Aivar Sootla, Alexander I. Cowen-Rivers, Jun Wang, Haitham Bou Ammar. - Paper URL: https://arxiv.org/abs/2206.02675 + Title: Effects of Safety State Augmentation on Safe Exploration + Authors: Aivar Sootla, Alexander I. Cowen-Rivers, Jun Wang, Haitham Bou Ammar. + URL: https://arxiv.org/abs/2206.02675 """ # pylint: disable-next=too-many-arguments diff --git a/omnisafe/algorithms/registry.py b/omnisafe/algorithms/registry.py index 464307373..72f39dc19 100644 --- a/omnisafe/algorithms/registry.py +++ b/omnisafe/algorithms/registry.py @@ -19,6 +19,7 @@ class Registry: """A registry to map strings to classes. + Args: name (str): Registry name. 
""" @@ -28,10 +29,9 @@ def __init__(self, name): self._module_dict = {} def __repr__(self): - format_str = ( - self.__class__.__name__ + f'(name={self._name}, items={list(self._module_dict.keys())})' + return ( + f'{self.__class__.__name__ }(name={self._name}, items={list(self._module_dict.keys())})' ) - return format_str @property def name(self): From d96d4c4d8232300b52b7d2201f7bb6579ea21b59 Mon Sep 17 00:00:00 2001 From: Gaiejj <524339208@qq.com> Date: Thu, 22 Dec 2022 22:34:49 +0800 Subject: [PATCH 27/39] chore(env_wrapper, tests): make registration immutable --- omnisafe/wrappers/__init__.py | 25 +++++++++++++++++++++++ tests/test_policy.py | 38 ++--------------------------------- 2 files changed, 27 insertions(+), 36 deletions(-) diff --git a/omnisafe/wrappers/__init__.py b/omnisafe/wrappers/__init__.py index 482ce94af..3fd963fd0 100644 --- a/omnisafe/wrappers/__init__.py +++ b/omnisafe/wrappers/__init__.py @@ -14,8 +14,33 @@ # ============================================================================== """Environment wrappers.""" +import itertools +from types import MappingProxyType + from omnisafe.wrappers.early_terminated_wrapper import EarlyTerminatedEnvWrapper from omnisafe.wrappers.off_policy_wrapper import OffPolicyEnvWrapper from omnisafe.wrappers.on_policy_wrapper import OnPolicyEnvWrapper from omnisafe.wrappers.saute_wrapper import SauteEnvWrapper from omnisafe.wrappers.simmer_wrapper import SimmerEnvWrapper + + +ENVWRAPPERS = { + 'on-policy-wrapper': OnPolicyEnvWrapper, + 'off-policy-wrapper': OffPolicyEnvWrapper, + 'saute-wrapper': SauteEnvWrapper, + 'simmer-wrapper': SimmerEnvWrapper, + 'early-terminated-wrapper': EarlyTerminatedEnvWrapper, +} + +ENVWRAPPERS2TYPE = { + env_wrapper: env_wrapper_type for env_wrapper_type, env_wrapper in ENVWRAPPERS.items() +} + +__all__ = ENVWRAPPERS['all'] = tuple(itertools.chain(ENVWRAPPERS.values())) + +assert len(ENVWRAPPERS2TYPE) == len(__all__), 'Duplicate environment wrappers found.' 
+ +ENVWRAPPERS = MappingProxyType(ENVWRAPPERS) +ENVWRAPPERS2TYPE = MappingProxyType(ENVWRAPPERS2TYPE) + +del itertools, MappingProxyType diff --git a/tests/test_policy.py b/tests/test_policy.py index bede24b9b..7cfe11aa3 100644 --- a/tests/test_policy.py +++ b/tests/test_policy.py @@ -18,31 +18,7 @@ import omnisafe -@helpers.parametrize( - on_policy_algo=[ - 'PolicyGradient', - 'PPO', - 'PPOLag', - 'NaturalPG', - 'TRPO', - 'TRPOLag', - 'PDO', - 'NPGLag', - 'CPO', - 'PCPO', - 'FOCOPS', - 'CPPOPid', - 'CUP', - 'PPOSaute', - 'PPOSimmerPid', - 'PPOSimmerQ', - 'PPOEarlyTerminated', - 'PPOLagSaute', - 'PPOLagSimmerPid', - 'PPOLagSimmerQ', - 'PPOLagEarlyTerminated', - ] -) +@helpers.parametrize(on_policy_algo=omnisafe.ALGORITHMS['on-policy']) def test_on_policy(on_policy_algo): """Test algorithms""" env_id = 'SafetyPointGoal1-v0' @@ -51,17 +27,7 @@ def test_on_policy(on_policy_algo): agent.learn() -@helpers.parametrize( - off_policy_algo=[ - 'DDPG', - 'TD3', - 'SAC', - 'DDPGLag', - 'TD3Lag', - 'SACLag', - 'SDDPG', - ] -) +@helpers.parametrize(off_policy_algo=omnisafe.ALGORITHMS['off-policy']) def test_off_policy(off_policy_algo): """Test algorithms""" env_id = 'SafetyPointGoal1-v0' From 5447946a7c043723ba30c43283c2324e07134ac5 Mon Sep 17 00:00:00 2001 From: Gaiejj <524339208@qq.com> Date: Thu, 22 Dec 2022 22:37:05 +0800 Subject: [PATCH 28/39] docs: update docstrings --- omnisafe/algorithms/off_policy/sddpg.py | 12 ++++++------ omnisafe/algorithms/off_policy/td3.py | 3 +-- omnisafe/wrappers/off_policy_wrapper.py | 2 +- omnisafe/wrappers/simmer_wrapper.py | 2 +- 4 files changed, 9 insertions(+), 10 deletions(-) diff --git a/omnisafe/algorithms/off_policy/sddpg.py b/omnisafe/algorithms/off_policy/sddpg.py index 76388d442..29b4a7c3e 100644 --- a/omnisafe/algorithms/off_policy/sddpg.py +++ b/omnisafe/algorithms/off_policy/sddpg.py @@ -29,7 +29,7 @@ @registry.register class SDDPG(DDPG): # pylint: disable=too-many-instance-attributes,invalid-name - r"""The SDDPG algorithm. + """Implementation of SDDPG Algorithm. References: Title: Lyapunov-based Safe Policy Optimization for Continuous Control @@ -45,7 +45,7 @@ def __init__( algo: str = 'SDDPG', wrapper_type: str = 'OffPolicyEnvWrapper', ): - r"""Initialize SDDPG. + """Initialize SDDPG. Args: env_id (str): environment id. @@ -68,7 +68,7 @@ def __init__( self.d_init = cfgs.d_init def update(self, data): - r"""Update. + """Update. Args: data (dict): data dictionary. @@ -101,7 +101,7 @@ def update(self, data): self.polyak_update_target() def Fvp(self, params): - r""" + """ Build the Hessian-vector product based on an approximation of the KL-divergence. For details see John Schulman's PhD thesis (pp. 40) http://joschu.net/docs/thesis.pdf @@ -130,7 +130,7 @@ def Fvp(self, params): return flat_grad_grad_kl + params * self.cg_damping def compute_loss_cost_performance(self, data): - r"""Compute loss of cost performance. + """Compute loss of cost performance. Args: data (dict): data dictionary. @@ -146,7 +146,7 @@ def compute_loss_cost_performance(self, data): # pylint: disable=invalid-name,too-many-arguments,too-many-locals def update_policy_net(self, data) -> None: - r"""Update policy network. + """Update policy network. Args: data (dict): data dictionary. diff --git a/omnisafe/algorithms/off_policy/td3.py b/omnisafe/algorithms/off_policy/td3.py index 75c0b433c..cd4838937 100644 --- a/omnisafe/algorithms/off_policy/td3.py +++ b/omnisafe/algorithms/off_policy/td3.py @@ -46,8 +46,7 @@ def __init__( ) def compute_loss_v(self, data): - """ - Computing value loss. 
+ """Computing value loss. Args: data (dict): data from replay buffer. diff --git a/omnisafe/wrappers/off_policy_wrapper.py b/omnisafe/wrappers/off_policy_wrapper.py index d3a18c810..95f584d69 100644 --- a/omnisafe/wrappers/off_policy_wrapper.py +++ b/omnisafe/wrappers/off_policy_wrapper.py @@ -23,7 +23,7 @@ # pylint: disable=too-many-instance-attributes @WRAPPER_REGISTRY.register class OffPolicyEnvWrapper: - """OffPolicyEnvWrapperr""" + """OffPolicyEnvWrapper""" def __init__( self, diff --git a/omnisafe/wrappers/simmer_wrapper.py b/omnisafe/wrappers/simmer_wrapper.py index 8d1f2ca46..1b1989545 100644 --- a/omnisafe/wrappers/simmer_wrapper.py +++ b/omnisafe/wrappers/simmer_wrapper.py @@ -344,7 +344,7 @@ def __init__( ) def augment_obs(self, obs: np.array, safety_obs: np.array): - """Augmenting the obs with the safety obs, if needed. + """Augmenting the obs with the safety obs. Args: obs (np.array): The observation. From d7aa5a3ed8c957a9ce9f73193cdd951f55460d42 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Thu, 22 Dec 2022 22:42:04 +0800 Subject: [PATCH 29/39] docs: update docstrings --- omnisafe/algorithms/off_policy/sddpg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/omnisafe/algorithms/off_policy/sddpg.py b/omnisafe/algorithms/off_policy/sddpg.py index 29b4a7c3e..24cf2dd50 100644 --- a/omnisafe/algorithms/off_policy/sddpg.py +++ b/omnisafe/algorithms/off_policy/sddpg.py @@ -29,7 +29,7 @@ @registry.register class SDDPG(DDPG): # pylint: disable=too-many-instance-attributes,invalid-name - """Implementation of SDDPG Algorithm. + """Implementation of the SDDPG algorithm. References: Title: Lyapunov-based Safe Policy Optimization for Continuous Control From 83733f02c74a9b3fb15a640068c15a3ad087a547 Mon Sep 17 00:00:00 2001 From: ruiyang sun Date: Thu, 22 Dec 2022 23:33:12 +0800 Subject: [PATCH 30/39] fix(model): fix compute bugs --- omnisafe/models/actor/gaussian_annealing_actor.py | 7 +++++-- omnisafe/models/actor/gaussian_learning_actor.py | 6 +++--- omnisafe/models/actor/gaussian_stdnet_actor.py | 2 +- omnisafe/models/actor_critic.py | 2 +- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/omnisafe/models/actor/gaussian_annealing_actor.py b/omnisafe/models/actor/gaussian_annealing_actor.py index 065abc801..8672de9dd 100644 --- a/omnisafe/models/actor/gaussian_annealing_actor.py +++ b/omnisafe/models/actor/gaussian_annealing_actor.py @@ -76,10 +76,13 @@ def predict(self, obs, deterministic=False, need_log_prob=True): else: out = dist.sample() + action = torch.clamp(out, -1, 1) + action = 0.5 * (action + 1) * (self.act_max - self.act_min) + self.act_min + if need_log_prob: log_prob = dist.log_prob(out).sum(axis=-1) - return out, log_prob - return out + return action, log_prob + return action def forward(self, obs, act=None): dist = self._distribution(obs) diff --git a/omnisafe/models/actor/gaussian_learning_actor.py b/omnisafe/models/actor/gaussian_learning_actor.py index 5cddf42eb..b980a5086 100644 --- a/omnisafe/models/actor/gaussian_learning_actor.py +++ b/omnisafe/models/actor/gaussian_learning_actor.py @@ -72,15 +72,15 @@ def predict(self, obs, deterministic=False, need_log_prob=False): if deterministic: out = dist.mean else: - out = dist.sample() + out = dist.rsample() action = torch.clamp(out, -1, 1) action = self.act_min + (action + 1) * 0.5 * (self.act_max - self.act_min) if need_log_prob: log_prob = dist.log_prob(out).sum(axis=-1) - return out, log_prob - return out + return action, log_prob + return action def forward(self, obs, act=None): dist 
= self._distribution(obs) diff --git a/omnisafe/models/actor/gaussian_stdnet_actor.py b/omnisafe/models/actor/gaussian_stdnet_actor.py index 2c67f2052..01ec10409 100644 --- a/omnisafe/models/actor/gaussian_stdnet_actor.py +++ b/omnisafe/models/actor/gaussian_stdnet_actor.py @@ -90,7 +90,7 @@ def predict(self, obs, deterministic=False, need_log_prob=False): if deterministic: out = dist.mean else: - out = dist.sample() + out = dist.rsample() action = torch.tanh(out) action = self.act_min + (action + 1) * 0.5 * (self.act_max - self.act_min) diff --git a/omnisafe/models/actor_critic.py b/omnisafe/models/actor_critic.py index 5b3df55fa..cbea39711 100644 --- a/omnisafe/models/actor_critic.py +++ b/omnisafe/models/actor_critic.py @@ -131,5 +131,5 @@ def anneal_exploration(self, frac): frac: progress of epochs, i.e. current epoch / total epochs e.g. 10 / 100 = 0.1 """ - if hasattr(self.actor, 'set_log_std'): + if hasattr(self.actor, 'set_std'): self.actor.set_std(1 - frac) From 5054d6bfa3ae3b5aa30dd26cbd67f28118050440 Mon Sep 17 00:00:00 2001 From: Gaiejj <524339208@qq.com> Date: Fri, 23 Dec 2022 00:29:02 +0800 Subject: [PATCH 31/39] refactor: reframe algorithms --- examples/train_policy.py | 2 +- omnisafe/algorithms/off_policy/ddpg.py | 20 +++++++++---------- omnisafe/algorithms/off_policy/ddpg_lag.py | 17 +++++++--------- omnisafe/algorithms/off_policy/sac.py | 14 ++++++------- omnisafe/algorithms/off_policy/sac_lag.py | 13 +++++------- omnisafe/algorithms/off_policy/sddpg.py | 14 +++++-------- omnisafe/algorithms/off_policy/td3.py | 13 +++++------- omnisafe/algorithms/off_policy/td3_lag.py | 16 ++++++--------- .../algorithms/on_policy/base/natural_pg.py | 11 ++++------ .../on_policy/base/policy_gradient.py | 16 +++++++-------- omnisafe/algorithms/on_policy/base/ppo.py | 12 ++++------- omnisafe/algorithms/on_policy/base/trpo.py | 12 ++++------- .../early_terminated/ppo_early_terminated.py | 16 ++++++--------- .../ppo_lag_early_terminated.py | 16 ++++++--------- .../on_policy/first_order/__init__.py | 2 +- .../algorithms/on_policy/first_order/cup.py | 15 ++++++-------- .../on_policy/first_order/focops.py | 12 +++++------ .../on_policy/naive_lagrange/npg_lag.py | 12 +++++------ .../on_policy/naive_lagrange/pdo.py | 10 ++++------ .../on_policy/naive_lagrange/ppo_lag.py | 18 ++++++++--------- .../on_policy/naive_lagrange/trpo_lag.py | 15 ++++++-------- .../on_policy/pid_lagrange/cppo_pid.py | 15 ++++++-------- .../on_policy/pid_lagrange/trpo_pid.py | 16 +++++++-------- .../on_policy/saute/ppo_lag_saute.py | 18 +++++++---------- .../algorithms/on_policy/saute/ppo_saute.py | 16 ++++++--------- .../algorithms/on_policy/second_order/cpo.py | 13 +++++------- .../algorithms/on_policy/second_order/pcpo.py | 11 ++++------ .../on_policy/simmer/ppo_lag_simmer_pid.py | 19 ++++++++---------- .../on_policy/simmer/ppo_lag_simmer_q.py | 17 +++++++--------- .../on_policy/simmer/ppo_simmer_pid.py | 18 +++++++---------- .../on_policy/simmer/ppo_simmer_q.py | 16 ++++++--------- omnisafe/algorithms/registry.py | 6 +++--- omnisafe/configs/off-policy/DDPG.yaml | 2 ++ omnisafe/configs/off-policy/DDPGLag.yaml | 2 ++ omnisafe/configs/off-policy/SAC.yaml | 2 ++ omnisafe/configs/off-policy/SACLag.yaml | 2 ++ omnisafe/configs/off-policy/SDDPG.yaml | 2 ++ omnisafe/configs/off-policy/TD3.yaml | 2 ++ omnisafe/configs/off-policy/TD3Lag.yaml | 2 ++ omnisafe/configs/on-policy/CPO.yaml | 2 ++ omnisafe/configs/on-policy/CPPOPid.yaml | 2 ++ omnisafe/configs/on-policy/CUP.yaml | 2 ++ omnisafe/configs/on-policy/FOCOPS.yaml | 2 ++ 
omnisafe/configs/on-policy/NPGLag.yaml | 2 ++ omnisafe/configs/on-policy/NaturalPG.yaml | 2 ++ omnisafe/configs/on-policy/PCPO.yaml | 2 ++ omnisafe/configs/on-policy/PDO.yaml | 2 ++ omnisafe/configs/on-policy/PPO.yaml | 2 ++ .../configs/on-policy/PPOEarlyTerminated.yaml | 2 ++ omnisafe/configs/on-policy/PPOLag.yaml | 2 ++ .../on-policy/PPOLagEarlyTerminated.yaml | 2 ++ omnisafe/configs/on-policy/PPOLagSaute.yaml | 2 ++ .../configs/on-policy/PPOLagSimmerPid.yaml | 2 ++ omnisafe/configs/on-policy/PPOLagSimmerQ.yaml | 2 ++ omnisafe/configs/on-policy/PPOSaute.yaml | 2 ++ omnisafe/configs/on-policy/PPOSimmerPid.yaml | 2 ++ omnisafe/configs/on-policy/PPOSimmerQ.yaml | 2 ++ .../configs/on-policy/PolicyGradient.yaml | 2 ++ omnisafe/configs/on-policy/TRPO.yaml | 2 ++ omnisafe/configs/on-policy/TRPOLag.yaml | 2 ++ omnisafe/configs/on-policy/TRPOPid.yaml | 2 ++ .../models/actor/gaussian_annealing_actor.py | 7 ++----- .../models/actor/gaussian_learning_actor.py | 6 +++--- .../models/actor/gaussian_stdnet_actor.py | 2 +- omnisafe/wrappers/off_policy_wrapper.py | 2 +- omnisafe/wrappers/simmer_wrapper.py | 2 +- 66 files changed, 244 insertions(+), 274 deletions(-) diff --git a/examples/train_policy.py b/examples/train_policy.py index cee365da1..ac352a9da 100644 --- a/examples/train_policy.py +++ b/examples/train_policy.py @@ -25,7 +25,7 @@ '--algo', type=str, metavar='ALGO', - default='PPOLag', + default='PPOLagEarlyTerminated', help='Algorithm to train', choices=omnisafe.ALGORITHMS['all'], ) diff --git a/omnisafe/algorithms/off_policy/ddpg.py b/omnisafe/algorithms/off_policy/ddpg.py index 774563d99..802768f22 100644 --- a/omnisafe/algorithms/off_policy/ddpg.py +++ b/omnisafe/algorithms/off_policy/ddpg.py @@ -31,21 +31,20 @@ @registry.register class DDPG: # pylint: disable=too-many-instance-attributes - """The Deep Deterministic Policy Gradient (DDPG) algorithm. + """Continuous control with deep reinforcement learning (DDPG) Algorithm. References: - Title: Continuous control with deep reinforcement learning - Authors: Timothy P. Lillicrap, Jonathan J. Hunt, Alexander Pritzel, Nicolas Heess, Tom Erez, - Yuval Tassa, David Silver, Daan Wierstra. - URL: https://arxiv.org/abs/1509.02971 + Paper Name: Continuous control with deep reinforcement learning. + Paper author: Timothy P. Lillicrap, Jonathan J. Hunt, Alexander Pritzel, Nicolas Heess, + Tom Erez, Yuval Tassa, David Silver, Daan Wierstra. + Paper URL: https://arxiv.org/abs/1509.02971 + """ def __init__( self, env_id: str, cfgs=None, - algo: str = 'DDPG', - wrapper_type: str = 'OffPolicyEnvWrapper', ): """Initialize DDPG. @@ -55,14 +54,15 @@ def __init__( algo (str): Algorithm name. wrapper_type (str): Wrapper type. """ - self.env = wrapper_registry.get(wrapper_type)( + self.cfgs = deepcopy(cfgs) + self.wrapper_type = self.cfgs.wrapper_type + self.env = wrapper_registry.get(self.wrapper_type)( env_id, use_cost=cfgs.use_cost, max_ep_len=cfgs.max_ep_len, ) self.env_id = env_id - self.algo = algo - self.cfgs = deepcopy(cfgs) + self.algo = self.__class__.__name__ # Set up for learning and rolling out schedule self.steps_per_epoch = cfgs.steps_per_epoch diff --git a/omnisafe/algorithms/off_policy/ddpg_lag.py b/omnisafe/algorithms/off_policy/ddpg_lag.py index 01a9ea301..ad4a84c0b 100644 --- a/omnisafe/algorithms/off_policy/ddpg_lag.py +++ b/omnisafe/algorithms/off_policy/ddpg_lag.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -"""Implementation of the Lagrange version of the DDPG algorithm.""" +"""Implementation of the DDPGLag algorithm.""" from omnisafe.algorithms import registry from omnisafe.algorithms.off_policy.ddpg import DDPG @@ -21,29 +21,26 @@ @registry.register class DDPGLag(DDPG, Lagrange): # pylint: disable=too-many-instance-attributes - """The Lagrange version of the DDPG Algorithm. + """The Lagrange version of DDPG Algorithm. References: - Title: Continuous control with deep reinforcement learning - Authors: Timothy P. Lillicrap, Jonathan J. Hunt, Alexander Pritzel, Nicolas Heess, Tom Erez, - Yuval Tassa, David Silver, Daan Wierstra. - URL: https://arxiv.org/abs/1509.02971 + Paper Name: Continuous control with deep reinforcement learning. + Paper author: Timothy P. Lillicrap, Jonathan J. Hunt, Alexander Pritzel, Nicolas Heess, + Tom Erez, Yuval Tassa, David Silver, Daan Wierstra. + Paper URL: https://arxiv.org/abs/1509.02971 + """ def __init__( self, env_id: str, cfgs=None, - algo: str = 'DDPG-Lag', - wrapper_type: str = 'OffPolicyEnvWrapper', ): """Initialize DDPG.""" DDPG.__init__( self, env_id=env_id, cfgs=cfgs, - algo=algo, - wrapper_type=wrapper_type, ) Lagrange.__init__( diff --git a/omnisafe/algorithms/off_policy/sac.py b/omnisafe/algorithms/off_policy/sac.py index 30c6f8d49..d8070e346 100644 --- a/omnisafe/algorithms/off_policy/sac.py +++ b/omnisafe/algorithms/off_policy/sac.py @@ -14,6 +14,7 @@ # ============================================================================== """Implementation of the SAC algorithm.""" + import torch from omnisafe.algorithms import registry @@ -22,27 +23,24 @@ @registry.register class SAC(DDPG): # pylint: disable=too-many-instance-attributes - """The Soft Actor-Critic (SAC) algorithm. + """Implementation of the SAC algorithm. References: - Title: Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor - Authors: Tuomas Haarnoja, Aurick Zhou, Pieter Abbeel, Sergey Levine. - URL: https://arxiv.org/abs/1801.01290 + Paper Name: Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor + Paper author: Tuomas Haarnoja, Aurick Zhou, Pieter Abbeel, Sergey Levine + Paper URL: https://arxiv.org/abs/1801.01290 + """ def __init__( self, env_id: str, cfgs=None, - algo: str = 'SAC', - wrapper_type: str = 'OffPolicyEnvWrapper', ): """Initialize SAC.""" super().__init__( env_id=env_id, cfgs=cfgs, - algo=algo, - wrapper_type=wrapper_type, ) self.alpha = cfgs.alpha self.alpha_gamma = cfgs.alpha_gamma diff --git a/omnisafe/algorithms/off_policy/sac_lag.py b/omnisafe/algorithms/off_policy/sac_lag.py index ce2d57a90..7dc2472ad 100644 --- a/omnisafe/algorithms/off_policy/sac_lag.py +++ b/omnisafe/algorithms/off_policy/sac_lag.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of the Lagrange version of the SAC algorithm.""" +"""Implementation of the SACLag algorithm.""" import torch @@ -26,17 +26,16 @@ class SACLag(SAC, Lagrange): # pylint: disable=too-many-instance-attributes """The Lagrange version of SAC algorithm. References: - Title: Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor - Authors: Tuomas Haarnoja, Aurick Zhou, Pieter Abbeel, Sergey Levine. 
- URL: https://arxiv.org/abs/1801.01290 + Paper Name: Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor + Paper author: Tuomas Haarnoja, Aurick Zhou, Pieter Abbeel, Sergey Levine + Paper URL: https://arxiv.org/abs/1801.01290 + """ def __init__( self, env_id: str, cfgs=None, - algo: str = 'SAC-Lag', - wrapper_type: str = 'OffPolicyEnvWrapper', ): """Initialize SACLag. @@ -50,8 +49,6 @@ def __init__( self, env_id=env_id, cfgs=cfgs, - algo=algo, - wrapper_type=wrapper_type, ) Lagrange.__init__( diff --git a/omnisafe/algorithms/off_policy/sddpg.py b/omnisafe/algorithms/off_policy/sddpg.py index 24cf2dd50..d49e0734d 100644 --- a/omnisafe/algorithms/off_policy/sddpg.py +++ b/omnisafe/algorithms/off_policy/sddpg.py @@ -29,21 +29,19 @@ @registry.register class SDDPG(DDPG): # pylint: disable=too-many-instance-attributes,invalid-name - """Implementation of the SDDPG algorithm. + """Implementation of SDDPG Algorithm. References: - Title: Lyapunov-based Safe Policy Optimization for Continuous Control - Authors: Yinlam Chow, Ofir Nachum, Aleksandra Faust, Edgar Duenez-Guzman, - Mohammad Ghavamzadeh. - URL: https://arxiv.org/abs/1901.10031 + Paper Name: Lyapunov-based Safe Policy Optimization for Continuous Control. + Paper author: Yinlam Chow, Ofir Nachum, Aleksandra Faust, Edgar Duenez-Guzman, Mohammad Ghavamzadeh. + Paper URL: https://arxiv.org/abs/1901.10031 + """ def __init__( self, env_id: str, cfgs=None, - algo: str = 'SDDPG', - wrapper_type: str = 'OffPolicyEnvWrapper', ): """Initialize SDDPG. @@ -56,8 +54,6 @@ def __init__( super().__init__( env_id=env_id, cfgs=cfgs, - algo=algo, - wrapper_type=wrapper_type, ) self.beta = cfgs.beta self.cg_damping = cfgs.cg_damping diff --git a/omnisafe/algorithms/off_policy/td3.py b/omnisafe/algorithms/off_policy/td3.py index cd4838937..ec7607e70 100644 --- a/omnisafe/algorithms/off_policy/td3.py +++ b/omnisafe/algorithms/off_policy/td3.py @@ -22,27 +22,24 @@ @registry.register class TD3(DDPG): # pylint: disable=too-many-instance-attributes - """The Twin Delayed DDPG (TD3) algorithm. + """Implementation of TD3 Algorithm. References: - Title: Addressing Function Approximation Error in Actor-Critic Methods - Authors: Scott Fujimoto, Herke van Hoof, David Meger. - URL: https://arxiv.org/abs/1802.09477 + Paper Name: Addressing Function Approximation Error in Actor-Critic Methods. + Paper author: Scott Fujimoto, Herke van Hoof, David Meger. + Paper URL: https://arxiv.org/abs/1802.09477 + """ def __init__( self, env_id: str, cfgs=None, - algo: str = 'TD3', - wrapper_type: str = 'OffPolicyEnvWrapper', ): """Initialize DDPG.""" super().__init__( env_id=env_id, cfgs=cfgs, - algo=algo, - wrapper_type=wrapper_type, ) def compute_loss_v(self, data): diff --git a/omnisafe/algorithms/off_policy/td3_lag.py b/omnisafe/algorithms/off_policy/td3_lag.py index 1620eb81b..077a7d2cc 100644 --- a/omnisafe/algorithms/off_policy/td3_lag.py +++ b/omnisafe/algorithms/off_policy/td3_lag.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -"""Implementation of the Lagrange version of the TD3 algorithm.""" +"""Implementation of the TD3Lag algorithm.""" from omnisafe.algorithms import registry from omnisafe.algorithms.off_policy.td3 import TD3 @@ -21,20 +21,19 @@ @registry.register class TD3Lag(TD3, Lagrange): # pylint: disable=too-many-instance-attributes - """The Lagrange version of the TD3 algorithm + """The Lagrange version of TD3 Algorithm. References: - Title: Addressing Function Approximation Error in Actor-Critic Methods - Authors: Scott Fujimoto, Herke van Hoof, David Meger. - URL: https://arxiv.org/abs/1802.09477 + Paper Name: Addressing Function Approximation Error in Actor-Critic Methods. + Paper author: Scott Fujimoto, Herke van Hoof, David Meger. + Paper URL: https://arxiv.org/abs/1802.09477 + """ def __init__( self, env_id: str, cfgs=None, - algo: str = 'TD3-Lag', - wrapper_type: str = 'OffPolicyEnvWrapper', ): """Initialize TD3. @@ -48,10 +47,7 @@ def __init__( self, env_id=env_id, cfgs=cfgs, - algo=algo, - wrapper_type=wrapper_type, ) - Lagrange.__init__( self, cost_limit=self.cfgs.lagrange_cfgs.cost_limit, diff --git a/omnisafe/algorithms/on_policy/base/natural_pg.py b/omnisafe/algorithms/on_policy/base/natural_pg.py index 7d7f02035..71ed950ed 100644 --- a/omnisafe/algorithms/on_policy/base/natural_pg.py +++ b/omnisafe/algorithms/on_policy/base/natural_pg.py @@ -32,23 +32,20 @@ class NaturalPG(PolicyGradient): """The Natural Policy Gradient algorithm. References: - Title: A Natural Policy Gradient - Author: Sham Kakade. - URL: https://proceedings.neurips.cc/paper/2001/file/4b86abe48d358ecf194c56c69108433e-Paper.pdf + Paper Name: A Natural Policy Gradient. + Paper author: Sham Kakade. + Paper URL: https://proceedings.neurips.cc/paper/2001/file/4b86abe48d358ecf194c56c69108433e-Paper.pdf + """ def __init__( self, env_id, cfgs, - algo: str = 'NaturalPolicyGradient', - wrapper_type: str = 'OnPolicyEnvWrapper', ): super().__init__( env_id=env_id, cfgs=cfgs, - algo=algo, - wrapper_type=wrapper_type, ) self.cg_damping = cfgs.cg_damping self.cg_iters = cfgs.cg_iters diff --git a/omnisafe/algorithms/on_policy/base/policy_gradient.py b/omnisafe/algorithms/on_policy/base/policy_gradient.py index b9367a618..a8dca9f0e 100644 --- a/omnisafe/algorithms/on_policy/base/policy_gradient.py +++ b/omnisafe/algorithms/on_policy/base/policy_gradient.py @@ -34,9 +34,10 @@ class PolicyGradient: # pylint: disable=too-many-instance-attributes """The Policy Gradient algorithm. References: - Title: Policy Gradient Methods for Reinforcement Learning with Function Approximation - Authors: Richard S. Sutton, David McAllester, Satinder Singh, Yishay Mansour. - URL: https://proceedings.neurips.cc/paper/1999/file/464d828b85b0bed98e80ade0a5c43b0f-Paper.pdf + Paper Name: Policy Gradient Methods for Reinforcement Learning with Function Approximation + Paper Author: Richard S. Sutton, David McAllester, Satinder Singh, Yishay Mansour + Paper URL: https://proceedings.neurips.cc/paper/1999/file/464d828b85b0bed98e80ade0a5c43b0f-Paper.pdf + """ # pylint: disable-next=too-many-locals @@ -44,10 +45,8 @@ def __init__( self, env_id, cfgs=None, - algo: str = 'PolicyGradient', - wrapper_type: str = 'OnPolicyEnvWrapper', ) -> None: - r"""Initialize the algorithm. + """Initialize the algorithm. Args: env: The environment. @@ -56,9 +55,10 @@ def __init__( cfgs: (default: :const:`None`) This is a dictionary of the algorithm hyper-parameters. 
""" - self.algo = algo + self.algo = self.__class__.__name__ self.cfgs = deepcopy(cfgs) - self.env = wrapper_registry.get(wrapper_type)( + self.wrapper_type = self.cfgs.wrapper_type + self.env = wrapper_registry.get(self.wrapper_type)( env_id, cfgs=self.cfgs._asdict().get('env_cfgs') ) diff --git a/omnisafe/algorithms/on_policy/base/ppo.py b/omnisafe/algorithms/on_policy/base/ppo.py index 78e42b5b0..bf3a1c897 100644 --- a/omnisafe/algorithms/on_policy/base/ppo.py +++ b/omnisafe/algorithms/on_policy/base/ppo.py @@ -23,12 +23,12 @@ @registry.register class PPO(PolicyGradient): - """The Proximal Policy Optimization (PPO) algorithm. + """The Proximal Policy Optimization Algorithms (PPO) Algorithm. References: - Title: Proximal Policy Optimization Algorithms - Authors: John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, Oleg Klimov. - URL: https://arxiv.org/abs/1707.06347 + Paper Name: Proximal Policy Optimization Algorithms. + Paper author: John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, Oleg Klimov. + Paper URL: https://arxiv.org/pdf/1707.06347.pdf """ # pylint: disable-next=too-many-arguments @@ -36,16 +36,12 @@ def __init__( self, env_id, cfgs, - algo='ppo', - wrapper_type: str = 'OnPolicyEnvWrapper', ): """Initialize PPO.""" self.clip = cfgs.clip super().__init__( env_id=env_id, cfgs=cfgs, - algo=algo, - wrapper_type=wrapper_type, ) def compute_loss_pi(self, data: dict): diff --git a/omnisafe/algorithms/on_policy/base/trpo.py b/omnisafe/algorithms/on_policy/base/trpo.py index 9e12ef6c7..4eee0dea4 100644 --- a/omnisafe/algorithms/on_policy/base/trpo.py +++ b/omnisafe/algorithms/on_policy/base/trpo.py @@ -29,26 +29,22 @@ @registry.register class TRPO(NaturalPG): - """The Trust Region Policy Optimization (TRPO) algorithm. + """The Trust Region Policy Optimization (TRPO) Algorithm. References: - Title: Trust Region Policy Optimization - Authors: John Schulman, Sergey Levine, Philipp Moritz, Michael I. Jordan, Pieter Abbeel. - URL: https://arxiv.org/abs/1502.05477 + Paper Name: Trust Region Policy Optimization. + Paper author: John Schulman, Sergey Levine, Philipp Moritz, Michael I. Jordan, Pieter Abbeel. + Paper URL: https://arxiv.org/abs/1502.05477 """ def __init__( self, env_id, cfgs, - algo='TRPO', - wrapper_type: str = 'OnPolicyEnvWrapper', ): super().__init__( env_id=env_id, cfgs=cfgs, - algo=algo, - wrapper_type=wrapper_type, ) # pylint: disable-next=too-many-arguments,too-many-locals,arguments-differ diff --git a/omnisafe/algorithms/on_policy/early_terminated/ppo_early_terminated.py b/omnisafe/algorithms/on_policy/early_terminated/ppo_early_terminated.py index 5debd33e4..47f9e4629 100644 --- a/omnisafe/algorithms/on_policy/early_terminated/ppo_early_terminated.py +++ b/omnisafe/algorithms/on_policy/early_terminated/ppo_early_terminated.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of the early terminated algorithm using PPO.""" +"""Implementation of the Early terminated algorithm by PPO.""" from omnisafe.algorithms import registry from omnisafe.algorithms.on_policy.base.ppo import PPO @@ -20,12 +20,12 @@ @registry.register class PPOEarlyTerminated(PPO): - r"""The early terminated algorithm implemented with PPO. + """Early terminated algorithm implemented by PPO. 
References: - Title: Safe Exploration by Solving Early Terminated MDP - Authors: Hao Sun, Ziping Xu, Meng Fang, Zhenghao Peng, Jiadong Guo, Bo Dai, Bolei Zhou. - URL: https://arxiv.org/abs/2107.04200 + Paper Name: Safe Exploration by Solving Early Terminated MDP + Paper author: Hao Sun, Ziping Xu, Meng Fang, Zhenghao Peng, Jiadong Guo, Bo Dai, Bolei Zhou + Paper URL: https://arxiv.org/abs/2107.04200 """ # pylint: disable-next=too-many-arguments @@ -33,13 +33,9 @@ def __init__( self, env_id, cfgs, - algo='ppo_early_terminated', - wrapper_type: str = 'EarlyTerminatedEnvWrapper', ) -> None: - r"""Initialize PPO_Earyly_Terminated.""" + """Initialize PPO_Earyly_Terminated.""" super().__init__( env_id=env_id, cfgs=cfgs, - algo=algo, - wrapper_type=wrapper_type, ) diff --git a/omnisafe/algorithms/on_policy/early_terminated/ppo_lag_early_terminated.py b/omnisafe/algorithms/on_policy/early_terminated/ppo_lag_early_terminated.py index 19b5be987..d115837fb 100644 --- a/omnisafe/algorithms/on_policy/early_terminated/ppo_lag_early_terminated.py +++ b/omnisafe/algorithms/on_policy/early_terminated/ppo_lag_early_terminated.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of the Lagrange version of the early terminated algorithm using PPOLag.""" +"""Implementation of the Early terminated algorithm by PPOLag.""" from omnisafe.algorithms import registry from omnisafe.algorithms.on_policy.naive_lagrange.ppo_lag import PPOLag @@ -20,12 +20,12 @@ @registry.register class PPOLagEarlyTerminated(PPOLag): - r"""The Lagrange version of the early terminated algorithm implemented with PPOLag. + """Early terminated algorithm implemented by PPOLag. References: - Title: Safe Exploration by Solving Early Terminated MDP - Authors: Hao Sun, Ziping Xu, Meng Fang, Zhenghao Peng, Jiadong Guo, Bo Dai, Bolei Zhou. - URL: https://arxiv.org/abs/2107.04200 + Paper Name: Safe Exploration by Solving Early Terminated MDP + Paper author: Hao Sun, Ziping Xu, Meng Fang, Zhenghao Peng, Jiadong Guo, Bo Dai, Bolei Zhou + Paper URL: https://arxiv.org/abs/2107.04200 """ # pylint: disable-next=too-many-arguments @@ -33,13 +33,9 @@ def __init__( self, env_id, cfgs, - algo='ppo_lag_early_terminated', - wrapper_type: str = 'EarlyTerminatedEnvWrapper', ) -> None: - r"""Initialize PPO_Lag_Earyly_Terminated.""" + """Initialize PPO_Lag_Earyly_Terminated.""" super().__init__( env_id=env_id, cfgs=cfgs, - algo=algo, - wrapper_type=wrapper_type, ) diff --git a/omnisafe/algorithms/on_policy/first_order/__init__.py b/omnisafe/algorithms/on_policy/first_order/__init__.py index 3c0bde0ea..635735694 100644 --- a/omnisafe/algorithms/on_policy/first_order/__init__.py +++ b/omnisafe/algorithms/on_policy/first_order/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -"""First-order algorithms.""" +"""FIrst-order algorithms.""" from omnisafe.algorithms.on_policy.first_order.cup import CUP from omnisafe.algorithms.on_policy.first_order.focops import FOCOPS diff --git a/omnisafe/algorithms/on_policy/first_order/cup.py b/omnisafe/algorithms/on_policy/first_order/cup.py index 48bd6324e..aaa494fe9 100644 --- a/omnisafe/algorithms/on_policy/first_order/cup.py +++ b/omnisafe/algorithms/on_policy/first_order/cup.py @@ -24,21 +24,19 @@ @registry.register class CUP(PolicyGradient, Lagrange): - """The Constrained Update Projection (CUP) Approach to Safe Policy Optimization. + """The Constrained Update Projection Approach to Safe Policy Optimization. References: - Title: Constrained Update Projection Approach to Safe Policy Optimization - Authors: Long Yang, Jiaming Ji, Juntao Dai, Linrui Zhang, Binbin Zhou, Pengfei Li, - Yaodong Yang, Gang Pan. - URL: https://arxiv.org/abs/2209.07089 + Paper Name: Constrained Update Projection Approach to Safe Policy Optimization. + Paper author: Long Yang, Jiaming Ji, Juntao Dai, Linrui Zhang, Binbin Zhou, Pengfei Li, Yaodong Yang, Gang Pan. + Paper URL: https://arxiv.org/abs/2209.07089 + """ def __init__( self, env_id, cfgs, - algo='CUP', - wrapper_type: str = 'OnPolicyEnvWrapper', ): r"""The :meth:`init` function.""" @@ -46,8 +44,6 @@ def __init__( self, env_id=env_id, cfgs=cfgs, - algo=algo, - wrapper_type=wrapper_type, ) Lagrange.__init__( @@ -58,6 +54,7 @@ def __init__( lambda_optimizer=self.cfgs.lagrange_cfgs.lambda_optimizer, lagrangian_upper_bound=self.cfgs.lagrange_cfgs.lagrangian_upper_bound, ) + self.algo = self.__class__.__name__ self.lam = self.cfgs.lam self.eta = self.cfgs.eta self.clip = self.cfgs.clip diff --git a/omnisafe/algorithms/on_policy/first_order/focops.py b/omnisafe/algorithms/on_policy/first_order/focops.py index 3f2533731..2fd6fe73d 100644 --- a/omnisafe/algorithms/on_policy/first_order/focops.py +++ b/omnisafe/algorithms/on_policy/first_order/focops.py @@ -27,17 +27,16 @@ class FOCOPS(PolicyGradient, Lagrange): """The First Order Constrained Optimization in Policy Space (FOCOPS) algorithm. References: - Title: First Order Constrained Optimization in Policy Space - Authors: Yiming Zhang, Quan Vuong, Keith W. Ross. - URL: https://arxiv.org/abs/2002.06506 + Paper Name: First Order Constrained Optimization in Policy Space. + Paper author: Yiming Zhang, Quan Vuong, Keith W. Ross. + Paper URL: https://arxiv.org/abs/2002.06506 + """ def __init__( self, env_id, cfgs, - algo='FOCOPS', - wrapper_type: str = 'OnPolicyEnvWrapper', ): r"""The :meth:`init` function.""" @@ -45,8 +44,6 @@ def __init__( self, env_id=env_id, cfgs=cfgs, - algo=algo, - wrapper_type=wrapper_type, ) Lagrange.__init__( @@ -57,6 +54,7 @@ def __init__( lambda_optimizer=self.cfgs.lagrange_cfgs.lambda_optimizer, lagrangian_upper_bound=self.cfgs.lagrange_cfgs.lagrangian_upper_bound, ) + self.algo = self.__class__.__name__ self.lam = self.cfgs.lam self.eta = self.cfgs.eta diff --git a/omnisafe/algorithms/on_policy/naive_lagrange/npg_lag.py b/omnisafe/algorithms/on_policy/naive_lagrange/npg_lag.py index 88acacc04..d4c2dbce3 100644 --- a/omnisafe/algorithms/on_policy/naive_lagrange/npg_lag.py +++ b/omnisafe/algorithms/on_policy/naive_lagrange/npg_lag.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -"""Implementation of the Lagrange version of the Natural Policy Gradient algorithm.""" +"""Implementation of the Lagrange version of Natural Policy Gradient algorithm.""" import torch @@ -23,17 +23,16 @@ @registry.register class NPGLag(NaturalPG, Lagrange): - """The Lagrange version of the Natural Policy Gradient algorithm. + """The Lagrange version of Natural Policy Gradient algorithm. + + A simple combination of Lagrange method and Natural Policy Gradient algorithm. - A simple combination of the Lagrange method and the Natural Policy Gradient algorithm. """ def __init__( self, env_id, cfgs, - algo: str = 'NPG-Lag', - wrapper_type: str = 'OnPolicyEnvWrapper', ): """initialize""" @@ -41,8 +40,6 @@ def __init__( self, env_id=env_id, cfgs=cfgs, - algo=algo, - wrapper_type=wrapper_type, ) Lagrange.__init__( self, @@ -51,6 +48,7 @@ def __init__( lambda_lr=self.cfgs.lagrange_cfgs.lambda_lr, lambda_optimizer=self.cfgs.lagrange_cfgs.lambda_optimizer, ) + self.algo = self.__class__.__name__ def compute_loss_pi(self, data: dict): """ diff --git a/omnisafe/algorithms/on_policy/naive_lagrange/pdo.py b/omnisafe/algorithms/on_policy/naive_lagrange/pdo.py index bebfc5a25..411efe1f1 100644 --- a/omnisafe/algorithms/on_policy/naive_lagrange/pdo.py +++ b/omnisafe/algorithms/on_policy/naive_lagrange/pdo.py @@ -23,25 +23,22 @@ @registry.register class PDO(PolicyGradient, Lagrange): - """The Lagrange version of the Policy Gradient algorithm. + """The Lagrange version of Policy Gradient algorithm. + + A simple combination of Lagrange method and Policy Gradient algorithm. - A simple combination of the Lagrange method and the Policy Gradient algorithm. """ def __init__( self, env_id, cfgs, - algo='PDO', - wrapper_type: str = 'OnPolicyEnvWrapper', ): """initialization""" PolicyGradient.__init__( self, env_id=env_id, cfgs=cfgs, - algo=algo, - wrapper_type=wrapper_type, ) Lagrange.__init__( self, @@ -50,6 +47,7 @@ def __init__( lambda_lr=cfgs.lagrange_cfgs.lambda_lr, lambda_optimizer=cfgs.lagrange_cfgs.lambda_optimizer, ) + self.algo = self.__class__.__name__ def compute_loss_pi(self, data: dict): """ diff --git a/omnisafe/algorithms/on_policy/naive_lagrange/ppo_lag.py b/omnisafe/algorithms/on_policy/naive_lagrange/ppo_lag.py index b82045edb..a8c3e732b 100644 --- a/omnisafe/algorithms/on_policy/naive_lagrange/ppo_lag.py +++ b/omnisafe/algorithms/on_policy/naive_lagrange/ppo_lag.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of the Lagrange version of the PPO algorithm.""" +"""Implementation of the Lagrange version of PPO algorithm.""" import torch @@ -23,12 +23,13 @@ @registry.register class PPOLag(PolicyGradient, Lagrange): - """The Lagrange version of the PPO algorithm. + """The Lagrange version of PPO algorithm. References: - Title: Benchmarking Safe Exploration in Deep Reinforcement Learning - Authors: Alex Ray, Joshua Achiam, Dario Amodei. - URL: https://cdn.openai.com/safexp-short.pdf + Paper Name: Benchmarking Safe Exploration in Deep Reinforcement Learning. 
+ Paper author: Alex Ray, Joshua Achiam, Dario Amodei + Paper URL: https://cdn.openai.com/safexp-short.pdf + """ # pylint: disable-next=too-many-arguments @@ -36,17 +37,12 @@ def __init__( self, env_id, cfgs, - algo='PPO-Lag', - wrapper_type: str = 'OnPolicyEnvWrapper', ): """Initialize PPO-Lag algorithm.""" - self.clip = cfgs.clip PolicyGradient.__init__( self, env_id=env_id, cfgs=cfgs, - algo=algo, - wrapper_type=wrapper_type, ) Lagrange.__init__( self, @@ -55,6 +51,8 @@ def __init__( lambda_lr=self.cfgs.lagrange_cfgs.lambda_lr, lambda_optimizer=self.cfgs.lagrange_cfgs.lambda_optimizer, ) + self.algo = self.__class__.__name__ + self.clip = cfgs.clip def algorithm_specific_logs(self): super().algorithm_specific_logs() diff --git a/omnisafe/algorithms/on_policy/naive_lagrange/trpo_lag.py b/omnisafe/algorithms/on_policy/naive_lagrange/trpo_lag.py index 6733ce40f..be037e1a9 100644 --- a/omnisafe/algorithms/on_policy/naive_lagrange/trpo_lag.py +++ b/omnisafe/algorithms/on_policy/naive_lagrange/trpo_lag.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of the Lagrange version of the TRPO algorithm.""" +"""Implementation of the Lagrange version of TRPO algorithm.""" import torch @@ -23,12 +23,12 @@ @registry.register class TRPOLag(TRPO, Lagrange): - """The Lagrange version of the TRPO algorithm. + """The Lagrange version of TRPO algorithm. References: - Title: Benchmarking Safe Exploration in Deep Reinforcement Learning - Authors: Alex Ray, Joshua Achiam, Dario Amodei. - URL: https://cdn.openai.com/safexp-short.pdf + Paper Name: Benchmarking Safe Exploration in Deep Reinforcement Learning. + Paper author: Alex Ray, Joshua Achiam, Dario Amodei + Paper URL: https://cdn.openai.com/safexp-short.pdf """ @@ -36,16 +36,12 @@ def __init__( self, env_id, cfgs, - algo: str = 'TRPO-Lag', - wrapper_type: str = 'OnPolicyEnvWrapper', ): """initialize""" TRPO.__init__( self, env_id=env_id, cfgs=cfgs, - algo=algo, - wrapper_type=wrapper_type, ) Lagrange.__init__( self, @@ -54,6 +50,7 @@ def __init__( lambda_lr=self.cfgs.lagrange_cfgs.lambda_lr, lambda_optimizer=self.cfgs.lagrange_cfgs.lambda_optimizer, ) + self.algo = self.__class__.__name__ def algorithm_specific_logs(self): super().algorithm_specific_logs() diff --git a/omnisafe/algorithms/on_policy/pid_lagrange/cppo_pid.py b/omnisafe/algorithms/on_policy/pid_lagrange/cppo_pid.py index f47bf87db..5fea4b9c7 100644 --- a/omnisafe/algorithms/on_policy/pid_lagrange/cppo_pid.py +++ b/omnisafe/algorithms/on_policy/pid_lagrange/cppo_pid.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of the PID-Lagrange version of the CPPO algorithm.""" +"""Implementation of the CPPO Pid-Lagrange algorithm.""" import torch @@ -23,28 +23,25 @@ @registry.register class CPPOPid(PolicyGradient, PIDLagrangian): - """The PID-Lagrange version of the CPPO algorithm. + """The Responsive Safety in Reinforcement Learning by PID Lagrangian Methods. References: - Title: Responsive Safety in Reinforcement Learning by PID Lagrangian Methods - Authors: Joshua Achiam, David Held, Aviv Tamar, Pieter Abbeel. - URL: https://arxiv.org/abs/2007.03964 + Paper Name: Responsive Safety in Reinforcement Learning by PID Lagrangian Methods. 
+ Paper author: Joshua Achiam, David Held, Aviv Tamar, Pieter Abbeel. + Paper URL: https://arxiv.org/abs/2007.03964 + """ def __init__( self, env_id, cfgs, - algo: str = 'CPPO-PID', - wrapper_type: str = 'OnPolicyEnvWrapper', ): PolicyGradient.__init__( self, env_id=env_id, cfgs=cfgs, - algo=algo, - wrapper_type=wrapper_type, ) PIDLagrangian.__init__(self, **self.cfgs.PID_cfgs._asdict()) diff --git a/omnisafe/algorithms/on_policy/pid_lagrange/trpo_pid.py b/omnisafe/algorithms/on_policy/pid_lagrange/trpo_pid.py index d92a10c7b..ff4f3689c 100644 --- a/omnisafe/algorithms/on_policy/pid_lagrange/trpo_pid.py +++ b/omnisafe/algorithms/on_policy/pid_lagrange/trpo_pid.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of the PID-Lagrange version of the TRPO algorithm.""" +"""Implementation of the TRPO PID-Lagrange algorithm.""" import torch @@ -23,30 +23,28 @@ @registry.register class TRPOPid(TRPO, PIDLagrangian): - """The PID-Lagrange version of the TRPO algorithm. + """The Responsive Safety in Reinforcement Learning by PID Lagrangian Methods. References: - Title: Responsive Safety in Reinforcement Learning by PID Lagrangian Methods - Authors: Joshua Achiam, David Held, Aviv Tamar, Pieter Abbeel. - URL: https://arxiv.org/abs/2007.03964 + Paper Name: Responsive Safety in Reinforcement Learning by PID Lagrangian Methods. + Paper author: Joshua Achiam, David Held, Aviv Tamar, Pieter Abbeel. + Paper URL: https://arxiv.org/abs/2007.03964 + """ def __init__( self, env_id, cfgs, - algo: str = 'TRPO-PID', - wrapper_type: str = 'OnPolicyEnvWrapper', ): TRPO.__init__( self, env_id=env_id, cfgs=cfgs, - algo=algo, - wrapper_type=wrapper_type, ) PIDLagrangian.__init__(self, **self.cfgs.PID_cfgs._asdict()) + self.cost_limit = self.cfgs.cost_limit def algorithm_specific_logs(self): diff --git a/omnisafe/algorithms/on_policy/saute/ppo_lag_saute.py b/omnisafe/algorithms/on_policy/saute/ppo_lag_saute.py index 6021ef5bb..1f5b4efcb 100644 --- a/omnisafe/algorithms/on_policy/saute/ppo_lag_saute.py +++ b/omnisafe/algorithms/on_policy/saute/ppo_lag_saute.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of the Lagrange version of the Saute algorithm using PPOLag.""" +"""Implementation of the Saute algorithm.""" from omnisafe.algorithms import registry from omnisafe.algorithms.on_policy.naive_lagrange.ppo_lag import PPOLag @@ -20,13 +20,13 @@ @registry.register class PPOLagSaute(PPOLag): - r"""The Saute algorithm implemented with PPOLag. + """Saute algorithm implemented by PPOLag. References: - Title: Saute RL: Almost Surely Safe Reinforcement Learning Using State Augmentation - Authors: Aivar Sootla, Alexander I. Cowen-Rivers, Taher Jafferjee, Ziyan Wang, David Mguni, - Jun Wang, Haitham Bou-Ammar. - URL: https://arxiv.org/abs/2202.06558 + Paper Name: Saute RL: Almost Surely Safe Reinforcement Learning Using State Augmentation. + Paper author: Aivar Sootla, Alexander I. Cowen-Rivers, Taher Jafferjee, Ziyan Wang, + David Mguni, Jun Wang, Haitham Bou-Ammar. 
+ Paper URL: https://arxiv.org/abs/2202.06558 """ # pylint: disable-next=too-many-arguments @@ -34,15 +34,11 @@ def __init__( self, env_id, cfgs, - algo='ppo_lag_saute', - wrapper_type: str = 'SauteEnvWrapper', ) -> None: - r"""Initialize PPOLagSaute.""" + """Initialize PPOLagSaute.""" super().__init__( env_id=env_id, cfgs=cfgs, - algo=algo, - wrapper_type=wrapper_type, ) def algorithm_specific_logs(self): diff --git a/omnisafe/algorithms/on_policy/saute/ppo_saute.py b/omnisafe/algorithms/on_policy/saute/ppo_saute.py index be7727a38..c95fdd2fb 100644 --- a/omnisafe/algorithms/on_policy/saute/ppo_saute.py +++ b/omnisafe/algorithms/on_policy/saute/ppo_saute.py @@ -20,13 +20,13 @@ @registry.register class PPOSaute(PPO): - r"""The Saute algorithm implemented with PPO. + r"""Saute algorithm implemented by PPO. References: - Title: Saute RL: Almost Surely Safe Reinforcement Learning Using State Augmentation - Authors: Aivar Sootla, Alexander I. Cowen-Rivers, Taher Jafferjee, Ziyan Wang, David Mguni, - Jun Wang, Haitham Bou-Ammar. - URL: https://arxiv.org/abs/2202.06558 + Paper Name: Saute RL: Almost Surely Safe Reinforcement Learning Using State Augmentation. + Paper author: Aivar Sootla, Alexander I. Cowen-Rivers, Taher Jafferjee, Ziyan Wang, + David Mguni, Jun Wang, Haitham Bou-Ammar. + Paper URL: https://arxiv.org/abs/2202.06558 """ # pylint: disable-next=too-many-arguments @@ -34,17 +34,13 @@ def __init__( self, env_id, cfgs, - algo='ppo_saute', clip=0.2, - wrapper_type: str = 'SauteEnvWrapper', ) -> None: - r"""Initialize PPOSaute.""" + """Initialize PPOSaute.""" self.clip = clip super().__init__( env_id=env_id, cfgs=cfgs, - algo=algo, - wrapper_type=wrapper_type, ) def algorithm_specific_logs(self): diff --git a/omnisafe/algorithms/on_policy/second_order/cpo.py b/omnisafe/algorithms/on_policy/second_order/cpo.py index e590672ff..b5ee65bc5 100644 --- a/omnisafe/algorithms/on_policy/second_order/cpo.py +++ b/omnisafe/algorithms/on_policy/second_order/cpo.py @@ -30,26 +30,23 @@ @registry.register class CPO(TRPO): - """The Constrained Policy Optimization (CPO) algorithm. + """The Constrained Policy Optimization (CPO) Algorithm. References: - Title: Constrained Policy Optimization - Authors: Joshua Achiam, David Held, Aviv Tamar, Pieter Abbeel. - URL: https://arxiv.org/abs/1705.10528 + Paper Name: Constrained Policy Optimization. + Paper author: Joshua Achiam, David Held, Aviv Tamar, Pieter Abbeel. + Paper URL: https://arxiv.org/abs/1705.10528 + """ def __init__( self, env_id, cfgs, - algo='CPO', - wrapper_type: str = 'OnPolicyEnvWrapper', ): super().__init__( env_id=env_id, cfgs=cfgs, - algo=algo, - wrapper_type=wrapper_type, ) self.cost_limit = cfgs.cost_limit self.loss_pi_cost_before = 0.0 diff --git a/omnisafe/algorithms/on_policy/second_order/pcpo.py b/omnisafe/algorithms/on_policy/second_order/pcpo.py index c6bb29048..5785bda3c 100644 --- a/omnisafe/algorithms/on_policy/second_order/pcpo.py +++ b/omnisafe/algorithms/on_policy/second_order/pcpo.py @@ -32,23 +32,20 @@ class PCPO(TRPO): """The Projection-Based Constrained Policy Optimization (PCPO) algorithm. References: - Title: Projection-Based Constrained Policy Optimization - Authors: Tsung-Yen Yang, Justinian Rosca, Karthik Narasimhan, Peter J. Ramadge. - URL: https://arxiv.org/abs/2010.03152 + Paper name: Projection-Based Constrained Policy Optimization. + Paper author: Tsung-Yen Yang, Justinian Rosca, Karthik Narasimhan, Peter J. 
Ramadge + Paper URL: https://arxiv.org/abs/2010.03152 + """ def __init__( self, env_id, cfgs, - algo='PCPO', - wrapper_type: str = 'OnPolicyEnvWrapper', ): super().__init__( env_id=env_id, cfgs=cfgs, - algo=algo, - wrapper_type=wrapper_type, ) self.cost_limit = self.cfgs.cost_limit diff --git a/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_pid.py b/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_pid.py index 32739e28a..f91596562 100644 --- a/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_pid.py +++ b/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_pid.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of the PID version of the Simmer algorithm using PPOLag.""" +"""Implementation of the PID Simmer algorithm by PPOLag.""" + from omnisafe.algorithms import registry from omnisafe.algorithms.on_policy.naive_lagrange.ppo_lag import PPOLag @@ -20,12 +21,12 @@ @registry.register class PPOLagSimmerPid(PPOLag): - r"""The PID version of the Simmer algorithm implemented with PPOLag. + """Simmer algorithm (PID version) implemented by PPOLag. References: - Title: Effects of Safety State Augmentation on Safe Exploration - Authors: Aivar Sootla, Alexander I. Cowen-Rivers, Jun Wang, Haitham Bou Ammar. - URL: https://arxiv.org/abs/2206.02675 + Paper Name: Effects of Safety State Augmentation on Safe Exploration. + Paper author: Aivar Sootla, Alexander I. Cowen-Rivers, Jun Wang, Haitham Bou Ammar. + Paper URL: https://arxiv.org/abs/2206.02675 """ # pylint: disable-next=too-many-arguments @@ -33,19 +34,15 @@ def __init__( self, env_id, cfgs, - algo='ppo_lag_simmer_pid', - wrapper_type: str = 'SimmerEnvWrapper', ): - r"""Initialize PPOLagSimmerPid algorithm.""" + """Initialize PPOLagSimmerPid algorithm.""" super().__init__( env_id=env_id, cfgs=cfgs, - algo=algo, - wrapper_type=wrapper_type, ) def algorithm_specific_logs(self): - r"""Log the algorithm specific metrics.""" + """Log the algorithm specific metrics.""" super().algorithm_specific_logs() self.logger.log_tabular('Metrics/EpBudget') self.logger.log_tabular('Metrics/SafetyBudget') diff --git a/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_q.py b/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_q.py index e09cb0126..1647f1fe9 100644 --- a/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_q.py +++ b/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_q.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of the Q Simmer algorithm using PPOLag.""" +"""Implementation of the Q Simmer algorithm by PPOLag.""" + from omnisafe.algorithms import registry from omnisafe.algorithms.on_policy.naive_lagrange.ppo_lag import PPOLag @@ -20,12 +21,12 @@ @registry.register class PPOLagSimmerQ(PPOLag): - r"""The Q Simmer algorithm implemented with PPOLag. + """Simmer algorithm (Q version) implemented by PPOLag. References: - Title: Effects of Safety State Augmentation on Safe Exploration - Authors: Aivar Sootla, Alexander I. Cowen-Rivers, Jun Wang, Haitham Bou Ammar. - URL: https://arxiv.org/abs/2206.02675 + Paper Name: Effects of Safety State Augmentation on Safe Exploration. + Paper author: Aivar Sootla, Alexander I. Cowen-Rivers, Jun Wang, Haitham Bou Ammar. 
+ Paper URL: https://arxiv.org/abs/2206.02675 """ # pylint: disable-next=too-many-arguments @@ -33,19 +34,15 @@ def __init__( self, env_id, cfgs, - algo='ppo_lag_simmer_q', - wrapper_type: str = 'SimmerEnvWrapper', ): """Initialize PPOLagSimmerQ algorithm.""" super().__init__( env_id=env_id, cfgs=cfgs, - algo=algo, - wrapper_type=wrapper_type, ) def algorithm_specific_logs(self): - r"""Log the algorithm specific metrics.""" + """Log the algorithm specific metrics.""" super().algorithm_specific_logs() self.logger.log_tabular('Metrics/EpBudget') self.logger.log_tabular('Metrics/SafetyBudget') diff --git a/omnisafe/algorithms/on_policy/simmer/ppo_simmer_pid.py b/omnisafe/algorithms/on_policy/simmer/ppo_simmer_pid.py index 11205c504..131cce164 100644 --- a/omnisafe/algorithms/on_policy/simmer/ppo_simmer_pid.py +++ b/omnisafe/algorithms/on_policy/simmer/ppo_simmer_pid.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of the PID version of the Simmer algorithm using PPO.""" +"""Implementation of the PID Simmer algorithm by PPOLag.""" from omnisafe.algorithms import registry from omnisafe.algorithms.on_policy.base.ppo import PPO @@ -20,12 +20,12 @@ @registry.register class PPOSimmerPid(PPO): - r"""The PID version of the Simmer algorithm implemented with PPO. + """Simmer algorithm (PID version) implemented by PPO. References: - Title: Effects of Safety State Augmentation on Safe Exploration - Authors: Aivar Sootla, Alexander I. Cowen-Rivers, Jun Wang, Haitham Bou Ammar. - URL: https://arxiv.org/abs/2206.02675 + Paper Name: Effects of Safety State Augmentation on Safe Exploration. + Paper author: Aivar Sootla, Alexander I. Cowen-Rivers, Jun Wang, Haitham Bou Ammar. + Paper URL: https://arxiv.org/abs/2206.02675 """ # pylint: disable-next=too-many-arguments @@ -33,19 +33,15 @@ def __init__( self, env_id, cfgs, - algo='ppo_simmer_pid', - wrapper_type: str = 'SimmerEnvWrapper', ) -> None: - r"""Initialize PPOSimmerPid.""" + """Initialize PPOSimmerPid.""" super().__init__( env_id=env_id, cfgs=cfgs, - algo=algo, - wrapper_type=wrapper_type, ) def algorithm_specific_logs(self): - r"""Log the algorithm specific metrics.""" + """Log the algorithm specific metrics.""" super().algorithm_specific_logs() self.logger.log_tabular('Metrics/EpBudget') self.logger.log_tabular('Metrics/SafetyBudget') diff --git a/omnisafe/algorithms/on_policy/simmer/ppo_simmer_q.py b/omnisafe/algorithms/on_policy/simmer/ppo_simmer_q.py index 17d86ad3c..f9e9d4b50 100644 --- a/omnisafe/algorithms/on_policy/simmer/ppo_simmer_q.py +++ b/omnisafe/algorithms/on_policy/simmer/ppo_simmer_q.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of the Q Simmer algorithm using PPO.""" +"""Implementation of the Q Simmer algorithm by PPOLag.""" from omnisafe.algorithms import registry from omnisafe.algorithms.on_policy.base.ppo import PPO @@ -20,12 +20,12 @@ @registry.register class PPOSimmerQ(PPO): - r"""The Q Simmer algorithm implemented with PPO. + """Simmer algorithm (Q version) implemented by PPO. References: - Title: Effects of Safety State Augmentation on Safe Exploration - Authors: Aivar Sootla, Alexander I. Cowen-Rivers, Jun Wang, Haitham Bou Ammar. 
- URL: https://arxiv.org/abs/2206.02675 + Paper Name: Effects of Safety State Augmentation on Safe Exploration. + Paper author: Aivar Sootla, Alexander I. Cowen-Rivers, Jun Wang, Haitham Bou Ammar. + Paper URL: https://arxiv.org/abs/2206.02675 """ # pylint: disable-next=too-many-arguments @@ -33,15 +33,11 @@ def __init__( self, env_id, cfgs, - algo='ppo_simmer_q', - wrapper_type: str = 'SimmerEnvWrapper', ) -> None: - r"""Initialize PPOSimmerQ.""" + """Initialize PPOSimmerQ.""" super().__init__( env_id=env_id, cfgs=cfgs, - algo=algo, - wrapper_type=wrapper_type, ) def algorithm_specific_logs(self): diff --git a/omnisafe/algorithms/registry.py b/omnisafe/algorithms/registry.py index 72f39dc19..464307373 100644 --- a/omnisafe/algorithms/registry.py +++ b/omnisafe/algorithms/registry.py @@ -19,7 +19,6 @@ class Registry: """A registry to map strings to classes. - Args: name (str): Registry name. """ @@ -29,9 +28,10 @@ def __init__(self, name): self._module_dict = {} def __repr__(self): - return ( - f'{self.__class__.__name__ }(name={self._name}, items={list(self._module_dict.keys())})' + format_str = ( + self.__class__.__name__ + f'(name={self._name}, items={list(self._module_dict.keys())})' ) + return format_str @property def name(self): diff --git a/omnisafe/configs/off-policy/DDPG.yaml b/omnisafe/configs/off-policy/DDPG.yaml index 116ba7f95..5abb1f71e 100644 --- a/omnisafe/configs/off-policy/DDPG.yaml +++ b/omnisafe/configs/off-policy/DDPG.yaml @@ -18,6 +18,8 @@ defaults: ## -----------------------------Basic configurations for base class DDPG---------------------- ## # The random seed seed: 0 + # The environment wrapper type + wrapper_type: OffPolicyEnvWrapper # Number of epochs epochs: 500 # Number of steps per epoch diff --git a/omnisafe/configs/off-policy/DDPGLag.yaml b/omnisafe/configs/off-policy/DDPGLag.yaml index cb6f94a0c..9e948fc3b 100644 --- a/omnisafe/configs/off-policy/DDPGLag.yaml +++ b/omnisafe/configs/off-policy/DDPGLag.yaml @@ -18,6 +18,8 @@ defaults: ## ----------------------------Basic configurations for base class DDPG----------------------- ## # The random seed seed: 0 + # The environment wrapper type + wrapper_type: OffPolicyEnvWrapper # Number of epochs epochs: 500 # Number of steps per epoch diff --git a/omnisafe/configs/off-policy/SAC.yaml b/omnisafe/configs/off-policy/SAC.yaml index 8beafad08..736203daf 100644 --- a/omnisafe/configs/off-policy/SAC.yaml +++ b/omnisafe/configs/off-policy/SAC.yaml @@ -18,6 +18,8 @@ defaults: ## ----------------------------Basic configurations for base class DDPG----------------------- ## # The random seed seed: 0 + # The environment wrapper type + wrapper_type: OffPolicyEnvWrapper # Number of epochs epochs: 500 # Number of steps per epoch diff --git a/omnisafe/configs/off-policy/SACLag.yaml b/omnisafe/configs/off-policy/SACLag.yaml index 42f21b52c..27e541c4e 100644 --- a/omnisafe/configs/off-policy/SACLag.yaml +++ b/omnisafe/configs/off-policy/SACLag.yaml @@ -18,6 +18,8 @@ defaults: ## ----------------------------Basic configurations for base class DDPG----------------------- ## # The random seed seed: 0 + # The environment wrapper type + wrapper_type: OffPolicyEnvWrapper # Number of epochs epochs: 500 # Number of steps per epoch diff --git a/omnisafe/configs/off-policy/SDDPG.yaml b/omnisafe/configs/off-policy/SDDPG.yaml index 45878f10f..2fa6c5819 100644 --- a/omnisafe/configs/off-policy/SDDPG.yaml +++ b/omnisafe/configs/off-policy/SDDPG.yaml @@ -18,6 +18,8 @@ defaults: ## -----------------------------Basic configurations for base 
class PG------------------------ ## # The random seed seed: 0 + # The environment wrapper type + wrapper_type: OffPolicyEnvWrapper # Number of epochs epochs: 500 # Number of steps per epoch diff --git a/omnisafe/configs/off-policy/TD3.yaml b/omnisafe/configs/off-policy/TD3.yaml index 90ce7a0ae..d7e38e746 100644 --- a/omnisafe/configs/off-policy/TD3.yaml +++ b/omnisafe/configs/off-policy/TD3.yaml @@ -18,6 +18,8 @@ defaults: ## ----------------------------Basic configurations for base class DDPG----------------------- ## # The random seed seed: 0 + # The environment wrapper type + wrapper_type: OffPolicyEnvWrapper # Number of epochs epochs: 500 # Number of steps per epoch diff --git a/omnisafe/configs/off-policy/TD3Lag.yaml b/omnisafe/configs/off-policy/TD3Lag.yaml index 49ae779fe..dd28ad391 100644 --- a/omnisafe/configs/off-policy/TD3Lag.yaml +++ b/omnisafe/configs/off-policy/TD3Lag.yaml @@ -18,6 +18,8 @@ defaults: ## ----------------------------Basic configurations for base class DDPG----------------------- ## # The random seed seed: 0 + # The environment wrapper type + wrapper_type: OffPolicyEnvWrapper # Number of epochs epochs: 500 # Number of steps per epoch diff --git a/omnisafe/configs/on-policy/CPO.yaml b/omnisafe/configs/on-policy/CPO.yaml index 7be369c59..7c8ab4026 100644 --- a/omnisafe/configs/on-policy/CPO.yaml +++ b/omnisafe/configs/on-policy/CPO.yaml @@ -18,6 +18,8 @@ defaults: ## -----------------------------Basic configurations for base class PG------------------------ ## # The random seed seed: 0 + # The environment wrapper type + wrapper_type: OnPolicyEnvWrapper # Number of epochs epochs: 500 # Number of steps per epoch diff --git a/omnisafe/configs/on-policy/CPPOPid.yaml b/omnisafe/configs/on-policy/CPPOPid.yaml index 618d5fe00..3589a5ae7 100644 --- a/omnisafe/configs/on-policy/CPPOPid.yaml +++ b/omnisafe/configs/on-policy/CPPOPid.yaml @@ -18,6 +18,8 @@ defaults: ## -----------------------------Basic configurations for base class PG------------------------ ## # The random seed seed: 0 + # The environment wrapper type + wrapper_type: OnPolicyEnvWrapper # Number of epochs epochs: 500 # Number of steps per epoch diff --git a/omnisafe/configs/on-policy/CUP.yaml b/omnisafe/configs/on-policy/CUP.yaml index 5cd9c3486..baa0a5354 100644 --- a/omnisafe/configs/on-policy/CUP.yaml +++ b/omnisafe/configs/on-policy/CUP.yaml @@ -18,6 +18,8 @@ defaults: ## -----------------------------Basic configurations for base class PG------------------------ ## # The random seed seed: 0 + # The environment wrapper type + wrapper_type: OnPolicyEnvWrapper # Number of epochs epochs: 500 # Number of steps per epoch diff --git a/omnisafe/configs/on-policy/FOCOPS.yaml b/omnisafe/configs/on-policy/FOCOPS.yaml index 57c24fefb..5ed15f3ce 100644 --- a/omnisafe/configs/on-policy/FOCOPS.yaml +++ b/omnisafe/configs/on-policy/FOCOPS.yaml @@ -18,6 +18,8 @@ defaults: ## -----------------------------Basic configurations for base class PG------------------------ ## # The random seed seed: 0 + # The environment wrapper type + wrapper_type: OnPolicyEnvWrapper # Number of epochs epochs: 500 # Number of steps per epoch diff --git a/omnisafe/configs/on-policy/NPGLag.yaml b/omnisafe/configs/on-policy/NPGLag.yaml index c1540f9ab..8ddc14c4c 100644 --- a/omnisafe/configs/on-policy/NPGLag.yaml +++ b/omnisafe/configs/on-policy/NPGLag.yaml @@ -18,6 +18,8 @@ defaults: ## -----------------------------Basic configurations for base class PG------------------------ ## # The random seed seed: 0 + # The environment wrapper type + 
wrapper_type: OnPolicyEnvWrapper # Number of epochs epochs: 500 # Number of steps per epoch diff --git a/omnisafe/configs/on-policy/NaturalPG.yaml b/omnisafe/configs/on-policy/NaturalPG.yaml index 180a7a594..301faea0c 100644 --- a/omnisafe/configs/on-policy/NaturalPG.yaml +++ b/omnisafe/configs/on-policy/NaturalPG.yaml @@ -18,6 +18,8 @@ defaults: ## -----------------------------Basic configurations for base class PG------------------------ ## # The random seed seed: 0 + # The environment wrapper type + wrapper_type: OnPolicyEnvWrapper # Number of epochs epochs: 500 # Number of steps per epoch diff --git a/omnisafe/configs/on-policy/PCPO.yaml b/omnisafe/configs/on-policy/PCPO.yaml index 23bbd2d94..558fa0fa0 100644 --- a/omnisafe/configs/on-policy/PCPO.yaml +++ b/omnisafe/configs/on-policy/PCPO.yaml @@ -18,6 +18,8 @@ defaults: ## -----------------------------Basic configurations for base class PG------------------------ ## # The random seed seed: 0 + # The environment wrapper type + wrapper_type: OnPolicyEnvWrapper # Number of epochs epochs: 500 # Number of steps per epoch diff --git a/omnisafe/configs/on-policy/PDO.yaml b/omnisafe/configs/on-policy/PDO.yaml index 917981a2d..40983174e 100644 --- a/omnisafe/configs/on-policy/PDO.yaml +++ b/omnisafe/configs/on-policy/PDO.yaml @@ -18,6 +18,8 @@ defaults: ## -----------------------------Basic configurations for base class PG------------------------ ## # The random seed seed: 0 + # The environment wrapper type + wrapper_type: OnPolicyEnvWrapper # Number of epochs epochs: 500 # Number of steps per epoch diff --git a/omnisafe/configs/on-policy/PPO.yaml b/omnisafe/configs/on-policy/PPO.yaml index 41452dc01..cff8630f1 100644 --- a/omnisafe/configs/on-policy/PPO.yaml +++ b/omnisafe/configs/on-policy/PPO.yaml @@ -18,6 +18,8 @@ defaults: ## -----------------------------Basic configurations for base class PG------------------------ ## # The random seed seed: 0 + # The environment wrapper type + wrapper_type: OnPolicyEnvWrapper # Number of epochs epochs: 500 # Number of steps per epoch diff --git a/omnisafe/configs/on-policy/PPOEarlyTerminated.yaml b/omnisafe/configs/on-policy/PPOEarlyTerminated.yaml index 2a80bd88b..056620a39 100644 --- a/omnisafe/configs/on-policy/PPOEarlyTerminated.yaml +++ b/omnisafe/configs/on-policy/PPOEarlyTerminated.yaml @@ -18,6 +18,8 @@ defaults: ## -----------------------------Basic configurations for base class PG------------------------ ## # The random seed seed: 0 + # The environment wrapper type + wrapper_type: EarlyTerminatedEnvWrapper # Number of epochs epochs: 500 # Number of steps per epoch diff --git a/omnisafe/configs/on-policy/PPOLag.yaml b/omnisafe/configs/on-policy/PPOLag.yaml index 87bb2b7d6..60c75f289 100644 --- a/omnisafe/configs/on-policy/PPOLag.yaml +++ b/omnisafe/configs/on-policy/PPOLag.yaml @@ -18,6 +18,8 @@ defaults: ## -----------------------------Basic configurations for base class PG------------------------ ## # The random seed seed: 0 + # The environment wrapper type + wrapper_type: OnPolicyEnvWrapper # Number of epochs epochs: 500 # Number of steps per epoch diff --git a/omnisafe/configs/on-policy/PPOLagEarlyTerminated.yaml b/omnisafe/configs/on-policy/PPOLagEarlyTerminated.yaml index 853fa1938..4e9930adf 100644 --- a/omnisafe/configs/on-policy/PPOLagEarlyTerminated.yaml +++ b/omnisafe/configs/on-policy/PPOLagEarlyTerminated.yaml @@ -18,6 +18,8 @@ defaults: ## -----------------------------Basic configurations for base class PG------------------------ ## # The random seed seed: 0 + # The environment 
wrapper type + wrapper_type: EarlyTerminatedEnvWrapper # Number of epochs epochs: 500 # Number of steps per epoch diff --git a/omnisafe/configs/on-policy/PPOLagSaute.yaml b/omnisafe/configs/on-policy/PPOLagSaute.yaml index c956352f6..1a21bcc0d 100644 --- a/omnisafe/configs/on-policy/PPOLagSaute.yaml +++ b/omnisafe/configs/on-policy/PPOLagSaute.yaml @@ -18,6 +18,8 @@ defaults: ## -----------------------------Basic configurations for base class PG------------------------ ## # The random seed seed: 0 + # The environment wrapper type + wrapper_type: SauteEnvWrapper # Number of epochs epochs: 500 # Number of steps per epoch diff --git a/omnisafe/configs/on-policy/PPOLagSimmerPid.yaml b/omnisafe/configs/on-policy/PPOLagSimmerPid.yaml index 2b4910a33..e07e8635d 100644 --- a/omnisafe/configs/on-policy/PPOLagSimmerPid.yaml +++ b/omnisafe/configs/on-policy/PPOLagSimmerPid.yaml @@ -18,6 +18,8 @@ defaults: ## -----------------------------Basic configurations for base class PG------------------------ ## # The random seed seed: 0 + # The environment wrapper type + wrapper_type: SimmerEnvWrapper # Number of epochs epochs: 500 # Number of steps per epoch diff --git a/omnisafe/configs/on-policy/PPOLagSimmerQ.yaml b/omnisafe/configs/on-policy/PPOLagSimmerQ.yaml index cdbcf2190..de56ab841 100644 --- a/omnisafe/configs/on-policy/PPOLagSimmerQ.yaml +++ b/omnisafe/configs/on-policy/PPOLagSimmerQ.yaml @@ -18,6 +18,8 @@ defaults: ## -----------------------------Basic configurations for base class PG------------------------ ## # The random seed seed: 0 + # The environment wrapper type + wrapper_type: SimmerEnvWrapper # Number of epochs epochs: 500 # Number of steps per epoch diff --git a/omnisafe/configs/on-policy/PPOSaute.yaml b/omnisafe/configs/on-policy/PPOSaute.yaml index be7ef49ce..dbd20ae0c 100644 --- a/omnisafe/configs/on-policy/PPOSaute.yaml +++ b/omnisafe/configs/on-policy/PPOSaute.yaml @@ -18,6 +18,8 @@ defaults: ## -----------------------------Basic configurations for base class PG------------------------ ## # The random seed seed: 0 + # The environment wrapper type + wrapper_type: SauteEnvWrapper # Number of epochs epochs: 500 # Number of steps per epoch diff --git a/omnisafe/configs/on-policy/PPOSimmerPid.yaml b/omnisafe/configs/on-policy/PPOSimmerPid.yaml index 469b7340e..0de3587c3 100644 --- a/omnisafe/configs/on-policy/PPOSimmerPid.yaml +++ b/omnisafe/configs/on-policy/PPOSimmerPid.yaml @@ -18,6 +18,8 @@ defaults: ## -----------------------------Basic configurations for base class PG------------------------ ## # The random seed seed: 0 + # The environment wrapper type + wrapper_type: SimmerEnvWrapper # Number of epochs epochs: 500 # Number of steps per epoch diff --git a/omnisafe/configs/on-policy/PPOSimmerQ.yaml b/omnisafe/configs/on-policy/PPOSimmerQ.yaml index 6b4d4fe44..4d640d877 100644 --- a/omnisafe/configs/on-policy/PPOSimmerQ.yaml +++ b/omnisafe/configs/on-policy/PPOSimmerQ.yaml @@ -18,6 +18,8 @@ defaults: ## -----------------------------Basic configurations for base class PG------------------------ ## # The random seed seed: 0 + # The environment wrapper type + wrapper_type: SimmerEnvWrapper # Number of epochs epochs: 500 # Number of steps per epoch diff --git a/omnisafe/configs/on-policy/PolicyGradient.yaml b/omnisafe/configs/on-policy/PolicyGradient.yaml index 4ee16278c..f2eb2bd31 100644 --- a/omnisafe/configs/on-policy/PolicyGradient.yaml +++ b/omnisafe/configs/on-policy/PolicyGradient.yaml @@ -18,6 +18,8 @@ defaults: ## -----------------------------Basic configurations for base class 
PG------------------------ ## # The random seed seed: 0 + # The environment wrapper type + wrapper_type: OnPolicyEnvWrapper # Number of epochs epochs: 500 # Number of steps per epoch diff --git a/omnisafe/configs/on-policy/TRPO.yaml b/omnisafe/configs/on-policy/TRPO.yaml index 3e52373bd..2dfdeb532 100644 --- a/omnisafe/configs/on-policy/TRPO.yaml +++ b/omnisafe/configs/on-policy/TRPO.yaml @@ -18,6 +18,8 @@ defaults: ## -----------------------------Basic configurations for base class PG------------------------ ## # The random seed seed: 0 + # The environment wrapper type + wrapper_type: OnPolicyEnvWrapper # Number of epochs epochs: 500 # Number of steps per epoch diff --git a/omnisafe/configs/on-policy/TRPOLag.yaml b/omnisafe/configs/on-policy/TRPOLag.yaml index 9ba653926..5bff7d726 100644 --- a/omnisafe/configs/on-policy/TRPOLag.yaml +++ b/omnisafe/configs/on-policy/TRPOLag.yaml @@ -18,6 +18,8 @@ defaults: ## -----------------------------Basic configurations for base class PG------------------------ ## # The random seed seed: 0 + # The environment wrapper type + wrapper_type: OnPolicyEnvWrapper # Number of epochs epochs: 500 # Number of steps per epoch diff --git a/omnisafe/configs/on-policy/TRPOPid.yaml b/omnisafe/configs/on-policy/TRPOPid.yaml index 87ca151ed..5c693a6cc 100644 --- a/omnisafe/configs/on-policy/TRPOPid.yaml +++ b/omnisafe/configs/on-policy/TRPOPid.yaml @@ -18,6 +18,8 @@ defaults: ## -----------------------------Basic configurations for base class PG------------------------ ## # The random seed seed: 0 + # The environment wrapper type + wrapper_type: OnPolicyEnvWrapper # Number of epochs epochs: 500 # Number of steps per epoch diff --git a/omnisafe/models/actor/gaussian_annealing_actor.py b/omnisafe/models/actor/gaussian_annealing_actor.py index 8672de9dd..065abc801 100644 --- a/omnisafe/models/actor/gaussian_annealing_actor.py +++ b/omnisafe/models/actor/gaussian_annealing_actor.py @@ -76,13 +76,10 @@ def predict(self, obs, deterministic=False, need_log_prob=True): else: out = dist.sample() - action = torch.clamp(out, -1, 1) - action = 0.5 * (action + 1) * (self.act_max - self.act_min) + self.act_min - if need_log_prob: log_prob = dist.log_prob(out).sum(axis=-1) - return action, log_prob - return action + return out, log_prob + return out def forward(self, obs, act=None): dist = self._distribution(obs) diff --git a/omnisafe/models/actor/gaussian_learning_actor.py b/omnisafe/models/actor/gaussian_learning_actor.py index b980a5086..5cddf42eb 100644 --- a/omnisafe/models/actor/gaussian_learning_actor.py +++ b/omnisafe/models/actor/gaussian_learning_actor.py @@ -72,15 +72,15 @@ def predict(self, obs, deterministic=False, need_log_prob=False): if deterministic: out = dist.mean else: - out = dist.rsample() + out = dist.sample() action = torch.clamp(out, -1, 1) action = self.act_min + (action + 1) * 0.5 * (self.act_max - self.act_min) if need_log_prob: log_prob = dist.log_prob(out).sum(axis=-1) - return action, log_prob - return action + return out, log_prob + return out def forward(self, obs, act=None): dist = self._distribution(obs) diff --git a/omnisafe/models/actor/gaussian_stdnet_actor.py b/omnisafe/models/actor/gaussian_stdnet_actor.py index 01ec10409..2c67f2052 100644 --- a/omnisafe/models/actor/gaussian_stdnet_actor.py +++ b/omnisafe/models/actor/gaussian_stdnet_actor.py @@ -90,7 +90,7 @@ def predict(self, obs, deterministic=False, need_log_prob=False): if deterministic: out = dist.mean else: - out = dist.rsample() + out = dist.sample() action = torch.tanh(out) action = 
self.act_min + (action + 1) * 0.5 * (self.act_max - self.act_min) diff --git a/omnisafe/wrappers/off_policy_wrapper.py b/omnisafe/wrappers/off_policy_wrapper.py index 95f584d69..d3a18c810 100644 --- a/omnisafe/wrappers/off_policy_wrapper.py +++ b/omnisafe/wrappers/off_policy_wrapper.py @@ -23,7 +23,7 @@ # pylint: disable=too-many-instance-attributes @WRAPPER_REGISTRY.register class OffPolicyEnvWrapper: - """OffPolicyEnvWrapper""" + """OffPolicyEnvWrapperr""" def __init__( self, diff --git a/omnisafe/wrappers/simmer_wrapper.py b/omnisafe/wrappers/simmer_wrapper.py index 1b1989545..8d1f2ca46 100644 --- a/omnisafe/wrappers/simmer_wrapper.py +++ b/omnisafe/wrappers/simmer_wrapper.py @@ -344,7 +344,7 @@ def __init__( ) def augment_obs(self, obs: np.array, safety_obs: np.array): - """Augmenting the obs with the safety obs. + """Augmenting the obs with the safety obs, if needed. Args: obs (np.array): The observation. From 7148693180c4e7139164a8597ac14bd88bb576ff Mon Sep 17 00:00:00 2001 From: Gaiejj <524339208@qq.com> Date: Fri, 23 Dec 2022 09:51:22 +0800 Subject: [PATCH 32/39] feat: add configs tool function --- omnisafe/algorithms/off_policy/ddpg.py | 3 ++- omnisafe/algorithms/on_policy/base/policy_gradient.py | 3 ++- omnisafe/utils/config_utils.py | 7 +++++++ 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/omnisafe/algorithms/off_policy/ddpg.py b/omnisafe/algorithms/off_policy/ddpg.py index 802768f22..3cb50e026 100644 --- a/omnisafe/algorithms/off_policy/ddpg.py +++ b/omnisafe/algorithms/off_policy/ddpg.py @@ -25,6 +25,7 @@ from omnisafe.common.logger import Logger from omnisafe.models.constraint_actor_q_critic import ConstraintActorQCritic from omnisafe.utils import core, distributed_utils +from omnisafe.utils.config_utils import create_dict_from_namedtuple from omnisafe.utils.tools import get_flat_params_from from omnisafe.wrappers import wrapper_registry @@ -93,7 +94,7 @@ def __init__( # Set up logger and save configuration to disk self.logger = Logger(exp_name=cfgs.exp_name, data_dir=cfgs.data_dir, seed=cfgs.seed) - self.logger.save_config(cfgs._asdict()) + self.logger.save_config(create_dict_from_namedtuple(cfgs)) # Set seed seed = cfgs.seed + 10000 * distributed_utils.proc_id() torch.manual_seed(seed) diff --git a/omnisafe/algorithms/on_policy/base/policy_gradient.py b/omnisafe/algorithms/on_policy/base/policy_gradient.py index a8dca9f0e..ac8aac8ff 100644 --- a/omnisafe/algorithms/on_policy/base/policy_gradient.py +++ b/omnisafe/algorithms/on_policy/base/policy_gradient.py @@ -25,6 +25,7 @@ from omnisafe.common.logger import Logger from omnisafe.models.constraint_actor_critic import ConstraintActorCritic from omnisafe.utils import core, distributed_utils +from omnisafe.utils.config_utils import create_dict_from_namedtuple from omnisafe.utils.tools import get_flat_params_from from omnisafe.wrappers import wrapper_registry @@ -73,7 +74,7 @@ def __init__( # Set up logger and save configuration to disk self.logger = Logger(exp_name=cfgs.exp_name, data_dir=cfgs.data_dir, seed=cfgs.seed) - self.logger.save_config(cfgs._asdict()) + self.logger.save_config(create_dict_from_namedtuple(cfgs)) # Set seed seed = int(cfgs.seed) + 10000 * distributed_utils.proc_id() torch.manual_seed(seed) diff --git a/omnisafe/utils/config_utils.py b/omnisafe/utils/config_utils.py index d0e9c8cf3..741fe6904 100644 --- a/omnisafe/utils/config_utils.py +++ b/omnisafe/utils/config_utils.py @@ -60,6 +60,13 @@ def create_namedtuple_from_dict(obj): return obj +def create_dict_from_namedtuple(obj): + 
"""Create dict from name-tuple""" + if isinstance(obj, tuple): + return {key: create_dict_from_namedtuple(value) for key, value in obj._asdict().items()} + return obj + + def check_all_configs(configs, algo_type): """Check all configs""" if algo_type == 'on-policy': From 65cc1db27b5135e75bc836896ca66d7757f3caf1 Mon Sep 17 00:00:00 2001 From: Gaiejj <524339208@qq.com> Date: Fri, 23 Dec 2022 11:57:59 +0800 Subject: [PATCH 33/39] docs: update docstring --- omnisafe/algorithms/off_policy/ddpg.py | 11 +++++------ omnisafe/algorithms/off_policy/ddpg_lag.py | 11 +++++------ omnisafe/algorithms/off_policy/sac.py | 10 ++++------ omnisafe/algorithms/off_policy/sac_lag.py | 9 ++++----- omnisafe/algorithms/off_policy/sddpg.py | 10 +++++----- omnisafe/algorithms/off_policy/td3.py | 9 ++++----- omnisafe/algorithms/off_policy/td3_lag.py | 11 +++++------ omnisafe/algorithms/on_policy/base/natural_pg.py | 7 +++---- .../algorithms/on_policy/base/policy_gradient.py | 7 +++---- omnisafe/algorithms/on_policy/base/ppo.py | 8 ++++---- omnisafe/algorithms/on_policy/base/trpo.py | 10 +++++----- .../early_terminated/ppo_early_terminated.py | 10 +++++----- .../early_terminated/ppo_lag_early_terminated.py | 10 +++++----- .../algorithms/on_policy/first_order/__init__.py | 2 +- omnisafe/algorithms/on_policy/first_order/cup.py | 11 +++++------ .../algorithms/on_policy/first_order/focops.py | 8 +++----- .../algorithms/on_policy/naive_lagrange/npg_lag.py | 8 +++----- .../algorithms/on_policy/naive_lagrange/pdo.py | 6 ++---- .../algorithms/on_policy/naive_lagrange/ppo_lag.py | 14 ++++++-------- .../on_policy/naive_lagrange/trpo_lag.py | 11 +++++------ .../algorithms/on_policy/pid_lagrange/cppo_pid.py | 11 +++++------ .../algorithms/on_policy/pid_lagrange/trpo_pid.py | 12 +++++------- .../algorithms/on_policy/saute/ppo_lag_saute.py | 12 ++++++------ omnisafe/algorithms/on_policy/saute/ppo_saute.py | 12 +++++------- omnisafe/algorithms/on_policy/second_order/cpo.py | 9 ++++----- omnisafe/algorithms/on_policy/second_order/pcpo.py | 7 +++---- .../on_policy/simmer/ppo_lag_simmer_pid.py | 11 +++++------ .../on_policy/simmer/ppo_lag_simmer_q.py | 11 +++++------ .../algorithms/on_policy/simmer/ppo_simmer_pid.py | 10 +++++----- .../algorithms/on_policy/simmer/ppo_simmer_q.py | 10 +++++----- omnisafe/algorithms/registry.py | 6 +++--- 31 files changed, 133 insertions(+), 161 deletions(-) diff --git a/omnisafe/algorithms/off_policy/ddpg.py b/omnisafe/algorithms/off_policy/ddpg.py index 3cb50e026..8233a095e 100644 --- a/omnisafe/algorithms/off_policy/ddpg.py +++ b/omnisafe/algorithms/off_policy/ddpg.py @@ -32,14 +32,13 @@ @registry.register class DDPG: # pylint: disable=too-many-instance-attributes - """Continuous control with deep reinforcement learning (DDPG) Algorithm. + """The Deep Deterministic Policy Gradient (DDPG) algorithm. References: - Paper Name: Continuous control with deep reinforcement learning. - Paper author: Timothy P. Lillicrap, Jonathan J. Hunt, Alexander Pritzel, Nicolas Heess, - Tom Erez, Yuval Tassa, David Silver, Daan Wierstra. - Paper URL: https://arxiv.org/abs/1509.02971 - + Title: Continuous control with deep reinforcement learning + Authors: Timothy P. Lillicrap, Jonathan J. Hunt, Alexander Pritzel, Nicolas Heess, Tom Erez, + Yuval Tassa, David Silver, Daan Wierstra. 
+ URL: https://arxiv.org/abs/1509.02971 """ def __init__( diff --git a/omnisafe/algorithms/off_policy/ddpg_lag.py b/omnisafe/algorithms/off_policy/ddpg_lag.py index ad4a84c0b..816a1201f 100644 --- a/omnisafe/algorithms/off_policy/ddpg_lag.py +++ b/omnisafe/algorithms/off_policy/ddpg_lag.py @@ -21,14 +21,13 @@ @registry.register class DDPGLag(DDPG, Lagrange): # pylint: disable=too-many-instance-attributes - """The Lagrange version of DDPG Algorithm. + """The Deep Deterministic Policy Gradient (DDPG) algorithm. References: - Paper Name: Continuous control with deep reinforcement learning. - Paper author: Timothy P. Lillicrap, Jonathan J. Hunt, Alexander Pritzel, Nicolas Heess, - Tom Erez, Yuval Tassa, David Silver, Daan Wierstra. - Paper URL: https://arxiv.org/abs/1509.02971 - + Title: Continuous control with deep reinforcement learning + Authors: Timothy P. Lillicrap, Jonathan J. Hunt, Alexander Pritzel, Nicolas Heess, Tom Erez, + Yuval Tassa, David Silver, Daan Wierstra. + URL: https://arxiv.org/abs/1509.02971 """ def __init__( diff --git a/omnisafe/algorithms/off_policy/sac.py b/omnisafe/algorithms/off_policy/sac.py index d8070e346..b6ddd1cf8 100644 --- a/omnisafe/algorithms/off_policy/sac.py +++ b/omnisafe/algorithms/off_policy/sac.py @@ -14,7 +14,6 @@ # ============================================================================== """Implementation of the SAC algorithm.""" - import torch from omnisafe.algorithms import registry @@ -23,13 +22,12 @@ @registry.register class SAC(DDPG): # pylint: disable=too-many-instance-attributes - """Implementation of the SAC algorithm. + """The Soft Actor-Critic (SAC) algorithm. References: - Paper Name: Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor - Paper author: Tuomas Haarnoja, Aurick Zhou, Pieter Abbeel, Sergey Levine - Paper URL: https://arxiv.org/abs/1801.01290 - + Title: Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor + Authors: Tuomas Haarnoja, Aurick Zhou, Pieter Abbeel, Sergey Levine. + URL: https://arxiv.org/abs/1801.01290 """ def __init__( diff --git a/omnisafe/algorithms/off_policy/sac_lag.py b/omnisafe/algorithms/off_policy/sac_lag.py index 7dc2472ad..2fb7d45db 100644 --- a/omnisafe/algorithms/off_policy/sac_lag.py +++ b/omnisafe/algorithms/off_policy/sac_lag.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of the SACLag algorithm.""" +"""Implementation of the Lagrange version of the SAC algorithm.""" import torch @@ -26,10 +26,9 @@ class SACLag(SAC, Lagrange): # pylint: disable=too-many-instance-attributes """The Lagrange version of SAC algorithm. References: - Paper Name: Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor - Paper author: Tuomas Haarnoja, Aurick Zhou, Pieter Abbeel, Sergey Levine - Paper URL: https://arxiv.org/abs/1801.01290 - + Title: Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor + Authors: Tuomas Haarnoja, Aurick Zhou, Pieter Abbeel, Sergey Levine. 
+ URL: https://arxiv.org/abs/1801.01290 """ def __init__( diff --git a/omnisafe/algorithms/off_policy/sddpg.py b/omnisafe/algorithms/off_policy/sddpg.py index d49e0734d..024f072a3 100644 --- a/omnisafe/algorithms/off_policy/sddpg.py +++ b/omnisafe/algorithms/off_policy/sddpg.py @@ -29,13 +29,13 @@ @registry.register class SDDPG(DDPG): # pylint: disable=too-many-instance-attributes,invalid-name - """Implementation of SDDPG Algorithm. + """Implementation of the SDDPG algorithm. References: - Paper Name: Lyapunov-based Safe Policy Optimization for Continuous Control. - Paper author: Yinlam Chow, Ofir Nachum, Aleksandra Faust, Edgar Duenez-Guzman, Mohammad Ghavamzadeh. - Paper URL: https://arxiv.org/abs/1901.10031 - + Title: Lyapunov-based Safe Policy Optimization for Continuous Control + Authors: Yinlam Chow, Ofir Nachum, Aleksandra Faust, Edgar Duenez-Guzman, + Mohammad Ghavamzadeh. + URL: https://arxiv.org/abs/1901.10031 """ def __init__( diff --git a/omnisafe/algorithms/off_policy/td3.py b/omnisafe/algorithms/off_policy/td3.py index ec7607e70..13de54bf2 100644 --- a/omnisafe/algorithms/off_policy/td3.py +++ b/omnisafe/algorithms/off_policy/td3.py @@ -22,13 +22,12 @@ @registry.register class TD3(DDPG): # pylint: disable=too-many-instance-attributes - """Implementation of TD3 Algorithm. + """The Twin Delayed DDPG (TD3) algorithm. References: - Paper Name: Addressing Function Approximation Error in Actor-Critic Methods. - Paper author: Scott Fujimoto, Herke van Hoof, David Meger. - Paper URL: https://arxiv.org/abs/1802.09477 - + Title: Addressing Function Approximation Error in Actor-Critic Methods + Authors: Scott Fujimoto, Herke van Hoof, David Meger. + URL: https://arxiv.org/abs/1802.09477 """ def __init__( diff --git a/omnisafe/algorithms/off_policy/td3_lag.py b/omnisafe/algorithms/off_policy/td3_lag.py index 077a7d2cc..ee3ce0a3d 100644 --- a/omnisafe/algorithms/off_policy/td3_lag.py +++ b/omnisafe/algorithms/off_policy/td3_lag.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of the TD3Lag algorithm.""" +"""Implementation of the Lagrange version of the TD3 algorithm.""" from omnisafe.algorithms import registry from omnisafe.algorithms.off_policy.td3 import TD3 @@ -21,13 +21,12 @@ @registry.register class TD3Lag(TD3, Lagrange): # pylint: disable=too-many-instance-attributes - """The Lagrange version of TD3 Algorithm. + """The Lagrange version of the TD3 algorithm References: - Paper Name: Addressing Function Approximation Error in Actor-Critic Methods. - Paper author: Scott Fujimoto, Herke van Hoof, David Meger. - Paper URL: https://arxiv.org/abs/1802.09477 - + Title: Addressing Function Approximation Error in Actor-Critic Methods + Authors: Scott Fujimoto, Herke van Hoof, David Meger. + URL: https://arxiv.org/abs/1802.09477 """ def __init__( diff --git a/omnisafe/algorithms/on_policy/base/natural_pg.py b/omnisafe/algorithms/on_policy/base/natural_pg.py index 71ed950ed..2634722eb 100644 --- a/omnisafe/algorithms/on_policy/base/natural_pg.py +++ b/omnisafe/algorithms/on_policy/base/natural_pg.py @@ -32,10 +32,9 @@ class NaturalPG(PolicyGradient): """The Natural Policy Gradient algorithm. References: - Paper Name: A Natural Policy Gradient. - Paper author: Sham Kakade. 
- Paper URL: https://proceedings.neurips.cc/paper/2001/file/4b86abe48d358ecf194c56c69108433e-Paper.pdf - + Title: A Natural Policy Gradient + Author: Sham Kakade. + URL: https://proceedings.neurips.cc/paper/2001/file/4b86abe48d358ecf194c56c69108433e-Paper.pdf """ def __init__( diff --git a/omnisafe/algorithms/on_policy/base/policy_gradient.py b/omnisafe/algorithms/on_policy/base/policy_gradient.py index ac8aac8ff..965af8e5e 100644 --- a/omnisafe/algorithms/on_policy/base/policy_gradient.py +++ b/omnisafe/algorithms/on_policy/base/policy_gradient.py @@ -35,10 +35,9 @@ class PolicyGradient: # pylint: disable=too-many-instance-attributes """The Policy Gradient algorithm. References: - Paper Name: Policy Gradient Methods for Reinforcement Learning with Function Approximation - Paper Author: Richard S. Sutton, David McAllester, Satinder Singh, Yishay Mansour - Paper URL: https://proceedings.neurips.cc/paper/1999/file/464d828b85b0bed98e80ade0a5c43b0f-Paper.pdf - + Title: Policy Gradient Methods for Reinforcement Learning with Function Approximation + Authors: Richard S. Sutton, David McAllester, Satinder Singh, Yishay Mansour. + URL: https://proceedings.neurips.cc/paper/1999/file/464d828b85b0bed98e80ade0a5c43b0f-Paper.pdf """ # pylint: disable-next=too-many-locals diff --git a/omnisafe/algorithms/on_policy/base/ppo.py b/omnisafe/algorithms/on_policy/base/ppo.py index bf3a1c897..8e4bd7dcb 100644 --- a/omnisafe/algorithms/on_policy/base/ppo.py +++ b/omnisafe/algorithms/on_policy/base/ppo.py @@ -23,12 +23,12 @@ @registry.register class PPO(PolicyGradient): - """The Proximal Policy Optimization Algorithms (PPO) Algorithm. + """The Proximal Policy Optimization (PPO) algorithm. References: - Paper Name: Proximal Policy Optimization Algorithms. - Paper author: John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, Oleg Klimov. - Paper URL: https://arxiv.org/pdf/1707.06347.pdf + Title: Proximal Policy Optimization Algorithms + Authors: John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, Oleg Klimov. + URL: https://arxiv.org/abs/1707.06347 """ # pylint: disable-next=too-many-arguments diff --git a/omnisafe/algorithms/on_policy/base/trpo.py b/omnisafe/algorithms/on_policy/base/trpo.py index 4eee0dea4..fe7ed362f 100644 --- a/omnisafe/algorithms/on_policy/base/trpo.py +++ b/omnisafe/algorithms/on_policy/base/trpo.py @@ -29,12 +29,12 @@ @registry.register class TRPO(NaturalPG): - """The Trust Region Policy Optimization (TRPO) Algorithm. + """The Trust Region Policy Optimization (TRPO) algorithm. References: - Paper Name: Trust Region Policy Optimization. - Paper author: John Schulman, Sergey Levine, Philipp Moritz, Michael I. Jordan, Pieter Abbeel. - Paper URL: https://arxiv.org/abs/1502.05477 + Title: Trust Region Policy Optimization + Authors: John Schulman, Sergey Levine, Philipp Moritz, Michael I. Jordan, Pieter Abbeel. + URL: https://arxiv.org/abs/1502.05477 """ def __init__( @@ -58,7 +58,7 @@ def search_step_size( total_steps=15, decay=0.8, ): - r"""TRPO performs line-search until constraint satisfaction. + """TRPO performs line-search until constraint satisfaction. 
search around for a satisfied step of policy update to improve loss and reward performance diff --git a/omnisafe/algorithms/on_policy/early_terminated/ppo_early_terminated.py b/omnisafe/algorithms/on_policy/early_terminated/ppo_early_terminated.py index 47f9e4629..922e5a9f1 100644 --- a/omnisafe/algorithms/on_policy/early_terminated/ppo_early_terminated.py +++ b/omnisafe/algorithms/on_policy/early_terminated/ppo_early_terminated.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of the Early terminated algorithm by PPO.""" +"""Implementation of the early terminated algorithm using PPO.""" from omnisafe.algorithms import registry from omnisafe.algorithms.on_policy.base.ppo import PPO @@ -20,12 +20,12 @@ @registry.register class PPOEarlyTerminated(PPO): - """Early terminated algorithm implemented by PPO. + """The early terminated algorithm implemented with PPO. References: - Paper Name: Safe Exploration by Solving Early Terminated MDP - Paper author: Hao Sun, Ziping Xu, Meng Fang, Zhenghao Peng, Jiadong Guo, Bo Dai, Bolei Zhou - Paper URL: https://arxiv.org/abs/2107.04200 + Title: Safe Exploration by Solving Early Terminated MDP + Authors: Hao Sun, Ziping Xu, Meng Fang, Zhenghao Peng, Jiadong Guo, Bo Dai, Bolei Zhou. + URL: https://arxiv.org/abs/2107.04200 """ # pylint: disable-next=too-many-arguments diff --git a/omnisafe/algorithms/on_policy/early_terminated/ppo_lag_early_terminated.py b/omnisafe/algorithms/on_policy/early_terminated/ppo_lag_early_terminated.py index d115837fb..e8301270b 100644 --- a/omnisafe/algorithms/on_policy/early_terminated/ppo_lag_early_terminated.py +++ b/omnisafe/algorithms/on_policy/early_terminated/ppo_lag_early_terminated.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of the Early terminated algorithm by PPOLag.""" +"""Implementation of the Lagrange version of the early terminated algorithm using PPOLag.""" from omnisafe.algorithms import registry from omnisafe.algorithms.on_policy.naive_lagrange.ppo_lag import PPOLag @@ -20,12 +20,12 @@ @registry.register class PPOLagEarlyTerminated(PPOLag): - """Early terminated algorithm implemented by PPOLag. + """The Lagrange version of the early terminated algorithm implemented with PPOLag. References: - Paper Name: Safe Exploration by Solving Early Terminated MDP - Paper author: Hao Sun, Ziping Xu, Meng Fang, Zhenghao Peng, Jiadong Guo, Bo Dai, Bolei Zhou - Paper URL: https://arxiv.org/abs/2107.04200 + Title: Safe Exploration by Solving Early Terminated MDP + Authors: Hao Sun, Ziping Xu, Meng Fang, Zhenghao Peng, Jiadong Guo, Bo Dai, Bolei Zhou. + URL: https://arxiv.org/abs/2107.04200 """ # pylint: disable-next=too-many-arguments diff --git a/omnisafe/algorithms/on_policy/first_order/__init__.py b/omnisafe/algorithms/on_policy/first_order/__init__.py index 635735694..3c0bde0ea 100644 --- a/omnisafe/algorithms/on_policy/first_order/__init__.py +++ b/omnisafe/algorithms/on_policy/first_order/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -"""FIrst-order algorithms.""" +"""First-order algorithms.""" from omnisafe.algorithms.on_policy.first_order.cup import CUP from omnisafe.algorithms.on_policy.first_order.focops import FOCOPS diff --git a/omnisafe/algorithms/on_policy/first_order/cup.py b/omnisafe/algorithms/on_policy/first_order/cup.py index aaa494fe9..043d3027e 100644 --- a/omnisafe/algorithms/on_policy/first_order/cup.py +++ b/omnisafe/algorithms/on_policy/first_order/cup.py @@ -24,13 +24,13 @@ @registry.register class CUP(PolicyGradient, Lagrange): - """The Constrained Update Projection Approach to Safe Policy Optimization. + """The Constrained Update Projection (CUP) Approach to Safe Policy Optimization. References: - Paper Name: Constrained Update Projection Approach to Safe Policy Optimization. - Paper author: Long Yang, Jiaming Ji, Juntao Dai, Linrui Zhang, Binbin Zhou, Pengfei Li, Yaodong Yang, Gang Pan. - Paper URL: https://arxiv.org/abs/2209.07089 - + Title: Constrained Update Projection Approach to Safe Policy Optimization + Authors: Long Yang, Jiaming Ji, Juntao Dai, Linrui Zhang, Binbin Zhou, Pengfei Li, + Yaodong Yang, Gang Pan. + URL: https://arxiv.org/abs/2209.07089 """ def __init__( @@ -54,7 +54,6 @@ def __init__( lambda_optimizer=self.cfgs.lagrange_cfgs.lambda_optimizer, lagrangian_upper_bound=self.cfgs.lagrange_cfgs.lagrangian_upper_bound, ) - self.algo = self.__class__.__name__ self.lam = self.cfgs.lam self.eta = self.cfgs.eta self.clip = self.cfgs.clip diff --git a/omnisafe/algorithms/on_policy/first_order/focops.py b/omnisafe/algorithms/on_policy/first_order/focops.py index 2fd6fe73d..df29cc45a 100644 --- a/omnisafe/algorithms/on_policy/first_order/focops.py +++ b/omnisafe/algorithms/on_policy/first_order/focops.py @@ -27,10 +27,9 @@ class FOCOPS(PolicyGradient, Lagrange): """The First Order Constrained Optimization in Policy Space (FOCOPS) algorithm. References: - Paper Name: First Order Constrained Optimization in Policy Space. - Paper author: Yiming Zhang, Quan Vuong, Keith W. Ross. - Paper URL: https://arxiv.org/abs/2002.06506 - + Title: First Order Constrained Optimization in Policy Space + Authors: Yiming Zhang, Quan Vuong, Keith W. Ross. + URL: https://arxiv.org/abs/2002.06506 """ def __init__( @@ -54,7 +53,6 @@ def __init__( lambda_optimizer=self.cfgs.lagrange_cfgs.lambda_optimizer, lagrangian_upper_bound=self.cfgs.lagrange_cfgs.lagrangian_upper_bound, ) - self.algo = self.__class__.__name__ self.lam = self.cfgs.lam self.eta = self.cfgs.eta diff --git a/omnisafe/algorithms/on_policy/naive_lagrange/npg_lag.py b/omnisafe/algorithms/on_policy/naive_lagrange/npg_lag.py index d4c2dbce3..3869c0a78 100644 --- a/omnisafe/algorithms/on_policy/naive_lagrange/npg_lag.py +++ b/omnisafe/algorithms/on_policy/naive_lagrange/npg_lag.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of the Lagrange version of Natural Policy Gradient algorithm.""" +"""Implementation of the Lagrange version of the Natural Policy Gradient algorithm.""" import torch @@ -23,10 +23,9 @@ @registry.register class NPGLag(NaturalPG, Lagrange): - """The Lagrange version of Natural Policy Gradient algorithm. - - A simple combination of Lagrange method and Natural Policy Gradient algorithm. + """The Lagrange version of the Natural Policy Gradient algorithm. 
+ A simple combination of the Lagrange method and the Natural Policy Gradient algorithm. """ def __init__( @@ -48,7 +47,6 @@ def __init__( lambda_lr=self.cfgs.lagrange_cfgs.lambda_lr, lambda_optimizer=self.cfgs.lagrange_cfgs.lambda_optimizer, ) - self.algo = self.__class__.__name__ def compute_loss_pi(self, data: dict): """ diff --git a/omnisafe/algorithms/on_policy/naive_lagrange/pdo.py b/omnisafe/algorithms/on_policy/naive_lagrange/pdo.py index 411efe1f1..c5736443b 100644 --- a/omnisafe/algorithms/on_policy/naive_lagrange/pdo.py +++ b/omnisafe/algorithms/on_policy/naive_lagrange/pdo.py @@ -23,10 +23,9 @@ @registry.register class PDO(PolicyGradient, Lagrange): - """The Lagrange version of Policy Gradient algorithm. - - A simple combination of Lagrange method and Policy Gradient algorithm. + """The Lagrange version of the Policy Gradient algorithm. + A simple combination of the Lagrange method and the Policy Gradient algorithm. """ def __init__( @@ -47,7 +46,6 @@ def __init__( lambda_lr=cfgs.lagrange_cfgs.lambda_lr, lambda_optimizer=cfgs.lagrange_cfgs.lambda_optimizer, ) - self.algo = self.__class__.__name__ def compute_loss_pi(self, data: dict): """ diff --git a/omnisafe/algorithms/on_policy/naive_lagrange/ppo_lag.py b/omnisafe/algorithms/on_policy/naive_lagrange/ppo_lag.py index a8c3e732b..952b12f74 100644 --- a/omnisafe/algorithms/on_policy/naive_lagrange/ppo_lag.py +++ b/omnisafe/algorithms/on_policy/naive_lagrange/ppo_lag.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of the Lagrange version of PPO algorithm.""" +"""Implementation of the Lagrange version of the PPO algorithm.""" import torch @@ -23,13 +23,12 @@ @registry.register class PPOLag(PolicyGradient, Lagrange): - """The Lagrange version of PPO algorithm. + """The Lagrange version of the PPO algorithm. References: - Paper Name: Benchmarking Safe Exploration in Deep Reinforcement Learning. - Paper author: Alex Ray, Joshua Achiam, Dario Amodei - Paper URL: https://cdn.openai.com/safexp-short.pdf - + Title: Benchmarking Safe Exploration in Deep Reinforcement Learning + Authors: Alex Ray, Joshua Achiam, Dario Amodei. + URL: https://cdn.openai.com/safexp-short.pdf """ # pylint: disable-next=too-many-arguments @@ -39,6 +38,7 @@ def __init__( cfgs, ): """Initialize PPO-Lag algorithm.""" + self.clip = cfgs.clip PolicyGradient.__init__( self, env_id=env_id, @@ -51,8 +51,6 @@ def __init__( lambda_lr=self.cfgs.lagrange_cfgs.lambda_lr, lambda_optimizer=self.cfgs.lagrange_cfgs.lambda_optimizer, ) - self.algo = self.__class__.__name__ - self.clip = cfgs.clip def algorithm_specific_logs(self): super().algorithm_specific_logs() diff --git a/omnisafe/algorithms/on_policy/naive_lagrange/trpo_lag.py b/omnisafe/algorithms/on_policy/naive_lagrange/trpo_lag.py index be037e1a9..746ba50dd 100644 --- a/omnisafe/algorithms/on_policy/naive_lagrange/trpo_lag.py +++ b/omnisafe/algorithms/on_policy/naive_lagrange/trpo_lag.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of the Lagrange version of TRPO algorithm.""" +"""Implementation of the Lagrange version of the TRPO algorithm.""" import torch @@ -23,12 +23,12 @@ @registry.register class TRPOLag(TRPO, Lagrange): - """The Lagrange version of TRPO algorithm. 
+ """The Lagrange version of the TRPO algorithm. References: - Paper Name: Benchmarking Safe Exploration in Deep Reinforcement Learning. - Paper author: Alex Ray, Joshua Achiam, Dario Amodei - Paper URL: https://cdn.openai.com/safexp-short.pdf + Title: Benchmarking Safe Exploration in Deep Reinforcement Learning + Authors: Alex Ray, Joshua Achiam, Dario Amodei. + URL: https://cdn.openai.com/safexp-short.pdf """ @@ -50,7 +50,6 @@ def __init__( lambda_lr=self.cfgs.lagrange_cfgs.lambda_lr, lambda_optimizer=self.cfgs.lagrange_cfgs.lambda_optimizer, ) - self.algo = self.__class__.__name__ def algorithm_specific_logs(self): super().algorithm_specific_logs() diff --git a/omnisafe/algorithms/on_policy/pid_lagrange/cppo_pid.py b/omnisafe/algorithms/on_policy/pid_lagrange/cppo_pid.py index 5fea4b9c7..901272e46 100644 --- a/omnisafe/algorithms/on_policy/pid_lagrange/cppo_pid.py +++ b/omnisafe/algorithms/on_policy/pid_lagrange/cppo_pid.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of the CPPO Pid-Lagrange algorithm.""" +"""Implementation of the PID-Lagrange version of the CPPO algorithm.""" import torch @@ -23,13 +23,12 @@ @registry.register class CPPOPid(PolicyGradient, PIDLagrangian): - """The Responsive Safety in Reinforcement Learning by PID Lagrangian Methods. + """The PID-Lagrange version of the CPPO algorithm. References: - Paper Name: Responsive Safety in Reinforcement Learning by PID Lagrangian Methods. - Paper author: Joshua Achiam, David Held, Aviv Tamar, Pieter Abbeel. - Paper URL: https://arxiv.org/abs/2007.03964 - + Title: Responsive Safety in Reinforcement Learning by PID Lagrangian Methods + Authors: Joshua Achiam, David Held, Aviv Tamar, Pieter Abbeel. + URL: https://arxiv.org/abs/2007.03964 """ def __init__( diff --git a/omnisafe/algorithms/on_policy/pid_lagrange/trpo_pid.py b/omnisafe/algorithms/on_policy/pid_lagrange/trpo_pid.py index ff4f3689c..097749858 100644 --- a/omnisafe/algorithms/on_policy/pid_lagrange/trpo_pid.py +++ b/omnisafe/algorithms/on_policy/pid_lagrange/trpo_pid.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of the TRPO PID-Lagrange algorithm.""" +"""Implementation of the PID-Lagrange version of the TRPO algorithm.""" import torch @@ -23,13 +23,12 @@ @registry.register class TRPOPid(TRPO, PIDLagrangian): - """The Responsive Safety in Reinforcement Learning by PID Lagrangian Methods. + """The PID-Lagrange version of the TRPO algorithm. References: - Paper Name: Responsive Safety in Reinforcement Learning by PID Lagrangian Methods. - Paper author: Joshua Achiam, David Held, Aviv Tamar, Pieter Abbeel. - Paper URL: https://arxiv.org/abs/2007.03964 - + Title: Responsive Safety in Reinforcement Learning by PID Lagrangian Methods + Authors: Joshua Achiam, David Held, Aviv Tamar, Pieter Abbeel. 
+ URL: https://arxiv.org/abs/2007.03964 """ def __init__( @@ -44,7 +43,6 @@ def __init__( cfgs=cfgs, ) PIDLagrangian.__init__(self, **self.cfgs.PID_cfgs._asdict()) - self.cost_limit = self.cfgs.cost_limit def algorithm_specific_logs(self): diff --git a/omnisafe/algorithms/on_policy/saute/ppo_lag_saute.py b/omnisafe/algorithms/on_policy/saute/ppo_lag_saute.py index 1f5b4efcb..7417cc7a0 100644 --- a/omnisafe/algorithms/on_policy/saute/ppo_lag_saute.py +++ b/omnisafe/algorithms/on_policy/saute/ppo_lag_saute.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of the Saute algorithm.""" +"""Implementation of the Lagrange version of the Saute algorithm using PPOLag.""" from omnisafe.algorithms import registry from omnisafe.algorithms.on_policy.naive_lagrange.ppo_lag import PPOLag @@ -20,13 +20,13 @@ @registry.register class PPOLagSaute(PPOLag): - """Saute algorithm implemented by PPOLag. + """The Saute algorithm implemented with PPOLag. References: - Paper Name: Saute RL: Almost Surely Safe Reinforcement Learning Using State Augmentation. - Paper author: Aivar Sootla, Alexander I. Cowen-Rivers, Taher Jafferjee, Ziyan Wang, - David Mguni, Jun Wang, Haitham Bou-Ammar. - Paper URL: https://arxiv.org/abs/2202.06558 + Title: Saute RL: Almost Surely Safe Reinforcement Learning Using State Augmentation + Authors: Aivar Sootla, Alexander I. Cowen-Rivers, Taher Jafferjee, Ziyan Wang, David Mguni, + Jun Wang, Haitham Bou-Ammar. + URL: https://arxiv.org/abs/2202.06558 """ # pylint: disable-next=too-many-arguments diff --git a/omnisafe/algorithms/on_policy/saute/ppo_saute.py b/omnisafe/algorithms/on_policy/saute/ppo_saute.py index c95fdd2fb..4bcf775d9 100644 --- a/omnisafe/algorithms/on_policy/saute/ppo_saute.py +++ b/omnisafe/algorithms/on_policy/saute/ppo_saute.py @@ -20,13 +20,13 @@ @registry.register class PPOSaute(PPO): - r"""Saute algorithm implemented by PPO. + """The Saute algorithm implemented with PPO. References: - Paper Name: Saute RL: Almost Surely Safe Reinforcement Learning Using State Augmentation. - Paper author: Aivar Sootla, Alexander I. Cowen-Rivers, Taher Jafferjee, Ziyan Wang, - David Mguni, Jun Wang, Haitham Bou-Ammar. - Paper URL: https://arxiv.org/abs/2202.06558 + Title: Saute RL: Almost Surely Safe Reinforcement Learning Using State Augmentation + Authors: Aivar Sootla, Alexander I. Cowen-Rivers, Taher Jafferjee, Ziyan Wang, David Mguni, + Jun Wang, Haitham Bou-Ammar. + URL: https://arxiv.org/abs/2202.06558 """ # pylint: disable-next=too-many-arguments @@ -34,10 +34,8 @@ def __init__( self, env_id, cfgs, - clip=0.2, ) -> None: """Initialize PPOSaute.""" - self.clip = clip super().__init__( env_id=env_id, cfgs=cfgs, diff --git a/omnisafe/algorithms/on_policy/second_order/cpo.py b/omnisafe/algorithms/on_policy/second_order/cpo.py index b5ee65bc5..792ad91ed 100644 --- a/omnisafe/algorithms/on_policy/second_order/cpo.py +++ b/omnisafe/algorithms/on_policy/second_order/cpo.py @@ -30,13 +30,12 @@ @registry.register class CPO(TRPO): - """The Constrained Policy Optimization (CPO) Algorithm. + """The Constrained Policy Optimization (CPO) algorithm. References: - Paper Name: Constrained Policy Optimization. - Paper author: Joshua Achiam, David Held, Aviv Tamar, Pieter Abbeel. 
- Paper URL: https://arxiv.org/abs/1705.10528 - + Title: Constrained Policy Optimization + Authors: Joshua Achiam, David Held, Aviv Tamar, Pieter Abbeel. + URL: https://arxiv.org/abs/1705.10528 """ def __init__( diff --git a/omnisafe/algorithms/on_policy/second_order/pcpo.py b/omnisafe/algorithms/on_policy/second_order/pcpo.py index 5785bda3c..d549c69af 100644 --- a/omnisafe/algorithms/on_policy/second_order/pcpo.py +++ b/omnisafe/algorithms/on_policy/second_order/pcpo.py @@ -32,10 +32,9 @@ class PCPO(TRPO): """The Projection-Based Constrained Policy Optimization (PCPO) algorithm. References: - Paper name: Projection-Based Constrained Policy Optimization. - Paper author: Tsung-Yen Yang, Justinian Rosca, Karthik Narasimhan, Peter J. Ramadge - Paper URL: https://arxiv.org/abs/2010.03152 - + Title: Projection-Based Constrained Policy Optimization + Authors: Tsung-Yen Yang, Justinian Rosca, Karthik Narasimhan, Peter J. Ramadge. + URL: https://arxiv.org/abs/2010.03152 """ def __init__( diff --git a/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_pid.py b/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_pid.py index f91596562..987a954c4 100644 --- a/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_pid.py +++ b/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_pid.py @@ -12,8 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of the PID Simmer algorithm by PPOLag.""" - +"""Implementation of the PID version of the Simmer algorithm using PPOLag.""" from omnisafe.algorithms import registry from omnisafe.algorithms.on_policy.naive_lagrange.ppo_lag import PPOLag @@ -21,12 +20,12 @@ @registry.register class PPOLagSimmerPid(PPOLag): - """Simmer algorithm (PID version) implemented by PPOLag. + """The PID version of the Simmer algorithm implemented with PPOLag. References: - Paper Name: Effects of Safety State Augmentation on Safe Exploration. - Paper author: Aivar Sootla, Alexander I. Cowen-Rivers, Jun Wang, Haitham Bou Ammar. - Paper URL: https://arxiv.org/abs/2206.02675 + Title: Effects of Safety State Augmentation on Safe Exploration + Authors: Aivar Sootla, Alexander I. Cowen-Rivers, Jun Wang, Haitham Bou Ammar. + URL: https://arxiv.org/abs/2206.02675 """ # pylint: disable-next=too-many-arguments diff --git a/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_q.py b/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_q.py index 1647f1fe9..940b70112 100644 --- a/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_q.py +++ b/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_q.py @@ -12,8 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of the Q Simmer algorithm by PPOLag.""" - +"""Implementation of the Q Simmer algorithm using PPOLag.""" from omnisafe.algorithms import registry from omnisafe.algorithms.on_policy.naive_lagrange.ppo_lag import PPOLag @@ -21,12 +20,12 @@ @registry.register class PPOLagSimmerQ(PPOLag): - """Simmer algorithm (Q version) implemented by PPOLag. + """The Q Simmer algorithm implemented with PPOLag. References: - Paper Name: Effects of Safety State Augmentation on Safe Exploration. - Paper author: Aivar Sootla, Alexander I. Cowen-Rivers, Jun Wang, Haitham Bou Ammar. 
- Paper URL: https://arxiv.org/abs/2206.02675 + Title: Effects of Safety State Augmentation on Safe Exploration + Authors: Aivar Sootla, Alexander I. Cowen-Rivers, Jun Wang, Haitham Bou Ammar. + URL: https://arxiv.org/abs/2206.02675 """ # pylint: disable-next=too-many-arguments diff --git a/omnisafe/algorithms/on_policy/simmer/ppo_simmer_pid.py b/omnisafe/algorithms/on_policy/simmer/ppo_simmer_pid.py index 131cce164..5fe5ee746 100644 --- a/omnisafe/algorithms/on_policy/simmer/ppo_simmer_pid.py +++ b/omnisafe/algorithms/on_policy/simmer/ppo_simmer_pid.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of the PID Simmer algorithm by PPOLag.""" +"""Implementation of the PID version of the Simmer algorithm using PPO.""" from omnisafe.algorithms import registry from omnisafe.algorithms.on_policy.base.ppo import PPO @@ -20,12 +20,12 @@ @registry.register class PPOSimmerPid(PPO): - """Simmer algorithm (PID version) implemented by PPO. + """The PID version of the Simmer algorithm implemented with PPO. References: - Paper Name: Effects of Safety State Augmentation on Safe Exploration. - Paper author: Aivar Sootla, Alexander I. Cowen-Rivers, Jun Wang, Haitham Bou Ammar. - Paper URL: https://arxiv.org/abs/2206.02675 + Title: Effects of Safety State Augmentation on Safe Exploration + Authors: Aivar Sootla, Alexander I. Cowen-Rivers, Jun Wang, Haitham Bou Ammar. + URL: https://arxiv.org/abs/2206.02675 """ # pylint: disable-next=too-many-arguments diff --git a/omnisafe/algorithms/on_policy/simmer/ppo_simmer_q.py b/omnisafe/algorithms/on_policy/simmer/ppo_simmer_q.py index f9e9d4b50..44d257eeb 100644 --- a/omnisafe/algorithms/on_policy/simmer/ppo_simmer_q.py +++ b/omnisafe/algorithms/on_policy/simmer/ppo_simmer_q.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of the Q Simmer algorithm by PPOLag.""" +"""Implementation of the Q Simmer algorithm using PPO.""" from omnisafe.algorithms import registry from omnisafe.algorithms.on_policy.base.ppo import PPO @@ -20,12 +20,12 @@ @registry.register class PPOSimmerQ(PPO): - """Simmer algorithm (Q version) implemented by PPO. + """The Q Simmer algorithm implemented with PPO. References: - Paper Name: Effects of Safety State Augmentation on Safe Exploration. - Paper author: Aivar Sootla, Alexander I. Cowen-Rivers, Jun Wang, Haitham Bou Ammar. - Paper URL: https://arxiv.org/abs/2206.02675 + Title: Effects of Safety State Augmentation on Safe Exploration + Authors: Aivar Sootla, Alexander I. Cowen-Rivers, Jun Wang, Haitham Bou Ammar. + URL: https://arxiv.org/abs/2206.02675 """ # pylint: disable-next=too-many-arguments diff --git a/omnisafe/algorithms/registry.py b/omnisafe/algorithms/registry.py index 464307373..72f39dc19 100644 --- a/omnisafe/algorithms/registry.py +++ b/omnisafe/algorithms/registry.py @@ -19,6 +19,7 @@ class Registry: """A registry to map strings to classes. + Args: name (str): Registry name. 
""" @@ -28,10 +29,9 @@ def __init__(self, name): self._module_dict = {} def __repr__(self): - format_str = ( - self.__class__.__name__ + f'(name={self._name}, items={list(self._module_dict.keys())})' + return ( + f'{self.__class__.__name__ }(name={self._name}, items={list(self._module_dict.keys())})' ) - return format_str @property def name(self): From ec55b5705209114c82fc3b7cc9f3f573a51d1892 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Fri, 23 Dec 2022 12:17:25 +0800 Subject: [PATCH 34/39] docs: update docstrings --- omnisafe/algorithms/off_policy/ddpg_lag.py | 5 ++--- omnisafe/algorithms/off_policy/sac_lag.py | 1 - omnisafe/algorithms/on_policy/first_order/cup.py | 2 -- omnisafe/algorithms/on_policy/first_order/focops.py | 2 -- omnisafe/algorithms/on_policy/naive_lagrange/npg_lag.py | 1 - omnisafe/algorithms/on_policy/pid_lagrange/cppo_pid.py | 1 - omnisafe/algorithms/on_policy/pid_lagrange/trpo_pid.py | 1 - 7 files changed, 2 insertions(+), 11 deletions(-) diff --git a/omnisafe/algorithms/off_policy/ddpg_lag.py b/omnisafe/algorithms/off_policy/ddpg_lag.py index 816a1201f..1b71c7a43 100644 --- a/omnisafe/algorithms/off_policy/ddpg_lag.py +++ b/omnisafe/algorithms/off_policy/ddpg_lag.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of the DDPGLag algorithm.""" +"""Implementation of the Lagrange version of the DDPG algorithm.""" from omnisafe.algorithms import registry from omnisafe.algorithms.off_policy.ddpg import DDPG @@ -21,7 +21,7 @@ @registry.register class DDPGLag(DDPG, Lagrange): # pylint: disable=too-many-instance-attributes - """The Deep Deterministic Policy Gradient (DDPG) algorithm. + """The Lagrange version of the DDPG Algorithm. 
References: Title: Continuous control with deep reinforcement learning @@ -41,7 +41,6 @@ def __init__( env_id=env_id, cfgs=cfgs, ) - Lagrange.__init__( self, cost_limit=self.cfgs.lagrange_cfgs.cost_limit, diff --git a/omnisafe/algorithms/off_policy/sac_lag.py b/omnisafe/algorithms/off_policy/sac_lag.py index 2fb7d45db..21811f3cf 100644 --- a/omnisafe/algorithms/off_policy/sac_lag.py +++ b/omnisafe/algorithms/off_policy/sac_lag.py @@ -49,7 +49,6 @@ def __init__( env_id=env_id, cfgs=cfgs, ) - Lagrange.__init__( self, cost_limit=self.cfgs.lagrange_cfgs.cost_limit, diff --git a/omnisafe/algorithms/on_policy/first_order/cup.py b/omnisafe/algorithms/on_policy/first_order/cup.py index 043d3027e..81f093dd1 100644 --- a/omnisafe/algorithms/on_policy/first_order/cup.py +++ b/omnisafe/algorithms/on_policy/first_order/cup.py @@ -39,13 +39,11 @@ def __init__( cfgs, ): r"""The :meth:`init` function.""" - PolicyGradient.__init__( self, env_id=env_id, cfgs=cfgs, ) - Lagrange.__init__( self, cost_limit=self.cfgs.lagrange_cfgs.cost_limit, diff --git a/omnisafe/algorithms/on_policy/first_order/focops.py b/omnisafe/algorithms/on_policy/first_order/focops.py index df29cc45a..4e5c76fce 100644 --- a/omnisafe/algorithms/on_policy/first_order/focops.py +++ b/omnisafe/algorithms/on_policy/first_order/focops.py @@ -38,13 +38,11 @@ def __init__( cfgs, ): r"""The :meth:`init` function.""" - PolicyGradient.__init__( self, env_id=env_id, cfgs=cfgs, ) - Lagrange.__init__( self, cost_limit=self.cfgs.lagrange_cfgs.cost_limit, diff --git a/omnisafe/algorithms/on_policy/naive_lagrange/npg_lag.py b/omnisafe/algorithms/on_policy/naive_lagrange/npg_lag.py index 3869c0a78..f68d4db8e 100644 --- a/omnisafe/algorithms/on_policy/naive_lagrange/npg_lag.py +++ b/omnisafe/algorithms/on_policy/naive_lagrange/npg_lag.py @@ -34,7 +34,6 @@ def __init__( cfgs, ): """initialize""" - NaturalPG.__init__( self, env_id=env_id, diff --git a/omnisafe/algorithms/on_policy/pid_lagrange/cppo_pid.py b/omnisafe/algorithms/on_policy/pid_lagrange/cppo_pid.py index 901272e46..161a6c668 100644 --- a/omnisafe/algorithms/on_policy/pid_lagrange/cppo_pid.py +++ b/omnisafe/algorithms/on_policy/pid_lagrange/cppo_pid.py @@ -36,7 +36,6 @@ def __init__( env_id, cfgs, ): - PolicyGradient.__init__( self, env_id=env_id, diff --git a/omnisafe/algorithms/on_policy/pid_lagrange/trpo_pid.py b/omnisafe/algorithms/on_policy/pid_lagrange/trpo_pid.py index 097749858..956979df3 100644 --- a/omnisafe/algorithms/on_policy/pid_lagrange/trpo_pid.py +++ b/omnisafe/algorithms/on_policy/pid_lagrange/trpo_pid.py @@ -36,7 +36,6 @@ def __init__( env_id, cfgs, ): - TRPO.__init__( self, env_id=env_id, From bab39645788cf7ccf3c401d5af3f3239fc7a709b Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Fri, 23 Dec 2022 15:26:03 +0800 Subject: [PATCH 35/39] style: cleanup `__init__` arguments --- omnisafe/algorithms/on_policy/base/natural_pg.py | 12 ++---------- .../algorithms/on_policy/base/policy_gradient.py | 7 +------ omnisafe/algorithms/on_policy/base/ppo.py | 7 +------ omnisafe/algorithms/on_policy/base/trpo.py | 11 ++--------- .../early_terminated/ppo_early_terminated.py | 12 ++---------- .../early_terminated/ppo_lag_early_terminated.py | 12 ++---------- omnisafe/algorithms/on_policy/first_order/cup.py | 6 +----- omnisafe/algorithms/on_policy/first_order/focops.py | 6 +----- .../algorithms/on_policy/naive_lagrange/npg_lag.py | 6 +----- omnisafe/algorithms/on_policy/naive_lagrange/pdo.py | 6 +----- .../algorithms/on_policy/naive_lagrange/ppo_lag.py | 7 +------ 
.../algorithms/on_policy/naive_lagrange/trpo_lag.py | 6 +----- .../algorithms/on_policy/pid_lagrange/cppo_pid.py | 6 +----- .../algorithms/on_policy/pid_lagrange/trpo_pid.py | 6 +----- omnisafe/algorithms/on_policy/saute/ppo_lag_saute.py | 12 ++---------- omnisafe/algorithms/on_policy/saute/ppo_saute.py | 12 ++---------- omnisafe/algorithms/on_policy/second_order/cpo.py | 11 ++--------- omnisafe/algorithms/on_policy/second_order/pcpo.py | 11 ++--------- .../on_policy/simmer/ppo_lag_simmer_pid.py | 12 ++---------- .../algorithms/on_policy/simmer/ppo_lag_simmer_q.py | 12 ++---------- .../algorithms/on_policy/simmer/ppo_simmer_pid.py | 12 ++---------- omnisafe/algorithms/on_policy/simmer/ppo_simmer_q.py | 12 ++---------- 22 files changed, 34 insertions(+), 170 deletions(-) diff --git a/omnisafe/algorithms/on_policy/base/natural_pg.py b/omnisafe/algorithms/on_policy/base/natural_pg.py index 2634722eb..6a4f0b40d 100644 --- a/omnisafe/algorithms/on_policy/base/natural_pg.py +++ b/omnisafe/algorithms/on_policy/base/natural_pg.py @@ -37,21 +37,13 @@ class NaturalPG(PolicyGradient): URL: https://proceedings.neurips.cc/paper/2001/file/4b86abe48d358ecf194c56c69108433e-Paper.pdf """ - def __init__( - self, - env_id, - cfgs, - ): - super().__init__( - env_id=env_id, - cfgs=cfgs, - ) + def __init__(self, env_id, cfgs) -> None: + super().__init__(env_id=env_id, cfgs=cfgs) self.cg_damping = cfgs.cg_damping self.cg_iters = cfgs.cg_iters self.target_kl = cfgs.target_kl self.fvp_obs = cfgs.fvp_obs - # pylint: disable-next=too-many-arguments,unused-argument def search_step_size(self, step_dir): """ NPG use full step_size diff --git a/omnisafe/algorithms/on_policy/base/policy_gradient.py b/omnisafe/algorithms/on_policy/base/policy_gradient.py index 965af8e5e..ee3cb8a6d 100644 --- a/omnisafe/algorithms/on_policy/base/policy_gradient.py +++ b/omnisafe/algorithms/on_policy/base/policy_gradient.py @@ -40,12 +40,7 @@ class PolicyGradient: # pylint: disable=too-many-instance-attributes URL: https://proceedings.neurips.cc/paper/1999/file/464d828b85b0bed98e80ade0a5c43b0f-Paper.pdf """ - # pylint: disable-next=too-many-locals - def __init__( - self, - env_id, - cfgs=None, - ) -> None: + def __init__(self, env_id, cfgs=None) -> None: """Initialize the algorithm. 
Args: diff --git a/omnisafe/algorithms/on_policy/base/ppo.py b/omnisafe/algorithms/on_policy/base/ppo.py index 8e4bd7dcb..d2e651554 100644 --- a/omnisafe/algorithms/on_policy/base/ppo.py +++ b/omnisafe/algorithms/on_policy/base/ppo.py @@ -31,12 +31,7 @@ class PPO(PolicyGradient): URL: https://arxiv.org/abs/1707.06347 """ - # pylint: disable-next=too-many-arguments - def __init__( - self, - env_id, - cfgs, - ): + def __init__(self, env_id, cfgs) -> None: """Initialize PPO.""" self.clip = cfgs.clip super().__init__( diff --git a/omnisafe/algorithms/on_policy/base/trpo.py b/omnisafe/algorithms/on_policy/base/trpo.py index fe7ed362f..2ae594094 100644 --- a/omnisafe/algorithms/on_policy/base/trpo.py +++ b/omnisafe/algorithms/on_policy/base/trpo.py @@ -37,15 +37,8 @@ class TRPO(NaturalPG): URL: https://arxiv.org/abs/1502.05477 """ - def __init__( - self, - env_id, - cfgs, - ): - super().__init__( - env_id=env_id, - cfgs=cfgs, - ) + def __init__(self, env_id, cfgs) -> None: + super().__init__(env_id=env_id, cfgs=cfgs) # pylint: disable-next=too-many-arguments,too-many-locals,arguments-differ def search_step_size( diff --git a/omnisafe/algorithms/on_policy/early_terminated/ppo_early_terminated.py b/omnisafe/algorithms/on_policy/early_terminated/ppo_early_terminated.py index 922e5a9f1..aa62ec519 100644 --- a/omnisafe/algorithms/on_policy/early_terminated/ppo_early_terminated.py +++ b/omnisafe/algorithms/on_policy/early_terminated/ppo_early_terminated.py @@ -28,14 +28,6 @@ class PPOEarlyTerminated(PPO): URL: https://arxiv.org/abs/2107.04200 """ - # pylint: disable-next=too-many-arguments - def __init__( - self, - env_id, - cfgs, - ) -> None: + def __init__(self, env_id, cfgs) -> None: """Initialize PPO_Earyly_Terminated.""" - super().__init__( - env_id=env_id, - cfgs=cfgs, - ) + super().__init__(env_id=env_id, cfgs=cfgs) diff --git a/omnisafe/algorithms/on_policy/early_terminated/ppo_lag_early_terminated.py b/omnisafe/algorithms/on_policy/early_terminated/ppo_lag_early_terminated.py index e8301270b..e0827fffa 100644 --- a/omnisafe/algorithms/on_policy/early_terminated/ppo_lag_early_terminated.py +++ b/omnisafe/algorithms/on_policy/early_terminated/ppo_lag_early_terminated.py @@ -28,14 +28,6 @@ class PPOLagEarlyTerminated(PPOLag): URL: https://arxiv.org/abs/2107.04200 """ - # pylint: disable-next=too-many-arguments - def __init__( - self, - env_id, - cfgs, - ) -> None: + def __init__(self, env_id, cfgs) -> None: """Initialize PPO_Lag_Earyly_Terminated.""" - super().__init__( - env_id=env_id, - cfgs=cfgs, - ) + super().__init__(env_id=env_id, cfgs=cfgs) diff --git a/omnisafe/algorithms/on_policy/first_order/cup.py b/omnisafe/algorithms/on_policy/first_order/cup.py index 81f093dd1..0a70cc2c0 100644 --- a/omnisafe/algorithms/on_policy/first_order/cup.py +++ b/omnisafe/algorithms/on_policy/first_order/cup.py @@ -33,11 +33,7 @@ class CUP(PolicyGradient, Lagrange): URL: https://arxiv.org/abs/2209.07089 """ - def __init__( - self, - env_id, - cfgs, - ): + def __init__(self, env_id, cfgs) -> None: r"""The :meth:`init` function.""" PolicyGradient.__init__( self, diff --git a/omnisafe/algorithms/on_policy/first_order/focops.py b/omnisafe/algorithms/on_policy/first_order/focops.py index 4e5c76fce..1efff805a 100644 --- a/omnisafe/algorithms/on_policy/first_order/focops.py +++ b/omnisafe/algorithms/on_policy/first_order/focops.py @@ -32,11 +32,7 @@ class FOCOPS(PolicyGradient, Lagrange): URL: https://arxiv.org/abs/2002.06506 """ - def __init__( - self, - env_id, - cfgs, - ): + def __init__(self, env_id, cfgs) -> 
None: r"""The :meth:`init` function.""" PolicyGradient.__init__( self, diff --git a/omnisafe/algorithms/on_policy/naive_lagrange/npg_lag.py b/omnisafe/algorithms/on_policy/naive_lagrange/npg_lag.py index f68d4db8e..323173ee8 100644 --- a/omnisafe/algorithms/on_policy/naive_lagrange/npg_lag.py +++ b/omnisafe/algorithms/on_policy/naive_lagrange/npg_lag.py @@ -28,11 +28,7 @@ class NPGLag(NaturalPG, Lagrange): A simple combination of the Lagrange method and the Natural Policy Gradient algorithm. """ - def __init__( - self, - env_id, - cfgs, - ): + def __init__(self, env_id, cfgs) -> None: """initialize""" NaturalPG.__init__( self, diff --git a/omnisafe/algorithms/on_policy/naive_lagrange/pdo.py b/omnisafe/algorithms/on_policy/naive_lagrange/pdo.py index c5736443b..85d7d7416 100644 --- a/omnisafe/algorithms/on_policy/naive_lagrange/pdo.py +++ b/omnisafe/algorithms/on_policy/naive_lagrange/pdo.py @@ -28,11 +28,7 @@ class PDO(PolicyGradient, Lagrange): A simple combination of the Lagrange method and the Policy Gradient algorithm. """ - def __init__( - self, - env_id, - cfgs, - ): + def __init__(self, env_id, cfgs) -> None: """initialization""" PolicyGradient.__init__( self, diff --git a/omnisafe/algorithms/on_policy/naive_lagrange/ppo_lag.py b/omnisafe/algorithms/on_policy/naive_lagrange/ppo_lag.py index 952b12f74..f120c0463 100644 --- a/omnisafe/algorithms/on_policy/naive_lagrange/ppo_lag.py +++ b/omnisafe/algorithms/on_policy/naive_lagrange/ppo_lag.py @@ -31,12 +31,7 @@ class PPOLag(PolicyGradient, Lagrange): URL: https://cdn.openai.com/safexp-short.pdf """ - # pylint: disable-next=too-many-arguments - def __init__( - self, - env_id, - cfgs, - ): + def __init__(self, env_id, cfgs) -> None: """Initialize PPO-Lag algorithm.""" self.clip = cfgs.clip PolicyGradient.__init__( diff --git a/omnisafe/algorithms/on_policy/naive_lagrange/trpo_lag.py b/omnisafe/algorithms/on_policy/naive_lagrange/trpo_lag.py index 746ba50dd..d6e1c2df3 100644 --- a/omnisafe/algorithms/on_policy/naive_lagrange/trpo_lag.py +++ b/omnisafe/algorithms/on_policy/naive_lagrange/trpo_lag.py @@ -32,11 +32,7 @@ class TRPOLag(TRPO, Lagrange): """ - def __init__( - self, - env_id, - cfgs, - ): + def __init__(self, env_id, cfgs) -> None: """initialize""" TRPO.__init__( self, diff --git a/omnisafe/algorithms/on_policy/pid_lagrange/cppo_pid.py b/omnisafe/algorithms/on_policy/pid_lagrange/cppo_pid.py index 161a6c668..34a248149 100644 --- a/omnisafe/algorithms/on_policy/pid_lagrange/cppo_pid.py +++ b/omnisafe/algorithms/on_policy/pid_lagrange/cppo_pid.py @@ -31,11 +31,7 @@ class CPPOPid(PolicyGradient, PIDLagrangian): URL: https://arxiv.org/abs/2007.03964 """ - def __init__( - self, - env_id, - cfgs, - ): + def __init__(self, env_id, cfgs) -> None: PolicyGradient.__init__( self, env_id=env_id, diff --git a/omnisafe/algorithms/on_policy/pid_lagrange/trpo_pid.py b/omnisafe/algorithms/on_policy/pid_lagrange/trpo_pid.py index 956979df3..408b8205a 100644 --- a/omnisafe/algorithms/on_policy/pid_lagrange/trpo_pid.py +++ b/omnisafe/algorithms/on_policy/pid_lagrange/trpo_pid.py @@ -31,11 +31,7 @@ class TRPOPid(TRPO, PIDLagrangian): URL: https://arxiv.org/abs/2007.03964 """ - def __init__( - self, - env_id, - cfgs, - ): + def __init__(self, env_id, cfgs) -> None: TRPO.__init__( self, env_id=env_id, diff --git a/omnisafe/algorithms/on_policy/saute/ppo_lag_saute.py b/omnisafe/algorithms/on_policy/saute/ppo_lag_saute.py index 7417cc7a0..180e48d5f 100644 --- a/omnisafe/algorithms/on_policy/saute/ppo_lag_saute.py +++ 
b/omnisafe/algorithms/on_policy/saute/ppo_lag_saute.py @@ -29,17 +29,9 @@ class PPOLagSaute(PPOLag): URL: https://arxiv.org/abs/2202.06558 """ - # pylint: disable-next=too-many-arguments - def __init__( - self, - env_id, - cfgs, - ) -> None: + def __init__(self, env_id, cfgs) -> None: """Initialize PPOLagSaute.""" - super().__init__( - env_id=env_id, - cfgs=cfgs, - ) + super().__init__(env_id=env_id, cfgs=cfgs) def algorithm_specific_logs(self): super().algorithm_specific_logs() diff --git a/omnisafe/algorithms/on_policy/saute/ppo_saute.py b/omnisafe/algorithms/on_policy/saute/ppo_saute.py index 4bcf775d9..91e8286f6 100644 --- a/omnisafe/algorithms/on_policy/saute/ppo_saute.py +++ b/omnisafe/algorithms/on_policy/saute/ppo_saute.py @@ -29,17 +29,9 @@ class PPOSaute(PPO): URL: https://arxiv.org/abs/2202.06558 """ - # pylint: disable-next=too-many-arguments - def __init__( - self, - env_id, - cfgs, - ) -> None: + def __init__(self, env_id, cfgs) -> None: """Initialize PPOSaute.""" - super().__init__( - env_id=env_id, - cfgs=cfgs, - ) + super().__init__(env_id=env_id, cfgs=cfgs) def algorithm_specific_logs(self): super().algorithm_specific_logs() diff --git a/omnisafe/algorithms/on_policy/second_order/cpo.py b/omnisafe/algorithms/on_policy/second_order/cpo.py index 792ad91ed..1d465cb6b 100644 --- a/omnisafe/algorithms/on_policy/second_order/cpo.py +++ b/omnisafe/algorithms/on_policy/second_order/cpo.py @@ -38,15 +38,8 @@ class CPO(TRPO): URL: https://arxiv.org/abs/1705.10528 """ - def __init__( - self, - env_id, - cfgs, - ): - super().__init__( - env_id=env_id, - cfgs=cfgs, - ) + def __init__(self, env_id, cfgs) -> None: + super().__init__(env_id=env_id, cfgs=cfgs) self.cost_limit = cfgs.cost_limit self.loss_pi_cost_before = 0.0 diff --git a/omnisafe/algorithms/on_policy/second_order/pcpo.py b/omnisafe/algorithms/on_policy/second_order/pcpo.py index d549c69af..e952341af 100644 --- a/omnisafe/algorithms/on_policy/second_order/pcpo.py +++ b/omnisafe/algorithms/on_policy/second_order/pcpo.py @@ -37,15 +37,8 @@ class PCPO(TRPO): URL: https://arxiv.org/abs/2010.03152 """ - def __init__( - self, - env_id, - cfgs, - ): - super().__init__( - env_id=env_id, - cfgs=cfgs, - ) + def __init__(self, env_id, cfgs) -> None: + super().__init__(env_id=env_id, cfgs=cfgs) self.cost_limit = self.cfgs.cost_limit # pylint: disable-next=too-many-locals,too-many-arguments diff --git a/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_pid.py b/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_pid.py index 987a954c4..cd89f093c 100644 --- a/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_pid.py +++ b/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_pid.py @@ -28,17 +28,9 @@ class PPOLagSimmerPid(PPOLag): URL: https://arxiv.org/abs/2206.02675 """ - # pylint: disable-next=too-many-arguments - def __init__( - self, - env_id, - cfgs, - ): + def __init__(self, env_id, cfgs) -> None: """Initialize PPOLagSimmerPid algorithm.""" - super().__init__( - env_id=env_id, - cfgs=cfgs, - ) + super().__init__(env_id=env_id, cfgs=cfgs) def algorithm_specific_logs(self): """Log the algorithm specific metrics.""" diff --git a/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_q.py b/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_q.py index 940b70112..d06fe068f 100644 --- a/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_q.py +++ b/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_q.py @@ -28,17 +28,9 @@ class PPOLagSimmerQ(PPOLag): URL: https://arxiv.org/abs/2206.02675 """ - # pylint: disable-next=too-many-arguments - 
def __init__( - self, - env_id, - cfgs, - ): + def __init__(self, env_id, cfgs) -> None: """Initialize PPOLagSimmerQ algorithm.""" - super().__init__( - env_id=env_id, - cfgs=cfgs, - ) + super().__init__(env_id=env_id, cfgs=cfgs) def algorithm_specific_logs(self): """Log the algorithm specific metrics.""" diff --git a/omnisafe/algorithms/on_policy/simmer/ppo_simmer_pid.py b/omnisafe/algorithms/on_policy/simmer/ppo_simmer_pid.py index 5fe5ee746..de4c4ed76 100644 --- a/omnisafe/algorithms/on_policy/simmer/ppo_simmer_pid.py +++ b/omnisafe/algorithms/on_policy/simmer/ppo_simmer_pid.py @@ -28,17 +28,9 @@ class PPOSimmerPid(PPO): URL: https://arxiv.org/abs/2206.02675 """ - # pylint: disable-next=too-many-arguments - def __init__( - self, - env_id, - cfgs, - ) -> None: + def __init__(self, env_id, cfgs) -> None: """Initialize PPOSimmerPid.""" - super().__init__( - env_id=env_id, - cfgs=cfgs, - ) + super().__init__(env_id=env_id, cfgs=cfgs) def algorithm_specific_logs(self): """Log the algorithm specific metrics.""" diff --git a/omnisafe/algorithms/on_policy/simmer/ppo_simmer_q.py b/omnisafe/algorithms/on_policy/simmer/ppo_simmer_q.py index 44d257eeb..7bd50a0bb 100644 --- a/omnisafe/algorithms/on_policy/simmer/ppo_simmer_q.py +++ b/omnisafe/algorithms/on_policy/simmer/ppo_simmer_q.py @@ -28,17 +28,9 @@ class PPOSimmerQ(PPO): URL: https://arxiv.org/abs/2206.02675 """ - # pylint: disable-next=too-many-arguments - def __init__( - self, - env_id, - cfgs, - ) -> None: + def __init__(self, env_id, cfgs) -> None: """Initialize PPOSimmerQ.""" - super().__init__( - env_id=env_id, - cfgs=cfgs, - ) + super().__init__(env_id=env_id, cfgs=cfgs) def algorithm_specific_logs(self): super().algorithm_specific_logs() From c59a5e944802c6688e844db7c0a43fd1b7c6e8d2 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Fri, 23 Dec 2022 15:28:55 +0800 Subject: [PATCH 36/39] style: cleanup `__init__` arguments --- omnisafe/algorithms/off_policy/ddpg.py | 6 +----- omnisafe/algorithms/off_policy/ddpg_lag.py | 6 +----- omnisafe/algorithms/off_policy/sac.py | 6 +----- omnisafe/algorithms/off_policy/sac_lag.py | 6 +----- omnisafe/algorithms/off_policy/sddpg.py | 8 ++------ omnisafe/algorithms/off_policy/td3.py | 6 +----- omnisafe/algorithms/off_policy/td3_lag.py | 6 +----- 7 files changed, 8 insertions(+), 36 deletions(-) diff --git a/omnisafe/algorithms/off_policy/ddpg.py b/omnisafe/algorithms/off_policy/ddpg.py index 8233a095e..76d11eebd 100644 --- a/omnisafe/algorithms/off_policy/ddpg.py +++ b/omnisafe/algorithms/off_policy/ddpg.py @@ -41,11 +41,7 @@ class DDPG: # pylint: disable=too-many-instance-attributes URL: https://arxiv.org/abs/1509.02971 """ - def __init__( - self, - env_id: str, - cfgs=None, - ): + def __init__(self, env_id: str, cfgs=None) -> None: """Initialize DDPG. 
Args: diff --git a/omnisafe/algorithms/off_policy/ddpg_lag.py b/omnisafe/algorithms/off_policy/ddpg_lag.py index 1b71c7a43..3b9fc2a3f 100644 --- a/omnisafe/algorithms/off_policy/ddpg_lag.py +++ b/omnisafe/algorithms/off_policy/ddpg_lag.py @@ -30,11 +30,7 @@ class DDPGLag(DDPG, Lagrange): # pylint: disable=too-many-instance-attributes URL: https://arxiv.org/abs/1509.02971 """ - def __init__( - self, - env_id: str, - cfgs=None, - ): + def __init__(self, env_id: str, cfgs=None) -> None: """Initialize DDPG.""" DDPG.__init__( self, diff --git a/omnisafe/algorithms/off_policy/sac.py b/omnisafe/algorithms/off_policy/sac.py index b6ddd1cf8..fadf022cd 100644 --- a/omnisafe/algorithms/off_policy/sac.py +++ b/omnisafe/algorithms/off_policy/sac.py @@ -30,11 +30,7 @@ class SAC(DDPG): # pylint: disable=too-many-instance-attributes URL: https://arxiv.org/abs/1801.01290 """ - def __init__( - self, - env_id: str, - cfgs=None, - ): + def __init__(self, env_id: str, cfgs=None) -> None: """Initialize SAC.""" super().__init__( env_id=env_id, diff --git a/omnisafe/algorithms/off_policy/sac_lag.py b/omnisafe/algorithms/off_policy/sac_lag.py index 21811f3cf..d55c626b9 100644 --- a/omnisafe/algorithms/off_policy/sac_lag.py +++ b/omnisafe/algorithms/off_policy/sac_lag.py @@ -31,11 +31,7 @@ class SACLag(SAC, Lagrange): # pylint: disable=too-many-instance-attributes URL: https://arxiv.org/abs/1801.01290 """ - def __init__( - self, - env_id: str, - cfgs=None, - ): + def __init__(self, env_id: str, cfgs=None) -> None: """Initialize SACLag. Args: diff --git a/omnisafe/algorithms/off_policy/sddpg.py b/omnisafe/algorithms/off_policy/sddpg.py index 024f072a3..29a357f27 100644 --- a/omnisafe/algorithms/off_policy/sddpg.py +++ b/omnisafe/algorithms/off_policy/sddpg.py @@ -28,7 +28,7 @@ @registry.register -class SDDPG(DDPG): # pylint: disable=too-many-instance-attributes,invalid-name +class SDDPG(DDPG): # pylint: disable=too-many-instance-attributes,invalid-name """Implementation of the SDDPG algorithm. References: @@ -38,11 +38,7 @@ class SDDPG(DDPG): # pylint: disable=too-many-instance-attributes,invalid-nam URL: https://arxiv.org/abs/1901.10031 """ - def __init__( - self, - env_id: str, - cfgs=None, - ): + def __init__(self, env_id: str, cfgs=None) -> None: """Initialize SDDPG. Args: diff --git a/omnisafe/algorithms/off_policy/td3.py b/omnisafe/algorithms/off_policy/td3.py index 13de54bf2..c811da379 100644 --- a/omnisafe/algorithms/off_policy/td3.py +++ b/omnisafe/algorithms/off_policy/td3.py @@ -30,11 +30,7 @@ class TD3(DDPG): # pylint: disable=too-many-instance-attributes URL: https://arxiv.org/abs/1802.09477 """ - def __init__( - self, - env_id: str, - cfgs=None, - ): + def __init__(self, env_id: str, cfgs=None) -> None: """Initialize DDPG.""" super().__init__( env_id=env_id, diff --git a/omnisafe/algorithms/off_policy/td3_lag.py b/omnisafe/algorithms/off_policy/td3_lag.py index ee3ce0a3d..aa8bd2be0 100644 --- a/omnisafe/algorithms/off_policy/td3_lag.py +++ b/omnisafe/algorithms/off_policy/td3_lag.py @@ -29,11 +29,7 @@ class TD3Lag(TD3, Lagrange): # pylint: disable=too-many-instance-attributes URL: https://arxiv.org/abs/1802.09477 """ - def __init__( - self, - env_id: str, - cfgs=None, - ): + def __init__(self, env_id: str, cfgs=None) -> None: """Initialize TD3. 
Args: From c5e8fddbf1a90f61d42b1a47d7bab17b86313dd5 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Fri, 23 Dec 2022 15:36:46 +0800 Subject: [PATCH 37/39] fix: test for namedtuple --- omnisafe/algorithms/off_policy/ddpg.py | 4 ++-- .../on_policy/base/policy_gradient.py | 6 +++--- .../on_policy/pid_lagrange/cppo_pid.py | 3 ++- .../on_policy/pid_lagrange/trpo_pid.py | 3 ++- omnisafe/utils/config_utils.py | 19 ++++++++----------- tests/test_model.py | 15 ++++++++------- 6 files changed, 25 insertions(+), 25 deletions(-) diff --git a/omnisafe/algorithms/off_policy/ddpg.py b/omnisafe/algorithms/off_policy/ddpg.py index 76d11eebd..3a85311c9 100644 --- a/omnisafe/algorithms/off_policy/ddpg.py +++ b/omnisafe/algorithms/off_policy/ddpg.py @@ -25,7 +25,7 @@ from omnisafe.common.logger import Logger from omnisafe.models.constraint_actor_q_critic import ConstraintActorQCritic from omnisafe.utils import core, distributed_utils -from omnisafe.utils.config_utils import create_dict_from_namedtuple +from omnisafe.utils.config_utils import namedtuple2dict from omnisafe.utils.tools import get_flat_params_from from omnisafe.wrappers import wrapper_registry @@ -89,7 +89,7 @@ def __init__(self, env_id: str, cfgs=None) -> None: # Set up logger and save configuration to disk self.logger = Logger(exp_name=cfgs.exp_name, data_dir=cfgs.data_dir, seed=cfgs.seed) - self.logger.save_config(create_dict_from_namedtuple(cfgs)) + self.logger.save_config(namedtuple2dict(cfgs)) # Set seed seed = cfgs.seed + 10000 * distributed_utils.proc_id() torch.manual_seed(seed) diff --git a/omnisafe/algorithms/on_policy/base/policy_gradient.py b/omnisafe/algorithms/on_policy/base/policy_gradient.py index ee3cb8a6d..a4c62b577 100644 --- a/omnisafe/algorithms/on_policy/base/policy_gradient.py +++ b/omnisafe/algorithms/on_policy/base/policy_gradient.py @@ -25,7 +25,7 @@ from omnisafe.common.logger import Logger from omnisafe.models.constraint_actor_critic import ConstraintActorCritic from omnisafe.utils import core, distributed_utils -from omnisafe.utils.config_utils import create_dict_from_namedtuple +from omnisafe.utils.config_utils import namedtuple2dict from omnisafe.utils.tools import get_flat_params_from from omnisafe.wrappers import wrapper_registry @@ -54,7 +54,7 @@ def __init__(self, env_id, cfgs=None) -> None: self.cfgs = deepcopy(cfgs) self.wrapper_type = self.cfgs.wrapper_type self.env = wrapper_registry.get(self.wrapper_type)( - env_id, cfgs=self.cfgs._asdict().get('env_cfgs') + env_id, cfgs=namedtuple2dict(self.cfgs).get('env_cfgs') ) assert self.cfgs.steps_per_epoch % distributed_utils.num_procs() == 0 @@ -68,7 +68,7 @@ def __init__(self, env_id, cfgs=None) -> None: # Set up logger and save configuration to disk self.logger = Logger(exp_name=cfgs.exp_name, data_dir=cfgs.data_dir, seed=cfgs.seed) - self.logger.save_config(create_dict_from_namedtuple(cfgs)) + self.logger.save_config(namedtuple2dict(cfgs)) # Set seed seed = int(cfgs.seed) + 10000 * distributed_utils.proc_id() torch.manual_seed(seed) diff --git a/omnisafe/algorithms/on_policy/pid_lagrange/cppo_pid.py b/omnisafe/algorithms/on_policy/pid_lagrange/cppo_pid.py index 34a248149..9a4db984e 100644 --- a/omnisafe/algorithms/on_policy/pid_lagrange/cppo_pid.py +++ b/omnisafe/algorithms/on_policy/pid_lagrange/cppo_pid.py @@ -19,6 +19,7 @@ from omnisafe.algorithms import registry from omnisafe.algorithms.on_policy.base.policy_gradient import PolicyGradient from omnisafe.common.pid_lagrange import PIDLagrangian +from omnisafe.utils.config_utils import namedtuple2dict 
@registry.register @@ -37,7 +38,7 @@ def __init__(self, env_id, cfgs) -> None: env_id=env_id, cfgs=cfgs, ) - PIDLagrangian.__init__(self, **self.cfgs.PID_cfgs._asdict()) + PIDLagrangian.__init__(self, **namedtuple2dict(self.cfgs.PID_cfgs)) self.clip = self.cfgs.clip self.cost_limit = self.cfgs.PID_cfgs.cost_limit diff --git a/omnisafe/algorithms/on_policy/pid_lagrange/trpo_pid.py b/omnisafe/algorithms/on_policy/pid_lagrange/trpo_pid.py index 408b8205a..9996f838a 100644 --- a/omnisafe/algorithms/on_policy/pid_lagrange/trpo_pid.py +++ b/omnisafe/algorithms/on_policy/pid_lagrange/trpo_pid.py @@ -19,6 +19,7 @@ from omnisafe.algorithms import registry from omnisafe.algorithms.on_policy.base.trpo import TRPO from omnisafe.common.pid_lagrange import PIDLagrangian +from omnisafe.utils.config_utils import namedtuple2dict @registry.register @@ -37,7 +38,7 @@ def __init__(self, env_id, cfgs) -> None: env_id=env_id, cfgs=cfgs, ) - PIDLagrangian.__init__(self, **self.cfgs.PID_cfgs._asdict()) + PIDLagrangian.__init__(self, **namedtuple2dict(self.cfgs.PID_cfgs)) self.cost_limit = self.cfgs.cost_limit def algorithm_specific_logs(self): diff --git a/omnisafe/utils/config_utils.py b/omnisafe/utils/config_utils.py index 741fe6904..0a9c8056b 100644 --- a/omnisafe/utils/config_utils.py +++ b/omnisafe/utils/config_utils.py @@ -26,18 +26,15 @@ def recursive_update(args: dict, update_args: dict): print(f'{key}:') recursive_update(args[key], update_args[key]) else: - # f-strings: - # https://pylint.pycqa.org/en/latest/user_guide/messages/convention/consider-using-f-string.html args[key] = update_args[key] menus = (key, update_args[key]) print(f'- {menus[0]}: {menus[1]} is update!') elif isinstance(value, dict): recursive_update(value, update_args) + return dict2namedtuple(args) - return create_namedtuple_from_dict(args) - -def create_namedtuple_from_dict(obj): +def dict2namedtuple(obj): """Create namedtuple from dict""" if isinstance(obj, dict): fields = sorted(obj.keys()) @@ -47,7 +44,7 @@ def create_namedtuple_from_dict(obj): rename=True, ) field_value_pairs = OrderedDict( - (str(field), create_namedtuple_from_dict(obj[field])) for field in fields + (str(field), dict2namedtuple(obj[field])) for field in fields ) try: return namedtuple_type(**field_value_pairs) @@ -55,15 +52,15 @@ def create_namedtuple_from_dict(obj): # Cannot create namedtuple instance so fallback to dict (invalid attribute names) return dict(**field_value_pairs) elif isinstance(obj, (list, set, tuple, frozenset)): - return [create_namedtuple_from_dict(item) for item in obj] + return [dict2namedtuple(item) for item in obj] else: return obj -def create_dict_from_namedtuple(obj): - """Create dict from name-tuple""" - if isinstance(obj, tuple): - return {key: create_dict_from_namedtuple(value) for key, value in obj._asdict().items()} +def namedtuple2dict(obj): + """Create a dict from a namedtuple.""" + if isinstance(obj, tuple) and hasattr(obj, '_fields'): + return {key: namedtuple2dict(value) for key, value in obj._asdict().items()} return obj diff --git a/tests/test_model.py b/tests/test_model.py index 368540424..0f0954376 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -22,7 +22,7 @@ import helpers from omnisafe.models import ActorBuilder, CriticBuilder from omnisafe.models.actor_critic import ActorCritic -from omnisafe.utils.config_utils import create_namedtuple_from_dict +from omnisafe.utils.config_utils import dict2namedtuple @helpers.parametrize( @@ -196,12 +196,13 @@ def test_actor_critic( } observation_space = Box(low=-1, 
high=1, shape=(obs_dim,)) - model_cfgs = { - 'ac_kwargs': ac_kwargs, - 'weight_initialization_mode': weight_initialization_mode, - 'shared_weights': shared_weights, - } - model_cfgs = create_namedtuple_from_dict(model_cfgs) + model_cfgs = dict2namedtuple( + { + 'ac_kwargs': ac_kwargs, + 'weight_initialization_mode': weight_initialization_mode, + 'shared_weights': shared_weights, + } + ) if space_type == Discrete: action_space = space_type(act_dim) From 363d40c2088b5cdf1afd2c08a43398904c044f55 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Fri, 23 Dec 2022 16:02:43 +0800 Subject: [PATCH 38/39] chore: appease linters --- .../safety_gymnasium/bases/base_mujoco_task.py | 2 +- omnisafe/common/pid_lagrange.py | 2 +- omnisafe/utils/config_utils.py | 6 +----- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/envs/safety-gymnasium/safety_gymnasium/bases/base_mujoco_task.py b/envs/safety-gymnasium/safety_gymnasium/bases/base_mujoco_task.py index 026d48615..d8a7a09da 100644 --- a/envs/safety-gymnasium/safety_gymnasium/bases/base_mujoco_task.py +++ b/envs/safety-gymnasium/safety_gymnasium/bases/base_mujoco_task.py @@ -18,7 +18,7 @@ from copy import deepcopy from typing import Union -# import gymnasium +import gymnasium # pylint: disable=unused-import import mujoco import numpy as np from gymnasium.envs.mujoco.mujoco_rendering import RenderContextOffscreen, Viewer diff --git a/omnisafe/common/pid_lagrange.py b/omnisafe/common/pid_lagrange.py index 02d3cb9c5..4a04338be 100644 --- a/omnisafe/common/pid_lagrange.py +++ b/omnisafe/common/pid_lagrange.py @@ -34,7 +34,7 @@ def __init__( sum_norm: bool, diff_norm: bool, penalty_max: int, - lagrangian_multiplier_init: 0.001, + lagrangian_multiplier_init: float, cost_limit: int, ): """init""" diff --git a/omnisafe/utils/config_utils.py b/omnisafe/utils/config_utils.py index 0a9c8056b..dba52654d 100644 --- a/omnisafe/utils/config_utils.py +++ b/omnisafe/utils/config_utils.py @@ -38,11 +38,7 @@ def dict2namedtuple(obj): """Create namedtuple from dict""" if isinstance(obj, dict): fields = sorted(obj.keys()) - namedtuple_type = namedtuple( - typename='GenericObject', - field_names=fields, - rename=True, - ) + namedtuple_type = namedtuple('GenericObject', fields, rename=True) field_value_pairs = OrderedDict( (str(field), dict2namedtuple(obj[field])) for field in fields ) From 6654837ee7d1241b01cd843abc1f08b4cb492568 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Fri, 23 Dec 2022 16:17:22 +0800 Subject: [PATCH 39/39] docs(README): update README.md --- README.md | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 793df1e96..f40c2ff87 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ # OmniSafe -OmniSafe is a comprehensive and trustworthy benchmark for safe reinforcement learning, covering a multitude of SafeRL domains and delivering a new suite of testing environments. +OmniSafe is a comprehensive and reliable benchmark for safe reinforcement learning, covering a multitude of SafeRL domains and delivering a new suite of testing environments. The simulation environment around OmniSafe and a series of reliable algorithm implementations will help the SafeRL research community easier to replicate and improve the excellent work already done while also helping to facilitate the validation of new ideas and new algorithms. 
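The `namedtuple` helpers reworked in the two `config_utils` patches above are symmetric: `dict2namedtuple` turns a nested configuration dict into nested namedtuples, and `namedtuple2dict` converts them back, with the `hasattr(obj, '_fields')` guard letting plain tuples pass through untouched. A minimal round-trip sketch (the config keys and values here are illustrative only, not taken from the shipped defaults) could look like this:

```python
from omnisafe.utils.config_utils import dict2namedtuple, namedtuple2dict

# Illustrative nested config (keys and values are made up for this sketch).
cfgs_dict = {
    'steps_per_epoch': 2048,
    'lagrange_cfgs': {'cost_limit': 25.0, 'lambda_lr': 0.035},
}

cfgs = dict2namedtuple(cfgs_dict)
# Nested dicts become nested namedtuples, so dotted attribute access works.
assert cfgs.lagrange_cfgs.cost_limit == 25.0

# Converting back only unpacks true namedtuples (objects that define `_fields`);
# plain tuples and scalar values are returned unchanged.
assert namedtuple2dict(cfgs) == cfgs_dict
assert namedtuple2dict((1, 2, 3)) == (1, 2, 3)
```

Under the previous `isinstance(obj, tuple)` check, the last call would have attempted `(1, 2, 3)._asdict()` and raised `AttributeError`, which is the failure mode the patch above guards against.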
@@ -128,8 +128,7 @@ Safe Exploration (Swimmer)](https://arxiv.org/abs/2206.02675) - [X] [Safe Exploration in Continuous Action Spaces (Safety Layer)](https://arxiv.org/abs/1801.08757) - [ ] **[RA-L 2021]** [Recovery RL: Safe Reinforcement Learning with Learned Recovery Zones](https://arxiv.org/abs/2010.15920) - [ ] **[ICML 2022]** [Sauté RL: Almost Surely Safe Reinforcement Learning Using State Augmentation (SauteRL)](https://arxiv.org/abs/2202.06558) -- [ ] **[NeurIPS 2022]** [Effects of Safety State Augmentation on -Safe Exploration](https://arxiv.org/abs/2206.02675) +- [ ] **[NeurIPS 2022]** [Effects of Safety State Augmentation on Safe Exploration](https://arxiv.org/abs/2206.02675) -------------------------------------------------------------------------------- @@ -139,9 +138,9 @@ Safe Exploration](https://arxiv.org/abs/2206.02675) We designed a variety of safety-enhanced learning tasks around the latest version of Gymnasium, including safety-run, safety-circle, safety-goal, safety-button, etc., leading to a unified safety-enhanced learning benchmark environment called `safety-gymnasium`. -Further, to facilitate the progress of community research, we redesigned [Safety_Gym](https://github.com/openai/safety-gym), removed the dependency on mujoco_py, made it created on top of [Mujoco](https://github.com/deepmind/mujoco), and fixed some bugs. +Further, to facilitate the progress of community research, we redesigned [Safety-Gym](https://github.com/openai/safety-gym) and removed the dependency on `mujoco-py`. We built it on top of [MuJoCo](https://github.com/deepmind/mujoco) and fixed some bugs. -After careful testing, we confirmed that it has the same dynamics parameters and training environment as the original safety gym, named `safety-gymnasium`. +After careful testing, we confirmed that it has the same dynamics parameters and training environment as the original `safety-gym`, named `safety-gymnasium`. Here is a list of all the environments we support, some of them are being tested in our baseline and we will gradually release them within a month. @@ -204,7 +203,7 @@ For the appetizer, the images are as follows ### Environment Usage -**Notes:** We support new **Gym APIs**. +**Notes:** We support new [**Gymnasium APIs**](https://github.com/Farama-Foundation/Gymnasium). ```python import safety_gymnasium @@ -300,12 +299,12 @@ agent.learn() # if done: # obs = env.reset() # env.close() - ``` ### 3. Run Agent from custom terminal config -cd `omnisafe/examples` and run -```python +```bash +cd examples python train_on_policy.py --env-id SafetyPointGoal1-v0 --algo PPOLag --parallel 5 --epochs 1 ```
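As a companion to the environment-usage snippet referenced above, a minimal Gymnasium-style interaction loop is sketched below. It assumes the `safety_gymnasium.make` entry point and a `step` that returns a cost signal alongside the usual Gymnasium tuple; the task id is only an example, and any supported environment id can be substituted.

```python
import safety_gymnasium

# Build one of the safety tasks (the id mirrors the CLI example above).
env = safety_gymnasium.make('SafetyPointGoal1-v0')

obs, info = env.reset(seed=0)
ep_ret, ep_cost = 0.0, 0.0
for _ in range(1000):
    action = env.action_space.sample()  # a random policy, just to drive the loop
    # Safety tasks report a cost term in addition to the usual Gymnasium tuple.
    obs, reward, cost, terminated, truncated, info = env.step(action)
    ep_ret += reward
    ep_cost += cost
    if terminated or truncated:
        print(f'episode return: {ep_ret:.2f}, episode cost: {ep_cost:.2f}')
        ep_ret, ep_cost = 0.0, 0.0
        obs, info = env.reset()
env.close()
```

The extra `cost` entry in the step return is what separates these tasks from plain Gymnasium environments; it is the signal the constrained algorithms above are trained on.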