From b2e9847bf5a4b429c81530c820043843ac5fd21d Mon Sep 17 00:00:00 2001 From: Jiayi Zhou <108712610+Gaiejj@users.noreply.github.com> Date: Fri, 23 Dec 2022 17:59:38 +0800 Subject: [PATCH] feat: add new algorithms (#52) Co-authored-by: Xuehai Pan Co-authored-by: ruiyang sun --- README.md | 134 ++--- docs/source/spelling_wordlist.txt | 41 ++ .../bases/base_mujoco_task.py | 2 +- examples/train_policy.py | 14 +- omnisafe/__init__.py | 2 + omnisafe/algorithms/__init__.py | 93 ++-- omnisafe/algorithms/algo_wrapper.py | 11 +- omnisafe/algorithms/off_policy/__init__.py | 34 ++ omnisafe/algorithms/off_policy/ddpg.py | 140 ++--- omnisafe/algorithms/off_policy/ddpg_lag.py | 102 ++++ omnisafe/algorithms/off_policy/sac.py | 125 +++++ omnisafe/algorithms/off_policy/sac_lag.py | 137 +++++ omnisafe/algorithms/off_policy/sddpg.py | 180 +++++++ omnisafe/algorithms/off_policy/td3.py | 71 +++ omnisafe/algorithms/off_policy/td3_lag.py | 108 ++++ omnisafe/algorithms/on_policy/__init__.py | 37 ++ .../algorithms/on_policy/base/__init__.py | 28 + .../on_policy/{ => base}/natural_pg.py | 25 +- .../on_policy/{ => base}/policy_gradient.py | 28 +- omnisafe/algorithms/on_policy/base/ppo.py | 136 +++++ .../algorithms/on_policy/{ => base}/trpo.py | 27 +- .../on_policy/early_terminated/__init__.py | 26 + .../early_terminated/ppo_early_terminated.py | 33 ++ .../ppo_lag_early_terminated.py | 33 ++ .../on_policy/first_order/__init__.py | 24 + .../on_policy/{ => first_order}/cup.py | 27 +- .../on_policy/{ => first_order}/focops.py | 21 +- .../on_policy/naive_lagrange/__init__.py | 28 + .../on_policy/{ => naive_lagrange}/npg_lag.py | 20 +- .../on_policy/{ => naive_lagrange}/pdo.py | 17 +- .../on_policy/{ => naive_lagrange}/ppo_lag.py | 27 +- .../{ => naive_lagrange}/trpo_lag.py | 22 +- .../on_policy/pid_lagrange/__init__.py | 24 + .../on_policy/{ => pid_lagrange}/cppo_pid.py | 29 +- .../on_policy/pid_lagrange/trpo_pid.py | 84 +++ omnisafe/algorithms/on_policy/ppo.py | 65 --- .../algorithms/on_policy/saute/__init__.py | 24 + .../on_policy/saute/ppo_lag_saute.py | 38 ++ .../algorithms/on_policy/saute/ppo_saute.py | 38 ++ .../on_policy/second_order/__init__.py | 24 + .../on_policy/{ => second_order}/cpo.py | 26 +- .../on_policy/{ => second_order}/pcpo.py | 24 +- .../algorithms/on_policy/simmer/__init__.py | 28 + .../on_policy/simmer/ppo_lag_simmer_pid.py | 39 ++ .../on_policy/simmer/ppo_lag_simmer_q.py | 39 ++ .../on_policy/simmer/ppo_simmer_pid.py | 39 ++ .../on_policy/simmer/ppo_simmer_q.py | 38 ++ omnisafe/algorithms/registry.py | 6 +- omnisafe/common/pid_lagrange.py | 2 +- omnisafe/configs/off-policy/DDPG.yaml | 71 ++- omnisafe/configs/off-policy/DDPGLag.yaml | 116 ++++ omnisafe/configs/off-policy/SAC.yaml | 110 ++++ omnisafe/configs/off-policy/SACLag.yaml | 119 ++++ omnisafe/configs/off-policy/SDDPG.yaml | 116 ++++ omnisafe/configs/off-policy/TD3.yaml | 104 ++++ omnisafe/configs/off-policy/TD3Lag.yaml | 114 ++++ omnisafe/configs/on-policy/CPO.yaml | 2 + omnisafe/configs/on-policy/CPPOPid.yaml | 108 ++-- omnisafe/configs/on-policy/CUP.yaml | 6 +- omnisafe/configs/on-policy/FOCOPS.yaml | 2 + omnisafe/configs/on-policy/NPGLag.yaml | 81 ++- omnisafe/configs/on-policy/NaturalPG.yaml | 73 ++- omnisafe/configs/on-policy/PCPO.yaml | 2 + omnisafe/configs/on-policy/PDO.yaml | 77 ++- omnisafe/configs/on-policy/PPO.yaml | 75 ++- .../configs/on-policy/PPOEarlyTerminated.yaml | 111 ++++ omnisafe/configs/on-policy/PPOLag.yaml | 8 +- .../on-policy/PPOLagEarlyTerminated.yaml | 121 +++++ omnisafe/configs/on-policy/PPOLagSaute.yaml | 128 +++++ 
.../configs/on-policy/PPOLagSimmerPid.yaml | 145 +++++ omnisafe/configs/on-policy/PPOLagSimmerQ.yaml | 147 +++++ omnisafe/configs/on-policy/PPOSaute.yaml | 118 ++++ omnisafe/configs/on-policy/PPOSimmerPid.yaml | 137 +++++ omnisafe/configs/on-policy/PPOSimmerQ.yaml | 139 +++++ .../configs/on-policy/PolicyGradient.yaml | 68 ++- omnisafe/configs/on-policy/TRPO.yaml | 75 ++- omnisafe/configs/on-policy/TRPOLag.yaml | 4 +- omnisafe/configs/on-policy/TRPOPid.yaml | 139 +++++ omnisafe/models/actor/actor_builder.py | 27 + .../models/actor/gaussian_annealing_actor.py | 2 +- .../models/actor/gaussian_stdnet_actor.py | 9 +- omnisafe/models/actor/mlp_actor.py | 28 +- omnisafe/models/actor_critic.py | 2 +- omnisafe/models/actor_q_critic.py | 53 +- omnisafe/models/constraint_actor_q_critic.py | 12 +- omnisafe/models/critic/q_critic.py | 35 +- omnisafe/utils/config_utils.py | 24 +- omnisafe/wrappers/__init__.py | 28 + omnisafe/wrappers/early_terminated_wrapper.py | 87 +++ omnisafe/wrappers/env_wrapper.py | 4 +- omnisafe/wrappers/off_policy_wrapper.py | 9 +- omnisafe/wrappers/on_policy_wrapper.py | 19 +- omnisafe/wrappers/saute_wrapper.py | 210 ++++++++ omnisafe/wrappers/simmer_wrapper.py | 510 ++++++++++++++++++ tests/test_model.py | 15 +- tests/test_policy.py | 31 +- 96 files changed, 5247 insertions(+), 694 deletions(-) create mode 100644 omnisafe/algorithms/off_policy/__init__.py create mode 100644 omnisafe/algorithms/off_policy/ddpg_lag.py create mode 100644 omnisafe/algorithms/off_policy/sac.py create mode 100644 omnisafe/algorithms/off_policy/sac_lag.py create mode 100644 omnisafe/algorithms/off_policy/sddpg.py create mode 100644 omnisafe/algorithms/off_policy/td3.py create mode 100644 omnisafe/algorithms/off_policy/td3_lag.py create mode 100644 omnisafe/algorithms/on_policy/base/__init__.py rename omnisafe/algorithms/on_policy/{ => base}/natural_pg.py (90%) rename omnisafe/algorithms/on_policy/{ => base}/policy_gradient.py (96%) create mode 100644 omnisafe/algorithms/on_policy/base/ppo.py rename omnisafe/algorithms/on_policy/{ => base}/trpo.py (91%) create mode 100644 omnisafe/algorithms/on_policy/early_terminated/__init__.py create mode 100644 omnisafe/algorithms/on_policy/early_terminated/ppo_early_terminated.py create mode 100644 omnisafe/algorithms/on_policy/early_terminated/ppo_lag_early_terminated.py create mode 100644 omnisafe/algorithms/on_policy/first_order/__init__.py rename omnisafe/algorithms/on_policy/{ => first_order}/cup.py (93%) rename omnisafe/algorithms/on_policy/{ => first_order}/focops.py (93%) create mode 100644 omnisafe/algorithms/on_policy/naive_lagrange/__init__.py rename omnisafe/algorithms/on_policy/{ => naive_lagrange}/npg_lag.py (85%) rename omnisafe/algorithms/on_policy/{ => naive_lagrange}/pdo.py (87%) rename omnisafe/algorithms/on_policy/{ => naive_lagrange}/ppo_lag.py (81%) rename omnisafe/algorithms/on_policy/{ => naive_lagrange}/trpo_lag.py (83%) create mode 100644 omnisafe/algorithms/on_policy/pid_lagrange/__init__.py rename omnisafe/algorithms/on_policy/{ => pid_lagrange}/cppo_pid.py (78%) create mode 100644 omnisafe/algorithms/on_policy/pid_lagrange/trpo_pid.py delete mode 100644 omnisafe/algorithms/on_policy/ppo.py create mode 100644 omnisafe/algorithms/on_policy/saute/__init__.py create mode 100644 omnisafe/algorithms/on_policy/saute/ppo_lag_saute.py create mode 100644 omnisafe/algorithms/on_policy/saute/ppo_saute.py create mode 100644 omnisafe/algorithms/on_policy/second_order/__init__.py rename omnisafe/algorithms/on_policy/{ => second_order}/cpo.py (96%) 
rename omnisafe/algorithms/on_policy/{ => second_order}/pcpo.py (94%) create mode 100644 omnisafe/algorithms/on_policy/simmer/__init__.py create mode 100644 omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_pid.py create mode 100644 omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_q.py create mode 100644 omnisafe/algorithms/on_policy/simmer/ppo_simmer_pid.py create mode 100644 omnisafe/algorithms/on_policy/simmer/ppo_simmer_q.py create mode 100644 omnisafe/configs/off-policy/DDPGLag.yaml create mode 100644 omnisafe/configs/off-policy/SAC.yaml create mode 100644 omnisafe/configs/off-policy/SACLag.yaml create mode 100644 omnisafe/configs/off-policy/SDDPG.yaml create mode 100644 omnisafe/configs/off-policy/TD3.yaml create mode 100644 omnisafe/configs/off-policy/TD3Lag.yaml create mode 100644 omnisafe/configs/on-policy/PPOEarlyTerminated.yaml create mode 100644 omnisafe/configs/on-policy/PPOLagEarlyTerminated.yaml create mode 100644 omnisafe/configs/on-policy/PPOLagSaute.yaml create mode 100644 omnisafe/configs/on-policy/PPOLagSimmerPid.yaml create mode 100644 omnisafe/configs/on-policy/PPOLagSimmerQ.yaml create mode 100644 omnisafe/configs/on-policy/PPOSaute.yaml create mode 100644 omnisafe/configs/on-policy/PPOSimmerPid.yaml create mode 100644 omnisafe/configs/on-policy/PPOSimmerQ.yaml create mode 100644 omnisafe/configs/on-policy/TRPOPid.yaml create mode 100644 omnisafe/wrappers/early_terminated_wrapper.py create mode 100644 omnisafe/wrappers/saute_wrapper.py create mode 100644 omnisafe/wrappers/simmer_wrapper.py diff --git a/README.md b/README.md index ab9d85a05..f40c2ff87 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ # OmniSafe -OmniSafe is a comprehensive and trustworthy benchmark for safe reinforcement learning, covering a multitude of SafeRL domains and delivering a new suite of testing environments. +OmniSafe is a comprehensive and reliable benchmark for safe reinforcement learning, covering a multitude of SafeRL domains and delivering a new suite of testing environments. The simulation environment around OmniSafe and a series of reliable algorithm implementations will help the SafeRL research community easier to replicate and improve the excellent work already done while also helping to facilitate the validation of new ideas and new algorithms. @@ -24,8 +24,13 @@ The simulation environment around OmniSafe and a series of reliable algorithm im - [Overview](#overview) - [Implemented Algorithms](#implemented-algorithms) - - [Published in 2022](#published-in-2022) + - [Newly Published in 2022](#newly-published-in-2022) - [List of Algorithms](#list-of-algorithms) + - [On-Policy Safe](#on-policy-safe) + - [Off-Policy Safe](#off-policy-safe) + - [Model-Based Safe](#model-based-safe) + - [Offline Safe](#offline-safe) + - [Others](#others) - [SafeRL Environments](#saferl-environments) - [Safety Gymnasium](#safety-gymnasium) - [Vision-base Safe RL](#vision-base-safe-rl) @@ -49,11 +54,11 @@ Here we provide a table for comparison of **OmniSafe's algorithm core** and exis | SafeRL
Platform | Backend | Engine | # Safe Algo. | Parallel
CPU/GPU | New Gym API**(4)** | Vision Input | | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :-----: | :---------------------------: | ------------------- | :-------------------: | :---------------------------: | :-----------------: | -| [Safety-Gym](https://github.com/openai/safety-gym)
![GitHub last commit](https://img.shields.io/github/last-commit/openai/safety-gym?label=last%20update) | TF1 | `mujoco-py`**(1)** | 3 | CPU Only (`mpi4py`) | :x: | minimally supported | -| [safe-control-gym](https://github.com/utiasDSL/safe-control-gym)
![GitHub last commit](https://img.shields.io/github/last-commit/utiasDSL/safe-control-gym?label=last%20update) | PyTorch | PyBullet | 5**(2)** | | :x: | :x: | -| Velocity-Constraints**(3)** | N/A | N/A | N/A | N/A | :x: | :x: | -| [mujoco-circle](https://github.com/ymzhang01/mujoco-circle)
![GitHub last commit](https://img.shields.io/github/last-commit/ymzhang01/mujoco-circle?label=last%20update) | PyTorch | N/A | 0 | N/A | :x: | :x: | -| OmniSafe
![GitHub last commit](https://img.shields.io/github/last-commit/PKU-MARL/omnisafe?label=last%20update) | PyTorch | **MuJoCo 2.3.0+** | **25+** | `torch.distributed` | :heavy_check_mark: | :heavy_check_mark: | +| [Safety-Gym](https://github.com/openai/safety-gym)
![GitHub last commit](https://img.shields.io/github/last-commit/openai/safety-gym?label=last%20update) | TF1 | `mujoco-py`**(1)** | 3 | CPU Only (`mpi4py`) | ❌ | minimally supported | +| [safe-control-gym](https://github.com/utiasDSL/safe-control-gym)
![GitHub last commit](https://img.shields.io/github/last-commit/utiasDSL/safe-control-gym?label=last%20update) | PyTorch | PyBullet | 5**(2)** | | ❌ | ❌ | +| Velocity-Constraints**(3)** | N/A | N/A | N/A | N/A | ❌ | ❌ | +| [mujoco-circle](https://github.com/ymzhang01/mujoco-circle)
![GitHub last commit](https://img.shields.io/github/last-commit/ymzhang01/mujoco-circle?label=last%20update) | PyTorch | N/A | 0 | N/A | ❌ | ❌ | +| OmniSafe
![GitHub last commit](https://img.shields.io/github/last-commit/PKU-MARL/omnisafe?label=last%20update) | PyTorch | **MuJoCo 2.3.0+** | **25+** | `torch.distributed` | ✅ | ✅ | (1): Maintenance (expect bug fixes and minor updates), the last commit is 19 Nov 2021. Safety Gym depends on `mujoco-py` 2.0.2.7, which was updated on Oct 12, 2019.
(2): We only count the safe RL algorithms.
@@ -66,63 +71,64 @@ Here we provide a table for comparison of **OmniSafe's algorithm core** and exis The supported interface algorithms currently include: -### Published **in 2022** +### Newly Published in 2022 -- 😃 **[AAAI 2023]** Augmented Proximal Policy Optimization for Safe Reinforcement Learning (APPO) **The original author of the paper contributed code** -- 😃 **[NeurIPS 2022]** [Constrained Update Projection Approach to Safe Policy Optimization (CUP)](https://arxiv.org/abs/2209.07089) **The original author of the paper contributed code** -- 😞 **Under Test**[NeurIPS 2022] [Effects of Safety State Augmentation on +- [X] **[AAAI 2023]** Augmented Proximal Policy Optimization for Safe Reinforcement Learning (APPO) **The original author of the paper contributed code** +- [X] **[NeurIPS 2022]** [Constrained Update Projection Approach to Safe Policy Optimization (CUP)](https://arxiv.org/abs/2209.07089) **The original author of the paper contributed code** +- [ ] **[NeurIPS 2022]** (Under Testing) [Effects of Safety State Augmentation on Safe Exploration (Swimmer)](https://arxiv.org/abs/2206.02675) -- 😃 **[NeurIPS 2022]** [Model-based Safe Deep Reinforcement Learning via a Constrained Proximal Policy Optimization Algorithm](https://arxiv.org/abs/2210.07573) -- 😞 **Under Test**[ICML 2022] [Sauté RL: Almost Surely Safe Reinforcement Learning Using State Augmentation (SauteRL)](https://arxiv.org/abs/2202.06558) -- 😞 **Under Test**[ICML 2022] [Constrained Variational Policy Optimization for Safe Reinforcement Learning (CVPO)](https://arxiv.org/abs/2201.11927) -- 😃 **[IJCAI 2022]** [Penalized Proximal Policy Optimization for Safe Reinforcement Learning](https://arxiv.org/abs/2205.11814) **The original author of the paper contributed code** -- **[ICLR 2022]** [Constrained Policy Optimization via Bayesian World Models (LAMBDA)](https://arxiv.org/abs/2201.09802) -- **[AAAI 2022]** [Conservative and Adaptive Penalty for Model-Based Safe Reinforcement Learning (CAP)](https://arxiv.org/abs/2112.07701) - +- [X] **[NeurIPS 2022]** [Model-based Safe Deep Reinforcement Learning via a Constrained Proximal Policy Optimization Algorithm](https://arxiv.org/abs/2210.07573) +- [ ] **[ICML 2022]** (Under Testing) [Sauté RL: Almost Surely Safe Reinforcement Learning Using State Augmentation (SauteRL)](https://arxiv.org/abs/2202.06558) +- [ ] **[ICML 2022]** (Under Testing) [Constrained Variational Policy Optimization for Safe Reinforcement Learning (CVPO)](https://arxiv.org/abs/2201.11927) +- [X] **[IJCAI 2022]** [Penalized Proximal Policy Optimization for Safe Reinforcement Learning](https://arxiv.org/abs/2205.11814) **The original author of the paper contributed code** +- [ ] **[ICLR 2022]** [Constrained Policy Optimization via Bayesian World Models (LAMBDA)](https://arxiv.org/abs/2201.09802) +- [ ] **[AAAI 2022]** [Conservative and Adaptive Penalty for Model-Based Safe Reinforcement Learning (CAP)](https://arxiv.org/abs/2112.07701) ### List of Algorithms -> On Policy Safe -- :heavy_check_mark:[The Lagrange version of PPO (PPO-Lag)](https://cdn.openai.com/safexp-short.pdf) -- :heavy_check_mark:[The Lagrange version of TRPO (TRPO-Lag)](https://cdn.openai.com/safexp-short.pdf) -- :heavy_check_mark:[ICML 2017][Constrained Policy Optimization (CPO)](https://proceedings.mlr.press/v70/achiam17a) -- :heavy_check_mark:[ICLR 2019][Reward Constrained Policy Optimization (RCPO)](https://openreview.net/forum?id=SkfrvsA9FX) -- :heavy_check_mark:[ICML 2020][Responsive Safety in Reinforcement Learning by PID Lagrangian 
Methods (PID-Lag)](https://arxiv.org/abs/2007.03964) -- :heavy_check_mark:[NeurIPS 2020][First Order Constrained Optimization in Policy Space (FOCOPS)](https://arxiv.org/abs/2002.06506) -- :heavy_check_mark:[AAAI 2020][IPO: Interior-point Policy Optimization under Constraints (IPO)](https://arxiv.org/abs/1910.09615) -- :heavy_check_mark:[ICLR 2020][Projection-Based Constrained Policy Optimization (PCPO)](https://openreview.net/forum?id=rke3TJrtPS) -- :heavy_check_mark:[ICML 2021][CRPO: A New Approach for Safe Reinforcement Learning with Convergence Guarantee](https://arxiv.org/abs/2011.05869) - -> Off Policy Safe -- :heavy_check_mark:The Lagrange version of TD3 (TD3-Lag) -- :heavy_check_mark:The Lagrange version of DDPG (DDPG-Lag) -- :heavy_check_mark:The Lagrange version of SAC (SAC-Lag) -- :heavy_check_mark:[ICML 2019][Lyapunov-based Safe Policy Optimization for Continuous Control (SDDPG)](https://arxiv.org/abs/1901.10031) -- :heavy_check_mark:[ICML 2019][Lyapunov-based Safe Policy Optimization for Continuous Control (SDDPG-modular)](https://arxiv.org/abs/1901.10031) -- [ICML 2022] [Constrained Variational Policy Optimization for Safe Reinforcement Learning (CVPO)](https://arxiv.org/abs/2201.11927) - -> Model Base Safe - -- [NeurIPS 2021][Safe Reinforcement Learning by Imagining the Near Future (SMBPO)](https://arxiv.org/abs/2202.07789) -- :heavy_check_mark:[CoRL 2021 Oral][Learning Off-Policy with Online Planning (SafeLoop)](https://arxiv.org/abs/2008.10066) -- :heavy_check_mark:[AAAI 2022][Conservative and Adaptive Penalty for Model-Based Safe Reinforcement Learning (CAP)](https://arxiv.org/abs/2112.07701) -- [NeurIPS 2022][Model-based Safe Deep Reinforcement Learning via a Constrained Proximal Policy Optimization Algorithm](https://arxiv.org/abs/2210.07573) -- [ICLR 2022] [Constrained Policy Optimization via Bayesian World Models (LAMBDA)](https://arxiv.org/abs/2201.09802) - -> Offline Safe -- :heavy_check_mark:[The Lagrange version of BCQ (BCQ-Lag)](https://arxiv.org/abs/1812.02900) -- :heavy_check_mark:[The Constrained version of CRR (C-CRR)](https://proceedings.neurips.cc/paper/2020/hash/588cb956d6bbe67078f29f8de420a13d-Abstract.html) -- [AAAI 2022] [Constraints Penalized Q-learning for Safe Offline Reinforcement Learning CPQ](https://arxiv.org/abs/2107.09003) -- [ICLR 2022 spotlight] [COptiDICE: Offline Constrained Reinforcement Learning via Stationary Distribution Correction Estimation](https://arxiv.org/abs/2204.08957?context=cs.AI) -- [ICML 2022][Constrained Offline Policy Optimization (COPO)](https://proceedings.mlr.press/v162/polosky22a.html) - -> Other -- :heavy_check_mark:[Safe Exploration in Continuous Action Spaces (Safety Layer)](https://arxiv.org/abs/1801.08757) -- [RA-L 2021] [Recovery RL: Safe Reinforcement Learning with Learned Recovery Zones](https://arxiv.org/abs/2010.15920) -- [ICML 2022] [Sauté RL: Almost Surely Safe Reinforcement Learning Using State Augmentation (SauteRL)](https://arxiv.org/abs/2202.06558) -- [NeurIPS 2022] [Effects of Safety State Augmentation on -Safe Exploration](https://arxiv.org/abs/2206.02675) +#### On-Policy Safe + +- [X] [The Lagrange version of PPO (PPO-Lag)](https://cdn.openai.com/safexp-short.pdf) +- [X] [The Lagrange version of TRPO (TRPO-Lag)](https://cdn.openai.com/safexp-short.pdf) +- [X] **[ICML 2017]** [Constrained Policy Optimization (CPO)](https://proceedings.mlr.press/v70/achiam17a) +- [X] **[ICLR 2019]** [Reward Constrained Policy Optimization (RCPO)](https://openreview.net/forum?id=SkfrvsA9FX) +- [X] **[ICML 2020]** 
[Responsive Safety in Reinforcement Learning by PID Lagrangian Methods (PID-Lag)](https://arxiv.org/abs/2007.03964) +- [X] **[NeurIPS 2020]** [First Order Constrained Optimization in Policy Space (FOCOPS)](https://arxiv.org/abs/2002.06506) +- [X] **[AAAI 2020]** [IPO: Interior-point Policy Optimization under Constraints (IPO)](https://arxiv.org/abs/1910.09615) +- [X] **[ICLR 2020]** [Projection-Based Constrained Policy Optimization (PCPO)](https://openreview.net/forum?id=rke3TJrtPS) +- [X] **[ICML 2021]** [CRPO: A New Approach for Safe Reinforcement Learning with Convergence Guarantee](https://arxiv.org/abs/2011.05869) + +#### Off-Policy Safe + +- [X] The Lagrange version of TD3 (TD3-Lag) +- [X] The Lagrange version of DDPG (DDPG-Lag) +- [X] The Lagrange version of SAC (SAC-Lag) +- [X] **[ICML 2019]** [Lyapunov-based Safe Policy Optimization for Continuous Control (SDDPG)](https://arxiv.org/abs/1901.10031) +- [X] **[ICML 2019]** [Lyapunov-based Safe Policy Optimization for Continuous Control (SDDPG-modular)](https://arxiv.org/abs/1901.10031) +- [ ] **[ICML 2022]** [Constrained Variational Policy Optimization for Safe Reinforcement Learning (CVPO)](https://arxiv.org/abs/2201.11927) + +#### Model-Based Safe + +- [ ] **[NeurIPS 2021]** [Safe Reinforcement Learning by Imagining the Near Future (SMBPO)](https://arxiv.org/abs/2202.07789) +- [X] **[CoRL 2021 (Oral)]** [Learning Off-Policy with Online Planning (SafeLoop)](https://arxiv.org/abs/2008.10066) +- [X] **[AAAI 2022]** [Conservative and Adaptive Penalty for Model-Based Safe Reinforcement Learning (CAP)](https://arxiv.org/abs/2112.07701) +- [ ] **[NeurIPS 2022]** [Model-based Safe Deep Reinforcement Learning via a Constrained Proximal Policy Optimization Algorithm](https://arxiv.org/abs/2210.07573) +- [ ] **[ICLR 2022]** [Constrained Policy Optimization via Bayesian World Models (LAMBDA)](https://arxiv.org/abs/2201.09802) + +#### Offline Safe +- [X] [The Lagrange version of BCQ (BCQ-Lag)](https://arxiv.org/abs/1812.02900) +- [X] [The Constrained version of CRR (C-CRR)](https://proceedings.neurips.cc/paper/2020/hash/588cb956d6bbe67078f29f8de420a13d-Abstract.html) +- [ ] **[AAAI 2022]** [Constraints Penalized Q-learning for Safe Offline Reinforcement Learning CPQ](https://arxiv.org/abs/2107.09003) +- [ ] **[ICLR 2022 (Spotlight)]** [COptiDICE: Offline Constrained Reinforcement Learning via Stationary Distribution Correction Estimation](https://arxiv.org/abs/2204.08957?context=cs.AI) +- [ ] **[ICML 2022]** [Constrained Offline Policy Optimization (COPO)](https://proceedings.mlr.press/v162/polosky22a.html) + +#### Others + +- [X] [Safe Exploration in Continuous Action Spaces (Safety Layer)](https://arxiv.org/abs/1801.08757) +- [ ] **[RA-L 2021]** [Recovery RL: Safe Reinforcement Learning with Learned Recovery Zones](https://arxiv.org/abs/2010.15920) +- [ ] **[ICML 2022]** [Sauté RL: Almost Surely Safe Reinforcement Learning Using State Augmentation (SauteRL)](https://arxiv.org/abs/2202.06558) +- [ ] **[NeurIPS 2022]** [Effects of Safety State Augmentation on Safe Exploration](https://arxiv.org/abs/2206.02675) -------------------------------------------------------------------------------- @@ -132,9 +138,9 @@ Safe Exploration](https://arxiv.org/abs/2206.02675) We designed a variety of safety-enhanced learning tasks around the latest version of Gymnasium, including safety-run, safety-circle, safety-goal, safety-button, etc., leading to a unified safety-enhanced learning benchmark environment called `safety-gymnasium`. 
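The sketch below illustrates the Gymnasium-style interaction loop these tasks expose. It is a minimal example rather than the official usage snippet: the `SafetyPointGoal1-v0` id is taken from the examples elsewhere in this patch, and it assumes `step` returns the safety cost alongside the usual Gymnasium 5-tuple, as in the Environment Usage example later in this README.

```python
# Minimal sketch of a safety-gymnasium rollout (assumes the package is installed).
import safety_gymnasium

env = safety_gymnasium.make('SafetyPointGoal1-v0')
obs, info = env.reset(seed=0)
for _ in range(1000):
    act = env.action_space.sample()
    # The safety cost is returned in addition to the standard Gymnasium 5-tuple.
    obs, reward, cost, terminated, truncated, info = env.step(act)
    if terminated or truncated:
        obs, info = env.reset()
env.close()
```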
-Further, to facilitate the progress of community research, we redesigned [Safety_Gym](https://github.com/openai/safety-gym), removed the dependency on mujoco_py, made it created on top of [Mujoco](https://github.com/deepmind/mujoco), and fixed some bugs. +Further, to facilitate the progress of community research, we redesigned [Safety-Gym](https://github.com/openai/safety-gym) and removed the dependency on `mujoco-py`. We build it on top of [MuJoCo](https://github.com/deepmind/mujoco), and fixed some bugs. -After careful testing, we confirmed that it has the same dynamics parameters and training environment as the original safety gym, named `safety-gymnasium`. +After careful testing, we confirmed that it has the same dynamics parameters and training environment as the original `safety-gym`, named `safety-gymnasium`. Here is a list of all the environments we support, some of them are being tested in our baseline and we will gradually release them within a month. @@ -197,7 +203,7 @@ For the appetizer, the images are as follows ### Environment Usage -**Notes:** We support new **Gym APIs**. +**Notes:** We support new [**Gymnasium APIs**](https://github.com/Farama-Foundation/Gymnasium). ```python import safety_gymnasium @@ -293,12 +299,12 @@ agent.learn() # if done: # obs = env.reset() # env.close() - ``` ### 3. Run Agent from custom terminal config -cd `omnisafe/examples` and run -```python + +```bash +cd examples python train_on_policy.py --env-id SafetyPointGoal1-v0 --algo PPOLag --parallel 5 --epochs 1 ``` diff --git a/docs/source/spelling_wordlist.txt b/docs/source/spelling_wordlist.txt index 1d609b912..f37c8c43b 100644 --- a/docs/source/spelling_wordlist.txt +++ b/docs/source/spelling_wordlist.txt @@ -182,3 +182,44 @@ Binbin Zhou Pengfei Yaodong +buf +Aivar +Sootla +Alexander +Cowen +Taher +Jafferjee +Ziyan +Wang +Mguni +Jun +Haitham +Ammar +Sun +Ziping +Xu +Meng +Fang +Zhenghao +Peng +Jiadong +Guo +Bo +lei +MDP +Bolei +Bou +Hao +Tuomas +Haarnoja +Aurick +Meger +Herke +Fujimoto +Lyapunov +Yinlam +Ofir +Nachum +Aleksandra +Duenez +Ghavamzadeh diff --git a/envs/safety-gymnasium/safety_gymnasium/bases/base_mujoco_task.py b/envs/safety-gymnasium/safety_gymnasium/bases/base_mujoco_task.py index 026d48615..d8a7a09da 100644 --- a/envs/safety-gymnasium/safety_gymnasium/bases/base_mujoco_task.py +++ b/envs/safety-gymnasium/safety_gymnasium/bases/base_mujoco_task.py @@ -18,7 +18,7 @@ from copy import deepcopy from typing import Union -# import gymnasium +import gymnasium # pylint: disable=unused-import import mujoco import numpy as np from gymnasium.envs.mujoco.mujoco_rendering import RenderContextOffscreen, Viewer diff --git a/examples/train_policy.py b/examples/train_policy.py index 675d51455..ac352a9da 100644 --- a/examples/train_policy.py +++ b/examples/train_policy.py @@ -24,18 +24,24 @@ parser.add_argument( '--algo', type=str, - default='PPOLag', - help='Choose from: {PolicyGradient, PPO, PPOLag, NaturalPG,' - ' TRPO, TRPOLag, PDO, NPGLag, CPO, PCPO, FOCOPS, CPPOPid,CUP', + metavar='ALGO', + default='PPOLagEarlyTerminated', + help='Algorithm to train', + choices=omnisafe.ALGORITHMS['all'], ) parser.add_argument( '--env-id', type=str, + metavar='ENV', default='SafetyPointGoal1-v0', help='The name of test environment', ) parser.add_argument( - '--parallel', default=1, type=int, help='Number of paralleled progress for calculations.' 
+ '--parallel', + default=1, + type=int, + metavar='N', + help='Number of paralleled progress for calculations.', ) args, unparsed_args = parser.parse_known_args() keys = [k[2:] for k in unparsed_args[0::2]] diff --git a/omnisafe/__init__.py b/omnisafe/__init__.py index 77f8e63e0..f252acd93 100644 --- a/omnisafe/__init__.py +++ b/omnisafe/__init__.py @@ -14,6 +14,8 @@ # ============================================================================== """OmniSafe: A comprehensive and reliable benchmark for safe reinforcement learning.""" +from omnisafe import algorithms +from omnisafe.algorithms import ALGORITHMS from omnisafe.algorithms.algo_wrapper import AlgoWrapper as Agent # from omnisafe.algorithms.env_wrapper import EnvWrapper as Env diff --git a/omnisafe/algorithms/__init__.py b/omnisafe/algorithms/__init__.py index 13f758cac..23829c7d7 100644 --- a/omnisafe/algorithms/__init__.py +++ b/omnisafe/algorithms/__init__.py @@ -14,42 +14,59 @@ # ============================================================================== """Safe Reinforcement Learning algorithms.""" -# Off Policy Safe -from omnisafe.algorithms.off_policy.ddpg import DDPG - -# On Policy Safe -from omnisafe.algorithms.on_policy.cpo import CPO -from omnisafe.algorithms.on_policy.cppo_pid import CPPOPid -from omnisafe.algorithms.on_policy.cup import CUP -from omnisafe.algorithms.on_policy.focops import FOCOPS -from omnisafe.algorithms.on_policy.natural_pg import NaturalPG -from omnisafe.algorithms.on_policy.npg_lag import NPGLag -from omnisafe.algorithms.on_policy.pcpo import PCPO -from omnisafe.algorithms.on_policy.pdo import PDO -from omnisafe.algorithms.on_policy.policy_gradient import PolicyGradient -from omnisafe.algorithms.on_policy.ppo import PPO -from omnisafe.algorithms.on_policy.ppo_lag import PPOLag -from omnisafe.algorithms.on_policy.trpo import TRPO -from omnisafe.algorithms.on_policy.trpo_lag import TRPOLag - - -algo_type = { - 'off-policy': ['DDPG'], - 'on-policy': [ - 'CPO', - 'FOCOPS', - 'CPPOPid', - 'FOCOPS', - 'NaturalPG', - 'NPGLag', - 'PCPO', - 'PDO', - 'PolicyGradient', - 'PPO', - 'PPOLag', - 'TRPO', - 'TRPOLag', - 'CUP', - ], - 'model-based': ['MBPPOLag', 'SafeLoop'], +import itertools +from types import MappingProxyType + +from omnisafe.algorithms import off_policy, on_policy + +# Off-Policy Safe +from omnisafe.algorithms.off_policy import DDPG, SAC, SDDPG, TD3, DDPGLag, SACLag, TD3Lag + +# On-Policy Safe +from omnisafe.algorithms.on_policy import ( + CPO, + CUP, + FOCOPS, + PCPO, + PDO, + PPO, + TRPO, + CPPOPid, + NaturalPG, + NPGLag, + PolicyGradient, + PPOEarlyTerminated, + PPOLag, + PPOLagEarlyTerminated, + PPOLagSaute, + PPOLagSimmerPid, + PPOLagSimmerQ, + PPOSaute, + PPOSimmerPid, + PPOSimmerQ, + TRPOLag, + TRPOPid, +) + + +ALGORITHMS = { + 'off-policy': tuple(off_policy.__all__), + 'on-policy': tuple(on_policy.__all__), + 'model-based': ( + 'MBPPOLag', + 'SafeLoop', + ), } + +ALGORITHM2TYPE = { + algo: algo_type for algo_type, algorithms in ALGORITHMS.items() for algo in algorithms +} + +__all__ = ALGORITHMS['all'] = tuple(itertools.chain.from_iterable(ALGORITHMS.values())) + +assert len(ALGORITHM2TYPE) == len(__all__), 'Duplicate algorithm names found.' 
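+# ALGORITHM2TYPE maps each algorithm name to its family, e.g.
+# 'PPOLag' -> 'on-policy' and 'DDPGLag' -> 'off-policy'; AlgoWrapper uses this
+# mapping to resolve `self.algo_type` (see the algo_wrapper.py change below).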
+ +ALGORITHMS = MappingProxyType(ALGORITHMS) # make this immutable +ALGORITHM2TYPE = MappingProxyType(ALGORITHM2TYPE) # make this immutable + +del itertools, MappingProxyType diff --git a/omnisafe/algorithms/algo_wrapper.py b/omnisafe/algorithms/algo_wrapper.py index def42e212..85ee6d9ec 100644 --- a/omnisafe/algorithms/algo_wrapper.py +++ b/omnisafe/algorithms/algo_wrapper.py @@ -19,7 +19,7 @@ import psutil -from omnisafe.algorithms import algo_type, registry +from omnisafe.algorithms import ALGORITHM2TYPE, registry from omnisafe.utils import distributed_utils from omnisafe.utils.config_utils import check_all_configs, recursive_update from omnisafe.utils.tools import get_default_kwargs_yaml @@ -46,13 +46,10 @@ def _init_checks(self): assert ( isinstance(self.custom_cfgs, dict) or self.custom_cfgs is None ), 'custom_cfgs must be a dict!' - for key, value in algo_type.items(): - if self.algo in value: - self.algo_type = key - break - if algo_type is None or algo_type == '': + self.algo_type = ALGORITHM2TYPE.get(self.algo, None) + if self.algo_type is None or self.algo_type == '': raise ValueError(f'{self.algo} is not supported!') - if algo_type == 'off-policy': + if self.algo_type == 'off-policy': assert self.parallel == 1, 'off-policy only support parallel==1!' def learn(self): diff --git a/omnisafe/algorithms/off_policy/__init__.py b/omnisafe/algorithms/off_policy/__init__.py new file mode 100644 index 000000000..aa4ea363b --- /dev/null +++ b/omnisafe/algorithms/off_policy/__init__.py @@ -0,0 +1,34 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Off-policy algorithms.""" + +from omnisafe.algorithms.off_policy.ddpg import DDPG +from omnisafe.algorithms.off_policy.ddpg_lag import DDPGLag +from omnisafe.algorithms.off_policy.sac import SAC +from omnisafe.algorithms.off_policy.sac_lag import SACLag +from omnisafe.algorithms.off_policy.sddpg import SDDPG +from omnisafe.algorithms.off_policy.td3 import TD3 +from omnisafe.algorithms.off_policy.td3_lag import TD3Lag + + +__all__ = [ + 'DDPG', + 'DDPGLag', + 'SAC', + 'SACLag', + 'SDDPG', + 'TD3', + 'TD3Lag', +] diff --git a/omnisafe/algorithms/off_policy/ddpg.py b/omnisafe/algorithms/off_policy/ddpg.py index 58ab15f12..3a85311c9 100644 --- a/omnisafe/algorithms/off_policy/ddpg.py +++ b/omnisafe/algorithms/off_policy/ddpg.py @@ -25,38 +25,40 @@ from omnisafe.common.logger import Logger from omnisafe.models.constraint_actor_q_critic import ConstraintActorQCritic from omnisafe.utils import core, distributed_utils +from omnisafe.utils.config_utils import namedtuple2dict from omnisafe.utils.tools import get_flat_params_from from omnisafe.wrappers import wrapper_registry @registry.register class DDPG: # pylint: disable=too-many-instance-attributes - """Continuous control with deep reinforcement learning (DDPG) Algorithm. + """The Deep Deterministic Policy Gradient (DDPG) algorithm. 
References: - Paper Name: Continuous control with deep reinforcement learning. - Paper author: Timothy P. Lillicrap, Jonathan J. Hunt, Alexander Pritzel, Nicolas Heess, - Tom Erez, Yuval Tassa, David Silver, Daan Wierstra. - Paper URL: https://arxiv.org/abs/1509.02971 - + Title: Continuous control with deep reinforcement learning + Authors: Timothy P. Lillicrap, Jonathan J. Hunt, Alexander Pritzel, Nicolas Heess, Tom Erez, + Yuval Tassa, David Silver, Daan Wierstra. + URL: https://arxiv.org/abs/1509.02971 """ - def __init__( - self, - env_id: str, - cfgs=None, - algo: str = 'DDPG', - wrapper_type: str = 'OffPolicyEnvWrapper', - ): - """Initialize DDPG.""" - self.env = wrapper_registry.get(wrapper_type)( + def __init__(self, env_id: str, cfgs=None) -> None: + """Initialize DDPG. + + Args: + env_id (str): Environment ID. + cfgs (dict): Configuration dictionary. + algo (str): Algorithm name. + wrapper_type (str): Wrapper type. + """ + self.cfgs = deepcopy(cfgs) + self.wrapper_type = self.cfgs.wrapper_type + self.env = wrapper_registry.get(self.wrapper_type)( env_id, use_cost=cfgs.use_cost, max_ep_len=cfgs.max_ep_len, ) self.env_id = env_id - self.algo = algo - self.cfgs = deepcopy(cfgs) + self.algo = self.__class__.__name__ # Set up for learning and rolling out schedule self.steps_per_epoch = cfgs.steps_per_epoch @@ -87,7 +89,7 @@ def __init__( # Set up logger and save configuration to disk self.logger = Logger(exp_name=cfgs.exp_name, data_dir=cfgs.data_dir, seed=cfgs.seed) - self.logger.save_config(cfgs._asdict()) + self.logger.save_config(namedtuple2dict(cfgs)) # Set seed seed = cfgs.seed + 10000 * distributed_utils.proc_id() torch.manual_seed(seed) @@ -140,6 +142,7 @@ def __init__( def set_learning_rate_scheduler(self): """Set up learning rate scheduler.""" + scheduler = None if self.cfgs.linear_lr_decay: # Linear anneal @@ -152,9 +155,8 @@ def linear_anneal(epoch): return scheduler def _init_mpi(self): - """ - Initialize MPI specifics - """ + """Initialize MPI specifics.""" + if distributed_utils.num_procs() > 1: # Avoid slowdowns from PyTorch + MPI combo distributed_utils.setup_torch_for_mpi() @@ -165,9 +167,7 @@ def _init_mpi(self): self.logger.log(f'Done! (took {time.time()-start:0.3f} sec.)') def algorithm_specific_logs(self): - """ - Use this method to collect log information. - """ + """Use this method to collect log information.""" def _ac_training_setup(self): """Set up target network for off_policy training.""" @@ -179,15 +179,9 @@ def _ac_training_setup(self): param.requires_grad = False for param in self.ac_targ.cost_critic.parameters(): param.requires_grad = False - if self.algo in ['SAC', 'TD3', 'SACLag', 'TD3Lag']: - # Freeze target networks with respect to optimizer (only update via polyak averaging) - for param in self.ac_targ.critic_.parameters(): - param.requires_grad = False def check_distributed_parameters(self): - """ - Check if parameters are synchronized across all processes. - """ + """Check if parameters are synchronized across all processes.""" if distributed_utils.num_procs() > 1: self.logger.log('Check if distributed parameters are synchronous..') modules = {'Policy': self.actor_critic.actor.net, 'Value': self.actor_critic.critic.net} @@ -198,23 +192,27 @@ def check_distributed_parameters(self): assert np.allclose(global_min, global_max), f'{key} not synced.' def compute_loss_pi(self, data: dict): - """ - computing pi/actor loss + """Computing pi/actor loss. + + Args: + data (dict): data dictionary. Returns: - torch.Tensor + torch.Tensor. 
""" action, _ = self.actor_critic.actor.predict(data['obs'], deterministic=True) - loss_pi = self.actor_critic.critic(data['obs'], action) + loss_pi = self.actor_critic.critic(data['obs'], action)[0] pi_info = {} return -loss_pi.mean(), pi_info def compute_loss_v(self, data): - """ - computing value loss + """Computing value loss. + + Args: + data (dict): data dictionary. Returns: - torch.Tensor + torch.Tensor. """ obs, act, rew, obs_next, done = ( data['obs'], @@ -223,24 +221,26 @@ def compute_loss_v(self, data): data['obs_next'], data['done'], ) - q_value = self.actor_critic.critic(obs, act) + q_value = self.actor_critic.critic(obs, act)[0] # Bellman backup for Q function with torch.no_grad(): - act_targ, _ = self.ac_targ.actor.predict(obs, deterministic=True) - q_targ = self.ac_targ.critic(obs_next, act_targ) + act_targ = self.ac_targ.actor.predict(obs, deterministic=True, need_log_prob=False) + q_targ = self.ac_targ.critic(obs_next, act_targ)[0] backup = rew + self.cfgs.gamma * (1 - done) * q_targ # MSE loss against Bellman backup loss_q = ((q_value - backup) ** 2).mean() # Useful info for logging - q_info = dict(Q1Vals=q_value.detach().numpy()) + q_info = dict(QVals=q_value.detach().numpy()) return loss_q, q_info def compute_loss_c(self, data): - """ - computing cost loss + """Computing cost loss. + + Args: + data (dict): data dictionary. Returns: - torch.Tensor + torch.Tensor. """ obs, act, cost, obs_next, done = ( data['obs'], @@ -249,12 +249,12 @@ def compute_loss_c(self, data): data['obs_next'], data['done'], ) - cost_q_value = self.actor_critic.cost_critic(obs, act) + cost_q_value = self.actor_critic.cost_critic(obs, act)[0] # Bellman backup for Q function with torch.no_grad(): - action, _ = self.ac_targ.pi.predict(obs_next, deterministic=True) - qc_targ = self.ac_targ.c(obs_next, action) + action, _ = self.ac_targ.actor.predict(obs_next, deterministic=True) + qc_targ = self.ac_targ.cost_critic(obs_next, action)[0] backup = cost + self.cfgs.gamma * (1 - done) * qc_targ # MSE loss against Bellman backup loss_qc = ((cost_q_value - backup) ** 2).mean() @@ -271,7 +271,7 @@ def learn(self): (3). log epoch/update information for visualization and terminal log print. Returns: - model and environment + model and environment. """ for steps in range(0, self.local_steps_per_epoch * self.epochs, self.update_every): @@ -314,7 +314,11 @@ def learn(self): return self.actor_critic def update(self, data): - """update""" + """Update. + + Args: + data (dict): data dictionary. + """ # First run one gradient descent step for Q. self.update_value_net(data) if self.cfgs.use_cost: @@ -327,7 +331,7 @@ def update(self, data): for param in self.actor_critic.critic.parameters(): param.requires_grad = False - # Next run one gradient descent step for pi. + # Next run one gradient descent step for actor. self.update_policy_net(data) # Unfreeze Q-network so you can optimize it at next DDPG step. @@ -342,7 +346,7 @@ def update(self, data): self.polyak_update_target() def polyak_update_target(self): - """polyak update target network""" + """Polyak update target network.""" with torch.no_grad(): for param, param_targ in zip(self.actor_critic.parameters(), self.ac_targ.parameters()): # Notes: We use an in-place operations "mul_", "add_" to update target @@ -351,7 +355,11 @@ def polyak_update_target(self): param_targ.data.add_((1 - self.cfgs.polyak) * param.data) def update_policy_net(self, data) -> None: - """update policy network""" + """Update policy network. + + Args: + data (dict): data dictionary. 
+ """ # Train policy with one steps of gradient descent self.actor_optimizer.zero_grad() loss_pi, _ = self.compute_loss_pi(data) @@ -360,16 +368,24 @@ def update_policy_net(self, data) -> None: self.logger.store(**{'Loss/Pi': loss_pi.item()}) def update_value_net(self, data: dict) -> None: - """update value network""" + """Update value network. + + Args: + data (dict): data dictionary + """ # Train value critic with one steps of gradient descent self.critic_optimizer.zero_grad() loss_q, q_info = self.compute_loss_v(data) loss_q.backward() self.critic_optimizer.step() - self.logger.store(**{'Loss/Value': loss_q.item(), 'Q1Vals': q_info['Q1Vals']}) + self.logger.store(**{'Loss/Value': loss_q.item(), 'QVals': q_info['QVals']}) def update_cost_net(self, data): - """update cost network""" + """Update cost network. + + Args: + data (dict): data dictionary. + """ # Train cost critic with one steps of gradient descent self.cost_critic_optimizer.zero_grad() loss_qc, qc_info = self.compute_loss_c(data) @@ -378,7 +394,7 @@ def update_cost_net(self, data): self.logger.store(**{'Loss/Cost': loss_qc.item(), 'QCosts': qc_info['QCosts']}) def test_agent(self): - """test agent""" + """Test agent.""" for _ in range(self.num_test_episodes): # self.env.set_rollout_cfgs(deterministic=True, rand_a=False) self.env.roll_out( @@ -391,7 +407,7 @@ def test_agent(self): ) def log(self, epoch, total_steps): - """Log info about epoch""" + """Log info about epoch.""" fps = self.cfgs.steps_per_epoch / (time.time() - self.epoch_time) # Step the actor learning rate scheduler if provided if self.scheduler and self.cfgs.linear_lr_decay: @@ -402,13 +418,13 @@ def log(self, epoch, total_steps): self.logger.log_tabular('Epoch', epoch) self.logger.log_tabular('Metrics/EpRet') - self.logger.log_tabular('Metrics/EpCosts') + self.logger.log_tabular('Metrics/EpCost') self.logger.log_tabular('Metrics/EpLen') self.logger.log_tabular('Test/EpRet') - self.logger.log_tabular('Test/EpCosts') + self.logger.log_tabular('Test/EpCost') self.logger.log_tabular('Test/EpLen') self.logger.log_tabular('Values/V', min_and_max=True) - self.logger.log_tabular('Q1Vals') + self.logger.log_tabular('QVals') if self.cfgs.use_cost: self.logger.log_tabular('Values/C', min_and_max=True) self.logger.log_tabular('QCosts') @@ -424,7 +440,7 @@ def log(self, epoch, total_steps): self.logger.log_tabular('Misc/RewScaleMean', reward_scale_mean) self.logger.log_tabular('Misc/RewScaleStddev', reward_scale_stddev) if self.cfgs.exploration_noise_anneal: - noise_std = np.exp(self.actor_critic.pi.log_std[0].item()) + noise_std = np.exp(self.actor_critic.actor.log_std[0].item()) self.logger.log_tabular('Misc/ExplorationNoiseStd', noise_std) self.algorithm_specific_logs() self.logger.log_tabular('TotalEnvSteps', total_steps) diff --git a/omnisafe/algorithms/off_policy/ddpg_lag.py b/omnisafe/algorithms/off_policy/ddpg_lag.py new file mode 100644 index 000000000..3b9fc2a3f --- /dev/null +++ b/omnisafe/algorithms/off_policy/ddpg_lag.py @@ -0,0 +1,102 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementation of the Lagrange version of the DDPG algorithm.""" + +from omnisafe.algorithms import registry +from omnisafe.algorithms.off_policy.ddpg import DDPG +from omnisafe.common.lagrange import Lagrange + + +@registry.register +class DDPGLag(DDPG, Lagrange): # pylint: disable=too-many-instance-attributes + """The Lagrange version of the DDPG Algorithm. + + References: + Title: Continuous control with deep reinforcement learning + Authors: Timothy P. Lillicrap, Jonathan J. Hunt, Alexander Pritzel, Nicolas Heess, Tom Erez, + Yuval Tassa, David Silver, Daan Wierstra. + URL: https://arxiv.org/abs/1509.02971 + """ + + def __init__(self, env_id: str, cfgs=None) -> None: + """Initialize DDPG.""" + DDPG.__init__( + self, + env_id=env_id, + cfgs=cfgs, + ) + Lagrange.__init__( + self, + cost_limit=self.cfgs.lagrange_cfgs.cost_limit, + lagrangian_multiplier_init=self.cfgs.lagrange_cfgs.lagrangian_multiplier_init, + lambda_lr=self.cfgs.lagrange_cfgs.lambda_lr, + lambda_optimizer=self.cfgs.lagrange_cfgs.lambda_optimizer, + ) + + def algorithm_specific_logs(self): + """Use this method to collect log information.""" + super().algorithm_specific_logs() + self.logger.log_tabular('Metrics/LagrangeMultiplier', self.lagrangian_multiplier.item()) + + def compute_loss_pi(self, data: dict): + """Computing pi/actor loss. + + Args: + data (dict): data from replay buffer. + + Returns: + torch.Tensor. + """ + action = self.actor_critic.actor.predict( + data['obs'], deterministic=True, need_log_prob=False + ) + loss_pi = self.actor_critic.critic(data['obs'], action)[0] + penalty = self.lambda_range_projection(self.lagrangian_multiplier).item() + loss_pi -= ( + self.lagrangian_multiplier * self.actor_critic.cost_critic(data['obs'], action)[0] + ) + loss_pi /= 1 + penalty + pi_info = {} + return -loss_pi.mean(), pi_info + + def update(self, data): + """Update.""" + Jc = data['cost'].sum().item() + self.update_lagrange_multiplier(Jc) + # First run one gradient descent step for Q. + self.update_value_net(data) + if self.cfgs.use_cost: + self.update_cost_net(data) + for param in self.actor_critic.cost_critic.parameters(): + param.requires_grad = False + + # Freeze Q-network so you don't waste computational effort + # computing gradients for it during the policy learning step. + for param in self.actor_critic.critic.parameters(): + param.requires_grad = False + + # Next run one gradient descent step for pi. + self.update_policy_net(data) + + # Unfreeze Q-network so you can optimize it at next DDPG step. + for param in self.actor_critic.critic.parameters(): + param.requires_grad = True + + if self.cfgs.use_cost: + for param in self.actor_critic.cost_critic.parameters(): + param.requires_grad = True + + # Finally, update target networks by polyak averaging. + self.polyak_update_target() diff --git a/omnisafe/algorithms/off_policy/sac.py b/omnisafe/algorithms/off_policy/sac.py new file mode 100644 index 000000000..fadf022cd --- /dev/null +++ b/omnisafe/algorithms/off_policy/sac.py @@ -0,0 +1,125 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementation of the SAC algorithm.""" + +import torch + +from omnisafe.algorithms import registry +from omnisafe.algorithms.off_policy.ddpg import DDPG + + +@registry.register +class SAC(DDPG): # pylint: disable=too-many-instance-attributes + """The Soft Actor-Critic (SAC) algorithm. + + References: + Title: Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor + Authors: Tuomas Haarnoja, Aurick Zhou, Pieter Abbeel, Sergey Levine. + URL: https://arxiv.org/abs/1801.01290 + """ + + def __init__(self, env_id: str, cfgs=None) -> None: + """Initialize SAC.""" + super().__init__( + env_id=env_id, + cfgs=cfgs, + ) + self.alpha = cfgs.alpha + self.alpha_gamma = cfgs.alpha_gamma + + # pylint: disable=too-many-locals + def compute_loss_v(self, data): + """Computing value loss. + + Args: + data (dict): data from replay buffer. + + Returns: + torch.Tensor. + """ + obs, act, rew, obs_next, done = ( + data['obs'], + data['act'], + data['rew'], + data['obs_next'], + data['done'], + ) + q_value_list = self.actor_critic.critic(obs, act) + # Bellman backup for Q function + with torch.no_grad(): + act_targ, logp_a_next = self.ac_targ.actor.predict( + obs, deterministic=False, need_log_prob=True + ) + q_targ = torch.min(torch.vstack(self.ac_targ.critic(obs_next, act_targ)), dim=0).values + backup = rew + self.cfgs.gamma * (1 - done) * (q_targ - self.alpha * logp_a_next) + # MSE loss against Bellman backup + loss_q = [] + q_values = [] + for q_value in q_value_list: + loss_q.append(torch.mean((q_value - backup) ** 2)) + q_values.append(torch.mean(q_value)) + + # Useful info for logging + q_info = dict(QVals=sum(q_values).detach().numpy()) + return sum(loss_q), q_info + + def compute_loss_pi(self, data: dict): + """Computing pi/actor loss. + + Args: + data (dict): data from replay buffer. + + Returns: + torch.Tensor. + """ + action, logp_a = self.actor_critic.actor.predict( + data['obs'], deterministic=True, need_log_prob=True + ) + loss_pi = self.actor_critic.critic(data['obs'], action)[0] - self.alpha * logp_a + pi_info = {'LogPi': logp_a.detach().numpy()} + return -loss_pi.mean(), pi_info + + def update(self, data): + """Update.""" + # First run one gradient descent step for Q. + self.update_value_net(data) + if self.cfgs.use_cost: + self.update_cost_net(data) + for param in self.actor_critic.cost_critic.parameters(): + param.requires_grad = False + + # Freeze Q-network so you don't waste computational effort + # computing gradients for it during the policy learning step. + for param in self.actor_critic.critic.parameters(): + param.requires_grad = False + + # Next run one gradient descent step for actor. + self.update_policy_net(data) + + # Unfreeze Q-network so you can optimize it at next DDPG step. + for param in self.actor_critic.critic.parameters(): + param.requires_grad = True + + if self.cfgs.use_cost: + for param in self.actor_critic.cost_critic.parameters(): + param.requires_grad = True + + # Finally, update target networks by polyak averaging. 
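+        # The soft update blends parameters as
+        #     theta_targ <- polyak * theta_targ + (1 - polyak) * theta,
+        # so with polyak close to 1 the target networks drift slowly, keeping the
+        # Bellman targets used in compute_loss_v stable.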
+ self.polyak_update_target() + self.alpha_discount() + + def alpha_discount(self): + """Alpha discount.""" + self.alpha *= self.alpha_gamma diff --git a/omnisafe/algorithms/off_policy/sac_lag.py b/omnisafe/algorithms/off_policy/sac_lag.py new file mode 100644 index 000000000..d55c626b9 --- /dev/null +++ b/omnisafe/algorithms/off_policy/sac_lag.py @@ -0,0 +1,137 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementation of the Lagrange version of the SAC algorithm.""" + +import torch + +from omnisafe.algorithms import registry +from omnisafe.algorithms.off_policy.sac import SAC +from omnisafe.common.lagrange import Lagrange + + +@registry.register +class SACLag(SAC, Lagrange): # pylint: disable=too-many-instance-attributes + """The Lagrange version of SAC algorithm. + + References: + Title: Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor + Authors: Tuomas Haarnoja, Aurick Zhou, Pieter Abbeel, Sergey Levine. + URL: https://arxiv.org/abs/1801.01290 + """ + + def __init__(self, env_id: str, cfgs=None) -> None: + """Initialize SACLag. + + Args: + env_id (str): environment id. + cfgs (dict): configuration. + algo (str): algorithm name. + wrapper_type (str): environment wrapper type. + """ + SAC.__init__( + self, + env_id=env_id, + cfgs=cfgs, + ) + Lagrange.__init__( + self, + cost_limit=self.cfgs.lagrange_cfgs.cost_limit, + lagrangian_multiplier_init=self.cfgs.lagrange_cfgs.lagrangian_multiplier_init, + lambda_lr=self.cfgs.lagrange_cfgs.lambda_lr, + lambda_optimizer=self.cfgs.lagrange_cfgs.lambda_optimizer, + ) + + def algorithm_specific_logs(self): + """Use this method to collect log information.""" + super().algorithm_specific_logs() + self.logger.log_tabular('Metrics/LagrangeMultiplier', self.lagrangian_multiplier.item()) + + def compute_loss_pi(self, data: dict): + """Computing pi/actor loss. + + Returns: + torch.Tensor. + """ + action, logp_a = self.actor_critic.actor.predict( + data['obs'], deterministic=True, need_log_prob=True + ) + loss_pi = self.actor_critic.critic(data['obs'], action)[0] - self.alpha * logp_a + penalty = self.lambda_range_projection(self.lagrangian_multiplier).item() + loss_pi -= ( + self.lagrangian_multiplier * self.actor_critic.cost_critic(data['obs'], action)[0] + ) + loss_pi /= 1 + penalty + pi_info = {} + return -loss_pi.mean(), pi_info + + def compute_loss_c(self, data): + """Computing cost loss. + + Returns: + torch.Tensor. 
+ """ + obs, act, cost, obs_next, done = ( + data['obs'], + data['act'], + data['rew'], + data['obs_next'], + data['done'], + ) + cost_q_value = self.actor_critic.cost_critic(obs, act)[0] + + # Bellman backup for Q function + with torch.no_grad(): + act_targ, logp_a_next = self.ac_targ.actor.predict( + obs_next, deterministic=False, need_log_prob=True + ) + qc_targ = self.ac_targ.cost_critic(obs_next, act_targ)[0] + backup = cost + self.cfgs.gamma * (1 - done) * (qc_targ - self.alpha * logp_a_next) + # MSE loss against Bellman backup + loss_qc = ((cost_q_value - backup) ** 2).mean() + # Useful info for logging + qc_info = dict(QCosts=cost_q_value.detach().numpy()) + + return loss_qc, qc_info + + def update(self, data): + """Update.""" + Jc = data['cost'].sum().item() + self.update_lagrange_multiplier(Jc) + # First run one gradient descent step for Q. + self.update_value_net(data) + if self.cfgs.use_cost: + self.update_cost_net(data) + for param in self.actor_critic.cost_critic.parameters(): + param.requires_grad = False + + # Freeze Q-network so you don't waste computational effort + # computing gradients for it during the policy learning step. + for param in self.actor_critic.critic.parameters(): + param.requires_grad = False + + # Next run one gradient descent step for pi. + self.update_policy_net(data) + + # Unfreeze Q-network so you can optimize it at next SAC step. + for param in self.actor_critic.critic.parameters(): + param.requires_grad = True + + if self.cfgs.use_cost: + for param in self.actor_critic.cost_critic.parameters(): + param.requires_grad = True + + # Finally, update target networks by polyak averaging. + self.polyak_update_target() + self.alpha_discount() diff --git a/omnisafe/algorithms/off_policy/sddpg.py b/omnisafe/algorithms/off_policy/sddpg.py new file mode 100644 index 000000000..29a357f27 --- /dev/null +++ b/omnisafe/algorithms/off_policy/sddpg.py @@ -0,0 +1,180 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementation of the SDDPG algorithm.""" + +import torch + +from omnisafe.algorithms import registry +from omnisafe.algorithms.off_policy.ddpg import DDPG +from omnisafe.utils import distributed_utils +from omnisafe.utils.tools import ( + conjugate_gradients, + get_flat_gradients_from, + get_flat_params_from, + set_param_values_to_model, +) + + +@registry.register +class SDDPG(DDPG): # pylint: disable=too-many-instance-attributes,invalid-name + """Implementation of the SDDPG algorithm. + + References: + Title: Lyapunov-based Safe Policy Optimization for Continuous Control + Authors: Yinlam Chow, Ofir Nachum, Aleksandra Faust, Edgar Duenez-Guzman, + Mohammad Ghavamzadeh. + URL: https://arxiv.org/abs/1901.10031 + """ + + def __init__(self, env_id: str, cfgs=None) -> None: + """Initialize SDDPG. + + Args: + env_id (str): environment id. + cfgs (dict): configurations. + algo (str): algorithm name. 
+ wrapper_type (str): environment wrapper type. + """ + super().__init__( + env_id=env_id, + cfgs=cfgs, + ) + self.beta = cfgs.beta + self.cg_damping = cfgs.cg_damping + self.cg_iters = cfgs.cg_iters + self.fvp_obs = None + self.target_kl = cfgs.target_kl + self.gamma = cfgs.gamma + self.d_init = cfgs.d_init + + def update(self, data): + """Update. + + Args: + data (dict): data dictionary. + """ + # First run one gradient descent step for Q. + self.fvp_obs = data['obs'][::4] + self.update_value_net(data) + if self.cfgs.use_cost: + self.update_cost_net(data) + for param in self.actor_critic.cost_critic.parameters(): + param.requires_grad = False + + # Freeze Q-network so you don't waste computational effort + # computing gradients for it during the policy learning step. + for param in self.actor_critic.critic.parameters(): + param.requires_grad = False + + # Next run one gradient descent step for actor. + self.update_policy_net(data) + + # Unfreeze Q-network so you can optimize it at next DDPG step. + for param in self.actor_critic.critic.parameters(): + param.requires_grad = True + + if self.cfgs.use_cost: + for param in self.actor_critic.cost_critic.parameters(): + param.requires_grad = True + + # Finally, update target networks by polyak averaging. + self.polyak_update_target() + + def Fvp(self, params): + """ + Build the Hessian-vector product based on an approximation of the KL-divergence. + For details see John Schulman's PhD thesis (pp. 40) http://joschu.net/docs/thesis.pdf + + Args: + params (torch.Tensor): parameters. + + Returns: + flat_grad_grad_kl (torch.Tensor): flat gradient of gradient of KL. + """ + self.actor_critic.actor.net.zero_grad() + q_dist = self.actor_critic.actor.get_distribution(self.fvp_obs) + with torch.no_grad(): + p_dist = self.actor_critic.actor.get_distribution(self.fvp_obs) + kl = torch.distributions.kl.kl_divergence(p_dist, q_dist).mean() + + grads = torch.autograd.grad(kl, self.actor_critic.actor.net.parameters(), create_graph=True) + flat_grad_kl = torch.cat([grad.view(-1) for grad in grads]) + + kl_p = (flat_grad_kl * params).sum() + grads = torch.autograd.grad( + kl_p, self.actor_critic.actor.net.parameters(), retain_graph=False + ) + # contiguous indicating, if the memory is contiguously stored or not + flat_grad_grad_kl = torch.cat([grad.contiguous().view(-1) for grad in grads]) + distributed_utils.mpi_avg_torch_tensor(flat_grad_grad_kl) + return flat_grad_grad_kl + params * self.cg_damping + + def compute_loss_cost_performance(self, data): + """Compute loss of cost performance. + + Args: + data (dict): data dictionary. + + Returns: + loss (torch.Tensor): loss of cost performance. + """ + # Compute loss + action, _ = self.actor_critic.actor.predict(data['obs'], deterministic=True) + loss_pi = self.actor_critic.cost_critic(data['obs'], action)[0] + pi_info = {} + return loss_pi.mean(), pi_info + + # pylint: disable=invalid-name,too-many-arguments,too-many-locals + def update_policy_net(self, data) -> None: + """Update policy network. + + Args: + data (dict): data dictionary. 
+ """ + # Train policy with one steps of gradient descent + theta_old = get_flat_params_from(self.actor_critic.actor.net) + + self.actor_optimizer.zero_grad() + loss_pi, _ = self.compute_loss_pi(data) + loss_pi.backward() + + g_flat = get_flat_gradients_from(self.actor_critic.actor.net) + g_flat *= -1 + + x = conjugate_gradients(self.Fvp, g_flat, self.cg_iters) + assert torch.isfinite(x).all() + + eps = 1.0e-8 + xHx = torch.dot(x, self.Fvp(x)) + + alpha = torch.sqrt(2 * self.target_kl / (xHx + eps)) + + self.actor_optimizer.zero_grad() + loss_cost, _ = self.compute_loss_cost_performance(data) + loss_cost.backward() + + b_flat = get_flat_gradients_from(self.actor_critic.actor.net) + d = conjugate_gradients(self.Fvp, b_flat, self.cg_iters) + dHd = torch.dot(d, self.Fvp(d)) + sHd = torch.dot(d, self.Fvp(d)) + + epsilon = (1 - self.gamma) * (self.d_init - loss_cost) + lambda_star = (-self.beta * epsilon - sHd) / (dHd + eps) + + final_step_dir = -alpha / self.beta * (self.Fvp(x) - lambda_star * self.Fvp(d)) + new_theta = theta_old + final_step_dir + set_param_values_to_model(self.actor_critic.actor.net, new_theta) + + self.logger.store(**{'Loss/Pi': loss_pi.item()}) diff --git a/omnisafe/algorithms/off_policy/td3.py b/omnisafe/algorithms/off_policy/td3.py new file mode 100644 index 000000000..c811da379 --- /dev/null +++ b/omnisafe/algorithms/off_policy/td3.py @@ -0,0 +1,71 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementation of the TD3 algorithm.""" + +import torch + +from omnisafe.algorithms import registry +from omnisafe.algorithms.off_policy.ddpg import DDPG + + +@registry.register +class TD3(DDPG): # pylint: disable=too-many-instance-attributes + """The Twin Delayed DDPG (TD3) algorithm. + + References: + Title: Addressing Function Approximation Error in Actor-Critic Methods + Authors: Scott Fujimoto, Herke van Hoof, David Meger. + URL: https://arxiv.org/abs/1802.09477 + """ + + def __init__(self, env_id: str, cfgs=None) -> None: + """Initialize DDPG.""" + super().__init__( + env_id=env_id, + cfgs=cfgs, + ) + + def compute_loss_v(self, data): + """Computing value loss. + + Args: + data (dict): data from replay buffer. + + Returns: + torch.Tensor. 
+ """ + obs, act, rew, obs_next, done = ( + data['obs'], + data['act'], + data['rew'], + data['obs_next'], + data['done'], + ) + q_value_list = self.actor_critic.critic(obs, act) + # Bellman backup for Q function + with torch.no_grad(): + act_targ = self.ac_targ.actor.predict(obs, deterministic=False, need_log_prob=False) + q_targ = torch.min(torch.vstack(self.ac_targ.critic(obs_next, act_targ)), dim=0).values + backup = rew + self.cfgs.gamma * (1 - done) * q_targ + # MSE loss against Bellman backup + loss_q = [] + q_values = [] + for q_value in q_value_list: + loss_q.append(torch.mean((q_value - backup) ** 2)) + q_values.append(torch.mean(q_value)) + + # Useful info for logging + q_info = dict(QVals=sum(q_values).detach().numpy()) + return sum(loss_q), q_info diff --git a/omnisafe/algorithms/off_policy/td3_lag.py b/omnisafe/algorithms/off_policy/td3_lag.py new file mode 100644 index 000000000..aa8bd2be0 --- /dev/null +++ b/omnisafe/algorithms/off_policy/td3_lag.py @@ -0,0 +1,108 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementation of the Lagrange version of the TD3 algorithm.""" + +from omnisafe.algorithms import registry +from omnisafe.algorithms.off_policy.td3 import TD3 +from omnisafe.common.lagrange import Lagrange + + +@registry.register +class TD3Lag(TD3, Lagrange): # pylint: disable=too-many-instance-attributes + """The Lagrange version of the TD3 algorithm + + References: + Title: Addressing Function Approximation Error in Actor-Critic Methods + Authors: Scott Fujimoto, Herke van Hoof, David Meger. + URL: https://arxiv.org/abs/1802.09477 + """ + + def __init__(self, env_id: str, cfgs=None) -> None: + """Initialize TD3. + + Args: + env_id (str): environment id. + cfgs (dict): configurations. + algo (str): algorithm name. + wrapper_type (str): environment wrapper type. + """ + TD3.__init__( + self, + env_id=env_id, + cfgs=cfgs, + ) + Lagrange.__init__( + self, + cost_limit=self.cfgs.lagrange_cfgs.cost_limit, + lagrangian_multiplier_init=self.cfgs.lagrange_cfgs.lagrangian_multiplier_init, + lambda_lr=self.cfgs.lagrange_cfgs.lambda_lr, + lambda_optimizer=self.cfgs.lagrange_cfgs.lambda_optimizer, + ) + + def algorithm_specific_logs(self): + """Use this method to collect log information.""" + super().algorithm_specific_logs() + self.logger.log_tabular('Metrics/LagrangeMultiplier', self.lagrangian_multiplier.item()) + + def compute_loss_pi(self, data: dict): + """Computing pi/actor loss. + + Args: + data (dict): data. + + Returns: + torch.Tensor. 
+ """ + action = self.actor_critic.actor.predict( + data['obs'], deterministic=True, need_log_prob=False + ) + loss_pi = self.actor_critic.critic(data['obs'], action)[0] + penalty = self.lambda_range_projection(self.lagrangian_multiplier).item() + loss_pi -= ( + self.lagrangian_multiplier * self.actor_critic.cost_critic(data['obs'], action)[0] + ) + loss_pi /= 1 + penalty + pi_info = {} + return -loss_pi.mean(), pi_info + + def update(self, data): + """Update.""" + Jc = data['cost'].sum().item() + self.update_lagrange_multiplier(Jc) + # First run one gradient descent step for Q. + self.update_value_net(data) + if self.cfgs.use_cost: + self.update_cost_net(data) + for param in self.actor_critic.cost_critic.parameters(): + param.requires_grad = False + + # Freeze Q-network so you don't waste computational effort + # computing gradients for it during the policy learning step. + for param in self.actor_critic.critic.parameters(): + param.requires_grad = False + + # Next run one gradient descent step for pi. + self.update_policy_net(data) + + # Unfreeze Q-network so you can optimize it at next TD3 step. + for param in self.actor_critic.critic.parameters(): + param.requires_grad = True + + if self.cfgs.use_cost: + for param in self.actor_critic.cost_critic.parameters(): + param.requires_grad = True + + # Finally, update target networks by polyak averaging. + self.polyak_update_target() diff --git a/omnisafe/algorithms/on_policy/__init__.py b/omnisafe/algorithms/on_policy/__init__.py index 756435c9c..c7a781f09 100644 --- a/omnisafe/algorithms/on_policy/__init__.py +++ b/omnisafe/algorithms/on_policy/__init__.py @@ -12,3 +12,40 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== +"""On-policy algorithms.""" + +from omnisafe.algorithms.on_policy import ( + base, + early_terminated, + first_order, + naive_lagrange, + pid_lagrange, + saute, + second_order, + simmer, +) +from omnisafe.algorithms.on_policy.base import PPO, TRPO, NaturalPG, PolicyGradient +from omnisafe.algorithms.on_policy.early_terminated import PPOEarlyTerminated, PPOLagEarlyTerminated +from omnisafe.algorithms.on_policy.first_order import CUP, FOCOPS +from omnisafe.algorithms.on_policy.naive_lagrange import PDO, NPGLag, PPOLag, TRPOLag +from omnisafe.algorithms.on_policy.pid_lagrange import CPPOPid, TRPOPid +from omnisafe.algorithms.on_policy.saute import PPOLagSaute, PPOSaute +from omnisafe.algorithms.on_policy.second_order import CPO, PCPO +from omnisafe.algorithms.on_policy.simmer import ( + PPOLagSimmerPid, + PPOLagSimmerQ, + PPOSimmerPid, + PPOSimmerQ, +) + + +__all__ = [ + *base.__all__, + *early_terminated.__all__, + *first_order.__all__, + *naive_lagrange.__all__, + *pid_lagrange.__all__, + *saute.__all__, + *second_order.__all__, + *simmer.__all__, +] diff --git a/omnisafe/algorithms/on_policy/base/__init__.py b/omnisafe/algorithms/on_policy/base/__init__.py new file mode 100644 index 000000000..0b8e240be --- /dev/null +++ b/omnisafe/algorithms/on_policy/base/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Basic Reinforcement Learning algorithms.""" + +from omnisafe.algorithms.on_policy.base.natural_pg import NaturalPG +from omnisafe.algorithms.on_policy.base.policy_gradient import PolicyGradient +from omnisafe.algorithms.on_policy.base.ppo import PPO +from omnisafe.algorithms.on_policy.base.trpo import TRPO + + +__all__ = [ + 'NaturalPG', + 'PolicyGradient', + 'PPO', + 'TRPO', +] diff --git a/omnisafe/algorithms/on_policy/natural_pg.py b/omnisafe/algorithms/on_policy/base/natural_pg.py similarity index 90% rename from omnisafe/algorithms/on_policy/natural_pg.py rename to omnisafe/algorithms/on_policy/base/natural_pg.py index 4b86d0ee2..6a4f0b40d 100644 --- a/omnisafe/algorithms/on_policy/natural_pg.py +++ b/omnisafe/algorithms/on_policy/base/natural_pg.py @@ -17,7 +17,7 @@ import torch from omnisafe.algorithms import registry -from omnisafe.algorithms.on_policy.policy_gradient import PolicyGradient +from omnisafe.algorithms.on_policy.base.policy_gradient import PolicyGradient from omnisafe.utils import distributed_utils from omnisafe.utils.tools import ( conjugate_gradients, @@ -32,31 +32,18 @@ class NaturalPG(PolicyGradient): """The Natural Policy Gradient algorithm. References: - Paper Name: A Natural Policy Gradient. - Paper author: Sham Kakade. - Paper URL: https://proceedings.neurips.cc/paper/2001/file/4b86abe48d358ecf194c56c69108433e-Paper.pdf - + Title: A Natural Policy Gradient + Author: Sham Kakade. + URL: https://proceedings.neurips.cc/paper/2001/file/4b86abe48d358ecf194c56c69108433e-Paper.pdf """ - def __init__( - self, - env_id, - cfgs, - algo: str = 'NaturalPolicyGradient', - wrapper_type: str = 'OnPolicyEnvWrapper', - ): - super().__init__( - env_id=env_id, - cfgs=cfgs, - algo=algo, - wrapper_type=wrapper_type, - ) + def __init__(self, env_id, cfgs) -> None: + super().__init__(env_id=env_id, cfgs=cfgs) self.cg_damping = cfgs.cg_damping self.cg_iters = cfgs.cg_iters self.target_kl = cfgs.target_kl self.fvp_obs = cfgs.fvp_obs - # pylint: disable-next=too-many-arguments,unused-argument def search_step_size(self, step_dir): """ NPG use full step_size diff --git a/omnisafe/algorithms/on_policy/policy_gradient.py b/omnisafe/algorithms/on_policy/base/policy_gradient.py similarity index 96% rename from omnisafe/algorithms/on_policy/policy_gradient.py rename to omnisafe/algorithms/on_policy/base/policy_gradient.py index 0c84eb44b..a4c62b577 100644 --- a/omnisafe/algorithms/on_policy/policy_gradient.py +++ b/omnisafe/algorithms/on_policy/base/policy_gradient.py @@ -25,6 +25,7 @@ from omnisafe.common.logger import Logger from omnisafe.models.constraint_actor_critic import ConstraintActorCritic from omnisafe.utils import core, distributed_utils +from omnisafe.utils.config_utils import namedtuple2dict from omnisafe.utils.tools import get_flat_params_from from omnisafe.wrappers import wrapper_registry @@ -34,21 +35,13 @@ class PolicyGradient: # pylint: disable=too-many-instance-attributes """The Policy Gradient algorithm. 
References: - Paper Name: Policy Gradient Methods for Reinforcement Learning with Function Approximation - Paper Author: Richard S. Sutton, David McAllester, Satinder Singh, Yishay Mansour - Paper URL: https://proceedings.neurips.cc/paper/1999/file/464d828b85b0bed98e80ade0a5c43b0f-Paper.pdf - + Title: Policy Gradient Methods for Reinforcement Learning with Function Approximation + Authors: Richard S. Sutton, David McAllester, Satinder Singh, Yishay Mansour. + URL: https://proceedings.neurips.cc/paper/1999/file/464d828b85b0bed98e80ade0a5c43b0f-Paper.pdf """ - # pylint: disable-next=too-many-locals - def __init__( - self, - env_id, - cfgs=None, - algo: str = 'PolicyGradient', - wrapper_type: str = 'OnPolicyEnvWrapper', - ) -> None: - r"""Initialize the algorithm. + def __init__(self, env_id, cfgs=None) -> None: + """Initialize the algorithm. Args: env: The environment. @@ -57,9 +50,12 @@ def __init__( cfgs: (default: :const:`None`) This is a dictionary of the algorithm hyper-parameters. """ - self.env = wrapper_registry.get(wrapper_type)(env_id) - self.algo = algo + self.algo = self.__class__.__name__ self.cfgs = deepcopy(cfgs) + self.wrapper_type = self.cfgs.wrapper_type + self.env = wrapper_registry.get(self.wrapper_type)( + env_id, cfgs=namedtuple2dict(self.cfgs).get('env_cfgs') + ) assert self.cfgs.steps_per_epoch % distributed_utils.num_procs() == 0 self.local_steps_per_epoch = cfgs.steps_per_epoch // distributed_utils.num_procs() @@ -72,7 +68,7 @@ def __init__( # Set up logger and save configuration to disk self.logger = Logger(exp_name=cfgs.exp_name, data_dir=cfgs.data_dir, seed=cfgs.seed) - self.logger.save_config(cfgs._asdict()) + self.logger.save_config(namedtuple2dict(cfgs)) # Set seed seed = int(cfgs.seed) + 10000 * distributed_utils.proc_id() torch.manual_seed(seed) diff --git a/omnisafe/algorithms/on_policy/base/ppo.py b/omnisafe/algorithms/on_policy/base/ppo.py new file mode 100644 index 000000000..d2e651554 --- /dev/null +++ b/omnisafe/algorithms/on_policy/base/ppo.py @@ -0,0 +1,136 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementation of the PPO algorithm.""" + +import torch + +from omnisafe.algorithms import registry +from omnisafe.algorithms.on_policy.base.policy_gradient import PolicyGradient +from omnisafe.utils import distributed_utils + + +@registry.register +class PPO(PolicyGradient): + """The Proximal Policy Optimization (PPO) algorithm. + + References: + Title: Proximal Policy Optimization Algorithms + Authors: John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, Oleg Klimov. 
+ URL: https://arxiv.org/abs/1707.06347 + """ + + def __init__(self, env_id, cfgs) -> None: + """Initialize PPO.""" + self.clip = cfgs.clip + super().__init__( + env_id=env_id, + cfgs=cfgs, + ) + + def compute_loss_pi(self, data: dict): + """Compute policy loss.""" + dist, _log_p = self.actor_critic.actor(data['obs'], data['act']) + # Importance ratio + ratio = torch.exp(_log_p - data['log_p']) + ratio_clip = torch.clamp(ratio, 1 - self.clip, 1 + self.clip) + loss_pi = -(torch.min(ratio * data['adv'], ratio_clip * data['adv'])).mean() + loss_pi += self.cfgs.entropy_coef * dist.entropy().mean() + + # Useful extra info + approx_kl = (0.5 * (dist.mean - data['act']) ** 2 / dist.stddev**2).mean().item() + ent = dist.entropy().mean().item() + pi_info = dict(kl=approx_kl, ent=ent, ratio=ratio_clip.mean().item()) + + return loss_pi, pi_info + + def slice_data(self, data) -> dict: + """slice data for mini batch update""" + + slice_data = [] + obs = data['obs'] + act = data['act'] + target_v = data['target_v'] + log_p = data['log_p'] + adv = data['adv'] + discounted_ret = data['discounted_ret'] + cost_adv = data['cost_adv'] + target_v = data['target_v'] + batch_size = self.cfgs.batch_size + for i in range(int(len(obs) / batch_size)): + slice_data.append( + { + 'obs': obs[i * batch_size : (i + 1) * batch_size], + 'act': act[i * batch_size : (i + 1) * batch_size], + 'target_v': target_v[i * batch_size : (i + 1) * batch_size], + 'log_p': log_p[i * batch_size : (i + 1) * batch_size], + 'adv': adv[i * batch_size : (i + 1) * batch_size], + 'discounted_ret': discounted_ret[i * batch_size : (i + 1) * batch_size], + 'cost_adv': cost_adv[i * batch_size : (i + 1) * batch_size], + } + ) + + return slice_data + + def update_policy_net(self, data) -> None: + """update policy network""" + + # Slice data for mini batch update + slice_data = self.slice_data(data) + + # Get prob. 
distribution before updates: used to measure KL distance + with torch.no_grad(): + self.p_dist = self.actor_critic.actor(slice_data[0]['obs']) + + # Get loss and info values before update + pi_l_old, _ = self.compute_loss_pi(data=slice_data[0]) + loss_pi_before = pi_l_old.item() + + # Train policy with multiple steps of gradient descent + for i in range(self.cfgs.actor_iters): + for batch_data in slice_data: + self.actor_optimizer.zero_grad() + loss_pi, pi_info = self.compute_loss_pi(data=batch_data) + loss_pi.backward() + # Apply L2 norm + if self.cfgs.use_max_grad_norm: + torch.nn.utils.clip_grad_norm_( + self.actor_critic.actor.parameters(), self.cfgs.max_grad_norm + ) + + # Average grads across MPI processes + distributed_utils.mpi_avg_grads(self.actor_critic.actor.net) + self.actor_optimizer.step() + + q_dist = self.actor_critic.actor(batch_data['obs']) + torch_kl = torch.distributions.kl.kl_divergence(self.p_dist, q_dist).mean().item() + + if self.cfgs.kl_early_stopping: + # Average KL for consistent early stopping across processes + if distributed_utils.mpi_avg(torch_kl) > self.cfgs.target_kl: + self.logger.log(f'Reached ES criterion after {i+1} steps.') + break + + # Track when policy iteration is stopped; Log changes from update + self.logger.store( + **{ + 'Loss/Loss_pi': loss_pi.item(), + 'Loss/Delta_loss_pi': loss_pi.item() - loss_pi_before, + 'Train/StopIter': i + 1, + 'Values/Adv': data['adv'].numpy(), + 'Train/Entropy': pi_info['ent'], + 'Train/KL': torch_kl, + 'Train/PolicyRatio': pi_info['ratio'], + } + ) diff --git a/omnisafe/algorithms/on_policy/trpo.py b/omnisafe/algorithms/on_policy/base/trpo.py similarity index 91% rename from omnisafe/algorithms/on_policy/trpo.py rename to omnisafe/algorithms/on_policy/base/trpo.py index 630a7dcd8..2ae594094 100644 --- a/omnisafe/algorithms/on_policy/trpo.py +++ b/omnisafe/algorithms/on_policy/base/trpo.py @@ -17,7 +17,7 @@ import torch from omnisafe.algorithms import registry -from omnisafe.algorithms.on_policy.natural_pg import NaturalPG +from omnisafe.algorithms.on_policy.base.natural_pg import NaturalPG from omnisafe.utils import distributed_utils from omnisafe.utils.tools import ( conjugate_gradients, @@ -29,27 +29,16 @@ @registry.register class TRPO(NaturalPG): - """The Trust Region Policy Optimization (TRPO) Algorithm. + """The Trust Region Policy Optimization (TRPO) algorithm. References: - Paper Name: Trust Region Policy Optimization. - Paper author: John Schulman, Sergey Levine, Philipp Moritz, Michael I. Jordan, Pieter Abbeel. - Paper URL: https://arxiv.org/abs/1502.05477 + Title: Trust Region Policy Optimization + Authors: John Schulman, Sergey Levine, Philipp Moritz, Michael I. Jordan, Pieter Abbeel. + URL: https://arxiv.org/abs/1502.05477 """ - def __init__( - self, - env_id, - cfgs, - algo='TRPO', - wrapper_type: str = 'OnPolicyEnvWrapper', - ): - super().__init__( - env_id=env_id, - cfgs=cfgs, - algo=algo, - wrapper_type=wrapper_type, - ) + def __init__(self, env_id, cfgs) -> None: + super().__init__(env_id=env_id, cfgs=cfgs) # pylint: disable-next=too-many-arguments,too-many-locals,arguments-differ def search_step_size( @@ -62,7 +51,7 @@ def search_step_size( total_steps=15, decay=0.8, ): - r"""TRPO performs line-search until constraint satisfaction. + """TRPO performs line-search until constraint satisfaction. 
search around for a satisfied step of policy update to improve loss and reward performance diff --git a/omnisafe/algorithms/on_policy/early_terminated/__init__.py b/omnisafe/algorithms/on_policy/early_terminated/__init__.py new file mode 100644 index 000000000..457ca0b3e --- /dev/null +++ b/omnisafe/algorithms/on_policy/early_terminated/__init__.py @@ -0,0 +1,26 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Early terminated algorithms.""" + +from omnisafe.algorithms.on_policy.early_terminated.ppo_early_terminated import PPOEarlyTerminated +from omnisafe.algorithms.on_policy.early_terminated.ppo_lag_early_terminated import ( + PPOLagEarlyTerminated, +) + + +__all__ = [ + 'PPOEarlyTerminated', + 'PPOLagEarlyTerminated', +] diff --git a/omnisafe/algorithms/on_policy/early_terminated/ppo_early_terminated.py b/omnisafe/algorithms/on_policy/early_terminated/ppo_early_terminated.py new file mode 100644 index 000000000..aa62ec519 --- /dev/null +++ b/omnisafe/algorithms/on_policy/early_terminated/ppo_early_terminated.py @@ -0,0 +1,33 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementation of the early terminated algorithm using PPO.""" + +from omnisafe.algorithms import registry +from omnisafe.algorithms.on_policy.base.ppo import PPO + + +@registry.register +class PPOEarlyTerminated(PPO): + """The early terminated algorithm implemented with PPO. + + References: + Title: Safe Exploration by Solving Early Terminated MDP + Authors: Hao Sun, Ziping Xu, Meng Fang, Zhenghao Peng, Jiadong Guo, Bo Dai, Bolei Zhou. + URL: https://arxiv.org/abs/2107.04200 + """ + + def __init__(self, env_id, cfgs) -> None: + """Initialize PPO_Earyly_Terminated.""" + super().__init__(env_id=env_id, cfgs=cfgs) diff --git a/omnisafe/algorithms/on_policy/early_terminated/ppo_lag_early_terminated.py b/omnisafe/algorithms/on_policy/early_terminated/ppo_lag_early_terminated.py new file mode 100644 index 000000000..e0827fffa --- /dev/null +++ b/omnisafe/algorithms/on_policy/early_terminated/ppo_lag_early_terminated.py @@ -0,0 +1,33 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementation of the Lagrange version of the early terminated algorithm using PPOLag.""" + +from omnisafe.algorithms import registry +from omnisafe.algorithms.on_policy.naive_lagrange.ppo_lag import PPOLag + + +@registry.register +class PPOLagEarlyTerminated(PPOLag): + """The Lagrange version of the early terminated algorithm implemented with PPOLag. + + References: + Title: Safe Exploration by Solving Early Terminated MDP + Authors: Hao Sun, Ziping Xu, Meng Fang, Zhenghao Peng, Jiadong Guo, Bo Dai, Bolei Zhou. + URL: https://arxiv.org/abs/2107.04200 + """ + + def __init__(self, env_id, cfgs) -> None: + """Initialize PPO_Lag_Earyly_Terminated.""" + super().__init__(env_id=env_id, cfgs=cfgs) diff --git a/omnisafe/algorithms/on_policy/first_order/__init__.py b/omnisafe/algorithms/on_policy/first_order/__init__.py new file mode 100644 index 000000000..3c0bde0ea --- /dev/null +++ b/omnisafe/algorithms/on_policy/first_order/__init__.py @@ -0,0 +1,24 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""First-order algorithms.""" + +from omnisafe.algorithms.on_policy.first_order.cup import CUP +from omnisafe.algorithms.on_policy.first_order.focops import FOCOPS + + +__all__ = [ + 'CUP', + 'FOCOPS', +] diff --git a/omnisafe/algorithms/on_policy/cup.py b/omnisafe/algorithms/on_policy/first_order/cup.py similarity index 93% rename from omnisafe/algorithms/on_policy/cup.py rename to omnisafe/algorithms/on_policy/first_order/cup.py index bbaa32541..0a70cc2c0 100644 --- a/omnisafe/algorithms/on_policy/cup.py +++ b/omnisafe/algorithms/on_policy/first_order/cup.py @@ -17,45 +17,36 @@ import torch from omnisafe.algorithms import registry -from omnisafe.algorithms.on_policy.policy_gradient import PolicyGradient +from omnisafe.algorithms.on_policy.base.policy_gradient import PolicyGradient from omnisafe.common.lagrange import Lagrange from omnisafe.utils import distributed_utils @registry.register class CUP(PolicyGradient, Lagrange): - """The Constrained Update Projection Approach to Safe Policy Optimization. + """The Constrained Update Projection (CUP) Approach to Safe Policy Optimization. References: - Paper Name: Constrained Update Projection Approach to Safe Policy Optimization. - Paper author: Long Yang, Jiaming Ji, Juntao Dai, Linrui Zhang, Binbin Zhou, Pengfei Li, Yaodong Yang, Gang Pan. 
- Paper URL: https://arxiv.org/abs/2209.07089 - + Title: Constrained Update Projection Approach to Safe Policy Optimization + Authors: Long Yang, Jiaming Ji, Juntao Dai, Linrui Zhang, Binbin Zhou, Pengfei Li, + Yaodong Yang, Gang Pan. + URL: https://arxiv.org/abs/2209.07089 """ - def __init__( - self, - env_id, - cfgs, - algo='CUP', - wrapper_type: str = 'OnPolicyEnvWrapper', - ): + def __init__(self, env_id, cfgs) -> None: r"""The :meth:`init` function.""" - PolicyGradient.__init__( self, env_id=env_id, cfgs=cfgs, - algo=algo, - wrapper_type=wrapper_type, ) - Lagrange.__init__( self, cost_limit=self.cfgs.lagrange_cfgs.cost_limit, lagrangian_multiplier_init=self.cfgs.lagrange_cfgs.lagrangian_multiplier_init, lambda_lr=self.cfgs.lagrange_cfgs.lambda_lr, lambda_optimizer=self.cfgs.lagrange_cfgs.lambda_optimizer, + lagrangian_upper_bound=self.cfgs.lagrange_cfgs.lagrangian_upper_bound, ) self.lam = self.cfgs.lam self.eta = self.cfgs.eta @@ -65,7 +56,7 @@ def __init__( def algorithm_specific_logs(self): super().algorithm_specific_logs() - self.logger.log_tabular('Metrics/LagrangeMultiplier', self.lagrangian_multiplier) + self.logger.log_tabular('Metrics/LagrangeMultiplier', self.lagrangian_multiplier.item()) self.logger.log_tabular('Train/MaxRatio', self.max_ratio) self.logger.log_tabular('Train/MinRatio', self.min_ratio) diff --git a/omnisafe/algorithms/on_policy/focops.py b/omnisafe/algorithms/on_policy/first_order/focops.py similarity index 93% rename from omnisafe/algorithms/on_policy/focops.py rename to omnisafe/algorithms/on_policy/first_order/focops.py index 1b80e1962..1efff805a 100644 --- a/omnisafe/algorithms/on_policy/focops.py +++ b/omnisafe/algorithms/on_policy/first_order/focops.py @@ -17,7 +17,7 @@ import torch from omnisafe.algorithms import registry -from omnisafe.algorithms.on_policy.policy_gradient import PolicyGradient +from omnisafe.algorithms.on_policy.base.policy_gradient import PolicyGradient from omnisafe.common.lagrange import Lagrange from omnisafe.utils import distributed_utils @@ -27,29 +27,18 @@ class FOCOPS(PolicyGradient, Lagrange): """The First Order Constrained Optimization in Policy Space (FOCOPS) algorithm. References: - Paper Name: First Order Constrained Optimization in Policy Space. - Paper author: Yiming Zhang, Quan Vuong, Keith W. Ross. - Paper URL: https://arxiv.org/abs/2002.06506 - + Title: First Order Constrained Optimization in Policy Space + Authors: Yiming Zhang, Quan Vuong, Keith W. Ross. + URL: https://arxiv.org/abs/2002.06506 """ - def __init__( - self, - env_id, - cfgs, - algo='FOCOPS', - wrapper_type: str = 'OnPolicyEnvWrapper', - ): + def __init__(self, env_id, cfgs) -> None: r"""The :meth:`init` function.""" - PolicyGradient.__init__( self, env_id=env_id, cfgs=cfgs, - algo=algo, - wrapper_type=wrapper_type, ) - Lagrange.__init__( self, cost_limit=self.cfgs.lagrange_cfgs.cost_limit, diff --git a/omnisafe/algorithms/on_policy/naive_lagrange/__init__.py b/omnisafe/algorithms/on_policy/naive_lagrange/__init__.py new file mode 100644 index 000000000..018b41197 --- /dev/null +++ b/omnisafe/algorithms/on_policy/naive_lagrange/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Naive Lagrange algorithms.""" + +from omnisafe.algorithms.on_policy.naive_lagrange.npg_lag import NPGLag +from omnisafe.algorithms.on_policy.naive_lagrange.pdo import PDO +from omnisafe.algorithms.on_policy.naive_lagrange.ppo_lag import PPOLag +from omnisafe.algorithms.on_policy.naive_lagrange.trpo_lag import TRPOLag + + +__all__ = [ + 'NPGLag', + 'PDO', + 'PPOLag', + 'TRPOLag', +] diff --git a/omnisafe/algorithms/on_policy/npg_lag.py b/omnisafe/algorithms/on_policy/naive_lagrange/npg_lag.py similarity index 85% rename from omnisafe/algorithms/on_policy/npg_lag.py rename to omnisafe/algorithms/on_policy/naive_lagrange/npg_lag.py index aa10d0b8a..323173ee8 100644 --- a/omnisafe/algorithms/on_policy/npg_lag.py +++ b/omnisafe/algorithms/on_policy/naive_lagrange/npg_lag.py @@ -12,38 +12,28 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of the Lagrange version of Natural Policy Gradient algorithm.""" +"""Implementation of the Lagrange version of the Natural Policy Gradient algorithm.""" import torch from omnisafe.algorithms import registry -from omnisafe.algorithms.on_policy.natural_pg import NaturalPG +from omnisafe.algorithms.on_policy.base.natural_pg import NaturalPG from omnisafe.common.lagrange import Lagrange @registry.register class NPGLag(NaturalPG, Lagrange): - """The Lagrange version of Natural Policy Gradient algorithm. - - A simple combination of Lagrange method and Natural Policy Gradient algorithm. + """The Lagrange version of the Natural Policy Gradient algorithm. + A simple combination of the Lagrange method and the Natural Policy Gradient algorithm. """ - def __init__( - self, - env_id, - cfgs, - algo: str = 'NPG-Lag', - wrapper_type: str = 'OnPolicyEnvWrapper', - ): + def __init__(self, env_id, cfgs) -> None: """initialize""" - NaturalPG.__init__( self, env_id=env_id, cfgs=cfgs, - algo=algo, - wrapper_type=wrapper_type, ) Lagrange.__init__( self, diff --git a/omnisafe/algorithms/on_policy/pdo.py b/omnisafe/algorithms/on_policy/naive_lagrange/pdo.py similarity index 87% rename from omnisafe/algorithms/on_policy/pdo.py rename to omnisafe/algorithms/on_policy/naive_lagrange/pdo.py index 633dbebe7..85d7d7416 100644 --- a/omnisafe/algorithms/on_policy/pdo.py +++ b/omnisafe/algorithms/on_policy/naive_lagrange/pdo.py @@ -17,32 +17,23 @@ import torch from omnisafe.algorithms import registry -from omnisafe.algorithms.on_policy.policy_gradient import PolicyGradient +from omnisafe.algorithms.on_policy.base.policy_gradient import PolicyGradient from omnisafe.common.lagrange import Lagrange @registry.register class PDO(PolicyGradient, Lagrange): - """The Lagrange version of Policy Gradient algorithm. - - A simple combination of Lagrange method and Policy Gradient algorithm. + """The Lagrange version of the Policy Gradient algorithm. + A simple combination of the Lagrange method and the Policy Gradient algorithm. 
""" - def __init__( - self, - env_id, - cfgs, - algo='PDO', - wrapper_type: str = 'OnPolicyEnvWrapper', - ): + def __init__(self, env_id, cfgs) -> None: """initialization""" PolicyGradient.__init__( self, env_id=env_id, cfgs=cfgs, - algo=algo, - wrapper_type=wrapper_type, ) Lagrange.__init__( self, diff --git a/omnisafe/algorithms/on_policy/ppo_lag.py b/omnisafe/algorithms/on_policy/naive_lagrange/ppo_lag.py similarity index 81% rename from omnisafe/algorithms/on_policy/ppo_lag.py rename to omnisafe/algorithms/on_policy/naive_lagrange/ppo_lag.py index 5ce205555..f120c0463 100644 --- a/omnisafe/algorithms/on_policy/ppo_lag.py +++ b/omnisafe/algorithms/on_policy/naive_lagrange/ppo_lag.py @@ -12,43 +12,32 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of the Lagrange version of PPO algorithm.""" +"""Implementation of the Lagrange version of the PPO algorithm.""" import torch from omnisafe.algorithms import registry -from omnisafe.algorithms.on_policy.policy_gradient import PolicyGradient +from omnisafe.algorithms.on_policy.base.policy_gradient import PolicyGradient from omnisafe.common.lagrange import Lagrange @registry.register class PPOLag(PolicyGradient, Lagrange): - """The Lagrange version of PPO algorithm. + """The Lagrange version of the PPO algorithm. References: - Paper Name: Benchmarking Safe Exploration in Deep Reinforcement Learning. - Paper author: Alex Ray, Joshua Achiam, Dario Amodei - Paper URL: https://cdn.openai.com/safexp-short.pdf - + Title: Benchmarking Safe Exploration in Deep Reinforcement Learning + Authors: Alex Ray, Joshua Achiam, Dario Amodei. + URL: https://cdn.openai.com/safexp-short.pdf """ - # pylint: disable-next=too-many-arguments - def __init__( - self, - env_id, - cfgs, - algo='PPO-Lag', - clip=0.2, - wrapper_type: str = 'OnPolicyEnvWrapper', - ): + def __init__(self, env_id, cfgs) -> None: """Initialize PPO-Lag algorithm.""" - self.clip = clip + self.clip = cfgs.clip PolicyGradient.__init__( self, env_id=env_id, cfgs=cfgs, - algo=algo, - wrapper_type=wrapper_type, ) Lagrange.__init__( self, diff --git a/omnisafe/algorithms/on_policy/trpo_lag.py b/omnisafe/algorithms/on_policy/naive_lagrange/trpo_lag.py similarity index 83% rename from omnisafe/algorithms/on_policy/trpo_lag.py rename to omnisafe/algorithms/on_policy/naive_lagrange/trpo_lag.py index e3282863f..d6e1c2df3 100644 --- a/omnisafe/algorithms/on_policy/trpo_lag.py +++ b/omnisafe/algorithms/on_policy/naive_lagrange/trpo_lag.py @@ -12,40 +12,32 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of the Lagrange version of TRPO algorithm.""" +"""Implementation of the Lagrange version of the TRPO algorithm.""" import torch from omnisafe.algorithms import registry -from omnisafe.algorithms.on_policy.trpo import TRPO +from omnisafe.algorithms.on_policy.base.trpo import TRPO from omnisafe.common.lagrange import Lagrange @registry.register class TRPOLag(TRPO, Lagrange): - """The Lagrange version of TRPO algorithm. + """The Lagrange version of the TRPO algorithm. References: - Paper Name: Benchmarking Safe Exploration in Deep Reinforcement Learning. 
- Paper author: Alex Ray, Joshua Achiam, Dario Amodei - Paper URL: https://cdn.openai.com/safexp-short.pdf + Title: Benchmarking Safe Exploration in Deep Reinforcement Learning + Authors: Alex Ray, Joshua Achiam, Dario Amodei. + URL: https://cdn.openai.com/safexp-short.pdf """ - def __init__( - self, - env_id, - cfgs, - algo: str = 'TRPO-Lag', - wrapper_type: str = 'OnPolicyEnvWrapper', - ): + def __init__(self, env_id, cfgs) -> None: """initialize""" TRPO.__init__( self, env_id=env_id, cfgs=cfgs, - algo=algo, - wrapper_type=wrapper_type, ) Lagrange.__init__( self, diff --git a/omnisafe/algorithms/on_policy/pid_lagrange/__init__.py b/omnisafe/algorithms/on_policy/pid_lagrange/__init__.py new file mode 100644 index 000000000..aef2373d6 --- /dev/null +++ b/omnisafe/algorithms/on_policy/pid_lagrange/__init__.py @@ -0,0 +1,24 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""PID Lagrange algorithms.""" + +from omnisafe.algorithms.on_policy.pid_lagrange.cppo_pid import CPPOPid +from omnisafe.algorithms.on_policy.pid_lagrange.trpo_pid import TRPOPid + + +__all__ = [ + 'CPPOPid', + 'TRPOPid', +] diff --git a/omnisafe/algorithms/on_policy/cppo_pid.py b/omnisafe/algorithms/on_policy/pid_lagrange/cppo_pid.py similarity index 78% rename from omnisafe/algorithms/on_policy/cppo_pid.py rename to omnisafe/algorithms/on_policy/pid_lagrange/cppo_pid.py index 85581a066..9a4db984e 100644 --- a/omnisafe/algorithms/on_policy/cppo_pid.py +++ b/omnisafe/algorithms/on_policy/pid_lagrange/cppo_pid.py @@ -12,45 +12,36 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of the CPPO Pid-Lagrange algorithm.""" +"""Implementation of the PID-Lagrange version of the CPPO algorithm.""" import torch from omnisafe.algorithms import registry -from omnisafe.algorithms.on_policy.policy_gradient import PolicyGradient +from omnisafe.algorithms.on_policy.base.policy_gradient import PolicyGradient from omnisafe.common.pid_lagrange import PIDLagrangian +from omnisafe.utils.config_utils import namedtuple2dict @registry.register class CPPOPid(PolicyGradient, PIDLagrangian): - """The Responsive Safety in Reinforcement Learning by PID Lagrangian Methods. + """The PID-Lagrange version of the CPPO algorithm. References: - Paper Name: Responsive Safety in Reinforcement Learning by PID Lagrangian Methods. - Paper author: Joshua Achiam, David Held, Aviv Tamar, Pieter Abbeel. - Paper URL: https://arxiv.org/abs/1705.10528 - + Title: Responsive Safety in Reinforcement Learning by PID Lagrangian Methods + Authors: Joshua Achiam, David Held, Aviv Tamar, Pieter Abbeel. 
+ URL: https://arxiv.org/abs/2007.03964 """ - def __init__( - self, - env_id, - cfgs, - algo: str = 'CPPO-PID', - wrapper_type: str = 'OnPolicyEnvWrapper', - ): - + def __init__(self, env_id, cfgs) -> None: PolicyGradient.__init__( self, env_id=env_id, cfgs=cfgs, - algo=algo, - wrapper_type=wrapper_type, ) - PIDLagrangian.__init__(self, **self.cfgs.PID_cfgs._asdict()) + PIDLagrangian.__init__(self, **namedtuple2dict(self.cfgs.PID_cfgs)) self.clip = self.cfgs.clip - self.cost_limit = self.cfgs.cost_limit + self.cost_limit = self.cfgs.PID_cfgs.cost_limit def algorithm_specific_logs(self): super().algorithm_specific_logs() diff --git a/omnisafe/algorithms/on_policy/pid_lagrange/trpo_pid.py b/omnisafe/algorithms/on_policy/pid_lagrange/trpo_pid.py new file mode 100644 index 000000000..9996f838a --- /dev/null +++ b/omnisafe/algorithms/on_policy/pid_lagrange/trpo_pid.py @@ -0,0 +1,84 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementation of the PID-Lagrange version of the TRPO algorithm.""" + +import torch + +from omnisafe.algorithms import registry +from omnisafe.algorithms.on_policy.base.trpo import TRPO +from omnisafe.common.pid_lagrange import PIDLagrangian +from omnisafe.utils.config_utils import namedtuple2dict + + +@registry.register +class TRPOPid(TRPO, PIDLagrangian): + """The PID-Lagrange version of the TRPO algorithm. + + References: + Title: Responsive Safety in Reinforcement Learning by PID Lagrangian Methods + Authors: Joshua Achiam, David Held, Aviv Tamar, Pieter Abbeel. 
+ URL: https://arxiv.org/abs/2007.03964 + """ + + def __init__(self, env_id, cfgs) -> None: + TRPO.__init__( + self, + env_id=env_id, + cfgs=cfgs, + ) + PIDLagrangian.__init__(self, **namedtuple2dict(self.cfgs.PID_cfgs)) + self.cost_limit = self.cfgs.cost_limit + + def algorithm_specific_logs(self): + super().algorithm_specific_logs() + self.logger.log_tabular('Metrics/LagrangeMultiplier', self.cost_penalty) + self.logger.log_tabular('PID/pid_Kp', self.pid_kp) + self.logger.log_tabular('PID/pid_Ki', self.pid_ki) + self.logger.log_tabular('PID/pid_Kd', self.pid_kd) + + def compute_loss_pi(self, data: dict): + """compute loss for policy""" + dist, _log_p = self.actor_critic.actor(data['obs'], data['act']) + ratio = torch.exp(_log_p - data['log_p']) + + # Compute loss via ratio and advantage + loss_pi = -(ratio * data['adv']).mean() + loss_pi -= self.cfgs.entropy_coef * dist.entropy().mean() + + penalty = self.cost_penalty + loss_pi += penalty * (ratio * data['cost_adv']).mean() + loss_pi /= 1 + penalty + + # Useful extra info + approx_kl = 0.5 * (data['log_p'] - _log_p).mean().item() + ent = dist.entropy().mean().item() + pi_info = dict(kl=approx_kl, ent=ent, ratio=ratio.mean().item()) + + return loss_pi, pi_info + + def update(self): + """update policy""" + raw_data, data = self.buf.pre_process_data() + # sub-sampling accelerates calculations + self.fvp_obs = data['obs'][::4] + # Note that logger already uses MPI statistics across all processes.. + ep_costs = self.logger.get_stats('Metrics/EpCost')[0] + # First update Lagrange multiplier parameter + self.pid_update(ep_costs) + # now update policy and value network + self.update_policy_net(data=data) + self.update_value_net(data=data) + self.update_cost_net(data=data) + return raw_data, data diff --git a/omnisafe/algorithms/on_policy/ppo.py b/omnisafe/algorithms/on_policy/ppo.py deleted file mode 100644 index e24335344..000000000 --- a/omnisafe/algorithms/on_policy/ppo.py +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright 2022 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Implementation of the PPO algorithm.""" - -import torch - -from omnisafe.algorithms import registry -from omnisafe.algorithms.on_policy.policy_gradient import PolicyGradient - - -@registry.register -class PPO(PolicyGradient): - """The Proximal Policy Optimization Algorithms (PPO) Algorithm. - - References: - Paper Name: Proximal Policy Optimization Algorithms. - Paper author: John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, Oleg Klimov. 
- Paper URL: https://arxiv.org/pdf/1707.06347.pdf - """ - - # pylint: disable-next=too-many-arguments - def __init__( - self, - env_id, - cfgs, - algo='ppo', - clip=0.2, - wrapper_type: str = 'OnPolicyEnvWrapper', - ): - """Initialize PPO.""" - self.clip = clip - super().__init__( - env_id=env_id, - cfgs=cfgs, - algo=algo, - wrapper_type=wrapper_type, - ) - - def compute_loss_pi(self, data: dict): - """Compute policy loss.""" - dist, _log_p = self.actor_critic.actor(data['obs'], data['act']) - # Importance ratio - ratio = torch.exp(_log_p - data['log_p']) - ratio_clip = torch.clamp(ratio, 1 - self.clip, 1 + self.clip) - loss_pi = -(torch.min(ratio * data['adv'], ratio_clip * data['adv'])).mean() - loss_pi += self.cfgs.entropy_coef * dist.entropy().mean() - - # Useful extra info - approx_kl = (0.5 * (dist.mean - data['act']) ** 2 / dist.stddev**2).mean().item() - ent = dist.entropy().mean().item() - pi_info = dict(kl=approx_kl, ent=ent, ratio=ratio_clip.mean().item()) - - return loss_pi, pi_info diff --git a/omnisafe/algorithms/on_policy/saute/__init__.py b/omnisafe/algorithms/on_policy/saute/__init__.py new file mode 100644 index 000000000..65e0a5087 --- /dev/null +++ b/omnisafe/algorithms/on_policy/saute/__init__.py @@ -0,0 +1,24 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Saute algorithms.""" + +from omnisafe.algorithms.on_policy.saute.ppo_lag_saute import PPOLagSaute +from omnisafe.algorithms.on_policy.saute.ppo_saute import PPOSaute + + +__all__ = [ + 'PPOLagSaute', + 'PPOSaute', +] diff --git a/omnisafe/algorithms/on_policy/saute/ppo_lag_saute.py b/omnisafe/algorithms/on_policy/saute/ppo_lag_saute.py new file mode 100644 index 000000000..180e48d5f --- /dev/null +++ b/omnisafe/algorithms/on_policy/saute/ppo_lag_saute.py @@ -0,0 +1,38 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementation of the Lagrange version of the Saute algorithm using PPOLag.""" + +from omnisafe.algorithms import registry +from omnisafe.algorithms.on_policy.naive_lagrange.ppo_lag import PPOLag + + +@registry.register +class PPOLagSaute(PPOLag): + """The Saute algorithm implemented with PPOLag. 
+ + References: + Title: Saute RL: Almost Surely Safe Reinforcement Learning Using State Augmentation + Authors: Aivar Sootla, Alexander I. Cowen-Rivers, Taher Jafferjee, Ziyan Wang, David Mguni, + Jun Wang, Haitham Bou-Ammar. + URL: https://arxiv.org/abs/2202.06558 + """ + + def __init__(self, env_id, cfgs) -> None: + """Initialize PPOLagSaute.""" + super().__init__(env_id=env_id, cfgs=cfgs) + + def algorithm_specific_logs(self): + super().algorithm_specific_logs() + self.logger.log_tabular('Metrics/EpBudget') diff --git a/omnisafe/algorithms/on_policy/saute/ppo_saute.py b/omnisafe/algorithms/on_policy/saute/ppo_saute.py new file mode 100644 index 000000000..91e8286f6 --- /dev/null +++ b/omnisafe/algorithms/on_policy/saute/ppo_saute.py @@ -0,0 +1,38 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementation of the Saute algorithm.""" + +from omnisafe.algorithms import registry +from omnisafe.algorithms.on_policy.base.ppo import PPO + + +@registry.register +class PPOSaute(PPO): + """The Saute algorithm implemented with PPO. + + References: + Title: Saute RL: Almost Surely Safe Reinforcement Learning Using State Augmentation + Authors: Aivar Sootla, Alexander I. Cowen-Rivers, Taher Jafferjee, Ziyan Wang, David Mguni, + Jun Wang, Haitham Bou-Ammar. + URL: https://arxiv.org/abs/2202.06558 + """ + + def __init__(self, env_id, cfgs) -> None: + """Initialize PPOSaute.""" + super().__init__(env_id=env_id, cfgs=cfgs) + + def algorithm_specific_logs(self): + super().algorithm_specific_logs() + self.logger.log_tabular('Metrics/EpBudget') diff --git a/omnisafe/algorithms/on_policy/second_order/__init__.py b/omnisafe/algorithms/on_policy/second_order/__init__.py new file mode 100644 index 000000000..236f34808 --- /dev/null +++ b/omnisafe/algorithms/on_policy/second_order/__init__.py @@ -0,0 +1,24 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Second-order algorithms.""" + +from omnisafe.algorithms.on_policy.second_order.cpo import CPO +from omnisafe.algorithms.on_policy.second_order.pcpo import PCPO + + +__all__ = [ + 'CPO', + 'PCPO', +] diff --git a/omnisafe/algorithms/on_policy/cpo.py b/omnisafe/algorithms/on_policy/second_order/cpo.py similarity index 96% rename from omnisafe/algorithms/on_policy/cpo.py rename to omnisafe/algorithms/on_policy/second_order/cpo.py index 207bd0abc..1d465cb6b 100644 --- a/omnisafe/algorithms/on_policy/cpo.py +++ b/omnisafe/algorithms/on_policy/second_order/cpo.py @@ -18,7 +18,7 @@ import torch from omnisafe.algorithms import registry -from omnisafe.algorithms.on_policy.trpo import TRPO +from omnisafe.algorithms.on_policy.base.trpo import TRPO from omnisafe.utils import distributed_utils from omnisafe.utils.tools import ( conjugate_gradients, @@ -30,28 +30,16 @@ @registry.register class CPO(TRPO): - """The Constrained Policy Optimization (CPO) Algorithm. + """The Constrained Policy Optimization (CPO) algorithm. References: - Paper Name: Constrained Policy Optimization. - Paper author: Joshua Achiam, David Held, Aviv Tamar, Pieter Abbeel. - Paper URL: https://arxiv.org/abs/1705.10528 - + Title: Constrained Policy Optimization + Authors: Joshua Achiam, David Held, Aviv Tamar, Pieter Abbeel. + URL: https://arxiv.org/abs/1705.10528 """ - def __init__( - self, - env_id, - cfgs, - algo='CPO', - wrapper_type: str = 'OnPolicyEnvWrapper', - ): - super().__init__( - env_id=env_id, - cfgs=cfgs, - algo=algo, - wrapper_type=wrapper_type, - ) + def __init__(self, env_id, cfgs) -> None: + super().__init__(env_id=env_id, cfgs=cfgs) self.cost_limit = cfgs.cost_limit self.loss_pi_cost_before = 0.0 diff --git a/omnisafe/algorithms/on_policy/pcpo.py b/omnisafe/algorithms/on_policy/second_order/pcpo.py similarity index 94% rename from omnisafe/algorithms/on_policy/pcpo.py rename to omnisafe/algorithms/on_policy/second_order/pcpo.py index edef0fdf4..e952341af 100644 --- a/omnisafe/algorithms/on_policy/pcpo.py +++ b/omnisafe/algorithms/on_policy/second_order/pcpo.py @@ -17,7 +17,7 @@ import torch from omnisafe.algorithms import registry -from omnisafe.algorithms.on_policy.trpo import TRPO +from omnisafe.algorithms.on_policy.base.trpo import TRPO from omnisafe.utils import distributed_utils from omnisafe.utils.tools import ( conjugate_gradients, @@ -32,25 +32,13 @@ class PCPO(TRPO): """The Projection-Based Constrained Policy Optimization (PCPO) algorithm. References: - Paper name: Projection-Based Constrained Policy Optimization. - Paper author: Tsung-Yen Yang, Justinian Rosca, Karthik Narasimhan, Peter J. Ramadge - Paper URL: https://arxiv.org/abs/2010.03152 - + Title: Projection-Based Constrained Policy Optimization + Authors: Tsung-Yen Yang, Justinian Rosca, Karthik Narasimhan, Peter J. Ramadge. 
+ URL: https://arxiv.org/abs/2010.03152 """ - def __init__( - self, - env_id, - cfgs, - algo='PCPO', - wrapper_type: str = 'OnPolicyEnvWrapper', - ): - super().__init__( - env_id=env_id, - cfgs=cfgs, - algo=algo, - wrapper_type=wrapper_type, - ) + def __init__(self, env_id, cfgs) -> None: + super().__init__(env_id=env_id, cfgs=cfgs) self.cost_limit = self.cfgs.cost_limit # pylint: disable-next=too-many-locals,too-many-arguments diff --git a/omnisafe/algorithms/on_policy/simmer/__init__.py b/omnisafe/algorithms/on_policy/simmer/__init__.py new file mode 100644 index 000000000..55fddc516 --- /dev/null +++ b/omnisafe/algorithms/on_policy/simmer/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Simmer algorithms.""" + +from omnisafe.algorithms.on_policy.simmer.ppo_lag_simmer_pid import PPOLagSimmerPid +from omnisafe.algorithms.on_policy.simmer.ppo_lag_simmer_q import PPOLagSimmerQ +from omnisafe.algorithms.on_policy.simmer.ppo_simmer_pid import PPOSimmerPid +from omnisafe.algorithms.on_policy.simmer.ppo_simmer_q import PPOSimmerQ + + +__all__ = [ + 'PPOLagSimmerPid', + 'PPOLagSimmerQ', + 'PPOSimmerPid', + 'PPOSimmerQ', +] diff --git a/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_pid.py b/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_pid.py new file mode 100644 index 000000000..cd89f093c --- /dev/null +++ b/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_pid.py @@ -0,0 +1,39 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementation of the PID version of the Simmer algorithm using PPOLag.""" + +from omnisafe.algorithms import registry +from omnisafe.algorithms.on_policy.naive_lagrange.ppo_lag import PPOLag + + +@registry.register +class PPOLagSimmerPid(PPOLag): + """The PID version of the Simmer algorithm implemented with PPOLag. + + References: + Title: Effects of Safety State Augmentation on Safe Exploration + Authors: Aivar Sootla, Alexander I. Cowen-Rivers, Jun Wang, Haitham Bou Ammar. 
+ URL: https://arxiv.org/abs/2206.02675 + """ + + def __init__(self, env_id, cfgs) -> None: + """Initialize PPOLagSimmerPid algorithm.""" + super().__init__(env_id=env_id, cfgs=cfgs) + + def algorithm_specific_logs(self): + """Log the algorithm specific metrics.""" + super().algorithm_specific_logs() + self.logger.log_tabular('Metrics/EpBudget') + self.logger.log_tabular('Metrics/SafetyBudget') diff --git a/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_q.py b/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_q.py new file mode 100644 index 000000000..d06fe068f --- /dev/null +++ b/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_q.py @@ -0,0 +1,39 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementation of the Q Simmer algorithm using PPOLag.""" + +from omnisafe.algorithms import registry +from omnisafe.algorithms.on_policy.naive_lagrange.ppo_lag import PPOLag + + +@registry.register +class PPOLagSimmerQ(PPOLag): + """The Q Simmer algorithm implemented with PPOLag. + + References: + Title: Effects of Safety State Augmentation on Safe Exploration + Authors: Aivar Sootla, Alexander I. Cowen-Rivers, Jun Wang, Haitham Bou Ammar. + URL: https://arxiv.org/abs/2206.02675 + """ + + def __init__(self, env_id, cfgs) -> None: + """Initialize PPOLagSimmerQ algorithm.""" + super().__init__(env_id=env_id, cfgs=cfgs) + + def algorithm_specific_logs(self): + """Log the algorithm specific metrics.""" + super().algorithm_specific_logs() + self.logger.log_tabular('Metrics/EpBudget') + self.logger.log_tabular('Metrics/SafetyBudget') diff --git a/omnisafe/algorithms/on_policy/simmer/ppo_simmer_pid.py b/omnisafe/algorithms/on_policy/simmer/ppo_simmer_pid.py new file mode 100644 index 000000000..de4c4ed76 --- /dev/null +++ b/omnisafe/algorithms/on_policy/simmer/ppo_simmer_pid.py @@ -0,0 +1,39 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementation of the PID version of the Simmer algorithm using PPO.""" + +from omnisafe.algorithms import registry +from omnisafe.algorithms.on_policy.base.ppo import PPO + + +@registry.register +class PPOSimmerPid(PPO): + """The PID version of the Simmer algorithm implemented with PPO. 
+ + References: + Title: Effects of Safety State Augmentation on Safe Exploration + Authors: Aivar Sootla, Alexander I. Cowen-Rivers, Jun Wang, Haitham Bou Ammar. + URL: https://arxiv.org/abs/2206.02675 + """ + + def __init__(self, env_id, cfgs) -> None: + """Initialize PPOSimmerPid.""" + super().__init__(env_id=env_id, cfgs=cfgs) + + def algorithm_specific_logs(self): + """Log the algorithm specific metrics.""" + super().algorithm_specific_logs() + self.logger.log_tabular('Metrics/EpBudget') + self.logger.log_tabular('Metrics/SafetyBudget') diff --git a/omnisafe/algorithms/on_policy/simmer/ppo_simmer_q.py b/omnisafe/algorithms/on_policy/simmer/ppo_simmer_q.py new file mode 100644 index 000000000..7bd50a0bb --- /dev/null +++ b/omnisafe/algorithms/on_policy/simmer/ppo_simmer_q.py @@ -0,0 +1,38 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementation of the Q Simmer algorithm using PPO.""" + +from omnisafe.algorithms import registry +from omnisafe.algorithms.on_policy.base.ppo import PPO + + +@registry.register +class PPOSimmerQ(PPO): + """The Q Simmer algorithm implemented with PPO. + + References: + Title: Effects of Safety State Augmentation on Safe Exploration + Authors: Aivar Sootla, Alexander I. Cowen-Rivers, Jun Wang, Haitham Bou Ammar. + URL: https://arxiv.org/abs/2206.02675 + """ + + def __init__(self, env_id, cfgs) -> None: + """Initialize PPOSimmerQ.""" + super().__init__(env_id=env_id, cfgs=cfgs) + + def algorithm_specific_logs(self): + super().algorithm_specific_logs() + self.logger.log_tabular('Metrics/EpBudget') + self.logger.log_tabular('Metrics/SafetyBudget') diff --git a/omnisafe/algorithms/registry.py b/omnisafe/algorithms/registry.py index 464307373..72f39dc19 100644 --- a/omnisafe/algorithms/registry.py +++ b/omnisafe/algorithms/registry.py @@ -19,6 +19,7 @@ class Registry: """A registry to map strings to classes. + Args: name (str): Registry name. 
""" @@ -28,10 +29,9 @@ def __init__(self, name): self._module_dict = {} def __repr__(self): - format_str = ( - self.__class__.__name__ + f'(name={self._name}, items={list(self._module_dict.keys())})' + return ( + f'{self.__class__.__name__ }(name={self._name}, items={list(self._module_dict.keys())})' ) - return format_str @property def name(self): diff --git a/omnisafe/common/pid_lagrange.py b/omnisafe/common/pid_lagrange.py index 02d3cb9c5..4a04338be 100644 --- a/omnisafe/common/pid_lagrange.py +++ b/omnisafe/common/pid_lagrange.py @@ -34,7 +34,7 @@ def __init__( sum_norm: bool, diff_norm: bool, penalty_max: int, - lagrangian_multiplier_init: 0.001, + lagrangian_multiplier_init: float, cost_limit: int, ): """init""" diff --git a/omnisafe/configs/off-policy/DDPG.yaml b/omnisafe/configs/off-policy/DDPG.yaml index 48c76ae4c..5abb1f71e 100644 --- a/omnisafe/configs/off-policy/DDPG.yaml +++ b/omnisafe/configs/off-policy/DDPG.yaml @@ -14,50 +14,93 @@ # ============================================================================== defaults: - # Basic Configurations - epochs: 100 - steps_per_epoch: 4000 + # --------------------------------------Basic Configurations----------------------------------- # + ## -----------------------------Basic configurations for base class DDPG---------------------- ## + # The random seed + seed: 0 + # The environment wrapper type + wrapper_type: OffPolicyEnvWrapper + # Number of epochs + epochs: 500 + # Number of steps per epoch + steps_per_epoch: 6000 + # Update after `update_after` steps update_after: 1000 + # Update every `update_every` steps update_every: 50 + # Check if all models own the same parameter values every `check_freq` epoch check_freq: 25 + # Save model to disk every `check_freq` epochs save_freq: 10 + # The max length of per epoch max_ep_len: 1000 + # The number of test episodes num_test_episodes: 10 - actor_lr: 0.001 + # The learning rate of Actor network + actor_lr: 0.0003 + # The learning rate of Critic network critic_lr: 0.001 + # The soft update coefficient polyak: 0.999 + # The discount factor of GAE gamma: 0.99 + # Actor perdorm random action before `start_steps` steps start_steps: 10000 + # The Address for saving training process data data_dir: "./runs" - seed: 0 - Env_cfgs: - start_step: 1000 - update_every: 100 - # Optional Configuration - ## Whether to use cost critic - use_cost: False + + # ---------------------------------------Optional Configuration-------------------------------- # + ## -----------------------------------Configuration For Cost Critic--------------------------- ## + # Whether to use cost critic + use_cost: True + # Cost discounted factor + cost_gamma: 1.0 + # Whther to use linear decay of learning rate linear_lr_decay: False + # Whether to use exploration noise anneal exploration_noise_anneal: False + # Whther to use reward penalty reward_penalty: False + # Whether to use KL early stopping + kl_early_stopping: False + # Whether to use max gradient norm use_max_grad_norm: False + # The thereshold of max gradient norm max_grad_norm: 0.5 + # Whether to use reward scaling scale_rewards: False + # Whether to use standardized observation standardized_obs: True - + ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: - pi_type: "dire" + # Whether to share the weight of Actor network with Critic network shared_weights: False + # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
weight_initialization_mode: "kaiming_uniform" + # Configuration of Actor and Critic network ac_kwargs: + # Configuration of Actor network pi: + # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" + actor_type: "dire" + # The standard deviation of Gaussian noise act_noise: 0.1 + # Size of hidden layers hidden_sizes: [400, 300] + # Activation function activation: relu + # Configuration of Critic network val: + # Number of critic networks + num_critics: 1 + # Size of hidden layers hidden_sizes: [400, 300] + # Activation function activation: relu - ## Configuration For Buffer + ## --------------------------------------Configuration For Buffer----------------------------- ## replay_buffer_cfgs: + # The size of replay buffer size: 50000 + # The size of batch batch_size: 256 diff --git a/omnisafe/configs/off-policy/DDPGLag.yaml b/omnisafe/configs/off-policy/DDPGLag.yaml new file mode 100644 index 000000000..9e948fc3b --- /dev/null +++ b/omnisafe/configs/off-policy/DDPGLag.yaml @@ -0,0 +1,116 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +defaults: + # --------------------------------------Basic Configurations----------------------------------- # + ## ----------------------------Basic configurations for base class DDPG----------------------- ## + # The random seed + seed: 0 + # The environment wrapper type + wrapper_type: OffPolicyEnvWrapper + # Number of epochs + epochs: 500 + # Number of steps per epoch + steps_per_epoch: 6000 + # Update after `update_after` steps + update_after: 1000 + # Update every `update_every` steps + update_every: 50 + # Check if all models own the same parameter values every `check_freq` epoch + check_freq: 25 + # Save model to disk every `check_freq` epochs + save_freq: 10 + # The max length of per epoch + max_ep_len: 1000 + # The number of test episodes + num_test_episodes: 10 + # The learning rate of Actor network + actor_lr: 0.0003 + # The learning rate of Critic network + critic_lr: 0.001 + # The soft update coefficient + polyak: 0.999 + # The discount factor of GAE + gamma: 0.99 + # Actor perdorm random action before `start_steps` steps + start_steps: 10000 + # The Address for saving training process data + data_dir: "./runs" + + + # ---------------------------------------Optional Configuration-------------------------------- # + ## -----------------------------------Configuration For Cost Critic--------------------------- ## + # Whether to use cost critic + use_cost: True + # Cost discounted factor + cost_gamma: 1.0 + # Whther to use linear decay of learning rate + linear_lr_decay: False + # Whether to use exploration noise anneal + exploration_noise_anneal: False + # Whther to use reward penalty + reward_penalty: False + # Whether to use KL early stopping + kl_early_stopping: False + # Whether to use max gradient norm + use_max_grad_norm: False + # The 
thereshold of max gradient norm + max_grad_norm: 0.5 + # Whether to use reward scaling + scale_rewards: False + # Whether to use standardized observation + standardized_obs: True + ## ---------------------------------------Configuration For Model----------------------------- ## + model_cfgs: + # Whether to share the weight of Actor network with Critic network + shared_weights: False + # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". + weight_initialization_mode: "kaiming_uniform" + # Configuration of Actor and Critic network + ac_kwargs: + # Configuration of Actor network + pi: + # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" + actor_type: "dire" + # The standard deviation of Gaussian noise + act_noise: 0.1 + # Size of hidden layers + hidden_sizes: [400, 300] + # Activation function + activation: relu + # Configuration of Critic network + val: + # Number of critic networks + num_critics: 1 + # Size of hidden layers + hidden_sizes: [400, 300] + # Activation function + activation: relu + ## --------------------------------------Configuration For Buffer------------------------------- ## + replay_buffer_cfgs: + # The size of replay buffer + size: 50000 + # The size of batch + batch_size: 256 +## ----------------------------------Configuration For Lagrangian multiplier---------------------- ## + lagrange_cfgs: + # Tolerance of constraint violation + cost_limit: 25.0 + # Initial value of lagrangian multiplier + lagrangian_multiplier_init: 0.001 + # Learning rate of lagrangian multiplier + lambda_lr: 0.01 + # Type of lagrangian optimizer + lambda_optimizer: "Adam" diff --git a/omnisafe/configs/off-policy/SAC.yaml b/omnisafe/configs/off-policy/SAC.yaml new file mode 100644 index 000000000..736203daf --- /dev/null +++ b/omnisafe/configs/off-policy/SAC.yaml @@ -0,0 +1,110 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
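The `lagrange_cfgs` block in DDPGLag.yaml above configures a plain Lagrangian relaxation: `lagrangian_multiplier_init` seeds the multiplier, `lambda_lr` and `lambda_optimizer` drive its gradient-ascent update, and `cost_limit` is the constraint threshold. A minimal PyTorch sketch of that update, assuming a scalar multiplier and a mean episode-cost signal (illustrative only, not the helper class used by this patch):

    import torch

    # Sketch of the naive Lagrangian update configured by `lagrange_cfgs`
    # (illustrative; OmniSafe's helper may differ in detail).
    cost_limit = 25.0
    lagrangian_multiplier = torch.nn.Parameter(torch.tensor(0.001))  # lagrangian_multiplier_init
    lambda_optimizer = torch.optim.Adam([lagrangian_multiplier], lr=0.01)  # lambda_lr

    def update_lagrange_multiplier(mean_ep_cost: float) -> None:
        """Gradient-ascent step: grow the multiplier while costs exceed the limit."""
        lambda_optimizer.zero_grad()
        # Minimizing -lambda * (Jc - d) increases lambda when Jc > d, shrinks it otherwise.
        lambda_loss = -lagrangian_multiplier * (mean_ep_cost - cost_limit)
        lambda_loss.backward()
        lambda_optimizer.step()
        # Keep the multiplier non-negative.
        with torch.no_grad():
            lagrangian_multiplier.clamp_(min=0.0)

    update_lagrange_multiplier(mean_ep_cost=30.0)  # cost above the limit -> lambda increases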
+# ============================================================================== + +defaults: + # --------------------------------------Basic Configurations----------------------------------- # + ## ----------------------------Basic configurations for base class DDPG----------------------- ## + # The random seed + seed: 0 + # The environment wrapper type + wrapper_type: OffPolicyEnvWrapper + # Number of epochs + epochs: 500 + # Number of steps per epoch + steps_per_epoch: 6000 + # Update after `update_after` steps + update_after: 1000 + # Update every `update_every` steps + update_every: 50 + # Check if all models own the same parameter values every `check_freq` epoch + check_freq: 25 + # Save model to disk every `check_freq` epochs + save_freq: 10 + # The max length of per epoch + max_ep_len: 1000 + # The number of test episodes + num_test_episodes: 10 + # The learning rate of Actor network + actor_lr: 0.0003 + # The learning rate of Critic network + critic_lr: 0.001 + # The soft update coefficient + polyak: 0.999 + # The discount factor of GAE + gamma: 0.99 + # Actor perdorm random action before `start_steps` steps + start_steps: 10000 + # The Address for saving training process data + data_dir: "./runs" + ## ---------------------------Basic configurations for derived class SAC---------------------- ## + # The entropy coefficient + alpha: 0.2 + # The learning rate of Alpha + alpha_gamma: 0.99 + + # ---------------------------------------Optional Configuration-------------------------------- # + ## -----------------------------------Configuration For Cost Critic--------------------------- ## + # Whether to use cost critic + use_cost: False + # Cost discounted factor + cost_gamma: 1.0 + # Whther to use linear decay of learning rate + linear_lr_decay: False + # Whether to use exploration noise anneal + exploration_noise_anneal: False + # Whther to use reward penalty + reward_penalty: False + # Whether to use KL early stopping + kl_early_stopping: False + # Whether to use max gradient norm + use_max_grad_norm: False + # The thereshold of max gradient norm + max_grad_norm: 0.5 + # Whether to use reward scaling + scale_rewards: False + # Whether to use standardized observation + standardized_obs: True + ## ---------------------------------------Configuration For Model----------------------------- ## + model_cfgs: + # Whether to share the weight of Actor network with Critic network + shared_weights: False + # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
+ weight_initialization_mode: "kaiming_uniform" + # Configuration of Actor and Critic network + ac_kwargs: + # Configuration of Actor network + pi: + # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" + actor_type: "gaussian_stdnet" + # The standard deviation of Gaussian noise + act_noise: 0.1 + # Size of hidden layers + hidden_sizes: [400, 300] + # Activation function + activation: relu + # Configuration of Critic network + val: + # Number of critic networks + num_critics: 2 + # Size of hidden layers + hidden_sizes: [400, 300] + # Activation function + activation: relu + ## --------------------------------------Configuration For Buffer----------------------------- ## + replay_buffer_cfgs: + # The size of replay buffer + size: 50000 + # The size of batch + batch_size: 256 diff --git a/omnisafe/configs/off-policy/SACLag.yaml b/omnisafe/configs/off-policy/SACLag.yaml new file mode 100644 index 000000000..27e541c4e --- /dev/null +++ b/omnisafe/configs/off-policy/SACLag.yaml @@ -0,0 +1,119 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +defaults: + # --------------------------------------Basic Configurations----------------------------------- # + ## ----------------------------Basic configurations for base class DDPG----------------------- ## + # The random seed + seed: 0 + # The environment wrapper type + wrapper_type: OffPolicyEnvWrapper + # Number of epochs + epochs: 500 + # Number of steps per epoch + steps_per_epoch: 6000 + # Update after `update_after` steps + update_after: 1000 + # Update every `update_every` steps + update_every: 50 + # Check if all models own the same parameter values every `check_freq` epoch + check_freq: 25 + # Save model to disk every `check_freq` epochs + save_freq: 10 + # The max length of per epoch + max_ep_len: 1000 + # The number of test episodes + num_test_episodes: 10 + # The learning rate of Actor network + actor_lr: 0.0003 + # The learning rate of Critic network + critic_lr: 0.001 + # The soft update coefficient + polyak: 0.999 + # The discount factor of GAE + gamma: 0.99 + # Actor perdorm random action before `start_steps` steps + start_steps: 10000 + # The Address for saving training process data + data_dir: "./runs" + ## ---------------------------Basic configurations for derived class SAC---------------------- ## + # The entropy coefficient + alpha: 0.2 + # The learning rate of Alpha + alpha_gamma: 0.99 + # ---------------------------------------Optional Configuration-------------------------------- # + ## -----------------------------------Configuration For Cost Critic--------------------------- ## + # Whether to use cost critic + use_cost: True + # Cost discounted factor + cost_gamma: 1.0 + # Whther to use linear decay of learning rate + linear_lr_decay: False + # Whether to use exploration noise anneal + exploration_noise_anneal: False + # 
Whther to use reward penalty + reward_penalty: False + # Whether to use KL early stopping + kl_early_stopping: False + # Whether to use max gradient norm + use_max_grad_norm: False + # The thereshold of max gradient norm + max_grad_norm: 0.5 + # Whether to use reward scaling + scale_rewards: False + # Whether to use standardized observation + standardized_obs: True + ## ---------------------------------------Configuration For Model----------------------------- ## + model_cfgs: + # Whether to share the weight of Actor network with Critic network + shared_weights: False + # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". + weight_initialization_mode: "kaiming_uniform" + # Configuration of Actor and Critic network + ac_kwargs: + # Configuration of Actor network + pi: + # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" + actor_type: "gaussian_stdnet" + # The standard deviation of Gaussian noise + act_noise: 0.1 + # Size of hidden layers + hidden_sizes: [400, 300] + # Activation function + activation: relu + # Configuration of Critic network + val: + # Number of critic networks + num_critics: 2 + # Size of hidden layers + hidden_sizes: [400, 300] + # Activation function + activation: relu + ## --------------------------------------Configuration For Buffer----------------------------- ## + replay_buffer_cfgs: + # The size of replay buffer + size: 50000 + # The size of batch + batch_size: 256 +## ----------------------------------Configuration For Lagrangian multiplier---------------------- ## + lagrange_cfgs: + # Tolerance of constraint violation + cost_limit: 25.0 + # Initial value of lagrangian multiplier + lagrangian_multiplier_init: 0.001 + # Learning rate of lagrangian multiplier + lambda_lr: 0.01 + # Type of lagrangian optimizer + lambda_optimizer: "Adam" diff --git a/omnisafe/configs/off-policy/SDDPG.yaml b/omnisafe/configs/off-policy/SDDPG.yaml new file mode 100644 index 000000000..2fa6c5819 --- /dev/null +++ b/omnisafe/configs/off-policy/SDDPG.yaml @@ -0,0 +1,116 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
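In the SAC.yaml and SACLag.yaml files above, `alpha` is the entropy temperature, `alpha_gamma` its decay rate, and `num_critics: 2` enables double Q-learning. A rough sketch of where a fixed temperature enters the actor loss and the soft Bellman target, assuming a reparameterized actor that returns an action and its log-probability (placeholder callables, not the modules added by this patch):

    import torch

    # Sketch of the entropy-regularized SAC objectives driven by `alpha`
    # (illustrative; the callables below are placeholders, not OmniSafe API).
    def sac_actor_loss(actor, critic1, critic2, obs, alpha=0.2):
        """Actor loss: maximize the pessimistic Q while keeping the policy stochastic."""
        action, log_prob = actor(obs)  # reparameterized sample and log pi(a|s)
        q_value = torch.min(critic1(obs, action), critic2(obs, action))
        return (alpha * log_prob - q_value).mean()

    def sac_soft_target(reward, done, next_obs, actor, target_c1, target_c2,
                        alpha=0.2, gamma=0.99):
        """Soft Bellman target: the entropy bonus is folded into the bootstrapped value."""
        with torch.no_grad():
            next_action, next_log_prob = actor(next_obs)
            next_q = torch.min(target_c1(next_obs, next_action),
                               target_c2(next_obs, next_action))
            return reward + gamma * (1.0 - done) * (next_q - alpha * next_log_prob)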
+# ==============================================================================
+
+defaults:
+  # --------------------------------------Basic Configurations----------------------------------- #
+  ## ----------------------------Basic configurations for base class DDPG----------------------- ##
+  # The random seed
+  seed: 0
+  # The environment wrapper type
+  wrapper_type: OffPolicyEnvWrapper
+  # Number of epochs
+  epochs: 500
+  # Number of steps per epoch
+  steps_per_epoch: 6000
+  # Update after `update_after` steps
+  update_after: 1000
+  # Update every `update_every` steps
+  update_every: 200
+  # Check if all models own the same parameter values every `check_freq` epochs
+  check_freq: 25
+  # Save model to disk every `save_freq` epochs
+  save_freq: 10
+  # The max length of each episode
+  max_ep_len: 1000
+  # The number of test episodes
+  num_test_episodes: 10
+  # The learning rate of Actor network
+  actor_lr: 0.0003
+  # The learning rate of Critic network
+  critic_lr: 0.001
+  # The soft update coefficient
+  polyak: 0.999
+  # The reward discount factor
+  gamma: 0.99
+  # Actor performs random actions before `start_steps` steps
+  start_steps: 10000
+  # The directory for saving training process data
+  data_dir: "./runs"
+
+  ## ---------------------------Basic configurations for derived class SDDPG-------------------- ##
+  # The normalization coefficient
+  beta: 1.5
+  # The damping coefficient for conjugate gradient
+  cg_damping: 0.1
+  # The max number of conjugate gradient iterations
+  cg_iters: 10
+  # The constraint on KL divergence
+  target_kl: 0.01
+  # Hyperparameter for SDDPG
+  d_init: 5
+  # ---------------------------------------Optional Configuration-------------------------------- #
+  ## -----------------------------------Configuration For Cost Critic--------------------------- ##
+  # Whether to use cost critic
+  use_cost: True
+  # Cost discount factor
+  cost_gamma: 1.0
+  # Whether to use linear decay of learning rate
+  linear_lr_decay: False
+  # Whether to use exploration noise anneal
+  exploration_noise_anneal: False
+  # Whether to use reward penalty
+  reward_penalty: False
+  # Whether to use KL early stopping
+  kl_early_stopping: False
+  # Whether to use max gradient norm
+  use_max_grad_norm: False
+  # The threshold of max gradient norm
+  max_grad_norm: 0.5
+  # Whether to use reward scaling
+  scale_rewards: False
+  # Whether to use standardized observation
+  standardized_obs: True
+  ## ---------------------------------------Configuration For Model----------------------------- ##
+  model_cfgs:
+    # Whether to share the weight of Actor network with Critic network
+    shared_weights: False
+    # The mode to initialize the network weights, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal".
+ weight_initialization_mode: "kaiming_uniform" + # Configuration of Actor and Critic network + ac_kwargs: + # Configuration of Actor network + pi: + # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" + actor_type: "dire" + # The standard deviation of Gaussian noise + act_noise: 0.1 + # Size of hidden layers + hidden_sizes: [400, 300] + # Activation function + activation: relu + # Configuration of Critic network + val: + # Number of critic networks + num_critics: 1 + # Size of hidden layers + hidden_sizes: [400, 300] + # Activation function + activation: relu + ## --------------------------------------Configuration For Buffer----------------------------- ## + replay_buffer_cfgs: + # The size of replay buffer + size: 50000 + # The size of batch + batch_size: 256 diff --git a/omnisafe/configs/off-policy/TD3.yaml b/omnisafe/configs/off-policy/TD3.yaml new file mode 100644 index 000000000..d7e38e746 --- /dev/null +++ b/omnisafe/configs/off-policy/TD3.yaml @@ -0,0 +1,104 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +defaults: + # --------------------------------------Basic Configurations----------------------------------- # + ## ----------------------------Basic configurations for base class DDPG----------------------- ## + # The random seed + seed: 0 + # The environment wrapper type + wrapper_type: OffPolicyEnvWrapper + # Number of epochs + epochs: 500 + # Number of steps per epoch + steps_per_epoch: 6000 + # Update after `update_after` steps + update_after: 1000 + # Update every `update_every` steps + update_every: 50 + # Check if all models own the same parameter values every `check_freq` epoch + check_freq: 25 + # Save model to disk every `check_freq` epochs + save_freq: 10 + # The max length of per epoch + max_ep_len: 1000 + # The number of test episodes + num_test_episodes: 10 + # The learning rate of Actor network + actor_lr: 0.0003 + # The learning rate of Critic network + critic_lr: 0.001 + # The soft update coefficient + polyak: 0.999 + # The discount factor of GAE + gamma: 0.99 + # Actor perdorm random action before `start_steps` steps + start_steps: 10000 + # The Address for saving training process data + data_dir: "./runs" + # ---------------------------------------Optional Configuration-------------------------------- # + ## -----------------------------------Configuration For Cost Critic--------------------------- ## + # Whether to use cost critic + use_cost: False + # Cost discounted factor + cost_gamma: 1.0 + # Whther to use linear decay of learning rate + linear_lr_decay: False + # Whether to use exploration noise anneal + exploration_noise_anneal: False + # Whther to use reward penalty + reward_penalty: False + # Whether to use KL early stopping + kl_early_stopping: False + # Whether to use max gradient norm + use_max_grad_norm: False + # The thereshold of max 
gradient norm + max_grad_norm: 0.5 + # Whether to use reward scaling + scale_rewards: False + # Whether to use standardized observation + standardized_obs: True + ## ---------------------------------------Configuration For Model----------------------------- ## + model_cfgs: + # Whether to share the weight of Actor network with Critic network + shared_weights: False + # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". + weight_initialization_mode: "kaiming_uniform" + # Configuration of Actor and Critic network + ac_kwargs: + # Configuration of Actor network + pi: + # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" + actor_type: "dire" + # The standard deviation of Gaussian noise + act_noise: 0.1 + # Size of hidden layers + hidden_sizes: [400, 300] + # Activation function + activation: relu + # Configuration of Critic network + val: + # Number of critic networks + num_critics: 2 + # Size of hidden layers + hidden_sizes: [400, 300] + # Activation function + activation: relu + ## --------------------------------------Configuration For Buffer----------------------------- ## + replay_buffer_cfgs: + # The size of replay buffer + size: 50000 + # The size of batch + batch_size: 256 diff --git a/omnisafe/configs/off-policy/TD3Lag.yaml b/omnisafe/configs/off-policy/TD3Lag.yaml new file mode 100644 index 000000000..dd28ad391 --- /dev/null +++ b/omnisafe/configs/off-policy/TD3Lag.yaml @@ -0,0 +1,114 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
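In TD3.yaml above, `num_critics: 2` and `act_noise` correspond to TD3's clipped double-Q target with target-policy smoothing. A minimal sketch of that target computation, with placeholder callables and an assumed symmetric action limit (not the code added by this patch):

    import torch

    # Sketch of TD3's clipped double-Q target with target-policy smoothing,
    # which is what `num_critics: 2` and `act_noise` in the config enable.
    # The callables and limits below are placeholders, not OmniSafe's modules.
    def td3_target(reward, done, next_obs, target_actor, target_c1, target_c2,
                   gamma=0.99, act_noise=0.1, noise_clip=0.5, act_limit=1.0):
        """Bootstrapped target using the smaller of the two target critics."""
        with torch.no_grad():
            next_action = target_actor(next_obs)
            # Target-policy smoothing: clipped Gaussian noise on the target action.
            noise = (torch.randn_like(next_action) * act_noise).clamp(-noise_clip, noise_clip)
            next_action = (next_action + noise).clamp(-act_limit, act_limit)
            # Clipped double-Q: take the pessimistic estimate of the two critics.
            next_q = torch.min(target_c1(next_obs, next_action),
                               target_c2(next_obs, next_action))
            return reward + gamma * (1.0 - done) * next_q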
+# ============================================================================== + +defaults: + # --------------------------------------Basic Configurations----------------------------------- # + ## ----------------------------Basic configurations for base class DDPG----------------------- ## + # The random seed + seed: 0 + # The environment wrapper type + wrapper_type: OffPolicyEnvWrapper + # Number of epochs + epochs: 500 + # Number of steps per epoch + steps_per_epoch: 6000 + # Update after `update_after` steps + update_after: 1000 + # Update every `update_every` steps + update_every: 50 + # Check if all models own the same parameter values every `check_freq` epoch + check_freq: 25 + # Save model to disk every `check_freq` epochs + save_freq: 10 + # The max length of per epoch + max_ep_len: 1000 + # The number of test episodes + num_test_episodes: 10 + # The learning rate of Actor network + actor_lr: 0.0003 + # The learning rate of Critic network + critic_lr: 0.001 + # The soft update coefficient + polyak: 0.999 + # The discount factor of GAE + gamma: 0.99 + # Actor perdorm random action before `start_steps` steps + start_steps: 10000 + # The Address for saving training process data + data_dir: "./runs" + # ---------------------------------------Optional Configuration-------------------------------- # + ## -----------------------------------Configuration For Cost Critic--------------------------- ## + # Whether to use cost critic + use_cost: True + # Cost discounted factor + cost_gamma: 1.0 + # Whther to use linear decay of learning rate + linear_lr_decay: False + # Whether to use exploration noise anneal + exploration_noise_anneal: False + # Whther to use reward penalty + reward_penalty: False + # Whether to use KL early stopping + kl_early_stopping: False + # Whether to use max gradient norm + use_max_grad_norm: False + # The thereshold of max gradient norm + max_grad_norm: 0.5 + # Whether to use reward scaling + scale_rewards: False + # Whether to use standardized observation + standardized_obs: True + ## ---------------------------------------Configuration For Model----------------------------- ## + model_cfgs: + # Whether to share the weight of Actor network with Critic network + shared_weights: False + # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
+ weight_initialization_mode: "kaiming_uniform" + # Configuration of Actor and Critic network + ac_kwargs: + # Configuration of Actor network + pi: + # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" + actor_type: "dire" + # The standard deviation of Gaussian noise + act_noise: 0.1 + # Size of hidden layers + hidden_sizes: [400, 300] + # Activation function + activation: relu + # Configuration of Critic network + val: + # Number of critic networks + num_critics: 2 + # Size of hidden layers + hidden_sizes: [400, 300] + # Activation function + activation: relu + ## --------------------------------------Configuration For Buffer----------------------------- ## + replay_buffer_cfgs: + # The size of replay buffer + size: 50000 + # The size of batch + batch_size: 256 +## ----------------------------------Configuration For Lagrangian multiplier---------------------- ## + lagrange_cfgs: + # Tolerance of constraint violation + cost_limit: 25.0 + # Initial value of lagrangian multiplier + lagrangian_multiplier_init: 0.001 + # Learning rate of lagrangian multiplier + lambda_lr: 0.01 + # Type of lagrangian optimizer + lambda_optimizer: "Adam" diff --git a/omnisafe/configs/on-policy/CPO.yaml b/omnisafe/configs/on-policy/CPO.yaml index 7be369c59..7c8ab4026 100644 --- a/omnisafe/configs/on-policy/CPO.yaml +++ b/omnisafe/configs/on-policy/CPO.yaml @@ -18,6 +18,8 @@ defaults: ## -----------------------------Basic configurations for base class PG------------------------ ## # The random seed seed: 0 + # The environment wrapper type + wrapper_type: OnPolicyEnvWrapper # Number of epochs epochs: 500 # Number of steps per epoch diff --git a/omnisafe/configs/on-policy/CPPOPid.yaml b/omnisafe/configs/on-policy/CPPOPid.yaml index 7ceebef06..3589a5ae7 100644 --- a/omnisafe/configs/on-policy/CPPOPid.yaml +++ b/omnisafe/configs/on-policy/CPPOPid.yaml @@ -14,79 +14,123 @@ # ============================================================================== defaults: - # Basic Configurations - ## Basic configurations for base class PG + # --------------------------------------Basic Configurations----------------------------------- # + ## -----------------------------Basic configurations for base class PG------------------------ ## + # The random seed + seed: 0 + # The environment wrapper type + wrapper_type: OnPolicyEnvWrapper + # Number of epochs epochs: 500 + # Number of steps per epoch steps_per_epoch: 30000 + # Number of update iteration for Actor network actor_iters: 80 + # Number of update iteration for Critic network critic_iters: 40 + # Check if all models own the same parameter values every `check_freq` epoch check_freq: 25 + # Save model to disk every `check_freq` epochs save_freq: 100 + # Entropy coefficient for PPO loss entropy_coef: 0.01 + # The max length of per epoch max_ep_len: 1000 + # The size of mini batch num_mini_batches: 16 + # The learning rate of Actor network actor_lr: 0.0003 + # The learning rate of Critic network critic_lr: 0.001 - use_cost: True - cost_gamma: 1.0 - target_kl: 0.01 + # The Address for saving training process data data_dir: "./runs" - seed: 0 - ## Basic configurations for derived class CPPO_PID - cost_limit: 25.0 - clip: 0. 
- pid_Kp: 0.01 - pid_Ki: 0.01 - pid_Kd: 0.01 - lagrangian_multiplier_init: 0.001 - pid_d_delay: 10 - pid_delta_p_ema_alpha: 0.95 # 0 for hard update, 1 for no update - pid_delta_d_ema_alpha: 0.95 - sum_norm: True # L = (J_r - lam * J_c) / (1 + lam); lam <= 0 - diff_norm: False # L = (1 - lam) * J_r - lam * J_c; 0 <= lam <= 1 - penalty_max: 100 # only used if sum_norm=diff_norm=False + ## -------------------------Basic configurations for derived class CPPOPid-------------------- ## + # The thereshold for KL early stopping + target_kl: 0.01 + # The size of batch for policy update + batch_size: 2000 + # The clip range for PPO loss + clip: 0.2 - # Optional Configuration - ## Whether to use cost critic - use_cost_critic: True + # ---------------------------------------Optional Configuration-------------------------------- # + ## -----------------------------------Configuration For Cost Critic--------------------------- ## + # Whether to use cost critic + use_cost: True + # Cost discounted factor + cost_gamma: 1.0 + # Whther to use linear decay of learning rate linear_lr_decay: False + # Whether to use exploration noise anneal exploration_noise_anneal: True + # Whther to use reward penalty reward_penalty: False + # Whether to use KL early stopping kl_early_stopping: True + # Whether to use max gradient norm use_max_grad_norm: False + # The thereshold of max gradient norm max_grad_norm: 0.5 + # Whether to use reward scaling scale_rewards: False - standardized_obs: False - ## Configuration For Mode + # Whether to use standardized observation + standardized_obs: True + ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: + # Whether to share the weight of Actor network with Critic network shared_weights: False + # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
weight_initialization_mode: "kaiming_uniform" + # Configuration of Actor and Critic network ac_kwargs: + # Configuration of Actor network pi: + # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" actor_type: gaussian_annealing + # Size of hidden layers hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" activation: tanh val: + # Size of hidden layers hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" activation: tanh - ## Configuration For Buffer + ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: + # Reward discounted factor gamma: 0.99 + # Parameters used to estimate future rewards in GAE lam: 0.95 + # Parameters used to estimate future costs in GAE lam_c: 0.95 + # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" adv_estimation_method: gae - standardized_reward: False - standardized_cost: False - reward_penalty: False + # Whether to use standardized reward + standardized_reward: True + # Whether to use standardized cost + standardized_cost: True + + ## --------------------------------------Configuration For PID--------------------------------- ## PID_cfgs: + # KP for PID pid_kp: 0.01 + # KI for PID pid_ki: 0.01 + # KD for PID pid_kd: 0.01 + # The init value of lagrangian multiplier lagrangian_multiplier_init: 0.001 + # The delay rate of KD pid_d_delay: 10 - pid_delta_p_ema_alpha: 0.95 # 0 for hard update, 1 for no update + # 0 for hard update, 1 for no update + pid_delta_p_ema_alpha: 0.95 + # The same as above pid_delta_d_ema_alpha: 0.95 - sum_norm: True # L = (J_r - lam * J_c) / (1 + lam); lam <= 0 - diff_norm: False # L = (1 - lam) * J_r - lam * J_c; 0 <= lam <= 1 - penalty_max: 100 # only used if sum_norm=diff_norm=False + # L = (J_r - lam * J_c) / (1 + lam); lam <= 0 + sum_norm: True + # L = (1 - lam) * J_r - lam * J_c; 0 <= lam <= 1 + diff_norm: False + # Only used if sum_norm=diff_norm=False + penalty_max: 100 + # Tolerance of violation cost_limit: 25.0 diff --git a/omnisafe/configs/on-policy/CUP.yaml b/omnisafe/configs/on-policy/CUP.yaml index 3991f7914..baa0a5354 100644 --- a/omnisafe/configs/on-policy/CUP.yaml +++ b/omnisafe/configs/on-policy/CUP.yaml @@ -18,6 +18,8 @@ defaults: ## -----------------------------Basic configurations for base class PG------------------------ ## # The random seed seed: 0 + # The environment wrapper type + wrapper_type: OnPolicyEnvWrapper # Number of epochs epochs: 500 # Number of steps per epoch @@ -43,7 +45,7 @@ defaults: # The Address for saving training process data data_dir: "./runs" - ## ---------------------------Basic configurations for derived class FOCOPS------------------- ## + ## ----------------------------Basic configurations for derived class CUP-------------------- ## # The thereshold for KL early stopping target_kl: 0.01 # Tolerance of constraint violation @@ -127,3 +129,5 @@ defaults: lambda_lr: 0.035 # Type of lagrangian optimizer lambda_optimizer: "Adam" + # The upper bound of lagrange multiplier + lagrangian_upper_bound: 2.0 diff --git a/omnisafe/configs/on-policy/FOCOPS.yaml b/omnisafe/configs/on-policy/FOCOPS.yaml index 57c24fefb..5ed15f3ce 100644 --- a/omnisafe/configs/on-policy/FOCOPS.yaml +++ b/omnisafe/configs/on-policy/FOCOPS.yaml @@ -18,6 +18,8 @@ defaults: ## -----------------------------Basic configurations for base 
class PG------------------------ ## # The random seed seed: 0 + # The environment wrapper type + wrapper_type: OnPolicyEnvWrapper # Number of epochs epochs: 500 # Number of steps per epoch diff --git a/omnisafe/configs/on-policy/NPGLag.yaml b/omnisafe/configs/on-policy/NPGLag.yaml index bef481360..8ddc14c4c 100644 --- a/omnisafe/configs/on-policy/NPGLag.yaml +++ b/omnisafe/configs/on-policy/NPGLag.yaml @@ -14,63 +14,112 @@ # ============================================================================== defaults: - # Basic Configurations - ## Basic configurations for base class PG + # --------------------------------------Basic Configurations----------------------------------- # + ## -----------------------------Basic configurations for base class PG------------------------ ## + # The random seed + seed: 0 + # The environment wrapper type + wrapper_type: OnPolicyEnvWrapper + # Number of epochs epochs: 500 - steps_per_epoch: 20000 - actor_iters: 40 - critic_iters: 20 + # Number of steps per epoch + steps_per_epoch: 30000 + # Number of update iteration for Actor network + actor_iters: 80 + # Number of update iteration for Critic network + critic_iters: 40 + # Check if all models own the same parameter values every `check_freq` epoch check_freq: 25 + # Save model to disk every `check_freq` epochs save_freq: 100 + # Entropy coefficient for PPO loss entropy_coef: 0.01 + # The max length of per epoch max_ep_len: 1000 + # The size of mini batch num_mini_batches: 16 + # The learning rate of Actor network actor_lr: 0.0003 + # The learning rate of Critic network critic_lr: 0.001 + # The Address for saving training process data data_dir: "./runs" - seed: 0 - ## Basic configurations for derived class PDO_Lag + ## --------------------------Basic configurations for derived class NaturalPG----------------- ## + # The thereshold for KL early stopping target_kl: 0.01 + # Tolerance of constraint violation + cost_limit: 25 + # Damping value for conjugate gradient cg_damping: 0.1 + # Number of conjugate gradient iterations cg_iters: 10 + # Subsampled observation fvp_obs: None - # Optional Configuration - ## Whether to use cost critic + # ---------------------------------------Optional Configuration-------------------------------- # + ## -----------------------------------Configuration For Cost Critic--------------------------- ## + # Whether to use cost critic use_cost: True + # Cost discounted factor cost_gamma: 1.0 - linear_lr_decay: True + # Whther to use linear decay of learning rate + linear_lr_decay: False + # Whether to use exploration noise anneal exploration_noise_anneal: True + # Whther to use reward penalty reward_penalty: False + # Whether to use KL early stopping kl_early_stopping: True - use_max_grad_norm: True + # Whether to use max gradient norm + use_max_grad_norm: False + # The thereshold of max gradient norm max_grad_norm: 0.5 + # Whether to use reward scaling scale_rewards: False + # Whether to use standardized observation standardized_obs: True - ## Configuration For Mode + ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: + # Whether to share the weight of Actor network with Critic network shared_weights: False + # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
weight_initialization_mode: "kaiming_uniform" + # Configuration of Actor and Critic network ac_kwargs: + # Configuration of Actor network pi: + # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" actor_type: gaussian_annealing + # Size of hidden layers hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" activation: tanh val: + # Size of hidden layers hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" activation: tanh - ## Configuration For Buffer + ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: + # Reward discounted factor gamma: 0.99 + # Parameters used to estimate future rewards in GAE lam: 0.95 + # Parameters used to estimate future costs in GAE lam_c: 0.95 + # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" adv_estimation_method: gae + # Whether to use standardized reward standardized_reward: True + # Whether to use standardized cost standardized_cost: True - reward_penalty: False - ## Configuration For Lagrange + ## --------------------------------Configuration For Lagrangian multiplier-------------------- ## lagrange_cfgs: + # Tolerance of constraint violation cost_limit: 25.0 - lagrangian_multiplier_init: 0.1 + # Initial value of lagrangian multiplier + lagrangian_multiplier_init: 0.001 + # Learning rate of lagrangian multiplier lambda_lr: 0.035 + # Type of lagrangian optimizer lambda_optimizer: "Adam" diff --git a/omnisafe/configs/on-policy/NaturalPG.yaml b/omnisafe/configs/on-policy/NaturalPG.yaml index 1f81e5f7e..301faea0c 100644 --- a/omnisafe/configs/on-policy/NaturalPG.yaml +++ b/omnisafe/configs/on-policy/NaturalPG.yaml @@ -14,57 +14,102 @@ # ============================================================================== defaults: - # Basic Configurations - ## Basic configurations for base class PG + # --------------------------------------Basic Configurations----------------------------------- # + ## -----------------------------Basic configurations for base class PG------------------------ ## + # The random seed + seed: 0 + # The environment wrapper type + wrapper_type: OnPolicyEnvWrapper + # Number of epochs epochs: 500 - steps_per_epoch: 20000 + # Number of steps per epoch + steps_per_epoch: 30000 + # Number of update iteration for Actor network actor_iters: 80 + # Number of update iteration for Critic network critic_iters: 40 + # Check if all models own the same parameter values every `check_freq` epoch check_freq: 25 + # Save model to disk every `check_freq` epochs save_freq: 100 + # Entropy coefficient for PPO loss entropy_coef: 0.01 + # The max length of per epoch max_ep_len: 1000 + # The size of mini batch num_mini_batches: 16 + # The learning rate of Actor network actor_lr: 0.0003 + # The learning rate of Critic network critic_lr: 0.001 + # The Address for saving training process data data_dir: "./runs" - seed: 0 - ## Basic configurations for derived class Natural PG + ## --------------------------Basic configurations for derived class NaturalPG----------------- ## + # The thereshold for KL early stopping target_kl: 0.01 + # Tolerance of constraint violation + cost_limit: 25 + # Damping value for conjugate gradient cg_damping: 0.1 + # Number of conjugate gradient iterations cg_iters: 10 + # Subsampled observation fvp_obs: None - # Optional Configuration - ## 
Whether to use cost critic - use_cost: False + # ---------------------------------------Optional Configuration-------------------------------- # + ## -----------------------------------Configuration For Cost Critic--------------------------- ## + # Whether to use cost critic + use_cost: True + # Cost discounted factor cost_gamma: 1.0 + # Whther to use linear decay of learning rate linear_lr_decay: False + # Whether to use exploration noise anneal exploration_noise_anneal: True + # Whther to use reward penalty reward_penalty: False - kl_early_stopping: False + # Whether to use KL early stopping + kl_early_stopping: True + # Whether to use max gradient norm use_max_grad_norm: False + # The thereshold of max gradient norm max_grad_norm: 0.5 + # Whether to use reward scaling scale_rewards: False + # Whether to use standardized observation standardized_obs: True - ## Configuration For Mode + ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: + # Whether to share the weight of Actor network with Critic network shared_weights: False + # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". weight_initialization_mode: "kaiming_uniform" + # Configuration of Actor and Critic network ac_kwargs: + # Configuration of Actor network pi: + # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" actor_type: gaussian_annealing + # Size of hidden layers hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" activation: tanh val: + # Size of hidden layers hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" activation: tanh - ## Configuration For Buffer + ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: + # Reward discounted factor gamma: 0.99 + # Parameters used to estimate future rewards in GAE lam: 0.95 + # Parameters used to estimate future costs in GAE lam_c: 0.95 + # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" adv_estimation_method: gae - standardized_reward: False - standardized_cost: False - reward_penalty: False + # Whether to use standardized reward + standardized_reward: True + # Whether to use standardized cost + standardized_cost: True diff --git a/omnisafe/configs/on-policy/PCPO.yaml b/omnisafe/configs/on-policy/PCPO.yaml index 23bbd2d94..558fa0fa0 100644 --- a/omnisafe/configs/on-policy/PCPO.yaml +++ b/omnisafe/configs/on-policy/PCPO.yaml @@ -18,6 +18,8 @@ defaults: ## -----------------------------Basic configurations for base class PG------------------------ ## # The random seed seed: 0 + # The environment wrapper type + wrapper_type: OnPolicyEnvWrapper # Number of epochs epochs: 500 # Number of steps per epoch diff --git a/omnisafe/configs/on-policy/PDO.yaml b/omnisafe/configs/on-policy/PDO.yaml index aa1ecce82..40983174e 100644 --- a/omnisafe/configs/on-policy/PDO.yaml +++ b/omnisafe/configs/on-policy/PDO.yaml @@ -14,63 +14,112 @@ # ============================================================================== defaults: - # Basic Configurations - ## Basic configurations for base class PG + # --------------------------------------Basic Configurations----------------------------------- # + ## -----------------------------Basic configurations for base class 
PG------------------------ ## + # The random seed + seed: 0 + # The environment wrapper type + wrapper_type: OnPolicyEnvWrapper + # Number of epochs epochs: 500 - steps_per_epoch: 20000 + # Number of steps per epoch + steps_per_epoch: 30000 + # Number of update iteration for Actor network actor_iters: 80 + # Number of update iteration for Critic network critic_iters: 40 + # Check if all models own the same parameter values every `check_freq` epoch check_freq: 25 + # Save model to disk every `check_freq` epochs save_freq: 100 + # Entropy coefficient for PPO loss entropy_coef: 0.01 + # The max length of per epoch max_ep_len: 1000 + # The size of mini batch num_mini_batches: 16 + # The learning rate of Actor network actor_lr: 0.0003 + # The learning rate of Critic network critic_lr: 0.001 + # The Address for saving training process data data_dir: "./runs" - seed: 0 - ## Basic configurations for derived class PDO + ## --------------------------Basic configurations for derived class NaturalPG----------------- ## + # The thereshold for KL early stopping target_kl: 0.01 + # Tolerance of constraint violation + cost_limit: 25 + # Damping value for conjugate gradient cg_damping: 0.1 + # Number of conjugate gradient iterations cg_iters: 10 + # Subsampled observation fvp_obs: None - # Optional Configuration - ## Whether to use cost critic + # ---------------------------------------Optional Configuration-------------------------------- # + ## -----------------------------------Configuration For Cost Critic--------------------------- ## + # Whether to use cost critic use_cost: True + # Cost discounted factor cost_gamma: 1.0 - linear_lr_decay: True + # Whther to use linear decay of learning rate + linear_lr_decay: False + # Whether to use exploration noise anneal exploration_noise_anneal: True + # Whther to use reward penalty reward_penalty: False + # Whether to use KL early stopping kl_early_stopping: True - use_max_grad_norm: True + # Whether to use max gradient norm + use_max_grad_norm: False + # The thereshold of max gradient norm max_grad_norm: 0.5 + # Whether to use reward scaling scale_rewards: False + # Whether to use standardized observation standardized_obs: True - ## Configuration For Mode + ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: + # Whether to share the weight of Actor network with Critic network shared_weights: False + # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
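The comment above lists the supported `weight_initialization_mode` values; they correspond to standard `torch.nn.init` routines ("glorot" is simply another name for Xavier initialization). A small illustrative sketch of how such a switch could be applied to a linear layer (not the repository's actual initializer):

    import torch.nn as nn

    def initialize_layer(layer: nn.Linear, mode: str = "kaiming_uniform") -> None:
        """Apply the configured initialization scheme to one linear layer."""
        init_fns = {
            "kaiming_uniform": nn.init.kaiming_uniform_,
            "xavier_normal": nn.init.xavier_normal_,
            "glorot": nn.init.xavier_uniform_,  # assumption: "glorot" treated as Xavier uniform
            "orthogonal": nn.init.orthogonal_,
        }
        init_fns[mode](layer.weight)
        nn.init.zeros_(layer.bias)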
weight_initialization_mode: "kaiming_uniform" + # Configuration of Actor and Critic network ac_kwargs: + # Configuration of Actor network pi: + # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" actor_type: gaussian_annealing + # Size of hidden layers hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" activation: tanh val: + # Size of hidden layers hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" activation: tanh - ## Configuration For Buffer + ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: + # Reward discounted factor gamma: 0.99 + # Parameters used to estimate future rewards in GAE lam: 0.95 + # Parameters used to estimate future costs in GAE lam_c: 0.95 + # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" adv_estimation_method: gae - standardized_reward: False + # Whether to use standardized reward + standardized_reward: True + # Whether to use standardized cost standardized_cost: True - reward_penalty: False - ## Configuration For Lagrange + ## ----------------------------------Configuration For Lagrangian multiplier---------------------- ## lagrange_cfgs: + # Tolerance of constraint violation cost_limit: 25.0 + # Initial value of lagrangian multiplier lagrangian_multiplier_init: 0.001 + # Learning rate of lagrangian multiplier lambda_lr: 0.035 + # Type of lagrangian optimizer lambda_optimizer: "Adam" diff --git a/omnisafe/configs/on-policy/PPO.yaml b/omnisafe/configs/on-policy/PPO.yaml index 56a113edb..cff8630f1 100644 --- a/omnisafe/configs/on-policy/PPO.yaml +++ b/omnisafe/configs/on-policy/PPO.yaml @@ -14,53 +14,98 @@ # ============================================================================== defaults: - # Basic Configurations + # --------------------------------------Basic Configurations----------------------------------- # + ## -----------------------------Basic configurations for base class PG------------------------ ## + # The random seed + seed: 0 + # The environment wrapper type + wrapper_type: OnPolicyEnvWrapper + # Number of epochs epochs: 500 + # Number of steps per epoch steps_per_epoch: 30000 + # Number of update iteration for Actor network actor_iters: 80 + # Number of update iteration for Critic network critic_iters: 40 + # Check if all models own the same parameter values every `check_freq` epoch check_freq: 25 + # Save model to disk every `check_freq` epochs save_freq: 100 + # Entropy coefficient for PPO loss entropy_coef: 0.01 + # The max length of per epoch max_ep_len: 1000 + # The size of mini batch num_mini_batches: 16 + # The learning rate of Actor network actor_lr: 0.0003 + # The learning rate of Critic network critic_lr: 0.001 - target_kl: 0.01 + # The Address for saving training process data data_dir: "./runs" - seed: 0 + ## ---------------------------Basic configurations for derived class PPO---------------------- ## + # The thereshold for KL early stopping + target_kl: 0.01 + # The size of batch for policy update + batch_size: 2000 + # The clip range for PPO loss + clip: 0.2 - # Optional Configuration - ## Whether to use cost critic - use_cost: False + # ---------------------------------------Optional Configuration-------------------------------- # + ## -----------------------------------Configuration For Cost Critic--------------------------- 
## + # Whether to use cost critic + use_cost: True + # Cost discounted factor cost_gamma: 1.0 - standardized_obs: True - exploration_noise_anneal: True - kl_early_stopping: True + # Whther to use linear decay of learning rate linear_lr_decay: False + # Whether to use exploration noise anneal + exploration_noise_anneal: True + # Whther to use reward penalty reward_penalty: False + # Whether to use KL early stopping + kl_early_stopping: False + # Whether to use max gradient norm use_max_grad_norm: False + # The thereshold of max gradient norm max_grad_norm: 0.5 + # Whether to use reward scaling scale_rewards: False - - ## Configuration For Mode + # Whether to use standardized observation + standardized_obs: False + ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: + # Whether to share the weight of Actor network with Critic network shared_weights: False + # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". weight_initialization_mode: "kaiming_uniform" + # Configuration of Actor and Critic network ac_kwargs: + # Configuration of Actor network pi: + # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" actor_type: gaussian_annealing + # Size of hidden layers hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" activation: tanh val: + # Size of hidden layers hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" activation: tanh - ## Configuration For Buffer + ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: + # Reward discounted factor gamma: 0.99 + # Parameters used to estimate future rewards in GAE lam: 0.95 + # Parameters used to estimate future costs in GAE lam_c: 0.95 + # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" adv_estimation_method: gae - standardized_reward: False - standardized_cost: False - reward_penalty: False + # Whether to use standardized reward + standardized_reward: True + # Whether to use standardized cost + standardized_cost: True diff --git a/omnisafe/configs/on-policy/PPOEarlyTerminated.yaml b/omnisafe/configs/on-policy/PPOEarlyTerminated.yaml new file mode 100644 index 000000000..056620a39 --- /dev/null +++ b/omnisafe/configs/on-policy/PPOEarlyTerminated.yaml @@ -0,0 +1,111 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
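In the `buffer_cfgs` above, `gamma`, `lam` and `lam_c` parameterize Generalized Advantage Estimation; the same recursion is run once with `lam` over rewards and once with `lam_c` over costs. A minimal sketch for one finished trajectory, using plain NumPy arrays:

    import numpy as np

    def gae_advantages(rewards, values, last_value, gamma=0.99, lam=0.95):
        """Compute GAE advantages; `values` holds one critic value per reward."""
        rewards = np.asarray(rewards, dtype=np.float64)
        values = np.append(np.asarray(values, dtype=np.float64), last_value)
        advantages = np.zeros_like(rewards)
        gae = 0.0
        for t in reversed(range(len(rewards))):
            delta = rewards[t] + gamma * values[t + 1] - values[t]
            gae = delta + gamma * lam * gae
            advantages[t] = gae
        return advantages

    # The cost advantages reuse the same recursion with the cost critic and lam_c, e.g.:
    # cost_adv = gae_advantages(costs, cost_values, last_cost_value, gamma=0.99, lam=0.95)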
+# ============================================================================== + +defaults: + # --------------------------------------Basic Configurations----------------------------------- # + ## -----------------------------Basic configurations for base class PG------------------------ ## + # The random seed + seed: 0 + # The environment wrapper type + wrapper_type: EarlyTerminatedEnvWrapper + # Number of epochs + epochs: 500 + # Number of steps per epoch + steps_per_epoch: 30000 + # Number of update iteration for Actor network + actor_iters: 80 + # Number of update iteration for Critic network + critic_iters: 40 + # Check if all models own the same parameter values every `check_freq` epoch + check_freq: 25 + # Save model to disk every `check_freq` epochs + save_freq: 100 + # Entropy coefficient for PPO loss + entropy_coef: 0.01 + # The max length of per epoch + max_ep_len: 1000 + # The size of mini batch + num_mini_batches: 16 + # The learning rate of Actor network + actor_lr: 0.0003 + # The learning rate of Critic network + critic_lr: 0.001 + # The Address for saving training process data + data_dir: "./runs" + ## ---------------------------Basic configurations for derived class PPOLag------------------- ## + # The thereshold for KL early stopping + target_kl: 0.01 + # The size of batch for policy update + batch_size: 2000 + # The clip range for PPO loss + clip: 0.2 + + # ---------------------------------------Optional Configuration-------------------------------- # + ## -----------------------------------Configuration For Cost Critic--------------------------- ## + # Whether to use cost critic + use_cost: True + # Cost discounted factor + cost_gamma: 1.0 + # Whther to use linear decay of learning rate + linear_lr_decay: False + # Whether to use exploration noise anneal + exploration_noise_anneal: True + # Whther to use reward penalty + reward_penalty: False + # Whether to use KL early stopping + kl_early_stopping: False + # Whether to use max gradient norm + use_max_grad_norm: False + # The thereshold of max gradient norm + max_grad_norm: 0.5 + # Whether to use reward scaling + scale_rewards: False + # Whether to use standardized observation + standardized_obs: False + ## ---------------------------------------Configuration For Model----------------------------- ## + model_cfgs: + # Whether to share the weight of Actor network with Critic network + shared_weights: False + # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
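`clip` is the PPO ratio-clipping range and `entropy_coef` weights the entropy bonus referenced in the comments above. A compact sketch of the resulting actor objective, assuming advantages and old log-probabilities were stored at rollout time:

    import torch

    def ppo_actor_loss(log_prob, old_log_prob, advantage, entropy, clip=0.2, entropy_coef=0.01):
        """Clipped PPO surrogate loss (to be minimized), including the entropy bonus."""
        ratio = torch.exp(log_prob - old_log_prob)
        surrogate = torch.min(
            ratio * advantage,
            torch.clamp(ratio, 1.0 - clip, 1.0 + clip) * advantage,
        )
        return -(surrogate.mean() + entropy_coef * entropy.mean())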
+ weight_initialization_mode: "kaiming_uniform" + # Configuration of Actor and Critic network + ac_kwargs: + # Configuration of Actor network + pi: + # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" + actor_type: gaussian_annealing + # Size of hidden layers + hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" + activation: tanh + val: + # Size of hidden layers + hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" + activation: tanh + ## --------------------------------------Configuration For Buffer----------------------------- ## + buffer_cfgs: + # Reward discounted factor + gamma: 0.99 + # Parameters used to estimate future rewards in GAE + lam: 0.95 + # Parameters used to estimate future costs in GAE + lam_c: 0.95 + # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" + adv_estimation_method: gae + # Whether to use standardized reward + standardized_reward: True + # Whether to use standardized cost + standardized_cost: True diff --git a/omnisafe/configs/on-policy/PPOLag.yaml b/omnisafe/configs/on-policy/PPOLag.yaml index 3f6327ca2..60c75f289 100644 --- a/omnisafe/configs/on-policy/PPOLag.yaml +++ b/omnisafe/configs/on-policy/PPOLag.yaml @@ -18,6 +18,8 @@ defaults: ## -----------------------------Basic configurations for base class PG------------------------ ## # The random seed seed: 0 + # The environment wrapper type + wrapper_type: OnPolicyEnvWrapper # Number of epochs epochs: 500 # Number of steps per epoch @@ -42,9 +44,13 @@ defaults: critic_lr: 0.001 # The Address for saving training process data data_dir: "./runs" - ## ---------------------------Basic configurations for derived class PPOLag------------------- ## + ## ---------------------------Basic configurations for derived class PPO---------------------- ## # The thereshold for KL early stopping target_kl: 0.01 + # The size of batch for policy update + batch_size: 2000 + # The clip range for PPO loss + clip: 0.2 # ---------------------------------------Optional Configuration-------------------------------- # ## -----------------------------------Configuration For Cost Critic--------------------------- ## diff --git a/omnisafe/configs/on-policy/PPOLagEarlyTerminated.yaml b/omnisafe/configs/on-policy/PPOLagEarlyTerminated.yaml new file mode 100644 index 000000000..4e9930adf --- /dev/null +++ b/omnisafe/configs/on-policy/PPOLagEarlyTerminated.yaml @@ -0,0 +1,121 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +defaults: + # --------------------------------------Basic Configurations----------------------------------- # + ## -----------------------------Basic configurations for base class PG------------------------ ## + # The random seed + seed: 0 + # The environment wrapper type + wrapper_type: EarlyTerminatedEnvWrapper + # Number of epochs + epochs: 500 + # Number of steps per epoch + steps_per_epoch: 30000 + # Number of update iteration for Actor network + actor_iters: 80 + # Number of update iteration for Critic network + critic_iters: 40 + # Check if all models own the same parameter values every `check_freq` epoch + check_freq: 25 + # Save model to disk every `check_freq` epochs + save_freq: 100 + # Entropy coefficient for PPO loss + entropy_coef: 0.01 + # The max length of per epoch + max_ep_len: 1000 + # The size of mini batch + num_mini_batches: 16 + # The learning rate of Actor network + actor_lr: 0.0003 + # The learning rate of Critic network + critic_lr: 0.001 + # The Address for saving training process data + data_dir: "./runs" + ## ---------------------------Basic configurations for derived class PPOLag------------------- ## + # The thereshold for KL early stopping + target_kl: 0.01 + # The size of batch for policy update + batch_size: 2000 + # The clip range for PPO loss + clip: 0.2 + + # ---------------------------------------Optional Configuration-------------------------------- # + ## -----------------------------------Configuration For Cost Critic--------------------------- ## + # Whether to use cost critic + use_cost: True + # Cost discounted factor + cost_gamma: 1.0 + # Whther to use linear decay of learning rate + linear_lr_decay: False + # Whether to use exploration noise anneal + exploration_noise_anneal: True + # Whther to use reward penalty + reward_penalty: False + # Whether to use KL early stopping + kl_early_stopping: False + # Whether to use max gradient norm + use_max_grad_norm: False + # The thereshold of max gradient norm + max_grad_norm: 0.5 + # Whether to use reward scaling + scale_rewards: False + # Whether to use standardized observation + standardized_obs: False + ## ---------------------------------------Configuration For Model----------------------------- ## + model_cfgs: + # Whether to share the weight of Actor network with Critic network + shared_weights: False + # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
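`wrapper_type: EarlyTerminatedEnvWrapper` selects the early-terminated MDP trick, in which an episode is cut short as soon as the agent incurs cost. The wrapper itself is defined elsewhere in this patch; the following gym-style sketch only illustrates the idea (class and key names are assumptions, and it uses the classic 4-tuple step API):

    import gym

    class EarlyTerminatedWrapperSketch(gym.Wrapper):
        """End the episode as soon as a non-zero cost is reported in `info`."""

        def step(self, action):
            obs, reward, done, info = self.env.step(action)
            if info.get("cost", 0.0) > 0.0:
                done = True  # terminate early on any constraint violation
            return obs, reward, done, info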
+ weight_initialization_mode: "kaiming_uniform" + # Configuration of Actor and Critic network + ac_kwargs: + # Configuration of Actor network + pi: + # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" + actor_type: gaussian_annealing + # Size of hidden layers + hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" + activation: tanh + val: + # Size of hidden layers + hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" + activation: tanh + ## --------------------------------------Configuration For Buffer----------------------------- ## + buffer_cfgs: + # Reward discounted factor + gamma: 0.99 + # Parameters used to estimate future rewards in GAE + lam: 0.95 + # Parameters used to estimate future costs in GAE + lam_c: 0.95 + # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" + adv_estimation_method: gae + # Whether to use standardized reward + standardized_reward: True + # Whether to use standardized cost + standardized_cost: True + ## ----------------------------------Configuration For Lagrangian multiplier---------------------- ## + lagrange_cfgs: + # Tolerance of constraint violation + cost_limit: 25.0 + # Initial value of lagrangian multiplier + lagrangian_multiplier_init: 0.001 + # Learning rate of lagrangian multiplier + lambda_lr: 0.035 + # Type of lagrangian optimizer + lambda_optimizer: "Adam" diff --git a/omnisafe/configs/on-policy/PPOLagSaute.yaml b/omnisafe/configs/on-policy/PPOLagSaute.yaml new file mode 100644 index 000000000..1a21bcc0d --- /dev/null +++ b/omnisafe/configs/on-policy/PPOLagSaute.yaml @@ -0,0 +1,128 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
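The `lagrange_cfgs` block above is shared by the Lagrangian variants: the multiplier starts at `lagrangian_multiplier_init`, is updated by `lambda_optimizer` with learning rate `lambda_lr`, and is driven by the gap between the measured episode cost and `cost_limit`. A minimal sketch of such an update, assuming the mean episode cost is already available (not the repository's exact implementation):

    import torch

    class LagrangeMultiplierSketch:
        def __init__(self, cost_limit=25.0, init_value=0.001, lambda_lr=0.035):
            self.cost_limit = cost_limit
            self.multiplier = torch.nn.Parameter(torch.tensor(init_value, dtype=torch.float32))
            self.optimizer = torch.optim.Adam([self.multiplier], lr=lambda_lr)

        def update(self, ep_cost: float) -> float:
            # Gradient ascent on lambda * (ep_cost - cost_limit), written as descent on its negation.
            loss = -self.multiplier * (ep_cost - self.cost_limit)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            return max(self.multiplier.item(), 0.0)  # read the multiplier back, clamped at zero

The returned value would then weight the cost advantage inside the policy loss, for example something like (reward_adv - lambda * cost_adv) / (1 + lambda) in a typical Lagrangian objective.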
+# ============================================================================== + +defaults: + # --------------------------------------Basic Configurations----------------------------------- # + ## -----------------------------Basic configurations for base class PG------------------------ ## + # The random seed + seed: 0 + # The environment wrapper type + wrapper_type: SauteEnvWrapper + # Number of epochs + epochs: 500 + # Number of steps per epoch + steps_per_epoch: 30000 + # Number of update iteration for Actor network + actor_iters: 80 + # Number of update iteration for Critic network + critic_iters: 40 + # Check if all models own the same parameter values every `check_freq` epoch + check_freq: 25 + # Save model to disk every `check_freq` epochs + save_freq: 100 + # Entropy coefficient for PPO loss + entropy_coef: 0.01 + # The max length of per epoch + max_ep_len: 1000 + # The size of mini batch + num_mini_batches: 16 + # The learning rate of Actor network + actor_lr: 0.0003 + # The learning rate of Critic network + critic_lr: 0.001 + # The Address for saving training process data + data_dir: "./runs" + ## ---------------------------Basic configurations for derived class PPOLag------------------- ## + # The thereshold for KL early stopping + target_kl: 0.01 + # The size of batch for policy update + batch_size: 2000 + # The clip range for PPO loss + clip: 0.2 + + # ---------------------------------------Optional Configuration-------------------------------- # + ## -----------------------------------Configuration For Cost Critic--------------------------- ## + # Whether to use cost critic + use_cost: True + # Cost discounted factor + cost_gamma: 1.0 + # Whther to use linear decay of learning rate + linear_lr_decay: False + # Whether to use exploration noise anneal + exploration_noise_anneal: True + # Whther to use reward penalty + reward_penalty: False + # Whether to use KL early stopping + kl_early_stopping: False + # Whether to use max gradient norm + use_max_grad_norm: False + # The thereshold of max gradient norm + max_grad_norm: 0.5 + # Whether to use reward scaling + scale_rewards: False + # Whether to use standardized observation + standardized_obs: False + ## ---------------------------------------Configuration For Model----------------------------- ## + model_cfgs: + # Whether to share the weight of Actor network with Critic network + shared_weights: False + # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
+ weight_initialization_mode: "kaiming_uniform" + # Configuration of Actor and Critic network + ac_kwargs: + # Configuration of Actor network + pi: + # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" + actor_type: gaussian_annealing + # Size of hidden layers + hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" + activation: tanh + val: + # Size of hidden layers + hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" + activation: tanh + ## --------------------------------------Configuration For Buffer----------------------------- ## + buffer_cfgs: + # Reward discounted factor + gamma: 0.99 + # Parameters used to estimate future rewards in GAE + lam: 0.95 + # Parameters used to estimate future costs in GAE + lam_c: 0.95 + # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" + adv_estimation_method: gae + # Whether to use standardized reward + standardized_reward: True + # Whether to use standardized cost + standardized_cost: True + ## Configuration For Env_Wrapper + env_cfgs: + unsafe_reward: -0.1 + # ``safety_budget`` in saute is actually the same as ``cost_limmit``. + safety_budget: 25 + saute_gamma: 0.9997 + scale_safety_budget: True + ## ----------------------------------Configuration For Lagrangian multiplier---------------------- ## + lagrange_cfgs: + # Tolerance of constraint violation + cost_limit: 25.0 + # Initial value of lagrangian multiplier + lagrangian_multiplier_init: 0.001 + # Learning rate of lagrangian multiplier + lambda_lr: 0.035 + # Type of lagrangian optimizer + lambda_optimizer: "Adam" diff --git a/omnisafe/configs/on-policy/PPOLagSimmerPid.yaml b/omnisafe/configs/on-policy/PPOLagSimmerPid.yaml new file mode 100644 index 000000000..e07e8635d --- /dev/null +++ b/omnisafe/configs/on-policy/PPOLagSimmerPid.yaml @@ -0,0 +1,145 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
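The `env_cfgs` above carry the Saute-specific state: a remaining safety budget is tracked next to the observation, reduced by the incurred cost each step (with discount `saute_gamma`), and the reward is replaced by `unsafe_reward` once that budget is exhausted; as the comment notes, `safety_budget` plays the role of the cost limit. A rough per-step sketch of that bookkeeping (illustrative only, not the repository's wrapper):

    import numpy as np

    def saute_step(obs, reward, cost, budget_left,
                   safety_budget=25.0, saute_gamma=0.9997, unsafe_reward=-0.1):
        """Update the normalized remaining budget and append it to the observation."""
        budget_left = (budget_left - cost / safety_budget) / saute_gamma
        if budget_left <= 0.0:
            reward = unsafe_reward  # penalize once the safety state is depleted
        augmented_obs = np.append(obs, budget_left)
        return augmented_obs, reward, budget_left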
+# ============================================================================== + +defaults: + # --------------------------------------Basic Configurations----------------------------------- # + ## -----------------------------Basic configurations for base class PG------------------------ ## + # The random seed + seed: 0 + # The environment wrapper type + wrapper_type: SimmerEnvWrapper + # Number of epochs + epochs: 500 + # Number of steps per epoch + steps_per_epoch: 30000 + # Number of update iteration for Actor network + actor_iters: 80 + # Number of update iteration for Critic network + critic_iters: 40 + # Check if all models own the same parameter values every `check_freq` epoch + check_freq: 25 + # Save model to disk every `check_freq` epochs + save_freq: 100 + # Entropy coefficient for PPO loss + entropy_coef: 0.01 + # The max length of per epoch + max_ep_len: 1000 + # The size of mini batch + num_mini_batches: 16 + # The learning rate of Actor network + actor_lr: 0.0003 + # The learning rate of Critic network + critic_lr: 0.001 + # The Address for saving training process data + data_dir: "./runs" + ## ---------------------------Basic configurations for derived class PPOLag------------------- ## + # The thereshold for KL early stopping + target_kl: 0.01 + # The clip range for PPO loss + clip: 0.2 + + # ---------------------------------------Optional Configuration-------------------------------- # + ## -----------------------------------Configuration For Cost Critic--------------------------- ## + # Whether to use cost critic + use_cost: True + # Cost discounted factor + cost_gamma: 1.0 + # Whther to use linear decay of learning rate + linear_lr_decay: False + # Whether to use exploration noise anneal + exploration_noise_anneal: True + # Whther to use reward penalty + reward_penalty: False + # Whether to use KL early stopping + kl_early_stopping: True + # Whether to use max gradient norm + use_max_grad_norm: False + # The thereshold of max gradient norm + max_grad_norm: 0.5 + # Whether to use reward scaling + scale_rewards: False + # Whether to use standardized observation + standardized_obs: True + ## ---------------------------------------Configuration For Model----------------------------- ## + model_cfgs: + # Whether to share the weight of Actor network with Critic network + shared_weights: False + # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
+ weight_initialization_mode: "kaiming_uniform" + # Configuration of Actor and Critic network + ac_kwargs: + # Configuration of Actor network + pi: + # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" + actor_type: gaussian_annealing + # Size of hidden layers + hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" + activation: tanh + val: + # Size of hidden layers + hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" + activation: tanh + ## --------------------------------------Configuration For Buffer----------------------------- ## + buffer_cfgs: + # Reward discounted factor + gamma: 0.99 + # Parameters used to estimate future rewards in GAE + lam: 0.95 + # Parameters used to estimate future costs in GAE + lam_c: 0.95 + # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" + adv_estimation_method: gae + # Whether to use standardized reward + standardized_reward: True + # Whether to use standardized cost + standardized_cost: True + ## ----------------------------------Configuration For Lagrangian multiplier---------------------- ## + lagrange_cfgs: + # Tolerance of constraint violation + cost_limit: 25.0 + # Initial value of lagrangian multiplier + lagrangian_multiplier_init: 0.001 + # Learning rate of lagrangian multiplier + lambda_lr: 0.035 + # Type of lagrangian optimizer + lambda_optimizer: "Adam" + ## Configuration For Env_Wrapper + env_cfgs: + # The reward when the state is unsafe + unsafe_reward: -0.05 + # The lower bound of safety budget + lower_budget: 15 + # The upper bound of safety budget + upper_budget: 25 + # The dicounted factor + simmer_gamma: 0.999 + # Whether to scale the safety budget + scale_safety_budget: True + # Type of Simmer Controller + simmer_controller: 'PID' + # Configuration of Simmer Controller + controller_cfgs: + # Kp for PID + pid_kp: 0.1 + # Ki for PID + pid_ki: 0.01 + # Kd for PID + pid_kd: 0.01 + # The step size for PID + step_size: 2 + # Lowpass filter coefficient + tau: 0.95 diff --git a/omnisafe/configs/on-policy/PPOLagSimmerQ.yaml b/omnisafe/configs/on-policy/PPOLagSimmerQ.yaml new file mode 100644 index 000000000..de56ab841 --- /dev/null +++ b/omnisafe/configs/on-policy/PPOLagSimmerQ.yaml @@ -0,0 +1,147 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
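The `controller_cfgs` above describe a PID controller that adapts the active safety budget between `lower_budget` and `upper_budget` according to how the observed episode cost tracks the current budget; `tau` is the low-pass filter coefficient and `step_size` bounds the per-update change. A schematic sketch of such a controller (variable names and the exact error signal are assumptions):

    class SimmerPIDControllerSketch:
        def __init__(self, pid_kp=0.1, pid_ki=0.01, pid_kd=0.01, step_size=2.0, tau=0.95,
                     lower_budget=15.0, upper_budget=25.0):
            self.kp, self.ki, self.kd = pid_kp, pid_ki, pid_kd
            self.step_size, self.tau = step_size, tau
            self.lower_budget, self.upper_budget = lower_budget, upper_budget
            self.integral = 0.0
            self.prev_error = 0.0
            self.filtered_cost = 0.0

        def update_budget(self, budget: float, ep_cost: float) -> float:
            # Low-pass filter the observed cost, then run a textbook PID step on the error.
            self.filtered_cost = self.tau * self.filtered_cost + (1.0 - self.tau) * ep_cost
            error = budget - self.filtered_cost
            self.integral += error
            derivative = error - self.prev_error
            self.prev_error = error
            adjustment = self.kp * error + self.ki * self.integral + self.kd * derivative
            adjustment = max(-self.step_size, min(self.step_size, adjustment))
            return max(self.lower_budget, min(self.upper_budget, budget + adjustment))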
+# ============================================================================== + +defaults: + # --------------------------------------Basic Configurations----------------------------------- # + ## -----------------------------Basic configurations for base class PG------------------------ ## + # The random seed + seed: 0 + # The environment wrapper type + wrapper_type: SimmerEnvWrapper + # Number of epochs + epochs: 500 + # Number of steps per epoch + steps_per_epoch: 30000 + # Number of update iteration for Actor network + actor_iters: 80 + # Number of update iteration for Critic network + critic_iters: 40 + # Check if all models own the same parameter values every `check_freq` epoch + check_freq: 25 + # Save model to disk every `check_freq` epochs + save_freq: 100 + # Entropy coefficient for PPO loss + entropy_coef: 0.01 + # The max length of per epoch + max_ep_len: 1000 + # The size of mini batch + num_mini_batches: 16 + # The learning rate of Actor network + actor_lr: 0.0003 + # The learning rate of Critic network + critic_lr: 0.001 + # The Address for saving training process data + data_dir: "./runs" + ## ---------------------------Basic configurations for derived class PPOLag------------------- ## + # The thereshold for KL early stopping + target_kl: 0.01 + # The clip range for PPO loss + clip: 0.2 + + # ---------------------------------------Optional Configuration-------------------------------- # + ## -----------------------------------Configuration For Cost Critic--------------------------- ## + # Whether to use cost critic + use_cost: True + # Cost discounted factor + cost_gamma: 1.0 + # Whther to use linear decay of learning rate + linear_lr_decay: False + # Whether to use exploration noise anneal + exploration_noise_anneal: True + # Whther to use reward penalty + reward_penalty: False + # Whether to use KL early stopping + kl_early_stopping: True + # Whether to use max gradient norm + use_max_grad_norm: False + # The thereshold of max gradient norm + max_grad_norm: 0.5 + # Whether to use reward scaling + scale_rewards: False + # Whether to use standardized observation + standardized_obs: True + ## ---------------------------------------Configuration For Model----------------------------- ## + model_cfgs: + # Whether to share the weight of Actor network with Critic network + shared_weights: False + # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
+ weight_initialization_mode: "kaiming_uniform" + # Configuration of Actor and Critic network + ac_kwargs: + # Configuration of Actor network + pi: + # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" + actor_type: gaussian_annealing + # Size of hidden layers + hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" + activation: tanh + val: + # Size of hidden layers + hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" + activation: tanh + ## --------------------------------------Configuration For Buffer----------------------------- ## + buffer_cfgs: + # Reward discounted factor + gamma: 0.99 + # Parameters used to estimate future rewards in GAE + lam: 0.95 + # Parameters used to estimate future costs in GAE + lam_c: 0.95 + # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" + adv_estimation_method: gae + # Whether to use standardized reward + standardized_reward: True + # Whether to use standardized cost + standardized_cost: True + ## ----------------------------------Configuration For Lagrangian multiplier---------------------- ## + lagrange_cfgs: + # Tolerance of constraint violation + cost_limit: 25.0 + # Initial value of lagrangian multiplier + lagrangian_multiplier_init: 0.001 + # Learning rate of lagrangian multiplier + lambda_lr: 0.035 + # Type of lagrangian optimizer + lambda_optimizer: "Adam" + ## Configuration For Env_Wrapper + env_cfgs: + # The reward when the state is unsafe + unsafe_reward: -0.05 + # The lower bound of safety budget + lower_budget: 15 + # The upper bound of safety budget + upper_budget: 25 + # The dicounted factor + simmer_gamma: 0.999 + # Whether to scale the safety budget + scale_safety_budget: True + # Type of Simmer Controller + simmer_controller: 'Q' + # Configurations for controller + controller_cfgs: + # The dim of state space + state_dim: 5 + # The dim of action space + act_dim: 3 + # The theshold of safety budget + threshold: 2 + # The learning rate of Q network + q_lr: 0.1 + # The hyperparameter of episilon greedy + epsilon: 0.8 + # Lowpass filter coefficient + tau: 0.95 diff --git a/omnisafe/configs/on-policy/PPOSaute.yaml b/omnisafe/configs/on-policy/PPOSaute.yaml new file mode 100644 index 000000000..dbd20ae0c --- /dev/null +++ b/omnisafe/configs/on-policy/PPOSaute.yaml @@ -0,0 +1,118 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
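The `'Q'` variant replaces the PID controller with a small tabular Q-learner: the discretized budget level serves as the state (`state_dim` levels), the action raises, keeps, or lowers the budget (`act_dim` choices), `q_lr` is the temporal-difference learning rate, and `epsilon` governs epsilon-greedy selection (read here as the probability of acting greedily, an assumption given its 0.8 default). The precise state encoding and controller reward are not visible in this config, so the following is only a generic tabular Q-learning sketch under those assumptions (the discount `gamma` is likewise an assumed extra parameter):

    import numpy as np

    class SimmerQControllerSketch:
        def __init__(self, state_dim=5, act_dim=3, q_lr=0.1, epsilon=0.8, gamma=0.9):
            self.q_table = np.zeros((state_dim, act_dim))
            self.q_lr, self.epsilon, self.gamma = q_lr, epsilon, gamma

        def select_action(self, state: int) -> int:
            if np.random.rand() > self.epsilon:                  # explore with probability 1 - epsilon
                return int(np.random.randint(self.q_table.shape[1]))
            return int(np.argmax(self.q_table[state]))           # otherwise act greedily

        def update(self, state: int, action: int, reward: float, next_state: int) -> None:
            td_target = reward + self.gamma * np.max(self.q_table[next_state])
            td_error = td_target - self.q_table[state, action]
            self.q_table[state, action] += self.q_lr * td_error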
+# ============================================================================== + +defaults: + # --------------------------------------Basic Configurations----------------------------------- # + ## -----------------------------Basic configurations for base class PG------------------------ ## + # The random seed + seed: 0 + # The environment wrapper type + wrapper_type: SauteEnvWrapper + # Number of epochs + epochs: 500 + # Number of steps per epoch + steps_per_epoch: 30000 + # Number of update iteration for Actor network + actor_iters: 80 + # Number of update iteration for Critic network + critic_iters: 40 + # Check if all models own the same parameter values every `check_freq` epoch + check_freq: 25 + # Save model to disk every `check_freq` epochs + save_freq: 100 + # Entropy coefficient for PPO loss + entropy_coef: 0.01 + # The max length of per epoch + max_ep_len: 1000 + # The size of mini batch + num_mini_batches: 16 + # The learning rate of Actor network + actor_lr: 0.0003 + # The learning rate of Critic network + critic_lr: 0.001 + # The Address for saving training process data + data_dir: "./runs" + ## ---------------------------Basic configurations for derived class PPOLag------------------- ## + # The thereshold for KL early stopping + target_kl: 0.01 + # The size of batch for policy update + batch_size: 2000 + # The clip range for PPO loss + clip: 0.2 + + # ---------------------------------------Optional Configuration-------------------------------- # + ## -----------------------------------Configuration For Cost Critic--------------------------- ## + # Whether to use cost critic + use_cost: True + # Cost discounted factor + cost_gamma: 1.0 + # Whther to use linear decay of learning rate + linear_lr_decay: False + # Whether to use exploration noise anneal + exploration_noise_anneal: True + # Whther to use reward penalty + reward_penalty: False + # Whether to use KL early stopping + kl_early_stopping: False + # Whether to use max gradient norm + use_max_grad_norm: False + # The thereshold of max gradient norm + max_grad_norm: 0.5 + # Whether to use reward scaling + scale_rewards: False + # Whether to use standardized observation + standardized_obs: False + ## ---------------------------------------Configuration For Model----------------------------- ## + model_cfgs: + # Whether to share the weight of Actor network with Critic network + shared_weights: False + # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
+ weight_initialization_mode: "kaiming_uniform" + # Configuration of Actor and Critic network + ac_kwargs: + # Configuration of Actor network + pi: + # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" + actor_type: gaussian_annealing + # Size of hidden layers + hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" + activation: tanh + val: + # Size of hidden layers + hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" + activation: tanh + ## --------------------------------------Configuration For Buffer----------------------------- ## + buffer_cfgs: + # Reward discounted factor + gamma: 0.99 + # Parameters used to estimate future rewards in GAE + lam: 0.95 + # Parameters used to estimate future costs in GAE + lam_c: 0.95 + # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" + adv_estimation_method: gae + # Whether to use standardized reward + standardized_reward: True + # Whether to use standardized cost + standardized_cost: True + ## Configuration For Env_Wrapper + env_cfgs: + unsafe_reward: -0.1 + # ``safety_budget`` in saute is actually the same as ``cost_limmit``. + safety_budget: 25 + saute_gamma: 0.9997 + scale_safety_budget: True diff --git a/omnisafe/configs/on-policy/PPOSimmerPid.yaml b/omnisafe/configs/on-policy/PPOSimmerPid.yaml new file mode 100644 index 000000000..0de3587c3 --- /dev/null +++ b/omnisafe/configs/on-policy/PPOSimmerPid.yaml @@ -0,0 +1,137 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
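`standardized_reward` and `standardized_cost` in the buffer configuration toggle per-batch standardization of the reward-side and cost-side advantage signals before the update, which keeps the two gradient scales comparable; the exact quantity being normalized is not spelled out here, so treat this as the usual operation:

    import torch

    def standardize(x: torch.Tensor, eps: float = 1e-8) -> torch.Tensor:
        """Zero-mean, unit-variance normalization of a batch of advantages."""
        return (x - x.mean()) / (x.std() + eps)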
+# ============================================================================== + +defaults: + # --------------------------------------Basic Configurations----------------------------------- # + ## -----------------------------Basic configurations for base class PG------------------------ ## + # The random seed + seed: 0 + # The environment wrapper type + wrapper_type: SimmerEnvWrapper + # Number of epochs + epochs: 500 + # Number of steps per epoch + steps_per_epoch: 30000 + # Number of update iteration for Actor network + actor_iters: 80 + # Number of update iteration for Critic network + critic_iters: 40 + # Check if all models own the same parameter values every `check_freq` epoch + check_freq: 25 + # Save model to disk every `check_freq` epochs + save_freq: 100 + # Entropy coefficient for PPO loss + entropy_coef: 0.01 + # The max length of per epoch + max_ep_len: 1000 + # The size of mini batch + num_mini_batches: 16 + # The learning rate of Actor network + actor_lr: 0.0003 + # The learning rate of Critic network + critic_lr: 0.001 + # The Address for saving training process data + data_dir: "./runs" + ## ---------------------------Basic configurations for derived class PPOLag------------------- ## + # The thereshold for KL early stopping + target_kl: 0.01 + # The size of batch for policy update + batch_size: 2000 + # The clip range for PPO loss + clip: 0.2 + + # ---------------------------------------Optional Configuration-------------------------------- # + ## -----------------------------------Configuration For Cost Critic--------------------------- ## + # Whether to use cost critic + use_cost: True + # Cost discounted factor + cost_gamma: 1.0 + # Whther to use linear decay of learning rate + linear_lr_decay: False + # Whether to use exploration noise anneal + exploration_noise_anneal: True + # Whther to use reward penalty + reward_penalty: False + # Whether to use KL early stopping + kl_early_stopping: False + # Whether to use max gradient norm + use_max_grad_norm: False + # The thereshold of max gradient norm + max_grad_norm: 0.5 + # Whether to use reward scaling + scale_rewards: False + # Whether to use standardized observation + standardized_obs: False + ## ---------------------------------------Configuration For Model----------------------------- ## + model_cfgs: + # Whether to share the weight of Actor network with Critic network + shared_weights: False + # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
+ weight_initialization_mode: "kaiming_uniform" + # Configuration of Actor and Critic network + ac_kwargs: + # Configuration of Actor network + pi: + # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" + actor_type: gaussian_annealing + # Size of hidden layers + hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" + activation: tanh + val: + # Size of hidden layers + hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" + activation: tanh + ## --------------------------------------Configuration For Buffer----------------------------- ## + buffer_cfgs: + # Reward discounted factor + gamma: 0.99 + # Parameters used to estimate future rewards in GAE + lam: 0.95 + # Parameters used to estimate future costs in GAE + lam_c: 0.95 + # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" + adv_estimation_method: gae + # Whether to use standardized reward + standardized_reward: True + # Whether to use standardized cost + standardized_cost: True + ## Configuration For Env_Wrapper + env_cfgs: + # The reward when the state is unsafe + unsafe_reward: -0.5 + # The lower bound of safety budget + lower_budget: 15 + # The upper bound of safety budget + upper_budget: 25 + # The dicounted factor + simmer_gamma: 0.9997 + # Whether to scale the safety budget + scale_safety_budget: True + # Type of Simmer Controller + simmer_controller: 'PID' + # Configuration of Simmer Controller + controller_cfgs: + # Kp for PID + pid_kp: 0.1 + # Ki for PID + pid_ki: 0.01 + # Kd for PID + pid_kd: 0.01 + # The step size for PID + step_size: 3 + # Lowpass filter coefficient + tau: 0.05 diff --git a/omnisafe/configs/on-policy/PPOSimmerQ.yaml b/omnisafe/configs/on-policy/PPOSimmerQ.yaml new file mode 100644 index 000000000..4d640d877 --- /dev/null +++ b/omnisafe/configs/on-policy/PPOSimmerQ.yaml @@ -0,0 +1,139 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +defaults: + # --------------------------------------Basic Configurations----------------------------------- # + ## -----------------------------Basic configurations for base class PG------------------------ ## + # The random seed + seed: 0 + # The environment wrapper type + wrapper_type: SimmerEnvWrapper + # Number of epochs + epochs: 500 + # Number of steps per epoch + steps_per_epoch: 30000 + # Number of update iteration for Actor network + actor_iters: 80 + # Number of update iteration for Critic network + critic_iters: 40 + # Check if all models own the same parameter values every `check_freq` epoch + check_freq: 25 + # Save model to disk every `check_freq` epochs + save_freq: 100 + # Entropy coefficient for PPO loss + entropy_coef: 0.01 + # The max length of per epoch + max_ep_len: 1000 + # The size of mini batch + num_mini_batches: 16 + # The learning rate of Actor network + actor_lr: 0.0003 + # The learning rate of Critic network + critic_lr: 0.001 + # The Address for saving training process data + data_dir: "./runs" + ## ---------------------------Basic configurations for derived class PPOLag------------------- ## + # The thereshold for KL early stopping + target_kl: 0.01 + # The size of batch for policy update + batch_size: 2000 + # The clip range for PPO loss + clip: 0.2 + + # ---------------------------------------Optional Configuration-------------------------------- # + ## -----------------------------------Configuration For Cost Critic--------------------------- ## + # Whether to use cost critic + use_cost: True + # Cost discounted factor + cost_gamma: 1.0 + # Whther to use linear decay of learning rate + linear_lr_decay: False + # Whether to use exploration noise anneal + exploration_noise_anneal: True + # Whther to use reward penalty + reward_penalty: False + # Whether to use KL early stopping + kl_early_stopping: False + # Whether to use max gradient norm + use_max_grad_norm: False + # The thereshold of max gradient norm + max_grad_norm: 0.5 + # Whether to use reward scaling + scale_rewards: False + # Whether to use standardized observation + standardized_obs: False + ## ---------------------------------------Configuration For Model----------------------------- ## + model_cfgs: + # Whether to share the weight of Actor network with Critic network + shared_weights: False + # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
+ weight_initialization_mode: "kaiming_uniform" + # Configuration of Actor and Critic network + ac_kwargs: + # Configuration of Actor network + pi: + # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" + actor_type: gaussian_annealing + # Size of hidden layers + hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" + activation: tanh + val: + # Size of hidden layers + hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" + activation: tanh + ## --------------------------------------Configuration For Buffer----------------------------- ## + buffer_cfgs: + # Reward discounted factor + gamma: 0.99 + # Parameters used to estimate future rewards in GAE + lam: 0.95 + # Parameters used to estimate future costs in GAE + lam_c: 0.95 + # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" + adv_estimation_method: gae + # Whether to use standardized reward + standardized_reward: True + # Whether to use standardized cost + standardized_cost: True + ## Configuration For Env_Wrapper + env_cfgs: + # The reward when the state is unsafe + unsafe_reward: -0.1 + # The lower bound of safety budget + lower_budget: 15 + # The upper bound of safety budget + upper_budget: 25 + # The dicounted factor + simmer_gamma: 0.9997 + # Whether to scale the safety budget + scale_safety_budget: False + # Type of Simmer Controller + simmer_controller: 'Q' + # Configurations for controller + controller_cfgs: + # The dim of state space + state_dim: 5 + # The dim of action space + act_dim: 3 + # The theshold of safety budget + threshold: 2 + # The learning rate of Q network + q_lr: 0.1 + # The hyperparameter of episilon greedy + epsilon: 0.8 + # Lowpass filter coefficient + tau: 0.95 diff --git a/omnisafe/configs/on-policy/PolicyGradient.yaml b/omnisafe/configs/on-policy/PolicyGradient.yaml index 033c62086..f2eb2bd31 100644 --- a/omnisafe/configs/on-policy/PolicyGradient.yaml +++ b/omnisafe/configs/on-policy/PolicyGradient.yaml @@ -14,53 +14,91 @@ # ============================================================================== defaults: - # Basic Configurations + # --------------------------------------Basic Configurations----------------------------------- # + ## -----------------------------Basic configurations for base class PG------------------------ ## + # The random seed + seed: 0 + # The environment wrapper type + wrapper_type: OnPolicyEnvWrapper + # Number of epochs epochs: 500 + # Number of steps per epoch steps_per_epoch: 30000 + # Number of update iteration for Actor network actor_iters: 80 + # Number of update iteration for Critic network critic_iters: 40 + # Check if all models own the same parameter values every `check_freq` epoch check_freq: 25 + # Save model to disk every `check_freq` epochs save_freq: 100 + # Entropy coefficient for PPO loss entropy_coef: 0.01 + # The max length of per epoch max_ep_len: 1000 - num_mini_batches: 32 + # The size of mini batch + num_mini_batches: 16 + # The learning rate of Actor network actor_lr: 0.0003 + # The learning rate of Critic network critic_lr: 0.001 - target_kl: 0.01 + # The Address for saving training process data data_dir: "./runs" - seed: 0 - # Optional Configuration - ## Whether to use cost critic - use_cost: False + # ---------------------------------------Optional Configuration-------------------------------- # + ## 
-----------------------------------Configuration For Cost Critic--------------------------- ## + # Whether to use cost critic + use_cost: True + # Cost discounted factor cost_gamma: 1.0 - use_cost_critic: False + # Whther to use linear decay of learning rate linear_lr_decay: False - exploration_noise_anneal: False + # Whether to use exploration noise anneal + exploration_noise_anneal: True + # Whther to use reward penalty reward_penalty: False + # Whether to use KL early stopping kl_early_stopping: False + # Whether to use max gradient norm use_max_grad_norm: False + # The thereshold of max gradient norm max_grad_norm: 0.5 + # Whether to use reward scaling scale_rewards: False - standardized_obs: True - ## Configuration For Mode + # Whether to use standardized observation + standardized_obs: False + ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: + # Whether to share the weight of Actor network with Critic network shared_weights: False + # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". weight_initialization_mode: "kaiming_uniform" + # Configuration of Actor and Critic network ac_kwargs: + # Configuration of Actor network pi: + # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" actor_type: gaussian_annealing + # Size of hidden layers hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" activation: tanh val: + # Size of hidden layers hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" activation: tanh - ## Configuration For Buffer + ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: + # Reward discounted factor gamma: 0.99 + # Parameters used to estimate future rewards in GAE lam: 0.95 + # Parameters used to estimate future costs in GAE lam_c: 0.95 + # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" adv_estimation_method: gae - standardized_reward: False - standardized_cost: False - reward_penalty: False + # Whether to use standardized reward + standardized_reward: True + # Whether to use standardized cost + standardized_cost: True diff --git a/omnisafe/configs/on-policy/TRPO.yaml b/omnisafe/configs/on-policy/TRPO.yaml index f0f4176fb..2dfdeb532 100644 --- a/omnisafe/configs/on-policy/TRPO.yaml +++ b/omnisafe/configs/on-policy/TRPO.yaml @@ -14,57 +14,102 @@ # ============================================================================== defaults: - # Basic Configurations - ## Basic configurations for base class PG + # --------------------------------------Basic Configurations----------------------------------- # + ## -----------------------------Basic configurations for base class PG------------------------ ## + # The random seed + seed: 0 + # The environment wrapper type + wrapper_type: OnPolicyEnvWrapper + # Number of epochs epochs: 500 + # Number of steps per epoch steps_per_epoch: 30000 + # Number of update iteration for Actor network actor_iters: 80 + # Number of update iteration for Critic network critic_iters: 40 + # Check if all models own the same parameter values every `check_freq` epoch check_freq: 25 + # Save model to disk every `check_freq` epochs save_freq: 100 + # Entropy coefficient for PPO loss entropy_coef: 0.01 + # The max length of per 
epoch max_ep_len: 1000 + # The size of mini batch num_mini_batches: 16 + # The learning rate of Actor network actor_lr: 0.0003 + # The learning rate of Critic network critic_lr: 0.001 + # The Address for saving training process data data_dir: "./runs" - seed: 0 - ## Basic configurations for derived class Natural PG + ## -----------------------------Basic configurations for derived class CPO--------------------- ## + # The thereshold for KL early stopping target_kl: 0.01 + # Tolerance of constraint violation + cost_limit: 25 + # Damping value for conjugate gradient cg_damping: 0.1 + # Number of conjugate gradient iterations cg_iters: 10 + # Subsampled observation fvp_obs: None - # Optional Configuration - ## Whether to use cost critic - use_cost: False + # ---------------------------------------Optional Configuration-------------------------------- # + ## -----------------------------------Configuration For Cost Critic--------------------------- ## + # Whether to use cost critic + use_cost: True + # Cost discounted factor cost_gamma: 1.0 - exploration_noise_anneal: True - standardized_obs: True + # Whther to use linear decay of learning rate linear_lr_decay: False + # Whether to use exploration noise anneal + exploration_noise_anneal: True + # Whther to use reward penalty reward_penalty: False - kl_early_stopping: False + # Whether to use KL early stopping + kl_early_stopping: True + # Whether to use max gradient norm use_max_grad_norm: False + # The thereshold of max gradient norm max_grad_norm: 0.5 + # Whether to use reward scaling scale_rewards: False - ## Configuration For Mode + # Whether to use standardized observation + standardized_obs: False + ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: + # Whether to share the weight of Actor network with Critic network shared_weights: False + # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
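For the trust-region methods, `target_kl` bounds the KL divergence between the updated and the old policy: after the conjugate-gradient search direction is found, a backtracking line search shrinks the step until the KL constraint holds and the surrogate objective improves. A compact sketch of that acceptance test, assuming the caller provides `set_params`, `kl_to_old`, and `surrogate_gain` callables (hypothetical names):

    def backtracking_line_search(theta_old, full_step, set_params, kl_to_old, surrogate_gain,
                                 target_kl=0.01, max_backtracks=10, decay=0.8):
        """Shrink the proposed step until KL(new || old) <= target_kl and the surrogate improves."""
        for i in range(max_backtracks):
            step_frac = decay ** i
            set_params(theta_old + step_frac * full_step)
            if kl_to_old() <= target_kl and surrogate_gain() > 0.0:
                return step_frac          # accept this scaled step
        set_params(theta_old)             # no acceptable step found: restore the old parameters
        return 0.0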
weight_initialization_mode: "kaiming_uniform" + # Configuration of Actor and Critic network ac_kwargs: + # Configuration of Actor network pi: + # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" actor_type: gaussian_annealing + # Size of hidden layers hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" activation: tanh val: + # Size of hidden layers hidden_sizes: [64, 64] + # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" activation: tanh - ## Configuration For Buffer + ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: + # Reward discounted factor gamma: 0.99 + # Parameters used to estimate future rewards in GAE lam: 0.95 + # Parameters used to estimate future costs in GAE lam_c: 0.95 + # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" adv_estimation_method: gae - standardized_reward: False - standardized_cost: False - reward_penalty: False + # Whether to use standardized reward + standardized_reward: True + # Whether to use standardized cost + standardized_cost: True diff --git a/omnisafe/configs/on-policy/TRPOLag.yaml b/omnisafe/configs/on-policy/TRPOLag.yaml index 0e438436f..5bff7d726 100644 --- a/omnisafe/configs/on-policy/TRPOLag.yaml +++ b/omnisafe/configs/on-policy/TRPOLag.yaml @@ -18,6 +18,8 @@ defaults: ## -----------------------------Basic configurations for base class PG------------------------ ## # The random seed seed: 0 + # The environment wrapper type + wrapper_type: OnPolicyEnvWrapper # Number of epochs epochs: 500 # Number of steps per epoch @@ -75,7 +77,7 @@ defaults: # Whether to use reward scaling scale_rewards: False # Whether to use standardized observation - standardized_obs: True + standardized_obs: False ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: # Whether to share the weight of Actor network with Critic network diff --git a/omnisafe/configs/on-policy/TRPOPid.yaml b/omnisafe/configs/on-policy/TRPOPid.yaml new file mode 100644 index 000000000..5c693a6cc --- /dev/null +++ b/omnisafe/configs/on-policy/TRPOPid.yaml @@ -0,0 +1,139 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/omnisafe/configs/on-policy/TRPOPid.yaml b/omnisafe/configs/on-policy/TRPOPid.yaml new file mode 100644 index 000000000..5c693a6cc --- /dev/null +++ b/omnisafe/configs/on-policy/TRPOPid.yaml @@ -0,0 +1,139 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +defaults: + # --------------------------------------Basic Configurations----------------------------------- # + ## -----------------------------Basic configurations for base class PG------------------------ ## + # The random seed + seed: 0 + # The environment wrapper type + wrapper_type: OnPolicyEnvWrapper + # Number of epochs + epochs: 500 + # Number of steps per epoch + steps_per_epoch: 30000 + # Number of update iterations for the Actor network + actor_iters: 80 + # Number of update iterations for the Critic network + critic_iters: 40 + # Check if all models own the same parameter values every `check_freq` epochs + check_freq: 25 + # Save model to disk every `save_freq` epochs + save_freq: 100 + # Entropy coefficient for PPO loss + entropy_coef: 0.01 + # The max length of each episode + max_ep_len: 1000 + # The size of mini batch + num_mini_batches: 16 + # The learning rate of Actor network + actor_lr: 0.0003 + # The learning rate of Critic network + critic_lr: 0.001 + # The directory for saving training process data + data_dir: "./runs" + ## -----------------------------Basic configurations for derived class CPO--------------------- ## + # The threshold for KL early stopping + target_kl: 0.01 + # Tolerance of constraint violation + cost_limit: 25 + # Damping value for conjugate gradient + cg_damping: 0.1 + # Number of conjugate gradient iterations + cg_iters: 10 + # Subsampled observation + fvp_obs: None + + # ---------------------------------------Optional Configuration-------------------------------- # + ## -----------------------------------Configuration For Cost Critic--------------------------- ## + # Whether to use cost critic + use_cost: True + # Cost discount factor + cost_gamma: 1.0 + # Whether to use linear decay of learning rate + linear_lr_decay: False + # Whether to use exploration noise annealing + exploration_noise_anneal: True + # Whether to use reward penalty + reward_penalty: False + # Whether to use KL early stopping + kl_early_stopping: True + # Whether to use max gradient norm + use_max_grad_norm: False + # The threshold of max gradient norm + max_grad_norm: 0.5 + # Whether to use reward scaling + scale_rewards: False + # Whether to use standardized observation + standardized_obs: False + ## ---------------------------------------Configuration For Model----------------------------- ## + model_cfgs: + # Whether to share the weights of the Actor network with the Critic network + shared_weights: False + # The mode to initialize the network weights, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". + weight_initialization_mode: "kaiming_uniform" + # Configuration of Actor and Critic network + ac_kwargs: + # Configuration of Actor network + pi: + # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" + actor_type: gaussian_annealing + # Size of hidden layers + hidden_sizes: [64, 64] + # Type of activation function, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" + activation: tanh + val: + # Size of hidden layers + hidden_sizes: [64, 64] + # Type of activation function, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" + activation: tanh + ## --------------------------------------Configuration For Buffer----------------------------- ## + buffer_cfgs: + # Reward discount factor + gamma: 0.99 + # Parameters used to estimate future rewards in GAE + lam: 0.95 + # Parameters used to estimate future costs in GAE + lam_c: 0.95 + # Method to estimate the reward/cost advantage, choosing from "gae", "plain", "vtrace" + adv_estimation_method: gae + # Whether to use standardized reward + standardized_reward: True + # Whether to use standardized cost + standardized_cost: True
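The PID_cfgs block below is what distinguishes TRPOPid: instead of a single-rate ascent on the multiplier, a PID controller adjusts the cost penalty, with the integral term playing the role of the classic Lagrange multiplier and the proportional and derivative terms reacting to the current violation and the cost trend. A compact sketch of such an update, a generic reading of the PID-Lagrangian idea with illustrative names rather than the repository's pid_lagrange module:

class PIDLagrangianSketch:
    """PID control of the cost penalty; cost_penalty plays the role of the multiplier."""

    def __init__(self, pid_kp=0.01, pid_ki=0.01, pid_kd=0.01,
                 cost_limit=25.0, ema_alpha=0.95, init=0.001):
        self.kp, self.ki, self.kd = pid_kp, pid_ki, pid_kd
        self.cost_limit = cost_limit
        self.ema_alpha = ema_alpha
        self.integral = init
        self.smoothed_cost = 0.0
        self.cost_penalty = init

    def update(self, ep_cost):
        error = ep_cost - self.cost_limit                           # P term: current violation
        self.integral = max(self.integral + self.ki * error, 0.0)   # I term: accumulated violation
        delta = ep_cost - self.smoothed_cost                        # D term: cost trend
        self.smoothed_cost = (self.ema_alpha * self.smoothed_cost
                              + (1.0 - self.ema_alpha) * ep_cost)
        self.cost_penalty = max(self.kp * error + self.integral
                                + self.kd * max(delta, 0.0), 0.0)
        return self.cost_penalty

With sum_norm enabled, the resulting penalty is folded into the surrogate as (J_r - lam * J_c) / (1 + lam), as the comments in the block note.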
+ ## --------------------------------------Configuration For PID--------------------------------- ## + PID_cfgs: + # KP for PID + pid_kp: 0.01 + # KI for PID + pid_ki: 0.01 + # KD for PID + pid_kd: 0.01 + # The initial value of the Lagrangian multiplier + lagrangian_multiplier_init: 0.001 + # The delay rate of KD + pid_d_delay: 10 + # 0 for hard update, 1 for no update + pid_delta_p_ema_alpha: 0.95 + # The same as above + pid_delta_d_ema_alpha: 0.95 + # L = (J_r - lam * J_c) / (1 + lam); lam <= 0 + sum_norm: True + # L = (1 - lam) * J_r - lam * J_c; 0 <= lam <= 1 + diff_norm: False + # Only used if sum_norm=diff_norm=False + penalty_max: 100 + # Tolerance of constraint violation + cost_limit: 25.0 diff --git a/omnisafe/models/actor/actor_builder.py b/omnisafe/models/actor/actor_builder.py index 8f325f7b2..760b50080 100644 --- a/omnisafe/models/actor/actor_builder.py +++ b/omnisafe/models/actor/actor_builder.py @@ -14,12 +14,16 @@ # ============================================================================== """Implementation of ActorBuilder.""" +from typing import Optional + import torch.nn as nn from omnisafe.models.actor.categorical_actor import CategoricalActor +from omnisafe.models.actor.cholesky_actor import MLPCholeskyActor from omnisafe.models.actor.gaussian_annealing_actor import GaussianAnnealingActor from omnisafe.models.actor.gaussian_learning_actor import GaussianLearningActor from omnisafe.models.actor.gaussian_stdnet_actor import GaussianStdNetActor +from omnisafe.models.actor.mlp_actor import MLPActor from omnisafe.utils.model_utils import Activation, InitFunction @@ -36,6 +40,7 @@ def __init__( activation: Activation = 'relu', weight_initialization_mode: InitFunction = 'xavier_uniform', shared: nn.Module = None, + act_noise: Optional[float] = None, ) -> None: self.obs_dim = obs_dim self.act_dim = act_dim @@ -43,6 +48,7 @@ def __init__( self.activation = activation self.weight_initialization_mode = weight_initialization_mode self.shared = shared + self.act_noise = act_noise def build_actor(self, actor_type: str, **kwargs): """Build actor network.""" @@ -86,4 +92,25 @@ def build_actor(self, actor_type: str, **kwargs): shared=self.shared, **kwargs, ) + if actor_type == 'dire': + return MLPActor( + obs_dim=self.obs_dim, + act_dim=self.act_dim, + act_noise=self.act_noise, + hidden_sizes=self.hidden_sizes, +
activation=self.activation, + weight_initialization_mode=self.weight_initialization_mode, + shared=self.shared, + **kwargs, + ) + if actor_type == 'cholesky': + return MLPCholeskyActor( + obs_dim=self.obs_dim, + act_dim=self.act_dim, + hidden_sizes=self.hidden_sizes, + activation=self.activation, + weight_initialization_mode=self.weight_initialization_mode, + **kwargs, + ) + raise NotImplementedError(f'Actor type {actor_type} is not implemented.') diff --git a/omnisafe/models/actor/gaussian_annealing_actor.py b/omnisafe/models/actor/gaussian_annealing_actor.py index 3bdf51014..065abc801 100644 --- a/omnisafe/models/actor/gaussian_annealing_actor.py +++ b/omnisafe/models/actor/gaussian_annealing_actor.py @@ -69,7 +69,7 @@ def _distribution(self, obs): mean = self.net(obs) return Normal(mean, self._std) - def predict(self, obs, deterministic=False, need_log_prob=False): + def predict(self, obs, deterministic=False, need_log_prob=True): dist = self._distribution(obs) if deterministic: out = dist.mean diff --git a/omnisafe/models/actor/gaussian_stdnet_actor.py b/omnisafe/models/actor/gaussian_stdnet_actor.py index 5ae43116b..2c67f2052 100644 --- a/omnisafe/models/actor/gaussian_stdnet_actor.py +++ b/omnisafe/models/actor/gaussian_stdnet_actor.py @@ -30,9 +30,9 @@ def __init__( self, obs_dim, act_dim, - act_min: torch.Tensor, act_max: torch.Tensor, - hidden_sizes, + act_min: torch.Tensor, + hidden_sizes: list, activation, weight_initialization_mode, shared=None, @@ -94,12 +94,13 @@ def predict(self, obs, deterministic=False, need_log_prob=False): action = torch.tanh(out) action = self.act_min + (action + 1) * 0.5 * (self.act_max - self.act_min) + action = torch.clamp(action, self.act_min, self.act_max) if need_log_prob: log_prob = dist.log_prob(out).sum(axis=-1) log_prob -= torch.log(1.00001 - torch.tanh(out) ** 2).sum(axis=-1) - return out, log_prob - return out + return action.to(torch.float32), log_prob + return action.to(torch.float32) def forward(self, obs, act=None): dist = self._distribution(obs) diff --git a/omnisafe/models/actor/mlp_actor.py b/omnisafe/models/actor/mlp_actor.py index ccd8a3bf2..616e3d55e 100644 --- a/omnisafe/models/actor/mlp_actor.py +++ b/omnisafe/models/actor/mlp_actor.py @@ -32,15 +32,18 @@ def __init__( obs_dim: int, act_dim: int, act_noise, - act_limit, + act_max, + act_min, hidden_sizes: list, activation: Activation, weight_initialization_mode: InitFunction = 'xavier_uniform', shared: nn.Module = None, ): super().__init__(obs_dim, act_dim, hidden_sizes, activation) - self.act_limit = act_limit + self.act_max = act_max + self.act_min = act_min self.act_noise = act_noise + self._std = 0.5 * torch.ones(self.act_dim, dtype=torch.float32) if shared is not None: # use shared layers action_head = build_mlp_network( @@ -62,15 +65,24 @@ def _distribution(self, obs): mean = self.net(obs) return Normal(mean, self._std) + def get_distribution(self, obs): + """Get the distribution of actor.""" + return self._distribution(obs) + def forward(self, obs, act=None): - """forward""" + """Forward""" # Return output from network scaled to action space limits. 
- return self.act_limit * self.net(obs) + return self.act_max * self.net(obs) - def predict(self, obs, deterministic=False, need_log_prob=False): + def predict(self, obs, deterministic=False, need_log_prob=True): if deterministic: - action = self.act_limit * self.net(obs) + action = self.act_max * self.net(obs) else: - action = self.act_limit * self.net(obs) + action = self.act_max * self.net(obs) action += self.act_noise * np.random.randn(self.act_dim) - return action.to(torch.float32), torch.tensor(1, dtype=torch.float32) + + action = torch.clamp(action, self.act_min, self.act_max) + if need_log_prob: + return action.to(torch.float32), torch.tensor(1, dtype=torch.float32) + + return action.to(torch.float32) diff --git a/omnisafe/models/actor_critic.py b/omnisafe/models/actor_critic.py index 5b3df55fa..cbea39711 100644 --- a/omnisafe/models/actor_critic.py +++ b/omnisafe/models/actor_critic.py @@ -131,5 +131,5 @@ def anneal_exploration(self, frac): frac: progress of epochs, i.e. current epoch / total epochs e.g. 10 / 100 = 0.1 """ - if hasattr(self.actor, 'set_log_std'): + if hasattr(self.actor, 'set_std'): self.actor.set_std(1 - frac) diff --git a/omnisafe/models/actor_q_critic.py b/omnisafe/models/actor_q_critic.py index 66dc66a83..8e6d49708 100644 --- a/omnisafe/models/actor_q_critic.py +++ b/omnisafe/models/actor_q_critic.py @@ -17,8 +17,8 @@ import numpy as np import torch import torch.nn as nn -from gymnasium.spaces import Box +from omnisafe.models.actor import ActorBuilder from omnisafe.models.actor.mlp_actor import MLPActor from omnisafe.models.critic.q_critic import QCritic from omnisafe.utils.model_utils import build_mlp_network @@ -45,15 +45,10 @@ def __init__( self.obs_shape = observation_space.shape self.obs_oms = OnlineMeanStd(shape=self.obs_shape) if standardized_obs else None self.act_dim = action_space.shape[0] - self.act_limit = action_space.high[0] + self.act_max = torch.as_tensor(action_space.high) + self.act_min = torch.as_tensor(action_space.low) self.ac_kwargs = model_cfgs.ac_kwargs # build policy and value functions - if isinstance(action_space, Box): - if model_cfgs.pi_type == 'dire': - actor_fn = MLPActor - act_dim = action_space.shape[0] - else: - raise ValueError self.obs_dim = observation_space.shape[0] @@ -71,31 +66,42 @@ def __init__( else: shared = None - self.actor = actor_fn( + actor_builder = ActorBuilder( obs_dim=self.obs_dim, - act_dim=act_dim, + act_dim=self.act_dim, act_noise=model_cfgs.ac_kwargs.pi.act_noise, - act_limit=self.act_limit, hidden_sizes=model_cfgs.ac_kwargs.pi.hidden_sizes, activation=model_cfgs.ac_kwargs.pi.activation, weight_initialization_mode=weight_initialization_mode, shared=shared, ) + + if self.ac_kwargs.pi.actor_type == 'cholesky': + self.actor = actor_builder.build_actor( + self.ac_kwargs.pi.actor_type, + act_max=self.act_max, + act_min=self.act_min, + cov_min=self.ac_kwargs.pi.cov_min, + mu_clamp_min=self.ac_kwargs.pi.mu_clamp_min, + mu_clamp_max=self.ac_kwargs.pi.mu_clamp_max, + cov_clamp_min=self.ac_kwargs.pi.cov_clamp_min, + cov_clamp_max=self.ac_kwargs.pi.cov_clamp_max, + ) + else: + self.actor = actor_builder.build_actor( + self.ac_kwargs.pi.actor_type, + act_max=self.act_max, + act_min=self.act_min, + ) + self.critic = QCritic( self.obs_dim, - act_dim, - hidden_sizes=model_cfgs.ac_kwargs.val.hidden_sizes, - activation=model_cfgs.ac_kwargs.val.activation, - weight_initialization_mode=weight_initialization_mode, - shared=shared, - ) - self.critic_ = QCritic( - self.obs_dim, - act_dim, + self.act_dim, 
hidden_sizes=model_cfgs.ac_kwargs.val.hidden_sizes, activation=model_cfgs.ac_kwargs.val.activation, weight_initialization_mode=weight_initialization_mode, shared=shared, + num_critics=model_cfgs.ac_kwargs.val.num_critics, ) def step(self, obs, deterministic=False): @@ -120,7 +126,8 @@ def step(self, obs, deterministic=False): else: action, logp_a = self.pi.predict(obs, determinstic=deterministic) value = self.v(obs, action) - action = np.clip(action.numpy(), -self.act_limit, self.act_limit) + action = action.to(torch.float32) + action = np.clip(action.numpy(), self.act_min, self.act_max) return action, value.numpy(), logp_a.numpy() @@ -130,8 +137,8 @@ def anneal_exploration(self, frac): frac: progress of epochs, i.e. current epoch / total epochs e.g. 10 / 100 = 0.1 """ - if hasattr(self.pi, 'set_log_std'): - self.pi.set_log_std(1 - frac) + if hasattr(self.actor, 'set_log_std'): + self.actor.set_log_std(1 - frac) def forward(self, obs, act): """Compute the value of a given state-action pair.""" diff --git a/omnisafe/models/constraint_actor_q_critic.py b/omnisafe/models/constraint_actor_q_critic.py index e6116866e..049d636bf 100644 --- a/omnisafe/models/constraint_actor_q_critic.py +++ b/omnisafe/models/constraint_actor_q_critic.py @@ -14,7 +14,6 @@ # ============================================================================== """Implementation of ConstraintActorQCritic.""" -import numpy as np import torch from omnisafe.models.actor_q_critic import ActorQCritic @@ -67,9 +66,10 @@ def step(self, obs, deterministic=False): # Note: Update RMS in Algorithm.running_statistics() method # self.obs_oms.update(obs) if self.training else None obs = self.obs_oms(obs) - action, logp_a = self.actor.predict(obs, deterministic=deterministic) - value = self.critic(obs, action) - cost_value = self.cost_critic(obs, action) - action = np.clip(action.numpy(), -self.act_limit, self.act_limit) + action, logp_a = self.actor.predict( + obs, deterministic=deterministic, need_log_prob=True + ) + value = self.critic(obs, action)[0] + cost_value = self.cost_critic(obs, action)[0] - return action, value.numpy(), cost_value.numpy(), logp_a.numpy() + return action.numpy(), value.numpy(), cost_value.numpy(), logp_a.numpy() diff --git a/omnisafe/models/critic/q_critic.py b/omnisafe/models/critic/q_critic.py index 2b4db763e..904df3701 100644 --- a/omnisafe/models/critic/q_critic.py +++ b/omnisafe/models/critic/q_critic.py @@ -34,6 +34,7 @@ def __init__( activation: Activation = 'relu', weight_initialization_mode: InitFunction = 'xavier_uniform', shared: nn.Module = None, + num_critics: int = 1, ) -> None: """Initialize.""" Critic.__init__( @@ -45,17 +46,22 @@ def __init__( weight_initialization_mode=weight_initialization_mode, shared=shared, ) - self.obs_encoder = build_mlp_network( - [obs_dim, hidden_sizes[0]], - activation=activation, - output_activation=activation, - weight_initialization_mode=weight_initialization_mode, - ) - self.net = build_mlp_network( - [hidden_sizes[0] + act_dim] + hidden_sizes[1:] + [1], - activation=activation, - weight_initialization_mode=weight_initialization_mode, - ) + self.critic_list = [] + for idx in range(num_critics): + obs_encoder = build_mlp_network( + [obs_dim, hidden_sizes[0]], + activation=activation, + output_activation=activation, + weight_initialization_mode=weight_initialization_mode, + ) + net = build_mlp_network( + [hidden_sizes[0] + act_dim] + hidden_sizes[1:] + [1], + activation=activation, + weight_initialization_mode=weight_initialization_mode, + ) + critic = 
nn.Sequential(obs_encoder, net) + self.critic_list.append(critic) + self.add_module(f'critic_{idx}', critic) def forward( self, @@ -63,5 +69,8 @@ def forward( act: Optional[torch.Tensor] = None, ): """Forward.""" - obs = self.obs_encoder(obs) - return torch.squeeze(self.net(torch.cat([obs, act], dim=-1)), -1) + res = [] + for critic in self.critic_list: + encodered_obs = critic[0](obs) + res.append(torch.squeeze(critic[1](torch.cat([encodered_obs, act], dim=-1)), -1)) + return res diff --git a/omnisafe/utils/config_utils.py b/omnisafe/utils/config_utils.py index d0e9c8cf3..dba52654d 100644 --- a/omnisafe/utils/config_utils.py +++ b/omnisafe/utils/config_utils.py @@ -26,28 +26,21 @@ def recursive_update(args: dict, update_args: dict): print(f'{key}:') recursive_update(args[key], update_args[key]) else: - # f-strings: - # https://pylint.pycqa.org/en/latest/user_guide/messages/convention/consider-using-f-string.html args[key] = update_args[key] menus = (key, update_args[key]) print(f'- {menus[0]}: {menus[1]} is update!') elif isinstance(value, dict): recursive_update(value, update_args) + return dict2namedtuple(args) - return create_namedtuple_from_dict(args) - -def create_namedtuple_from_dict(obj): +def dict2namedtuple(obj): """Create namedtuple from dict""" if isinstance(obj, dict): fields = sorted(obj.keys()) - namedtuple_type = namedtuple( - typename='GenericObject', - field_names=fields, - rename=True, - ) + namedtuple_type = namedtuple('GenericObject', fields, rename=True) field_value_pairs = OrderedDict( - (str(field), create_namedtuple_from_dict(obj[field])) for field in fields + (str(field), dict2namedtuple(obj[field])) for field in fields ) try: return namedtuple_type(**field_value_pairs) @@ -55,11 +48,18 @@ def create_namedtuple_from_dict(obj): # Cannot create namedtuple instance so fallback to dict (invalid attribute names) return dict(**field_value_pairs) elif isinstance(obj, (list, set, tuple, frozenset)): - return [create_namedtuple_from_dict(item) for item in obj] + return [dict2namedtuple(item) for item in obj] else: return obj +def namedtuple2dict(obj): + """Create a dict from a namedtuple.""" + if isinstance(obj, tuple) and hasattr(obj, '_fields'): + return {key: namedtuple2dict(value) for key, value in obj._asdict().items()} + return obj + + def check_all_configs(configs, algo_type): """Check all configs""" if algo_type == 'on-policy': diff --git a/omnisafe/wrappers/__init__.py b/omnisafe/wrappers/__init__.py index 9eb8141a6..3fd963fd0 100644 --- a/omnisafe/wrappers/__init__.py +++ b/omnisafe/wrappers/__init__.py @@ -14,5 +14,33 @@ # ============================================================================== """Environment wrappers.""" +import itertools +from types import MappingProxyType + +from omnisafe.wrappers.early_terminated_wrapper import EarlyTerminatedEnvWrapper from omnisafe.wrappers.off_policy_wrapper import OffPolicyEnvWrapper from omnisafe.wrappers.on_policy_wrapper import OnPolicyEnvWrapper +from omnisafe.wrappers.saute_wrapper import SauteEnvWrapper +from omnisafe.wrappers.simmer_wrapper import SimmerEnvWrapper + + +ENVWRAPPERS = { + 'on-policy-wrapper': OnPolicyEnvWrapper, + 'off-policy-wrapper': OffPolicyEnvWrapper, + 'saute-wrapper': SauteEnvWrapper, + 'simmer-wrapper': SimmerEnvWrapper, + 'early-terminated-wrapper': EarlyTerminatedEnvWrapper, +} + +ENVWRAPPERS2TYPE = { + env_wrapper: env_wrapper_type for env_wrapper_type, env_wrapper in ENVWRAPPERS.items() +} + +__all__ = ENVWRAPPERS['all'] = tuple(itertools.chain(ENVWRAPPERS.values())) + +assert 
len(ENVWRAPPERS2TYPE) == len(__all__), 'Duplicate environment wrappers found.' + +ENVWRAPPERS = MappingProxyType(ENVWRAPPERS) +ENVWRAPPERS2TYPE = MappingProxyType(ENVWRAPPERS2TYPE) + +del itertools, MappingProxyType diff --git a/omnisafe/wrappers/early_terminated_wrapper.py b/omnisafe/wrappers/early_terminated_wrapper.py new file mode 100644 index 000000000..7f9b00d93 --- /dev/null +++ b/omnisafe/wrappers/early_terminated_wrapper.py @@ -0,0 +1,87 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Early terminated wrapper.""" + +import torch + +from omnisafe.wrappers.on_policy_wrapper import OnPolicyEnvWrapper +from omnisafe.wrappers.wrapper_registry import WRAPPER_REGISTRY + + +@WRAPPER_REGISTRY.register +class EarlyTerminatedEnvWrapper(OnPolicyEnvWrapper): # pylint: disable=too-many-instance-attributes + """EarlyTerminatedEnvWrapper.""" + + # pylint: disable-next=too-many-locals + def roll_out(self, agent, buf, logger): + """Collect data and store to experience buffer. + Terminated when the episode is done or the episode length is larger than max_ep_len + or cost is unequal to 0.""" + obs, _ = self.env.reset() + ep_ret, ep_costs, ep_len = 0.0, 0.0, 0 + for step_i in range(self.local_steps_per_epoch): + action, value, cost_value, logp = agent.step(torch.as_tensor(obs, dtype=torch.float32)) + next_obs, reward, cost, done, truncated, _ = self.step(action) + ep_ret += reward + ep_costs += (self.cost_gamma**ep_len) * cost + ep_len += 1 + + # Save and log + # Notes: + # - raw observations are stored to buffer (later transformed) + # - reward scaling is performed in buffer + buf.store( + obs=obs, + act=action, + rew=reward, + val=value, + logp=logp, + cost=cost, + cost_val=cost_value, + ) + + # Store values for statistic purpose + if self.use_cost: + logger.store(**{'Values/V': value, 'Values/C': cost_value}) + else: + logger.store(**{'Values/V': value}) + + # Update observation + obs = next_obs + + timeout = ep_len == self.max_ep_len + terminal = done or timeout or truncated or cost + epoch_ended = step_i == self.local_steps_per_epoch - 1 + + if terminal or epoch_ended: + if timeout or epoch_ended: + _, value, cost_value, _ = agent(torch.as_tensor(obs, dtype=torch.float32)) + else: + value, cost_value = 0.0, 0.0 + + # Automatically compute GAE in buffer + buf.finish_path(value, cost_value, penalty_param=float(self.penalty_param)) + + # Only save EpRet / EpLen if trajectory finished + if terminal: + logger.store( + **{ + 'Metrics/EpRet': ep_ret, + 'Metrics/EpLen': ep_len, + 'Metrics/EpCost': ep_costs, + } + ) + ep_ret, ep_costs, ep_len = 0.0, 0.0, 0 + obs, _ = self.env.reset() diff --git a/omnisafe/wrappers/env_wrapper.py b/omnisafe/wrappers/env_wrapper.py index 8282f04e3..957a61e51 100644 --- a/omnisafe/wrappers/env_wrapper.py +++ b/omnisafe/wrappers/env_wrapper.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # 
limitations under the License. # ============================================================================== -"""env_wrapper""" +"""Environment wrapper.""" import safety_gymnasium import torch @@ -20,7 +20,7 @@ # pylint: disable-next=too-many-instance-attributes class EnvWrapper: - """env_wrapper""" + """Environment wrapper.""" def __init__(self, env_id, render_mode=None): # check env_id is str diff --git a/omnisafe/wrappers/off_policy_wrapper.py b/omnisafe/wrappers/off_policy_wrapper.py index ee42c0f54..d3a18c810 100644 --- a/omnisafe/wrappers/off_policy_wrapper.py +++ b/omnisafe/wrappers/off_policy_wrapper.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""env_wrapper""" +"""Environment wrapper for off-policy algorithms.""" import safety_gymnasium import torch @@ -23,7 +23,7 @@ # pylint: disable=too-many-instance-attributes @WRAPPER_REGISTRY.register class OffPolicyEnvWrapper: - """OffPolicyEnvWrapper""" + """OffPolicyEnvWrapperr""" def __init__( self, @@ -53,7 +53,6 @@ def __init__( # self.deterministic = False self.local_steps_per_epoch = None self.cost_gamma = None - self.use_cost = None self.penalty_param = None def make(self): @@ -130,7 +129,7 @@ def roll_out( **{ 'Metrics/EpRet': ep_ret, 'Metrics/EpLen': ep_len, - 'Metrics/EpCosts': ep_cost, + 'Metrics/EpCost': ep_cost, } ) self.curr_o, _ = self.env.reset(seed=self.seed) @@ -142,7 +141,7 @@ def roll_out( **{ 'Test/EpRet': ep_ret, 'Test/EpLen': ep_len, - 'Test/EpCosts': ep_cost, + 'Test/EpCost': ep_cost, } ) self.curr_o, _ = self.env.reset(seed=self.seed) diff --git a/omnisafe/wrappers/on_policy_wrapper.py b/omnisafe/wrappers/on_policy_wrapper.py index 8284c8d2c..30df2ab9d 100644 --- a/omnisafe/wrappers/on_policy_wrapper.py +++ b/omnisafe/wrappers/on_policy_wrapper.py @@ -12,7 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""env_wrapper""" +"""Environment wrapper for on-policy algorithms.""" + +import collections +from copy import deepcopy +from typing import Optional import safety_gymnasium import torch @@ -22,11 +26,18 @@ @WRAPPER_REGISTRY.register class OnPolicyEnvWrapper: # pylint: disable=too-many-instance-attributes - """env_wrapper""" + """env_wrapper.""" + + def __init__(self, env_id, cfgs: Optional[collections.namedtuple] = None, render_mode=None): + """Initialize environment wrapper. - def __init__(self, env_id, render_mode=None): - # check env_id is str + Args: + env_id (str): environment id. + cfgs (collections.namedtuple): configs. + render_mode (str): render mode. + """ self.env = safety_gymnasium.make(env_id, render_mode=render_mode) + self.cfgs = deepcopy(cfgs) self.env_id = env_id self.render_mode = render_mode self.metadata = self.env.metadata diff --git a/omnisafe/wrappers/saute_wrapper.py b/omnisafe/wrappers/saute_wrapper.py new file mode 100644 index 000000000..17bbba46e --- /dev/null +++ b/omnisafe/wrappers/saute_wrapper.py @@ -0,0 +1,210 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Environment wrapper for saute algorithms.""" + +import numpy as np +import torch +from gymnasium import spaces + +from omnisafe.wrappers.on_policy_wrapper import OnPolicyEnvWrapper +from omnisafe.wrappers.wrapper_registry import WRAPPER_REGISTRY + + +@WRAPPER_REGISTRY.register +class SauteEnvWrapper(OnPolicyEnvWrapper): + """SauteEnvWrapper.""" + + def __init__( + self, + env_id, + cfgs, + render_mode=None, + ) -> None: + """Initialize SauteEnvWrapper. + + Args: + env_id (str): environment id. + cfgs (dict): configuration dictionary. + render_mode (str): render mode. + + """ + super().__init__(env_id, render_mode) + + self.unsafe_reward = cfgs.unsafe_reward + self.saute_gamma = cfgs.saute_gamma + if cfgs.scale_safety_budget: + self.safety_budget = ( + cfgs.safety_budget + * (1 - self.saute_gamma**self.max_ep_len) + / (1 - self.saute_gamma) + / np.float32(self.max_ep_len) + ) + else: + self.safety_budget = cfgs.safety_budget + self.safety_obs = 1.0 + high = np.array(np.hstack([self.env.observation_space.high, np.inf]), dtype=np.float32) + low = np.array(np.hstack([self.env.observation_space.low, np.inf]), dtype=np.float32) + self.observation_space = spaces.Box(high=high, low=low) + + def augment_obs(self, obs: np.array, safety_obs: np.array): + """Augmenting the obs with the safety obs. + + Args: + obs (np.array): observation. + safety_obs (np.array): safety observation. + + Returns: + augmented_obs (np.array): augmented observation. + """ + augmented_obs = np.hstack([obs, safety_obs]) + return augmented_obs + + def safety_step(self, cost: np.ndarray) -> np.ndarray: + """Update the normalized safety obs. + + Args: + cost (np.array): cost. + + Returns: + safety_obs (np.array): normalized safety observation. + """ + self.safety_obs -= cost / self.safety_budget + self.safety_obs /= self.saute_gamma + return self.safety_obs + + def safety_reward(self, reward: np.ndarray, next_safety_obs: np.ndarray) -> np.ndarray: + """Update the reward. + + Args: + reward (np.array): reward. + next_safety_obs (np.array): next safety observation. + + Returns: + reward (np.array): updated reward. + """ + reward = reward * (next_safety_obs > 0) + self.unsafe_reward * (next_safety_obs <= 0) + return reward + + def reset(self, seed=None): + """Reset environment. + + Args: + seed (int): seed for environment reset. + + Returns: + self.curr_o (np.array): current observation. + info (dict): environment info. + """ + self.curr_o, info = self.env.reset(seed=seed) + self.safety_obs = 1.0 + self.curr_o = self.augment_obs(self.curr_o, self.safety_obs) + return self.curr_o, info + + def step(self, action): + """Step environment. + + Args: + action (np.array): action. + + Returns: + augmented_obs (np.array): augmented observation. + reward (np.array): reward. + cost (np.array): cost. + terminated (bool): whether the episode is terminated. + truncated (bool): whether the episode is truncated. + info (dict): environment info. 
+ """ + next_obs, reward, cost, terminated, truncated, info = self.env.step(action) + next_safety_obs = self.safety_step(cost) + info['true_reward'] = reward + info['safety_obs'] = next_safety_obs + reward = self.safety_reward(reward, next_safety_obs) + augmented_obs = self.augment_obs(next_obs, next_safety_obs) + + return augmented_obs, reward, cost, terminated, truncated, info + + # pylint: disable-next=too-many-locals + def roll_out(self, agent, buf, logger): + """Collect data and store to experience buffer. + + Args: + agent (Agent): agent. + buf (Buffer): buffer. + logger (Logger): logger. + + Returns: + ep_ret (float): episode return. + ep_costs (float): episode costs. + ep_len (int): episode length. + ep_budget (float): episode budget. + """ + obs, _ = self.reset() + ep_ret, ep_costs, ep_len, ep_budget = 0.0, 0.0, 0, 0.0 + for step_i in range(self.local_steps_per_epoch): + action, value, cost_value, logp = agent.step(torch.as_tensor(obs, dtype=torch.float32)) + next_obs, reward, cost, done, truncated, info = self.step(action) + ep_ret += info['true_reward'] + ep_costs += (self.cost_gamma**ep_len) * cost + ep_len += 1 + ep_budget += self.safety_obs + + # Save and log + # Notes: + # - raw observations are stored to buffer (later transformed) + # - reward scaling is performed in buffer + buf.store( + obs=obs, + act=action, + rew=reward, + val=value, + logp=logp, + cost=cost, + cost_val=cost_value, + ) + + # Store values for statistic purpose + if self.use_cost: + logger.store(**{'Values/V': value, 'Values/C': cost_value}) + else: + logger.store(**{'Values/V': value}) + + # Update observation + obs = next_obs + + timeout = ep_len == self.max_ep_len + terminal = done or timeout or truncated + epoch_ended = step_i == self.local_steps_per_epoch - 1 + + if terminal or epoch_ended: + if timeout or epoch_ended: + _, value, cost_value, _ = agent(torch.as_tensor(obs, dtype=torch.float32)) + else: + value, cost_value = 0.0, 0.0 + + # Automatically compute GAE in buffer + buf.finish_path(value, cost_value, penalty_param=float(self.penalty_param)) + + # Only save EpRet / EpLen if trajectory finished + if terminal: + logger.store( + **{ + 'Metrics/EpRet': ep_ret, + 'Metrics/EpLen': ep_len, + 'Metrics/EpCost': ep_costs, + 'Metrics/EpBudget': ep_budget, + } + ) + ep_ret, ep_costs, ep_len, ep_budget = 0.0, 0.0, 0, 0.0 + obs, _ = self.reset() diff --git a/omnisafe/wrappers/simmer_wrapper.py b/omnisafe/wrappers/simmer_wrapper.py new file mode 100644 index 000000000..8d1f2ca46 --- /dev/null +++ b/omnisafe/wrappers/simmer_wrapper.py @@ -0,0 +1,510 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY pid_kiND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Environment wrapper for Simmer algorithm.""" + +import copy + +import numpy as np +import torch +from gymnasium import spaces + +from omnisafe.wrappers.on_policy_wrapper import OnPolicyEnvWrapper +from omnisafe.wrappers.wrapper_registry import WRAPPER_REGISTRY + + +class PidController: # pylint: disable=too-many-instance-attributes + """Using PID controller to control the safety budget in Simmer environment.""" + + def __init__( + self, + cfgs, + safety_budget: float = 25.0, + lower_budget: float = 1.0, + upper_budget: float = 25.0, + ) -> None: + """Initialize the PID controller. + + Args: + cfgs (CfgNode): Configurations. + safety_budget (float): The initial safety budget. + lower_budget (float): The lower bound of safety budget. + upper_budget (float): The upper bound of safety budget. + """ + # PID parameters. + self.pid_kp = cfgs.pid_kp + self.pid_ki = cfgs.pid_ki + self.pid_kd = cfgs.pid_kd + + # Low pass filter. + self.tau = cfgs.tau + + # Initialize the PID controller. + self.error = 0.0 + self.error_i = 0.0 + self.prev_action = 0 + self.prev_raw_action = 0 + self.step_size = cfgs.step_size + + # Set the initial safety budget. + self.safety_budget = safety_budget + self.lower_budget = lower_budget + self.upper_budget = upper_budget + + def compute_raw_action(self, obs: float): + """Compute the raw action based on current obs. + + Args: + obs (float): The current observation. + + Returns: + float: The raw action. + """ + + # Low pass filter. + error_p = self.tau * self.error + (1 - self.tau) * (self.safety_budget - obs) + self.error_i += self.error + error_d = self.pid_kd * (self.prev_action - self.prev_raw_action) + + # Compute PID error. + curr_raw_action = self.pid_kp * error_p + self.pid_ki * self.error_i + self.pid_kd * error_d + return curr_raw_action + + def act(self, obs: float): + """Compute the safety budget based on the observation ``Jc``. + + Args: + obs (float): The current observation. + + Returns: + float: The safety budget. + """ + curr_raw_action = self.compute_raw_action(obs) + + # Clip the raw action. + curr_action = np.clip(curr_raw_action, -self.step_size, self.step_size) + self.prev_action = curr_action + self.prev_raw_action = curr_raw_action + raw_budget = self.safety_budget + curr_action + + # Clip the safety budget. + self.safety_budget = np.clip(raw_budget, self.lower_budget, self.upper_budget) + + return self.safety_budget + + +class QController: # pylint: disable=too-many-instance-attributes + """Using Q-learning to control the safety budget in Simmer environment.""" + + def __init__( + self, + cfgs, + safety_budget: float = 25.0, + lower_budget: float = 1.0, + upper_budget: float = 25.0, + ) -> None: + """ " + Initialize the Q-learning controller. + + Args: + cfgs (CfgNode): The config file. + safety_budget (float): The initial safety budget. + lower_budget (float): The lower bound of the safety budget. + upper_budget (float): The upper bound of the safety budget. + """ + + # Set the initial safety budget. + self.lower_budget = lower_budget + self.upper_budget = upper_budget + + # Initialize the Q-learning controller. 
+ self.state_dim = cfgs.state_dim + self.act_dim = cfgs.act_dim + self.q_function = np.zeros((cfgs.state_dim, cfgs.act_dim)) + self.state_space = np.linspace(self.lower_budget, self.upper_budget, cfgs.state_dim) + self.action_space = np.linspace(-1, 1, cfgs.act_dim, dtype=int) + self.state = safety_budget + self.init_idx = np.argwhere(self.state_space == self.state) + self.action = 0 + self.step(self.action) + + # Set the Q-learning parameters. + self.tau = cfgs.tau + self.threshold = cfgs.threshold + self.q_lr = cfgs.q_lr + + # Use epsilon greedy to explore the environment. + self.epsilon = cfgs.epsilon + + # Initialize the observation (Cost value per epoch) buffer. + self.prev_obs = copy.copy(self.state) + self.filtered_obs_buffer = [] + self.filtered_obs = 0 + + def get_state_idx(self, state: float): + """Get the state index. + + Args: + state (float): The current state. + + Returns: + int: The state index. + """ + state_idx = np.argwhere(self.state_space == state)[0][0] + return state_idx + + def get_action_idx(self, action: float): + """Get the action index. + + Args: + action (float): The current action. + + Returns: + int: The action index. + """ + action_idx = np.argwhere(self.action_space == action) + return action_idx + + def get_random_action(self): + """Get the random action. + + Returns: + float: The random action. + """ + action_idx = np.random.randint(0, self.act_dim) + return self.action_space[action_idx] + + def get_greedy_action(self, state: float): + """Get the greedy action. + + Args: + state (float): The current state(``cost_limit``). + + Returns: + float: The greedy action. + """ + state_idx = self.get_state_idx(state) + action_idx = np.argmax(self.q_function[state_idx, :]) + action = self.action_space[action_idx] + return action + + def update_q_function(self, state: float, action: float, reward: float, next_state: float): + """Update the Q function using the Bellman equation. + + Args: + state (float): The current state. + action (float): The current action. + reward (float): The reward. + next_state (float): The next state. + """ + state_idx = self.get_state_idx(state) + action_idx = self.get_action_idx(action) + next_state_idx = self.get_state_idx(next_state) + self.q_function[state_idx, action_idx] = (1 - self.q_lr) * self.q_function[ + state_idx, action_idx + ] + self.q_lr * (reward + self.tau * np.max(self.q_function[next_state_idx, :])) + + def step(self, action: float): + """Step the environment. + + Args: + action (float): The current action. + """ + state_idx = self.get_state_idx(self.state) + state_idx = np.clip(state_idx + action, 0, self.state_dim - 1, dtype=int) + self.state = self.state_space[state_idx] + return self.state + + def reward(self, state: float, action: float, obs: float): + """Get the reward function based on whether the observation is within the threshold. + + Args: + state (float): The current state. + action (float): The current action. + obs (float): The observation. + + Returns: + float: The reward. + """ + action_idx = self.get_action_idx(action) + if int(self.threshold > obs - state and obs - state > -self.threshold): + reward = np.array([-1, 1, 0.5])[action_idx] + elif int(obs - state <= -self.threshold): + reward = np.array([-1, 0, 2])[action_idx] + elif int(obs - state >= self.threshold): + reward = np.array([2, -1, -1])[action_idx] + return reward[0] + + def act(self, obs: float): + """Return the safety budget based on the observation. + + Args: + obs (float): The observation. + + Returns: + float: The safety budget. 
+ """ + prev_obs = self.filtered_obs + self.filtered_obs = self.tau * prev_obs + (1 - self.tau) * obs + self.filtered_obs_buffer.append(self.filtered_obs) + state = self.state + + # Use epsilon greedy to explore the environment + epsilon = np.random.random() + if epsilon > self.epsilon: + action = self.get_random_action() + else: + action = self.get_greedy_action(state) + reward = self.reward(state, action, self.filtered_obs) + next_state = self.step(action) + safety_budget = next_state + + # Update the Q function + self.update_q_function(state, action, reward, next_state) + return safety_budget + + +@WRAPPER_REGISTRY.register +class SimmerEnvWrapper(OnPolicyEnvWrapper): # pylint: disable=too-many-instance-attributes + """Wrapper for the Simmer environment.""" + + def __init__( + self, + env_id, + cfgs, + render_mode=None, + ) -> None: + """Initialize the Simmer environment wrapper. + + Args: + env_id (str): The environment id. + cfgs (Config): The configuration. + render_mode (str): The render mode. + """ + super().__init__(env_id, render_mode) + + self.unsafe_reward = cfgs.unsafe_reward + self.simmer_gamma = cfgs.simmer_gamma + if cfgs.scale_safety_budget: + self.safety_budget = ( + cfgs.lower_budget + * (1 - self.simmer_gamma**self.max_ep_len) + / (1 - self.simmer_gamma) + / np.float32(self.max_ep_len) + ) + self.lower_budget = ( + cfgs.lower_budget + * (1 - self.simmer_gamma**self.max_ep_len) + / (1 - self.simmer_gamma) + / np.float32(self.max_ep_len) + ) + self.upper_budget = ( + cfgs.upper_budget + * (1 - self.simmer_gamma**self.max_ep_len) + / (1 - self.simmer_gamma) + / np.float32(self.max_ep_len) + ) + else: + self.safety_budget = cfgs.lower_budget + self.lower_budget = cfgs.lower_budget + self.upper_budget = cfgs.upper_budget + self.rel_safety_budget = self.safety_budget / self.upper_budget + self.safety_obs = self.rel_safety_budget + high = np.array(np.hstack([self.env.observation_space.high, np.inf]), dtype=np.float32) + low = np.array(np.hstack([self.env.observation_space.low, np.inf]), dtype=np.float32) + self.observation_space = spaces.Box(high=high, low=low) + if cfgs.simmer_controller == 'PID': + self.controller = PidController( + cfgs.controller_cfgs, + safety_budget=self.safety_budget, + lower_budget=self.lower_budget, + upper_budget=self.upper_budget, + ) + elif cfgs.simmer_controller == 'Q': + self.controller = QController( + cfgs.controller_cfgs, + safety_budget=self.safety_budget, + lower_budget=self.lower_budget, + upper_budget=self.upper_budget, + ) + else: + raise NotImplementedError( + f'Controller type {cfgs.simmer_controller} is not implemented.' + ) + + def augment_obs(self, obs: np.array, safety_obs: np.array): + """Augmenting the obs with the safety obs, if needed. + + Args: + obs (np.array): The observation. + safety_obs (np.array): The safety observation. + + Returns: + np.array: The augmented observation. + """ + augmented_obs = np.hstack([obs, safety_obs]) + return augmented_obs + + def safety_step(self, cost: np.ndarray) -> np.ndarray: + """Update the normalized safety obs. + + Args: + cost (np.ndarray): The cost. + + Returns: + np.ndarray: The normalized safety obs. + """ + self.safety_obs -= cost / self.upper_budget + self.safety_obs /= self.simmer_gamma + return self.safety_obs + + def safety_reward(self, reward: np.ndarray, next_safety_obs: np.ndarray) -> np.ndarray: + """Update the reward based on the safety obs. + + Args: + reward (np.ndarray): The reward. + next_safety_obs (np.ndarray): The next safety obs. 
+ + Returns: + np.ndarray: The updated reward. + """ + reward = reward * (next_safety_obs > 0) + self.unsafe_reward * (next_safety_obs <= 0) + return reward + + def reset(self, seed=None): + """Reset environment. + + Args: + seed (int): The seed. + + Returns: + np.array: The augmented observation. + dict: The info. + """ + self.curr_o, info = self.env.reset(seed=seed) + self.rel_safety_budget = self.safety_budget / self.upper_budget + self.safety_obs = self.rel_safety_budget + self.curr_o = self.augment_obs(self.curr_o, self.safety_obs) + return self.curr_o, info + + def step(self, action): + """Step environment. + + Args: + action (np.array): The action. + + Returns: + np.array: The augmented observation. + np.array: The reward. + np.array: The cost. + bool: The terminated flag. + bool: The truncated flag. + dict: The info. + """ + next_obs, reward, cost, terminated, truncated, info = self.env.step(action) + next_safety_obs = self.safety_step(cost) + info['true_reward'] = reward + info['safety_obs'] = next_safety_obs + reward = self.safety_reward(reward, next_safety_obs) + augmented_obs = self.augment_obs(next_obs, next_safety_obs) + + return augmented_obs, reward, cost, terminated, truncated, info + + def set_budget(self, Jc): + """Set the safety budget. + + Args: + Jc (np.array): The safety budget. + + Returns: + np.array: The safety budget. + """ + self.safety_budget = self.controller.act(Jc) + + # pylint: disable-next=too-many-locals + def roll_out(self, agent, buf, logger): + """Collect data and store to experience buffer. + + Args: + agent (Agent): The agent. + buf (Buffer): The buffer. + logger (Logger): The logger. + + Returns: + float: The episode return. + float: The episode cost. + int: The episode length. + float: The episode budget. + """ + obs, _ = self.reset() + ep_ret, ep_costs, ep_len, ep_budget = 0.0, 0.0, 0, 0.0 + for step_i in range(self.local_steps_per_epoch): + action, value, cost_value, logp = agent.step(torch.as_tensor(obs, dtype=torch.float32)) + next_obs, reward, cost, done, truncated, info = self.step(action) + ep_ret += info['true_reward'] + ep_costs += (self.cost_gamma**ep_len) * cost + ep_len += 1 + ep_budget += self.safety_obs + + # Save and log + # Notes: + # - raw observations are stored to buffer (later transformed) + # - reward scaling is performed in buffer + buf.store( + obs=obs, + act=action, + rew=reward, + val=value, + logp=logp, + cost=cost, + cost_val=cost_value, + ) + + # Store values for statistic purpose + if self.use_cost: + logger.store(**{'Values/V': value, 'Values/C': cost_value}) + else: + logger.store(**{'Values/V': value}) + + # Update observation + obs = next_obs + + timeout = ep_len == self.max_ep_len + terminal = done or timeout or truncated + epoch_ended = step_i == self.local_steps_per_epoch - 1 + + if terminal or epoch_ended: + if timeout or epoch_ended: + _, value, cost_value, _ = agent(torch.as_tensor(obs, dtype=torch.float32)) + else: + value, cost_value = 0.0, 0.0 + + # Automatically compute GAE in buffer + buf.finish_path(value, cost_value, penalty_param=float(self.penalty_param)) + + # Only save EpRet / EpLen if trajectory finished + if terminal: + logger.store( + **{ + 'Metrics/EpRet': ep_ret, + 'Metrics/EpLen': ep_len, + 'Metrics/EpCost': ep_costs, + 'Metrics/EpBudget': ep_budget, + 'Metrics/SafetyBudget': self.safety_budget, + } + ) + ep_ret, ep_costs, ep_len, ep_budget = 0.0, 0.0, 0, 0.0 + obs, _ = self.reset() + # Update safety budget after each epoch. 
+ self.set_budget(logger.get_stats('Metrics/EpCost')[0]) diff --git a/tests/test_model.py b/tests/test_model.py index 368540424..0f0954376 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -22,7 +22,7 @@ import helpers from omnisafe.models import ActorBuilder, CriticBuilder from omnisafe.models.actor_critic import ActorCritic -from omnisafe.utils.config_utils import create_namedtuple_from_dict +from omnisafe.utils.config_utils import dict2namedtuple @helpers.parametrize( @@ -196,12 +196,13 @@ def test_actor_critic( } observation_space = Box(low=-1, high=1, shape=(obs_dim,)) - model_cfgs = { - 'ac_kwargs': ac_kwargs, - 'weight_initialization_mode': weight_initialization_mode, - 'shared_weights': shared_weights, - } - model_cfgs = create_namedtuple_from_dict(model_cfgs) + model_cfgs = dict2namedtuple( + { + 'ac_kwargs': ac_kwargs, + 'weight_initialization_mode': weight_initialization_mode, + 'shared_weights': shared_weights, + } + ) if space_type == Discrete: action_space = space_type(act_dim) diff --git a/tests/test_policy.py b/tests/test_policy.py index 1d27864fb..7cfe11aa3 100644 --- a/tests/test_policy.py +++ b/tests/test_policy.py @@ -18,26 +18,19 @@ import omnisafe -@helpers.parametrize( - algo=[ - 'PolicyGradient', - 'PPO', - 'PPOLag', - 'NaturalPG', - 'TRPO', - 'TRPOLag', - 'PDO', - 'NPGLag', - 'CPO', - 'PCPO', - 'FOCOPS', - 'CPPOPid', - 'CUP', - ] -) -def test_on_policy(algo): +@helpers.parametrize(on_policy_algo=omnisafe.ALGORITHMS['on-policy']) +def test_on_policy(on_policy_algo): """Test algorithms""" env_id = 'SafetyPointGoal1-v0' custom_cfgs = {'epochs': 1, 'steps_per_epoch': 2000, 'pi_iters': 1, 'critic_iters': 1} - agent = omnisafe.Agent(algo, env_id, custom_cfgs=custom_cfgs, parallel=1) + agent = omnisafe.Agent(on_policy_algo, env_id, custom_cfgs=custom_cfgs, parallel=1) + agent.learn() + + +@helpers.parametrize(off_policy_algo=omnisafe.ALGORITHMS['off-policy']) +def test_off_policy(off_policy_algo): + """Test algorithms""" + env_id = 'SafetyPointGoal1-v0' + custom_cfgs = {'epochs': 1, 'steps_per_epoch': 2000, 'pi_iters': 1, 'critic_iters': 1} + agent = omnisafe.Agent(off_policy_algo, env_id, custom_cfgs=custom_cfgs, parallel=1) agent.learn()
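Beyond the parametrized tests above, the same entry point trains any single registered algorithm from this patch. A short usage sketch; the algorithm name, environment id, and config overrides are examples mirroring the test values:

import omnisafe

# Any name registered in omnisafe.ALGORITHMS works here, e.g. the new 'TRPOPid'.
custom_cfgs = {'epochs': 1, 'steps_per_epoch': 2000}
agent = omnisafe.Agent('TRPOPid', 'SafetyPointGoal1-v0', custom_cfgs=custom_cfgs, parallel=1)
agent.learn()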