From c2fcffa684041e164ffd1c6cfc6d624bdc178e1e Mon Sep 17 00:00:00 2001 From: Ruiyang Sun Date: Fri, 24 Feb 2023 01:58:00 +0800 Subject: [PATCH] refactor: change architecture of omnisafe (#121) --- .github/workflows/ci.yml | 7 +- .pylintrc | 14 +- docs/source/spelling_wordlist.txt | 9 + omnisafe/__init__.py | 4 +- .../model_based => adapter}/__init__.py | 16 +- omnisafe/adapter/early_terminated_adapter.py | 49 + omnisafe/adapter/online_adapter.py | 125 +++ omnisafe/adapter/onpolicy_adapter.py | 136 +++ omnisafe/adapter/saute_adapter.py | 127 +++ omnisafe/adapter/simmer_adapter.py | 62 ++ omnisafe/algorithms/__init__.py | 26 +- omnisafe/algorithms/algo_wrapper.py | 61 +- omnisafe/algorithms/base_algo.py | 67 ++ omnisafe/algorithms/model_based/cap.py | 140 --- omnisafe/algorithms/model_based/mbppo_lag.py | 445 --------- .../model_based/models/dynamic_model.py | 405 -------- .../model_based/models/virtual_env.py | 249 ----- omnisafe/algorithms/model_based/planner.py | 919 ------------------ .../algorithms/model_based/policy_gradient.py | 304 ------ omnisafe/algorithms/model_based/safeloop.py | 311 ------ omnisafe/algorithms/off_policy/__init__.py | 44 - omnisafe/algorithms/off_policy/cvpo.py | 189 ---- omnisafe/algorithms/off_policy/ddpg_lag.py | 103 -- omnisafe/algorithms/on_policy/__init__.py | 19 +- .../algorithms/on_policy/base/natural_pg.py | 174 +--- .../on_policy/base/policy_gradient.py | 861 +++++----------- omnisafe/algorithms/on_policy/base/ppo.py | 51 +- omnisafe/algorithms/on_policy/base/trpo.py | 179 ++-- .../early_terminated/ppo_early_terminated.py | 16 +- .../ppo_lag_early_terminated.py | 15 +- .../algorithms/on_policy/first_order/cup.py | 235 ++--- .../on_policy/first_order/focops.py | 257 ++--- .../on_policy/naive_lagrange/crpo.py | 72 +- .../on_policy/naive_lagrange/pdo.py | 81 +- .../on_policy/naive_lagrange/ppo_lag.py | 84 +- .../on_policy/naive_lagrange/rcpo.py | 83 +- .../on_policy/naive_lagrange/trpo_lag.py | 83 +- .../on_policy/penalty_function/ipo.py | 42 +- .../on_policy/penalty_function/p3o.py | 97 +- .../on_policy/pid_lagrange/cppo_pid.py | 161 +-- .../on_policy/pid_lagrange/trpo_pid.py | 150 +-- .../on_policy/saute/ppo_lag_saute.py | 31 +- .../algorithms/on_policy/saute/ppo_saute.py | 30 +- .../algorithms/on_policy/second_order/cpo.py | 400 ++++---- .../algorithms/on_policy/second_order/pcpo.py | 330 ++----- .../algorithms/on_policy/simmer/__init__.py | 28 - .../on_policy/simmer/ppo_lag_simmer_pid.py | 48 - .../on_policy/simmer/ppo_lag_simmer_q.py | 48 - .../on_policy/simmer/ppo_simmer_pid.py | 47 - .../on_policy/simmer/ppo_simmer_q.py | 47 - omnisafe/common/buffer/onpolicy_buffer.py | 93 +- .../common/buffer/vector_onpolicy_buffer.py | 6 +- omnisafe/common/experiment_grid.py | 13 +- omnisafe/common/lagrange.py | 4 +- omnisafe/common/logger.py | 73 +- omnisafe/common/normalizer.py | 118 ++- omnisafe/common/pid_lagrange.py | 11 +- omnisafe/common/record_queue.py | 62 -- omnisafe/configs/on-policy/CPO.yaml | 58 +- omnisafe/configs/on-policy/CPPOPid.yaml | 62 +- omnisafe/configs/on-policy/CUP.yaml | 58 +- omnisafe/configs/on-policy/FOCOPS.yaml | 58 +- omnisafe/configs/on-policy/IPO.yaml | 58 +- omnisafe/configs/on-policy/NaturalPG.yaml | 58 +- omnisafe/configs/on-policy/OnCRPO.yaml | 58 +- omnisafe/configs/on-policy/P3O.yaml | 58 +- omnisafe/configs/on-policy/PCPO.yaml | 58 +- omnisafe/configs/on-policy/PDO.yaml | 63 +- omnisafe/configs/on-policy/PPO.yaml | 60 +- .../configs/on-policy/PPOEarlyTerminated.yaml | 60 +- omnisafe/configs/on-policy/PPOLag.yaml | 58 +- 
.../on-policy/PPOLagEarlyTerminated.yaml | 60 +- omnisafe/configs/on-policy/PPOLagSaute.yaml | 60 +- omnisafe/configs/on-policy/PPOSaute.yaml | 60 +- .../configs/on-policy/PolicyGradient.yaml | 64 +- omnisafe/configs/on-policy/RCPO.yaml | 58 +- omnisafe/configs/on-policy/TRPO.yaml | 58 +- omnisafe/configs/on-policy/TRPOLag.yaml | 58 +- omnisafe/configs/on-policy/TRPOPid.yaml | 60 +- .../model_based/models => envs}/__init__.py | 6 +- omnisafe/envs/core.py | 336 +++++++ omnisafe/envs/safety_gymnasium_env.py | 117 +++ omnisafe/envs/wrapper.py | 288 ++++++ omnisafe/evaluator.py | 318 ------ omnisafe/models/__init__.py | 11 +- omnisafe/models/actor/__init__.py | 5 +- omnisafe/models/actor/actor_builder.py | 141 +-- omnisafe/models/actor/categorical_actor.py | 134 --- omnisafe/models/actor/cholesky_actor.py | 160 --- omnisafe/models/actor/gaussian_actor.py | 227 +---- .../models/actor/gaussian_learning_actor.py | 87 ++ omnisafe/models/actor/gaussian_sac_actor.py | 76 ++ .../models/actor/gaussian_stdnet_actor.py | 166 ---- omnisafe/models/actor_critic.py | 164 ---- omnisafe/models/actor_critic/actor_critic.py | 154 +++ .../actor_critic/constraint_actor_critic.py | 117 +++ omnisafe/models/actor_q_critic.py | 178 ---- omnisafe/models/base.py | 157 +-- omnisafe/models/constraint_actor_critic.py | 113 --- omnisafe/models/constraint_actor_q_critic.py | 109 --- omnisafe/models/critic/critic_builder.py | 75 +- omnisafe/models/critic/q_critic.py | 83 +- omnisafe/models/critic/v_critic.py | 62 +- omnisafe/typing.py | 2 + omnisafe/utils/config.py | 36 +- omnisafe/utils/core.py | 65 -- .../{distributed_utils.py => distributed.py} | 186 ++-- omnisafe/utils/exp_grid_tools.py | 3 +- omnisafe/utils/{algo_utils.py => math.py} | 124 ++- omnisafe/utils/{model_utils.py => model.py} | 52 +- omnisafe/utils/online_mean_std.py | 116 --- omnisafe/utils/schedule.py | 93 ++ omnisafe/utils/tools.py | 124 +-- omnisafe/utils/vtrace.py | 80 -- omnisafe/wrappers/__init__.py | 44 - omnisafe/wrappers/early_terminated_wrapper.py | 77 -- omnisafe/wrappers/model_based_wrapper.py | 455 --------- omnisafe/wrappers/saute_wrapper.py | 282 ------ omnisafe/wrappers/simmer_wrapper.py | 688 ------------- omnisafe/wrappers/wrapper_registry.py | 72 -- pyproject.toml | 1 + tests/test_model.py | 2 +- tests/test_utils.py | 124 --- 123 files changed, 4361 insertions(+), 10655 deletions(-) rename omnisafe/{algorithms/model_based => adapter}/__init__.py (69%) create mode 100644 omnisafe/adapter/early_terminated_adapter.py create mode 100644 omnisafe/adapter/online_adapter.py create mode 100644 omnisafe/adapter/onpolicy_adapter.py create mode 100644 omnisafe/adapter/saute_adapter.py create mode 100644 omnisafe/adapter/simmer_adapter.py create mode 100644 omnisafe/algorithms/base_algo.py delete mode 100644 omnisafe/algorithms/model_based/cap.py delete mode 100644 omnisafe/algorithms/model_based/mbppo_lag.py delete mode 100644 omnisafe/algorithms/model_based/models/dynamic_model.py delete mode 100644 omnisafe/algorithms/model_based/models/virtual_env.py delete mode 100644 omnisafe/algorithms/model_based/planner.py delete mode 100644 omnisafe/algorithms/model_based/policy_gradient.py delete mode 100644 omnisafe/algorithms/model_based/safeloop.py delete mode 100644 omnisafe/algorithms/off_policy/__init__.py delete mode 100644 omnisafe/algorithms/off_policy/cvpo.py delete mode 100644 omnisafe/algorithms/off_policy/ddpg_lag.py delete mode 100644 omnisafe/algorithms/on_policy/simmer/__init__.py delete mode 100644 
omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_pid.py delete mode 100644 omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_q.py delete mode 100644 omnisafe/algorithms/on_policy/simmer/ppo_simmer_pid.py delete mode 100644 omnisafe/algorithms/on_policy/simmer/ppo_simmer_q.py delete mode 100644 omnisafe/common/record_queue.py rename omnisafe/{algorithms/model_based/models => envs}/__init__.py (77%) create mode 100644 omnisafe/envs/core.py create mode 100644 omnisafe/envs/safety_gymnasium_env.py create mode 100644 omnisafe/envs/wrapper.py delete mode 100644 omnisafe/evaluator.py delete mode 100644 omnisafe/models/actor/categorical_actor.py delete mode 100644 omnisafe/models/actor/cholesky_actor.py create mode 100644 omnisafe/models/actor/gaussian_learning_actor.py create mode 100644 omnisafe/models/actor/gaussian_sac_actor.py delete mode 100644 omnisafe/models/actor/gaussian_stdnet_actor.py delete mode 100644 omnisafe/models/actor_critic.py create mode 100644 omnisafe/models/actor_critic/actor_critic.py create mode 100644 omnisafe/models/actor_critic/constraint_actor_critic.py delete mode 100644 omnisafe/models/actor_q_critic.py delete mode 100644 omnisafe/models/constraint_actor_critic.py delete mode 100644 omnisafe/models/constraint_actor_q_critic.py delete mode 100644 omnisafe/utils/core.py rename omnisafe/utils/{distributed_utils.py => distributed.py} (64%) rename omnisafe/utils/{algo_utils.py => math.py} (50%) rename omnisafe/utils/{model_utils.py => model.py} (67%) delete mode 100644 omnisafe/utils/online_mean_std.py create mode 100644 omnisafe/utils/schedule.py delete mode 100644 omnisafe/utils/vtrace.py delete mode 100644 omnisafe/wrappers/__init__.py delete mode 100644 omnisafe/wrappers/early_terminated_wrapper.py delete mode 100644 omnisafe/wrappers/model_based_wrapper.py delete mode 100644 omnisafe/wrappers/saute_wrapper.py delete mode 100644 omnisafe/wrappers/simmer_wrapper.py delete mode 100644 omnisafe/wrappers/wrapper_registry.py delete mode 100644 tests/test_utils.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 37eeefea3..2933289bc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -61,10 +61,9 @@ jobs: run: | make addlicense - # TODO: enable this when ready - # - name: mypy - # run: | - # make mypy + - name: mypy + run: | + make mypy - name: Install dependencies run: | diff --git a/.pylintrc b/.pylintrc index 68ca39503..13c1bd408 100644 --- a/.pylintrc +++ b/.pylintrc @@ -289,10 +289,10 @@ exclude-too-few-public-methods= ignored-parents= # Maximum number of arguments for function / method. -max-args=5 +max-args=8 # Maximum number of attributes for a class (see R0902). -max-attributes=7 +max-attributes=12 # Maximum number of boolean expressions in an if statement (see R0916). max-bool-expr=5 @@ -301,22 +301,22 @@ max-bool-expr=5 max-branches=12 # Maximum number of locals for function / method body. -max-locals=15 +max-locals=20 # Maximum number of parents for a class (see R0901). -max-parents=7 +max-parents=12 # Maximum number of public methods for a class (see R0904). max-public-methods=20 # Maximum number of return / yield for function / method body. -max-returns=6 +max-returns=8 # Maximum number of statements in function / method body. -max-statements=50 +max-statements=80 # Minimum number of public methods for a class (see R0903). 
-min-public-methods=2 +min-public-methods=1 [EXCEPTIONS] diff --git a/docs/source/spelling_wordlist.txt b/docs/source/spelling_wordlist.txt index 7f8a2f617..653cf0d8b 100644 --- a/docs/source/spelling_wordlist.txt +++ b/docs/source/spelling_wordlist.txt @@ -369,3 +369,12 @@ noqa hyperparameters json msg +env's +CMDP +api +moviepy +normalizer +Unsqueeze +Golub +logp +loc diff --git a/omnisafe/__init__.py b/omnisafe/__init__.py index 3669e2ac5..71973dc66 100644 --- a/omnisafe/__init__.py +++ b/omnisafe/__init__.py @@ -17,7 +17,9 @@ from omnisafe import algorithms from omnisafe.algorithms import ALGORITHMS from omnisafe.algorithms.algo_wrapper import AlgoWrapper as Agent -from omnisafe.evaluator import Evaluator # from omnisafe.algorithms.env_wrapper import EnvWrapper as Env from omnisafe.version import __version__ + + +# from omnisafe.evaluator import Evaluator diff --git a/omnisafe/algorithms/model_based/__init__.py b/omnisafe/adapter/__init__.py similarity index 69% rename from omnisafe/algorithms/model_based/__init__.py rename to omnisafe/adapter/__init__.py index 54456b5b3..40b122bb1 100644 --- a/omnisafe/algorithms/model_based/__init__.py +++ b/omnisafe/adapter/__init__.py @@ -12,15 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Model-Based algorithms.""" +"""Adapter for the environment and the algorithm.""" -from omnisafe.algorithms.model_based.cap import CAP -from omnisafe.algorithms.model_based.mbppo_lag import MBPPOLag -from omnisafe.algorithms.model_based.safeloop import SafeLOOP - - -__all__ = [ - 'CAP', - 'MBPPOLag', - 'SafeLOOP', -] +from omnisafe.adapter.early_terminated_adapter import EarlyTerminatedAdapter +from omnisafe.adapter.online_adapter import OnlineAdapter +from omnisafe.adapter.onpolicy_adapter import OnPolicyAdapter +from omnisafe.adapter.saute_adapter import SauteAdapter diff --git a/omnisafe/adapter/early_terminated_adapter.py b/omnisafe/adapter/early_terminated_adapter.py new file mode 100644 index 000000000..4674d41a6 --- /dev/null +++ b/omnisafe/adapter/early_terminated_adapter.py @@ -0,0 +1,49 @@ +# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""OnPolicy Adapter for OmniSafe.""" + +from typing import Dict, Tuple + +import torch + +from omnisafe.adapter.onpolicy_adapter import OnPolicyAdapter +from omnisafe.utils.config import Config + + +class EarlyTerminatedAdapter(OnPolicyAdapter): + """OnPolicy Adapter for OmniSafe.""" + + def __init__(self, env_id: str, num_envs: int, seed: int, cfgs: Config) -> None: + assert num_envs == 1, 'EarlyTerminatedAdapter only supports num_envs=1.' 
+ + super().__init__(env_id, num_envs, seed, cfgs) + + self._cost_limit = cfgs.cost_limit + self._cost_logger = torch.zeros(self._env.num_envs) + + def step( + self, action: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Dict]: + next_obs, reward, cost, terminated, truncated, info = super().step(action) + + self._cost_logger += info.get('original_cost', cost) + + if self._cost_logger > self._cost_limit: + reward = torch.zeros(self._env.num_envs) # r_e = 0 + terminated = torch.ones(self._env.num_envs) + next_obs, _ = self._env.reset() + self._cost_logger = torch.zeros(self._env.num_envs) + + return next_obs, reward, cost, terminated, truncated, info diff --git a/omnisafe/adapter/online_adapter.py b/omnisafe/adapter/online_adapter.py new file mode 100644 index 000000000..f2439b508 --- /dev/null +++ b/omnisafe/adapter/online_adapter.py @@ -0,0 +1,125 @@ +# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Online Adapter for OmniSafe.""" + +from typing import Dict, Tuple + +import torch + +from omnisafe.envs.core import make, support_envs +from omnisafe.envs.wrapper import ( + ActionScale, + AutoReset, + CostNormalize, + ObsNormalize, + RewardNormalize, + TimeLimit, + Unsqueeze, +) +from omnisafe.typing import OmnisafeSpace +from omnisafe.utils.config import Config + + +class OnlineAdapter: + """Online Adapter for OmniSafe.""" + + def __init__( # pylint: disable=too-many-arguments + self, + env_id: str, + num_envs: int, + seed: int, + cfgs: Config, + ) -> None: + assert env_id in support_envs(), f'Env {env_id} is not supported.' + + self._env_id = env_id + self._env = make(env_id, num_envs=num_envs) + self._wrapper( + obs_normalize=cfgs.obs_normalize, + reward_normalize=cfgs.reward_normalize, + cost_normalize=cfgs.cost_normalize, + ) + self._env.set_seed(seed) + + self._cfgs = cfgs + + def _wrapper( + self, + obs_normalize: bool = True, + reward_normalize: bool = True, + cost_normalize: bool = True, + ): + if self._env.need_time_limit_wrapper: + self._env = TimeLimit(self._env, time_limit=1000) + if self._env.need_auto_reset_wrapper: + self._env = AutoReset(self._env) + if obs_normalize: + self._env = ObsNormalize(self._env) + if reward_normalize: + self._env = RewardNormalize(self._env) + if cost_normalize: + self._env = CostNormalize(self._env) + self._env = ActionScale(self._env, low=-1.0, high=1.0) + if self._env.num_envs == 1: + self._env = Unsqueeze(self._env) + + @property + def action_space(self) -> OmnisafeSpace: + """The action space of the environment. + + Returns: + OmnisafeSpace: the action space. + """ + return self._env.action_space + + @property + def observation_space(self) -> OmnisafeSpace: + """The observation space of the environment. + + Returns: + OmnisafeSpace: the observation space. 
+ """ + return self._env.observation_space + + def step( + self, action: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Dict]: + """Run one timestep of the environment's dynamics using the agent actions. + + Args: + action (torch.Tensor): action. + + Returns: + observation (torch.Tensor): agent's observation of the current environment. + reward (torch.Tensor): amount of reward returned after previous action. + cost (torch.Tensor): amount of cost returned after previous action. + terminated (torch.Tensor): whether the episode has ended, in which case further step() + calls will return undefined results. + truncated (torch.Tensor): whether the episode has been truncated due to a time limit. + info (Dict): contains auxiliary diagnostic information (helpful for debugging, and sometimes learning). + """ + return self._env.step(action) + + def reset(self) -> Tuple[torch.Tensor, Dict]: + """Resets the environment and returns an initial observation. + + Args: + seed (Optional[int]): seed for the environment. + + Returns: + observation (torch.Tensor): the initial observation of the space. + info (Dict): contains auxiliary diagnostic information (helpful for debugging, and sometimes learning). + """ + return self._env.reset() diff --git a/omnisafe/adapter/onpolicy_adapter.py b/omnisafe/adapter/onpolicy_adapter.py new file mode 100644 index 000000000..f816e20d4 --- /dev/null +++ b/omnisafe/adapter/onpolicy_adapter.py @@ -0,0 +1,136 @@ +# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""OnPolicy Adapter for OmniSafe.""" + +from typing import Dict, Optional + +import torch + +from omnisafe.adapter.online_adapter import OnlineAdapter +from omnisafe.common.buffer import VectorOnPolicyBuffer +from omnisafe.common.logger import Logger +from omnisafe.models.actor_critic.constraint_actor_critic import ConstraintActorCritic +from omnisafe.utils.config import Config + + +class OnPolicyAdapter(OnlineAdapter): + """OnPolicy Adapter for OmniSafe.""" + + def __init__( # pylint: disable=too-many-arguments + self, env_id: str, num_envs: int, seed: int, cfgs: Config + ) -> None: + super().__init__(env_id, num_envs, seed, cfgs) + + self._ep_ret: torch.Tensor + self._ep_cost: torch.Tensor + self._ep_len: torch.Tensor + self._reset_log() + + def roll_out( # pylint: disable=too-many-locals + self, + steps_per_epoch: int, + agent: ConstraintActorCritic, + buffer: VectorOnPolicyBuffer, + logger: Logger, + ) -> None: + """Roll out the environment and store the data in the buffer. + + Args: + steps_per_epoch (int): Number of steps per epoch. + agent (ConstraintActorCritic): Agent. + buf (VectorOnPolicyBuffer): Buffer. + logger (Logger): Logger. 
+ """ + self._reset_log() + + obs, _ = self.reset() + for step in range(steps_per_epoch): + act, value_r, value_c, logp = agent.step(obs) + next_obs, reward, cost, terminated, truncated, info = self.step(act) + + self._log_value(reward=reward, cost=cost, info=info) + + if self._cfgs.use_cost: + logger.store(**{'Value/cost': value_c}) + logger.store(**{'Value/reward': value_r}) + + buffer.store( + obs=obs, + act=act, + reward=reward, + cost=cost, + value_r=value_r, + value_c=value_c, + logp=logp, + ) + + obs = next_obs + dones = torch.logical_or(terminated, truncated) + epoch_end = step >= steps_per_epoch - 1 + for idx, done in enumerate(dones): + if epoch_end or done: + if epoch_end and not done: + logger.log( + f'Warning: trajectory cut off when rollout by epoch at {self._ep_len[idx]} steps.' + ) + _, last_value_r, last_value_c, _ = agent.step(obs[idx]) + last_value_r = last_value_r.unsqueeze(0) + last_value_c = last_value_c.unsqueeze(0) + elif done: + last_value_r = torch.zeros(1) + last_value_c = torch.zeros(1) + + self._log_metrics(logger, idx) + self._reset_log(idx) + + self._ep_ret[idx] = 0.0 + self._ep_cost[idx] = 0.0 + self._ep_len[idx] = 0.0 + + buffer.finish_path(last_value_r, last_value_c, idx) + + def _log_value( + self, + reward: torch.Tensor, + cost: torch.Tensor, + info: Dict, + **kwargs, # pylint: disable=unused-argument + ) -> None: # pylint: disable=unused-argument + """Log value.""" + self._ep_ret += info.get('original_reward', reward) + self._ep_cost += info.get('original_cost', cost) + self._ep_len += 1 + + def _log_metrics(self, logger: Logger, idx: int) -> None: + """Log metrics.""" + + logger.store( + **{ + 'Metrics/EpRet': self._ep_ret[idx], + 'Metrics/EpCost': self._ep_cost[idx], + 'Metrics/EpLen': self._ep_len[idx], + } + ) + + def _reset_log(self, idx: Optional[int] = None) -> None: + """Reset log.""" + if idx is None: + self._ep_ret = torch.zeros(self._env.num_envs) + self._ep_cost = torch.zeros(self._env.num_envs) + self._ep_len = torch.zeros(self._env.num_envs) + else: + self._ep_ret[idx] = 0.0 + self._ep_cost[idx] = 0.0 + self._ep_len[idx] = 0.0 diff --git a/omnisafe/adapter/saute_adapter.py b/omnisafe/adapter/saute_adapter.py new file mode 100644 index 000000000..1b65d60a8 --- /dev/null +++ b/omnisafe/adapter/saute_adapter.py @@ -0,0 +1,127 @@ +# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""OnPolicy Adapter for OmniSafe.""" + +from typing import Dict, Optional, Tuple + +import numpy as np +import torch +from gymnasium.spaces import Box + +from omnisafe.adapter.onpolicy_adapter import OnPolicyAdapter +from omnisafe.common.logger import Logger +from omnisafe.envs.wrapper import ActionScale, AutoReset, ObsNormalize, TimeLimit, Unsqueeze +from omnisafe.utils.config import Config + + +class SauteAdapter(OnPolicyAdapter): + """OnPolicy Adapter for OmniSafe.""" + + def __init__(self, env_id: str, num_envs: int, seed: int, cfgs: Config) -> None: + super().__init__(env_id, num_envs, seed, cfgs) + + self._safety_budget: torch.Tensor + self._safety_obs: torch.Tensor + + if self._cfgs.env_cfgs.scale_safety_budget: + self._safety_budget = ( + self._cfgs.env_cfgs.safety_budget + * (1 - self._cfgs.env_cfgs.saute_gamma**self._cfgs.env_cfgs.max_ep_len) + / (1 - self._cfgs.env_cfgs.saute_gamma) + / self._cfgs.env_cfgs.max_ep_len + * torch.ones(num_envs, 1) + ) + else: + self._safety_budget = self._cfgs.env_cfgs.safety_budget * torch.ones(num_envs, 1) + + self._ep_budget: torch.Tensor + + assert isinstance(self._env.observation_space, Box), 'Observation space must be Box' + self._observation_space = Box( + low=-np.inf, + high=np.inf, + shape=(self._env.observation_space.shape[0] + 1,), + ) + + @property + def observation_space(self) -> Box: + return self._observation_space + + def _wrapper( + self, + obs_normalize: bool = True, + reward_normalize: bool = False, + cost_normalize: bool = False, + ): + if self._env.need_time_limit_wrapper: + self._env = TimeLimit(self._env, time_limit=1000) + if self._env.need_auto_reset_wrapper: + self._env = AutoReset(self._env) + if obs_normalize: + self._env = ObsNormalize(self._env) + assert reward_normalize is False, 'Reward normalization is not supported' + assert cost_normalize is False, 'Cost normalization is not supported' + self._env = ActionScale(self._env, low=-1.0, high=1.0) + if self._env.num_envs == 1: + self._env = Unsqueeze(self._env) + + def reset(self) -> Tuple[torch.Tensor, Dict]: + obs, info = self._env.reset() + self._safety_obs = torch.ones(self._env.num_envs, 1) + obs = self._augment_obs(obs) + return obs, info + + def step( + self, action: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Dict]: + next_obs, reward, cost, terminated, truncated, info = self._env.step(action) + + self._safety_step(cost) + reward = self._safety_reward(reward) + + # autoreset the environment + done = torch.logical_or(terminated, truncated).float().unsqueeze(-1) + self._safety_obs = self._safety_obs * (1 - done) + done + + augmented_obs = self._augment_obs(next_obs) + + return augmented_obs, reward, cost, terminated, truncated, info + + def _safety_step(self, cost: torch.Tensor) -> None: + self._safety_obs -= cost.unsqueeze(-1) / self._safety_budget + self._safety_obs /= self._safety_budget + + def _safety_reward(self, reward: torch.Tensor) -> torch.Tensor: + safe = torch.as_tensor(self._safety_obs > 0, dtype=reward.dtype).squeeze(-1) + reward = safe * reward + (1 - safe) * self._cfgs.env_cfgs.unsafe_reward + return reward + + def _augment_obs(self, obs: torch.Tensor) -> torch.Tensor: + return torch.cat([obs, self._safety_obs], dim=-1) + + def _log_value(self, reward: torch.Tensor, cost: torch.Tensor, info: Dict, **kwargs) -> None: + super()._log_value(reward, cost, info, **kwargs) + self._ep_budget += self._safety_obs.squeeze(-1) 
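SauteAdapter implements the Sauté RL idea (Sootla et al., 2022): the remaining safety budget becomes part of the observation, is depleted by the normalized cost each step, and the reward is replaced by unsafe_reward once the budget runs out. A self-contained restatement of the budget bookkeeping on plain tensors, using the paper's update z' = (z - c/d) / gamma with gamma the saute discount (all numbers are illustrative, not this adapter's config):

    # Sketch of Saute-style budget bookkeeping (illustrative values only).
    import torch

    safety_budget, saute_gamma, unsafe_reward = 25.0, 0.999, -1.0
    obs = torch.zeros(1, 4)                      # stand-in for the raw observation
    z = torch.ones(1, 1)                         # normalized remaining budget, starts at 1
    reward, cost = torch.tensor([1.0]), torch.tensor([3.0])

    z = (z - cost.unsqueeze(-1) / safety_budget) / saute_gamma
    safe = (z > 0).to(reward.dtype).squeeze(-1)  # 1 while budget remains, else 0
    shaped_reward = safe * reward + (1 - safe) * unsafe_reward
    augmented_obs = torch.cat([obs, z], dim=-1)  # budget appended as an extra feature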
+ + def _reset_log(self, idx: Optional[int] = None) -> None: + super()._reset_log(idx) + if idx is None: + self._ep_budget = torch.zeros(self._env.num_envs) + else: + self._ep_budget[idx] = 0 + + def _log_metrics(self, logger: Logger, idx: int) -> None: + super()._log_metrics(logger, idx) + logger.store(**{'Metrics/EpBudget': self._ep_budget[idx]}) diff --git a/omnisafe/adapter/simmer_adapter.py b/omnisafe/adapter/simmer_adapter.py new file mode 100644 index 000000000..65f062bd6 --- /dev/null +++ b/omnisafe/adapter/simmer_adapter.py @@ -0,0 +1,62 @@ +# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""OnPolicy Adapter for OmniSafe.""" + +import numpy as np +import torch +from gymnasium.spaces import Box + +from omnisafe.adapter.onpolicy_adapter import OnPolicyAdapter +from omnisafe.adapter.saute_adapter import SauteAdapter +from omnisafe.utils.config import Config + + +class SimmerAdapter(SauteAdapter, OnPolicyAdapter): + """OnPolicy Adapter for OmniSafe.""" + + def __init__(self, env_id: str, num_envs: int, seed: int, cfgs: Config) -> None: + """Initialize the adapter.""" + super(OnPolicyAdapter, self).__init__(env_id, num_envs, seed, cfgs) + + self._safety_budget: torch.Tensor + self._safety_obs: torch.Tensor + + if self._cfgs.env_cfgs.scale_safety_budget: + self._safety_budget = ( + self._cfgs.env_cfgs.lower_budget + * (1 - self._cfgs.env_cfgs.saute_gamma**self._cfgs.env_cfgs.max_ep_len) + / (1 - self._cfgs.env_cfgs.saute_gamma) + / self._cfgs.env_cfgs.max_ep_len + ) + self._lower_budget = self._safety_budget + self._upper_budget = ( + self._cfgs.env_cfgs.upper_budget + * (1 - self._cfgs.env_cfgs.saute_gamma**self._cfgs.env_cfgs.max_ep_len) + / (1 - self._cfgs.env_cfgs.saute_gamma) + / self._cfgs.env_cfgs.max_ep_len + ) + else: + self._safety_budget = self._cfgs.env_cfgs.lower_budget + self._lower_budget = self._safety_budget + self._upper_budget = self._cfgs.env_cfgs.upper_budget + + self._ep_budget: torch.Tensor + + assert isinstance(self._env.observation_space, Box), 'Observation space must be Box' + self._observation_space = Box( + low=-np.inf, + high=np.inf, + shape=(self._env.observation_space.shape[0] + 1,), + ) diff --git a/omnisafe/algorithms/__init__.py b/omnisafe/algorithms/__init__.py index 5231479b6..9c74117ac 100644 --- a/omnisafe/algorithms/__init__.py +++ b/omnisafe/algorithms/__init__.py @@ -17,16 +17,11 @@ import itertools from types import MappingProxyType -from omnisafe.algorithms import model_based, off_policy, on_policy - -# Model-based Safe -from omnisafe.algorithms.model_based import CAP, MBPPOLag, SafeLOOP - -# Off-Policy Safe -from omnisafe.algorithms.off_policy import DDPG, SAC, SDDPG, TD3, DDPGLag, SACLag, TD3Lag +from omnisafe.algorithms import on_policy +from omnisafe.algorithms.base_algo import BaseAlgo # On-Policy Safe -from omnisafe.algorithms.on_policy import ( +from omnisafe.algorithms.on_policy import ( # 
PPOLagSimmerPid,; PPOLagSimmerQ,; PPOSimmerPid,; PPOSimmerQ, CPO, CUP, FOCOPS, @@ -43,20 +38,23 @@ PPOLag, PPOLagEarlyTerminated, PPOLagSaute, - PPOLagSimmerPid, - PPOLagSimmerQ, PPOSaute, - PPOSimmerPid, - PPOSimmerQ, TRPOLag, TRPOPid, ) +# Model-based Safe +# from omnisafe.algorithms.model_based import CAP, MBPPOLag, SafeLOOP + +# Off-Policy Safe +# from omnisafe.algorithms.off_policy import DDPG, SAC, SDDPG, TD3, DDPGLag, SACLag, TD3Lag + + ALGORITHMS = { - 'off-policy': tuple(off_policy.__all__), + # 'off-policy': tuple(off_policy.__all__), 'on-policy': tuple(on_policy.__all__), - 'model-based': tuple(model_based.__all__), + # 'model-based': tuple(model_based.__all__), } ALGORITHM2TYPE = { diff --git a/omnisafe/algorithms/algo_wrapper.py b/omnisafe/algorithms/algo_wrapper.py index d0ed9b2b1..1a75fc308 100644 --- a/omnisafe/algorithms/algo_wrapper.py +++ b/omnisafe/algorithms/algo_wrapper.py @@ -17,24 +17,32 @@ import difflib import os import sys +from typing import Any, Dict, Optional import psutil +import torch from safety_gymnasium.utils.registration import safe_registry from omnisafe.algorithms import ALGORITHM2TYPE, ALGORITHMS, registry -from omnisafe.utils import distributed_utils -from omnisafe.utils.config import check_all_configs, get_default_kwargs_yaml +from omnisafe.utils import distributed +from omnisafe.utils.config import get_default_kwargs_yaml class AlgoWrapper: """Algo Wrapper for algo.""" - def __init__(self, algo, env_id, parallel=1, custom_cfgs=None): + def __init__( + self, + algo: str, + env_id: str, + parallel: int = 1, + custom_cfgs: Optional[Dict[str, Any]] = None, + ): self.algo = algo self.parallel = parallel self.env_id = env_id # algo_type will set in _init_checks() - self.algo_type = None + self.algo_type: str self.custom_cfgs = custom_cfgs self.evaluator = None self._init_checks() @@ -55,7 +63,7 @@ def _init_checks(self): f"{self.env_id} doesn't exist. " f'Did you mean {difflib.get_close_matches(self.env_id, safe_registry, n=1)[0]}?' ) - self.algo_type = ALGORITHM2TYPE.get(self.algo, None) + self.algo_type = ALGORITHM2TYPE.get(self.algo, '') if self.algo_type is None or self.algo_type == '': raise ValueError(f'{self.algo} is not supported!') if self.algo_type in ['off-policy', 'model-based']: @@ -69,15 +77,17 @@ def learn(self): physical_cores = psutil.cpu_count(logical=False) use_number_of_threads = bool(self.parallel > physical_cores) + torch.set_num_threads(5) + cfgs = get_default_kwargs_yaml(self.algo, self.env_id, self.algo_type) exp_name = os.path.join(self.env_id, self.algo) cfgs.recurisve_update({'exp_name': exp_name, 'env_id': self.env_id}) if self.custom_cfgs is not None: cfgs.recurisve_update(self.custom_cfgs) - check_all_configs(cfgs, self.algo_type) + # check_all_configs(cfgs, self.algo_type) - if distributed_utils.mpi_fork( + if distributed.fork( self.parallel, use_number_of_threads=use_number_of_threads, device=cfgs.device ): # Re-launches the current script with workers linked by MPI @@ -87,22 +97,25 @@ def learn(self): cfgs=cfgs, ) agent.learn() - return agent.env.record_queue.get_mean('ep_ret', 'ep_cost', 'ep_len') + ep_ret = agent.logger.get_stats('Metrics/EpRet') + ep_len = agent.logger.get_stats('Metrics/EpLen') + ep_cost = agent.logger.get_stats('Metrics/EpCost') + return ep_ret, ep_len, ep_cost - def evaluate(self, num_episodes: int = 10, horizon: int = 1000, cost_criteria: float = 1.0): - """Agent Evaluation.""" - assert self.evaluator is not None, 'Please run learn() first!' 
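Even with the evaluator calls commented out for now, the public entry point stays the same: AlgoWrapper is exported as omnisafe.Agent, and learn() now reads its summary statistics back from the logger. A typical call, assuming safety-gymnasium is installed and using an example environment id:

    import omnisafe

    agent = omnisafe.Agent('PPOLag', 'SafetyPointGoal1-v0')  # algo name, then env id
    ep_ret, ep_len, ep_cost = agent.learn()                  # stats pulled from the Logger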
- self.evaluator.evaluate(num_episodes, horizon, cost_criteria) + # def evaluate(self, num_episodes: int = 10, horizon: int = 1000, cost_criteria: float = 1.0): + # """Agent Evaluation.""" + # assert self.evaluator is not None, 'Please run learn() first!' + # self.evaluator.evaluate(num_episodes, horizon, cost_criteria) - # pylint: disable-next=too-many-arguments - def render( - self, - num_episode: int = 0, - horizon: int = 1000, - seed: int = None, - play=True, - save_replay_path: str = None, - ): - """Render the environment.""" - assert self.evaluator is not None, 'Please run learn() first!' - self.evaluator.render(num_episode, horizon, seed, play, save_replay_path) + # # pylint: disable-next=too-many-arguments + # def render( + # self, + # num_episode: int = 0, + # horizon: int = 1000, + # seed: int = None, + # play=True, + # save_replay_path: Optional[str] = None, + # ): + # """Render the environment.""" + # assert self.evaluator is not None, 'Please run learn() first!' + # self.evaluator.render(num_episode, horizon, seed, play, save_replay_path) diff --git a/omnisafe/algorithms/base_algo.py b/omnisafe/algorithms/base_algo.py new file mode 100644 index 000000000..d28282115 --- /dev/null +++ b/omnisafe/algorithms/base_algo.py @@ -0,0 +1,67 @@ +# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementation of the Policy Gradient algorithm.""" + +from abc import ABC, abstractmethod + +import torch + +from omnisafe.utils import distributed +from omnisafe.utils.config import Config +from omnisafe.utils.tools import seed_all + + +class BaseAlgo(ABC): # pylint: disable=too-few-public-methods + """Base class for all algorithms.""" + + def __init__(self, env_id: str, cfgs: Config) -> None: + self._env_id = env_id + self._cfgs = cfgs + + assert hasattr(cfgs, 'seed'), 'Please specify the seed in the config file.' + self._seed = cfgs.seed + distributed.get_rank() * 1000 + seed_all(self._seed) + + assert hasattr(cfgs, 'device'), 'Please specify the device in the config file.' + self._device = torch.device(self._cfgs.device) + + distributed.setup_distributed() + + self._init_env() + self._init_model() + + self._init() + + self._init_log() + + @abstractmethod + def _init(self) -> None: + """Initialize the algorithm.""" + + @abstractmethod + def _init_env(self) -> None: + """Initialize the environment.""" + + @abstractmethod + def _init_model(self) -> None: + """Initialize the model.""" + + @abstractmethod + def _init_log(self) -> None: + """Initialize the logger.""" + + @abstractmethod + def learn(self) -> None: + """Learn the policy.""" diff --git a/omnisafe/algorithms/model_based/cap.py b/omnisafe/algorithms/model_based/cap.py deleted file mode 100644 index 195466ecb..000000000 --- a/omnisafe/algorithms/model_based/cap.py +++ /dev/null @@ -1,140 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. 
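base_algo.py pins down the construction order every algorithm now follows: seed the RNGs, pick the device, set up distributed training, then _init_env, _init_model, _init and _init_log, with learn() left to the subclass. A hypothetical subclass (not part of this patch) only fills in those hooks:

    from omnisafe.algorithms.base_algo import BaseAlgo


    class MyAlgo(BaseAlgo):
        """Hypothetical example showing which hooks a concrete algorithm implements."""

        def _init_env(self) -> None:
            ...  # e.g. build an OnPolicyAdapter from self._env_id and self._cfgs

        def _init_model(self) -> None:
            ...  # e.g. build a ConstraintActorCritic on self._device

        def _init(self) -> None:
            ...  # buffers, Lagrange multiplier, optimizers

        def _init_log(self) -> None:
            ...  # create the Logger and register keys

        def learn(self) -> None:
            ...  # roll out with the adapter, update the models, log metrics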
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Implementation of the CAP algorithm. The CAP in safety-gym may unable to converge.""" - -import numpy as np - -from omnisafe.algorithms import registry -from omnisafe.algorithms.model_based.planner import CCEPlanner -from omnisafe.algorithms.model_based.policy_gradient import PolicyGradientModelBased -from omnisafe.common.lagrange import Lagrange - - -@registry.register -class CAP( - PolicyGradientModelBased, CCEPlanner, Lagrange -): # pylint: disable=too-many-instance-attributes - """The Conservative and Adaptive Penalty (CAP) algorithm. - - References: - Title: Conservative and Adaptive Penalty for Model-Based Safe Reinforcement Learning - Authors: Yecheng Jason Ma, Andrew Shen, Osbert Bastani, Dinesh Jayaraman. - URL: https://arxiv.org/abs/2112.07701 - """ - - def __init__(self, env_id, cfgs) -> None: - PolicyGradientModelBased.__init__( - self, - env_id=env_id, - cfgs=cfgs, - ) - Lagrange.__init__(self, **self.cfgs.lagrange_cfgs, device=self.cfgs.device) - CCEPlanner.__init__( - self, - algo=self.algo, - cfgs=self.cfgs, - device=self.device, - env=self.env, - models=self.virtual_env, - **self.cfgs.mpc_config, - lagrangian_multiplier=self.lagrangian_multiplier, - ) - # Set up model saving - what_to_save = { - 'dynamics': self.dynamics, - } - self.logger.setup_torch_saver(what_to_save=what_to_save) - self.logger.torch_save() - - def _specific_init_logs(self): - self.logger.register_key('Loss/DynamicsTrainMseLoss') - self.logger.register_key('Loss/DynamicsValMseLoss') - self.logger.register_key('Penalty') - - def algorithm_specific_logs(self, time_step): - """Log algo parameter""" - super().algorithm_specific_logs(time_step) - self.logger.store( - **{'Penalty': self.lambda_range_projection(self.lagrangian_multiplier).item()} - ) - - def update_dynamics_model(self): - """Update dynamics.""" - state = self.off_replay_buffer.data['obs'][: self.off_replay_buffer.size, :] - action = self.off_replay_buffer.data['act'][: self.off_replay_buffer.size, :] - reward = self.off_replay_buffer.data['reward'][: self.off_replay_buffer.size] - cost = self.off_replay_buffer.data['cost'][: self.off_replay_buffer.size] - next_state = self.off_replay_buffer.data['next_obs'][: self.off_replay_buffer.size, :] - delta_state = next_state - state - inputs = np.concatenate((state, action), axis=-1) - if self.env.env_type == 'mujoco-velocity': - labels = np.concatenate( - ( - np.reshape(reward, (reward.shape[0], -1)), - np.reshape(cost, (cost.shape[0], -1)), - delta_state, - ), - axis=-1, - ) - elif self.env.env_type == 'gym': - labels = np.concatenate( - (np.reshape(reward, (reward.shape[0], -1)), delta_state), axis=-1 - ) - train_mse_losses, val_mse_losses = self.dynamics.train( - inputs, labels, batch_size=256, holdout_ratio=0.2 - ) - - ep_costs = self.logger.get_stats('Metrics/EpCost')[0] - # update Lagrange multiplier parameter - self.update_lagrange_multiplier(ep_costs) - - 
self.logger.store( - **{ - 'Loss/DynamicsTrainMseLoss': train_mse_losses, - 'Loss/DynamicsValMseLoss': val_mse_losses, - } - ) - - def select_action(self, time_step, state, env): - """action selection""" - action = self.get_action(np.array(state)) - return action, None - - def store_real_data( - self, - time_step, - ep_len, - state, - action_info, - action, - reward, - cost, - terminated, - truncated, - next_state, - info, - ): # pylint: disable=too-many-arguments - """store real data""" - if not terminated and not truncated and not info['goal_met']: - # Current goal position is not related to the last goal position, so do not store. - self.off_replay_buffer.store( - obs=state, act=action, reward=reward, cost=cost, next_obs=next_state, done=truncated - ) - - def algo_reset(self): - """reset planner""" - - def set_algorithm_specific_actor_critic(self): - """Initialize Soft Actor-Critic""" diff --git a/omnisafe/algorithms/model_based/mbppo_lag.py b/omnisafe/algorithms/model_based/mbppo_lag.py deleted file mode 100644 index 1f3e04d37..000000000 --- a/omnisafe/algorithms/model_based/mbppo_lag.py +++ /dev/null @@ -1,445 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Implementation of the Model-based PPO-Lag algorithm.""" - -import numpy as np -import torch - -from omnisafe.algorithms import registry -from omnisafe.algorithms.model_based.policy_gradient import PolicyGradientModelBased -from omnisafe.common.buffer import OnPolicyBuffer -from omnisafe.common.lagrange import Lagrange -from omnisafe.models.constraint_actor_critic import ConstraintActorCritic -from omnisafe.utils import core -from omnisafe.wrappers import wrapper_registry - - -@registry.register -# pylint: disable-next=too-many-instance-attributes -class MBPPOLag(PolicyGradientModelBased, Lagrange): - """The Model-based PPO-Lag algorithm. - - References: - Title: Model-based Safe Deep Reinforcement Learning via a Constrained Proximal Policy Optimization Algorithm - Authors: Ashish Kumar Jayant, Shalabh Bhatnagar. 
- URL: https://arxiv.org/abs/2210.07573 - """ - - def __init__(self, env_id, cfgs) -> None: - PolicyGradientModelBased.__init__( - self, - env_id=env_id, - cfgs=cfgs, - ) - Lagrange.__init__(self, **self.cfgs.lagrange_cfgs, device=self.cfgs.device) - self.clip = self.cfgs.clip - self.loss_pi_before = 0.0 - self.loss_v_before = 0.0 - self.loss_c_before = 0.0 - self.env_auxiliary = wrapper_registry.get(self.wrapper_type)(self.algo, self.env_id) - # Initialize Actor-Critic - self.actor_critic = self.set_algorithm_specific_actor_critic() - self.buf = OnPolicyBuffer( - obs_space=self.env.observation_space, - act_space=self.env.action_space, - size=self.cfgs.imaging_steps_per_policy_update, - gamma=self.cfgs.buffer_cfgs.gamma, - lam=self.cfgs.buffer_cfgs.lam, - lam_c=self.cfgs.buffer_cfgs.lam_c, - advantage_estimator=self.cfgs.buffer_cfgs.advantage_estimator, - penalty_coefficient=0, - standardized_adv_r=self.cfgs.buffer_cfgs.standardized_reward, - standardized_adv_c=self.cfgs.buffer_cfgs.standardized_cost, - device=self.device, - ) - # Set up model saving - what_to_save = { - 'pi': self.actor_critic.actor, - 'dynamics': self.dynamics, - } - self.logger.setup_torch_saver(what_to_save=what_to_save) - self.logger.torch_save() - - def _specific_init_logs(self): - self.logger.register_key('DynaMetrics/EpRet') - self.logger.register_key('DynaMetrics/EpLen') - self.logger.register_key('DynaMetrics/EpCost') - self.logger.register_key('Loss/DynamicsTrainMseLoss') - self.logger.register_key('Loss/DynamicsValMseLoss') - self.logger.register_key('Loss/Pi') - self.logger.register_key('Loss/Value') - self.logger.register_key('Loss/DeltaPi') - self.logger.register_key('Loss/DeltaValue') - self.logger.register_key('Loss/CValue') - self.logger.register_key('Loss/DeltaCValue') - self.logger.register_key('Penalty') - self.logger.register_key('Values/Adv') - self.logger.register_key('Values/Adv_C') - self.logger.register_key('Megaiter') - self.logger.register_key('Entropy') - self.logger.register_key('KL') - self.logger.register_key('Misc/StopIter') - self.logger.register_key('PolicyRatio') - - def algorithm_specific_logs(self, time_step): - """log algo parameter""" - super().algorithm_specific_logs(time_step) - self.logger.store( - **{'Penalty': self.lambda_range_projection(self.lagrangian_multiplier).item()} - ) - - def update_actor_critic(self, time_step): # pylint: disable=unused-argument - """update actor critic""" - megaiter = 0 - last_valid_rets = np.zeros(self.cfgs.dynamics_cfgs.elite_size) - while True: - self.roll_out_in_imaginary(megaiter) - # validation - if megaiter > 0: - old_actor = self.get_param_values(self.actor_critic.actor) - old_reward_critic = self.get_param_values(self.actor_critic.reward_critic) - old_cost_critic = self.get_param_values(self.actor_critic.cost_critic) - self.update() - result, valid_rets = self.validation(last_valid_rets) - if result is True: - # backtrack - self.set_param_values(old_actor, self.actor_critic.actor) - self.set_param_values(old_reward_critic, self.actor_critic.reward_critic) - self.set_param_values(old_cost_critic, self.actor_critic.cost_critic) - megaiter += 1 - break - megaiter += 1 - last_valid_rets = valid_rets - else: - megaiter += 1 - self.update() - - self.logger.store(Megaiter=megaiter) - - def update(self): - """Get data from buffer and update Lagrange multiplier, actor, critic""" - data = self.buf.get() - # Note that logger already uses MPI statistics across all processes.. 
- ep_costs = self.logger.get_stats('DynaMetrics/EpCost')[0] - # First update Lagrange multiplier parameter - self.update_lagrange_multiplier(ep_costs) - # now update policy and value network - self.update_policy_net(data=data) - self.update_value_net(data=data) - - def compute_loss_v(self, data): - """compute the loss of value function""" - obs, ret, cret = data['obs'], data['target_v'], data['target_c'] - return ((self.actor_critic.reward_critic(obs) - ret) ** 2).mean(), ( - (self.actor_critic.cost_critic(obs) - cret) ** 2 - ).mean() - - def compute_loss_pi(self, data): - """compute the loss of policy""" - dist, _log_p = self.actor_critic.actor(data['obs'], data['act']) - ratio = torch.exp(_log_p - data['log_p']) - ratio_clip = torch.clamp(ratio, 1 - self.clip, 1 + self.clip) - loss_pi = -(torch.min(ratio * data['adv'], ratio_clip * data['adv'])).mean() - - # ensure that Lagrange multiplier is positive - penalty = self.lambda_range_projection(self.lagrangian_multiplier).item() - loss_pi += penalty * ((ratio * data['cost_adv']).mean()) - loss_pi /= 1 + penalty - - # Useful extra info - approx_kl = (data['log_p'] - _log_p).mean().item() - ent = dist.entropy().mean().item() - clipped = ratio.gt(1 + self.clip) | ratio.lt(1 - self.clip) - clipfrac = torch.as_tensor(clipped, device=self.device, dtype=torch.float32).mean().item() - pi_info = {'kl': approx_kl, 'ent': ent, 'cf': clipfrac} - return loss_pi, pi_info - - def update_dynamics_model(self): - """compute the loss of dynamics""" - state = self.off_replay_buffer.data['obs'][: self.off_replay_buffer.size, :] - action = self.off_replay_buffer.data['act'][: self.off_replay_buffer.size, :] - reward = self.off_replay_buffer.data['reward'][: self.off_replay_buffer.size] - cost = self.off_replay_buffer.data['cost'][: self.off_replay_buffer.size] - next_state = self.off_replay_buffer.data['next_obs'][: self.off_replay_buffer.size, :] - delta_state = next_state - state - inputs = np.concatenate((state, action), axis=-1) - if self.env.env_type == 'mujoco-velocity': - labels = np.concatenate( - ( - np.reshape(reward, (reward.shape[0], -1)), - np.reshape(cost, (cost.shape[0], -1)), - delta_state, - ), - axis=-1, - ) - elif self.env.env_type == 'gym': - labels = delta_state - train_mse_losses, val_mse_losses = self.dynamics.train( - inputs, labels, batch_size=256, holdout_ratio=0.2 - ) - self.logger.store( - **{ - 'Loss/DynamicsTrainMseLoss': train_mse_losses, - 'Loss/DynamicsValMseLoss': val_mse_losses, - } - ) - - def update_policy_net(self, data): - """update policy""" - # Get prob. 
distribution before updates: used to measure KL distance - pi_l_old, pi_info_old = self.compute_loss_pi(data) - self.loss_pi_before = pi_l_old.item() - # Train policy with multiple steps of gradient descent - for i in range(self.cfgs.pi_iters): - loss_pi, pi_info = self.compute_loss_pi(data) - kl_div = pi_info['kl'] - if self.cfgs.kl_early_stopping: - if kl_div > self.cfgs.target_kl: - self.logger.log(f'Reached ES criterion after {i+1} steps.') - break - self.actor_optimizer.zero_grad() - loss_pi.backward() - self.actor_optimizer.step() - self.logger.store( - **{ - 'Loss/Pi': self.loss_pi_before, - 'Loss/DeltaPi': loss_pi.item() - self.loss_pi_before, - 'Misc/StopIter': i + 1, - 'Values/Adv': data['adv'].cpu().numpy(), - 'Values/Adv_C': data['cost_adv'].cpu().numpy(), - 'Entropy': pi_info_old['ent'], - 'KL': pi_info['kl'], - 'PolicyRatio': pi_info['cf'], - } - ) - - def update_value_net(self, data): - """Value function learning""" - v_l_old, cv_l_old = self.compute_loss_v(data) - self.loss_v_before, self.loss_c_before = v_l_old.item(), cv_l_old.item() - - for _ in range(self.cfgs.critic_iters): - loss_v, loss_vc = self.compute_loss_v(data) - self.reward_critic_optimizer.zero_grad() - loss_v.backward() - self.reward_critic_optimizer.step() - - self.cost_critic_optimizer.zero_grad() - loss_vc.backward() - self.cost_critic_optimizer.step() - - self.logger.store( - **{ - 'Loss/DeltaValue': loss_v.item() - self.loss_v_before, - 'Loss/Value': self.loss_v_before, - 'Loss/DeltaCValue': loss_vc.item() - self.loss_c_before, - 'Loss/CValue': self.loss_c_before, - } - ) - - def get_param_values(self, model): - """get the dynamics parameters""" - trainable_params = list(model.parameters()) - params = np.concatenate( - [p.contiguous().view(-1).data.cpu().numpy() for p in trainable_params] - ) - return params.copy() - - def set_param_values(self, new_params, model, set_new=True): - """set the dynamics parameters""" - trainable_params = list(model.parameters()) - param_shapes = [p.data.cpu().numpy().shape for p in trainable_params] - param_sizes = [p.data.cpu().numpy().size for p in trainable_params] - if set_new: - current_idx = 0 - for idx, param in enumerate(trainable_params): - vals = new_params[current_idx : current_idx + param_sizes[idx]] - vals = vals.reshape(param_shapes[idx]) - param.data = torch.from_numpy(vals).float().to(self.device) - current_idx += param_sizes[idx] - - def roll_out_in_imaginary(self, megaiter): # pylint: disable=too-many-locals - """collect data and store to experience buffer.""" - state = self.env_auxiliary.reset() - dep_ret, dep_cost, dep_len = 0, 0, 0 - mix_real = self.cfgs.mixed_real_time_steps if megaiter == 0 else 0 - - for time_step in range(self.cfgs.imaging_steps_per_policy_update - mix_real): - action, action_info = self.select_action(time_step, state, self.env_auxiliary) - next_state, reward, cost, info = self.virtual_step(state, action) - - dep_ret += reward - dep_cost += (self.cost_gamma**dep_len) * cost - dep_len += 1 - - self.buf.store( - obs=action_info['state_vec'], - act=action, - rew=reward, - val=action_info['val'], - logp=action_info['logp'], - cost=cost, - cost_val=action_info['cval'], - ) - state = next_state - - timeout = dep_len == self.cfgs.horizon - truncated = timeout - epoch_ended = time_step == self.cfgs.imaging_steps_per_policy_update - 1 - if truncated or epoch_ended or info['goal_flag']: - if timeout or epoch_ended or info['goal_flag']: - state_tensor = torch.as_tensor( - action_info['state_vec'], device=self.device, dtype=torch.float32 - ) - 
_, val, cval, _ = self.actor_critic.step(state_tensor) - del state_tensor - else: - # this means episode is terminated, - # and this will be triggered only in robots fall down case - val = 0 - cval = 0 - self.buf.finish_path(val, cval) - if timeout: - # only save EpRet / EpLen if trajectory finished - self.logger.store( - **{ - 'DynaMetrics/EpRet': dep_ret, - 'DynaMetrics/EpLen': dep_len, - 'DynaMetrics/EpCost': dep_cost, - } - ) - state = self.env_auxiliary.reset() - dep_ret, dep_len, dep_cost = 0, 0, 0 - - def validation(self, last_valid_rets): - """policy validation""" - valid_rets = np.zeros(self.cfgs.validation_num) - winner = 0 - for valid_id in range(len(valid_rets)): # pylint:disable=consider-using-enumerate - state = self.env_auxiliary.reset() - for step in range(self.cfgs.validation_horizon): - action, _ = self.select_action(step, state, self.env_auxiliary) - next_state, reward, _, info = self.virtual_step(state, action, idx=valid_id) - valid_rets[valid_id] += reward - state = next_state - if info['goal_flag']: - state = self.env_auxiliary.reset() - if valid_rets[valid_id] > last_valid_rets[valid_id]: - winner += 1 - performance_ratio = winner / self.cfgs.validation_num - threshold = self.cfgs.validation_threshold_num / self.cfgs.validation_num - result = performance_ratio < threshold - return result, valid_rets - - # pylint: disable-next=too-many-arguments - def store_real_data( - self, - time_step, - ep_len, - state, - action_info, - action, - reward, - cost, - terminated, - truncated, - next_state, - info, - ): - """store real data""" - if not terminated and not truncated and not info['goal_met']: - self.off_replay_buffer.store( - obs=state, act=action, reward=reward, cost=cost, next_obs=next_state, done=truncated - ) - if ( - time_step % self.cfgs.update_policy_freq <= self.cfgs.mixed_real_time_steps - and self.buf.ptr < self.cfgs.mixed_real_time_steps - ): - self.buf.store( - obs=action_info['state_vec'], - act=action, - rew=reward, - val=action_info['val'], - logp=action_info['logp'], - cost=cost, - cost_val=action_info['cval'], - ) - if terminated: - # this means episode is terminated, - # which will be triggered only in robots fall down case - val = 0 - cval = 0 - self.buf.finish_path(val, cval) - - # reached max imaging horizon, mixed real timestep, real max timestep , or episode truncated. 
- elif ( - time_step % self.cfgs.horizon < self.cfgs.action_repeat - or self.buf.ptr == self.cfgs.mixed_real_time_steps - or time_step >= self.cfgs.max_real_time_steps - or truncated - ): - state_tensor = torch.as_tensor( - action_info['state_vec'], device=self.device, dtype=torch.float32 - ) - _, val, cval, _ = self.actor_critic.step(state_tensor) - del state_tensor - self.buf.finish_path(val, cval) - - def algo_reset(self): - """reset algo parameters""" - - def virtual_step(self, state, action, idx=None): - """use virtual environment to predict next state, reward, cost""" - if self.env.env_type == 'gym': - next_state, _, _, _ = self.virtual_env.mbppo_step(state, action, idx) - next_state = np.nan_to_num(next_state) - next_state = np.clip(next_state, -self.cfgs.obs_clip, self.cfgs.obs_clip) - reward, cost, goal_flag = self.env_auxiliary.get_reward_cost(next_state) - info = {'goal_flag': goal_flag} - elif self.env.env_type == 'mujoco-velocity': - next_state, reward, cost, _ = self.virtual_env.mbppo_step(state, action, idx) - next_state = np.nan_to_num(next_state) - reward = np.nan_to_num(reward) - cost = np.nan_to_num(cost) - next_state = np.clip(next_state, -self.cfgs.obs_clip, self.cfgs.obs_clip) - info = {'goal_flag': False} - return next_state, reward, cost, info - - def set_algorithm_specific_actor_critic(self): - """ - Use this method to initialize network. - e.g. Initialize Soft Actor Critic - - Returns: - Actor_critic - """ - self.actor_critic = ConstraintActorCritic( - observation_space=self.env.observation_space, - action_space=self.env.action_space, - model_cfgs=self.cfgs.model_cfgs, - ).to(self.device) - # Set up optimizer for policy and value function - - self.actor_optimizer = core.set_optimizer( - 'Adam', module=self.actor_critic.actor, learning_rate=self.cfgs.actor_lr - ) - self.reward_critic_optimizer = core.set_optimizer( - 'Adam', module=self.actor_critic.reward_critic, learning_rate=self.cfgs.critic_lr - ) - self.cost_critic_optimizer = core.set_optimizer( - 'Adam', module=self.actor_critic.cost_critic, learning_rate=self.cfgs.critic_lr - ) - - return self.actor_critic diff --git a/omnisafe/algorithms/model_based/models/dynamic_model.py b/omnisafe/algorithms/model_based/models/dynamic_model.py deleted file mode 100644 index 37ae6d9ef..000000000 --- a/omnisafe/algorithms/model_based/models/dynamic_model.py +++ /dev/null @@ -1,405 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -# Modified version of model.py from https://github.com/Xingyu-Lin/mbpo_pytorch/blob/main/model.py -# original version doesn't validate model error batch-wise and is highly memory intensive. 
-# ============================================================================== -"""Dynamics Model""" - -import itertools - -import numpy as np -import torch -import torch.nn as nn -import torch.nn.functional as F - - -def swish(data): - """Transform data using sigmoid function.""" - return data * torch.sigmoid(data) - - -class StandardScaler: - """Normalize data""" - - def __init__(self, device=torch.device('cpu')): - self.mean = 0.0 - self.std = 1.0 - self.mean_t = torch.tensor(self.mean).to(device) - self.std_t = torch.tensor(self.std).to(device) - self.device = device - - def fit(self, data): - """Runs two ops, one for assigning the mean of the data to the internal mean, and - another for assigning the standard deviation of the data to the internal standard deviation. - This function must be called within a 'with .as_default()' block. - - Arguments: - data (np.ndarray): A numpy array containing the input - - Returns: None. - """ - self.mean = np.mean(data, axis=0, keepdims=True) - self.std = np.std(data, axis=0, keepdims=True) - self.std[self.std < 1e-12] = 1.0 - self.mean_t = torch.FloatTensor(self.mean).to(self.device) - self.std_t = torch.FloatTensor(self.std).to(self.device) - - def transform(self, data): - """Transforms the input matrix data using the parameters of this scaler. - - Arguments: - data (np.array): A numpy array containing the points to be transformed. - - Returns: (np.array) The transformed dataset. - """ - if torch.is_tensor(data): - return (data - self.mean_t) / self.std_t - return (data - self.mean) / self.std - - -def init_weights(layer): - """Initialize network weight""" - - def truncated_normal_init(weight, mean=0.0, std=0.01): - """Initialize network weight""" - torch.nn.init.normal_(weight, mean=mean, std=std) - while True: - cond = torch.logical_or(weight < mean - 2 * std, weight > mean + 2 * std) - if not torch.sum(cond): - break - weight = torch.where( - cond, torch.nn.init.normal_(torch.ones(weight.shape), mean=mean, std=std), weight - ) - return weight - - if isinstance(layer, (nn.Linear, EnsembleFC)): - input_dim = layer.in_features - truncated_normal_init(layer.weight, std=1 / (2 * np.sqrt(input_dim))) - layer.bias.data.fill_(0.0) - - -class EnsembleFC(nn.Module): - """Ensemble fully connected network""" - - __constants__ = ['in_features', 'out_features'] - in_features: int - out_features: int - ensemble_size: int - weight: torch.Tensor - - # pylint: disable-next=too-many-arguments - def __init__( - self, - in_features: int, - out_features: int, - ensemble_size: int, - weight_decay: float = 0.0, - bias: bool = True, - ) -> None: - super().__init__() - self.in_features = in_features - self.out_features = out_features - self.ensemble_size = ensemble_size - self.weight = nn.Parameter(torch.Tensor(ensemble_size, in_features, out_features)) - self.weight_decay = weight_decay - if bias: - self.bias = nn.Parameter(torch.Tensor(ensemble_size, out_features)) - else: - self.register_parameter('bias', None) - self.reset_parameters() - - def reset_parameters(self) -> None: - """reset parameters""" - - def forward(self, input_data: torch.Tensor) -> torch.Tensor: - """forward""" - w_times_x = torch.bmm(input_data, self.weight) - return torch.add(w_times_x, self.bias[:, None, :]) # w times x + b - - -# pylint: disable-next=too-many-instance-attributes -class EnsembleModel(nn.Module): - """Ensemble dynamics model""" - - # pylint: disable-next=too-many-arguments - def __init__( - self, - algo, - env_type, - state_size, - action_size, - reward_size, - cost_size, - 
ensemble_size, - hidden_size=200, - learning_rate=1e-3, - use_decay=False, - ): - super().__init__() - self.algo = algo - self.env_type = env_type - - self.state_size = state_size - self.reward_size = reward_size - self.cost_size = cost_size - if self.algo == 'MBPPOLag' and self.env_type == 'gym': - self.output_dim = state_size - elif self.algo == 'SafeLOOP' and self.env_type == 'gym': - self.output_dim = state_size + reward_size - elif self.algo == 'CAP' and self.env_type == 'gym': - self.output_dim = state_size + reward_size - elif self.env_type == 'mujoco-velocity': - self.output_dim = state_size + reward_size + cost_size - self.hidden_size = hidden_size - self.use_decay = use_decay - - self.nn1 = EnsembleFC( - state_size + action_size, hidden_size, ensemble_size, weight_decay=0.000025 - ) - self.nn2 = EnsembleFC(hidden_size, hidden_size, ensemble_size, weight_decay=0.00005) - self.nn3 = EnsembleFC(hidden_size, hidden_size, ensemble_size, weight_decay=0.000075) - self.nn4 = EnsembleFC(hidden_size, hidden_size, ensemble_size, weight_decay=0.000075) - self.nn5 = EnsembleFC(hidden_size, self.output_dim * 2, ensemble_size, weight_decay=0.0001) - - self.register_buffer('max_logvar', (torch.ones((1, self.output_dim)).float() / 2)) - self.register_buffer('min_logvar', (-torch.ones((1, self.output_dim)).float() * 10)) - self.optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate) - self.apply(init_weights) - - # pylint: disable-next=too-many-locals - def forward(self, data, ret_log_var=False): - """Compute next state, reward, cost""" - nn1_output = swish(self.nn1(data)) - nn2_output = swish(self.nn2(nn1_output)) - nn3_output = swish(self.nn3(nn2_output)) - nn4_output = swish(self.nn4(nn3_output)) - nn5_output = self.nn5(nn4_output) - mean = nn5_output[:, :, : self.output_dim] - logvar = self.max_logvar - F.softplus(self.max_logvar - nn5_output[:, :, self.output_dim :]) - logvar = self.min_logvar + F.softplus(logvar - self.min_logvar) - var = torch.exp(logvar) - if ret_log_var: - return mean, logvar - return mean, var - - def get_decay_loss(self): - """Get decay loss""" - decay_loss = 0.0 - for layer in self.children(): - if isinstance(layer, EnsembleFC): - decay_loss += layer.weight_decay * torch.sum(torch.square(layer.weight)) / 2.0 - return decay_loss - - def loss(self, mean, logvar, labels, inc_var_loss=True): - """ - mean, logvar: Ensemble_size x N x dim - labels: N x dim - """ - assert len(mean.shape) == len(logvar.shape) == len(labels.shape) == 3 - inv_var = torch.exp(-logvar) - if inc_var_loss: - # Average over batch and dim, sum over ensembles. 
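# Per ensemble member i, the loss below is the diagonal-Gaussian negative log-likelihood
# up to constants: the mean over batch and output dim of (mu_i - y)^2 * exp(-logvar_i),
# plus the mean of logvar_i; the per-member losses are then summed, so every member is
# trained independently on the same targets.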
- mse_loss = torch.mean(torch.mean(torch.pow(mean - labels, 2) * inv_var, dim=-1), dim=-1) - var_loss = torch.mean(torch.mean(logvar, dim=-1), dim=-1) - total_loss = torch.sum(mse_loss) + torch.sum(var_loss) - else: - mse_loss = torch.mean(torch.pow(mean - labels, 2), dim=(1, 2)) - total_loss = torch.sum(mse_loss) - return total_loss, mse_loss - - def train_ensemble(self, loss): - """Train the dynamics model""" - self.optimizer.zero_grad() - loss += 0.01 * torch.sum(self.max_logvar) - 0.01 * torch.sum(self.min_logvar) - if self.use_decay: - loss += self.get_decay_loss() - loss.backward() - self.optimizer.step() - - -# pylint: disable-next=too-many-instance-attributes -class EnsembleDynamicsModel: - """Dynamics model for predict next state, reward and cost""" - - # pylint: disable-next=too-many-arguments - def __init__( - self, - algo, - env_type, - device, - network_size, - elite_size, - hidden_size, - use_decay, - state_size, - action_size, - reward_size, - cost_size, - ): - self.algo = algo - self.network_size = network_size - self.elite_size = elite_size - self.model_list = [] - self.state_size = state_size - self.action_size = action_size - self.reward_size = reward_size - self.cost_size = cost_size - self.network_size = network_size - self.device = device - if self.algo == 'MBPPOLag': - self.elite_model_idxes = [] - elif self.algo in ['SafeLOOP', 'CAP']: - self.elite_model_idxes = list(range(self.elite_size)) - self.env_type = env_type - self.ensemble_model = EnsembleModel( - algo, - env_type, - state_size, - action_size, - reward_size, - cost_size, - network_size, - hidden_size, - use_decay=use_decay, - ) - self.ensemble_model.to(self.device) - self.scaler = StandardScaler(self.device) - self._max_epochs_since_update = 5 - self._epochs_since_update = 0 - self._state = {} - self._snapshots = {i: (None, 1e10) for i in range(self.network_size)} - - # pylint: disable-next=too-many-locals, too-many-arguments - def train(self, inputs, labels, batch_size=256, holdout_ratio=0.0, max_epochs_since_update=5): - """train dynamics, holdout_ratio is the data ratio hold out for validation""" - self._max_epochs_since_update = max_epochs_since_update - self._epochs_since_update = 0 - self._state = {} - self._snapshots = {i: (None, 1e10) for i in range(self.network_size)} - - num_holdout = int(inputs.shape[0] * holdout_ratio) - permutation = np.random.permutation(inputs.shape[0]) - inputs, labels = inputs[permutation], labels[permutation] - - # split training and testing dataset - train_inputs, train_labels = inputs[num_holdout:], labels[num_holdout:] - holdout_inputs, holdout_labels = inputs[:num_holdout], labels[:num_holdout] - self.scaler.fit(train_inputs) - train_inputs = self.scaler.transform(train_inputs) - holdout_inputs = self.scaler.transform(holdout_inputs) - - for epoch in itertools.count(): - train_mse_losses = [] - # training - train_idx = np.vstack( - [np.random.permutation(train_inputs.shape[0]) for _ in range(self.network_size)] - ) - # shape: [train_inputs.shape[0],network_size] - - for start_pos in range(0, train_inputs.shape[0], batch_size): - idx = train_idx[:, start_pos : start_pos + batch_size] - train_input = torch.from_numpy(train_inputs[idx]).float().to(self.device) - train_label = torch.from_numpy(train_labels[idx]).float().to(self.device) - mean, logvar = self.ensemble_model(train_input, ret_log_var=True) - total_loss, mse_loss = self.ensemble_model.loss(mean, logvar, train_label) - self.ensemble_model.train_ensemble(total_loss) - 
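# Note: train_ensemble() also adds the soft penalty that keeps max_logvar/min_logvar
# tight (and optional weight decay) before stepping the shared Adam optimizer.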
train_mse_losses.append(mse_loss.detach().cpu().numpy().mean()) - - # validation - val_idx = np.vstack( - [np.random.permutation(holdout_inputs.shape[0]) for _ in range(self.network_size)] - ) - val_batch_size = 512 - val_losses_list = [] - len_valid = 0 - for start_pos in range(0, holdout_inputs.shape[0], val_batch_size): - with torch.no_grad(): - idx = val_idx[:, start_pos : start_pos + val_batch_size] - val_input = torch.from_numpy(holdout_inputs[idx]).float().to(self.device) - val_label = torch.from_numpy(holdout_labels[idx]).float().to(self.device) - holdout_mean, holdout_logvar = self.ensemble_model(val_input, ret_log_var=True) - _, holdout_mse_losses = self.ensemble_model.loss( - holdout_mean, holdout_logvar, val_label, inc_var_loss=False - ) - holdout_mse_losses = holdout_mse_losses.detach().cpu().numpy() - val_losses_list.append(holdout_mse_losses) - len_valid += 1 - val_losses = np.array(val_losses_list) - val_losses = np.sum(val_losses, axis=0) / len_valid - sorted_loss_idx = np.argsort(val_losses) - self.elite_model_idxes = sorted_loss_idx[: self.elite_size].tolist() - break_train = self._save_best(epoch, val_losses) - if break_train: - break - - train_mse_losses = np.array(train_mse_losses).mean() - val_mse_losses = val_losses - return train_mse_losses, val_mse_losses - - def _save_best(self, epoch, holdout_losses): - updated = False - for i, current_loss in enumerate(holdout_losses): - _, best = self._snapshots[i] - improvement = (best - current_loss) / best - if improvement > 0.01: - self._snapshots[i] = (epoch, current_loss) - updated = True - - if updated: - self._epochs_since_update = 0 - else: - self._epochs_since_update += 1 - return self._epochs_since_update > self._max_epochs_since_update - - def predict_t(self, inputs, batch_size=1024, repeat_network=False): - """Input type and output type both are tensor, used for planning loop""" - inputs = self.scaler.transform(inputs) - # input shape: [networ_size, (num_gaus+num_actor)*paritcle ,state_dim + action_dim] - ensemble_mean, ensemble_var = [], [] - for i in range(0, inputs.shape[0], batch_size): - model_input = inputs[i : min(i + batch_size, inputs.shape[0])].float().to(self.device) - # input shape: [networ_size, (num_gaus+num_actor)*paritcle ,state_dim + action_dim] - if repeat_network: - b_mean, b_var = self.ensemble_model( - model_input[None, :, :].repeat([self.network_size, 1, 1]), ret_log_var=False - ) - else: - b_mean, b_var = self.ensemble_model(model_input, ret_log_var=False) - - ensemble_mean.append(b_mean) - ensemble_var.append(b_var) - ensemble_mean = torch.cat(ensemble_mean, dim=1) - ensemble_var = torch.cat(ensemble_var, dim=1) - - return ensemble_mean, ensemble_var - - def predict(self, inputs, batch_size=1024): - """Input type and output type both are numpy""" - inputs = self.scaler.transform(inputs) - ensemble_mean, ensemble_var = [], [] - for i in range(0, inputs.shape[0], batch_size): - model_input = ( - torch.from_numpy(inputs[i : min(i + batch_size, inputs.shape[0])]) - .float() - .to(self.device) - ) - b_mean, b_var = self.ensemble_model( - model_input[None, :, :].repeat([self.network_size, 1, 1]), ret_log_var=False - ) - ensemble_mean.append(b_mean.detach().cpu().numpy()) - ensemble_var.append(b_var.detach().cpu().numpy()) - ensemble_mean = np.hstack(ensemble_mean) - ensemble_var = np.hstack(ensemble_var) - return ensemble_mean, ensemble_var diff --git a/omnisafe/algorithms/model_based/models/virtual_env.py b/omnisafe/algorithms/model_based/models/virtual_env.py deleted file mode 100644 index 
8c1b71b7d..000000000 --- a/omnisafe/algorithms/model_based/models/virtual_env.py +++ /dev/null @@ -1,249 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Virtual Environment""" - -import numpy as np -import torch - - -class VirtualEnv: - """Virtual environment for generating data or planning""" - - def __init__(self, algo, model, env_name, device=torch.device('cpu')): - self.algo = algo - self.model = model - self.env_name = env_name - self.device = device - if self.model.env_type == 'gym' and self.algo in ['MBPPOLag']: - self.state_start_dim = 0 - elif self.model.env_type == 'gym' and self.algo in ['CAP', 'SafeLOOP']: - self.state_start_dim = 1 - elif self.model.env_type == 'mujoco-velocity' and self.algo in [ - 'MBPPOLag', - 'CAP', - 'SafeLOOP', - ]: - self.state_start_dim = 2 - - def _termination_fn(self, env_name, obs, act, next_obs): - """Terminal function""" - if env_name == 'Hopper-v2': - assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 - - height = next_obs[:, 0] - angle = next_obs[:, 1] - not_done = ( - np.isfinite(next_obs).all(axis=-1) - * np.abs(next_obs[:, 1:] < 100).all(axis=-1) - * (height > 0.7) - * (np.abs(angle) < 0.2) - ) - - done = ~not_done - done = done[:, None] - return done - if env_name == 'Walker2d-v2': - assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 - - height = next_obs[:, 0] - angle = next_obs[:, 1] - not_done = (height > 0.8) * (height < 2.0) * (angle > -1.0) * (angle < 1.0) - done = ~not_done - done = done[:, None] - return done - if 'walker_' in env_name: - torso_height = next_obs[:, -2] - torso_ang = next_obs[:, -1] - if 'walker_7' in env_name or 'walker_5' in env_name: - offset = 0.0 - else: - offset = 0.26 - not_done = ( - (torso_height > 0.8 - offset) - * (torso_height < 2.0 - offset) - * (torso_ang > -1.0) - * (torso_ang < 1.0) - ) - done = ~not_done - done = done[:, None] - return done - - return False - - def _get_logprob(self, input_data, means, variances): - k = input_data.shape[-1] - log_prob = ( - -1 - / 2 - * ( - k * np.log(2 * np.pi) - + np.log(variances).sum(-1) - + (np.power(input_data - means, 2) / variances).sum(-1) - ) - ) - - # [ batch_size ] - prob = np.exp(log_prob).sum(0) - - # [ batch_size ] - log_prob = np.log(prob) - - stds = np.std(means, 0).mean(-1) - - return log_prob, stds - - # pylint: disable-next=too-many-locals - def mbppo_step(self, obs, act, idx=None, deterministic=False): - # pylint: disable-next=line-too-long - """use numpy input to predict single next state by randomly select one model result or select index model result.""" - if len(obs.shape) == 1: - obs = obs[None] - act = act[None] - return_single = True - else: - return_single = False - - if idx is None: - idx = self.model.elite_model_idxes - else: - idx = [idx] - inputs = np.concatenate((obs, act), axis=-1) - ensemble_model_means, ensemble_model_vars = self.model.predict(inputs) 
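# The ensemble predicts a delta over the current observation for the state dimensions
# (reward/cost outputs, if any, occupy the leading dimensions and stay absolute), so the
# current obs is added back to recover the absolute next state.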
- ensemble_model_means[:, :, self.state_start_dim :] += obs - - ensemble_model_stds = np.sqrt(ensemble_model_vars) - - if deterministic: - ensemble_samples = ensemble_model_means - else: - ensemble_samples = ( - ensemble_model_means - + np.random.normal(size=ensemble_model_means.shape) * ensemble_model_stds - ) - - _, batch_size, _ = ensemble_model_means.shape - model_idxes = np.random.choice(idx, size=batch_size) - batch_idxes = np.arange(0, batch_size) - - samples = ensemble_samples[model_idxes, batch_idxes] - if self.algo == 'MBPPOLag' and self.model.env_type == 'mujoco-velocity': - rewards, cost, next_obs = ( - samples[:, 0], - samples[:, 1], - samples[:, self.state_start_dim :], - ) - terminals = self._termination_fn(self.env_name, obs, act, next_obs) - elif self.algo == 'MBPPOLag' and self.model.env_type == 'gym': - next_obs = samples - rewards = None - cost = None - terminals = None - - if return_single: - next_obs = next_obs[0] - if self.model.env_type == 'mujoco-velocity': - rewards = rewards[0] - cost = cost[0] - - return next_obs, rewards, cost, terminals - - # pylint: disable-next=too-many-arguments,too-many-locals - def safeloop_step(self, obs, act, deterministic=False, all_model=False, repeat_network=False): - """Use tensor input to predict single next state by randomly select elite model result for online planning""" - if len(obs.shape) == 1: - obs = obs[None] - act = act[None] - - inputs = torch.cat((obs, act), dim=-1) - ensemble_model_means, ensemble_model_vars = self.model.predict_t( - inputs, repeat_network=repeat_network - ) - - ensemble_model_means[:, :, self.state_start_dim :] += obs - - ensemble_model_stds = torch.sqrt(ensemble_model_vars) - - if deterministic: - ensemble_samples = ensemble_model_means - else: - ensemble_samples = ( - ensemble_model_means - + torch.randn(size=ensemble_model_means.shape).to(self.device) * ensemble_model_stds - ) - - # use all dynamics model result - if all_model: - samples = ensemble_samples - samples_var = ensemble_model_vars - # only use elite model result - else: - _, batch_size, _ = ensemble_model_means.shape - model_idxes = np.random.choice(self.model.elite_model_idxes, size=batch_size) - batch_idxes = np.arange(0, batch_size) - samples = ensemble_samples[model_idxes, batch_idxes] - samples_var = ensemble_model_vars[model_idxes, batch_idxes] - - return samples, samples_var - - # pylint: disable-next=too-many-arguments, too-many-locals - def cap_step(self, obs, act, deterministic=False, all_model=True, repeat_network=False): - """Use tensor input to predict single next state by randomly select elite model result for online planning""" - if len(obs.shape) == 1: - obs = obs[None] - act = act[None] - - inputs = torch.cat((obs, act), dim=-1) - ensemble_model_means, ensemble_model_vars = self.model.predict_t( - inputs, repeat_network=repeat_network - ) - - ensemble_model_means[:, :, self.state_start_dim :] += obs - - ensemble_model_stds = torch.sqrt(ensemble_model_vars) - - if deterministic: - ensemble_samples = ensemble_model_means - else: - ensemble_samples = ( - ensemble_model_means - + torch.randn(size=ensemble_model_means.shape).to(self.device) * ensemble_model_stds - ) - - # use all dynamics model result - if all_model: - samples = ensemble_samples - samples_var = ensemble_model_vars - # only use elite model result - else: - _, batch_size, _ = ensemble_model_means.shape - model_idxes = np.random.choice(self.model.elite_model_idxes, size=batch_size) - batch_idxes = np.arange(0, batch_size) - samples = ensemble_samples[model_idxes, 
batch_idxes] - samples_var = ensemble_model_vars[model_idxes, batch_idxes] - - rewards, rewards_var = samples[:, :, 0].unsqueeze(2), samples_var[:, :, 0].unsqueeze(2) - next_obs, next_obs_var = ( - samples[:, :, self.state_start_dim :], - samples_var[:, :, self.state_start_dim :], - ) - output = { - 'state': (next_obs, next_obs_var), - 'reward': (rewards, rewards_var), - } - if self.model.env_type == 'mujoco-velocity': - cost, cost_var = samples[:, :, 1].unsqueeze(2), samples_var[:, :, 1].unsqueeze(2) - output['cost'] = (cost, cost_var) - - return output diff --git a/omnisafe/algorithms/model_based/planner.py b/omnisafe/algorithms/model_based/planner.py deleted file mode 100644 index 58a34a168..000000000 --- a/omnisafe/algorithms/model_based/planner.py +++ /dev/null @@ -1,919 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Safe controllers which do a black box optimization incorporating the constraint costs.""" - -import numpy as np -import scipy.stats as stats -import torch - - -class ARCPlanner: # pylint: disable=too-many-instance-attributes - """The Actor Regularized Control (ARC) Planner. - - References: - Title: Learning Off-Policy with Online Planning - Authors: Harshit Sikchi, Wenxuan Zhou, David Held. 
- URL: https://arxiv.org/abs/2008.10066 - """ - - # pylint: disable-next=too-many-locals,too-many-arguments - def __init__( - self, - algo, - cfgs, - device, - env, - models, - actor_critic, - horizon, - popsize, - particles, - max_iters, - alpha, - mixture_coefficient, - kappa, - safety_threshold, - minimal_elites, - obs_clip, - lagrangian_multiplier=None, - ): - self.algo = algo - self.cfgs = cfgs - self.device = device - self.obs_dim = env.observation_space.shape[0] - self.action_dim = env.action_space.shape[0] - self.env = env - self.models = models - self.actor_critic = actor_critic - self.termination_function = default_termination_function - self.horizon = horizon - self.sol_dim = self.env.action_space.shape[0] * horizon - self.action_max = np.repeat(self.env.action_space.high, self.horizon, axis=0) - self.action_min = np.repeat(self.env.action_space.low, self.horizon, axis=0) - self.mean = np.zeros((self.sol_dim,)) - # Shape: [ H * action_dim, 1 ] - self.num_gaussian_traj = popsize - self.mixture_coefficient = mixture_coefficient - self.num_actor_traj = int(self.mixture_coefficient * self.num_gaussian_traj) - - self.particles = particles - self.max_iters = max_iters - self.alpha_plan = alpha - self.kappa = kappa - self.safety_threshold = safety_threshold - self.minimal_elites = minimal_elites - self.state_start_dim = 2 if self.env.env_type == 'mujoco-velocity' else 1 - self.obs_clip = obs_clip - self.lagrangian_multiplier = lagrangian_multiplier - - def planner_reset(self): - """Reset planner when the episode end.""" - self.mean = np.zeros((self.sol_dim,)) - - def generate_actor_action(self, curr_state): - """Generate H steps deterministic and stochastic actor action trajectory using dynamics model.""" - # Set the reward of initial state to zero. 
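# Two model rollouts are generated from the current state: row 0 of actor_action_traj
# follows the deterministic (mean) policy action at every step, while the remaining rows
# repeat a single stochastic policy rollout; both are later mixed into the CEM candidate set.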
- actor_state = np.array( - [np.concatenate(([0] * self.state_start_dim, curr_state.copy()), axis=0)] - * (self.num_actor_traj) - ) - # Shape: [actor_traj, reward_dim (+ cost_dim) + state_dim] - - # Add trajectories using actions suggested by actors - actor_action_traj = np.zeros((self.num_actor_traj, self.sol_dim)) - # Shape: [actor_traj, H * action_dim] - - actor_state = torch.FloatTensor(actor_state).to(self.device) - # Shape: [actor_traj, reward_dim (+ cost_dim) + state_dim] - - actor_state_m = actor_state[0, :].reshape(1, -1) - # Shape: [1, reward_dim (+ cost_dim) + state_dim] - - actor_state_m2 = actor_state[1, :].reshape(1, -1) - # Shape: [1, reward_dim (+ cost_dim) + state_dim] - - for current_horizon in range(self.horizon): - # Use deterministic policy to plan a action trajectory - actor_actions_m, _, _ = self.actor_critic.step( - actor_state_m.reshape(1, -1)[:, self.state_start_dim :], deterministic=True - ) - # Shape: [1, action_dim] - actor_actions_m = torch.tensor(actor_actions_m).to(self.device) - # Use dynamics model to plan - actor_state_m, _ = self.models.safeloop_step( - actor_state_m[:, self.state_start_dim :], - actor_actions_m, - repeat_network=True, - ) - # Shape: [1, reward_dim + state_dim] - - # protection for producing nan - actor_state_m = torch.clamp(actor_state_m, -self.obs_clip, self.obs_clip) - actor_state_m = torch.nan_to_num(actor_state_m) - - # Store a planning action to action buffer - actor_action_traj[ - 0, current_horizon * self.action_dim : (current_horizon + 1) * self.action_dim - ] = (actor_actions_m.detach().cpu().numpy()) - - # Using Stochastic policy to plan a action trajectory - actor_actions, _, _ = self.actor_critic.step(actor_state_m2[:, self.state_start_dim :]) - # Shape: [1, action_dim] - actor_actions = torch.tensor(actor_actions).to(self.device) - - # Use dynamics model to plan - actor_state_m2, _ = self.models.safeloop_step( - actor_state_m2[:, self.state_start_dim :], - actor_actions, - repeat_network=True, - ) - # Shape: [1, reward_dim + state_dim] - - # protection for producing nan - actor_state_m2 = torch.clamp(actor_state_m2, -self.obs_clip, self.obs_clip) - actor_state_m2 = torch.nan_to_num(actor_state_m2) - - # Copy the planning action of stochastic actor (actor_traj-1) times, and store to action buffer - actor_action_traj[ - 1:, current_horizon * self.action_dim : (current_horizon + 1) * self.action_dim - ] = (actor_actions.detach().cpu().numpy()) - return actor_action_traj - - def compute_terminal_reward(self, action_traj, state_traj): - """Compute the terminal reward behind H horizon""" - # This is the final action for evaluating terminated reward and cost - final_action = ( - torch.from_numpy( - action_traj[ - :, - (self.horizon - 1) * self.action_dim : (self.horizon) * self.action_dim, - ].reshape((self.num_gaussian_traj + self.num_actor_traj) * self.particles, -1) - ) - .float() - .to(self.device) - ) - # Shape: [ (num_gau_traj + num_actor_traj) * particles, action_dim ) , - # action_traj Shape: [ (num_gau_traj + num_actor_traj) * particles, H * action_dim] - - final_action = final_action.repeat(self.models.model.network_size, 1) - # Shape: [ (num_gau_traj + num_actor_traj) * particles, network_size , action_dim ) - - # This is the final state for evaluating terminated reward and cost - final_state = ( - torch.from_numpy( - state_traj[self.horizon, :, :, self.state_start_dim :].reshape( - (self.num_gaussian_traj + self.num_actor_traj) - * self.particles - * state_traj.shape[1], - -1, - ) - ) - .float() - .to(self.device) - ) 
- # [ (num_gau_traj + num_actor_traj) * particles, state_dim ] - - terminal_reward = ( - self.actor_critic.critic(final_state, final_action)[0].cpu().detach().numpy() - ) - terminal_reward = terminal_reward.reshape(state_traj.shape[1], -1) - # [ (num_gau_traj + num_actor_traj) * particles, 1] - - return terminal_reward - - def compute_cost_from_state(self, state_traj): - """compute cost from state that dynamics model predict""" - states_flatten = state_traj[:, :, :, self.state_start_dim :].reshape(-1, self.obs_dim) - # [ horizon+1 * network_size * (num_gau_traj + num_actor_traj) * particles, state_dim] - - all_safety_costs = np.zeros((states_flatten.shape[0],)) - # [ horizon+1 * network_size * (num_gau_traj + num_actor_traj) * particles, 1] - - all_safety_costs = self.env.get_observation_cost(states_flatten) - # [ horizon+1 * network_size * (num_gau_traj + num_actor_traj) * particles, 1] - - all_safety_costs = all_safety_costs.reshape( - state_traj.shape[0], state_traj.shape[1], state_traj.shape[2], 1 - ) - # [ horizon+1, network_size, (num_gau_traj + num_actor_traj) * particles, 1] - return all_safety_costs - - # pylint: disable-next=too-many-statements,too-many-locals,too-many-branches - def get_action(self, curr_state): - """Select action when interact with environment.""" - # sample action from actor - if self.num_actor_traj != 0.0: - actor_action_traj = self.generate_actor_action(curr_state) - # Shape: [actor_traj, H * action_dim] - - curr_state = np.array( - [np.concatenate(([0] * self.state_start_dim, curr_state.copy()), axis=0)] - * ((self.num_gaussian_traj + self.num_actor_traj) * self.particles) - ) - # Shape: [(num_gau_traj + num_actor_traj) * particles, reward_dim (+ cost_dim) + state_dim] - - curr_state = np.expand_dims(curr_state, axis=0) - # Shape: [1, (num_gau_traj + num_actor_traj) * particles, reward_dim (+ cost_dim) +state_dim] - - curr_state = np.repeat(curr_state, self.models.model.network_size, 0) - # Shape: [network_size, (num_gau_traj + num_actor_traj) * particles, reward_dim (+ cost_dim) + state_dim] - - # initial mean and var of the sampling normal dist - # shift the current array to the left, clear the used action - self.mean[: -self.action_dim] = self.mean[self.action_dim :] - # Shape: [ H * action_dim, 1 ] - - # fill the last position with the last second action - self.mean[-self.action_dim :] = self.mean[-2 * self.action_dim : -self.action_dim] - mean = self.mean - # Shape: [ H * action_dim, 1 ] - - var = np.tile( - np.square(self.env.action_space.high[0] - self.env.action_space.low[0]) / 16, - [self.sol_dim], - ) - # Shape: [ H * action_dim, 1 ] - - # Create gaussian distribution. 
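# (A unit truncated normal on [-2, 2]; each CEM iteration rescales its samples by the
# current variance, clipped so that two standard deviations stay within the action
# bounds, and shifts them by the current mean.)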
- # mean is the zero vector, var is Unit Matrix - gaussian = stats.truncnorm(-2, 2, loc=np.zeros_like(mean), scale=np.ones_like(mean)) - - current_iter = 0 - while current_iter < self.max_iters: - lb_dist, ub_dist = mean - self.action_min, self.action_max - mean - - constrained_var = np.minimum( - np.minimum(np.square(lb_dist / 2), np.square(ub_dist / 2)), var - ) - - # Generate random normal gaussian variable and multiply by the var, then add to the mean - action_traj = ( - gaussian.rvs(size=(self.num_gaussian_traj, self.sol_dim)) * np.sqrt(constrained_var) - + mean - ).astype(np.float32) - # Shape: [ N , H * action_dim] - - if self.num_actor_traj != 0: - # Combine the actor action with gaussian action - action_traj = np.concatenate((action_traj, actor_action_traj), axis=0) - # Shape: [ num_gau_traj + num_actor_traj, H * action_dim] - - # Multiple particles go through the same action sequence - action_traj = np.repeat(action_traj, self.particles, axis=0) - # Shape: [ particles, num_gau_traj + num_actor_traj, H * action_dim] - - # actions clipped between -1 and 1 - action_traj = np.clip(action_traj, -1, 1) - # Shape: [ particles, num_gau_traj + num_actor_traj, H * action_dim] - - state_traj = ( - torch.from_numpy(np.expand_dims(curr_state.copy(), axis=0)).float().to(self.device) - ) - # Shape: [1, network_size, (num_gau_traj + num_actor_traj) * particles, reward_dim (+ cost_dim) + state_dim] - - var_traj = ( - torch.zeros([1, curr_state.shape[0], curr_state.shape[1], 1]) - .float() - .to(self.device) - ) - # Shape: [1, network_size, (num_gau_traj + num_actor_traj) * particles, 1] - actions = np.repeat( - np.expand_dims(action_traj, axis=0), self.models.model.network_size, axis=0 - ) - # Shape: [ network_size, particles, num_gau_traj + num_actor_traj, H * action_dim] - - actions = torch.FloatTensor(actions).to(self.device) - # Shape: [ network_size, particles, num_gau_traj + num_actor_traj, H * action_dim] - for current_horizon in range(self.horizon): - states_h = state_traj[current_horizon, :, :, self.state_start_dim :] - # [ network_size, (num_gau_traj + num_actor_traj) * particles, state_dim] - # use all dynamics model to predict next state (all_model=True) - next_states, next_var = self.models.safeloop_step( - states_h, - actions[ - :, - :, - current_horizon * self.action_dim : (current_horizon + 1) * self.action_dim, - ], - all_model=True, - repeat_network=False, - ) - # next_states and var shape: - # [ network_size, (num_gau_traj + num_actor_traj) * particles, reward_dim (+ cost_dim) + state_dim] - - # protection for producing nan in rare cases - next_states = torch.clamp(next_states, -self.obs_clip, self.obs_clip) - next_states = torch.nan_to_num(next_states) - - state_traj = torch.cat((state_traj, next_states.unsqueeze(0)), axis=0) - # pylint: disable-next=line-too-long - # [ horizon + 1, network_size, (num_gau_traj + num_actor_traj) * particles, reward_dim (+ cost_dim) + state_dim] - - next_var = next_var[:, :, self.state_start_dim :].sqrt().norm(dim=2).unsqueeze(2) - # [network_size, (num_gau_traj + num_actor_traj) * particles,1] - - var_traj = torch.cat((var_traj, next_var.unsqueeze(0)), axis=0) - # [ horizon + 1, network_size, (num_gau_traj + num_actor_traj) * particles, 1] - - state_traj = state_traj.cpu().detach().numpy() - # pylint: disable-next=line-too-long - # [ horizon + 1, network_size, (num_gau_traj + num_actor_traj) * particles, reward_dim (+ cost_dim) + state_dim] - - var_traj_numpy = var_traj.detach().cpu().numpy() - del var_traj - - if self.env.env_type == 
'mujoco-terminated': - done = np.zeros((state_traj.shape[1], state_traj.shape[2], 1)) - # [network_size, (num_gau_traj + num_actor_traj) * particles, 1] - - # Set the reward of terminated states to zero - for current_horizon in range(1, self.horizon + 1): - for ens in range(state_traj.shape[1]): - # check the state whether terminate - done[ens, :, :] = np.logical_or( - done[ens, :, :], - self.termination_function( - None, - None, - state_traj[current_horizon, ens, :, self.state_start_dim :], - ), - ) - not_done = 1 - done[ens, :, :] - # Set the reward of terminated states to zero - state_traj[current_horizon, ens, :, 0] *= not_done.astype( - np.float32 - ).reshape(-1) - - # Find average cost of each trajectory - returns = np.zeros((self.num_gaussian_traj + self.num_actor_traj,)) - safety_costs = np.zeros((self.num_gaussian_traj + self.num_actor_traj,)) - trajectory_max_vars = np.zeros((self.num_gaussian_traj + self.num_actor_traj,)) - - # Shape: [ num_gau_traj + num_actor_traj, 1 ] - if self.algo == 'SafeLOOP': - terminal_reward = self.compute_terminal_reward(action_traj, state_traj) - # [ (num_gau_traj + num_actor_traj) * particles, 1] - - if self.env.env_type == 'gym': - all_safety_costs = self.compute_cost_from_state(state_traj) - # [ horizon+1, network_size, (num_gau_traj + num_actor_traj) * particles, 1] - - # Calculate the average reward and max cost of N action trajectory, - # each action trajectory have generated (network_size * particles) state-action trajectory - # using (network_size * particles) ensemble models. - for ensemble in self.models.model.elite_model_idxes: - if self.algo == 'SafeLOOP': - if self.env.env_type == 'mujoco-terminated': - done[ensemble, :, :] = np.logical_or( - done[ensemble, :, :], - self.termination_function( - None, - None, - state_traj[self.horizon - 1, ensemble, :, self.state_start_dim :], - ), - ) - not_done = 1 - done[ensemble, :, :] - q_rews = terminal_reward[ensemble, :] * not_done.reshape(-1) - else: - q_rews = terminal_reward[ensemble, :] - - traj_indices = np.arange(0, self.num_gaussian_traj + self.num_actor_traj, 1).astype( - int - ) - for particle in range(self.particles): - returns[traj_indices] += np.sum( - state_traj[ - 1 : self.horizon + 1, - ensemble, - traj_indices * self.particles + particle, - 0, - ], - axis=0, - ) - if self.algo == 'SafeLOOP': - returns[traj_indices] += q_rews.reshape(-1)[ - traj_indices * self.particles + particle - ] - if self.env.env_type == 'gym': - # use state that dynamics predict to compute cost - safety_costs[traj_indices] = np.maximum( - safety_costs, - np.sum( - all_safety_costs[ - 0 : self.horizon, - ensemble, - traj_indices * self.particles + particle, - 0, - ], - axis=0, - ), - ) - elif self.env.env_type == 'mujoco-velocity': - # use cost that dynamics predict at dimension one - safety_costs[traj_indices] += np.sum( - state_traj[ - 1 : self.horizon + 1, - ensemble, - traj_indices * self.particles + particle, - 1, - ], - axis=0, - ) - if self.algo == 'CAP': - trajectory_max_vars[traj_indices] += np.maximum( - trajectory_max_vars, - np.sum( - var_traj_numpy[ - 1 : self.horizon + 1, - ensemble, - traj_indices * self.particles + particle, - 0, - ], - axis=0, - ), - ) - returns /= state_traj.shape[1] * self.particles - # [ num_gau_traj + num_actor_traj, 1] - - if self.algo == 'SafeLOOP': - new_mean, new_var, safety_costs_mean, fail_flag = self.safe_loop_elite_select( - returns, safety_costs, action_traj - ) - if fail_flag is False: - mean = new_mean - else: # rare case for protecting bug - break - elif 
self.algo == 'CAP': - safety_costs /= state_traj.shape[1] * self.particles - if self.cfgs.cost_gamma == 1.0: - c_gamma_discount = self.cfgs.max_ep_len / self.horizon - # Extend the cost to the entire trajectory - else: - c_gamma_discount = (1 / self.horizon) * ( - (1 - self.cfgs.cost_gamma**self.cfgs.max_ep_len) - / (1 - self.cfgs.cost_gamma) - ) - # average the cost, then view it as the starting element of the arithmetic progression - safety_costs = c_gamma_discount * safety_costs - - penalty = torch.nn.ReLU()(self.lagrangian_multiplier).item() - safety_costs = safety_costs + penalty * trajectory_max_vars - mean, new_var, safety_costs_mean = self.cap_elite_select( - returns, safety_costs, action_traj - ) - var = (self.alpha_plan) * var + (1 - self.alpha_plan) * new_var - current_iter += 1 - - del state_traj, action_traj - - # Initialize the var every 6 times - if (current_iter + 1) % 6 == 0: - var = np.tile( - np.square(self.env.action_space.high[0] - self.env.action_space.low[0]) / 16.0, - [self.sol_dim], - ) * (1.5 ** ((current_iter + 1) // 6)) - - # If safe trajectory not enough and t>5 or t>25 ,then break - if ( - ((safety_costs < self.safety_threshold).sum() >= self.minimal_elites) - and current_iter > 5 - ) or current_iter > 25: - break - - # Store the mean and use it in next plan - self.mean = mean - - # Return [1, action_dim], that is the first action of H horizon action mean, which shape is [1, H * action_dim] - return mean[: self.action_dim], safety_costs_mean - - def cap_elite_select(self, returns, safety_costs, action_traj): - """TODO""" - # returns: [ num_gau_traj + num_actor_traj, 1] - # safety_costs: [ num_gau_traj + num_actor_traj, 1] - # action_traj: [ (num_gau_traj + num_actor_traj) * particle, H * action_dim] - safety_costs_mean = np.mean(safety_costs) - if (safety_costs < self.safety_threshold).sum() < self.minimal_elites: - indices = np.argsort(safety_costs) - indices *= self.particles - elites = action_traj[indices][: self.minimal_elites] - else: - costs = ( - -returns * (safety_costs < self.safety_threshold) - + (safety_costs >= self.safety_threshold) * 1e4 - ) - indices = np.argsort(costs) - indices = np.array([idx for idx in indices if costs[idx] < 1e3]) - indices *= self.particles - elites = action_traj[indices][: min(self.minimal_elites, indices.shape[0])] - mean = np.mean(elites, axis=0) - new_var = np.var(elites, axis=0) - return mean, new_var, safety_costs_mean - - def cap_elite_selection(self, returns, safety_costs, action_traj): - """TODO""" - # returns: [ num_gau_traj + num_actor_traj, 1] - # safety_costs: [ num_gau_traj + num_actor_traj, 1] - # action_traj: [ (num_gau_traj + num_actor_traj) * particle, H * action_dim] - all_action = action_traj[ - np.arange(0, self.num_gaussian_traj + self.num_actor_traj, 1).astype(int) - * self.particles, - :, - ] - # all_action is [ num_gau_traj + num_actor_traj, H * action_dim] - - # find the index for safe trajectories - feasible_ids = (safety_costs <= self.safety_threshold).nonzero()[0] - if feasible_ids.shape[0] < self.minimal_elites: - # if safe trajectories not enough - elite_ids = np.argsort(safety_costs)[: self.minimal_elites] - else: - # if have enough safe trajectories - # select the top k reward in safe action trajectories - elite_ids = feasible_ids[np.argsort(-returns[feasible_ids])][: self.minimal_elites] - - elite_action = all_action[elite_ids] - # [ elite_ids, H * action_dim] - - mean = np.mean(elite_action, axis=0) - # [ 1, H * action_dim] - - var = np.var(elite_action, axis=0) - # [ 1, H * action_dim] - 
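# The mean and variance over the elite action sequences parameterize the refined
# sampling distribution for the next planning iteration.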
- return mean, var - - def safe_loop_elite_select(self, returns, safety_costs, action_traj): - """Update mean and var using reward and cost""" - # returns: [ num_gau_traj + num_actor_traj, 1] - # safety_costs: [ num_gau_traj + num_actor_traj, 1] - # action_traj: [ (num_gau_traj + num_actor_traj) * particle, H * action_dim] - safety_costs_mean = np.mean(safety_costs) - - if (safety_costs < self.safety_threshold).sum() < self.minimal_elites: - safety_rewards = -safety_costs - # [ num_gau_traj + num_actor_traj, 1] - - max_safety_reward = np.max(safety_rewards) - # [1,1] - - score = np.exp(self.kappa * (safety_rewards - max_safety_reward)) - # [ num_gau_traj + num_actor_traj, 1] - - indices = np.argsort(safety_costs) - # [ num_gau_traj + num_actor_traj, 1] - - mean = np.sum( - action_traj[ - np.arange(0, self.num_gaussian_traj + self.num_actor_traj, 1).astype(int) - * self.particles, - :, - ] - * score.reshape(-1, 1), - axis=0, - ) / (np.sum(score) + 1e-10) - # mean: [1, H * action_dim], - # action_traj: [ num_gau_traj + num_actor_traj, H * action_dim], - # score: [ num_gau_traj + num_actor_traj, 1] - - new_var = np.average( - ( - action_traj[ - np.arange(0, self.num_gaussian_traj + self.num_actor_traj, 1).astype(int) - * self.particles, - :, - ] - - mean - ) - ** 2, - weights=score.reshape(-1), - axis=0, - ) - # [ 1, H * action_dim] - - else: # if have enough safe trajectory - # safe trajectory's costs is -reward, unsafe trajectory's costs is 1e4 - costs = ( - -returns * (safety_costs < self.safety_threshold) - + (safety_costs >= self.safety_threshold) * 1e4 - ) - # [ num_gau_traj + num_actor_traj, 1] - - # select indices of safe trajectory - indices = np.arange(costs.shape[0]) - indices = np.array([idx for idx in indices if costs[idx] < 1e3]) - # [ num_safe_traj, 1] - - # rare case - if indices.shape[0] == 0 or action_traj.shape[0] == 0: - return False, False, False, True - - safe_action_traj = action_traj[ - np.arange(0, self.num_gaussian_traj + self.num_actor_traj, 1).astype(int) - * self.particles, - :, - ][indices, :] - # [ num_safe_traj, H * action_dim] - - # use safe trajectory and its reward as weight to update - rewards = -costs[indices] - # [ num_safe_traj, 1 ] - - max_reward = np.max(rewards) - # [ 1, 1 ] - - score = np.exp(self.kappa * (rewards - max_reward)) - # [ num_safe_traj, 1 ] - - mean = np.sum(safe_action_traj * score.reshape(-1, 1), axis=0) / (np.sum(score) + 1e-10) - # [1, H * action_dim] = [1, H * action_dim] / [1,1] - - new_var = np.average((safe_action_traj - mean) ** 2, weights=score.reshape(-1), axis=0) - # [ 1, H * action_dim] - return mean, new_var, safety_costs_mean, False - - -def default_termination_function(state, action, next_state): # pylint: disable=unused-argument - '# Default termination function that outputs done=False' - if torch.is_tensor(next_state): - done = torch.zeros((next_state.shape[0], 1)) - else: - done = np.zeros((next_state.shape[0], 1)) - return done - - -# pylint: disable-next=too-many-instance-attributes -class CCEPlanner: - """Constrained Cross-Entropy (CCE) Planner. - - References: - Title: Constrained Cross-Entropy Method for Safe Reinforcement Learning - Authors: Min Wen, Ufuk Topcu. 
- URL: https://proceedings.neurips.cc/paper/2018/hash/34ffeb359a192eb8174b6854643cc046-Abstract.html - """ - - # pylint: disable-next=too-many-locals, too-many-arguments - def __init__( - self, - algo, - cfgs, - device, - env, - models, - horizon, - popsize, - particles, - max_iters, - alpha, - mixture_coefficient, - minimal_elites, - epsilon, - obs_clip, - lagrangian_multiplier, - cost_constrained=True, - penalize_uncertainty=True, - ): - self.algo = algo - self.cfgs = cfgs - self.obs_dim, self.action_dim = env.observation_space.shape[0], env.action_space.shape[0] - self.action_max, self.action_min = env.action_space.high, env.action_space.low - self.gamma = self.cfgs.gamma - self.c_gamma = self.cfgs.cost_gamma - self.cost_limit = self.cfgs.lagrange_cfgs.cost_limit - self.cost_constrained = cost_constrained - self.penalize_uncertainty = penalize_uncertainty - self.device = device - self.obs_clip = obs_clip - self.particles = particles - self.horizon = horizon - self.num_gaussian_traj = popsize - self.minimal_elites = minimal_elites - self.max_iters = max_iters - self.alpha = alpha - self.epsilon = epsilon - self.horizin_action_min = np.tile(self.action_min, [self.horizon]) - self.horizin_action_max = np.tile(self.action_max, [self.horizon]) - self.env = env - self.ac_buf = np.array([]).reshape(0, self.action_dim) - self.prev_sol = np.tile((self.action_min + self.action_max) / 2, [self.horizon]) - self.init_var = np.tile(np.square(self.action_max - self.action_min) / 16, [self.horizon]) - self.state_start_dim = 2 if self.env.env_type == 'mujoco-velocity' else 1 - self.mixture_coefficient = mixture_coefficient - self.lagrangian_multiplier = lagrangian_multiplier - self.models = models - self.elites = None - - def get_action(self, obs): - """Get action from previous solution or planner""" - if self.models is None: - return np.random.uniform(self.action_min, self.action_max, self.action_min.shape) - if self.ac_buf.shape[0] > 0: - action, self.ac_buf = self.ac_buf[0], self.ac_buf[1:] - return action - - soln = self.obtain_solution(obs, self.prev_sol, self.init_var) - self.prev_sol = np.concatenate( - [np.copy(soln)[self.action_dim :], np.zeros(self.action_dim)] - ) - self.ac_buf = soln[: self.action_dim].reshape(-1, self.action_dim) - - return self.get_action(obs) - - # pylint: disable-next=too-many-locals - def obtain_solution(self, obs, init_mean, init_var): - """Get action from planner""" - mean, var, iteration = init_mean, init_var, 0 - gaussian = stats.truncnorm(-2, 2, loc=np.zeros_like(mean), scale=np.ones_like(var)) - - while (iteration < self.max_iters) and np.max(var) > self.epsilon: - lb_dist, ub_dist = mean - self.horizin_action_min, self.horizin_action_max - mean - constrained_var = np.minimum( - np.minimum(np.square(lb_dist / 2), np.square(ub_dist / 2)), var - ) - - noise = gaussian.rvs(size=[self.num_gaussian_traj, self.horizon * self.action_dim]) - - samples = noise * np.sqrt(constrained_var) + mean - samples = samples.astype(np.float32) - - rewards, costs, eps_lens = self.rollout(obs, samples) - epoch_ratio = np.ones_like(eps_lens) * self.cfgs.max_ep_len / self.horizon - terminated = eps_lens != self.horizon - if self.c_gamma == 1: - c_gamma_discount = epoch_ratio - else: - c_gamma_discount = ( - (1 - self.c_gamma ** (epoch_ratio * self.horizon)) - / (1 - self.c_gamma) - / self.horizon - ) - rewards = rewards * epoch_ratio - costs = costs * c_gamma_discount - - feasible_ids = ((costs <= self.cost_limit) & (~terminated)).nonzero()[0] - if self.cost_constrained: - if 
feasible_ids.shape[0] >= self.minimal_elites: - elite_ids = feasible_ids[np.argsort(-rewards[feasible_ids])][ - : self.minimal_elites - ] - else: - elite_ids = np.argsort(costs)[: self.minimal_elites] - else: - elite_ids = np.argsort(-rewards)[: self.minimal_elites] - self.elites = samples[elite_ids] - new_mean = np.mean(self.elites, axis=0) - new_var = np.var(self.elites, axis=0) - mean = self.alpha * mean + (1 - self.alpha) * new_mean - var = self.alpha * var + (1 - self.alpha) * new_var - iteration += 1 - - return mean - - @torch.no_grad() - def rollout(self, obs, ac_seqs): - """Roll out H step to compute reward, cost""" - # obs: [obs_dim,] - # ac_seqs: [num_gaussian_traj, horizon * action_dim] - - ac_seqs = torch.from_numpy(ac_seqs).float().to(self.device) - ac_seqs = ac_seqs.view(-1, self.horizon, self.action_dim) - transposed = ac_seqs.transpose(0, 1) - expanded = transposed[:, :, None] - tiled = expanded.expand(-1, -1, self.particles, -1) - ac_seqs = tiled.contiguous().view(self.horizon, -1, self.action_dim) - - # Expand current observation - cur_obs = torch.from_numpy(obs).float().to(self.device) - cur_obs = cur_obs[None] - cur_obs = cur_obs.expand(self.num_gaussian_traj * self.particles, -1) - # cur_obs: [num_gaussian_traj * particles, obs_dim] - rewards = torch.zeros(self.num_gaussian_traj, self.particles, device=self.device) - costs = torch.zeros(self.num_gaussian_traj, self.particles, device=self.device) - length = torch.zeros(self.num_gaussian_traj, self.particles, device=self.device) - - for horizon in range(self.horizon): - cur_acs = ac_seqs[horizon] - cur_obs, reward, cost = self._predict_next(cur_obs, cur_acs) - # Clip state value - cur_obs = torch.clamp(cur_obs, -self.obs_clip, self.obs_clip) - reward = reward.view(-1, self.particles) - cost = cost.view(-1, self.particles) - - rewards += reward - costs += cost - length += 1 - - # Replace nan with high cost - rewards = rewards.nan_to_num_(-1e6) - costs = costs.nan_to_num_(1e6) - - return ( - rewards.mean(dim=1).detach().cpu().numpy(), - costs.mean(dim=1).detach().cpu().numpy(), - length.mean(dim=1).detach().cpu().numpy(), - ) - - def _predict_next(self, obs, acs): - """Predict next state, reward and cost""" - # obs: [num_gaussian_traj * particles, obs_dim] - proc_obs = self._expand_to_ts_format(obs) - # [network_size, num_gaussian_traj*particles/network_size, state_dim] - proc_acs = self._expand_to_ts_format(acs) - output = self.models.cap_step(proc_obs, proc_acs) - next_obs, var = output['state'] - # [network_size, num_gaussian_traj*particles/network_size, state_dim] - reward, _ = output['reward'] - # [network_size, num_gaussian_traj*particles, 1] - reward = self._flatten_to_matrix(reward) - # [network_size * num_gaussian_traj * particles, 1] - - if self.env.env_type == 'mujoco-velocity': - cost, _ = output['cost'] - cost = self._flatten_to_matrix(cost) - elif self.env.env_type == 'gym': - next_obs_cost = next_obs.unsqueeze(0) - cost = self.compute_cost_from_state(next_obs_cost) - cost = torch.tensor(cost, device=self.device) - # [1, network_size, num_gaussian_traj*particles/network_size, 1] - cost = cost.squeeze(0) - # [network_size, num_gaussian_traj*particles/network_size, 1] - cost = self._flatten_to_matrix(cost) - # [num_gaussian_traj*particles, 1] - - next_obs = self._flatten_to_matrix(next_obs) - - obs = obs.detach().cpu().numpy() - acs = acs.detach().cpu().numpy() - - if self.cost_constrained and self.penalize_uncertainty: - # var: [network_size, num_gaussian_traj*particles/network_size, state_dim] - var_penalty = 
var.sqrt().norm(dim=2).max(0)[0] - # cost_penalty: [num_gaussian_traj*particles/network_size] - var_penalty = var_penalty.repeat_interleave(self.models.model.network_size).view( - cost.shape - ) - # cost_penalty: [num_gaussian_traj*particles, 1] - penalty = torch.nn.ReLU()(self.lagrangian_multiplier).item() - cost += penalty * var_penalty - - return next_obs, reward, cost - - def _expand_to_ts_format(self, mat): - """Expand input to ensemble network input format""" - dim = mat.shape[-1] - # eg:state_dim - reshaped = mat.view( - -1, - self.models.model.network_size, - self.particles // self.models.model.network_size, - dim, - ) - # [num_gaussian_traj, network_size, particles // network_size, state_dim] - transposed = reshaped.transpose(0, 1) - # [network_size, num_gaussian_traj, particles // network_size, state_dim] - reshaped = transposed.contiguous().view(self.models.model.network_size, -1, dim) - # [network_size, num_gaussian_traj * particles / network_size, state_dim] - - return reshaped - - def _flatten_to_matrix(self, ts_fmt_arr): - """Flatten ensemble network output format to matrix""" - - dim = ts_fmt_arr.shape[-1] - reshaped = ts_fmt_arr.view( - self.models.model.network_size, - -1, - self.particles // self.models.model.network_size, - dim, - ) - transposed = reshaped.transpose(0, 1) - reshaped = transposed.contiguous().view(-1, dim) - return reshaped - - def compute_cost_from_state(self, state_traj): - """compute cost from state that dynamics model predict""" - states_flatten = state_traj[:, :, :, :].reshape(-1, self.obs_dim) - # [ horizon+1 * network_size * (num_gau_traj + num_actor_traj) * particles, state_dim] - - all_safety_costs = np.zeros((states_flatten.shape[0],)) - # [ horizon+1 * network_size * (num_gau_traj + num_actor_traj) * particles, 1] - - all_safety_costs = self.env.get_observation_cost(states_flatten) - # [ horizon+1 * network_size * (num_gau_traj + num_actor_traj) * particles, 1] - - all_safety_costs = all_safety_costs.reshape( - state_traj.shape[0], state_traj.shape[1], state_traj.shape[2], 1 - ) - # [ horizon+1, network_size, (num_gau_traj + num_actor_traj) * particles, 1] - return all_safety_costs diff --git a/omnisafe/algorithms/model_based/policy_gradient.py b/omnisafe/algorithms/model_based/policy_gradient.py deleted file mode 100644 index 310a7fb72..000000000 --- a/omnisafe/algorithms/model_based/policy_gradient.py +++ /dev/null @@ -1,304 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""The Policy Gradient algorithm in model-based setting.""" - -import time -from copy import deepcopy - -import numpy as np -import torch - -from omnisafe.algorithms import registry -from omnisafe.algorithms.model_based.models import EnsembleDynamicsModel, VirtualEnv -from omnisafe.common.buffer import OffPolicyBuffer -from omnisafe.common.logger import Logger -from omnisafe.models.constraint_actor_critic import ConstraintActorCritic -from omnisafe.utils import core -from omnisafe.utils.distributed_utils import proc_id -from omnisafe.wrappers import wrapper_registry - - -@registry.register -class PolicyGradientModelBased: # pylint: disable=too-many-instance-attributes - """The Policy Gradient algorithm in Model-Based setting. - - References: - Title: Policy Gradient Methods for Reinforcement Learning with Function Approximation - Authors: Richard S. Sutton, David McAllester, Satinder Singh, Yishay Mansour. - URL: https://proceedings.neurips.cc/paper/1999/file/464d828b85b0bed98e80ade0a5c43b0f-Paper.pdf - """ - - def __init__(self, env_id, cfgs=None) -> None: - self.env_id = env_id - self.cfgs = deepcopy(cfgs) - self.algo = self.__class__.__name__ - self.wrapper_type = self.cfgs.wrapper_type - self.env = wrapper_registry.get(self.wrapper_type)(self.algo, self.env_id) - - self.device = torch.device(self.cfgs.device) - self.cost_gamma = self.cfgs.cost_gamma - # Set up logger and save configuration to disk - # Get local parameters before logger instance to avoid unnecessary print - self.logger = Logger( - output_dir=cfgs.data_dir, - exp_name=cfgs.exp_name, - seed=cfgs.seed, - use_tensorboard=cfgs.use_tensorboard, - use_wandb=cfgs.use_wandb, - config=cfgs, - ) - - # Set seed - seed = int(cfgs.seed) - seed += 10000 * proc_id() - torch.manual_seed(seed) - np.random.seed(seed) - - # Set env - self.env.env.reset(seed=seed) - self.env.set_eplen(int(self.cfgs.max_ep_len)) - - # Initialize dynamics model - self.dynamics = EnsembleDynamicsModel( - self.algo, - self.env.env_type, - self.device, - state_size=self.env.dynamics_state_size, - action_size=self.env.action_space.shape[0], - reward_size=1, - cost_size=1, - **self.cfgs.dynamics_cfgs, - ) - self.virtual_env = VirtualEnv(self.algo, self.dynamics, self.env_id, self.device) - - # Initialize off-policy buffer - self.off_replay_buffer = OffPolicyBuffer( - obs_space=self.env.observation_space, - act_space=self.env.action_space, - size=self.cfgs.replay_size, - batch_size=self.cfgs.batch_size, - device=self.device, - ) - - if self.algo in ['MBPPOLag', 'SafeLOOP']: - self.use_actor = True - self.actor_critic = self.set_algorithm_specific_actor_critic() - else: - self.use_actor = False - - # Setup statistics - self.start_time = time.time() - self.epoch_time = time.time() - - self.logger.log('Start with training.') - - self._init_log() - - def _init_log(self): - self.logger.register_key('TotalEnvSteps3') - self.logger.register_key('Metrics/EpRet') - self.logger.register_key('Metrics/EpCost') - self.logger.register_key('Metrics/EpLen') - self._specific_init_logs() - self.logger.register_key('Time') - - def _specific_init_logs(self): - pass - - def learn(self): # pylint: disable=too-many-locals - """training the policy.""" - self.start_time = time.time() - ep_len, ep_ret, ep_cost = 0, 0, 0 - state = self.env.reset() - time_step = 0 - last_policy_update, last_dynamics_update, last_log = 0, 0, 0 - while time_step < self.cfgs.max_real_time_steps: - # select action - action, 
action_info = self.select_action(time_step, state, self.env) - - next_state, reward, cost, terminated, truncated, info = self.env.step( - action, self.cfgs.action_repeat - ) - - time_step += info['step_num'] - ep_cost += (self.cost_gamma**ep_len) * cost - ep_len += 1 - ep_ret += reward - self.store_real_data( - time_step, - ep_len, - state, - action_info, - action, - reward, - cost, - terminated, - truncated, - next_state, - info, - ) - - state = next_state - if terminated or truncated: - self.logger.store( - **{ - 'Metrics/EpRet': ep_ret, - 'Metrics/EpLen': ep_len * self.cfgs.action_repeat, - 'Metrics/EpCost': ep_cost, - } - ) - ep_ret, ep_cost, ep_len = 0, 0, 0 - state = self.env.reset() - self.algo_reset() - - if ( - time_step % self.cfgs.update_dynamics_freq < self.cfgs.action_repeat - and time_step - last_dynamics_update >= self.cfgs.update_dynamics_freq - ): - self.update_dynamics_model() - last_dynamics_update = time_step - - if ( - self.use_actor - and time_step % self.cfgs.update_policy_freq < self.cfgs.action_repeat - and time_step - last_policy_update >= self.cfgs.update_policy_freq - ): - self.update_actor_critic(time_step) - last_policy_update = time_step - - # Evaluate episode - if ( - time_step % self.cfgs.log_freq < self.cfgs.action_repeat - and time_step - last_log >= self.cfgs.log_freq - ) or time_step == self.cfgs.max_real_time_steps - 1: - self.log(time_step) - self.logger.torch_save() - last_log = time_step - # Close opened files to avoid number of open files overflow - self.logger.close() - - def log(self, time_step: int): - """ - logging data - """ - # Some child classes may add information to logs - self.algorithm_specific_logs(time_step) - - self.logger.store( - **{ - 'TotalEnvSteps3': time_step, - 'Time': int(time.time() - self.start_time), - } - ) - - self.logger.dump_tabular() - - def select_action(self, time_step, state, env): # pylint: disable=unused-argument - """ - Select action when interact with real environment. - - Returns: - action, action_info - """ - if self.env.env_type == 'gym': - state = env.generate_lidar(state) - state_vec = np.array(state) - state_tensor = torch.as_tensor(state_vec, device=self.device, dtype=torch.float32) - action, val, cval, logp = self.actor_critic.step(state_tensor) - action = np.nan_to_num(action) - action_info = {'state_vec': state_vec, 'val': val, 'cval': cval, 'logp': logp} - return action, action_info - - def algorithm_specific_logs(self, time_step): - """ - Use this method to collect log information. - e.g. log lagrangian for lagrangian-base , log q, r, s, c for CPO, etc - - Returns: - No return - """ - - def update_actor_critic(self, time_step): - """ - Use this method to update actor and critic. - - Returns: - No return - """ - - def set_algorithm_specific_actor_critic(self): - """ - Use this method to initialize network. - e.g. 
Initialize Soft Actor Critic - - Returns: - Actor_critic - """ - self.actor_critic = ConstraintActorCritic( - observation_space=self.env.observation_space, - action_space=self.env.action_space, - model_cfgs=self.cfgs.model_cfgs, - ).to(self.device) - # Set up optimizer for policy and value function - - self.actor_optimizer = core.set_optimizer( - 'Adam', module=self.actor_critic.actor, learning_rate=self.cfgs.actor_lr - ) - self.reward_critic_optimizer = core.set_optimizer( - 'Adam', module=self.actor_critic.reward_critic, learning_rate=self.cfgs.critic_lr - ) - self.cost_critic_optimizer = core.set_optimizer( - 'Adam', module=self.actor_critic.cost_critic, learning_rate=self.cfgs.critic_lr - ) - - return self.actor_critic - - def update_dynamics_model(self): - """ - training the dynamics model - - Returns: - No return - """ - - def algo_reset(self): - """ - reset algo parameters - - Returns: - No return - """ - - # pylint: disable-next=too-many-arguments - def store_real_data( - self, - time_step, - ep_len, - state, - action_info, - action, - reward, - cost, - terminated, - truncated, - next_state, - info, - ): - """ - store real env data to buffer - - Returns: - No return - """ diff --git a/omnisafe/algorithms/model_based/safeloop.py b/omnisafe/algorithms/model_based/safeloop.py deleted file mode 100644 index f1d52e4bb..000000000 --- a/omnisafe/algorithms/model_based/safeloop.py +++ /dev/null @@ -1,311 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Implementation of the SafeLOOP algorithm.""" - -from copy import deepcopy - -import numpy as np -import torch - -from omnisafe.algorithms import registry -from omnisafe.algorithms.model_based.planner import ARCPlanner -from omnisafe.algorithms.model_based.policy_gradient import PolicyGradientModelBased -from omnisafe.models.actor_q_critic import ActorQCritic -from omnisafe.utils import core - - -@registry.register -class SafeLOOP( - PolicyGradientModelBased, ARCPlanner -): # pylint: disable=too-many-instance-attributes - """The Safe Learning Off-Policy with Online Planning (SafeLOOP) algorithm. - - References: - Title: Learning Off-Policy with Online Planning - Authors: Harshit Sikchi, Wenxuan Zhou, David Held. 
- URL: https://arxiv.org/abs/2008.10066 - """ - - def __init__(self, env_id, cfgs) -> None: - PolicyGradientModelBased.__init__( - self, - env_id=env_id, - cfgs=cfgs, - ) - # # Initialize Actor-Critic - self.actor_critic = self.set_algorithm_specific_actor_critic() - self.ac_targ = deepcopy(self.actor_critic) - self._ac_training_setup() - - self.alpha = self.cfgs.alpha - self.alpha_gamma = self.cfgs.alpha_gamma - ARCPlanner.__init__( - self, - self.algo, - self.cfgs, - self.device, - self.env, - self.virtual_env, - self.actor_critic, - **self.cfgs.mpc_config, - ) - - # Set up model saving - what_to_save = { - 'pi': self.actor_critic.actor, - 'dynamics': self.dynamics, - } - self.logger.setup_torch_saver(what_to_save=what_to_save) - self.logger.torch_save() - - def _specific_init_logs(self): - self.logger.register_key('Loss/DynamicsTrainMseLoss') - self.logger.register_key('Loss/DynamicsValMseLoss') - self.logger.register_key('Plan/safety_costs_mean') - self.logger.register_key('QVals') - self.logger.register_key('Loss/Pi') - self.logger.register_key('Loss/Value') - - # pylint: disable-next=too-many-locals - def compute_loss_v(self, data): - """Computing value loss. - - Args: - data (dict): data from replay buffer. - - Returns: - torch.Tensor. - """ - obs, act, rew, next_obs, done = ( - data['obs'], - data['act'], - data['reward'], - data['next_obs'], - data['done'], - ) - q_value_list = self.actor_critic.critic(obs, act) - # Bellman backup for Q function - with torch.no_grad(): - act_targ, logp_a_next = self.ac_targ.actor.predict( - obs, deterministic=False, need_log_prob=True - ) - q_targ = torch.min(torch.vstack(self.ac_targ.critic(next_obs, act_targ)), dim=0).values - backup = rew + self.cfgs.gamma * (1 - done) * (q_targ - self.alpha * logp_a_next) - # MSE loss against Bellman backup - loss_q = [] - q_values = [] - for q_value in q_value_list: - loss_q.append(torch.mean((q_value - backup) ** 2)) - q_values.append(torch.mean(q_value)) - - # Useful info for logging - q_info = {'QVals': sum(q_values).cpu().detach().numpy()} - return sum(loss_q), q_info - - def compute_loss_pi(self, data: dict): - """Computing pi/actor loss. - - Args: - data (dict): data from replay buffer. - - Returns: - torch.Tensor. - """ - action, logp_a = self.actor_critic.actor.predict( - data['obs'], deterministic=True, need_log_prob=True - ) - loss_pi = self.actor_critic.critic(data['obs'], action)[0] - self.alpha * logp_a - pi_info = {'LogPi': logp_a.cpu().detach().numpy()} - return -loss_pi.mean(), pi_info - - def update_policy_net(self, data) -> None: - """Update policy network. - - Args: - data (dict): data dictionary. - """ - # Train policy with one steps of gradient descent - self.actor_optimizer.zero_grad() - loss_pi, _ = self.compute_loss_pi(data) - loss_pi.backward() - self.actor_optimizer.step() - self.logger.store(**{'Loss/Pi': loss_pi.item()}) - - def alpha_discount(self): - """Alpha discount.""" - self.alpha *= self.alpha_gamma - - def polyak_update_target(self): - """Polyak update target network.""" - with torch.no_grad(): - for param, param_targ in zip(self.actor_critic.parameters(), self.ac_targ.parameters()): - # Notes: We use an in-place operations "mul_", "add_" to update target - # params, as opposed to "mul" and "add", which would make new tensors. - param_targ.data.mul_(self.cfgs.polyak) - param_targ.data.add_((1 - self.cfgs.polyak) * param.data) - - def update_value_net(self, data: dict) -> None: - """Update value network. 
- - Args: - data (dict): data dictionary - """ - # Train value critic with one steps of gradient descent - self.critic_optimizer.zero_grad() - loss_q, q_info = self.compute_loss_v(data) - loss_q.backward() - self.critic_optimizer.step() - self.logger.store(**{'Loss/Value': loss_q.item(), 'QVals': q_info['QVals']}) - - def set_algorithm_specific_actor_critic(self): - """ - Use this method to initialize network. - e.g. Initialize Soft Actor Critic - - Returns: - Actor_critic - """ - self.actor_critic = ActorQCritic( - observation_space=self.env.observation_space, - action_space=self.env.action_space, - model_cfgs=self.cfgs.model_cfgs, - ).to(self.device) - # Set up optimizer for policy and q-function - self.actor_optimizer = core.set_optimizer( - 'Adam', module=self.actor_critic.actor, learning_rate=self.cfgs.actor_lr - ) - self.critic_optimizer = core.set_optimizer( - 'Adam', module=self.actor_critic.critic, learning_rate=self.cfgs.critic_lr - ) - return self.actor_critic - - def _ac_training_setup(self): - """Set up target network for off_policy training.""" - # Freeze target networks with respect to optimizer (only update via polyak averaging) - for param in self.ac_targ.actor.parameters(): - param.requires_grad = False - for param in self.ac_targ.critic.parameters(): - param.requires_grad = False - - def algorithm_specific_logs(self, time_step): - """Log algo parameter""" - super().algorithm_specific_logs(time_step) - if time_step < self.cfgs.update_policy_start_timesteps: - self.logger.store( - **{ - 'Loss/Pi': 0, - 'Plan/safety_costs_mean': 0, - 'QVals': 0, - 'Loss/Value': 0, - } - ) - - def update_actor_critic(self, time_step): - """update actor and critic""" - if time_step >= self.cfgs.update_policy_start_timesteps: - for _ in range(self.cfgs.update_policy_iters): - data = self.off_replay_buffer.sample_batch() - # First run one gradient descent step for Q. - self.update_value_net(data) - - # Freeze Q-network so you don't waste computational effort - # computing gradients for it during the policy learning step. - for param in self.actor_critic.critic.parameters(): - param.requires_grad = False - - # Next run one gradient descent step for actor. - self.update_policy_net(data) - - # Unfreeze Q-network so you can optimize it at next DDPG step. - for param in self.actor_critic.critic.parameters(): - param.requires_grad = True - - # Finally, update target networks by polyak averaging. 
- self.polyak_update_target() - self.alpha_discount() - - def update_dynamics_model(self): - """Update dynamics.""" - state = self.off_replay_buffer.data['obs'][: self.off_replay_buffer.size, :] - action = self.off_replay_buffer.data['act'][: self.off_replay_buffer.size, :] - reward = self.off_replay_buffer.data['reward'][: self.off_replay_buffer.size] - cost = self.off_replay_buffer.data['cost'][: self.off_replay_buffer.size] - next_state = self.off_replay_buffer.data['next_obs'][: self.off_replay_buffer.size, :] - delta_state = next_state - state - inputs = np.concatenate((state, action), axis=-1) - if self.env.env_type == 'mujoco-velocity': - labels = np.concatenate( - ( - np.reshape(reward, (reward.shape[0], -1)), - np.reshape(cost, (cost.shape[0], -1)), - delta_state, - ), - axis=-1, - ) - elif self.env.env_type == 'gym': - labels = np.concatenate( - (np.reshape(reward, (reward.shape[0], -1)), delta_state), axis=-1 - ) - train_mse_losses, val_mse_losses = self.dynamics.train( - inputs, labels, batch_size=256, holdout_ratio=0.2 - ) - self.logger.store( - **{ - 'Loss/DynamicsTrainMseLoss': train_mse_losses, - 'Loss/DynamicsValMseLoss': val_mse_losses, - } - ) - - def select_action(self, time_step, state, env): - """action selection""" - if time_step < self.cfgs.update_policy_start_timesteps: - action = self.env.action_space.sample() - - else: - action, safety_costs_mean = self.get_action(np.array(state)) - self.logger.store( - **{ - 'Plan/safety_costs_mean': safety_costs_mean, - } - ) - action = action + np.random.normal(action.shape) * self.cfgs.exploration_noise - action = np.clip(action, env.action_space.low, env.action_space.high) - return action, None - - def store_real_data( - self, - time_step, - ep_len, - state, - action_info, - action, - reward, - cost, - terminated, - truncated, - next_state, - info, - ): # pylint: disable=too-many-arguments - """store real data""" - if not terminated and not truncated and not info['goal_met']: - # Current goal position is not related to the last goal position, so do not store. - self.off_replay_buffer.store( - obs=state, act=action, reward=reward, cost=cost, next_obs=next_state, done=truncated - ) - - def algo_reset(self): - """reset planner""" - if self.env.env_type == 'gym': - self.planner_reset() diff --git a/omnisafe/algorithms/off_policy/__init__.py b/omnisafe/algorithms/off_policy/__init__.py deleted file mode 100644 index 120a3bec3..000000000 --- a/omnisafe/algorithms/off_policy/__init__.py +++ /dev/null @@ -1,44 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Off-policy algorithms.""" - -from omnisafe.algorithms.off_policy.crpo import OffCRPO -from omnisafe.algorithms.off_policy.cvpo import CVPO -from omnisafe.algorithms.off_policy.ddpg import DDPG -from omnisafe.algorithms.off_policy.ddpg_lag import DDPGLag -from omnisafe.algorithms.off_policy.ddpg_pid import DDPGPid -from omnisafe.algorithms.off_policy.sac import SAC -from omnisafe.algorithms.off_policy.sac_lag import SACLag -from omnisafe.algorithms.off_policy.sac_pid import SACPid -from omnisafe.algorithms.off_policy.sddpg import SDDPG -from omnisafe.algorithms.off_policy.td3 import TD3 -from omnisafe.algorithms.off_policy.td3_lag import TD3Lag -from omnisafe.algorithms.off_policy.td3_pid import TD3Pid - - -__all__ = [ - 'DDPG', - 'DDPGLag', - 'SAC', - 'SACLag', - 'SDDPG', - 'TD3', - 'TD3Lag', - 'CVPO', - 'DDPGPid', - 'TD3Pid', - 'SACPid', - 'OffCRPO', -] diff --git a/omnisafe/algorithms/off_policy/cvpo.py b/omnisafe/algorithms/off_policy/cvpo.py deleted file mode 100644 index 446ab588f..000000000 --- a/omnisafe/algorithms/off_policy/cvpo.py +++ /dev/null @@ -1,189 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Implementation of the CVPO algorithm.""" - -import numpy as np -import torch -from scipy.optimize import minimize -from torch.distributions import MultivariateNormal -from torch.nn.utils import clip_grad_norm_ - -from omnisafe.algorithms import registry -from omnisafe.algorithms.off_policy.ddpg import DDPG -from omnisafe.utils.algo_utils import gaussian_kl -from omnisafe.utils.tools import to_ndarray - - -@registry.register -# pylint: disable-next=too-many-instance-attributes,too-many-locals -class CVPO(DDPG): - """Constrained Variational Policy Optimization for Safe Reinforcement Learning. - - References: - - - Title: Constrained Variational Policy Optimization for Safe Reinforcement Learning. - - Authors: Zuxin Liu, Zhepeng Cen, Vladislav Isenbaev, - Wei Liu, Zhiwei Steven Wu, Bo Li, Ding Zhao - - URL: https://arxiv.org/abs/2201.11927v2 - """ - - def __init__( - self, - env_id: str, - cfgs, - ) -> None: - """Constrained Variational Policy Optimization. - - Args: - env_id (str): Environment ID. - cfgs (dict): Configuration dictionary. - """ - - super().__init__( - env_id=env_id, - cfgs=cfgs, - ) - self.eta = 0.1 - self.lam = 0.1 - self.alpha_mean = 0.0 - self.alpha_var = 0.0 - self.cost_limit = self.cfgs.cost_limit - - def _specific_init_logs(self): - super()._specific_init_logs() - self.logger.register_key('Loss/Loss_l') - self.logger.register_key('Misc/mean_sigma_det') - self.logger.register_key('Misc/max_kl_sigma') - self.logger.register_key('Misc/max_kl_mu') - self.logger.register_key('Misc/eta') - - # pylint: disable-next=too-many-locals - def update_policy_net(self, obs) -> None: - """Update policy network. 
- - Args: - obs (torch.Tensor): observation. - """ - num_action = self.cfgs.sample_action_num - num_obs = obs.shape[0] - act_dim = self.actor_critic.act_dim - obs_dim = self.actor_critic.obs_shape[0] - - with torch.no_grad(): - # sample N actions per state - b_mean, _, b_var = self.ac_targ.actor.predict( - obs, deterministic=True, need_log_prob=True - ) - b_dist = MultivariateNormal(b_mean, scale_tril=b_var) - sampled_actions = b_dist.sample((num_action,)) - - expanded_states = obs[None, ...].expand(num_action, -1, -1) - target_q = self.ac_targ.critic( - expanded_states.reshape(-1, obs_dim), sampled_actions.reshape(-1, act_dim) - )[0] - target_q = target_q.reshape(num_action, num_obs) - target_q_np = to_ndarray(target_q).T - target_qc = self.ac_targ.cost_critic( - expanded_states.reshape(-1, obs_dim), sampled_actions.reshape(-1, act_dim) - )[0] - target_qc = target_qc.reshape(num_action, num_obs) - target_qc_np = to_ndarray(target_qc).T - - def dual(val): - """Dual function of the non-parametric variational.""" - beta, lam = val - target_q_np_comb = target_q_np - lam * target_qc_np - max_q = np.max(target_q_np_comb, 1) - return ( - beta * self.cfgs.dual_constraint - + lam * self.cost_limit - + np.mean(max_q) - + beta - * np.mean( - np.log(np.mean(np.exp((target_q_np_comb - max_q[:, None]) / beta), axis=1)) - ) - ) - - bounds = [(1e-6, 1e5), (1e-6, 1e5)] - options = {'ftol': 1e-3, 'maxiter': 10} - res = minimize( - dual, - np.array([self.eta, self.lam]), - method='SLSQP', - bounds=bounds, - tol=1e-3, - options=options, - ) - self.eta, self.lam = res.x - - raw_loss = torch.softmax((target_q - self.lam * target_qc) / self.eta, dim=0) - - # M-Step of Policy Improvement - for _ in range(self.cfgs.mstep_iteration_num): - mean, _, var = self.actor_critic.actor.predict( - obs, deterministic=True, need_log_prob=True - ) - - actor = MultivariateNormal(loc=mean, scale_tril=b_var) - actor_ = MultivariateNormal(loc=b_mean, scale_tril=var) - loss_p = torch.mean( - raw_loss - * ( - actor.expand((num_action, num_obs)).log_prob(sampled_actions) - + actor_.expand((num_action, num_obs)).log_prob(sampled_actions) - ) - ) - - kl_mu, kl_sigma, _, sigma_det = gaussian_kl( - mean_p=b_mean, mean_q=mean, var_p=b_var, var_q=var - ) - - if np.isnan(kl_mu.item()): - raise RuntimeError('kl_mu is nan') - if np.isnan(kl_sigma.item()): - raise RuntimeError('kl_sigma is nan') - - # update lagrange multipliers by gradient descent - self.alpha_mean -= ( - self.cfgs.alpha_mean_scale * (self.cfgs.kl_mean_constraint - kl_mu).detach().item() - ) - self.alpha_var -= ( - self.cfgs.alpha_var_scale * (self.cfgs.kl_var_constraint - kl_sigma).detach().item() - ) - - self.alpha_mean = np.clip(self.alpha_mean, 0.0, self.cfgs.alpha_mean_max) - self.alpha_var = np.clip(self.alpha_var, 0.0, self.cfgs.alpha_var_max) - self.actor_optimizer.zero_grad() - loss_l = -( - loss_p - + self.alpha_mean * (self.cfgs.kl_mean_constraint - kl_mu) - + self.alpha_var * (self.cfgs.kl_var_constraint - kl_sigma) - ) - loss_l.backward() - clip_grad_norm_(self.actor_critic.actor.parameters(), 0.01) - self.actor_optimizer.step() - self.logger.store( - **{ - 'Loss/Loss_pi': loss_p.mean().item(), - 'Loss/Loss_l': loss_l.mean().item(), - 'Misc/mean_sigma_det': sigma_det.item(), - 'Misc/max_kl_sigma': kl_sigma.item(), - 'Misc/max_kl_mu': kl_mu.item(), - 'Misc/eta': self.eta, - } - ) - - def algorithm_specific_logs(self): - """Log the CVPO specific information.""" diff --git a/omnisafe/algorithms/off_policy/ddpg_lag.py b/omnisafe/algorithms/off_policy/ddpg_lag.py 
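The CVPO M-step above bounds the mean and covariance changes of the Gaussian policy separately, through the ``kl_mu`` and ``kl_sigma`` terms returned by ``gaussian_kl`` (imported from ``omnisafe.utils.algo_utils``; its body is not shown in this diff). As a rough illustration only, here is one common way to decouple the two KL terms using ``torch.distributions``; the function name and the exact decomposition are assumptions, not omnisafe's implementation.

# Hedged sketch: decoupled Gaussian KL terms in the spirit of MPO-style M-steps.
# Each term is an exact KL with one factor (mean or covariance) held at the old value.
import torch
from torch.distributions import MultivariateNormal, kl_divergence


def decoupled_gaussian_kl(mean_old, scale_tril_old, mean_new, scale_tril_new):
    """Return (kl_mean, kl_cov), each averaged over the batch."""
    # Mean term: move the mean while keeping the old covariance.
    kl_mean = kl_divergence(
        MultivariateNormal(mean_old, scale_tril=scale_tril_old),
        MultivariateNormal(mean_new, scale_tril=scale_tril_old),
    )
    # Covariance term: change the covariance while keeping the old mean.
    kl_cov = kl_divergence(
        MultivariateNormal(mean_old, scale_tril=scale_tril_old),
        MultivariateNormal(mean_old, scale_tril=scale_tril_new),
    )
    return kl_mean.mean(), kl_cov.mean()


if __name__ == '__main__':
    batch, dim = 8, 3
    mean_old, mean_new = torch.zeros(batch, dim), 0.1 * torch.randn(batch, dim)
    scale_old = torch.eye(dim).expand(batch, dim, dim)
    scale_new = 1.1 * torch.eye(dim).expand(batch, dim, dim)
    print(decoupled_gaussian_kl(mean_old, scale_old, mean_new, scale_new))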
deleted file mode 100644 index aee66a7cc..000000000 --- a/omnisafe/algorithms/off_policy/ddpg_lag.py +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Implementation of the Lagrange version of the DDPG algorithm.""" - -from typing import Dict, NamedTuple, Tuple - -import torch -import torch.nn.functional as F - -from omnisafe.algorithms import registry -from omnisafe.algorithms.off_policy.ddpg import DDPG -from omnisafe.common.lagrange import Lagrange - - -@registry.register -# pylint: disable-next=too-many-instance-attributes -class DDPGLag(DDPG, Lagrange): - """The Lagrange version of the DDPG Algorithm. - - References: - - Title: Continuous control with deep reinforcement learning - - Authors: Timothy P. Lillicrap, Jonathan J. Hunt, Alexander Pritzel, Nicolas Heess, Tom Erez, - Yuval Tassa, David Silver, Daan Wierstra. - - URL: `DDPG `_ - """ - - def __init__(self, env_id: str, cfgs: NamedTuple) -> None: - """Initialize DDPG.""" - DDPG.__init__( - self, - env_id=env_id, - cfgs=cfgs, - ) - Lagrange.__init__(self, **self.cfgs.lagrange_cfgs) - - def _specific_init_logs(self): - super()._specific_init_logs() - self.logger.register_key('Metrics/LagrangeMultiplier') - self.logger.register_key('Loss/Loss_pi_c') - self.logger.register_key('Misc/CostLimit') - - def algorithm_specific_logs(self) -> None: - """Log the DDPG Lag specific information. - - .. list-table:: - - * - Things to log - - Description - * - Metrics/LagrangeMultiplier - - The Lagrange multiplier value in current epoch. - * - Loss/Loss_pi_c - - The loss of the critic network. - * - Misc/CostLimit - - The cost limit. - """ - super().algorithm_specific_logs() - self.logger.store( - **{ - 'Metrics/LagrangeMultiplier': self.lagrangian_multiplier.item(), - 'Misc/CostLimit': self.cost_limit, - } - ) - - def compute_loss_pi(self, obs: torch.Tensor) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: - r"""Computing ``pi/actor`` loss. - - In the lagrange version of DDPG, the loss is defined as: - - .. math:: - L=\mathbb{E}_{s \sim \mathcal{D}} [ Q(s, \pi(s))- \lambda C(s, \pi(s))] - - where :math:`\lambda` is the lagrange multiplier. - - Args: - obs (:class:`torch.Tensor`): ``observation`` saved in data. 
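The penalty weight ``lambda`` in the formula above comes from the ``Lagrange`` mixin (``omnisafe.common.lagrange``, imported at the top of this file but not shown in this diff), which supplies ``lagrangian_multiplier``, ``lambda_range_projection`` and ``update_lagrange_multiplier``. As a rough, self-contained illustration of the mechanism only, a projected-gradient multiplier update; the class name, initial value and learning rate below are assumptions, not the actual implementation.

# Hedged sketch: a Lagrange multiplier kept as a learnable scalar and pushed up
# whenever the observed cost exceeds the cost limit.
import torch


class SimpleLagrange:
    def __init__(self, cost_limit: float, lambda_init: float = 0.001, lambda_lr: float = 0.035):
        self.cost_limit = cost_limit
        self.lagrangian_multiplier = torch.nn.Parameter(torch.tensor(lambda_init))
        self.lambda_optimizer = torch.optim.Adam([self.lagrangian_multiplier], lr=lambda_lr)

    def lambda_range_projection(self) -> torch.Tensor:
        # Keep the multiplier non-negative when it is used as a penalty weight.
        return torch.nn.functional.relu(self.lagrangian_multiplier)

    def update_lagrange_multiplier(self, mean_ep_cost: float) -> None:
        # Gradient ascent on lambda * (Jc - d): lambda grows while the constraint is
        # violated and shrinks back toward zero once it is satisfied.
        self.lambda_optimizer.zero_grad()
        lambda_loss = -self.lagrangian_multiplier * (mean_ep_cost - self.cost_limit)
        lambda_loss.backward()
        self.lambda_optimizer.step()


lag = SimpleLagrange(cost_limit=25.0)
lag.update_lagrange_multiplier(mean_ep_cost=40.0)  # constraint violated, multiplier grows
print(float(lag.lambda_range_projection()))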
- """ - _, action = self.actor_critic.actor.predict(obs, deterministic=False, need_log_prob=False) - loss_pi = self.actor_critic.critic(obs, action)[0] - loss_pi_c = self.actor_critic.cost_critic(obs, action)[0] - loss_pi_c = F.relu(loss_pi_c - self.cost_limit) - self.update_lagrange_multiplier(loss_pi_c.mean().item()) - penalty = self.lambda_range_projection(self.lagrangian_multiplier).item() - loss_pi -= penalty * loss_pi_c - loss_pi /= 1 + penalty - pi_info = {} - self.logger.store( - **{ - 'Loss/Loss_pi_c': loss_pi_c.mean().item(), - } - ) - return -loss_pi.mean(), pi_info diff --git a/omnisafe/algorithms/on_policy/__init__.py b/omnisafe/algorithms/on_policy/__init__.py index 7b8582d73..b155319bd 100644 --- a/omnisafe/algorithms/on_policy/__init__.py +++ b/omnisafe/algorithms/on_policy/__init__.py @@ -14,7 +14,7 @@ # ============================================================================== """On-policy algorithms.""" -from omnisafe.algorithms.on_policy import ( +from omnisafe.algorithms.on_policy import ( # simmer, base, early_terminated, first_order, @@ -23,7 +23,6 @@ pid_lagrange, saute, second_order, - simmer, ) from omnisafe.algorithms.on_policy.base import PPO, TRPO, NaturalPG, PolicyGradient from omnisafe.algorithms.on_policy.early_terminated import PPOEarlyTerminated, PPOLagEarlyTerminated @@ -33,12 +32,14 @@ from omnisafe.algorithms.on_policy.pid_lagrange import CPPOPid, TRPOPid from omnisafe.algorithms.on_policy.saute import PPOLagSaute, PPOSaute from omnisafe.algorithms.on_policy.second_order import CPO, PCPO -from omnisafe.algorithms.on_policy.simmer import ( - PPOLagSimmerPid, - PPOLagSimmerQ, - PPOSimmerPid, - PPOSimmerQ, -) + + +# from omnisafe.algorithms.on_policy.simmer import ( +# PPOLagSimmerPid, +# PPOLagSimmerQ, +# PPOSimmerPid, +# PPOSimmerQ, +# ) __all__ = [ @@ -50,5 +51,5 @@ *pid_lagrange.__all__, *saute.__all__, *second_order.__all__, - *simmer.__all__, + # *simmer.__all__, ] diff --git a/omnisafe/algorithms/on_policy/base/natural_pg.py b/omnisafe/algorithms/on_policy/base/natural_pg.py index 6865d7579..be36d8723 100644 --- a/omnisafe/algorithms/on_policy/base/natural_pg.py +++ b/omnisafe/algorithms/on_policy/base/natural_pg.py @@ -14,15 +14,14 @@ # ============================================================================== """Implementation of the Natural Policy Gradient algorithm.""" -from typing import NamedTuple, Tuple - import torch from omnisafe.algorithms import registry from omnisafe.algorithms.on_policy.base.policy_gradient import PolicyGradient -from omnisafe.utils import distributed_utils +from omnisafe.utils import distributed +from omnisafe.utils.config import Config +from omnisafe.utils.math import conjugate_gradients from omnisafe.utils.tools import ( - conjugate_gradients, get_flat_gradients_from, get_flat_params_from, set_param_values_to_model, @@ -43,62 +42,21 @@ class NaturalPG(PolicyGradient): - URL: `Natural PG `_ """ - def __init__(self, env_id: str, cfgs: NamedTuple) -> None: - """Initialize Natural Policy Gradient. + def __init__(self, env_id: str, cfgs: Config) -> None: + super().__init__(env_id, cfgs) - Args: - env_id (str): The environment id. - cfgs (NamedTuple): The configuration of the algorithm. 
- """ - super().__init__(env_id=env_id, cfgs=cfgs) - self.cg_damping = cfgs.cg_damping - self.cg_iters = cfgs.cg_iters - self.target_kl = cfgs.target_kl - self.fvp_obs = cfgs.fvp_obs - - def _specific_init_logs(self): - super()._specific_init_logs() - self.logger.register_key('Misc/AcceptanceStep') - self.logger.register_key('Misc/Alpha') - self.logger.register_key('Misc/FinalStepNorm') - self.logger.register_key('Misc/gradient_norm') - self.logger.register_key('Misc/xHx') - self.logger.register_key('Misc/H_inv_g') - - def search_step_size(self, step_dir: torch.Tensor) -> Tuple[torch.Tensor, int]: - """NPG use full step_size, so we just return 1. + self._fvp_obs: torch.Tensor - Args: - step_dir (torch.Tensor): The step direction. - """ - accept_step = 1 - return step_dir, accept_step - - def algorithm_specific_logs(self) -> None: - r"""Log the Natural Policy Gradient specific information. - - .. list-table:: - - * - Things to log - - Description - * - ``Misc/AcceptanceStep`` - - The acceptance step size. - * - ``Misc/Alpha`` - - :math:`\frac{\delta_{KL}}{xHx}` in original paper. - where :math:`x` is the step direction, :math:`H` is the Hessian matrix, - and :math:`\delta_{KL}` is the target KL divergence. - * - ``Misc/FinalStepNorm`` - - The final step norm. - * - ``Misc/gradient_norm`` - - The gradient norm. - * - ``Misc/xHx`` - - :math:`xHx` in original paper. - * - ``Misc/H_inv_g`` - - :math:`H^{-1}g` in original paper. + def _init_log(self) -> None: + super()._init_log() - """ + self._logger.register_key('Misc/Alpha') + self._logger.register_key('Misc/FinalStepNorm') + self._logger.register_key('Misc/gradient_norm') + self._logger.register_key('Misc/xHx') + self._logger.register_key('Misc/H_inv_g') - def Fvp(self, params: torch.Tensor) -> torch.Tensor: + def _fvp(self, params: torch.Tensor) -> torch.Tensor: """Build the `Hessian-vector product `_ based on an approximation of the KL-divergence. The Hessian-vector product is approximated by the Fisher information matrix, @@ -108,98 +66,64 @@ def Fvp(self, params: torch.Tensor) -> torch.Tensor: Args: params (torch.Tensor): The parameters of the actor network. 
""" - self.actor_critic.actor.zero_grad() - q_dist = self.actor_critic.actor(self.fvp_obs) + self._actor_critic.actor.zero_grad() + q_dist = self._actor_critic.actor(self._fvp_obs) with torch.no_grad(): - p_dist = self.actor_critic.actor(self.fvp_obs) + p_dist = self._actor_critic.actor(self._fvp_obs) kl = torch.distributions.kl.kl_divergence(p_dist, q_dist).mean() - grads = torch.autograd.grad(kl, self.actor_critic.actor.parameters(), create_graph=True) + grads = torch.autograd.grad(kl, self._actor_critic.actor.parameters(), create_graph=True) # type: ignore flat_grad_kl = torch.cat([grad.view(-1) for grad in grads]) kl_p = (flat_grad_kl * params).sum() - grads = torch.autograd.grad(kl_p, self.actor_critic.actor.parameters(), retain_graph=False) - # contiguous indicating, if the memory is contiguously stored or not + grads = torch.autograd.grad(kl_p, self._actor_critic.actor.parameters(), retain_graph=False) # type: ignore + flat_grad_grad_kl = torch.cat([grad.contiguous().view(-1) for grad in grads]) - distributed_utils.mpi_avg_torch_tensor(flat_grad_grad_kl) - return flat_grad_grad_kl + params * self.cg_damping + distributed.avg_tensor(flat_grad_grad_kl) + return flat_grad_grad_kl + params * self._cfgs.cg_damping - # pylint: disable-next=too-many-locals,too-many-arguments - def update_policy_net( + def _update_actor( # pylint: disable=too-many-arguments, too-many-locals self, obs: torch.Tensor, act: torch.Tensor, - log_p: torch.Tensor, - adv: torch.Tensor, - cost_adv: torch.Tensor, + logp: torch.Tensor, + adv_r: torch.Tensor, + adv_c: torch.Tensor, ) -> None: - """Update policy network. + self._fvp_obs = obs[::4] + theta_old = get_flat_params_from(self._actor_critic.actor) + self._actor_critic.actor.zero_grad() + adv = self._compute_adv_surrogate(adv_r, adv_c) + loss, info = self._loss_pi(obs, act, logp, adv) - Natural Policy Gradient (NPG) update policy network using the conjugate gradient algorithm, - following the steps: + loss.backward() + distributed.avg_grads(self._actor_critic.actor) - - Calculate the gradient of the policy network, - - Use the conjugate gradient algorithm to calculate the step direction. - - Use the line search algorithm to find the step size. - - Args: - obs (torch.Tensor): The observation tensor. - act (torch.Tensor): The action tensor. - log_p (torch.Tensor): The log probability of the action. - adv (torch.Tensor): The advantage tensor. - cost_adv (torch.Tensor): The cost advantage tensor. 
- """ - # get loss and info values before update - self.fvp_obs = obs[::4] - theta_old = get_flat_params_from(self.actor_critic.actor) - self.actor_critic.actor.zero_grad() - processed_adv = self.compute_surrogate(adv=adv, cost_adv=cost_adv) - loss_pi, pi_info = self.compute_loss_pi( - obs=obs, - act=act, - log_p=log_p, - adv=processed_adv, - ) - # train policy with multiple steps of gradient descent - loss_pi.backward() - # average grads across MPI processes - distributed_utils.mpi_avg_grads(self.actor_critic.actor) - g_flat = get_flat_gradients_from(self.actor_critic.actor) - g_flat *= -1 - - # pylint: disable-next=invalid-name - x = conjugate_gradients(self.Fvp, g_flat, self.cg_iters) + grad = -get_flat_gradients_from(self._actor_critic.actor) + x = conjugate_gradients(self._fvp, grad, self._cfgs.cg_iters) assert torch.isfinite(x).all(), 'x is not finite' - # note that xHx = g^T x, but calculating xHx is faster than g^T x - xHx = torch.dot(x, self.Fvp(x)) # equivalent to : g^T x - assert xHx.item() >= 0, 'No negative values' - - # perform descent direction - alpha = torch.sqrt(2 * self.target_kl / (xHx + 1e-8)) - step_direction = alpha * x + xHx = torch.dot(x, self._fvp(x)) + assert xHx.item() >= 0, 'xHx is negative' + alpha = torch.sqrt(2 * self._cfgs.target_kl / (xHx + 1e-8)) + step_direction = x * alpha assert torch.isfinite(step_direction).all(), 'step_direction is not finite' - # determine step direction and apply SGD step after grads where set - # TRPO uses custom backtracking line search - final_step_dir, accept_step = self.search_step_size(step_dir=step_direction) - - # update actor network parameters - new_theta = theta_old + final_step_dir - set_param_values_to_model(self.actor_critic.actor, new_theta) + theta_new = theta_old + step_direction + set_param_values_to_model(self._actor_critic.actor, theta_new) with torch.no_grad(): - loss_pi, pi_info = self.compute_loss_pi(obs=obs, act=act, log_p=log_p, adv=adv) - self.loss_record.append(loss_pi=loss_pi.mean().item()) + loss, info = self._loss_pi(obs, act, logp, adv) - self.logger.store( + self._logger.store( **{ - 'Train/Entropy': pi_info['ent'], - 'Train/PolicyRatio': pi_info['ratio'], - 'Misc/AcceptanceStep': accept_step, + 'Train/Entropy': info['entrophy'], + 'Train/PolicyRatio': info['ratio'], + 'Train/PolicyStd': info['std'], + 'Loss/Loss_pi': loss.mean().item(), 'Misc/Alpha': alpha.item(), - 'Misc/FinalStepNorm': torch.norm(final_step_dir).mean().item(), + 'Misc/FinalStepNorm': torch.norm(step_direction).mean().item(), 'Misc/xHx': xHx.item(), - 'Misc/gradient_norm': torch.norm(g_flat).mean().item(), + 'Misc/gradient_norm': torch.norm(grad).mean().item(), 'Misc/H_inv_g': x.norm().item(), } ) diff --git a/omnisafe/algorithms/on_policy/base/policy_gradient.py b/omnisafe/algorithms/on_policy/base/policy_gradient.py index 0ab7925cb..345508ac8 100644 --- a/omnisafe/algorithms/on_policy/base/policy_gradient.py +++ b/omnisafe/algorithms/on_policy/base/policy_gradient.py @@ -15,26 +15,24 @@ """Implementation of the Policy Gradient algorithm.""" import time -from copy import deepcopy from typing import Dict, Tuple import torch import torch.nn as nn +from torch.utils.data import DataLoader, TensorDataset +from omnisafe.adapter import OnPolicyAdapter from omnisafe.algorithms import registry +from omnisafe.algorithms.base_algo import BaseAlgo from omnisafe.common.buffer import VectorOnPolicyBuffer from omnisafe.common.logger import Logger -from omnisafe.common.record_queue import RecordQueue -from omnisafe.models.constraint_actor_critic 
import ConstraintActorCritic -from omnisafe.utils import core, distributed_utils -from omnisafe.utils.config import Config -from omnisafe.utils.tools import get_flat_params_from -from omnisafe.wrappers import wrapper_registry +from omnisafe.models.actor_critic.constraint_actor_critic import ConstraintActorCritic +from omnisafe.utils import distributed @registry.register -# pylint: disable-next=too-many-instance-attributes -class PolicyGradient: +# pylint: disable-next=too-many-instance-attributes, too-few-public-methods +class PolicyGradient(BaseAlgo): """The Policy Gradient algorithm. References: @@ -44,475 +42,153 @@ class PolicyGradient: /1999/file/64d828b85b0bed98e80ade0a5c43b0f-Paper.pdf>`_ """ - def __init__(self, env_id: str, cfgs: Config) -> None: - """Initialize PolicyGradient. - - Args: - env_id (str): The environment id. - cfgs (NamedTuple): The configuration of the algorithm. - """ - self.algo = self.__class__.__name__ - self.cfgs = deepcopy(cfgs) - self.wrapper_type = self.cfgs.wrapper_type - self.device = ( - f'cuda:{self.cfgs.device_id}' - if torch.cuda.is_available() and self.cfgs.device == 'cuda' - else 'cpu' + def _init_env(self) -> None: + self._env = OnPolicyAdapter(self._env_id, self._cfgs.num_envs, self._seed, self._cfgs) + assert self._cfgs.steps_per_epoch % (distributed.world_size() * self._cfgs.num_envs) == 0, ( + 'The number of steps per epoch is not divisible by the number of ' 'environments.' + ) + self._steps_per_epoch = ( + self._cfgs.steps_per_epoch // distributed.world_size() // self._cfgs.num_envs ) - added_cfgs = self._get_added_cfgs() - self.cfgs.env_cfgs.recurisve_update(added_cfgs) - env_cfgs = self.cfgs.env_cfgs - self.env = wrapper_registry.get(self.wrapper_type)(env_id, cfgs=env_cfgs) + def _init_model(self) -> None: + self._actor_critic = ConstraintActorCritic( + obs_space=self._env.observation_space, + act_space=self._env.action_space, + model_cfgs=self._cfgs.model_cfgs, + epochs=self._cfgs.epochs, + ).to(self._device) + + if distributed.world_size() > 1: + distributed.sync_params(self._actor_critic) + + if self._cfgs.exploration_noise_anneal: + self._actor_critic.set_annealing( + epochs=[0, self._cfgs.epochs], + std=self._cfgs.std, + ) - assert self.cfgs.steps_per_epoch % distributed_utils.num_procs() == 0, ( - f'Number of processes ({distributed_utils.num_procs()})' - f'is not a divisor of the number of steps per epoch {self.cfgs.steps_per_epoch}.' - ) - self.steps_per_epoch = self.cfgs.steps_per_epoch - self.local_steps_per_epoch = ( - cfgs.steps_per_epoch // cfgs.env_cfgs.num_envs // distributed_utils.num_procs() - ) + 1 - - # ensure local each local process can experience at least one complete episode - assert self.env.rollout_data.max_ep_len <= self.local_steps_per_epoch, ( - f'Reduce number of cores ({distributed_utils.num_procs()})' - f'or reduce the number of parallel envrionments {self.env.cfgs.num_envs}' - f'or increase batch size {self.cfgs.steps_per_epoch}.' 
+ def _init(self) -> None: + self._buf = VectorOnPolicyBuffer( + obs_space=self._env.observation_space, + act_space=self._env.action_space, + size=self._steps_per_epoch, + gamma=self._cfgs.buffer_cfgs.gamma, + lam=self._cfgs.buffer_cfgs.lam, + lam_c=self._cfgs.buffer_cfgs.lam_c, + advantage_estimator=self._cfgs.buffer_cfgs.adv_estimation_method, + standardized_adv_r=self._cfgs.buffer_cfgs.standardized_rew_adv, + standardized_adv_c=self._cfgs.buffer_cfgs.standardized_cost_adv, + penalty_coefficient=self._cfgs.penalty_param, + num_envs=self._cfgs.num_envs, + device=self._device, ) - # setup actor-critic module - self.actor_critic = ConstraintActorCritic( - observation_space=self.env.observation_space, - action_space=self.env.action_space, - model_cfgs=cfgs.model_cfgs, - ).to(self.device) - self.set_mpi() - - # set up logger and save configuration to disk - self.logger = Logger( - output_dir=cfgs.data_dir, - exp_name=cfgs.exp_name, - seed=cfgs.seed, - use_tensorboard=cfgs.use_tensorboard, - use_wandb=cfgs.use_wandb, - config=cfgs, - models=[self.actor_critic], + def _init_log(self) -> None: + self._logger = Logger( + output_dir=self._cfgs.data_dir, + exp_name=self._cfgs.exp_name, + seed=self._cfgs.seed, + use_tensorboard=self._cfgs.use_tensorboard, + use_wandb=self._cfgs.use_wandb, + config=self._cfgs, ) - # set up experience buffer - self.buf = VectorOnPolicyBuffer( - obs_space=self.env.observation_space, - act_space=self.env.action_space, - size=self.local_steps_per_epoch, - gamma=cfgs.buffer_cfgs.gamma, - lam=cfgs.buffer_cfgs.lam, - lam_c=cfgs.buffer_cfgs.lam_c, - advantage_estimator=cfgs.buffer_cfgs.adv_estimation_method, - standardized_adv_r=cfgs.buffer_cfgs.standardized_rew_adv, - standardized_adv_c=cfgs.buffer_cfgs.standardized_cost_adv, - penalty_coefficient=cfgs.penalty_param, - num_envs=cfgs.env_cfgs.num_envs, - device=self.device, - ) - # set up optimizer for policy and value function - self.actor_optimizer = core.set_optimizer( - 'Adam', module=self.actor_critic.actor, learning_rate=cfgs.actor_lr - ) - self.reward_critic_optimizer = core.set_optimizer( - 'Adam', module=self.actor_critic.reward_critic, learning_rate=cfgs.critic_lr - ) - if cfgs.use_cost: - self.cost_critic_optimizer = core.set_optimizer( - 'Adam', module=self.actor_critic.cost_critic, learning_rate=cfgs.critic_lr - ) - # set up scheduler for policy learning rate decay - self.scheduler = self.set_learning_rate_scheduler() - # set up model saving what_to_save = { - 'pi': self.actor_critic.actor, - 'obs_normalizer': self.env.obs_normalizer, - } - self.logger.setup_torch_saver(what_to_save=what_to_save) - self.logger.torch_save() - # set up statistics - self.start_time = time.time() - self.logger.log('Start with training.') - self.epoch_time = None - self.penalty_param = None - self.critic_loss_fn = nn.MSELoss() - self.loss_record = RecordQueue('loss_pi', 'loss_v', 'loss_c', maxlen=100) - - self._init_log() - - def _init_log(self): - self.logger.register_key('Train/Epoch') - self.logger.register_key('Metrics/EpRet') - self.logger.register_key('Metrics/EpCost') - self.logger.register_key('Metrics/EpLen') - - # log information about actor - self.logger.register_key('Loss/Loss_pi') - self.logger.register_key('Loss/Delta_loss_pi') - self.logger.register_key('Values/Adv') - - # log information about critic - self.logger.register_key('Loss/Loss_reward_critic') - self.logger.register_key('Loss/Delta_loss_reward_critic') - self.logger.register_key('Values/V') - - if self.cfgs.use_cost: - # log information about cost critic - 
self.logger.register_key('Loss/Loss_cost_critic') - self.logger.register_key('Loss/Delta_loss_cost_critic') - self.logger.register_key('Values/C') - - self.logger.register_key('Train/Entropy') - self.logger.register_key('Train/KL') - self.logger.register_key('Train/StopIter') - self.logger.register_key('Train/PolicyRatio') - self.logger.register_key('Train/LR') - - if self.cfgs.env_cfgs.normalized_rew: - self.logger.register_key('Misc/RewScaleMean') - self.logger.register_key('Misc/RewScaleStddev') - - if self.cfgs.exploration_noise_anneal: - self.logger.register_key('Misc/ExplorationNoiseStd') - - if self.cfgs.model_cfgs.actor_type == 'gaussian_learning': - self.logger.register_key('Misc/ExplorationNoiseStd') - - self._specific_init_logs() - - # some sub-classes may add information to logs - self.logger.register_key('TotalEnvSteps') - self.logger.register_key('Time') - self.logger.register_key('FPS') - - def _specific_init_logs(self): - pass - - def _get_added_cfgs(self) -> dict: - """Get additional configurations. - - Returns: - dict: The additional configurations. - """ - added_configs = { - 'device': f'cuda:{self.cfgs.device_id}' - if torch.cuda.is_available() and self.cfgs.device == 'cuda' - else 'cpu', - 'seed': self.cfgs.seed, + 'pi': self._actor_critic.actor, } - return added_configs - - def set_learning_rate_scheduler(self) -> torch.optim.lr_scheduler.LambdaLR: - """Set up learning rate scheduler. - - If use linear learning rate decay, - the learning rate will be annealed linearly. - """ - scheduler = None - if self.cfgs.linear_lr_decay: - # linear anneal - def linear_anneal(epoch): - return 1 - epoch / self.cfgs.epochs - - scheduler = torch.optim.lr_scheduler.LambdaLR( - optimizer=self.actor_optimizer, lr_lambda=linear_anneal - ) - return scheduler - - def set_mpi(self) -> None: - """Initialize MPI specifics. - - Sync parameters of actor and critic across cores, - only once necessary.""" - if distributed_utils.num_procs() > 1: - # avoid slowdowns from PyTorch + MPI combo - distributed_utils.setup_torch_for_mpi() - start = time.time() - self.logger.log('INFO: Sync actor critic parameters') - # sync parameters across cores: only once necessary, grads are averaged! - distributed_utils.sync_params(self.actor_critic) - self.logger.log(f'Done! (took {time.time()-start:0.3f} sec.)') + self._logger.setup_torch_saver(what_to_save) + self._logger.torch_save() - def algorithm_specific_logs(self) -> None: - """Use this method to collect log information. + self._logger.register_key('Metrics/EpRet', window_length=50) + self._logger.register_key('Metrics/EpCost', window_length=50) + self._logger.register_key('Metrics/EpLen', window_length=50) - e.g. log lagrangian for lagrangian-base algorithms, + self._logger.register_key('Train/Epoch') + self._logger.register_key('Train/Entropy') + self._logger.register_key('Train/KL') + self._logger.register_key('Train/StopIter') + self._logger.register_key('Train/PolicyRatio') + self._logger.register_key('Train/LR') + if self._cfgs.model_cfgs.actor_type == 'gaussian_learning': + self._logger.register_key('Train/PolicyStd') - .. 
code-block:: python + self._logger.register_key('TotalEnvSteps') - self.logger.log_tabular('Metrics/LagrangeMultiplier', self.lagrangian_multiplier.item()) - """ - - def check_distributed_parameters(self) -> None: - """Check if parameters are synchronized across all processes.""" - if distributed_utils.num_procs() > 1: - self.logger.log('Check if distributed parameters are synchronous..') - modules = { - 'Policy': self.actor_critic.actor, - 'Value': self.actor_critic.reward_critic, - } - for key, module in modules.items(): - flat_params = get_flat_params_from(module) - global_min = distributed_utils.mpi_min(torch.sum(flat_params)) - global_max = distributed_utils.mpi_max(torch.sum(flat_params)) - assert torch.allclose(global_min, global_max), f'{key} not synced.' - - def compute_surrogate( - self, - adv: torch.Tensor, - cost_adv: torch.Tensor, - ) -> torch.Tensor: - """Compute surrogate loss. - - Policy Gradient only use reward advantage. - - Args: - adv (torch.Tensor): reward advantage - cost_adv (torch.Tensor): cost advantage - """ - return adv - 0.0 * cost_adv - - # pylint: disable-next=too-many-arguments - def compute_loss_pi( - self, - obs: torch.Tensor, - act: torch.Tensor, - log_p: torch.Tensor, - adv: torch.Tensor, - ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: - r"""Computing pi/actor loss. - - In Policy Gradient, the loss is defined as: - - .. math:: - - L = -\mathbb{E}_{s_t \sim \rho_\theta} \left[ - \sum_{t=0}^T \left( \frac{\pi_\theta ^{'}(a_t|s_t)}{\pi_\theta(a_t|s_t)} \right) - \left( \sum_{t'=t}^T \gamma^{t'-t} r_{t'} \right) - \right] - - where :math:`\rho_\theta` is the policy distribution, :math:`\pi_\theta` is the parameters of policy network, - :math:`a_t` is the action at time step :math:`t`, :math:`s_t` is the observation at time step :math:`t`, - :math:`\gamma` is the discount factor, :math:`r_{t'}` is the reward at time step :math:`t'`. - - Args: - obs (torch.Tensor): ``observation`` stored in buffer. - act (torch.Tensor): ``action`` stored in buffer. - log_p (torch.Tensor): ``log probability`` of action stored in buffer. - adv (torch.Tensor): ``advantage`` stored in buffer. - """ - # policy loss - dist, _log_p = self.actor_critic.actor(obs, act) - ratio = torch.exp(_log_p - log_p) - - loss_pi = -(ratio * adv).mean() - # useful extra info - approx_kl = (0.5 * (dist.mean - act) ** 2 / dist.stddev**2).mean().item() + # log information about actor + self._logger.register_key('Loss/Loss_pi', delta=True) + self._logger.register_key('Value/Adv') - # compute policy's entropy - ent = dist.entropy().mean().item() + # log information about critic + self._logger.register_key('Loss/Loss_reward_critic', delta=True) + self._logger.register_key('Value/reward') - pi_info = {'kl': approx_kl, 'ent': ent, 'ratio': ratio.mean().item()} + if self._cfgs.use_cost: + # log information about cost critic + self._logger.register_key('Loss/Loss_cost_critic', delta=True) + self._logger.register_key('Value/cost') - return loss_pi, pi_info + self._logger.register_key('Time/Total') + self._logger.register_key('Time/Rollout') + self._logger.register_key('Time/Update') + self._logger.register_key('Time/Epoch') + self._logger.register_key('Time/FPS') - def learn(self) -> ConstraintActorCritic: + def learn(self) -> None: """This is main function for algorithm update, divided into the following steps: - :meth:`rollout`: collect interactive data from environment. - :meth:`update`: perform actor/critic updates. - :meth:`log`: epoch/update information for visualization and terminal log print. 
""" - # main loop: collect experience in env and update/log each epoch - for epoch in range(self.cfgs.epochs): - self.epoch_time = time.time() - # update internals of AC - if self.cfgs.exploration_noise_anneal: - self.actor_critic.anneal_exploration(frac=epoch / self.cfgs.epochs) - # collect data from environment - self.env.set_rollout_cfgs( - local_steps_per_epoch=self.local_steps_per_epoch, - use_cost=self.cfgs.use_cost, - ) - self.env.on_policy_roll_out( - self.actor_critic, - self.buf, - self.logger, - ) - # update: actor, critic, running statistics - self.update() - # log and store information - self.log(epoch) - # check if all models own the same parameter values - if epoch % self.cfgs.check_freq == 0: - self.check_distributed_parameters() - # save model to disk - if (epoch + 1) % self.cfgs.save_freq == 0: - self.logger.torch_save() - - # close opened files to avoid number of open files overflow - self.logger.close() - return self.actor_critic - - def log(self, epoch: int) -> None: - """Log info about epoch. - - .. list-table:: - - * - Things to log - - Description - * - Train/Epoch - - Current epoch. - * - Metrics/EpCost - - Average cost of the epoch. - * - Metrics/EpCost - - Average cost of the epoch. - * - Metrics/EpRet - - Average return of the epoch. - * - Metrics/EpLen - - Average length of the epoch. - * - Values/V - - Average value in :meth:`roll_out()` (from critic network) of the epoch. - * - Values/C - - Average cost in :meth:`roll_out()` (from critic network) of the epoch. - * - Values/Adv - - Average advantage in :meth:`roll_out()` of the epoch. - * - Loss/Loss_pi - - Loss of the policy network. - * - Loss/Delta_loss_pi - - Delta loss of the policy network. - * - Loss/Loss_reward_critic - - Loss of the value network. - * - Loss/Delta_loss_reward_critic - - Delta loss of the value network. - * - Loss/Loss_cost_critic - - Loss of the cost network. - * - Loss/Delta_loss_cost_critic - - Delta loss of the cost network. - * - Train/Entropy - - Entropy of the policy network. - * - Train/KL - - KL divergence of the policy network. - * - Train/StopIters - - Number of iterations of the policy network. - * - Train/PolicyRatio - - Ratio of the policy network. - * - Train/LR - - Learning rate of the policy network. - * - Misc/Seed - - Seed of the experiment. - * - Misc/RewScaleMean - - Mean of the reward scale. - * - Misc/RewScaleStddev - - Std of the reward scale. - * - Misc/ExplorationNoisestd - - Std of the exploration noise. - * - Misc/TotalEnvSteps - - Total steps of the experiment. - * - Time - - Total time. - * - FPS - - Frames per second of the epoch. - - Args: - epoch (int): current epoch. 
- """ - total_env_steps = (epoch + 1) * self.cfgs.steps_per_epoch - fps = self.cfgs.steps_per_epoch / (time.time() - self.epoch_time) - # step the actor learning rate scheduler if provided - if self.scheduler and self.cfgs.linear_lr_decay: - current_lr = self.scheduler.get_last_lr()[0] - self.scheduler.step() - else: - current_lr = self.cfgs.actor_lr - - self.logger.store( - **{ - 'Train/Epoch': epoch + 1, - 'Train/LR': current_lr, - 'TotalEnvSteps': total_env_steps, - 'Time': (time.time() - self.start_time), - 'FPS': fps, - } - ) + start_time = time.time() + self._logger.log('INFO: Start training') - if self.cfgs.env_cfgs.normalized_rew: - reward_norm_mean = self.env.rew_normalizer.mean.mean().item() - reward_norm_stddev = self.env.rew_normalizer.std.mean().item() - self.logger.store( - **{ - 'Misc/RewScaleMean': reward_norm_mean, - 'Misc/RewScaleStddev': reward_norm_stddev, - } - ) + for epoch in range(self._cfgs.epochs): + epoch_time = time.time() - if self.cfgs.exploration_noise_anneal: - noise_std = self.actor_critic.actor.std - self.logger.store( - **{ - 'Misc/ExplorationNoiseStd': noise_std, - } - ) + # if self._cfgs.exploration_noise_anneal: + # self._actor_critic.anneal_exploration(frac=epoch / self._cfgs.epochs) - if self.cfgs.model_cfgs.actor_type == 'gaussian_learning': - self.logger.store( - **{ - 'Misc/ExplorationNoiseStd': self.actor_critic.actor.std, - } + roll_out_time = time.time() + self._env.roll_out( + steps_per_epoch=self._steps_per_epoch, + agent=self._actor_critic, + buffer=self._buf, + logger=self._logger, ) + self._logger.store(**{'Time/Rollout': time.time() - roll_out_time}) - self.algorithm_specific_logs() - self.logger.dump_tabular() - - # pylint: disable-next=too-many-locals - def update(self) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]]: - r"""Update actor, critic, running statistics, following next steps: - - - Get the ``raw data`` and ``processed data`` from buffer - - .. note:: + update_time = time.time() + self._update() + self._logger.store(**{'Time/Update': time.time() - update_time}) - ``raw data`` is the data from environment, while ``processed data`` is the data after pre-processing. + self._actor_critic.actor_scheduler.step() + if self._cfgs.exploration_noise_anneal: + self._actor_critic.annealing(epoch) - .. list-table:: - - * - obs - - ``observaion`` stored in buffer. - * - act - - ``action`` stored in buffer. - * - target_v - - ``target value`` stored in buffer. - * - target_c - - ``target cost`` stored in buffer. - * - log_p - - ``log probability`` stored in buffer. - * - adv - - ``estimated advantage`` (e.g. **GAE**) stored in buffer. - * - cost_adv - - ``estimated cost advantage`` (e.g. **GAE**) stored in buffer. + self._logger.store( + **{ + 'TotalEnvSteps': (epoch + 1) * self._cfgs.steps_per_epoch, + 'Time/FPS': self._cfgs.steps_per_epoch / (time.time() - epoch_time), + 'Time/Total': (time.time() - start_time), + 'Time/Epoch': (time.time() - epoch_time), + 'Train/Epoch': epoch, + 'Train/LR': self._actor_critic.actor_scheduler.get_last_lr()[0], + } + ) - - Update value net by :meth:`update_value_net()`. - - Update cost net by :meth:`update_cost_net()`. - - Update policy net by :meth:`update_policy_net()`. + self._logger.dump_tabular() - The cost and value critic network will be updated ``critic_iters`` times (always 40), - while the policy network will be updated ``actor_iters`` times (always 80). 
- The basic process of each update is as follows: + # save model to disk + if (epoch + 1) % self._cfgs.save_freq == 0: + self._logger.torch_save() - #. Get the mini-batch data from buffer. - #. Get the loss of network. - #. Update the network by loss. - #. Repeat steps 2, 3 until the number of mini-batch data is used up. + self._logger.close() - """ - # get the data from buffer - data = self.buf.get() - obs, act, log_p, target_v, target_c, adv, cost_adv = ( + def _update(self) -> None: + data = self._buf.get() + obs, act, logp, target_value_r, target_value_c, adv_r, adv_c = ( data['obs'], data['act'], data['logp'], @@ -521,220 +197,135 @@ def update(self) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]]: data['adv_r'], data['adv_c'], ) - # get the loss before - loss_pi_before, loss_v_before = self.loss_record.get_mean('loss_pi', 'loss_v') - if self.cfgs.use_cost: - loss_c_before = self.loss_record.get_mean('loss_c') - self.loss_record.reset('loss_pi', 'loss_v', 'loss_c') - # compute the old distribution of policy net. - old_dist = self.actor_critic.actor(obs) - - # load the data into the data loader. - dataset = torch.utils.data.TensorDataset(obs, act, target_v, target_c, log_p, adv, cost_adv) - loader = torch.utils.data.DataLoader( - dataset, batch_size=self.cfgs.num_mini_batches, shuffle=True + + original_obs = obs + old_distribution = self._actor_critic.actor(obs) + + dataloader = DataLoader( + dataset=TensorDataset(obs, act, logp, target_value_r, target_value_c, adv_r, adv_c), + batch_size=self._cfgs.num_mini_batches, + shuffle=True, ) - # update the value net, cost net and policy net for several times. - for i in range(self.cfgs.actor_iters): - for _, (obs_b, act_b, target_v_b, target_c_b, log_p_b, adv_b, cost_adv_b) in enumerate( - loader - ): - # update the value net. - self.update_value_net(obs_b, target_v_b) - # update the cost net, if use cost. - if self.cfgs.use_cost: - self.update_cost_net(obs_b, target_c_b) - # update the policy net. - self.update_policy_net(obs_b, act_b, log_p_b, adv_b, cost_adv_b) - # compute the new distribution of policy net. - new_dist = self.actor_critic.actor(obs) - # compute the KL divergence between old and new distribution. - torch_kl = ( - torch.distributions.kl.kl_divergence(old_dist, new_dist) + for i in range(self._cfgs.actor_iters): + for ( + obs, + act, + logp, + target_value_r, + target_value_c, + adv_r, + adv_c, + ) in dataloader: + self._update_rewrad_critic(obs, target_value_r) + if self._cfgs.use_cost: + self._update_cost_critic(obs, target_value_c) + self._update_actor(obs, act, logp, adv_r, adv_c) + + new_distribution = self._actor_critic.actor(original_obs) + + kl = ( + torch.distributions.kl.kl_divergence(old_distribution, new_distribution) .sum(-1, keepdim=True) .mean() .item() ) - torch_kl = distributed_utils.mpi_avg(torch_kl) - # if the KL divergence is larger than the target KL divergence, stop the update. - if self.cfgs.kl_early_stopping and torch_kl > self.cfgs.target_kl: - self.logger.log(f'KL early stop at the {i+1} th step.') + kl = distributed.dist_avg(kl) + + if self._cfgs.kl_early_stopping and kl > self._cfgs.target_kl: + self._logger.log(f'Early stopping at iter {i} due to reaching max kl') break - # log the information. 
- loss_pi, loss_v = self.loss_record.get_mean('loss_pi', 'loss_v') - self.logger.store( - **{ - 'Loss/Loss_pi': loss_pi, - 'Loss/Delta_loss_pi': loss_pi - loss_pi_before, - 'Train/StopIter': i + 1, - 'Values/Adv': adv.mean().item(), - 'Train/KL': torch_kl, - 'Loss/Delta_loss_reward_critic': loss_v - loss_v_before, - 'Loss/Loss_reward_critic': loss_v, - } - ) - if self.cfgs.use_cost: - loss_c = self.loss_record.get_mean('loss_c') - self.logger.store( - **{ - 'Loss/Delta_loss_cost_critic': loss_c - loss_c_before, - 'Loss/Loss_cost_critic': loss_c, - } - ) - return data - # pylint: disable-next=too-many-locals,too-many-arguments - def update_policy_net( - self, - obs: torch.Tensor, - act: torch.Tensor, - log_p: torch.Tensor, - adv: torch.Tensor, - cost_adv: torch.Tensor, - ) -> None: - r"""Update policy network under a double for loop. - - The pseudo code is shown below: - - .. code-block:: python - - for _ in range(self.cfgs.actor_iters): - for _ in range(self.cfgs.num_mini_batches): - # Get mini-batch data - # Compute loss - # Update network - - .. warning:: - For some ``KL divergence`` based algorithms (e.g. TRPO, CPO, etc.), - the ``KL divergence`` between the old policy and the new policy is calculated. - And the ``KL divergence`` is used to determine whether the update is successful. - If the ``KL divergence`` is too large, the update will be terminated. - - Args: - obs (torch.Tensor): ``observation`` stored in buffer. - act (torch.Tensor): ``action`` stored in buffer. - log_p (torch.Tensor): ``log_p`` stored in buffer. - adv (torch.Tensor): ``advantage`` stored in buffer. - cost_adv (torch.Tensor): ``cost_advantage`` stored in buffer. - """ - # process the advantage function. - processed_adv = self.compute_surrogate(adv=adv, cost_adv=cost_adv) - # compute the loss of policy net. - loss_pi, pi_info = self.compute_loss_pi(obs=obs, act=act, log_p=log_p, adv=processed_adv) - # log the loss of policy net. - self.loss_record.append(loss_pi=loss_pi.mean().item()) - # update the policy net. - self.actor_optimizer.zero_grad() - # backward the loss of policy net. - loss_pi.backward() - # clip the gradient of policy net. - if self.cfgs.use_max_grad_norm: - torch.nn.utils.clip_grad_norm_( - self.actor_critic.actor.parameters(), self.cfgs.max_grad_norm - ) - # average the gradient of policy net. - distributed_utils.mpi_avg_grads(self.actor_critic.actor) - self.actor_optimizer.step() - self.logger.store( + self._logger.store( **{ - 'Train/Entropy': pi_info['ent'], - 'Train/PolicyRatio': pi_info['ratio'], + 'Train/StopIter': i + 1, + 'Value/Adv': adv_r.mean().item(), + 'Train/KL': kl, } ) - def update_value_net( - self, - obs: torch.Tensor, - target_v: torch.Tensor, - ) -> None: - r"""Update value network under a double for loop. - - The loss function is ``MSE loss``, which is defined in ``torch.nn.MSELoss``. - Specifically, the loss function is defined as: - - .. math:: - L = \frac{1}{N} \sum_{i=1}^N (\hat{V} - V)^2 + def _update_rewrad_critic(self, obs: torch.Tensor, target_value_r: torch.Tensor) -> None: + self._actor_critic.reward_critic_optimizer.zero_grad() + loss = nn.functional.mse_loss(self._actor_critic.reward_critic(obs)[0], target_value_r) - where :math:`\hat{V}` is the predicted cost and :math:`V` is the target cost. - The pseudo code is shown below: + if self._cfgs.use_critic_norm: + for param in self._actor_critic.reward_critic.parameters(): + loss += param.pow(2).sum() * self._cfgs.critic_norm_coeff - .. 
code-block:: python + loss.backward() - for _ in range(self.cfgs.actor_iters): - for _ in range(self.cfgs.num_mini_batches): - # Get mini-batch data - # Compute loss - # Update network - - Args: - obs (torch.Tensor): ``observation`` stored in buffer. - target_v (torch.Tensor): ``target_v`` stored in buffer. - """ - self.reward_critic_optimizer.zero_grad() - # compute the loss of value net. - loss_v = self.critic_loss_fn( - self.actor_critic.reward_critic(obs), - target_v, - ) - # add the norm of critic network parameters to the loss function. - if self.cfgs.use_critic_norm: - for param in self.actor_critic.reward_critic.parameters(): - loss_v += param.pow(2).sum() * self.cfgs.critic_norm_coeff - # log the loss of value net. - self.loss_record.append(loss_v=loss_v.mean().item()) - # backward - loss_v.backward() - # clip the gradient - if self.cfgs.use_max_grad_norm: + if self._cfgs.use_max_grad_norm: torch.nn.utils.clip_grad_norm_( - self.actor_critic.reward_critic.parameters(), self.cfgs.max_grad_norm + self._actor_critic.reward_critic.parameters(), self._cfgs.max_grad_norm ) - distributed_utils.mpi_avg_grads(self.actor_critic.reward_critic) - self.reward_critic_optimizer.step() + distributed.avg_grads(self._actor_critic.reward_critic) + self._actor_critic.reward_critic_optimizer.step() - def update_cost_net(self, obs: torch.Tensor, target_c: torch.Tensor) -> None: - r"""Update cost network under a double for loop. + self._logger.store(**{'Loss/Loss_reward_critic': loss.mean().item()}) - The loss function is ``MSE loss``, which is defined in ``torch.nn.MSELoss``. - Specifically, the loss function is defined as: + def _update_cost_critic(self, obs: torch.Tensor, target_value_c: torch.Tensor) -> None: + self._actor_critic.cost_critic_optimizer.zero_grad() + loss = nn.functional.mse_loss(self._actor_critic.cost_critic(obs)[0], target_value_c) - .. math:: - L = \frac{1}{N} \sum_{i=1}^N (\hat{C} - C)^2 + if self._cfgs.use_critic_norm: + for param in self._actor_critic.cost_critic.parameters(): + loss += param.pow(2).sum() * self._cfgs.critic_norm_coeff - where :math:`\hat{C}` is the predicted cost and :math:`C` is the target cost. - The pseudo code is shown below: + loss.backward() - .. code-block:: python + if self._cfgs.use_max_grad_norm: + torch.nn.utils.clip_grad_norm_( + self._actor_critic.cost_critic.parameters(), self._cfgs.max_grad_norm + ) + distributed.avg_grads(self._actor_critic.cost_critic) + self._actor_critic.cost_critic_optimizer.step() - for _ in range(self.cfgs.actor_iters): - for _ in range(self.cfgs.num_mini_batches): - # Get mini-batch data - # Compute loss - # Update network + self._logger.store(**{'Loss/Loss_cost_critic': loss.mean().item()}) - Args: - obs (torch.Tensor): ``observation`` stored in buffer. - target_c (torch.Tensor): ``target_c`` stored in buffer. - """ - self.cost_critic_optimizer.zero_grad() - # compute the loss of cost net. - loss_c = self.critic_loss_fn( - self.actor_critic.cost_critic(obs), - target_c, - ) - # add the norm of critic network parameters to the loss function. - if self.cfgs.use_critic_norm: - for param in self.actor_critic.cost_critic.parameters(): - loss_c += param.pow(2).sum() * self.cfgs.critic_norm_coeff - # log the loss. - self.loss_record.append(loss_c=loss_c.mean().item()) - # backward. - loss_c.backward() - # clip the gradient. 
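Both critic updates in the hunk above follow the same recipe: MSE loss against the bootstrapped target, an optional L2 penalty on the critic weights (`use_critic_norm`), gradient clipping, gradient averaging across processes, then an optimizer step. A single-process sketch with a toy critic; the network sizes and coefficients are placeholders:

.. code-block:: python

    import torch
    from torch import nn

    critic = nn.Sequential(nn.Linear(8, 64), nn.Tanh(), nn.Linear(64, 1))
    optimizer = torch.optim.Adam(critic.parameters(), lr=3e-4)

    obs = torch.randn(32, 8)
    target_value = torch.randn(32, 1)

    optimizer.zero_grad()
    loss = nn.functional.mse_loss(critic(obs), target_value)

    # optional L2 regularization on the critic parameters
    critic_norm_coeff = 0.001
    for param in critic.parameters():
        loss = loss + param.pow(2).sum() * critic_norm_coeff

    loss.backward()
    torch.nn.utils.clip_grad_norm_(critic.parameters(), max_norm=40.0)
    optimizer.step()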
- if self.cfgs.use_max_grad_norm: + def _update_actor( # pylint: disable=too-many-arguments + self, + obs: torch.Tensor, + act: torch.Tensor, + logp: torch.Tensor, + adv_r: torch.Tensor, + adv_c: torch.Tensor, + ) -> None: + adv = self._compute_adv_surrogate(adv_r, adv_c) + loss, info = self._loss_pi(obs, act, logp, adv) + self._actor_critic.actor_optimizer.zero_grad() + loss.backward() + if self._cfgs.use_max_grad_norm: torch.nn.utils.clip_grad_norm_( - self.actor_critic.cost_critic.parameters(), self.cfgs.max_grad_norm + self._actor_critic.actor.parameters(), self._cfgs.max_grad_norm ) - distributed_utils.mpi_avg_grads(self.actor_critic.cost_critic) - self.cost_critic_optimizer.step() + distributed.avg_grads(self._actor_critic.actor) + self._actor_critic.actor_optimizer.step() + self._logger.store( + **{ + 'Train/Entropy': info['entrophy'], + 'Train/PolicyRatio': info['ratio'], + 'Train/PolicyStd': info['std'], + 'Loss/Loss_pi': loss.mean().item(), + } + ) + + def _compute_adv_surrogate( # pylint: disable=unused-argument + self, adv_r: torch.Tensor, adv_c: torch.Tensor + ) -> torch.Tensor: + return adv_r + + def _loss_pi( + self, + obs: torch.Tensor, + act: torch.Tensor, + logp: torch.Tensor, + adv: torch.Tensor, + ) -> Tuple[torch.Tensor, Dict[str, float]]: + distribution = self._actor_critic.actor(obs) + logp_ = self._actor_critic.actor.log_prob(act) + std = self._actor_critic.actor.std + ratio = torch.exp(logp_ - logp) + loss = -(ratio * adv).mean() + entrophy = distribution.entropy().mean().item() + info = {'entrophy': entrophy, 'ratio': ratio.mean().item(), 'std': std} + return loss, info diff --git a/omnisafe/algorithms/on_policy/base/ppo.py b/omnisafe/algorithms/on_policy/base/ppo.py index 6343903a3..0cb3f6e10 100644 --- a/omnisafe/algorithms/on_policy/base/ppo.py +++ b/omnisafe/algorithms/on_policy/base/ppo.py @@ -14,7 +14,7 @@ # ============================================================================== """Implementation of the PPO algorithm.""" -from typing import NamedTuple, Tuple +from typing import Dict, Tuple import torch @@ -32,31 +32,9 @@ class PPO(PolicyGradient): - URL: `PPO `_ """ - def __init__(self, env_id: str, cfgs: NamedTuple) -> None: - """Initialize Proximal Policy Optimization. - - .. note:: - The ``clip`` parameter is the clip parameter in PPO, - which is used to clip the ratio of the new policy and the old policy. - The ``clip`` parameter is set to 0.2 in the original paper. - - Args: - env_id (str): The environment id. - cfgs (NamedTuple): The configuration of the algorithm. - """ - super().__init__( - env_id=env_id, - cfgs=cfgs, - ) - - # pylint: disable-next=too-many-arguments - def compute_loss_pi( - self, - obs: torch.Tensor, - act: torch.Tensor, - log_p: torch.Tensor, - adv: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: + def _loss_pi( + self, obs: torch.Tensor, act: torch.Tensor, logp: torch.Tensor, adv: torch.Tensor + ) -> Tuple[torch.Tensor, Dict[str, float]]: r"""Computing pi/actor loss. In Proximal Policy Optimization, the loss is defined as: @@ -75,15 +53,14 @@ def compute_loss_pi( adv (torch.Tensor): ``advantage`` stored in buffer. cost_adv (torch.Tensor): ``cost advantage`` stored in buffer. 
""" - dist, _log_p = self.actor_critic.actor(obs, act) - # importance ratio - ratio = torch.exp(_log_p - log_p) - ratio_clip = torch.clamp(ratio, 1 - self.cfgs.clip, 1 + self.cfgs.clip) - loss_pi = -(torch.min(ratio * adv, ratio_clip * adv)) - loss_pi -= self.cfgs.entropy_coef * dist.entropy().mean() + distribution = self._actor_critic.actor(obs) + logp_ = self._actor_critic.actor.log_prob(act) + std = self._actor_critic.actor.std + ratio = torch.exp(logp_ - logp) + ratio_cliped = torch.clamp(ratio, 1 - self._cfgs.clip, 1 + self._cfgs.clip) + loss = -torch.min(ratio * adv, ratio_cliped * adv).mean() + loss += self._cfgs.entropy_coef * distribution.entropy().mean() # useful extra info - approx_kl = (0.5 * (dist.mean - act) ** 2 / dist.stddev**2).mean().item() - ent = dist.entropy().mean().item() - pi_info = {'kl': approx_kl, 'ent': ent, 'ratio': ratio_clip.mean().item()} - - return loss_pi.mean(), pi_info + entrophy = distribution.entropy().mean().item() + info = {'entrophy': entrophy, 'ratio': ratio.mean().item(), 'std': std} + return loss, info diff --git a/omnisafe/algorithms/on_policy/base/trpo.py b/omnisafe/algorithms/on_policy/base/trpo.py index 11d8e4359..ebc19d2b2 100644 --- a/omnisafe/algorithms/on_policy/base/trpo.py +++ b/omnisafe/algorithms/on_policy/base/trpo.py @@ -14,15 +14,16 @@ # ============================================================================== """Implementation of the TRPO algorithm.""" -from typing import NamedTuple, Tuple +from typing import Tuple import torch +from torch.distributions import Distribution from omnisafe.algorithms import registry from omnisafe.algorithms.on_policy.base.natural_pg import NaturalPG -from omnisafe.utils import distributed_utils +from omnisafe.utils import distributed +from omnisafe.utils.math import conjugate_gradients from omnisafe.utils.tools import ( - conjugate_gradients, get_flat_gradients_from, get_flat_params_from, set_param_values_to_model, @@ -39,26 +40,21 @@ class TRPO(NaturalPG): - URL: `TRPO `_ """ - def __init__(self, env_id: str, cfgs: NamedTuple) -> None: - """Initialize Trust Region Policy Optimization. - - Args: - env_id (str): The environment id. - cfgs (NamedTuple): The configuration of the algorithm. 
- """ - super().__init__(env_id=env_id, cfgs=cfgs) + def _init_log(self) -> None: + super()._init_log() + self._logger.register_key('Misc/AcceptanceStep') # pylint: disable-next=too-many-arguments,too-many-locals,arguments-differ - def search_step_size( + def _search_step_size( self, - step_dir: torch.Tensor, - g_flat: torch.Tensor, - p_dist: torch.distributions.Distribution, + step_direction: torch.Tensor, + grad: torch.Tensor, + p_dist: Distribution, obs: torch.Tensor, act: torch.Tensor, - log_p: torch.Tensor, + logp: torch.Tensor, adv: torch.Tensor, - loss_pi_before: float, + loss_before: float, total_steps: int = 15, decay: float = 0.8, ) -> Tuple[torch.Tensor, int]: @@ -86,60 +82,58 @@ def search_step_size( # How far to go in a single update step_frac = 1.0 # Get old parameterized policy expression - _theta_old = get_flat_params_from(self.actor_critic.actor) + theta_old = get_flat_params_from(self._actor_critic.actor) # Change expected objective function gradient = expected_imrpove best this moment - expected_improve = g_flat.dot(step_dir) + expected_improve = grad.dot(step_direction) + expected_improve = torch.dot(grad, step_direction) # While not within_trust_region and not out of total_steps: - for j in range(total_steps): + for step in range(total_steps): # update theta params - new_theta = _theta_old + step_frac * step_dir + new_theta = theta_old + step_frac * step_direction # set new params as params of net - set_param_values_to_model(self.actor_critic.actor, new_theta) - # the stepNo this update accept - acceptance_step = j + 1 + set_param_values_to_model(self._actor_critic.actor, new_theta) with torch.no_grad(): - loss_pi, _ = self.compute_loss_pi(obs=obs, act=act, log_p=log_p, adv=adv) + loss, _ = self._loss_pi(obs, act, logp, adv) # compute KL distance between new and old policy - q_dist = self.actor_critic.actor(obs) + q_dist = self._actor_critic.actor(obs) # KL-distance of old p-dist and new q-dist, applied in KLEarlyStopping - torch_kl = torch.distributions.kl.kl_divergence(p_dist, q_dist).mean().item() + kl = torch.distributions.kl.kl_divergence(p_dist, q_dist).mean().item() + kl = distributed.dist_avg(kl) # real loss improve: old policy loss - new policy loss - loss_improve = loss_pi_before - loss_pi.item() + loss_improve = loss_before - loss.item() # average processes.... multi-processing style like: mpi_tools.mpi_avg(xxx) - torch_kl = distributed_utils.mpi_avg(torch_kl) - loss_improve = distributed_utils.mpi_avg(loss_improve) - menu = (expected_improve, loss_improve) - self.logger.log(f'Expected Improvement: {menu[0]} Actual: {menu[1]}') - if not torch.isfinite(loss_pi): - self.logger.log('WARNING: loss_pi not finite') + loss_improve = distributed.dist_avg(loss_improve) + self._logger.log(f'Expected Improvement: {expected_improve} Actual: {loss_improve}') + if not torch.isfinite(loss): + self._logger.log('WARNING: loss_pi not finite') elif loss_improve < 0: - self.logger.log('INFO: did not improve improve <0') - elif torch_kl > self.target_kl * 1.5: - self.logger.log('INFO: violated KL constraint.') + self._logger.log('INFO: did not improve improve <0') + elif kl > self._cfgs.target_kl * 1.5: + self._logger.log('INFO: violated KL constraint.') else: # step only if surrogate is improved and when within trust reg. 
- self.logger.log(f'Accept step at i={acceptance_step}') + acceptance_step = step + 1 + self._logger.log(f'Accept step at i={acceptance_step}') break step_frac *= decay else: - self.logger.log('INFO: no suitable step found...') - step_dir = torch.zeros_like(step_dir) + self._logger.log('INFO: no suitable step found...') + step_direction = torch.zeros_like(step_direction) acceptance_step = 0 - set_param_values_to_model(self.actor_critic.actor, _theta_old) + set_param_values_to_model(self._actor_critic.actor, theta_old) - return step_frac * step_dir, acceptance_step + return step_frac * step_direction, acceptance_step - # pylint: disable-next=too-many-locals,too-many-arguments - def update_policy_net( + def _update_actor( # pylint: disable=too-many-arguments,too-many-locals self, obs: torch.Tensor, act: torch.Tensor, - log_p: torch.Tensor, - adv: torch.Tensor, - cost_adv: torch.Tensor, + logp: torch.Tensor, + adv_r: torch.Tensor, + adv_c: torch.Tensor, ) -> None: """Update policy network. @@ -155,75 +149,58 @@ def update_policy_net( Args: obs (torch.Tensor): The observation tensor. act (torch.Tensor): The action tensor. - log_p (torch.Tensor): The log probability of the action. - adv (torch.Tensor): The advantage tensor. - cost_adv (torch.Tensor): The cost advantage tensor. + logp (torch.Tensor): The log probability of the action. + adv_r (torch.Tensor): The advantage tensor. + adv_c (torch.Tensor): The cost advantage tensor. """ - # get loss and info values before update - self.fvp_obs = obs[::4] - theta_old = get_flat_params_from(self.actor_critic.actor) - self.actor_critic.actor.zero_grad() - # process the advantage function. - processed_adv = self.compute_surrogate(adv=adv, cost_adv=cost_adv) - # compute the loss of policy net. - loss_pi, pi_info = self.compute_loss_pi(obs=obs, act=act, log_p=log_p, adv=processed_adv) - loss_pi_before = distributed_utils.mpi_avg(loss_pi.item()) - p_dist = self.actor_critic.actor(obs) - # train policy with multiple steps of gradient descent - loss_pi.backward() - # average grads across MPI processes - distributed_utils.mpi_avg_grads(self.actor_critic.actor) - g_flat = get_flat_gradients_from(self.actor_critic.actor) - g_flat *= -1 - - # pylint: disable-next=invalid-name - x = conjugate_gradients(self.Fvp, g_flat, self.cg_iters) + self._fvp_obs = obs[::4] + theta_old = get_flat_params_from(self._actor_critic.actor) + self._actor_critic.actor.zero_grad() + adv = self._compute_adv_surrogate(adv_r, adv_c) + loss, info = self._loss_pi(obs, act, logp, adv) + loss_before = distributed.dist_avg(loss).item() + p_dist = self._actor_critic.actor(obs) + + loss.backward() + distributed.avg_grads(self._actor_critic.actor) + + grad = -get_flat_gradients_from(self._actor_critic.actor) + x = conjugate_gradients(self._fvp, grad, self._cfgs.cg_iters) assert torch.isfinite(x).all(), 'x is not finite' - # note that xHx = g^T x, but calculating xHx is faster than g^T x - xHx = torch.dot(x, self.Fvp(x)) # equivalent to : g^T x - assert xHx.item() >= 0, 'No negative values' - - # perform descent direction - alpha = torch.sqrt(2 * self.target_kl / (xHx + 1e-8)) - step_direction = alpha * x + xHx = torch.dot(x, self._fvp(x)) + assert xHx.item() >= 0, 'xHx is negative' + alpha = torch.sqrt(2 * self._cfgs.target_kl / (xHx + 1e-8)) + step_direction = x * alpha assert torch.isfinite(step_direction).all(), 'step_direction is not finite' - # determine step direction and apply SGD step after grads where set - # TRPO uses custom backtracking line search - final_step_dir, accept_step = 
self.search_step_size( - step_dir=step_direction, - g_flat=g_flat, + step_direction, accept_step = self._search_step_size( + step_direction=step_direction, + grad=grad, p_dist=p_dist, - loss_pi_before=loss_pi_before, obs=obs, act=act, - log_p=log_p, + logp=logp, adv=adv, + loss_before=loss_before, ) - # update actor network parameters - new_theta = theta_old + final_step_dir - set_param_values_to_model(self.actor_critic.actor, new_theta) + theta_new = theta_old + step_direction + set_param_values_to_model(self._actor_critic.actor, theta_new) with torch.no_grad(): - q_dist = self.actor_critic.actor(obs) - kl = torch.distributions.kl.kl_divergence(p_dist, q_dist).mean().item() - loss_pi, pi_info = self.compute_loss_pi( - obs=obs, act=act, log_p=log_p, adv=processed_adv - ) - self.loss_record.append(loss_pi=loss_pi.mean().item()) - - self.logger.store( + loss, info = self._loss_pi(obs, act, logp, adv) + + self._logger.store( **{ - 'Values/Adv': adv.mean().item(), - 'Train/Entropy': pi_info['ent'], - 'Train/KL': kl, - 'Train/PolicyRatio': pi_info['ratio'], - 'Misc/AcceptanceStep': accept_step, + 'Train/Entropy': info['entrophy'], + 'Train/PolicyRatio': info['ratio'], + 'Train/PolicyStd': info['std'], + 'Loss/Loss_pi': loss.mean().item(), 'Misc/Alpha': alpha.item(), - 'Misc/FinalStepNorm': torch.norm(final_step_dir).mean().item(), + 'Misc/FinalStepNorm': torch.norm(step_direction).mean().item(), 'Misc/xHx': xHx.item(), - 'Misc/gradient_norm': torch.norm(g_flat).mean().item(), + 'Misc/gradient_norm': torch.norm(grad).mean().item(), 'Misc/H_inv_g': x.norm().item(), + 'Misc/AcceptanceStep': accept_step, } ) diff --git a/omnisafe/algorithms/on_policy/early_terminated/ppo_early_terminated.py b/omnisafe/algorithms/on_policy/early_terminated/ppo_early_terminated.py index c82f127c7..508773acf 100644 --- a/omnisafe/algorithms/on_policy/early_terminated/ppo_early_terminated.py +++ b/omnisafe/algorithms/on_policy/early_terminated/ppo_early_terminated.py @@ -14,8 +14,7 @@ # ============================================================================== """Implementation of the early terminated algorithm using PPO.""" -from typing import NamedTuple - +from omnisafe.adapter import EarlyTerminatedAdapter from omnisafe.algorithms import registry from omnisafe.algorithms.on_policy.base.ppo import PPO @@ -30,11 +29,8 @@ class PPOEarlyTerminated(PPO): URL: `Safe Exploration by Solving Early Terminated MDP `_ """ - def __init__(self, env_id: str, cfgs: NamedTuple) -> None: - """Initialize PPO_Earyly_Terminated. - - Args: - env_id (str): The environment id. - cfgs (NamedTuple): The configuration of the algorithm. 
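The TRPO hunk above sizes the update from the conjugate-gradient solution ``x ~= H^{-1} g``: the largest step whose quadratic KL model stays inside the trust region is ``sqrt(2 * target_kl / x^T H x) * x``. A sketch with a toy Fisher-vector product; the real ``_fvp`` differentiates the policy KL, here ``H = 2I`` just to make the code run:

.. code-block:: python

    import torch

    def fvp(v: torch.Tensor) -> torch.Tensor:
        """Toy Fisher-vector product: H = 2I, so Hv = 2v."""
        return 2.0 * v

    target_kl = 0.01
    x = torch.tensor([0.3, -0.1, 0.2])          # conjugate-gradient solution H^{-1} g

    xHx = torch.dot(x, fvp(x))                  # quadratic form x^T H x
    alpha = torch.sqrt(2 * target_kl / (xHx + 1e-8))
    step_direction = alpha * x                  # largest step inside the KL trust region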
- """ - super().__init__(env_id=env_id, cfgs=cfgs) + def _init_env(self) -> None: + self._env = EarlyTerminatedAdapter( + self._env_id, self._cfgs.num_envs, self._seed, self._cfgs + ) + self._steps_per_epoch = self._cfgs.steps_per_epoch diff --git a/omnisafe/algorithms/on_policy/early_terminated/ppo_lag_early_terminated.py b/omnisafe/algorithms/on_policy/early_terminated/ppo_lag_early_terminated.py index 54602dfa9..1b546b984 100644 --- a/omnisafe/algorithms/on_policy/early_terminated/ppo_lag_early_terminated.py +++ b/omnisafe/algorithms/on_policy/early_terminated/ppo_lag_early_terminated.py @@ -14,8 +14,8 @@ # ============================================================================== """Implementation of the Lagrange version of the early terminated algorithm using PPOLag.""" -from typing import NamedTuple +from omnisafe.adapter import EarlyTerminatedAdapter from omnisafe.algorithms import registry from omnisafe.algorithms.on_policy.naive_lagrange.ppo_lag import PPOLag @@ -30,11 +30,8 @@ class PPOLagEarlyTerminated(PPOLag): URL: `Safe Exploration by Solving Early Terminated MDP `_ """ - def __init__(self, env_id: str, cfgs: NamedTuple) -> None: - """Initialize PPO_Lag_Earyly_Terminated. - - Args: - env_id (str): The environment id. - cfgs (NamedTuple): The configuration of the algorithm. - """ - super().__init__(env_id=env_id, cfgs=cfgs) + def _init_env(self) -> None: + self._env = EarlyTerminatedAdapter( + self._env_id, self._cfgs.num_envs, self._seed, self._cfgs + ) + self._steps_per_epoch = self._cfgs.steps_per_epoch diff --git a/omnisafe/algorithms/on_policy/first_order/cup.py b/omnisafe/algorithms/on_policy/first_order/cup.py index 2c720ebe0..3f0969685 100644 --- a/omnisafe/algorithms/on_policy/first_order/cup.py +++ b/omnisafe/algorithms/on_policy/first_order/cup.py @@ -14,99 +14,49 @@ # ============================================================================== """Implementation of the CUP algorithm.""" -from typing import Dict, NamedTuple, Tuple - import torch +from torch.distributions import Normal +from torch.utils.data import DataLoader, TensorDataset from omnisafe.algorithms import registry from omnisafe.algorithms.on_policy.base.ppo import PPO from omnisafe.common.lagrange import Lagrange -from omnisafe.common.record_queue import RecordQueue -from omnisafe.utils import distributed_utils +from omnisafe.utils import distributed +from omnisafe.utils.config import Config @registry.register -class CUP(PPO, Lagrange): +class CUP(PPO): """The Constrained Update Projection (CUP) Approach to Safe Policy Optimization. References: - Title: Constrained Update Projection Approach to Safe Policy Optimization - Authors: Long Yang, Jiaming Ji, Juntao Dai, Linrui Zhang, Binbin Zhou, Pengfei Li, - Yaodong Yang, Gang Pan. + Yaodong Yang, Gang Pan. - URL: `CUP `_ """ - def __init__(self, env_id: str, cfgs: NamedTuple) -> None: - """Initialize CUP. - - CUP is a combination of :class:`PPO` and :class:`Lagrange` model. - - Args: - env_id (str): The environment id. - cfgs (NamedTuple): The configuration of the algorithm. 
- """ - PPO.__init__( - self, - env_id=env_id, - cfgs=cfgs, - ) - Lagrange.__init__( - self, - cost_limit=self.cfgs.lagrange_cfgs.cost_limit, - lagrangian_multiplier_init=self.cfgs.lagrange_cfgs.lagrangian_multiplier_init, - lambda_lr=self.cfgs.lagrange_cfgs.lambda_lr, - lambda_optimizer=self.cfgs.lagrange_cfgs.lambda_optimizer, - lagrangian_upper_bound=self.cfgs.lagrange_cfgs.lagrangian_upper_bound, - ) - self.lam = self.cfgs.lam - self.eta = self.cfgs.eta - self.max_ratio = 0 - self.min_ratio = 0 - self.p_dist = None - self.loss_record = RecordQueue('loss_pi', 'loss_v', 'loss_c', 'loss_pi_c', maxlen=100) - - def _specific_init_logs(self): - super()._specific_init_logs() - self.logger.register_key('Metrics/LagrangeMultiplier') - self.logger.register_key('Train/MaxRatio') - self.logger.register_key('Train/MinRatio') - self.logger.register_key('Loss/Loss_pi_c') - self.logger.register_key('Loss/Delta_loss_pi_c') - self.logger.register_key('Train/SecondStepStopIter') - self.logger.register_key('Train/SecondStepEntropy') - self.logger.register_key('Train/SecondStepPolicyRatio') - - def algorithm_specific_logs(self) -> None: - """Log the CUP specific information. - - .. list-table:: - - * - Things to log - - Description - * - Metrics/LagrangeMultiplier - - The Lagrange multiplier value in current epoch. - * - Train/MaxRatio - - The maximum ratio between the current policy and the old policy. - * - Train/MinRatio - - The minimum ratio between the current policy and the old policy. - """ - super().algorithm_specific_logs() - self.logger.store( - **{ - 'Metrics/LagrangeMultiplier': self.lagrangian_multiplier.item(), - 'Train/MaxRatio': self.max_ratio, - 'Train/MinRatio': self.min_ratio, - } - ) - - # pylint: disable-next=too-many-locals - def compute_loss_cost_performance( - self, - obs: torch.Tensor, - act: torch.Tensor, - log_p: torch.Tensor, - cost_adv: torch.Tensor, - ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: + def _init(self) -> None: + super()._init() + self._lagrange = Lagrange(**self._cfgs.lagrange_cfgs) + + def _init_log(self) -> None: + super()._init_log() + self._logger.register_key('Metrics/LagrangeMultiplier') + self._logger.register_key('Train/MaxRatio') + self._logger.register_key('Train/MinRatio') + self._logger.register_key('Loss/Loss_pi_c', delta=True) + self._logger.register_key('Train/SecondStepStopIter') + self._logger.register_key('Train/SecondStepEntropy') + self._logger.register_key('Train/SecondStepPolicyRatio') + + def __init__(self, env_id: str, cfgs: Config) -> None: + super().__init__(env_id, cfgs) + self._p_dist: Normal + self._max_ratio: float = 0.0 + self._min_ratio: float = 0.0 + + def _loss_pi_cost(self, obs, act, logp, adv_c): r"""Compute the performance of cost on this moment. Detailedly, we compute the KL divergence between the current policy and the old policy, @@ -134,107 +84,98 @@ def compute_loss_cost_performance( log_p (torch.Tensor): Log probability. cost_adv (torch.Tensor): Cost advantage. 
""" - dist, _log_p = self.actor_critic.actor(obs, act) - ratio = torch.exp(_log_p - log_p) + distribution = self._actor_critic.actor(obs) + logp_ = self._actor_critic.actor.log_prob(act) + std = self._actor_critic.actor.std + ratio = torch.exp(logp_ - logp) - kl_new_old = torch.distributions.kl.kl_divergence(dist, self.p_dist).sum(-1, keepdim=True) + kl = torch.distributions.kl_divergence(distribution, self._p_dist).sum(-1, keepdim=True) - coef = (1 - self.cfgs.buffer_cfgs.gamma * self.cfgs.buffer_cfgs.lam) / ( - 1 - self.cfgs.buffer_cfgs.gamma + coef = (1 - self._cfgs.buffer_cfgs.gamma * self._cfgs.buffer_cfgs.lam) / ( + 1 - self._cfgs.buffer_cfgs.gamma ) - cost_loss = (self.lagrangian_multiplier * coef * ratio * cost_adv + kl_new_old).mean() - self.loss_record.append(loss_pi_c=cost_loss.item()) + loss = (self._lagrange.lagrangian_multiplier * coef * ratio * adv_c + kl).mean() # useful extra info temp_max = torch.max(ratio).detach().mean().item() temp_min = torch.min(ratio).detach().mean().item() - if temp_max > self.max_ratio: - self.max_ratio = temp_max - if temp_min < self.min_ratio: - self.min_ratio = temp_min - approx_kl = 0.5 * (log_p - _log_p).mean().item() - ent = dist.entropy().mean().item() - pi_info = {'kl': approx_kl, 'ent': ent, 'ratio': ratio.mean().item()} - - return cost_loss, pi_info - - # pylint: disable-next=too-many-locals - def update(self) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]]: + if temp_max > self._max_ratio: + self._max_ratio = temp_max + if temp_min < self._min_ratio: + self._min_ratio = temp_min + entrophy = distribution.entropy().mean().item() + info = {'entrophy': entrophy, 'ratio': ratio.mean().item(), 'std': std} + + self._logger.store(**{'Loss/Loss_pi_c': loss.item()}) + + return loss, info + + def _update(self) -> None: """Update actor, critic, running statistics as we used in the :class:`PolicyGradient`. In addition, we also update the Lagrange multiplier parameter, by calling the :meth:`update_lagrange_multiplier` function. """ # note that logger already uses MPI statistics across all processes.. - Jc = self.logger.get_stats('Metrics/EpCost')[0] + Jc = self._logger.get_stats('Metrics/EpCost')[0] # first update Lagrange multiplier parameter - self.update_lagrange_multiplier(Jc) - # the first stage is to maximize reward. - data = PPO.update(self) - # the second stage is to minimize cost. - # get the loss before - loss_pi_c_before = self.loss_record.get_mean('loss_pi_c') - self.loss_record.reset('loss_pi_c') - obs, act, log_p, cost_adv = ( + self._lagrange.update_lagrange_multiplier(Jc) + + super()._update() + + data = self._buf.get() + obs, act, logp, adv_c = ( data['obs'], data['act'], data['logp'], data['adv_c'], ) + original_obs = obs with torch.no_grad(): - old_dist = self.actor_critic.actor(obs) - old_mean, old_std = old_dist.mean, old_dist.stddev - # load the data into the data loader. 
- dataset = torch.utils.data.TensorDataset(obs, act, log_p, cost_adv, old_mean, old_std) - loader = torch.utils.data.DataLoader( - dataset, batch_size=self.cfgs.num_mini_batches, shuffle=True + old_distribution = self._actor_critic.actor(obs) + old_mean = old_distribution.mean + old_std = old_distribution.stddev + + dataloader = DataLoader( + dataset=TensorDataset(obs, act, logp, adv_c, old_mean, old_std), + batch_size=self._cfgs.num_mini_batches, + shuffle=True, ) - # update the policy net several times - for i in range(self.cfgs.actor_iters): - for _, (obs_b, act_b, log_p_b, cost_adv_b, old_mean_b, old_std_b) in enumerate(loader): - # compute the old distribution of policy net. - self.p_dist = torch.distributions.Normal(old_mean_b, old_std_b) - # compute the loss of cost performance. - loss_pi_c, pi_info_c = self.compute_loss_cost_performance( - obs_b, act_b, log_p_b, cost_adv_b - ) - # update the policy net. - self.actor_optimizer.zero_grad() - # backward - loss_pi_c.backward() - # clip the gradient of policy net. - if self.cfgs.use_max_grad_norm: + for i in range(self._cfgs.actor_iters): + for obs, act, logp, adv_c, old_mean, old_std in dataloader: + self._p_dist = Normal(old_mean, old_std) + loss_cost, info = self._loss_pi_cost(obs, act, logp, adv_c) + self._actor_critic.actor_optimizer.zero_grad() + loss_cost.backward() + if self._cfgs.max_grad_norm is not None: torch.nn.utils.clip_grad_norm_( - self.actor_critic.actor.parameters(), self.cfgs.max_grad_norm + self._actor_critic.actor.parameters(), self._cfgs.max_grad_norm ) - # average the gradient of policy net. - distributed_utils.mpi_avg_grads(self.actor_critic.actor) - self.actor_optimizer.step() - # compute the new distribution of policy net. - new_dist = self.actor_critic.actor(obs) - # compute the KL divergence between old and new distribution. - torch_kl = ( - torch.distributions.kl.kl_divergence(old_dist, new_dist) + distributed.avg_grads(self._actor_critic.actor) + self._actor_critic.actor_optimizer.step() + + new_distribution = self._actor_critic.actor(original_obs) + + kl = ( + torch.distributions.kl.kl_divergence(old_distribution, new_distribution) .sum(-1, keepdim=True) .mean() .item() ) - torch_kl = distributed_utils.mpi_avg(torch_kl) - # if the KL divergence is larger than the target KL divergence, stop the update. - if self.cfgs.kl_early_stopping and torch_kl > self.cfgs.target_kl: - self.logger.log(f'KL early stop at the {i+1} th step in the second stage.') + kl = distributed.dist_avg(kl) + + if self._cfgs.kl_early_stopping and kl > self._cfgs.target_kl: + self._logger.log(f'Early stopping at iter {i} due to reaching max kl') break - loss_pi_c = self.loss_record.get_mean('loss_pi_c') - # log the information. 
- self.logger.store( + self._logger.store( **{ - 'Loss/Loss_pi_c': loss_pi_c, - 'Loss/Delta_loss_pi_c': loss_pi_c - loss_pi_c_before, + 'Metrics/LagrangeMultiplier': self._lagrange.lagrangian_multiplier.item(), + 'Train/MaxRatio': self._max_ratio, + 'Train/MinRatio': self._min_ratio, 'Train/SecondStepStopIter': i + 1, - 'Train/SecondStepEntropy': pi_info_c['ent'], - 'Train/SecondStepPolicyRatio': pi_info_c['ratio'], + 'Train/SecondStepEntropy': info['entrophy'], + 'Train/SecondStepPolicyRatio': info['ratio'], } ) - return data diff --git a/omnisafe/algorithms/on_policy/first_order/focops.py b/omnisafe/algorithms/on_policy/first_order/focops.py index 0f3202c83..0856f04a7 100644 --- a/omnisafe/algorithms/on_policy/first_order/focops.py +++ b/omnisafe/algorithms/on_policy/first_order/focops.py @@ -14,17 +14,21 @@ # ============================================================================== """Implementation of the FOCOPS algorithm.""" -from typing import Dict, NamedTuple, Tuple +from typing import Dict, Tuple import torch +from torch.distributions import Normal +from torch.utils.data import DataLoader, TensorDataset from omnisafe.algorithms import registry from omnisafe.algorithms.on_policy.base.policy_gradient import PolicyGradient from omnisafe.common.lagrange import Lagrange +from omnisafe.utils import distributed +from omnisafe.utils.config import Config @registry.register -class FOCOPS(PolicyGradient, Lagrange): +class FOCOPS(PolicyGradient): """The First Order Constrained Optimization in Policy Space (FOCOPS) algorithm. References: @@ -33,126 +37,55 @@ class FOCOPS(PolicyGradient, Lagrange): - URL: `FOCOPS `_ """ - def __init__(self, env_id: str, cfgs: NamedTuple) -> None: - """Initialize FOCOPS. - - FOCOPS is a combination of :class:`PolicyGradient` and :class:`Lagrange` model. - - Args: - env_id (str): The environment id. - cfgs (NamedTuple): The configuration of the algorithm. - """ - PolicyGradient.__init__( - self, - env_id=env_id, - cfgs=cfgs, + def _init(self) -> None: + super()._init() + self._lagrange = Lagrange(**self._cfgs.lagrange_cfgs) + + def _init_log(self) -> None: + super()._init_log() + self._logger.register_key('Metrics/LagrangeMultiplier') + + def __init__(self, env_id: str, cfgs: Config) -> None: + super().__init__(env_id, cfgs) + self._p_dist: Normal + + def _loss_pi( + self, obs: torch.Tensor, act: torch.Tensor, logp: torch.Tensor, adv: torch.Tensor + ) -> Tuple[torch.Tensor, Dict[str, float]]: + distribution = self._actor_critic.actor(obs) + logp_ = self._actor_critic.actor.log_prob(act) + std = self._actor_critic.actor.std + ratio = torch.exp(logp_ - logp) + + kl = torch.distributions.kl_divergence(distribution, self._p_dist).sum(-1, keepdim=True) + loss = (kl - (1 / self._cfgs.lam) * ratio * adv) * (kl.detach() <= self._cfgs.eta).type( + torch.float32 ) - Lagrange.__init__( - self, - cost_limit=self.cfgs.lagrange_cfgs.cost_limit, - lagrangian_multiplier_init=self.cfgs.lagrange_cfgs.lagrangian_multiplier_init, - lambda_lr=self.cfgs.lagrange_cfgs.lambda_lr, - lambda_optimizer=self.cfgs.lagrange_cfgs.lambda_optimizer, - lagrangian_upper_bound=self.cfgs.lagrange_cfgs.lagrangian_upper_bound, - ) - self.lam = self.cfgs.lam - self.eta = self.cfgs.eta - self.p_dist = None - - def _specific_init_logs(self): - super()._specific_init_logs() - self.logger.register_key('Metrics/LagrangeMultiplier') + loss = loss.mean() + loss -= self._cfgs.entropy_coef * distribution.entropy().mean() - def algorithm_specific_logs(self) -> None: - """Log the FOCOPS specific information. 
+ entrophy = distribution.entropy().mean().item() + info = {'entrophy': entrophy, 'ratio': ratio.mean().item(), 'std': std} + return loss, info - .. list-table:: - - * - Things to log - - Description - * - Metrics/LagrangeMultiplier - - The Lagrange multiplier value in current epoch. - """ - super().algorithm_specific_logs() - self.logger.store( - **{ - 'Metrics/LagrangeMultiplier': self.lagrangian_multiplier.item(), - } + def _compute_adv_surrogate(self, adv_r: torch.Tensor, adv_c: torch.Tensor) -> torch.Tensor: + return (adv_r - self._lagrange.lagrangian_multiplier * adv_c) / ( + 1 + self._lagrange.lagrangian_multiplier ) - # pylint: disable-next=too-many-arguments - def compute_loss_pi( - self, - obs: torch.Tensor, - act: torch.Tensor, - log_p: torch.Tensor, - adv: torch.Tensor, - ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: - r""" - Computing pi/actor loss. - In FOCOPS, the loss is defined as: - - .. math:: - :nowrap: - - \begin{eqnarray} - L = \nabla_\theta D_{K L}\left(\pi_\theta \| \pi_{\theta^{old}}\right)[s] - -\frac{1}{\eta} \underset{a \sim \pi_{\theta^{old}}} - {\mathbb{E}}\left[\frac{\nabla_\theta \pi_\theta(a \mid s)} - {\pi_{\theta^{old}}(a \mid s)}\left(A^{R}_{\pi_{\theta^{old}}}(s, a) - -\lambda A^C_{\pi_{\theta^{old}}}(s, a)\right)\right] - \end{eqnarray} - - where :math:`\eta` is a hyperparameter, :math:`\lambda` is the Lagrange multiplier, - :math:`A_{\pi_{\theta_k}}(s, a)` is the advantage function, - :math:`A^C_{\pi_{\theta_k}}(s, a)` is the cost advantage function, - :math:`\pi^*` is the optimal policy, and :math:`\pi_{\theta_k}` is the current policy. - """ - dist, _log_p = self.actor_critic.actor(obs, act) - ratio = torch.exp(_log_p - log_p) - - kl_new_old = torch.distributions.kl.kl_divergence(dist, self.p_dist).sum(-1, keepdim=True) - loss_pi = (kl_new_old - (1 / self.lam) * ratio * adv) * ( - kl_new_old.detach() <= self.eta - ).type(torch.float32) - loss_pi = loss_pi.mean() - loss_pi -= self.cfgs.entropy_coef * dist.entropy().mean() - - # useful extra info - approx_kl = 0.5 * (log_p - _log_p).mean().item() - ent = dist.entropy().mean().item() - pi_info = {'kl': approx_kl, 'ent': ent, 'ratio': ratio.mean().item()} - - return loss_pi, pi_info - - def compute_surrogate( - self, - adv: torch.Tensor, - cost_adv: torch.Tensor, - ) -> torch.Tensor: - """Compute surrogate loss. - - Policy Gradient only use reward advantage. - - Args: - adv (torch.Tensor): reward advantage - cost_adv (torch.Tensor): cost advantage - """ - return (adv - self.lagrangian_multiplier * cost_adv) / (1 + self.lagrangian_multiplier) - - # pylint: disable-next=too-many-locals - def update(self) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]]: + def _update(self) -> None: """Update actor, critic, running statistics as we used in the :class:`PolicyGradient`. In addition, we also update the Lagrange multiplier parameter, by calling the :meth:`update_lagrange_multiplier` function. """ # note that logger already uses MPI statistics across all processes.. 
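Putting the two FOCOPS pieces above together: the penalty-combined advantage ``(adv_r - nu * adv_c) / (1 + nu)`` feeds a loss that only acts where the new policy is still within ``eta`` (in KL) of the old one, scaled by the temperature ``lam``. A standalone sketch with invented batch data and coefficients:

.. code-block:: python

    import torch
    from torch.distributions import Normal, kl_divergence

    lam, eta, nu = 1.5, 0.02, 0.3       # temperature, KL threshold, Lagrange multiplier

    old_dist = Normal(torch.zeros(64, 2), torch.ones(64, 2))
    new_dist = Normal(0.05 * torch.ones(64, 2), torch.ones(64, 2))

    logp_old = torch.randn(64)
    logp_new = logp_old + 0.02 * torch.randn(64)
    adv_r, adv_c = torch.randn(64), torch.randn(64)

    # penalty-combined advantage, as in ``_compute_adv_surrogate``
    adv = (adv_r - nu * adv_c) / (1 + nu)

    ratio = torch.exp(logp_new - logp_old)
    kl = kl_divergence(new_dist, old_dist).sum(-1)

    # masked loss: only samples still inside the KL region contribute
    loss = ((kl - (1 / lam) * ratio * adv) * (kl.detach() <= eta).float()).mean()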
- Jc = self.logger.get_stats('Metrics/EpCost')[0] + Jc = self._logger.get_stats('Metrics/EpCost')[0] # first update Lagrange multiplier parameter - self.update_lagrange_multiplier(Jc) - data = self.buf.get() - obs, act, log_p, target_v, target_c, adv, cost_adv = ( + self._lagrange.update_lagrange_multiplier(Jc) + + data = self._buf.get() + obs, act, logp, target_value_r, target_value_c, adv_r, adv_c = ( data['obs'], data['act'], data['logp'], @@ -161,76 +94,58 @@ def update(self) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]]: data['adv_r'], data['adv_c'], ) - # get the loss before - loss_pi_before, loss_v_before = self.loss_record.get_mean('loss_pi', 'loss_v') - if self.cfgs.use_cost: - loss_c_before = self.loss_record.get_mean('loss_c') - self.loss_record.reset('loss_pi', 'loss_v', 'loss_c') + original_obs = obs with torch.no_grad(): - old_dist = self.actor_critic.actor(obs) - old_mean, old_std = old_dist.mean, old_dist.stddev - - # load the data into the data loader. - dataset = torch.utils.data.TensorDataset( - obs, act, target_v, target_c, log_p, adv, cost_adv, old_mean, old_std - ) - loader = torch.utils.data.DataLoader( - dataset, batch_size=self.cfgs.num_mini_batches, shuffle=True + old_distribution = self._actor_critic.actor(obs) + old_mean = old_distribution.mean + old_std = old_distribution.stddev + + dataloader = DataLoader( + dataset=TensorDataset( + obs, act, logp, target_value_r, target_value_c, adv_r, adv_c, old_mean, old_std + ), + batch_size=self._cfgs.num_mini_batches, + shuffle=True, ) - # update the value net, cost net and policy net for several times. - for i in range(self.cfgs.actor_iters): - for _, ( - obs_b, - act_b, - target_v_b, - target_c_b, - log_p_b, - adv_b, - cost_adv_b, - old_mean_b, - old_std_b, - ) in enumerate(loader): - # update the value net. - self.update_value_net(obs_b, target_v_b) - # update the cost net, if use cost. - if self.cfgs.use_cost: - self.update_cost_net(obs_b, target_c_b) - # update the policy net. - self.p_dist = torch.distributions.Normal(old_mean_b, old_std_b) - self.update_policy_net(obs_b, act_b, log_p_b, adv_b, cost_adv_b) - # compute the new distribution of policy net. - new_dist = self.actor_critic.actor(obs) - # compute the KL divergence between old and new distribution. - torch_kl = ( - torch.distributions.kl.kl_divergence(old_dist, new_dist) + for i in range(self._cfgs.actor_iters): + for ( + obs, + act, + logp, + target_value_r, + target_value_c, + adv_r, + adv_c, + old_mean, + old_std, + ) in dataloader: + self._update_rewrad_critic(obs, target_value_r) + if self._cfgs.use_cost: + self._update_cost_critic(obs, target_value_c) + + self._p_dist = Normal(old_mean, old_std) + self._update_actor(obs, act, logp, adv_r, adv_c) + + new_distribution = self._actor_critic.actor(original_obs) + + kl = ( + torch.distributions.kl.kl_divergence(old_distribution, new_distribution) .sum(-1, keepdim=True) .mean() .item() ) - # if the KL divergence is larger than the target KL divergence, stop the update. - if self.cfgs.kl_early_stopping and torch_kl > self.cfgs.target_kl: - self.logger.log(f'KL early stop at the {i+1} th step.') + kl = distributed.dist_avg(kl) + + if self._cfgs.kl_early_stopping and kl > self._cfgs.target_kl: + self._logger.log(f'Early stopping at iter {i} due to reaching max kl') break - # log the information. 
- loss_pi, loss_v = self.loss_record.get_mean('loss_pi', 'loss_v') - self.logger.store( + + self._logger.store( **{ - 'Loss/Loss_pi': loss_pi, - 'Loss/Delta_loss_pi': loss_pi - loss_pi_before, 'Train/StopIter': i + 1, - 'Values/Adv': adv.mean().item(), - 'Train/KL': torch_kl, - 'Loss/Delta_loss_reward_critic': loss_v - loss_v_before, - 'Loss/Loss_reward_critic': loss_v, + 'Value/Adv': adv_r.mean().item(), + 'Train/KL': kl, + 'Metrics/LagrangeMultiplier': self._lagrange.lagrangian_multiplier, } ) - if self.cfgs.use_cost: - loss_c = self.loss_record.get_mean('loss_c') - self.logger.store( - **{ - 'Loss/Delta_loss_cost_critic': loss_c - loss_c_before, - 'Loss/Loss_cost_critic': loss_c, - } - ) - return data diff --git a/omnisafe/algorithms/on_policy/naive_lagrange/crpo.py b/omnisafe/algorithms/on_policy/naive_lagrange/crpo.py index 3d84b72a4..acfe874e1 100644 --- a/omnisafe/algorithms/on_policy/naive_lagrange/crpo.py +++ b/omnisafe/algorithms/on_policy/naive_lagrange/crpo.py @@ -14,12 +14,11 @@ # ============================================================================== """Implementation of the on-policy CRPO algorithm.""" -from typing import NamedTuple - import torch from omnisafe.algorithms import registry from omnisafe.algorithms.on_policy.base.ppo import PPO +from omnisafe.utils.config import Config @registry.register @@ -32,58 +31,29 @@ class OnCRPO(PPO): - URL: `CRPO `_. """ - def __init__(self, env_id: str, cfgs: NamedTuple) -> None: - """Initialize CRPO. - - Args: - env_id (str): The environment id. - cfgs (NamedTuple): The configuration of the algorithm. - """ - PPO.__init__( - self, - env_id=env_id, - cfgs=cfgs, - ) - self.rew_update = 0 - self.cost_update = 0 - - def _specific_init_logs(self): - super()._specific_init_logs() - self.logger.register_key('Misc/RewUpdate') - self.logger.register_key('Misc/CostUpdate') + def __init__(self, env_id: str, cfgs: Config) -> None: + super().__init__(env_id, cfgs) + self._rew_update = 0 + self._cost_update = 0 - def algorithm_specific_logs(self) -> None: - """Log the CRPO specific information. + def _init_log(self) -> None: + super()._init_log() + self._logger.register_key('Misc/RewUpdate') + self._logger.register_key('Misc/CostUpdate') - .. list-table:: - - * - Things to log - - Description - * - Metrics/LagrangeMultiplier - - The Lagrange multiplier value in current epoch. - """ - super().algorithm_specific_logs() - self.logger.store( + def _update(self) -> None: + super()._update() + self._logger.store( **{ - 'Misc/RewUpdate': self.rew_update, - 'Misc/CostUpdate': self.cost_update, + 'Misc/RewUpdate': self._rew_update, + 'Misc/CostUpdate': self._cost_update, } ) - def compute_surrogate(self, adv: torch.Tensor, cost_adv: torch.Tensor) -> torch.Tensor: - """Compute the surrogate loss of the policy. - - In CRPO algorithm, we first judge whether the cost is within the limit. - If the cost is within the limit, we use the advantage of the policy. - Otherwise, we use the advantage of the cost. - - Args: - adv (torch.Tensor): The advantage of the policy. - cost_adv (torch.Tensor): The advantage of the cost. 
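The OnCRPO surrogate above is a simple switch: while the measured episode cost stays under ``cost_limit + distance`` the update maximizes the reward advantage, otherwise it minimizes the cost advantage. As a standalone function, with illustrative argument values:

.. code-block:: python

    import torch

    def crpo_surrogate(adv_r, adv_c, ep_cost, cost_limit, distance):
        """Maximize reward while feasible; otherwise descend on the cost advantage."""
        if ep_cost <= cost_limit + distance:
            return adv_r
        return -adv_c

    adv = crpo_surrogate(torch.randn(64), torch.randn(64),
                         ep_cost=30.0, cost_limit=25.0, distance=2.0)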
- """ - Jc = self.logger.get_stats('Metrics/EpCost')[0] - if Jc <= self.cfgs.cost_limit + self.cfgs.distance: - self.rew_update += 1 - return adv - self.cost_update += 1 - return -cost_adv + def _compute_adv_surrogate(self, adv_r: torch.Tensor, adv_c: torch.Tensor) -> torch.Tensor: + Jc = self._logger.get_stats('Metrics/EpCost')[0] + if Jc <= self._cfgs.cost_limit + self._cfgs.distance: + self._rew_update += 1 + return adv_r + self._cost_update += 1 + return -adv_c diff --git a/omnisafe/algorithms/on_policy/naive_lagrange/pdo.py b/omnisafe/algorithms/on_policy/naive_lagrange/pdo.py index d3233871d..f73d120da 100644 --- a/omnisafe/algorithms/on_policy/naive_lagrange/pdo.py +++ b/omnisafe/algorithms/on_policy/naive_lagrange/pdo.py @@ -14,8 +14,6 @@ # ============================================================================== """Implementation of the PDO algorithm.""" -from typing import Dict, NamedTuple, Tuple - import torch from omnisafe.algorithms import registry @@ -24,39 +22,21 @@ @registry.register -class PDO(PolicyGradient, Lagrange): +class PDO(PolicyGradient): """The Lagrange version of the Policy Gradient algorithm. A simple combination of the :class:`Lagrange` method and the :class:`PolicyGradient` algorithm. """ - def __init__(self, env_id: str, cfgs: NamedTuple) -> None: - """Initialize PDO. - - PDO is a combination of :class:`PolicyGradient` and :class:`Lagrange` model. + def _init(self) -> None: + super()._init() + self._lagrange = Lagrange(**self._cfgs.lagrange_cfgs) - Args: - env_id (str): The environment id. - cfgs (NamedTuple): The configuration of the algorithm. - """ - PolicyGradient.__init__( - self, - env_id=env_id, - cfgs=cfgs, - ) - Lagrange.__init__( - self, - cost_limit=cfgs.lagrange_cfgs.cost_limit, - lagrangian_multiplier_init=cfgs.lagrange_cfgs.lagrangian_multiplier_init, - lambda_lr=cfgs.lagrange_cfgs.lambda_lr, - lambda_optimizer=cfgs.lagrange_cfgs.lambda_optimizer, - ) + def _init_log(self) -> None: + super()._init_log() + self._logger.register_key('Metrics/LagrangeMultiplier') - def _specific_init_logs(self): - super()._specific_init_logs() - self.logger.register_key('Metrics/LagrangeMultiplier') - - def update(self) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]]: + def _update(self) -> None: r"""Update actor, critic, running statistics as we used in the :class:`PolicyGradient` algorithm. Additionally, we update the Lagrange multiplier parameter, @@ -73,44 +53,15 @@ def update(self) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]]: where :math:`\lambda` is the Lagrange multiplier parameter. """ - # note that logger already uses MPI statistics across all processes. - Jc = self.logger.get_stats('Metrics/EpCost')[0] + # note that logger already uses MPI statistics across all processes.. + Jc = self._logger.get_stats('Metrics/EpCost')[0] # first update Lagrange multiplier parameter - self.update_lagrange_multiplier(Jc) + self._lagrange.update_lagrange_multiplier(Jc) # then update the policy and value function - PolicyGradient.update(self) - - def compute_surrogate( - self, - adv: torch.Tensor, - cost_adv: torch.Tensor, - ) -> torch.Tensor: - """Compute surrogate loss. + super()._update() - PDO uses the Lagrange method to combine the reward and cost. 
- The surrogate loss is defined as the difference between the reward - advantage and the cost advantage + self._logger.store(**{'Metrics/LagrangeMultiplier': self._lagrange.lagrangian_multiplier}) - Args: - adv (torch.Tensor): reward advantage - cost_adv (torch.Tensor): cost advantage - """ - penalty = self.lambda_range_projection(self.lagrangian_multiplier).item() - return (adv - penalty * cost_adv) / (1 + penalty) - - def algorithm_specific_logs(self) -> None: - """Log the PDO specific information. - - .. list-table:: - - * - Things to log - - Description - * - Metrics/LagrangeMultiplier - - The Lagrange multiplier value in current epoch. - """ - super().algorithm_specific_logs() - self.logger.store( - **{ - 'Metrics/LagrangeMultiplier': self.lagrangian_multiplier.item(), - } - ) + def _compute_adv_surrogate(self, adv_r: torch.Tensor, adv_c: torch.Tensor) -> torch.Tensor: + penalty = self._lagrange.lagrangian_multiplier.item() + return (adv_r - penalty * adv_c) / (1 + penalty) diff --git a/omnisafe/algorithms/on_policy/naive_lagrange/ppo_lag.py b/omnisafe/algorithms/on_policy/naive_lagrange/ppo_lag.py index cc438181e..3c7a31f4c 100644 --- a/omnisafe/algorithms/on_policy/naive_lagrange/ppo_lag.py +++ b/omnisafe/algorithms/on_policy/naive_lagrange/ppo_lag.py @@ -14,8 +14,6 @@ # ============================================================================== """Implementation of the Lagrange version of the PPO algorithm.""" -from typing import Dict, NamedTuple, Tuple - import torch from omnisafe.algorithms import registry @@ -24,46 +22,28 @@ @registry.register -class PPOLag(PPO, Lagrange): +class PPOLag(PPO): """The Lagrange version of the PPO algorithm. A simple combination of the Lagrange method and the Proximal Policy Optimization algorithm. """ - def __init__(self, env_id: str, cfgs: NamedTuple) -> None: - """Initialize PPOLag. - - PPOLag is a combination of :class:`PPO` and :class:`Lagrange` model. + def _init(self) -> None: + super()._init() + self._lagrange = Lagrange(**self._cfgs.lagrange_cfgs) - Args: - env_id (str): The environment id. - cfgs (NamedTuple): The configuration of the algorithm. - """ - PPO.__init__( - self, - env_id=env_id, - cfgs=cfgs, - ) - Lagrange.__init__( - self, - cost_limit=self.cfgs.lagrange_cfgs.cost_limit, - lagrangian_multiplier_init=self.cfgs.lagrange_cfgs.lagrangian_multiplier_init, - lambda_lr=self.cfgs.lagrange_cfgs.lambda_lr, - lambda_optimizer=self.cfgs.lagrange_cfgs.lambda_optimizer, - ) + def _init_log(self) -> None: + super()._init_log() + self._logger.register_key('Metrics/LagrangeMultiplier') - def _specific_init_logs(self): - super()._specific_init_logs() - self.logger.register_key('Metrics/LagrangeMultiplier') - - def update(self) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]]: - r"""Update actor, critic, running statistics as we used in the :class:`PPO` algorithm. + def _update(self) -> None: + r"""Update actor, critic, running statistics as we used in the :class:`PolicyGradient` algorithm. Additionally, we update the Lagrange multiplier parameter, by calling the :meth:`update_lagrange_multiplier` method. .. note:: - The :meth:`compute_loss_pi` is defined in the :class:`PPO` algorithm. + The :meth:`compute_loss_pi` is defined in the :class:`PolicyGradient` algorithm. When a lagrange multiplier is used, the :meth:`compute_loss_pi` method will return the loss of the policy as: @@ -74,42 +54,14 @@ def update(self) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]]: where :math:`\lambda` is the Lagrange multiplier parameter. 
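PDO, PPOLag, RCPO, and TRPOLag now share the same two-step pattern: update the Lagrange multiplier from the measured episode cost, then fold it into the advantage as ``(adv_r - lambda * adv_c) / (1 + lambda)``. A minimal sketch of that pattern; the multiplier update here is the usual gradient step on ``-lambda * (Jc - d)`` and is only an approximation of what omnisafe's ``Lagrange`` class does internally:

.. code-block:: python

    import torch

    lagrangian = torch.nn.Parameter(torch.tensor(0.0))
    lambda_optimizer = torch.optim.Adam([lagrangian], lr=0.035)
    cost_limit = 25.0

    def update_lagrange_multiplier(ep_cost: float) -> None:
        # minimizing -lambda * (Jc - d) grows lambda while the constraint is violated
        lambda_optimizer.zero_grad()
        loss = -lagrangian * (ep_cost - cost_limit)
        loss.backward()
        lambda_optimizer.step()
        lagrangian.data.clamp_(min=0.0)         # keep the multiplier non-negative

    update_lagrange_multiplier(ep_cost=32.0)

    penalty = lagrangian.item()
    adv_r, adv_c = torch.randn(64), torch.randn(64)
    adv = (adv_r - penalty * adv_c) / (1 + penalty)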
""" # note that logger already uses MPI statistics across all processes.. - Jc = self.logger.get_stats('Metrics/EpCost')[0] + Jc = self._logger.get_stats('Metrics/EpCost')[0] # first update Lagrange multiplier parameter - self.update_lagrange_multiplier(Jc) - PPO.update(self) - - def compute_surrogate( - self, - adv: torch.Tensor, - cost_adv: torch.Tensor, - ) -> torch.Tensor: - """Compute surrogate loss. + self._lagrange.update_lagrange_multiplier(Jc) + # then update the policy and value function + super()._update() - PPOLag uses the Lagrange method to combine the reward and cost. - The surrogate loss is defined as the difference between the reward - advantage and the cost advantage + self._logger.store(**{'Metrics/LagrangeMultiplier': self._lagrange.lagrangian_multiplier}) - Args: - adv (torch.Tensor): reward advantage - cost_adv (torch.Tensor): cost advantage - """ - penalty = self.lambda_range_projection(self.lagrangian_multiplier).item() - return (adv - penalty * cost_adv) / (1 + penalty) - - def algorithm_specific_logs(self) -> None: - """Log the PPOLag specific information. - - .. list-table:: - - * - Things to log - - Description - * - Metrics/LagrangeMultiplier - - The Lagrange multiplier value in current epoch. - """ - super().algorithm_specific_logs() - self.logger.store( - **{ - 'Metrics/LagrangeMultiplier': self.lagrangian_multiplier.item(), - } - ) + def _compute_adv_surrogate(self, adv_r: torch.Tensor, adv_c: torch.Tensor) -> torch.Tensor: + penalty = self._lagrange.lagrangian_multiplier.item() + return (adv_r - penalty * adv_c) / (1 + penalty) diff --git a/omnisafe/algorithms/on_policy/naive_lagrange/rcpo.py b/omnisafe/algorithms/on_policy/naive_lagrange/rcpo.py index 169b7af65..9017f4383 100644 --- a/omnisafe/algorithms/on_policy/naive_lagrange/rcpo.py +++ b/omnisafe/algorithms/on_policy/naive_lagrange/rcpo.py @@ -14,8 +14,6 @@ # ============================================================================== """Implementation of the Reward Constrained Policy Optimization algorithm.""" -from typing import Dict, NamedTuple, Tuple - import torch from omnisafe.algorithms import registry @@ -24,7 +22,7 @@ @registry.register -class RCPO(NaturalPG, Lagrange): +class RCPO(NaturalPG): """Reward Constrained Policy Optimization. References: @@ -33,34 +31,16 @@ class RCPO(NaturalPG, Lagrange): - URL: `Reward Constrained Policy Optimization `_ """ - def __init__(self, env_id: str, cfgs: NamedTuple) -> None: - """Initialize RCPO. - - RCPO is a combination of :class:`NaturalPG` and :class:`Lagrange` model. + def _init(self) -> None: + super()._init() + self._lagrange = Lagrange(**self._cfgs.lagrange_cfgs) - Args: - env_id (str): The environment id. - cfgs (NamedTuple): The configuration of the algorithm. - """ - NaturalPG.__init__( - self, - env_id=env_id, - cfgs=cfgs, - ) - Lagrange.__init__( - self, - cost_limit=self.cfgs.lagrange_cfgs.cost_limit, - lagrangian_multiplier_init=self.cfgs.lagrange_cfgs.lagrangian_multiplier_init, - lambda_lr=self.cfgs.lagrange_cfgs.lambda_lr, - lambda_optimizer=self.cfgs.lagrange_cfgs.lambda_optimizer, - ) + def _init_log(self) -> None: + super()._init_log() + self._logger.register_key('Metrics/LagrangeMultiplier') - def _specific_init_logs(self): - super()._specific_init_logs() - self.logger.register_key('Metrics/LagrangeMultiplier') - - def update(self) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]]: - r"""Update actor, critic, running statistics as we used in the :class:`NaturalPG` algorithm. 
+ def _update(self) -> None: + r"""Update actor, critic, running statistics as we used in the :class:`PolicyGradient` algorithm. Additionally, we update the Lagrange multiplier parameter, by calling the :meth:`update_lagrange_multiplier` method. @@ -77,43 +57,14 @@ def update(self) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]]: where :math:`\lambda` is the Lagrange multiplier parameter. """ # note that logger already uses MPI statistics across all processes.. - Jc = self.logger.get_stats('Metrics/EpCost')[0] + Jc = self._logger.get_stats('Metrics/EpCost')[0] # first update Lagrange multiplier parameter - self.update_lagrange_multiplier(Jc) - # then update the policy and value net. - NaturalPG.update(self) - - def compute_surrogate( - self, - adv: torch.Tensor, - cost_adv: torch.Tensor, - ) -> torch.Tensor: - """Compute surrogate loss. + self._lagrange.update_lagrange_multiplier(Jc) + # then update the policy and value function + super()._update() - RCPO uses the Lagrange method to combine the reward and cost. - The surrogate loss is defined as the difference between the reward - advantage and the cost advantage + self._logger.store(**{'Metrics/LagrangeMultiplier': self._lagrange.lagrangian_multiplier}) - Args: - adv (torch.Tensor): reward advantage - cost_adv (torch.Tensor): cost advantage - """ - penalty = self.lambda_range_projection(self.lagrangian_multiplier).item() - return (adv - penalty * cost_adv) / (1 + penalty) - - def algorithm_specific_logs(self) -> None: - """Log the RCPO specific information. - - .. list-table:: - - * - Things to log - - Description - * - Metrics/LagrangeMultiplier - - The Lagrange multiplier value in current epoch. - """ - super().algorithm_specific_logs() - self.logger.store( - **{ - 'Metrics/LagrangeMultiplier': self.lagrangian_multiplier.item(), - } - ) + def _compute_adv_surrogate(self, adv_r: torch.Tensor, adv_c: torch.Tensor) -> torch.Tensor: + penalty = self._lagrange.lagrangian_multiplier.item() + return (adv_r - penalty * adv_c) / (1 + penalty) diff --git a/omnisafe/algorithms/on_policy/naive_lagrange/trpo_lag.py b/omnisafe/algorithms/on_policy/naive_lagrange/trpo_lag.py index 29c39cf76..8a53b38b3 100644 --- a/omnisafe/algorithms/on_policy/naive_lagrange/trpo_lag.py +++ b/omnisafe/algorithms/on_policy/naive_lagrange/trpo_lag.py @@ -14,8 +14,6 @@ # ============================================================================== """Implementation of the Lagrange version of the TRPO algorithm.""" -from typing import Dict, NamedTuple, Tuple - import torch from omnisafe.algorithms import registry @@ -24,46 +22,28 @@ @registry.register -class TRPOLag(TRPO, Lagrange): +class TRPOLag(TRPO): """The Lagrange version of the TRPO algorithm. A simple combination of the Lagrange method and the Trust Region Policy Optimization algorithm. """ - def __init__(self, env_id: str, cfgs: NamedTuple) -> None: - """Initialize TRPOLag. - - TRPOLag is a combination of :class:`TRPO` and :class:`Lagrange` model. + def _init(self) -> None: + super()._init() + self._lagrange = Lagrange(**self._cfgs.lagrange_cfgs) - Args: - env_id (str): The environment id. - cfgs (NamedTuple): The configuration of the algorithm. 
- """ - TRPO.__init__( - self, - env_id=env_id, - cfgs=cfgs, - ) - Lagrange.__init__( - self, - cost_limit=self.cfgs.lagrange_cfgs.cost_limit, - lagrangian_multiplier_init=self.cfgs.lagrange_cfgs.lagrangian_multiplier_init, - lambda_lr=self.cfgs.lagrange_cfgs.lambda_lr, - lambda_optimizer=self.cfgs.lagrange_cfgs.lambda_optimizer, - ) + def _init_log(self) -> None: + super()._init_log() + self._logger.register_key('Metrics/LagrangeMultiplier') - def _specific_init_logs(self): - super()._specific_init_logs() - self.logger.register_key('Metrics/LagrangeMultiplier') - - def update(self) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]]: - r"""Update actor, critic, running statistics as we used in the :class:`TRPO` algorithm. + def _update(self) -> None: + r"""Update actor, critic, running statistics as we used in the :class:`PolicyGradient` algorithm. Additionally, we update the Lagrange multiplier parameter, by calling the :meth:`update_lagrange_multiplier` method. .. note:: - The :meth:`compute_loss_pi` method is defined in the :class:`PolicyGradient` algorithm. + The :meth:`compute_loss_pi` is defined in the :class:`PolicyGradient` algorithm. When a lagrange multiplier is used, the :meth:`compute_loss_pi` method will return the loss of the policy as: @@ -74,43 +54,14 @@ def update(self) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]]: where :math:`\lambda` is the Lagrange multiplier parameter. """ # note that logger already uses MPI statistics across all processes.. - Jc = self.logger.get_stats('Metrics/EpCost')[0] + Jc = self._logger.get_stats('Metrics/EpCost')[0] # first update Lagrange multiplier parameter - self.update_lagrange_multiplier(Jc) + self._lagrange.update_lagrange_multiplier(Jc) # then update the policy and value function - TRPO.update(self) - - def compute_surrogate( - self, - adv: torch.Tensor, - cost_adv: torch.Tensor, - ) -> torch.Tensor: - """Compute surrogate loss. + super()._update() - TRPOLag uses the Lagrange method to combine the reward and cost. - The surrogate loss is defined as the difference between the reward - advantage and the cost advantage + self._logger.store(**{'Metrics/LagrangeMultiplier': self._lagrange.lagrangian_multiplier}) - Args: - adv (torch.Tensor): reward advantage - cost_adv (torch.Tensor): cost advantage - """ - penalty = self.lambda_range_projection(self.lagrangian_multiplier).item() - return (adv - penalty * cost_adv) / (1 + penalty) - - def algorithm_specific_logs(self) -> None: - """Log the TRPOLag specific information. - - .. list-table:: - - * - Things to log - - Description - * - Metrics/LagrangeMultiplier - - The Lagrange multiplier value in current epoch. 
- """ - super().algorithm_specific_logs() - self.logger.store( - **{ - 'Metrics/LagrangeMultiplier': self.lagrangian_multiplier.item(), - } - ) + def _compute_adv_surrogate(self, adv_r: torch.Tensor, adv_c: torch.Tensor) -> torch.Tensor: + penalty = self._lagrange.lagrangian_multiplier.item() + return (adv_r - penalty * adv_c) / (1 + penalty) diff --git a/omnisafe/algorithms/on_policy/penalty_function/ipo.py b/omnisafe/algorithms/on_policy/penalty_function/ipo.py index 96848aa72..222c1493e 100644 --- a/omnisafe/algorithms/on_policy/penalty_function/ipo.py +++ b/omnisafe/algorithms/on_policy/penalty_function/ipo.py @@ -14,8 +14,6 @@ # ============================================================================== """Implementation of IPO algorithm.""" -from typing import NamedTuple - import torch from omnisafe.algorithms import registry @@ -32,31 +30,17 @@ class IPO(PPO): - URL: `IPO `_ """ - def __init__(self, env_id: str, cfgs: NamedTuple) -> None: - """Initialize IPO.""" - PPO.__init__( - self, - env_id=env_id, - cfgs=cfgs, - ) - self.penalty = 0 - - def _specific_init_logs(self): - super()._specific_init_logs() - self.logger.register_key('Penalty') - - def algorithm_specific_logs(self): - super().algorithm_specific_logs() - self.logger.store( - **{ - 'Penalty': self.penalty, - } - ) - - def compute_surrogate(self, adv: torch.Tensor, cost_adv: torch.Tensor) -> torch.Tensor: + def _init_log(self) -> None: + super()._init_log() + self._logger.register_key('Misc/Penalty') + + def _compute_adv_surrogate(self, adv_r: torch.Tensor, adv_c: torch.Tensor) -> torch.Tensor: """Compute surrogate loss.""" - Jc = self.logger.get_stats('Metrics/EpCost')[0] - self.penalty = self.cfgs.kappa / (self.cfgs.cost_limit - Jc + 1e-8) - if self.penalty < 0 or self.penalty > self.cfgs.penalty_max: - self.penalty = self.cfgs.penalty_max - return (adv - self.penalty * cost_adv) / (1 + self.penalty) + Jc = self._logger.get_stats('Metrics/EpCost')[0] + penalty = self._cfgs.kappa / (self._cfgs.cost_limit - Jc + 1e-8) + if penalty < 0 or penalty > self._cfgs.penalty_max: + penalty = self._cfgs.penalty_max + + self._logger.store(**{'Misc/Penalty': penalty}) + + return (adv_r - penalty * adv_c) / (1 + penalty) diff --git a/omnisafe/algorithms/on_policy/penalty_function/p3o.py b/omnisafe/algorithms/on_policy/penalty_function/p3o.py index c92fc4a42..1fc94881f 100644 --- a/omnisafe/algorithms/on_policy/penalty_function/p3o.py +++ b/omnisafe/algorithms/on_policy/penalty_function/p3o.py @@ -19,7 +19,7 @@ from omnisafe.algorithms import registry from omnisafe.algorithms.on_policy.base.ppo import PPO -from omnisafe.utils import distributed_utils +from omnisafe.utils import distributed @registry.register @@ -32,48 +32,32 @@ class P3O(PPO): - URL: `P3O `_ """ - def compute_loss_cost_performance( + def _init_log(self) -> None: + super()._init_log() + self._logger.register_key('Loss/Loss_pi_cost', delta=True) + + def _loss_pi_cost( self, obs: torch.Tensor, act: torch.Tensor, - log_p: torch.Tensor, - cost_adv: torch.Tensor, + logp: torch.Tensor, + adv_c: torch.Tensor, ) -> torch.Tensor: - r"""Compute the loss of the cost performance. - - The loss is defined as: - - .. 
math:: - - \mathcal{L}_{\pi_c} = \kappa \cdot \max - \left(0, \frac{\pi_c(a_t|s_t)}{\pi(a_t|s_t)} \cdot A_{c_t} + J_c - \bar{J}_c\right) - - where :math:`\kappa` is the penalty coefficient, :math:`\pi_c` is the cost performance, - :math:`\pi` is the policy, :math:`A_{c_t}` is the cost advantage, :math:`J_c` is the cost - of the current episode, and :math:`\bar{J}_c` is the cost limit. - - Args: - obs (torch.Tensor): The observation tensor. - act (torch.Tensor): The action tensor. - log_p (torch.Tensor): The log probability of the action. - cost_adv (torch.Tensor): The cost advantage. - """ - _, _log_p = self.actor_critic.actor(obs, act) - ratio = torch.exp(_log_p - log_p) - ratio_clip = torch.clamp(ratio, 1 - self.cfgs.clip, 1 + self.cfgs.clip) - surr_cadv = (ratio_clip * cost_adv).mean() - Jc = self.logger.get_stats('Metrics/EpCost')[0] - loss_pi_c = self.cfgs.kappa * F.relu(surr_cadv + Jc) - return loss_pi_c.mean() - - # pylint: disable-next=too-many-locals,too-many-arguments - def update_policy_net( + self._actor_critic.actor(obs) + logp_ = self._actor_critic.actor.log_prob(act) + ratio = torch.exp(logp_ - logp) + surr_cadv = (ratio * adv_c).mean() + Jc = self._logger.get_stats('Metrics/EpCost')[0] - self._cfgs.cost_limit + loss_cost = self._cfgs.kappa * F.relu(surr_cadv + Jc) + return loss_cost.mean() + + def _update_actor( self, obs: torch.Tensor, act: torch.Tensor, - log_p: torch.Tensor, - adv: torch.Tensor, - cost_adv: torch.Tensor, + logp: torch.Tensor, + adv_r: torch.Tensor, + adv_c: torch.Tensor, ) -> None: r"""Update policy network under a double for loop. @@ -100,31 +84,26 @@ def update_policy_net( adv (torch.Tensor): ``advantage`` stored in buffer. cost_adv (torch.Tensor): ``cost_advantage`` stored in buffer. """ - # process the advantage function. - processed_adv = self.compute_surrogate(adv=adv, cost_adv=cost_adv) - # compute the loss of policy net. - loss_pi, pi_info = self.compute_loss_pi(obs=obs, act=act, log_p=log_p, adv=processed_adv) - # compute the cost performance of policy net. - loss_pi_c = self.compute_loss_cost_performance( - obs=obs, act=act, log_p=log_p, cost_adv=cost_adv - ) - # log the loss of policy net. - self.loss_record.append(loss_pi=(loss_pi - loss_pi_c).mean().item()) - # update the policy net. - self.actor_optimizer.zero_grad() - # backward the loss of policy net. - (loss_pi + loss_pi_c).backward() - # clip the gradient of policy net. - if self.cfgs.use_max_grad_norm: + loss_reward, info = self._loss_pi(obs, act, logp, adv_r) + loss_cost = self._loss_pi_cost(obs, act, logp, adv_c) + + loss = loss_reward - loss_cost + + self._actor_critic.actor_optimizer.zero_grad() + loss.backward() + if self._cfgs.use_max_grad_norm: torch.nn.utils.clip_grad_norm_( - self.actor_critic.actor.parameters(), self.cfgs.max_grad_norm + self._actor_critic.actor.parameters(), self._cfgs.max_grad_norm ) - # average the gradient of policy net. 
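The new `_loss_pi_cost` above is a hinge penalty: it only pushes back on the policy when the surrogate cost advantage plus the current constraint violation is positive. A rough standalone sketch of that computation with made-up numbers (`kappa` and `cost_limit` are illustrative values, not package defaults):

import torch
import torch.nn.functional as F

kappa, cost_limit = 20.0, 25.0              # illustrative config values
logp_old = torch.tensor([-1.2, -0.9])       # log-probs stored in the buffer
logp_new = torch.tensor([-1.0, -1.1])       # log-probs under the current policy
adv_c = torch.tensor([0.3, -0.1])           # cost advantages

ratio = torch.exp(logp_new - logp_old)      # importance-sampling ratio
surr_cadv = (ratio * adv_c).mean()
Jc = 30.0 - cost_limit                      # episodic cost minus the limit (violation of 5)
loss_cost = kappa * F.relu(surr_cadv + Jc)  # zero whenever the constraint term is non-positive
print(loss_cost)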
- distributed_utils.mpi_avg_grads(self.actor_critic.actor) - self.actor_optimizer.step() - self.logger.store( + distributed.avg_grads(self._actor_critic.actor) + self._actor_critic.actor_optimizer.step() + + self._logger.store( **{ - 'Train/Entropy': pi_info['ent'], - 'Train/PolicyRatio': pi_info['ratio'], + 'Train/Entropy': info['entrophy'], + 'Train/PolicyRatio': info['ratio'], + 'Train/PolicyStd': info['std'], + 'Loss/Loss_pi': loss_reward.mean().item(), + 'Loss/Loss_pi_cost': loss_cost.mean().item(), } ) diff --git a/omnisafe/algorithms/on_policy/pid_lagrange/cppo_pid.py b/omnisafe/algorithms/on_policy/pid_lagrange/cppo_pid.py index 4da4a90dc..64ad66dcd 100644 --- a/omnisafe/algorithms/on_policy/pid_lagrange/cppo_pid.py +++ b/omnisafe/algorithms/on_policy/pid_lagrange/cppo_pid.py @@ -14,17 +14,15 @@ # ============================================================================== """Implementation of the PID-Lagrange version of the CPPO algorithm.""" -from typing import Dict, NamedTuple, Tuple - import torch from omnisafe.algorithms import registry -from omnisafe.algorithms.on_policy.base.policy_gradient import PolicyGradient +from omnisafe.algorithms.on_policy.base.ppo import PPO from omnisafe.common.pid_lagrange import PIDLagrangian @registry.register -class CPPOPid(PolicyGradient, PIDLagrangian): +class CPPOPid(PPO): r"""The PID-Lagrange version of the CPPO algorithm. Similar to :class:`PDO`, which is a simple combination of :class:`PolicyGradient` and :class:`Lagrange`, @@ -41,127 +39,50 @@ class CPPOPid(PolicyGradient, PIDLagrangian): - URL: https://arxiv.org/abs/2007.03964 """ - def __init__(self, env_id: str, cfgs: NamedTuple) -> None: - """Initialize CPPOPid. - - CPPOPid is a simple combination of :class:`PolicyGradient` and :class:`PIDLagrangian`. - - Args: - env_id (str): The environment id. - cfgs (NamedTuple): The configuration of the algorithm. - """ - PolicyGradient.__init__( - self, - env_id=env_id, - cfgs=cfgs, - ) - PIDLagrangian.__init__(self, **self.cfgs.PID_cfgs) - - self.clip = self.cfgs.clip - - def _specific_init_logs(self): - super()._specific_init_logs() - self.logger.register_key('Metrics/LagrangeMultiplier') - self.logger.register_key('PID/pid_Kp') - self.logger.register_key('PID/pid_Ki') - self.logger.register_key('PID/pid_Kd') - - def algorithm_specific_logs(self) -> None: - """Log the CPPOPid specific information. - - .. list-table:: - - * - Things to log - - Description - * - Metrics/LagrangeMultiplier - - The Lagrange multiplier value in current epoch. - * - PID/pid_Kp - - The Kp value in current epoch. - * - PID/pid_Ki - - The Ki value in current epoch. - * - PID/pid_Kd - - The Kd value in current epoch. - """ - super().algorithm_specific_logs() - self.logger.store( - **{ - 'Metrics/LagrangeMultiplier': self.cost_penalty, - 'PID/pid_Kp': self.pid_kp, - 'PID/pid_Ki': self.pid_ki, - 'PID/pid_Kd': self.pid_kd, - } - ) - - # pylint: disable-next=too-many-arguments,too-many-locals - def compute_loss_pi( - self, - obs: torch.Tensor, - act: torch.Tensor, - log_p: torch.Tensor, - adv: torch.Tensor, - ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: - r""" - Computing pi/actor loss. - In CPPOPid, the loss is defined as: - - .. 
math:: - L^{CLIP} = \mathbb{E}_{s_t \sim \rho_{\pi}} - \left[ \min(r_t (A^{R}_t - \lambda A^{C}_t), \text{clip}(r_t, 1-\epsilon, 1+\epsilon) (A^{R}_t - - \lambda A^{C}_t)) \right] - - where :math:`r_t = \frac{\pi_\theta(a_t|s_t)}{\pi_\theta^{old}(a_t|s_t)}`, - :math:`\epsilon` is the clip parameter, :math:`A^{R}_t` is the reward advantage, - :math:`A^{C}_t` is the cost advantage, and :math:`\lambda` is the Lagrange multiplier. - - Args: - obs (torch.Tensor): ``observation`` stored in buffer. - act (torch.Tensor): ``action`` stored in buffer. - log_p (torch.Tensor): ``log probability`` of action stored in buffer. - adv (torch.Tensor): ``advantage`` stored in buffer. - cost_adv (torch.Tensor): ``cost advantage`` stored in buffer. - """ - dist, _log_p = self.actor_critic.actor(obs, act) - ratio = torch.exp(_log_p - log_p) - ratio_clip = torch.clamp(ratio, 1 - self.clip, 1 + self.clip) + def _init(self) -> None: + super()._init() + self._pid_lag = PIDLagrangian(**self._cfgs.PID_cfgs) - surr_adv = (torch.min(ratio * adv, ratio_clip * adv)).mean() + def _init_log(self) -> None: + super()._init_log() + self._logger.register_key('Metrics/LagrangeMultiplier') + self._logger.register_key('PID/pid_Kp') + self._logger.register_key('PID/pid_Ki') + self._logger.register_key('PID/pid_Kd') - loss_pi = -surr_adv - loss_pi -= self.cfgs.entropy_coef * dist.entropy().mean() + def _compute_adv_surrogate(self, adv_r: torch.Tensor, adv_c: torch.Tensor) -> torch.Tensor: + penalty = self._pid_lag.cost_penalty + return (adv_r - penalty * adv_c) / (1 + penalty) - # useful extra info - approx_kl = 0.5 * (log_p - _log_p).mean().item() - ent = dist.entropy().mean().item() - pi_info = {'kl': approx_kl, 'ent': ent, 'ratio': ratio.mean().item()} + def _update(self) -> None: + r"""Update actor, critic, running statistics as we used in the :class:`PolicyGradient` algorithm. - return loss_pi, pi_info - - def compute_surrogate( - self, - adv: torch.Tensor, - cost_adv: torch.Tensor, - ) -> torch.Tensor: - """Compute surrogate loss. - - CPPOPid uses the Lagrange method to combine the reward and cost. - The surrogate loss is defined as the difference between the reward - advantage and the cost advantage + Additionally, we update the Lagrange multiplier parameter, + by calling the :meth:`update_lagrange_multiplier` method. - Args: - adv (torch.Tensor): reward advantage - cost_adv (torch.Tensor): cost advantage - """ - return (adv - self.cost_penalty * cost_adv) / (1 + self.cost_penalty) + .. note:: + The :meth:`compute_loss_pi` is defined in the :class:`PolicyGradient` algorithm. + When a lagrange multiplier is used, + the :meth:`compute_loss_pi` method will return the loss of the policy as: - def update(self) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]]: - r"""Update actor, critic, running statistics as we used in the :class:`PPO` algorithm. + .. math:: + L_{\pi} = \mathbb{E}_{s_t \sim \rho_{\pi}} \left[ \frac{\pi_\theta(a_t|s_t)}{\pi_\theta^{old}(a_t|s_t)} + [A^{R}(s_t, a_t) - \lambda A^{C}(s_t, a_t)] \right] - Additionally, we update the Lagrange multiplier parameter, - by calling the :meth:`update_lagrange_multiplier` method. + where :math:`\lambda` is the Lagrange multiplier parameter. """ - # note that logger already uses MPI statistics across all processes. - Jc = self.logger.get_stats('Metrics/EpCost')[0] - # first update Lagrange multiplier parameter. - self.pid_update(Jc) - # then update the policy and value net. 
- PolicyGradient.update(self) + # note that logger already uses MPI statistics across all processes.. + Jc = self._logger.get_stats('Metrics/EpCost')[0] + # first update Lagrange multiplier parameter + self._pid_lag.pid_update(Jc) + # then update the policy and value function + super()._update() + + self._logger.store( + **{ + 'Metrics/LagrangeMultiplier': self._pid_lag.cost_penalty, + 'PID/pid_Kp': self._pid_lag.pid_kp, + 'PID/pid_Ki': self._pid_lag.pid_ki, + 'PID/pid_Kd': self._pid_lag.pid_kd, + } + ) diff --git a/omnisafe/algorithms/on_policy/pid_lagrange/trpo_pid.py b/omnisafe/algorithms/on_policy/pid_lagrange/trpo_pid.py index b11091bfb..35a303e23 100644 --- a/omnisafe/algorithms/on_policy/pid_lagrange/trpo_pid.py +++ b/omnisafe/algorithms/on_policy/pid_lagrange/trpo_pid.py @@ -14,8 +14,6 @@ # ============================================================================== """Implementation of the PID-Lagrange version of the TRPO algorithm.""" -from typing import Dict, NamedTuple, Tuple - import torch from omnisafe.algorithms import registry @@ -24,7 +22,7 @@ @registry.register -class TRPOPid(TRPO, PIDLagrangian): +class TRPOPid(TRPO): """The PID-Lagrange version of the TRPO algorithm. References: @@ -33,122 +31,50 @@ class TRPOPid(TRPO, PIDLagrangian): - URL: https://arxiv.org/abs/2007.03964 """ - def __init__(self, env_id: str, cfgs: NamedTuple) -> None: - """Initialize TRPOPid. + def _init(self) -> None: + super()._init() + self._pid_lag = PIDLagrangian(**self._cfgs.PID_cfgs) - TRPOPid is a simple combination of :class:`TRPO` and :class:`PIDLagrangian`. + def _init_log(self) -> None: + super()._init_log() + self._logger.register_key('Metrics/LagrangeMultiplier') + self._logger.register_key('PID/pid_Kp') + self._logger.register_key('PID/pid_Ki') + self._logger.register_key('PID/pid_Kd') - Args: - env_id (str): The environment id. - cfgs (NamedTuple): The configuration of the algorithm. - """ - TRPO.__init__( - self, - env_id=env_id, - cfgs=cfgs, - ) - PIDLagrangian.__init__(self, **self.cfgs.PID_cfgs) - self.cost_limit = self.cfgs.cost_limit + def _compute_adv_surrogate(self, adv_r: torch.Tensor, adv_c: torch.Tensor) -> torch.Tensor: + penalty = self._pid_lag.cost_penalty + return (adv_r - penalty * adv_c) / (1 + penalty) - def _specific_init_logs(self): - super()._specific_init_logs() - self.logger.register_key('Metrics/LagrangeMultiplier') - self.logger.register_key('PID/pid_Kp') - self.logger.register_key('PID/pid_Ki') - self.logger.register_key('PID/pid_Kd') + def _update(self) -> None: + r"""Update actor, critic, running statistics as we used in the :class:`PolicyGradient` algorithm. - def algorithm_specific_logs(self) -> None: - """Log the TRPOPid specific information. + Additionally, we update the Lagrange multiplier parameter, + by calling the :meth:`update_lagrange_multiplier` method. + + .. note:: + The :meth:`compute_loss_pi` is defined in the :class:`PolicyGradient` algorithm. + When a lagrange multiplier is used, + the :meth:`compute_loss_pi` method will return the loss of the policy as: - .. list-table:: + .. math:: + L_{\pi} = \mathbb{E}_{s_t \sim \rho_{\pi}} \left[ \frac{\pi_\theta(a_t|s_t)}{\pi_\theta^{old}(a_t|s_t)} + [A^{R}(s_t, a_t) - \lambda A^{C}(s_t, a_t)] \right] - * - Things to log - - Description - * - Metrics/LagrangeMultiplier - - The Lagrange multiplier value in current epoch. - * - PID/pid_Kp - - The Kp value in current epoch. - * - PID/pid_Ki - - The Ki value in current epoch. - * - PID/pid_Kd - - The Kd value in current epoch. 
+ where :math:`\lambda` is the Lagrange multiplier parameter. """ - super().algorithm_specific_logs() - self.logger.store( + # note that logger already uses MPI statistics across all processes.. + Jc = self._logger.get_stats('Metrics/EpCost')[0] + # first update Lagrange multiplier parameter + self._pid_lag.pid_update(Jc) + # then update the policy and value function + super()._update() + + self._logger.store( **{ - 'Metrics/LagrangeMultiplier': self.cost_penalty, - 'PID/pid_Kp': self.pid_kp, - 'PID/pid_Ki': self.pid_ki, - 'PID/pid_Kd': self.pid_kd, + 'Metrics/LagrangeMultiplier': self._pid_lag.cost_penalty, + 'PID/pid_Kp': self._pid_lag.pid_kp, + 'PID/pid_Ki': self._pid_lag.pid_ki, + 'PID/pid_Kd': self._pid_lag.pid_kd, } ) - - # pylint: disable-next=too-many-arguments - def compute_loss_pi( - self, - obs: torch.Tensor, - act: torch.Tensor, - log_p: torch.Tensor, - adv: torch.Tensor, - ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: - r""" - Computing pi/actor loss. - In CPPOPid, the loss is defined as: - - .. math:: - L = \mathbb{E}_{s_t \sim \rho_{\pi}} \left[ \frac{\pi_\theta(a_t|s_t)} - {\pi_\theta^{old}(a_t|s_t)} [A^{R}_t(s_t, a_t) - \lambda A^{C}_t(s_t, a_t)] \right] - - where :math:`A^{R}_t` is the advantage from the reward and :math:`A^{C}_t` is the advantage from the cost, - and :math:`\lambda` is the Lagrange multiplier controlled by the PID controller. - - Args: - obs (torch.Tensor): :meth:`observation` stored in buffer. - act (torch.Tensor): :meth:`action` stored in buffer. - log_p (torch.Tensor): ``log probability`` of action stored in buffer. - adv (torch.Tensor): :meth:`advantage` stored in buffer. - cost_adv (torch.Tensor): :meth:`cost advantage` stored in buffer. - """ - dist, _log_p = self.actor_critic.actor(obs, act) - ratio = torch.exp(_log_p - log_p) - - # compute loss via ratio and advantage - loss_pi = -(ratio * adv).mean() - loss_pi -= self.cfgs.entropy_coef * dist.entropy().mean() - - # useful extra info - approx_kl = 0.5 * (log_p - _log_p).mean().item() - ent = dist.entropy().mean().item() - pi_info = {'kl': approx_kl, 'ent': ent, 'ratio': ratio.mean().item()} - - return loss_pi, pi_info - - def compute_surrogate( - self, - adv: torch.Tensor, - cost_adv: torch.Tensor, - ) -> torch.Tensor: - """Compute surrogate loss. - - TRPOPid uses the Lagrange method to combine the reward and cost. - The surrogate loss is defined as the difference between the reward - advantage and the cost advantage - - Args: - adv (torch.Tensor): reward advantage - cost_adv (torch.Tensor): cost advantage - """ - return (adv - self.cost_penalty * cost_adv) / (1 + self.cost_penalty) - - def update(self) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]]: - r"""Update actor, critic, running statistics as we used in the :class:`TRPO` algorithm. - - Additionally, we update the Lagrange multiplier parameter, - by calling the :meth:`update_lagrange_multiplier` method. - """ - # note that logger already uses MPI statistics across all processes. - Jc = self.logger.get_stats('Metrics/EpCost')[0] - # first update Lagrange multiplier parameter - self.pid_update(Jc) - # then update the policy and value net. 
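`PIDLagrangian` itself lives outside this hunk; the algorithms only call `pid_update` once per epoch and read back `cost_penalty` and the three gains for logging. As a toy illustration of the idea (not the actual class), a PID controller on the episodic-cost violation could be sketched as:

class ToyPIDPenalty:
    """Toy PID controller on the cost violation, for illustration only."""

    def __init__(self, pid_kp: float, pid_ki: float, pid_kd: float, cost_limit: float) -> None:
        self.pid_kp, self.pid_ki, self.pid_kd = pid_kp, pid_ki, pid_kd
        self.cost_limit = cost_limit
        self._integral = 0.0
        self._prev_cost = 0.0
        self.cost_penalty = 0.0

    def pid_update(self, ep_cost: float) -> None:
        error = ep_cost - self.cost_limit                  # proportional term
        self._integral = max(0.0, self._integral + error)  # integral term, kept non-negative
        derivative = max(0.0, ep_cost - self._prev_cost)   # derivative term (penalize increases only)
        self._prev_cost = ep_cost
        self.cost_penalty = max(
            0.0,
            self.pid_kp * error + self.pid_ki * self._integral + self.pid_kd * derivative,
        )

`CPPOPid` and `TRPOPid` then feed `cost_penalty` into the same `(adv_r - penalty * adv_c) / (1 + penalty)` combination used by the naive Lagrangian variants.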
- TRPO.update(self) diff --git a/omnisafe/algorithms/on_policy/saute/ppo_lag_saute.py b/omnisafe/algorithms/on_policy/saute/ppo_lag_saute.py index fb79bfeaa..f8b9970ea 100644 --- a/omnisafe/algorithms/on_policy/saute/ppo_lag_saute.py +++ b/omnisafe/algorithms/on_policy/saute/ppo_lag_saute.py @@ -14,10 +14,10 @@ # ============================================================================== """Implementation of the Lagrange version of the Saute algorithm using PPOLag.""" -from typing import NamedTuple - +from omnisafe.adapter import SauteAdapter from omnisafe.algorithms import registry from omnisafe.algorithms.on_policy.naive_lagrange.ppo_lag import PPOLag +from omnisafe.utils import distributed @registry.register @@ -31,18 +31,15 @@ class PPOLagSaute(PPOLag): - URL: `Saute RL`_ """ - def __init__(self, env_id: str, cfgs: NamedTuple) -> None: - """Initialize PPOLagSaute. - - PPOLagSaute is a combination of :class:`PPO` and :class:`Lagrange` model, - using :class:`Saute` as the environment wrapper. - - Args: - env_id (str): The environment id. - cfgs (NamedTuple): The configuration of the algorithm. - """ - super().__init__(env_id=env_id, cfgs=cfgs) - - def _specific_init_logs(self): - super()._specific_init_logs() - self.logger.register_key('Metrics/EpBudget') + def _init_env(self) -> None: + self._env = SauteAdapter(self._env_id, self._cfgs.num_envs, self._seed, self._cfgs) + assert self._cfgs.steps_per_epoch % (distributed.world_size() * self._cfgs.num_envs) == 0, ( + 'The number of steps per epoch is not divisible by the number of ' 'environments.' + ) + self._steps_per_epoch = ( + self._cfgs.steps_per_epoch // distributed.world_size() // self._cfgs.num_envs + ) + + def _init_log(self) -> None: + super()._init_log() + self._logger.register_key('Metrics/EpBudget') diff --git a/omnisafe/algorithms/on_policy/saute/ppo_saute.py b/omnisafe/algorithms/on_policy/saute/ppo_saute.py index c92f5ffc4..7ee288198 100644 --- a/omnisafe/algorithms/on_policy/saute/ppo_saute.py +++ b/omnisafe/algorithms/on_policy/saute/ppo_saute.py @@ -14,10 +14,10 @@ # ============================================================================== """Implementation of the Saute algorithm.""" -from typing import NamedTuple - +from omnisafe.adapter import SauteAdapter from omnisafe.algorithms import registry from omnisafe.algorithms.on_policy.base.ppo import PPO +from omnisafe.utils import distributed @registry.register @@ -31,17 +31,15 @@ class PPOSaute(PPO): - URL: `Saute RL`_ """ - def __init__(self, env_id: str, cfgs: NamedTuple) -> None: - """Initialize PPOSaute. - - PPOSaute is a combination of :class:`PPO` and :class:`Saute`. - - Args: - env_id (str): The environment id. - cfgs (NamedTuple): The configuration of the algorithm. - """ - super().__init__(env_id=env_id, cfgs=cfgs) - - def _specific_init_logs(self): - super()._specific_init_logs() - self.logger.register_key('Metrics/EpBudget') + def _init_env(self) -> None: + self._env = SauteAdapter(self._env_id, self._cfgs.num_envs, self._seed, self._cfgs) + assert self._cfgs.steps_per_epoch % (distributed.world_size() * self._cfgs.num_envs) == 0, ( + 'The number of steps per epoch is not divisible by the number of ' 'environments.' 
+ ) + self._steps_per_epoch = ( + self._cfgs.steps_per_epoch // distributed.world_size() // self._cfgs.num_envs + ) + + def _init_log(self) -> None: + super()._init_log() + self._logger.register_key('Metrics/EpBudget') diff --git a/omnisafe/algorithms/on_policy/second_order/cpo.py b/omnisafe/algorithms/on_policy/second_order/cpo.py index 8118da581..52cfa2ea1 100644 --- a/omnisafe/algorithms/on_policy/second_order/cpo.py +++ b/omnisafe/algorithms/on_policy/second_order/cpo.py @@ -14,16 +14,16 @@ # ============================================================================== """Implementation of the CPO algorithm.""" -from typing import Dict, NamedTuple, Tuple +from typing import Tuple import numpy as np import torch from omnisafe.algorithms import registry from omnisafe.algorithms.on_policy.base.trpo import TRPO -from omnisafe.utils import distributed_utils +from omnisafe.utils import distributed +from omnisafe.utils.math import conjugate_gradients from omnisafe.utils.tools import ( - conjugate_gradients, get_flat_gradients_from, get_flat_params_from, set_param_values_to_model, @@ -42,44 +42,34 @@ class CPO(TRPO): - URL: https://arxiv.org/abs/1705.10528 """ - def __init__(self, env_id: str, cfgs: NamedTuple) -> None: - """Initialize CPO. - - Args: - env_id (str): The environment id. - cfgs (NamedTuple): The configuration of the algorithm. - """ - super().__init__(env_id=env_id, cfgs=cfgs) - self.cost_limit = cfgs.cost_limit - self.loss_pi_cost_before = 0.0 - - def _specific_init_logs(self): - super()._specific_init_logs() - self.logger.register_key('Misc/cost_gradient_norm') - self.logger.register_key('Misc/A') - self.logger.register_key('Misc/B') - self.logger.register_key('Misc/q') - self.logger.register_key('Misc/r') - self.logger.register_key('Misc/s') - self.logger.register_key('Misc/Lambda_star') - self.logger.register_key('Misc/Nu_star') - self.logger.register_key('Misc/OptimCase') + def _init_log(self) -> None: + super()._init_log() + self._logger.register_key('Misc/cost_gradient_norm') + self._logger.register_key('Misc/A') + self._logger.register_key('Misc/B') + self._logger.register_key('Misc/q') + self._logger.register_key('Misc/r') + self._logger.register_key('Misc/s') + self._logger.register_key('Misc/Lambda_star') + self._logger.register_key('Misc/Nu_star') + self._logger.register_key('Misc/OptimCase') # pylint: disable-next=too-many-arguments,too-many-locals - def search_step_size( + def _cpo_search_step( self, - step_dir: torch.Tensor, - g_flat: torch.Tensor, + step_direction: torch.Tensor, + grad: torch.Tensor, p_dist: torch.distributions.Distribution, obs: torch.Tensor, act: torch.Tensor, - log_p: torch.Tensor, - adv: torch.Tensor, - loss_pi_before: float, + logp: torch.Tensor, + adv_r: torch.Tensor, + adv_c: torch.Tensor, + loss_reward_before: float, + loss_cost_before: float, total_steps: int = 15, decay: float = 0.8, - cost_adv: torch.Tensor = None, - c: int = 0, + violation_c: int = 0, optim_case: int = 0, ) -> Tuple[torch.Tensor, int]: r"""Use line-search to find the step size that satisfies the constraint. 
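Before the refactored body of `_cpo_search_step`, it may help to see the backtracking pattern it implements in isolation: start from the full step, shrink it by `decay` until every acceptance check passes, and fall back to a zero step if none does. The sketch below is a simplified stand-in, with the KL, reward-improvement, and cost checks abstracted into a single callable; it is not a drop-in for the method.

from typing import Callable

def backtracking_search(accepts: Callable[[float], bool], total_steps: int = 15, decay: float = 0.8) -> float:
    """Return the largest step fraction that passes all acceptance checks.

    ``accepts(step_frac)`` is assumed to evaluate the candidate step and return
    True only if the KL, reward-improvement, and cost conditions all hold.
    """
    step_frac = 1.0
    for _ in range(total_steps):
        if accepts(step_frac):
            return step_frac      # accept the first (largest) fraction that passes
        step_frac *= decay        # otherwise shrink the step and try again
    return 0.0                    # no acceptable fraction found; keep the old parameters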
@@ -112,104 +102,73 @@ def search_step_size( # get distance each time theta goes towards certain direction step_frac = 1.0 # get and flatten parameters from pi-net - _theta_old = get_flat_params_from(self.actor_critic.actor) + theta_old = get_flat_params_from(self._actor_critic.actor) # reward improvement, g-flat as gradient of reward - expected_rew_improve = g_flat.dot(step_dir) + expected_reward_improve = torch.dot(grad, step_direction) # while not within_trust_region and not finish all steps: - for j in range(total_steps): + for step in range(total_steps): # get new theta - new_theta = _theta_old + step_frac * step_dir + new_theta = theta_old + step_frac * step_direction # set new theta as new actor parameters - set_param_values_to_model(self.actor_critic.actor, new_theta) + set_param_values_to_model(self._actor_critic.actor, new_theta) # the last acceptance steps to next step - acceptance_step = j + 1 + acceptance_step = step + 1 with torch.no_grad(): # loss of policy reward from target/expected reward - loss_pi_rew, _ = self.compute_loss_pi(obs=obs, act=act, log_p=log_p, adv=adv) + loss_reward, _ = self._loss_pi(obs=obs, act=act, logp=logp, adv=adv_r) # loss of cost of policy cost from real/expected reward - loss_pi_cost, _ = self.compute_loss_cost_performance( - obs=obs, act=act, log_p=log_p, cost_adv=cost_adv - ) - self.loss_record.append(loss_pi=(loss_pi_rew.mean() + loss_pi_cost.mean()).item()) + loss_cost = self._loss_pi_cost(obs=obs, act=act, logp=logp, adv_c=adv_c) # compute KL distance between new and old policy - q_dist = self.actor_critic.actor(obs) - torch_kl = torch.distributions.kl.kl_divergence(p_dist, q_dist).mean().item() + q_dist = self._actor_critic.actor(obs) + kl = torch.distributions.kl.kl_divergence(p_dist, q_dist).mean() # compute improvement of reward - loss_rew_improve = loss_pi_before - loss_pi_rew.item() - cost_diff = loss_pi_cost.item() - self.loss_pi_cost_before + loss_reward_improve = loss_reward_before - loss_reward.item() + # compute difference of cost + loss_cost_diff = loss_cost.item() - loss_cost_before # average across MPI processes... - torch_kl = distributed_utils.mpi_avg(torch_kl) + kl = distributed.dist_avg(kl) # pi_average of torch_kl above - loss_rew_improve = distributed_utils.mpi_avg(loss_rew_improve) - cost_diff = distributed_utils.mpi_avg(cost_diff) - menu = (expected_rew_improve, loss_rew_improve) - self.logger.log(f'Expected Improvement: {menu[0]} Actual: {menu[1]}') + loss_reward_improve = distributed.dist_avg(loss_reward_improve) + loss_cost_diff = distributed.dist_avg(loss_cost_diff) + self._logger.log( + f'Expected Improvement: {expected_reward_improve} Actual: {loss_reward_improve}' + ) # check whether there are nan. 
- if not torch.isfinite(loss_pi_rew) and not torch.isfinite(loss_pi_cost): - self.logger.log('WARNING: loss_pi not finite') - elif loss_rew_improve < 0 if optim_case > 1 else False: - self.logger.log('INFO: did not improve improve <0') + if not torch.isfinite(loss_reward) and not torch.isfinite(loss_cost): + self._logger.log('WARNING: loss_pi not finite') + elif loss_reward_improve < 0 if optim_case > 1 else False: + self._logger.log('INFO: did not improve improve <0') # change of cost's range - elif cost_diff > max(-c, 0): - self.logger.log(f'INFO: no improve {cost_diff} > {max(-c, 0)}') + elif loss_cost_diff > max(-violation_c, 0): + self._logger.log(f'INFO: no improve {loss_cost_diff} > {max(-violation_c, 0)}') # check KL-distance to avoid too far gap - elif torch_kl > self.target_kl * 1.5: - self.logger.log(f'INFO: violated KL constraint {torch_kl} at step {j + 1}.') + elif kl > self._cfgs.target_kl * 1.5: + self._logger.log(f'INFO: violated KL constraint {kl} at step {step + 1}.') else: # step only if surrogate is improved and we are # within the trust region - self.logger.log(f'Accept step at i={j + 1}') + self._logger.log(f'Accept step at i={step + 1}') break step_frac *= decay else: # if didn't find a step satisfy those conditions - self.logger.log('INFO: no suitable step found...') - step_dir = torch.zeros_like(step_dir) + self._logger.log('INFO: no suitable step found...') + step_direction = torch.zeros_like(step_direction) acceptance_step = 0 - set_param_values_to_model(self.actor_critic.actor, _theta_old) - return step_frac * step_dir, acceptance_step - - def algorithm_specific_logs(self) -> None: - r"""Log the CPO specific information. - - .. list-table:: - - * - Things to log - - Description - * - Misc/cost_gradient_norm - - The norm of the cost gradient. - * - Misc/q - - The :math:`q` vector, which is the conjugate of Hessian :math:`H`. - * - Misc/r - - The :math:`r` vector, where :math:`r = g^T H^{-1} b`. - * - Misc/s - - The :math:`s` vector, where :math:`s = b^T H^{-1} b` - * - Misc/A - - The A matrix, where :math:`A = q - \frac{r^2}{s}` - * - Misc/B - - The B matrix, where :math:`B = 2 \delta_{KL} - \frac{c^2}{s}` , - where :math:`c` is the cost violation in current epoch, and - :math:`\delta_{KL}` is the target KL divergence. - * - Misc/Lambda_star - - The :math:`\lambda^*` vector. - * - Misc/Nu_star - - The :math:`\nu^*` vector. - * - Misc/OptimCase - - The optimization case. - """ - TRPO.algorithm_specific_logs(self) + set_param_values_to_model(self._actor_critic.actor, theta_old) + return step_frac * step_direction, acceptance_step - def compute_loss_cost_performance( + def _loss_pi_cost( self, obs: torch.Tensor, act: torch.Tensor, - log_p: torch.Tensor, - cost_adv: torch.Tensor, - ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: + logp: torch.Tensor, + adv_c: torch.Tensor, + ) -> torch.Tensor: r"""Compute the performance of cost on this moment. Detailedly, we compute the loss of cost of policy cost from real cost. @@ -224,163 +183,125 @@ def compute_loss_cost_performance( Args: obs (torch.Tensor): Observation. act (torch.Tensor): Action. - log_p (torch.Tensor): Log probability. - cost_adv (torch.Tensor): Cost advantage. + logp (torch.Tensor): Log probability of action. + adv_c (torch.Tensor): Cost advantage. + + Returns: + torch.Tensor: The loss of cost of policy cost from real cost. 
""" - _, _log_p = self.actor_critic.actor(obs, act) - ratio = torch.exp(_log_p - log_p) - cost_loss = (ratio * cost_adv).mean() - info = {} - return cost_loss, info - - # pylint: disable-next=too-many-statements,too-many-locals,too-many-arguments - def update_policy_net( + self._actor_critic.actor(obs) + logp_ = self._actor_critic.actor.log_prob(act) + ratio = torch.exp(logp_ - logp) + cost_loss = (ratio * adv_c).mean() + return cost_loss + + # pylint: disable=invalid-name, too-many-arguments, too-many-locals + def _update_actor( self, obs: torch.Tensor, act: torch.Tensor, - log_p: torch.Tensor, - adv: torch.Tensor, - cost_adv: torch.Tensor, + logp: torch.Tensor, + adv_r: torch.Tensor, + adv_c: torch.Tensor, ) -> None: - """Update policy network. + self._fvp_obs = obs[::4] + theta_old = get_flat_params_from(self._actor_critic.actor) + self._actor_critic.actor.zero_grad() + loss_reward, info = self._loss_pi(obs, act, logp, adv_r) + loss_reward_before = distributed.dist_avg(loss_reward).item() + p_dist = self._actor_critic.actor(obs) + + loss_reward.backward() + distributed.avg_grads(self._actor_critic.actor) + + grad = -get_flat_gradients_from(self._actor_critic.actor) + x = conjugate_gradients(self._fvp, grad, self._cfgs.cg_iters) + assert torch.isfinite(x).all(), 'x is not finite' + xHx = torch.dot(x, self._fvp(x)) + assert xHx.item() >= 0, 'xHx is negative' + alpha = torch.sqrt(2 * self._cfgs.target_kl / (xHx + 1e-8)) + + self._actor_critic.actor_optimizer.zero_grad() + loss_cost = self._loss_pi_cost(obs, act, logp, adv_c) + loss_cost_before = distributed.dist_avg(loss_cost).item() - Constrained Policy Optimization updates policy network using the conjugate gradient algorithm, - following the steps: + loss_cost.backward() + distributed.avg_grads(self._actor_critic.actor) - - Compute the gradient of the policy. - - Compute the step direction. - - Search for a step size that satisfies the constraint. (Both KL divergence and cost limit). - - Update the policy network. + b_grad = get_flat_gradients_from(self._actor_critic.actor) + ep_costs = self._logger.get_stats('Metrics/EpCost')[0] - self._cfgs.cost_limit + cost = ep_costs / (self._logger.get_stats('Metrics/EpLen')[0] + 1e-8) - Args: - obs (torch.Tensor): The observation tensor. - act (torch.Tensor): The action tensor. - log_p (torch.Tensor): The log probability of the action. - adv (torch.Tensor): The advantage tensor. - cost_adv (torch.Tensor): The cost advantage tensor. - """ - # get loss and info values before update - self.fvp_obs = obs[::4] - theta_old = get_flat_params_from(self.actor_critic.actor) - self.actor_optimizer.zero_grad() - # process the advantage function. - processed_adv = self.compute_surrogate(adv=adv, cost_adv=cost_adv) - # compute the loss of policy net. - loss_pi, pi_info = self.compute_loss_pi(obs=obs, act=act, log_p=log_p, adv=processed_adv) - loss_pi_before = distributed_utils.mpi_avg(loss_pi.item()) - # get prob. 
distribution before updates, previous dist of possibilities - p_dist = self.actor_critic.actor(obs) - # train policy with multiple steps of gradient descent - loss_pi.backward() - # average grads across MPI processes - distributed_utils.mpi_avg_grads(self.actor_critic.actor) - g_flat = get_flat_gradients_from(self.actor_critic.actor) - - # flip sign since policy_loss = -(ration * adv) - g_flat *= -1 - # x: g or g_T in original paper, stands for gradient of cost function - x = conjugate_gradients(self.Fvp, g_flat, self.cg_iters) # pylint: disable=invalid-name - assert torch.isfinite(x).all(), 'x is not finite' # pylint: disable=invalid-name - eps = 1.0e-8 - # note that xHx = g^T x, but calculating xHx is faster than g^T x - # equivalent to : g^T x - xHx = torch.dot(x, self.Fvp(x)) # pylint: disable = invalid-name - alpha = torch.sqrt(2 * self.target_kl / (xHx + eps)) - assert xHx.item() >= 0, 'No negative values' # pylint: disable = invalid-name - - # get the policy cost performance gradient b (flat as vector) - self.actor_optimizer.zero_grad() - loss_cost, _ = self.compute_loss_cost_performance( - obs=obs, act=act, log_p=log_p, cost_adv=cost_adv - ) - loss_cost.backward() - # average grads across MPI processes - distributed_utils.mpi_avg_grads(self.actor_critic.actor) - self.loss_pi_cost_before = loss_cost.item() - b_flat = get_flat_gradients_from(self.actor_critic.actor) - # :param ep_costs: do samplings to get approximate costs as ep_costs - ep_costs = self.logger.get_stats('Metrics/EpCost')[0] - # :params c: how much sampled result of cost goes beyond limit - cost = ep_costs - self.cost_limit - # Rescale, and add small float to avoid nan - cost /= self.logger.get_stats('Metrics/EpLen')[0] + eps # rescale - - # set variable names as used in the paper with conjugate_gradient method, - # used to solve equation(compute Hessian Matrix) instead of Natural Gradient - - p = conjugate_gradients(self.Fvp, b_flat, self.cg_iters) # pylint: disable = invalid-name - q = xHx # pylint: disable = invalid-name - r = g_flat.dot(p) # pylint: disable = invalid-name - s = b_flat.dot(p) # pylint: disable = invalid-name - - # optim_case: divided into 5 kinds to compute - if b_flat.dot(b_flat) <= 1e-6 and cost < 0: + p = conjugate_gradients(self._fvp, b_grad, self._cfgs.cg_iters) + q = xHx + r = torch.dot(grad, p) + s = torch.dot(b_grad, p) + + if torch.dot(b_grad, b_grad) <= 1e-6 and cost < 0: # feasible step and cost grad is zero: use plain TRPO update... 
- A = torch.zeros(1) # pylint: disable = invalid-name - B = torch.zeros(1) # pylint: disable = invalid-name + A = torch.zeros(1) + B = torch.zeros(1) optim_case = 4 else: assert torch.isfinite(r).all(), 'r is not finite' assert torch.isfinite(s).all(), 's is not finite' - # A,b: mathematical value, not too much true meaning - A = q - r**2 / s # pylint: disable = invalid-name - B = 2 * self.target_kl - cost**2 / s # pylint: disable = invalid-name + A = q - r**2 / s + B = 2 * self._cfgs.target_kl - cost**2 / s if cost < 0 and B < 0: # point in trust region is feasible and safety boundary doesn't intersect # ==> entire trust region is feasible optim_case = 3 - elif cost < 0 and B >= 0: # pylint: disable=chained-comparison - # x = 0 is feasible and safety boundary intersects - # ==> most of trust region is feasible + elif cost < 0 <= B: + # point in trust region is feasible but safety boundary intersects + # ==> only part of trust region is feasible optim_case = 2 elif cost >= 0 and B >= 0: - # x = 0 is infeasible and safety boundary intersects - # ==> part of trust region is feasible, recovery possible + # point in trust region is infeasible and cost boundary doesn't intersect + # ==> entire trust region is infeasible optim_case = 1 - self.logger.log('Alert! Attempting feasible recovery!', 'yellow') + self._logger.log('Alert! Attempting feasible recovery!', 'yellow') else: # x = 0 infeasible, and safety half space is outside trust region # ==> whole trust region is infeasible, try to fail gracefully optim_case = 0 - self.logger.log('Alert! Attempting infeasible recovery!', 'red') + self._logger.log('Alert! Attempting infeasible recovery!', 'red') - # the following computes required nu_star and lambda_star - if optim_case in [3, 4]: + if optim_case in (3, 4): # under 3 and 4 cases directly use TRPO method - alpha = torch.sqrt( - 2 * self.target_kl / (xHx + 1e-8) - ) # step gap fixed by KKT condition in conjugate algorithm + alpha = torch.sqrt(2 * self._cfgs.target_kl / (xHx + 1e-8)) nu_star = torch.zeros(1) lambda_star = 1 / alpha - step_dir = alpha * x # change step direction to gap * gradient + step_direction = alpha * x - elif optim_case in [1, 2]: - # in 1 and 2, - def project_on_set(data: torch.Tensor, low: float, high: float) -> torch.Tensor: - return torch.Tensor([max(low, min(data, high))]) + elif optim_case in (1, 2): + + def project(data: torch.Tensor, low: float, high: float) -> torch.Tensor: + """Project data to [low, high] interval.""" + return torch.max(torch.min(data, torch.tensor(high)), torch.tensor(low)) # analytical Solution to LQCLP, employ lambda,nu to compute final solution of OLOLQC # λ=argmax(f_a(λ),f_b(λ)) = λa_star or λb_star # computing formula shown in appendix, lambda_a and lambda_b lambda_a = torch.sqrt(A / B) - lambda_b = torch.sqrt(q / (2 * self.target_kl)) + lambda_b = torch.sqrt(q / (2 * self._cfgs.target_kl)) # λa_star = Proj(lambda_a ,0 ~ r/c) λb_star=Proj(lambda_b,r/c~ +inf) # where projection(str,b,c)=max(b,min(str,c)) # may be regarded as a projection from effective region towards safety region + r_num = r.item() if cost < 0: - lambda_a_star = project_on_set(lambda_a, 0.0, r / cost) - lambda_b_star = project_on_set(lambda_b, r / cost, np.inf) + lambda_a_star = project(lambda_a, 0.0, r_num / cost) + lambda_b_star = project(lambda_b, r_num / cost, np.inf) else: - lambda_a_star = project_on_set(lambda_a, r / cost, np.inf) - lambda_b_star = project_on_set(lambda_b, 0.0, r / cost) + lambda_a_star = project(lambda_a, r_num / cost, np.inf) + lambda_b_star = 
project(lambda_b, 0.0, r_num / cost) def f_a(lam): - return -0.5 * (A / (lam + eps) + B * lam) - r * cost / (s + eps) + return -0.5 * (A / (lam + 1e-8) + B * lam) - r * cost / (s + 1e-8) def f_b(lam): - return -0.5 * (q / (lam + eps) + 2 * self.target_kl * lam) + return -0.5 * (q / (lam + 1e-8) + 2 * self._cfgs.target_kl * lam) lambda_star = ( lambda_a_star if f_a(lambda_a_star) >= f_b(lambda_b_star) else lambda_b_star @@ -388,45 +309,54 @@ def f_b(lam): # discard all negative values with torch.clamp(x, min=0) # Nu_star = (lambda_star * - r)/s - nu_star = torch.clamp(lambda_star * cost - r, min=0) / (s + eps) + nu_star = torch.clamp(lambda_star * cost - r, min=0) / (s + 1e-8) # final x_star as final direction played as policy's loss to backward and update - step_dir = 1.0 / (lambda_star + eps) * (x - nu_star * p) + step_direction = 1.0 / (lambda_star + 1e-8) * (x - nu_star * p) else: # case == 0 # purely decrease costs # without further check lambda_star = torch.zeros(1) - nu_star = np.sqrt(2 * self.target_kl / (s + eps)) - step_dir = -nu_star * p - - final_step_dir, accept_step = self.search_step_size( - step_dir, - g_flat, - c=cost, - loss_pi_before=loss_pi_before, - optim_case=optim_case, + nu_star = np.sqrt(2 * self._cfgs.target_kl / (s + 1e-8)) + step_direction = -nu_star * p + + step_direction, accept_step = self._cpo_search_step( + step_direction=step_direction, + grad=grad, p_dist=p_dist, obs=obs, act=act, - log_p=log_p, - adv=adv, - cost_adv=cost_adv, + logp=logp, + adv_r=adv_r, + adv_c=adv_c, + loss_reward_before=loss_reward_before, + loss_cost_before=loss_cost_before, total_steps=20, + violation_c=cost, + optim_case=optim_case, ) - # update actor network parameters - new_theta = theta_old + final_step_dir - set_param_values_to_model(self.actor_critic.actor, new_theta) - self.logger.store( + + theta_new = theta_old + step_direction + set_param_values_to_model(self._actor_critic.actor, theta_new) + + with torch.no_grad(): + loss_reward, info = self._loss_pi(obs, act, logp, adv_r) + loss_cost = self._loss_pi_cost(obs, act, logp, adv_c) + loss = loss_reward + loss_cost + + self._logger.store( **{ - 'Train/Entropy': pi_info['ent'], - 'Train/PolicyRatio': pi_info['ratio'], + 'Loss/Loss_pi': loss.item(), + 'Train/Entropy': info['entrophy'], + 'Train/PolicyRatio': info['ratio'], + 'Train/PolicyStd': info['std'], 'Misc/AcceptanceStep': accept_step, 'Misc/Alpha': alpha.item(), - 'Misc/FinalStepNorm': final_step_dir.norm().mean().item(), + 'Misc/FinalStepNorm': step_direction.norm().mean().item(), 'Misc/xHx': xHx.mean().item(), 'Misc/H_inv_g': x.norm().item(), # H^-1 g - 'Misc/gradient_norm': torch.norm(g_flat).mean().item(), - 'Misc/cost_gradient_norm': torch.norm(b_flat).mean().item(), + 'Misc/gradient_norm': torch.norm(grad).mean().item(), + 'Misc/cost_gradient_norm': torch.norm(b_grad).mean().item(), 'Misc/Lambda_star': lambda_star.item(), 'Misc/Nu_star': nu_star.item(), 'Misc/OptimCase': int(optim_case), diff --git a/omnisafe/algorithms/on_policy/second_order/pcpo.py b/omnisafe/algorithms/on_policy/second_order/pcpo.py index bf548c728..d69ae6cea 100644 --- a/omnisafe/algorithms/on_policy/second_order/pcpo.py +++ b/omnisafe/algorithms/on_policy/second_order/pcpo.py @@ -14,15 +14,13 @@ # ============================================================================== """Implementation of the PCPO algorithm.""" -from typing import Dict, NamedTuple, Tuple - import torch from omnisafe.algorithms import registry -from omnisafe.algorithms.on_policy.base.trpo import TRPO -from omnisafe.utils 
import distributed_utils +from omnisafe.algorithms.on_policy.second_order.cpo import CPO +from omnisafe.utils import distributed +from omnisafe.utils.math import conjugate_gradients from omnisafe.utils.tools import ( - conjugate_gradients, get_flat_gradients_from, get_flat_params_from, set_param_values_to_model, @@ -30,7 +28,7 @@ @registry.register -class PCPO(TRPO): +class PCPO(CPO): """The Projection-Based Constrained Policy Optimization (PCPO) algorithm. References: @@ -39,196 +37,14 @@ class PCPO(TRPO): URL:`PCPO _` """ - def __init__(self, env_id: str, cfgs: NamedTuple) -> None: - """Initialize PCPO. - - PCPO is a derivative of TRPO. - - Args: - env_id (str): The environment id. - cfgs (NamedTuple): The configuration of the algorithm. - """ - super().__init__(env_id=env_id, cfgs=cfgs) - self.cost_limit = self.cfgs.cost_limit - - def _specific_init_logs(self): - super()._specific_init_logs() - self.logger.register_key('Misc/cost_gradient_norm') - self.logger.register_key('Misc/A') - self.logger.register_key('Misc/B') - self.logger.register_key('Misc/q') - self.logger.register_key('Misc/r') - self.logger.register_key('Misc/s') - self.logger.register_key('Misc/Lambda_star') - self.logger.register_key('Misc/Nu_star') - self.logger.register_key('Misc/OptimCase') - # pylint: disable-next=too-many-locals,too-many-arguments - def adjust_cpo_step_direction( + def _update_actor( self, - step_dir: torch.Tensor, - g_flat: torch.Tensor, - cost: torch.Tensor, - optim_case: int, - p_dist: torch.distributions.Distribution, obs: torch.Tensor, act: torch.Tensor, - log_p: torch.Tensor, - adv: torch.Tensor, - cost_adv: torch.Tensor, - loss_pi_before: torch.Tensor, - loss_pi_cost_before: torch.Tensor, - total_steps: int = 25, - decay: float = 0.8, - ) -> Tuple[torch.Tensor, int]: - r"""Use line-search to find the step size that satisfies the constraint. - - PCPO uses line-search to find the step size that satisfies the constraint. - The constraint is defined as: - - .. math:: - J^C(\theta + \alpha \delta) - J^C(\theta) \leq \max \{0, c\}\\ - D_{KL}(\pi_{\theta}(\cdot|s) || \pi_{\theta + \alpha \delta}(\cdot|s)) \leq \delta_{KL} - - where :math:`\delta_{KL}` is the constraint of KL divergence, :math:`\alpha` is the step size, - :math:`c` is the violation of constraint. - - Args: - step_dir (torch.Tensor): The step direction. - g_flat (torch.Tensor): The gradient of the policy. - p_dist (torch.distributions.Distribution): The old policy distribution. - obs (torch.Tensor): The observation. - act (torch.Tensor): The action. - log_p (torch.Tensor): The log probability of the action. - adv (torch.Tensor): The advantage. - cost_adv (torch.Tensor): The cost advantage. - loss_pi_before (torch.Tensor): The loss of the policy before the step. - loss_pi_cost_before (torch.Tensor): The loss of the cost before the step. - total_steps (int, optional): The total steps of line-search. Defaults to 25. - decay (float, optional): The decay of step size. Defaults to 0.8. 
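Both second-order hunks now import `conjugate_gradients` from `omnisafe.utils.math` to solve `H x = g` using only Fisher-vector products. As a reminder of the technique (a generic sketch, not the library function), conjugate gradient for a symmetric positive-definite operator looks like:

from typing import Callable

import torch

def conjugate_gradient(
    Avp: Callable[[torch.Tensor], torch.Tensor],
    b: torch.Tensor,
    num_iters: int = 10,
    residual_tol: float = 1e-10,
) -> torch.Tensor:
    """Approximately solve A x = b for symmetric positive-definite A.

    ``Avp`` returns the product A @ v, so the full matrix is never built,
    which is what makes the trick practical for Fisher-vector products.
    """
    x = torch.zeros_like(b)
    r = b.clone()                 # residual b - A @ x, with x = 0 initially
    p = r.clone()                 # current search direction
    rdotr = torch.dot(r, r)
    for _ in range(num_iters):
        Ap = Avp(p)
        alpha = rdotr / (torch.dot(p, Ap) + 1e-8)
        x += alpha * p
        r -= alpha * Ap
        new_rdotr = torch.dot(r, r)
        if new_rdotr < residual_tol:
            break
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
    return x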
- """ - step_frac = 1.0 - _theta_old = get_flat_params_from(self.actor_critic.actor) - expected_rew_improve = g_flat.dot(step_dir) - - # while not within_trust_region: - for j in range(total_steps): - new_theta = _theta_old + step_frac * step_dir - set_param_values_to_model(self.actor_critic.actor, new_theta) - acceptance_step = j + 1 - - with torch.no_grad(): - # loss of policy reward from target/expected reward - loss_pi_rew, _ = self.compute_loss_pi(obs=obs, act=act, log_p=log_p, adv=adv) - # loss of cost of policy cost from real/expected reward - loss_pi_cost, _ = self.compute_loss_cost_performance( - obs=obs, act=act, log_p=log_p, cost_adv=cost_adv - ) - self.loss_record.append(loss_pi=(loss_pi_rew.mean() + loss_pi_cost.mean()).item()) - # determine KL div between new and old policy - q_dist = self.actor_critic.actor(obs) - torch_kl = torch.distributions.kl.kl_divergence(p_dist, q_dist).mean().item() - loss_rew_improve = loss_pi_before - loss_pi_rew.item() - cost_diff = loss_pi_cost.item() - loss_pi_cost_before - - # average across MPI processes... - torch_kl = distributed_utils.mpi_avg(torch_kl) - loss_rew_improve = distributed_utils.mpi_avg(loss_rew_improve) - cost_diff = distributed_utils.mpi_avg(cost_diff) - menu = (expected_rew_improve, loss_rew_improve) - self.logger.log(f'Expected Improvement: {menu[0]} Actual: {menu[1]}') - - if not torch.isfinite(loss_pi_rew) and not torch.isfinite(loss_pi_cost): - self.logger.log('WARNING: loss_pi not finite') - elif loss_rew_improve < 0 if optim_case > 1 else False: - self.logger.log('INFO: did not improve improve <0') - - elif cost_diff > max(-cost, 0): - self.logger.log(f'INFO: no improve {cost_diff} > {max(-cost, 0)}') - elif torch_kl > self.target_kl * 1.5: - self.logger.log(f'INFO: violated KL constraint {torch_kl} at step {j + 1}.') - else: - # step only if surrogate is improved and we are - # within the trust region - self.logger.log(f'Accept step at i={j + 1}') - break - step_frac *= decay - else: - self.logger.log('INFO: no suitable step found...') - step_dir = torch.zeros_like(step_dir) - acceptance_step = 0 - - set_param_values_to_model(self.actor_critic.actor, _theta_old) - return step_frac * step_dir, acceptance_step - - def algorithm_specific_logs(self) -> None: - r"""Log the PCPO specific information. - - .. list-table:: - - * - Things to log - - Description - * - Misc/cost_gradient_norm - - The norm of the cost gradient. - * - Misc/q - - The :math:`q` vector, which is the conjugate of Hessian :math:`H`. - * - Misc/r - - The :math:`r` vector, where :math:`r = g^T H^{-1} b`. - * - Misc/s - - The :math:`s` vector, where :math:`s = b^T H^{-1} b` - * - Misc/A - - The A matrix, where :math:`A = q - \frac{r^2}{s}` - * - Misc/B - - The B matrix, where :math:`B = 2 \delta_{KL} - \frac{c^2}{s}` , - where :math:`c` is the cost violation in current epoch, and - :math:`\delta_{KL}` is the target KL divergence. - * - Misc/Lambda_star - - The :math:`\lambda^*` vector. - * - Misc/Nu_star - - The :math:`\nu^*` vector. - * - Misc/OptimCase - - The optimization case. - """ - TRPO.algorithm_specific_logs(self) - - def compute_loss_cost_performance( - self, - obs: torch.Tensor, - act: torch.Tensor, - log_p: torch.Tensor, - cost_adv: torch.Tensor, - ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: - r"""Compute the performance of cost on this moment. - - Detailedly, we compute the loss of cost of policy cost from real cost. - - .. 
math:: - L = \mathbb{E}_{\pi} \left[ \frac{\pi(a|s)}{\pi_{old}(a|s)} A^C(s, a) \right] - - where :math:`A^C(s, a)` is the cost advantage, - :math:`\pi_{old}(a|s)` is the old policy, - :math:`\pi(a|s)` is the current policy. - - Args: - obs (torch.Tensor): Observation. - act (torch.Tensor): Action. - log_p (torch.Tensor): Log probability. - cost_adv (torch.Tensor): Cost advantage. - """ - _, _log_p = self.actor_critic.actor(obs, act) - ratio = torch.exp(_log_p - log_p) - cost_loss = (ratio * cost_adv).mean() - info = {} - return cost_loss, info - - # pylint: disable-next=too-many-locals,too-many-arguments - def update_policy_net( - self, - obs: torch.Tensor, - act: torch.Tensor, - log_p: torch.Tensor, - adv: torch.Tensor, - cost_adv: torch.Tensor, + logp: torch.Tensor, + adv_r: torch.Tensor, + adv_c: torch.Tensor, ) -> None: """Update policy network. @@ -247,96 +63,88 @@ def update_policy_net( adv (torch.Tensor): The advantage tensor. cost_adv (torch.Tensor): The cost advantage tensor. """ - self.fvp_obs = obs[::4] - theta_old = get_flat_params_from(self.actor_critic.actor) - self.actor_optimizer.zero_grad() - # process the advantage function. - processed_adv = self.compute_surrogate(adv=adv, cost_adv=cost_adv) - # compute the loss of policy net. - loss_pi, pi_info = self.compute_loss_pi(obs=obs, act=act, log_p=log_p, adv=processed_adv) - loss_pi_before = loss_pi.item() - # get prob. distribution before updates - p_dist = self.actor_critic.actor(obs) - # train policy with multiple steps of gradient descent - loss_pi.backward() - # average grads across MPI processes - distributed_utils.mpi_avg_grads(self.actor_critic.actor) - g_flat = get_flat_gradients_from(self.actor_critic.actor) - - # flip sign since policy_loss = -(ration * adv) - g_flat *= -1 - x = conjugate_gradients(self.Fvp, g_flat, self.cg_iters) # pylint: disable = invalid-name + # pylint: disable=invalid-name + self._fvp_obs = obs[::4] + theta_old = get_flat_params_from(self._actor_critic.actor) + self._actor_critic.actor.zero_grad() + loss_reward, info = self._loss_pi(obs, act, logp, adv_r) + loss_reward_before = distributed.dist_avg(loss_reward).item() + p_dist = self._actor_critic.actor(obs) + + loss_reward.backward() + distributed.avg_grads(self._actor_critic.actor) + + grad = -get_flat_gradients_from(self._actor_critic.actor) + x = conjugate_gradients(self._fvp, grad, self._cfgs.cg_iters) assert torch.isfinite(x).all(), 'x is not finite' - eps = 1.0e-8 - # note that xHx = g^T x, but calculating xHx is faster than g^T x - xHx = torch.dot(x, self.Fvp(x)) # pylint: disable = invalid-name - H_inv_g = self.Fvp(x) # pylint: disable = invalid-name - alpha = torch.sqrt(2 * self.target_kl / (xHx + eps)) - assert xHx.item() >= 0, 'No negative values' + xHx = torch.dot(x, self._fvp(x)) + H_inv_g = self._fvp(x) + assert xHx.item() >= 0, 'xHx is negative' + alpha = torch.sqrt(2 * self._cfgs.target_kl / (xHx + 1e-8)) + + self._actor_critic.actor_optimizer.zero_grad() + loss_cost = self._loss_pi_cost(obs, act, logp, adv_c) + loss_cost_before = distributed.dist_avg(loss_cost).item() - # get the policy cost performance gradient b (flat as vector) - self.actor_optimizer.zero_grad() - loss_cost, _ = self.compute_loss_cost_performance( - obs=obs, act=act, log_p=log_p, cost_adv=cost_adv - ) loss_cost.backward() - # average grads across MPI processes - distributed_utils.mpi_avg_grads(self.actor_critic.actor) - loss_pi_cost_before = loss_cost.item() - b_flat = get_flat_gradients_from(self.actor_critic.actor) + 
distributed.avg_grads(self._actor_critic.actor) + + b_grad = get_flat_gradients_from(self._actor_critic.actor) + ep_costs = self._logger.get_stats('Metrics/EpCost')[0] - self._cfgs.cost_limit + cost = ep_costs / (self._logger.get_stats('Metrics/EpLen')[0] + 1e-8) - ep_costs = self.logger.get_stats('Metrics/EpCost')[0] - cost = ep_costs - self.cost_limit - cost /= self.logger.get_stats('Metrics/EpLen')[0] + eps # rescale - self.logger.log(f'c = {cost}') - self.logger.log(f'b^T b = {b_flat.dot(b_flat).item()}') + self._logger.log(f'c = {cost}') + self._logger.log(f'b^T b = {b_grad.dot(b_grad).item()}') - # set variable names as used in the paper - p = conjugate_gradients(self.Fvp, b_flat, self.cg_iters) # pylint: disable = invalid-name - q = xHx # pylint: disable = invalid-name - # g^T H^{-1} b - r = g_flat.dot(p) # pylint: disable = invalid-name - # b^T H^{-1} b - s = b_flat.dot(p) # pylint: disable = invalid-name - step_dir = ( - torch.sqrt(2 * self.target_kl / (q + 1e-8)) * H_inv_g + p = conjugate_gradients(self._fvp, b_grad, self._cfgs.cg_iters) + q = xHx + r = torch.dot(grad, p) + s = torch.dot(b_grad, p) + + step_direction = ( + torch.sqrt(2 * self._cfgs.target_kl / (q + 1e-8)) * H_inv_g - torch.clamp_min( - (torch.sqrt(2 * self.target_kl / q) * r + cost) / s, - torch.tensor(0.0, device=self.cfgs.device), + (torch.sqrt(2 * self._cfgs.target_kl / q) * r + cost) / s, + torch.tensor(0.0, device=self._device), ) * p ) # pylint: disable = invalid-name - final_step_dir, accept_step = self.adjust_cpo_step_direction( - step_dir, - g_flat, - cost=cost, - optim_case=2, + step_direction, accept_step = self._cpo_search_step( + step_direction=step_direction, + grad=grad, p_dist=p_dist, obs=obs, act=act, - log_p=log_p, - adv=adv, - cost_adv=cost_adv, - loss_pi_before=loss_pi_before, - loss_pi_cost_before=loss_pi_cost_before, + logp=logp, + adv_r=adv_r, + adv_c=adv_c, + loss_reward_before=loss_reward_before, + loss_cost_before=loss_cost_before, total_steps=20, + violation_c=cost, ) - # update actor network parameters - new_theta = theta_old + final_step_dir - set_param_values_to_model(self.actor_critic.actor, new_theta) + theta_new = theta_old + step_direction + set_param_values_to_model(self._actor_critic.actor, theta_new) + + with torch.no_grad(): + loss_reward, info = self._loss_pi(obs, act, logp, adv_r) + loss_cost = self._loss_pi_cost(obs, act, logp, adv_c) + loss = loss_reward + loss_cost - self.logger.store( + self._logger.store( **{ - 'Train/Entropy': pi_info['ent'], - 'Train/PolicyRatio': pi_info['ratio'], + 'Loss/Loss_pi': loss.item(), + 'Train/Entropy': info['entrophy'], + 'Train/PolicyRatio': info['ratio'], + 'Train/PolicyStd': info['std'], 'Misc/AcceptanceStep': accept_step, 'Misc/Alpha': alpha.item(), - 'Misc/FinalStepNorm': final_step_dir.norm().mean().item(), + 'Misc/FinalStepNorm': step_direction.norm().mean().item(), 'Misc/xHx': xHx.mean().item(), 'Misc/H_inv_g': x.norm().item(), # H^-1 g - 'Misc/gradient_norm': torch.norm(g_flat).mean().item(), - 'Misc/cost_gradient_norm': torch.norm(b_flat).mean().item(), + 'Misc/gradient_norm': torch.norm(grad).mean().item(), + 'Misc/cost_gradient_norm': torch.norm(b_grad).mean().item(), 'Misc/Lambda_star': 1.0, 'Misc/Nu_star': 1.0, 'Misc/OptimCase': int(1), diff --git a/omnisafe/algorithms/on_policy/simmer/__init__.py b/omnisafe/algorithms/on_policy/simmer/__init__.py deleted file mode 100644 index 91b2cca64..000000000 --- a/omnisafe/algorithms/on_policy/simmer/__init__.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. 
All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Simmer algorithms.""" - -from omnisafe.algorithms.on_policy.simmer.ppo_lag_simmer_pid import PPOLagSimmerPid -from omnisafe.algorithms.on_policy.simmer.ppo_lag_simmer_q import PPOLagSimmerQ -from omnisafe.algorithms.on_policy.simmer.ppo_simmer_pid import PPOSimmerPid -from omnisafe.algorithms.on_policy.simmer.ppo_simmer_q import PPOSimmerQ - - -__all__ = [ - 'PPOLagSimmerPid', - 'PPOLagSimmerQ', - 'PPOSimmerPid', - 'PPOSimmerQ', -] diff --git a/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_pid.py b/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_pid.py deleted file mode 100644 index 302b65a47..000000000 --- a/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_pid.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Implementation of the PID version of the Simmer algorithm using PPOLag.""" - -from typing import NamedTuple - -from omnisafe.algorithms import registry -from omnisafe.algorithms.on_policy.naive_lagrange.ppo_lag import PPOLag - - -@registry.register -class PPOLagSimmerPid(PPOLag): - """The PID version of the Simmer algorithm implemented with PPOLag. - - References: - - Title: Effects of Safety State Augmentation on Safe Exploration - - Authors: Aivar Sootla, Alexander I. Cowen-Rivers, Jun Wang, Haitham Bou Ammar. - - URL: `Simmer RL `_ - """ - - def __init__(self, env_id: str, cfgs: NamedTuple) -> None: - """Initialize PPOLagSimmerPid. - - PPOLagSimmerPid is a combination of :class:`PPO` and :class:`Lagrange` model, - using :class:`Simmer` as the environment wrapper. - - Args: - env_id (str): The environment id. - cfgs (NamedTuple): The configuration of the algorithm. - """ - super().__init__(env_id=env_id, cfgs=cfgs) - - def _specific_init_logs(self): - super()._specific_init_logs() - self.logger.register_key('Metrics/EpBudget') - self.logger.register_key('Metrics/SafetyBudget') diff --git a/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_q.py b/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_q.py deleted file mode 100644 index 731d954b8..000000000 --- a/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_q.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Implementation of the Q Simmer algorithm using PPOLag.""" - -from typing import NamedTuple - -from omnisafe.algorithms import registry -from omnisafe.algorithms.on_policy.naive_lagrange.ppo_lag import PPOLag - - -@registry.register -class PPOLagSimmerQ(PPOLag): - """The Q Simmer algorithm implemented with PPOLag. - - References: - - Title: Effects of Safety State Augmentation on Safe Exploration - - Authors: Aivar Sootla, Alexander I. Cowen-Rivers, Jun Wang, Haitham Bou Ammar. - - URL: `Simmer RL `_ - """ - - def __init__(self, env_id: str, cfgs: NamedTuple) -> None: - """Initialize PPOLagSimmerQ. - - PPOLagSimmerQ is a combination of :class:`PPO` and :class:`Lagrange` model, - using :class:`Simmer` as the environment wrapper. - - Args: - env_id (str): The environment id. - cfgs (NamedTuple): The configuration of the algorithm. - """ - super().__init__(env_id=env_id, cfgs=cfgs) - - def _specific_init_logs(self): - super()._specific_init_logs() - self.logger.register_key('Metrics/EpBudget') - self.logger.register_key('Metrics/SafetyBudget') diff --git a/omnisafe/algorithms/on_policy/simmer/ppo_simmer_pid.py b/omnisafe/algorithms/on_policy/simmer/ppo_simmer_pid.py deleted file mode 100644 index 6c0bc33c8..000000000 --- a/omnisafe/algorithms/on_policy/simmer/ppo_simmer_pid.py +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Implementation of the PID version of the Simmer algorithm using PPO.""" - -from typing import NamedTuple - -from omnisafe.algorithms import registry -from omnisafe.algorithms.on_policy.base.ppo import PPO - - -@registry.register -class PPOSimmerPid(PPO): - """The PID version of the Simmer algorithm implemented with PPO. - - References: - - Title: Effects of Safety State Augmentation on Safe Exploration - - Authors: Aivar Sootla, Alexander I. Cowen-Rivers, Jun Wang, Haitham Bou Ammar. - - URL: `Simmer RL `_ - """ - - def __init__(self, env_id: str, cfgs: NamedTuple) -> None: - """Initialize PPOSimmerPid. - - PPOSimmerPid is a combination of :class:`PPO` and :class:`Simmer` environment wrapper. - - Args: - env_id (str): The environment id. - cfgs (NamedTuple): The configuration of the algorithm. 
- """ - super().__init__(env_id=env_id, cfgs=cfgs) - - def _specific_init_logs(self): - super()._specific_init_logs() - self.logger.register_key('Metrics/EpBudget') - self.logger.register_key('Metrics/SafetyBudget') diff --git a/omnisafe/algorithms/on_policy/simmer/ppo_simmer_q.py b/omnisafe/algorithms/on_policy/simmer/ppo_simmer_q.py deleted file mode 100644 index 2e6589433..000000000 --- a/omnisafe/algorithms/on_policy/simmer/ppo_simmer_q.py +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Implementation of the Q Simmer algorithm using PPO.""" - -from typing import NamedTuple - -from omnisafe.algorithms import registry -from omnisafe.algorithms.on_policy.base.ppo import PPO - - -@registry.register -class PPOSimmerQ(PPO): - """The Q Simmer algorithm implemented with PPO. - - References: - - Title: Effects of Safety State Augmentation on Safe Exploration - - Authors: Aivar Sootla, Alexander I. Cowen-Rivers, Jun Wang, Haitham Bou Ammar. - - URL: `Simmer RL `_ - """ - - def __init__(self, env_id: str, cfgs: NamedTuple) -> None: - """Initialize PPOSimmerQ. - - PPOSimmerQ is a combination of :class:`PPO` and :class:`Simmer` environment wrapper. - - Args: - env_id (str): The environment id. - cfgs (NamedTuple): The configuration of the algorithm. 
- """ - super().__init__(env_id=env_id, cfgs=cfgs) - - def _specific_init_logs(self): - super()._specific_init_logs() - self.logger.register_key('Metrics/EpBudget') - self.logger.register_key('Metrics/SafetyBudget') diff --git a/omnisafe/common/buffer/onpolicy_buffer.py b/omnisafe/common/buffer/onpolicy_buffer.py index 4c159dafb..cd858ad66 100644 --- a/omnisafe/common/buffer/onpolicy_buffer.py +++ b/omnisafe/common/buffer/onpolicy_buffer.py @@ -20,9 +20,8 @@ from omnisafe.common.buffer.base import BaseBuffer from omnisafe.typing import AdvatageEstimator, OmnisafeSpace -from omnisafe.utils import distributed_utils -from omnisafe.utils.core import discount_cumsum_torch -from omnisafe.utils.vtrace import calculate_v_trace +from omnisafe.utils import distributed +from omnisafe.utils.math import discount_cumsum class OnPolicyBuffer(BaseBuffer): # pylint: disable=too-many-instance-attributes @@ -95,14 +94,14 @@ def finish_path( ) -> None: """Finish the current path and calculate the advantages of state-action pairs.""" path_slice = slice(self.path_start_idx, self.ptr) - last_value_r = last_value_r.to(self.device) - last_value_c = last_value_c.to(self.device) + last_value_r = last_value_r.to(self._device) + last_value_c = last_value_c.to(self._device) rewards = torch.cat([self.data['reward'][path_slice], last_value_r]) values_r = torch.cat([self.data['value_r'][path_slice], last_value_r]) costs = torch.cat([self.data['cost'][path_slice], last_value_c]) values_c = torch.cat([self.data['value_c'][path_slice], last_value_c]) - discountred_ret = discount_cumsum_torch(rewards, self._gamma)[:-1] + discountred_ret = discount_cumsum(rewards, self._gamma)[:-1] self.data['discounted_ret'][path_slice] = discountred_ret rewards -= self._penalty_coefficient * costs @@ -122,7 +121,6 @@ def finish_path( def get(self) -> Dict[str, torch.Tensor]: """Get the data in the buffer.""" - assert self.ptr == self.max_size, 'The buffer is not full!' 
self.ptr, self.path_start_idx = 0, 0 data = { @@ -136,11 +134,11 @@ def get(self) -> Dict[str, torch.Tensor]: 'target_value_c': self.data['target_value_c'], } - self.data['adv_r'] = torch.zeros_like(self.data['adv_r']) - self.data['adv_c'] = torch.zeros_like(self.data['adv_c']) + # self.data['adv_r'] = torch.zeros_like(self.data['adv_r']) + # self.data['adv_c'] = torch.zeros_like(self.data['adv_c']) - adv_mean, adv_std, *_ = distributed_utils.mpi_statistics_scalar(data['adv_r']) - cadv_mean, *_ = distributed_utils.mpi_statistics_scalar(data['adv_c']) + adv_mean, adv_std, *_ = distributed.dist_statistics_scalar(data['adv_r']) + cadv_mean, *_ = distributed.dist_statistics_scalar(data['adv_c']) if self._standardized_adv_r: data['adv_r'] = (data['adv_r'] - adv_mean) / (adv_std + 1e-8) if self._standardized_adv_c: @@ -206,15 +204,15 @@ def _calculate_adv_and_value_targets( if self._advantage_estimator == 'gae': # GAE formula: A_t = \sum_{k=0}^{n-1} (lam*gamma)^k delta_{t+k} deltas = rewards[:-1] + self._gamma * values[1:] - values[:-1] - adv = discount_cumsum_torch(deltas, self._gamma * lam) + adv = discount_cumsum(deltas, self._gamma * lam) target_value = adv + values[:-1] elif self._advantage_estimator == 'gae-rtg': # GAE formula: A_t = \sum_{k=0}^{n-1} (lam*gamma)^k delta_{t+k} deltas = rewards[:-1] + self._gamma * values[1:] - values[:-1] - adv = discount_cumsum_torch(deltas, self._gamma * lam) + adv = discount_cumsum(deltas, self._gamma * lam) # compute rewards-to-go, to be targets for the value function update - target_value = discount_cumsum_torch(rewards, self._gamma)[:-1] + target_value = discount_cumsum(rewards, self._gamma)[:-1] elif self._advantage_estimator == 'vtrace': # v_s = V(x_s) + \sum^{T-1}_{t=s} \gamma^{t-s} @@ -222,7 +220,7 @@ def _calculate_adv_and_value_targets( # * \rho_t (r_t + \gamma V(x_{t+1}) - V(x_t)) path_slice = slice(self.path_start_idx, self.ptr) action_probs = self.data['logp'][path_slice].exp() - target_value, adv, _ = calculate_v_trace( + target_value, adv, _ = self._calculate_v_trace( policy_action_probs=action_probs, values=values, rewards=rewards, @@ -235,9 +233,72 @@ def _calculate_adv_and_value_targets( elif self._advantage_estimator == 'plain': # A(x, u) = Q(x, u) - V(x) = r(x, u) + gamma V(x+1) - V(x) adv = rewards[:-1] + self._gamma * values[1:] - values[:-1] - target_value = discount_cumsum_torch(rewards, self._gamma)[:-1] + target_value = discount_cumsum(rewards, self._gamma)[:-1] else: raise NotImplementedError return adv, target_value + + @staticmethod + # pylint: disable-next=too-many-arguments,too-many-locals + def _calculate_v_trace( + policy_action_probs: torch.Tensor, + values: torch.Tensor, # including bootstrap + rewards: torch.Tensor, # including bootstrap + behavior_action_probs: torch.Tensor, + gamma: float = 0.99, + rho_bar: float = 1.0, + c_bar: float = 1.0, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor,]: + r"""This function is used to calculate V-trace targets. + + .. math:: + A_t = \sum_{k=0}^{n-1} (\lambda \gamma)^k \delta_{t+k} + + (\lambda \gamma)^n * \rho_{t+n} * (1 - d_{t+n}) * (V(x_{t+n}) - b_{t+n}) + + Calculate V-trace targets for off-policy actor-critic learning recursively. + For more details, + please refer to the paper: `Espeholt et al. 2018, IMPALA `_. 
+ + Args: + policy_action_probs (torch.Tensor): action probabilities of policy network, shape=(sequence_length,) + values (torch.Tensor): state values, shape=(sequence_length+1,) + rewards (torch.Tensor): rewards, shape=(sequence_length+1,) + behavior_action_probs (torch.Tensor): action probabilities of behavior network, shape=(sequence_length,) + gamma (float): discount factor + rho_bar (float): clip rho + c_bar (float): clip c + + Returns: + tuple: V-trace targets, shape=(batch_size, sequence_length) + """ + assert values.ndim == 1, 'Please provide 1d-arrays' + assert rewards.ndim == 1 + assert policy_action_probs.ndim == 1 + assert behavior_action_probs.ndim == 1 + assert c_bar <= rho_bar + + sequence_length = policy_action_probs.shape[0] + # pylint: disable-next=assignment-from-no-return + rhos = torch.div(policy_action_probs, behavior_action_probs) + clip_rhos = torch.min( + rhos, torch.as_tensor(rho_bar) + ) # pylint: disable=assignment-from-no-return + clip_cs = torch.min( + rhos, torch.as_tensor(c_bar) + ) # pylint: disable=assignment-from-no-return + v_s = values[:-1].clone() # copy all values except bootstrap value + last_v_s = values[-1] # bootstrap from last state + + # calculate v_s + for index in reversed(range(sequence_length)): + delta = clip_rhos[index] * (rewards[index] + gamma * values[index + 1] - values[index]) + v_s[index] += delta + gamma * clip_cs[index] * (last_v_s - values[index + 1]) + last_v_s = v_s[index] # accumulate current v_s for next iteration + + # calculate q_targets + v_s_plus_1 = torch.cat((v_s[1:], values[-1:])) + policy_advantage = clip_rhos * (rewards[:-1] + gamma * v_s_plus_1 - values[:-1]) + + return v_s, policy_advantage, clip_rhos diff --git a/omnisafe/common/buffer/vector_onpolicy_buffer.py b/omnisafe/common/buffer/vector_onpolicy_buffer.py index 1d11d523c..59f634e69 100644 --- a/omnisafe/common/buffer/vector_onpolicy_buffer.py +++ b/omnisafe/common/buffer/vector_onpolicy_buffer.py @@ -20,7 +20,7 @@ from omnisafe.common.buffer.onpolicy_buffer import OnPolicyBuffer from omnisafe.typing import AdvatageEstimator, OmnisafeSpace -from omnisafe.utils import distributed_utils +from omnisafe.utils import distributed class VectorOnPolicyBuffer(OnPolicyBuffer): @@ -88,8 +88,8 @@ def get(self) -> Dict[str, torch.Tensor]: data_pre[k].append(v) data = {k: torch.cat(v, dim=0) for k, v in data_pre.items()} - adv_mean, adv_std, *_ = distributed_utils.mpi_statistics_scalar(data['adv_r']) - cadv_mean, *_ = distributed_utils.mpi_statistics_scalar(data['adv_c']) + adv_mean, adv_std, *_ = distributed.dist_statistics_scalar(data['adv_r']) + cadv_mean, *_ = distributed.dist_statistics_scalar(data['adv_c']) if self._standardized_adv_r: data['adv_r'] = (data['adv_r'] - adv_mean) / (adv_std + 1e-8) if self._standardized_adv_c: diff --git a/omnisafe/common/experiment_grid.py b/omnisafe/common/experiment_grid.py index 8b879cc9f..fb5069a05 100644 --- a/omnisafe/common/experiment_grid.py +++ b/omnisafe/common/experiment_grid.py @@ -21,6 +21,7 @@ from concurrent.futures import ProcessPoolExecutor as Pool from copy import deepcopy from textwrap import dedent +from typing import Any, Dict, List import numpy as np from tqdm import trange @@ -34,10 +35,10 @@ class ExperimentGrid: """Tool for running many experiments given hyper-parameters ranges.""" def __init__(self, exp_name='') -> None: - self.keys = [] - self.vals = [] - self.shs = [] - self.in_names = [] + self.keys: List[str] = [] + self.vals: List[Any] = [] + self.shs: List[str] = [] + self.in_names: List[str] = [] 
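# --- Editorial aside (illustration only, not part of the patch) ---------------
# A toy invocation of the _calculate_v_trace static method added to
# OnPolicyBuffer above, using made-up numbers for a 3-step trajectory plus a
# bootstrap value, to show the expected tensor shapes.
import torch

from omnisafe.common.buffer.onpolicy_buffer import OnPolicyBuffer

policy_probs = torch.tensor([0.7, 0.6, 0.5])      # pi(a_t | s_t), length = sequence_length
behavior_probs = torch.tensor([0.5, 0.5, 0.5])    # behavior-policy probabilities
values = torch.tensor([0.1, 0.2, 0.3, 0.4])       # V(s_0..s_3); the last entry is the bootstrap
rewards = torch.tensor([1.0, 0.0, 1.0, 0.0])      # r_0..r_2 plus a trailing bootstrap slot

v_s, policy_adv, clip_rhos = OnPolicyBuffer._calculate_v_trace(
    policy_action_probs=policy_probs,
    values=values,
    rewards=rewards,
    behavior_action_probs=behavior_probs,
    gamma=0.99,
)
# v_s, policy_adv and clip_rhos all have shape (3,); v_s holds the corrected
# value targets and policy_adv the clipped importance-weighted advantages.
# -------------------------------------------------------------------------------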
self.div_line_width = 80 assert isinstance(exp_name, str), 'Name has to be a string.' self.name = exp_name @@ -206,7 +207,7 @@ def update_dic(self, total_dic, item_dic): def _variants(self, keys, vals): """Recursively builds list of valid variants.""" if len(keys) == 1: - pre_variants = [{}] + pre_variants: List[Dict] = [{}] else: pre_variants = self._variants(keys[1:], vals[1:]) @@ -259,7 +260,7 @@ def variants(self): def unflatten_var(var): """Build the full nested dict version of var, based on key names.""" - new_var = {} + new_var: Dict = {} unflatten_set = set() for key, value in var.items(): diff --git a/omnisafe/common/lagrange.py b/omnisafe/common/lagrange.py index b5e85b82f..d03d2cddc 100644 --- a/omnisafe/common/lagrange.py +++ b/omnisafe/common/lagrange.py @@ -14,12 +14,10 @@ # ============================================================================== """Implementation of Lagrange.""" -import abc - import torch -class Lagrange(abc.ABC): +class Lagrange: r"""Abstract base class for Lagrangian-base Algorithms. This class implements the Lagrange multiplier update and the Lagrange loss. diff --git a/omnisafe/common/logger.py b/omnisafe/common/logger.py index 9398ebf33..61a68f335 100644 --- a/omnisafe/common/logger.py +++ b/omnisafe/common/logger.py @@ -26,7 +26,7 @@ import wandb from omnisafe.utils.config import Config -from omnisafe.utils.distributed_utils import mpi_statistics_scalar, proc_id +from omnisafe.utils.distributed import dist_statistics_scalar, get_rank # As of torch v1.9.0, torch.utils.tensorboard has a bug that is exposed by setuptools 59.6.0. The @@ -113,10 +113,10 @@ def __init__( # pylint: disable=too-many-arguments,too-many-locals self._hms_time = hms_time self._log_dir = os.path.join(output_dir, exp_name, relpath) self._verbose = verbose - self._main_proc = proc_id() == 0 + self._maste_proc = get_rank() == 0 self._output_file: TextIO - if self._main_proc: + if self._maste_proc: os.makedirs(self._log_dir, exist_ok=True) self._output_file = open( # pylint: disable=consider-using-with os.path.join(self._log_dir, output_fname), encoding='utf-8', mode='w' @@ -130,6 +130,7 @@ def __init__( # pylint: disable=too-many-arguments,too-many-locals self._data: Dict[str, Union[Deque[Union[int, float]], List[Union[int, float]]]] = {} self._headers_windwos: Dict[str, Optional[int]] = {} self._headers_minmax: Dict[str, bool] = {} + self._headers_delta: Dict[str, bool] = {} self._current_row: Dict[str, Union[int, float]] = {} if config is not None: @@ -139,10 +140,10 @@ def __init__( # pylint: disable=too-many-arguments,too-many-locals self._use_tensorboard = use_tensorboard self._use_wandb = use_wandb - if self._use_tensorboard and self._main_proc: + if self._use_tensorboard and self._maste_proc: self._tensorboard_writer = SummaryWriter(log_dir=os.path.join(self._log_dir, 'tb')) - if self._use_wandb and self._main_proc: + if self._use_wandb and self._maste_proc: project: str = self._config.get('wandb_project', 'omnisafe') name: str = self._config.get('wandb_name', f'{exp_name}/{relpath}') entity: str = self._config.get('wandb_entity', None) @@ -168,7 +169,7 @@ def log( msg (str): The message to be logged. color (int): The color of the message. """ - if self._verbose and self._main_proc: + if self._verbose and self._maste_proc: print(WordColor.colorize(msg, color, bold, highlight)) def save_config(self, config: Config) -> None: @@ -177,7 +178,7 @@ def save_config(self, config: Config) -> None: Args: config (dict): The configuration to be saved. 
""" - if self._main_proc: + if self._maste_proc: self.log('Save with config in config.json', 'yellow', bold=True) with open(os.path.join(self._log_dir, 'config.json'), encoding='utf-8', mode='w') as f: f.write(config.tojson()) @@ -192,7 +193,7 @@ def setup_torch_saver(self, what_to_save: Dict[str, Any]) -> None: def torch_save(self) -> None: """Save the torch model.""" - if self._main_proc: + if self._maste_proc: assert self._what_to_save is not None, 'Please setup torch saver first' path = os.path.join(self._log_dir, 'torch_save', f'epoch-{self._epoch}.pt') os.makedirs(os.path.dirname(path), exist_ok=True) @@ -204,7 +205,11 @@ def torch_save(self) -> None: torch.save(params, path) def register_key( - self, key: str, window_length: Optional[int] = None, min_and_max: bool = False + self, + key: str, + window_length: Optional[int] = None, + min_and_max: bool = False, + delta: bool = False, ) -> None: """Register a key to the logger. @@ -220,10 +225,17 @@ def register_key( self._current_row[f'{key}/Max'] = 0 self._current_row[f'{key}/Std'] = 0 self._headers_minmax[key] = True + else: self._current_row[key] = 0 self._headers_minmax[key] = False + if delta: + self._current_row[f'{key}/Delta'] = 0 + self._headers_delta[key] = True + else: + self._headers_delta[key] = False + if window_length is not None: self._data[key] = deque(maxlen=window_length) self._headers_windwos[key] = window_length @@ -250,21 +262,8 @@ def store(self, **kwargs: Union[int, float, np.ndarray, torch.Tensor]) -> None: def dump_tabular(self) -> None: """Dump the tabular data to the console and the file.""" - for key in self._data: - if self._headers_minmax[key]: - mean, min_val, max_val, std = self.get_stats(key, True) - self._current_row[f'{key}/Mean'] = mean - self._current_row[f'{key}/Min'] = min_val - self._current_row[f'{key}/Max'] = max_val - self._current_row[f'{key}/Std'] = std - else: - mean = self.get_stats(key, False)[0] - self._current_row[key] = mean - - if self._headers_windwos[key] is None: - self._data[key] = [] - - if self._main_proc: + self._update_current_row() + if self._maste_proc: self._epoch += 1 if self._verbose: key_lens = list(map(len, self._current_row.keys())) @@ -291,6 +290,26 @@ def dump_tabular(self) -> None: if self._use_wandb: wandb.log(self._current_row, step=self._epoch) + def _update_current_row(self) -> None: + for key in self._data: + if self._headers_minmax[key]: + old_data = self._current_row[f'{key}/Mean'] + mean, min_val, max_val, std = self.get_stats(key, True) + self._current_row[f'{key}/Mean'] = mean + self._current_row[f'{key}/Min'] = min_val + self._current_row[f'{key}/Max'] = max_val + self._current_row[f'{key}/Std'] = std + else: + old_data = self._current_row[key] + mean = self.get_stats(key, False)[0] + self._current_row[key] = mean + + if self._headers_delta[key]: + self._current_row[f'{key}/Delta'] = mean - old_data + + if self._headers_windwos[key] is None: + self._data[key] = [] + def get_stats(self, key, min_and_max: bool = False) -> Tuple[Union[int, float], ...]: """Get the statistics of the key.""" assert key in self._current_row, f'Key {key} has not been registered' @@ -299,17 +318,17 @@ def get_stats(self, key, min_and_max: bool = False) -> Tuple[Union[int, float], vals = list(vals) if min_and_max: - mean, std, min_val, max_val = mpi_statistics_scalar( + mean, std, min_val, max_val = dist_statistics_scalar( torch.tensor(vals), with_min_and_max=True ) return mean.item(), min_val.item(), max_val.item(), std.item() - mean, std = mpi_statistics_scalar( # pylint: 
disable=unbalanced-tuple-unpacking + mean, std = dist_statistics_scalar( # pylint: disable=unbalanced-tuple-unpacking torch.tensor(vals) ) return (mean.item(),) def close(self) -> None: """Close the logger.""" - if self._main_proc: + if self._maste_proc: self._output_file.close() diff --git a/omnisafe/common/normalizer.py b/omnisafe/common/normalizer.py index 1327d731f..fde7567cb 100644 --- a/omnisafe/common/normalizer.py +++ b/omnisafe/common/normalizer.py @@ -14,6 +14,8 @@ # ============================================================================== """Implementation of Vector Buffer.""" +from typing import Tuple + import torch import torch.nn as nn @@ -21,56 +23,88 @@ class Normalizer(nn.Module): """Calculate normalized raw_data from running mean and std - See https://www.johndcook.com/blog/standard_deviation/ + See Chan, Tony F.; Golub, Gene H.; LeVeque, Randall J. (1979), "Updating Formulae and + a Pairwise Algorithm for Computing Sample Variances." (PDF), Technical Report STAN-CS-79-773, + Department of Computer Science, Stanford University. """ - def __init__(self, shape, clip=1e6): + def __init__(self, shape: Tuple[int, ...], clip: float = 1e6) -> None: """Initialize the normalize.""" super().__init__() - self.raw_data = nn.Parameter( - torch.zeros(*shape), requires_grad=False - ) # Current value of data stream - self.mean = nn.Parameter(torch.zeros(*shape), requires_grad=False) # Current mean - self.sumsq = nn.Parameter( - torch.zeros(*shape), requires_grad=False - ) # Current sum of squares, used in var/std calculation + if shape == (): + self.register_buffer('_mean', torch.tensor(0.0)) + self.register_buffer('_sumsq', torch.tensor(0.0)) + self.register_buffer('_var', torch.tensor(0.0)) + self.register_buffer('_std', torch.tensor(0.0)) + self.register_buffer('_count', torch.tensor(0)) + self.register_buffer('_clip', clip * torch.tensor(1.0)) + else: + self.register_buffer('_mean', torch.zeros(*shape)) + self.register_buffer('_sumsq', torch.zeros(*shape)) + self.register_buffer('_var', torch.zeros(*shape)) + self.register_buffer('_std', torch.zeros(*shape)) + self.register_buffer('_count', torch.tensor(0)) + self.register_buffer('_clip', clip * torch.ones(*shape)) - self.var = nn.Parameter(torch.zeros(*shape), requires_grad=False) # Current variance - self.std = nn.Parameter(torch.zeros(*shape), requires_grad=False) # Current std + self._mean: torch.Tensor # running mean + self._sumsq: torch.Tensor # running sum of squares + self._var: torch.Tensor # running variance + self._std: torch.Tensor # running standard deviation + self._count: torch.Tensor # number of samples + self._clip: torch.Tensor # clip value - self.count = nn.Parameter(torch.zeros(1), requires_grad=False) # Counter + self._shape = shape + self._first = True - self.clip = nn.Parameter(clip * torch.ones(*shape), requires_grad=False) + @property + def shape(self) -> Tuple[int, ...]: + """Return the shape of the normalize.""" + return self._shape - def push(self, raw_data): - """Push a new value into the stream.""" - self.raw_data.data = raw_data - self.count.data[0] += 1 - if self.count.data[0] == 1: - self.mean.data = raw_data - else: - old_mean = self.mean - self.mean.data += (raw_data - self.mean.data) / self.count.data - self.sumsq.data += (raw_data - old_mean.data) * (raw_data - self.mean.data) - self.var.data = self.sumsq.data / (self.count.data - 1) - self.std.data = torch.sqrt(self.var.data) - self.std.data = torch.max(self.std.data, 1e-2 * torch.ones_like(self.std.data)) + @property + def mean(self) -> 
torch.Tensor: + """Return the mean of the normalize.""" + return self._mean - def forward(self, raw_data=None): - """Normalize the raw_data.""" - return self.normalize(raw_data) + @property + def std(self) -> torch.Tensor: + """Return the std of the normalize.""" + return self._std - def pre_process(self, raw_data): - """Pre-process the raw_data.""" - if len(raw_data.shape) == 1: - raw_data = raw_data.unsqueeze(-1) - return raw_data + def forward(self, data: torch.Tensor) -> torch.Tensor: + """Normalize the data.""" + return self.normalize(data) - def normalize(self, raw_data=None): - """Normalize the raw_data.""" - raw_data = self.pre_process(raw_data) - self.push(raw_data) - if self.count <= 1: - return self.raw_data.data - output = (self.raw_data.data - self.mean.data) / self.std.data - return torch.clamp(output, -self.clip.data, self.clip.data) + def normalize(self, data: torch.Tensor) -> torch.Tensor: + """Normalize the _data.""" + data = data.to(self._mean.device) + self._push(data) + if self._count <= 1: + return data + output = (data - self._mean) / self._std + return torch.clamp(output, -self._clip, self._clip) + + def _push(self, raw_data: torch.Tensor) -> None: + if raw_data.shape == self._shape: + raw_data = raw_data.unsqueeze(0) + assert raw_data.shape[1:] == self._shape, 'data shape must be equal to (batch_size, *shape)' + + if self._first: + self._mean = torch.mean(raw_data, dim=0) + self._sumsq = torch.sum((raw_data - self._mean) ** 2, dim=0) + self._count = torch.tensor( + raw_data.shape[0], dtype=self._count.dtype, device=self._count.device + ) + self._first = False + else: + count_raw = raw_data.shape[0] + count = self._count + count_raw + mean_raw = torch.mean(raw_data, dim=0) + delta = mean_raw - self._mean + self._mean += delta * count_raw / count + sumq_raw = torch.sum((raw_data - mean_raw) ** 2, dim=0) + self._sumsq += sumq_raw + delta**2 * self._count * count_raw / count + self._count = count + self._var = self._sumsq / (self._count - 1) + self._std = torch.sqrt(self._var) + self._std = torch.max(self._std, 1e-2 * torch.ones_like(self._std)) diff --git a/omnisafe/common/pid_lagrange.py b/omnisafe/common/pid_lagrange.py index 9b5c6b377..cab894708 100644 --- a/omnisafe/common/pid_lagrange.py +++ b/omnisafe/common/pid_lagrange.py @@ -16,6 +16,7 @@ import abc from collections import deque +from typing import Deque # pylint: disable-next=too-few-public-methods,too-many-instance-attributes @@ -62,12 +63,12 @@ def __init__( self.sum_norm = sum_norm self.diff_norm = diff_norm self.pid_i = lagrangian_multiplier_init - self.cost_ds = deque(maxlen=self.pid_d_delay) + self.cost_ds: Deque[float] = deque(maxlen=self.pid_d_delay) self.cost_ds.append(0) - self._delta_p = 0 - self._cost_d = 0 - self.cost_limit = cost_limit - self.cost_penalty = 0 + self._delta_p: float = 0 + self._cost_d: float = 0 + self.cost_limit: float = cost_limit + self.cost_penalty: float = 0 def pid_update(self, ep_cost_avg: float) -> None: r"""Update the PID controller. diff --git a/omnisafe/common/record_queue.py b/omnisafe/common/record_queue.py deleted file mode 100644 index 4a9d998b3..000000000 --- a/omnisafe/common/record_queue.py +++ /dev/null @@ -1,62 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Implementation of Record Queue.""" - -from collections import deque - -import numpy as np - -from omnisafe.typing import List - - -class RecordQueue: - """RecordQueue.""" - - def __init__(self, *names, maxlen=100) -> None: - """Initialize the RecordQueue.""" - self.queues = {} - self._create_deques(*names, maxlen=maxlen) - - def _create_deques(self, *names, maxlen=100) -> None: - """Create queues by names.""" - for name in names: - self.queues[name] = deque(maxlen=maxlen) - - def append(self, **kwargs) -> None: - """Add values to the queues.""" - for key, value in kwargs.items(): - assert key in self.queues, f'{key} has not been set in queues {self.queues.keys()}' - self.queues[key].append(value) - - def non_empty_mean(self, name) -> np.ndarray: - """Get the mean of the non-empty values.""" - return np.mean(self.queues[name]) if len(self.queues[name]) else 0.0 - - def get_mean(self, *names) -> List: - """Get the means of needed queue names.""" - assert all( - name in self.queues for name in names - ), f'{names} has not been set in queues {self.queues.keys()}' - if len(names) == 1: - return self.non_empty_mean(names[0]) - return [self.non_empty_mean(name) for name in names] - - def reset(self, *names) -> None: - """Reset the needed queue.""" - assert all( - name in self.queues for name in names - ), f'{names} has not been set in queues {self.queues.keys()}' - for name in names: - self.queues[name].clear() diff --git a/omnisafe/configs/on-policy/CPO.yaml b/omnisafe/configs/on-policy/CPO.yaml index 08b3a1712..bf605bd2d 100644 --- a/omnisafe/configs/on-policy/CPO.yaml +++ b/omnisafe/configs/on-policy/CPO.yaml @@ -63,6 +63,14 @@ defaults: cg_iters: 10 # Subsampled observation fvp_obs: None + # The number of parallel environments + num_envs: 32 + # Whether to use standardized reward + reward_normalize: True + # Whether to use standardized cost + cost_normalize: True + # Whether to use standardized obs + obs_normalize: True # ---------------------------------------Optional Configuration-------------------------------- # ## -----------------------------------Configuration For Cost Critic--------------------------- ## @@ -74,6 +82,8 @@ defaults: linear_lr_decay: False # Whether to use exploration noise anneal exploration_noise_anneal: False + # std + std: [0.5, 0.1] # The coefficient of reward penalty penalty_param: 0.0 # Whether to use KL early stopping @@ -90,38 +100,28 @@ defaults: critic_norm_coeff: 0.001 ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
weight_initialization_mode: "kaiming_uniform" # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # Output activation function - output_activation: identity - # Whether to scale action. - scale_action: False - # Whether to clip action. - clip_action: True - # Whther to learn the standard deviation of Gaussian noise - std_learning: True - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh + actor_type: gaussian_learning + # Whether to use linear decay of learning rate + linear_lr_decay: True + # Configuration of Actor network + actor: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Actor network + lr: 0.0003 + # Configuration of Critic network + critic: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Critic network + lr: 0.0003 ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: # Reward discounted factor diff --git a/omnisafe/configs/on-policy/CPPOPid.yaml b/omnisafe/configs/on-policy/CPPOPid.yaml index bd74d172d..741f12196 100644 --- a/omnisafe/configs/on-policy/CPPOPid.yaml +++ b/omnisafe/configs/on-policy/CPPOPid.yaml @@ -31,7 +31,7 @@ defaults: # Number of epochs epochs: 500 # Number of steps per epoch - steps_per_epoch: 32784 + steps_per_epoch: 32768 # Number of update iteration for Actor network actor_iters: 10 # Number of update iteration for Critic network @@ -59,6 +59,14 @@ defaults: batch_size: 10000 # The clip range for PPO loss clip: 0.2 + # The number of parallel environments + num_envs: 32 + # Whether to use standardized reward + reward_normalize: True + # Whether to use standardized cost + cost_normalize: True + # Whether to use standardized obs + obs_normalize: True # ---------------------------------------Optional Configuration-------------------------------- # ## -----------------------------------Configuration For Cost Critic--------------------------- ## @@ -70,6 +78,8 @@ defaults: linear_lr_decay: True # Whether to use exploration noise anneal exploration_noise_anneal: False + # std + std: [0.5, 0.1] # The coefficient of reward penalty penalty_param: 0.0 # Whether to use KL early stopping @@ -86,38 +96,28 @@ defaults: critic_norm_coeff: 0.001 ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
weight_initialization_mode: "kaiming_uniform" # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # Output activation function - output_activation: identity - # Whether to scale action. - scale_action: False - # Whether to clip action. - clip_action: False - # Whther to learn the standard deviation of Gaussian noise - std_learning: True - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh + actor_type: gaussian_learning + # Whether to use linear decay of learning rate + linear_lr_decay: True + # Configuration of Actor network + actor: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Actor network + lr: 0.0003 + # Configuration of Critic network + critic: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Critic network + lr: 0.0003 ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: # Reward discounted factor @@ -148,7 +148,7 @@ defaults: max_len: 100 # The number of threads used to sample data num_threads: 20 - ## --------------------------------------Configuration For PID--------------------------------- ## +## --------------------------------------Configuration For PID--------------------------------- ## PID_cfgs: # KP for PID pid_kp: 0.01 diff --git a/omnisafe/configs/on-policy/CUP.yaml b/omnisafe/configs/on-policy/CUP.yaml index 514e678d9..a9d93db4a 100644 --- a/omnisafe/configs/on-policy/CUP.yaml +++ b/omnisafe/configs/on-policy/CUP.yaml @@ -52,6 +52,14 @@ defaults: critic_lr: 0.0003 # The Address for saving training process data data_dir: "./runs" + # The number of parallel environments + num_envs: 32 + # Whether to use standardized reward + reward_normalize: True + # Whether to use standardized cost + cost_normalize: True + # Whether to use standardized obs + obs_normalize: True ## ---------------------------Basic configurations for derived class FOCOPS------------------- ## # The thereshold for KL early stopping @@ -77,6 +85,8 @@ defaults: linear_lr_decay: True # Whether to use exploration noise anneal exploration_noise_anneal: False + # std + std: [0.5, 0.1] # The coefficient of reward penalty penalty_param: 0.0 # Whether to use KL early stopping @@ -93,38 +103,28 @@ defaults: critic_norm_coeff: 0.001 ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
weight_initialization_mode: "kaiming_uniform" # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # Output activation function - output_activation: identity - # Whether to scale action. - scale_action: False - # Whether to clip action. - clip_action: False - # Whther to learn the standard deviation of Gaussian noise - std_learning: True - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh + actor_type: gaussian_learning + # Whether to use linear decay of learning rate + linear_lr_decay: True + # Configuration of Actor network + actor: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Actor network + lr: 0.0003 + # Configuration of Critic network + critic: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Critic network + lr: 0.0003 ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: # Reward discounted factor diff --git a/omnisafe/configs/on-policy/FOCOPS.yaml b/omnisafe/configs/on-policy/FOCOPS.yaml index 6e2ac0d89..4094af4e2 100644 --- a/omnisafe/configs/on-policy/FOCOPS.yaml +++ b/omnisafe/configs/on-policy/FOCOPS.yaml @@ -64,6 +64,14 @@ defaults: lam: 1.5 # The size of batch for policy update batch_size: 2000 + # The number of parallel environments + num_envs: 32 + # Whether to use standardized reward + reward_normalize: True + # Whether to use standardized cost + cost_normalize: True + # Whether to use standardized obs + obs_normalize: True # ---------------------------------------Optional Configuration-------------------------------- # ## -----------------------------------Configuration For Cost Critic--------------------------- ## @@ -75,6 +83,8 @@ defaults: linear_lr_decay: True # Whether to use exploration noise anneal exploration_noise_anneal: False + # std + std: [0.5, 0.1] # The coefficient of reward penalty penalty_param: 0.0 # Whether to use KL early stopping @@ -91,38 +101,28 @@ defaults: critic_norm_coeff: 0.001 ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". weight_initialization_mode: "kaiming_uniform" # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # Output activation function - output_activation: identity - # Whether to scale action. - scale_action: False - # Whether to clip action. 
- clip_action: False - # Whther to learn the standard deviation of Gaussian noise - std_learning: True - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh + actor_type: gaussian_learning + # Whether to use linear decay of learning rate + linear_lr_decay: True + # Configuration of Actor network + actor: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Actor network + lr: 0.0003 + # Configuration of Critic network + critic: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Critic network + lr: 0.0003 ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: # Reward discounted factor diff --git a/omnisafe/configs/on-policy/IPO.yaml b/omnisafe/configs/on-policy/IPO.yaml index f7c3c8ba3..2cf52f48a 100644 --- a/omnisafe/configs/on-policy/IPO.yaml +++ b/omnisafe/configs/on-policy/IPO.yaml @@ -63,6 +63,14 @@ defaults: kappa: 0.01 # The max of cost penalty penalty_max: 1.0 + # The number of parallel environments + num_envs: 32 + # Whether to use standardized reward + reward_normalize: True + # Whether to use standardized cost + cost_normalize: True + # Whether to use standardized obs + obs_normalize: True # ---------------------------------------Optional Configuration-------------------------------- # ## -----------------------------------Configuration For Cost Critic--------------------------- ## @@ -76,6 +84,8 @@ defaults: linear_lr_decay: True # Whether to use exploration noise anneal exploration_noise_anneal: False + # std + std: [0.5, 0.1] # The coefficient of reward penalty penalty_param: 0.0 # Whether to use KL early stopping @@ -90,38 +100,28 @@ defaults: critic_norm_coeff: 0.001 ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". weight_initialization_mode: "kaiming_uniform" # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # Output activation function - output_activation: identity - # Whether to scale action. - scale_action: False - # Whether to clip action. 
- clip_action: False - # Whther to learn the standard deviation of Gaussian noise - std_learning: True - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh + actor_type: gaussian_learning + # Whether to use linear decay of learning rate + linear_lr_decay: True + # Configuration of Actor network + actor: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Actor network + lr: 0.0003 + # Configuration of Critic network + critic: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Critic network + lr: 0.0003 ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: # Reward discounted factor diff --git a/omnisafe/configs/on-policy/NaturalPG.yaml b/omnisafe/configs/on-policy/NaturalPG.yaml index 8aef74061..30c044d46 100644 --- a/omnisafe/configs/on-policy/NaturalPG.yaml +++ b/omnisafe/configs/on-policy/NaturalPG.yaml @@ -63,6 +63,14 @@ defaults: cg_iters: 10 # Subsampled observation fvp_obs: None + # The number of parallel environments + num_envs: 32 + # Whether to use standardized reward + reward_normalize: True + # Whether to use standardized cost + cost_normalize: True + # Whether to use standardized obs + obs_normalize: True # ---------------------------------------Optional Configuration-------------------------------- # ## -----------------------------------Configuration For Cost Critic--------------------------- ## @@ -74,6 +82,8 @@ defaults: linear_lr_decay: False # Whether to use exploration noise anneal exploration_noise_anneal: False + # std + std: [0.5, 0.1] # The coefficient of reward penalty penalty_param: 0.0 # Whether to use KL early stopping @@ -90,38 +100,28 @@ defaults: critic_norm_coeff: 0.001 ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". weight_initialization_mode: "kaiming_uniform" # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # Output activation function - output_activation: identity - # Whether to scale action. - scale_action: False - # Whether to clip action. 
- clip_action: False - # Whther to learn the standard deviation of Gaussian noise - std_learning: True - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh + actor_type: gaussian_learning + # Whether to use linear decay of learning rate + linear_lr_decay: True + # Configuration of Actor network + actor: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Actor network + lr: 0.0003 + # Configuration of Critic network + critic: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Critic network + lr: 0.0003 ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: # Reward discounted factor diff --git a/omnisafe/configs/on-policy/OnCRPO.yaml b/omnisafe/configs/on-policy/OnCRPO.yaml index 516567025..5b7f0c999 100644 --- a/omnisafe/configs/on-policy/OnCRPO.yaml +++ b/omnisafe/configs/on-policy/OnCRPO.yaml @@ -59,6 +59,14 @@ defaults: batch_size: 10000 # The clip range for PPO loss clip: 0.2 + # The number of parallel environments + num_envs: 32 + # Whether to use standardized reward + reward_normalize: True + # Whether to use standardized cost + cost_normalize: True + # Whether to use standardized obs + obs_normalize: True # ---------------------------------------Optional Configuration-------------------------------- # ## -----------------------------------Configuration For Cost Critic--------------------------- ## @@ -74,6 +82,8 @@ defaults: linear_lr_decay: True # Whether to use exploration noise anneal exploration_noise_anneal: False + # std + std: [0.5, 0.1] # The coefficient of reward penalty penalty_param: 0.0 # Whether to use KL early stopping @@ -88,38 +98,28 @@ defaults: critic_norm_coeff: 0.001 ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". weight_initialization_mode: "kaiming_uniform" # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # Output activation function - output_activation: identity - # Whether to scale action. - scale_action: False - # Whether to clip action. 
- clip_action: False - # Whther to learn the standard deviation of Gaussian noise - std_learning: True - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh + actor_type: gaussian_learning + # Whether to use linear decay of learning rate + linear_lr_decay: True + # Configuration of Actor network + actor: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Actor network + lr: 0.0003 + # Configuration of Critic network + critic: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Critic network + lr: 0.0003 ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: # Reward discounted factor diff --git a/omnisafe/configs/on-policy/P3O.yaml b/omnisafe/configs/on-policy/P3O.yaml index 4f1b25f03..ccb642043 100644 --- a/omnisafe/configs/on-policy/P3O.yaml +++ b/omnisafe/configs/on-policy/P3O.yaml @@ -61,6 +61,14 @@ defaults: clip: 0.2 # The coefficient of cost penalty kappa: 20.0 + # The number of parallel environments + num_envs: 32 + # Whether to use standardized reward + reward_normalize: True + # Whether to use standardized cost + cost_normalize: True + # Whether to use standardized obs + obs_normalize: True # ---------------------------------------Optional Configuration-------------------------------- # ## -----------------------------------Configuration For Cost Critic--------------------------- ## @@ -74,6 +82,8 @@ defaults: linear_lr_decay: True # Whether to use exploration noise anneal exploration_noise_anneal: False + # std + std: [0.5, 0.1] # The coefficient of reward penalty penalty_param: 0.0 # Whether to use KL early stopping @@ -88,38 +98,28 @@ defaults: critic_norm_coeff: 0.001 ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". weight_initialization_mode: "kaiming_uniform" # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # Output activation function - output_activation: identity - # Whether to scale action. - scale_action: False - # Whether to clip action. 
- clip_action: False - # Whther to learn the standard deviation of Gaussian noise - std_learning: True - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh + actor_type: gaussian_learning + # Whether to use linear decay of learning rate + linear_lr_decay: True + # Configuration of Actor network + actor: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Actor network + lr: 0.0003 + # Configuration of Critic network + critic: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Critic network + lr: 0.0003 ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: # Reward discounted factor diff --git a/omnisafe/configs/on-policy/PCPO.yaml b/omnisafe/configs/on-policy/PCPO.yaml index d363832ea..97485d459 100644 --- a/omnisafe/configs/on-policy/PCPO.yaml +++ b/omnisafe/configs/on-policy/PCPO.yaml @@ -63,6 +63,14 @@ defaults: cg_iters: 10 # Subsampled observation fvp_obs: None + # The number of parallel environments + num_envs: 32 + # Whether to use standardized reward + reward_normalize: True + # Whether to use standardized cost + cost_normalize: True + # Whether to use standardized obs + obs_normalize: True # ---------------------------------------Optional Configuration-------------------------------- # ## -----------------------------------Configuration For Cost Critic--------------------------- ## @@ -74,6 +82,8 @@ defaults: linear_lr_decay: False # Whether to use exploration noise anneal exploration_noise_anneal: False + # std + std: [0.5, 0.1] # The coefficient of reward penalty penalty_param: 0.0 # Whether to use KL early stopping @@ -90,38 +100,28 @@ defaults: critic_norm_coeff: 0.001 ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". weight_initialization_mode: "kaiming_uniform" # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # Output activation function - output_activation: identity - # Whether to scale action. - scale_action: False - # Whether to clip action. 
- clip_action: False - # Whther to learn the standard deviation of Gaussian noise - std_learning: True - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh + actor_type: gaussian_learning + # Whether to use linear decay of learning rate + linear_lr_decay: True + # Configuration of Actor network + actor: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Actor network + lr: 0.0003 + # Configuration of Critic network + critic: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Critic network + lr: 0.0003 ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: # Reward discounted factor diff --git a/omnisafe/configs/on-policy/PDO.yaml b/omnisafe/configs/on-policy/PDO.yaml index d9ccd2da9..7b64a8564 100644 --- a/omnisafe/configs/on-policy/PDO.yaml +++ b/omnisafe/configs/on-policy/PDO.yaml @@ -31,7 +31,7 @@ defaults: # Number of epochs epochs: 500 # Number of steps per epoch - steps_per_epoch: 32768 + steps_per_epoch: 32000 # Number of update iteration for Actor network actor_iters: 10 # Number of update iteration for Critic network @@ -57,17 +57,28 @@ defaults: target_kl: 0.02 # The clip range for PPO loss clip: 0.2 + # The number of parallel environments + num_envs: 32 + # Whether to use standardized reward + reward_normalize: True + # Whether to use standardized cost + cost_normalize: True + # Whether to use standardized obs + obs_normalize: True + # ---------------------------------------Optional Configuration-------------------------------- # ## -----------------------------------Configuration For Cost Critic--------------------------- ## # Whether to use cost critic - use_cost: True + use_cost: False # Cost discounted factor cost_gamma: 1.0 # Whether to use linear decay of learning rate linear_lr_decay: True # Whether to use exploration noise anneal exploration_noise_anneal: False + # std + std: [0.5, 0.1] # The coefficient of reward penalty penalty_param: 0.0 # Whether to use KL early stopping @@ -84,38 +95,28 @@ defaults: critic_norm_coeff: 0.001 ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". weight_initialization_mode: "kaiming_uniform" # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # Output activation function - output_activation: identity - # Whether to scale action. - scale_action: False - # Whether to clip action. 
- clip_action: False - # Whther to learn the standard deviation of Gaussian noise - std_learning: True - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh + actor_type: gaussian_learning + # Whether to use linear decay of learning rate + linear_lr_decay: True + # Configuration of Actor network + actor: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Actor network + lr: 0.0003 + # Configuration of Critic network + critic: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Critic network + lr: 0.0003 ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: # Reward discounted factor diff --git a/omnisafe/configs/on-policy/PPO.yaml b/omnisafe/configs/on-policy/PPO.yaml index e2c79598f..df1a2f18b 100644 --- a/omnisafe/configs/on-policy/PPO.yaml +++ b/omnisafe/configs/on-policy/PPO.yaml @@ -31,7 +31,7 @@ defaults: # Number of epochs epochs: 500 # Number of steps per epoch - steps_per_epoch: 32768 + steps_per_epoch: 32000 # Number of update iteration for Actor network actor_iters: 40 # Number of update iteration for Critic network @@ -59,6 +59,14 @@ defaults: batch_size: 10000 # The clip range for PPO loss clip: 0.2 + # The number of parallel environments + num_envs: 32 + # Whether to use standardized reward + reward_normalize: True + # Whether to use standardized cost + cost_normalize: True + # Whether to use standardized obs + obs_normalize: True # ---------------------------------------Optional Configuration-------------------------------- # ## -----------------------------------Configuration For Cost Critic--------------------------- ## @@ -70,6 +78,8 @@ defaults: linear_lr_decay: True # Whether to use exploration noise anneal exploration_noise_anneal: False + # std + std: [0.5, 0.1] # The coefficient of reward penalty penalty_param: 0.0 # Whether to use KL early stopping @@ -84,38 +94,28 @@ defaults: critic_norm_coeff: 0.001 ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". weight_initialization_mode: "kaiming_uniform" # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # Output activation function - output_activation: identity - # Whether to scale action. - scale_action: False - # Whether to clip action. 
- clip_action: False - # Whther to learn the standard deviation of Gaussian noise - std_learning: True - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh + actor_type: gaussian_learning + # Whether to use linear decay of learning rate + linear_lr_decay: True + # Configuration of Actor network + actor: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Actor network + lr: 0.0003 + # Configuration of Critic network + critic: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Critic network + lr: 0.0003 ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: # Reward discounted factor diff --git a/omnisafe/configs/on-policy/PPOEarlyTerminated.yaml b/omnisafe/configs/on-policy/PPOEarlyTerminated.yaml index 313fbdd1f..71951d7d8 100644 --- a/omnisafe/configs/on-policy/PPOEarlyTerminated.yaml +++ b/omnisafe/configs/on-policy/PPOEarlyTerminated.yaml @@ -59,6 +59,16 @@ defaults: batch_size: 10000 # The clip range for PPO loss clip: 0.2 + # The number of parallel environments + num_envs: 1 + # Whether to use standardized reward + reward_normalize: True + # Whether to use standardized cost + cost_normalize: True + # Whether to use standardized obs + obs_normalize: True + # cost_limit + cost_limit: 25 # ---------------------------------------Optional Configuration-------------------------------- # ## -----------------------------------Configuration For Cost Critic--------------------------- ## @@ -70,6 +80,8 @@ defaults: linear_lr_decay: True # Whether to use exploration noise anneal exploration_noise_anneal: False + # std + std: [0.5, 0.1] # The coefficient of reward penalty penalty_param: 0.0 # Whether to use KL early stopping @@ -86,38 +98,28 @@ defaults: critic_norm_coeff: 0.001 ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". weight_initialization_mode: "kaiming_uniform" # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # Output activation function - output_activation: identity - # Whether to scale action. - scale_action: False - # Whether to clip action. 
- clip_action: False - # Whther to learn the standard deviation of Gaussian noise - std_learning: True - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh + actor_type: gaussian_learning + # Whether to use linear decay of learning rate + linear_lr_decay: True + # Configuration of Actor network + actor: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Actor network + lr: 0.0003 + # Configuration of Critic network + critic: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Critic network + lr: 0.0003 ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: # Reward discounted factor diff --git a/omnisafe/configs/on-policy/PPOLag.yaml b/omnisafe/configs/on-policy/PPOLag.yaml index 6b00a1aa0..cfd1d4ab5 100644 --- a/omnisafe/configs/on-policy/PPOLag.yaml +++ b/omnisafe/configs/on-policy/PPOLag.yaml @@ -59,6 +59,14 @@ defaults: batch_size: 10000 # The clip range for PPO loss clip: 0.2 + # The number of parallel environments + num_envs: 32 + # Whether to use standardized reward + reward_normalize: True + # Whether to use standardized cost + cost_normalize: True + # Whether to use standardized obs + obs_normalize: True # ---------------------------------------Optional Configuration-------------------------------- # ## -----------------------------------Configuration For Cost Critic--------------------------- ## @@ -70,6 +78,8 @@ defaults: linear_lr_decay: True # Whether to use exploration noise anneal exploration_noise_anneal: False + # std + std: [0.5, 0.1] # The coefficient of reward penalty penalty_param: 0.0 # Whether to use KL early stopping @@ -86,38 +96,28 @@ defaults: critic_norm_coeff: 0.001 ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". weight_initialization_mode: "kaiming_uniform" # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # Output activation function - output_activation: identity - # Whether to scale action. - scale_action: False - # Whether to clip action. 
- clip_action: False - # Whther to learn the standard deviation of Gaussian noise - std_learning: True - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh + actor_type: gaussian_learning + # Whether to use linear decay of learning rate + linear_lr_decay: True + # Configuration of Actor network + actor: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Actor network + lr: 0.0003 + # Configuration of Critic network + critic: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Critic network + lr: 0.0003 ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: # Reward discounted factor diff --git a/omnisafe/configs/on-policy/PPOLagEarlyTerminated.yaml b/omnisafe/configs/on-policy/PPOLagEarlyTerminated.yaml index 71eb61fe2..2823db932 100644 --- a/omnisafe/configs/on-policy/PPOLagEarlyTerminated.yaml +++ b/omnisafe/configs/on-policy/PPOLagEarlyTerminated.yaml @@ -59,6 +59,16 @@ defaults: batch_size: 10000 # The clip range for PPO loss clip: 0.2 + # The number of parallel environments + num_envs: 1 + # Whether to use standardized reward + reward_normalize: True + # Whether to use standardized cost + cost_normalize: True + # Whether to use standardized obs + obs_normalize: True + # cost_limit + cost_limit: 25 # ---------------------------------------Optional Configuration-------------------------------- # ## -----------------------------------Configuration For Cost Critic--------------------------- ## @@ -70,6 +80,8 @@ defaults: linear_lr_decay: True # Whether to use exploration noise anneal exploration_noise_anneal: False + # std + std: [0.5, 0.1] # The coefficient of reward penalty penalty_param: 0.0 # Whether to use KL early stopping @@ -86,38 +98,28 @@ defaults: critic_norm_coeff: 0.001 ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". weight_initialization_mode: "kaiming_uniform" # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # Output activation function - output_activation: identity - # Whether to scale action. - scale_action: False - # Whether to clip action. 
- clip_action: False - # Whther to learn the standard deviation of Gaussian noise - std_learning: True - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh + actor_type: gaussian_learning + # Whether to use linear decay of learning rate + linear_lr_decay: True + # Configuration of Actor network + actor: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Actor network + lr: 0.0003 + # Configuration of Critic network + critic: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Critic network + lr: 0.0003 ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: # Reward discounted factor diff --git a/omnisafe/configs/on-policy/PPOLagSaute.yaml b/omnisafe/configs/on-policy/PPOLagSaute.yaml index 1cc769c86..e1b2b7362 100644 --- a/omnisafe/configs/on-policy/PPOLagSaute.yaml +++ b/omnisafe/configs/on-policy/PPOLagSaute.yaml @@ -59,6 +59,14 @@ defaults: batch_size: 10000 # The clip range for PPO loss clip: 0.2 + # The number of parallel environments + num_envs: 32 + # Whether to use standardized reward + reward_normalize: False + # Whether to use standardized cost + cost_normalize: True + # Whether to use standardized obs + obs_normalize: True # ---------------------------------------Optional Configuration-------------------------------- # ## -----------------------------------Configuration For Cost Critic--------------------------- ## @@ -70,6 +78,8 @@ defaults: linear_lr_decay: True # Whether to use exploration noise anneal exploration_noise_anneal: False + # std + std: [0.5, 0.1] # The coefficient of reward penalty penalty_param: 0.0 # Whether to use KL early stopping @@ -86,38 +96,28 @@ defaults: critic_norm_coeff: 0.001 ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". weight_initialization_mode: "kaiming_uniform" # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # Output activation function - output_activation: identity - # Whether to scale action. - scale_action: False - # Whether to clip action. 
- clip_action: False - # Whther to learn the standard deviation of Gaussian noise - std_learning: True - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh + actor_type: gaussian_learning + # Whether to use linear decay of learning rate + linear_lr_decay: True + # Configuration of Actor network + actor: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Actor network + lr: 0.0003 + # Configuration of Critic network + critic: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Critic network + lr: 0.0003 ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: # Reward discounted factor @@ -163,6 +163,6 @@ defaults: # Whether to use standardized obs normalized_obs: True # The maximum length of record queue - max_len: 100 + max_ep_len: 1000 # The number of threads used to sample data num_threads: 20 diff --git a/omnisafe/configs/on-policy/PPOSaute.yaml b/omnisafe/configs/on-policy/PPOSaute.yaml index 73d1d4dec..ebfb57d4c 100644 --- a/omnisafe/configs/on-policy/PPOSaute.yaml +++ b/omnisafe/configs/on-policy/PPOSaute.yaml @@ -59,6 +59,14 @@ defaults: batch_size: 10000 # The clip range for PPO loss clip: 0.2 + # The number of parallel environments + num_envs: 32 + # Whether to use standardized reward + reward_normalize: False + # Whether to use standardized cost + cost_normalize: True + # Whether to use standardized obs + obs_normalize: True # ---------------------------------------Optional Configuration-------------------------------- # ## -----------------------------------Configuration For Cost Critic--------------------------- ## @@ -70,6 +78,8 @@ defaults: linear_lr_decay: True # Whether to use exploration noise anneal exploration_noise_anneal: False + # std + std: [0.5, 0.1] # The coefficient of reward penalty penalty_param: 0.0 # Whether to use KL early stopping @@ -84,38 +94,28 @@ defaults: critic_norm_coeff: 0.001 ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". weight_initialization_mode: "kaiming_uniform" # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # Output activation function - output_activation: identity - # Whether to scale action. - scale_action: False - # Whether to clip action. 
- clip_action: False - # Whther to learn the standard deviation of Gaussian noise - std_learning: True - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh + actor_type: gaussian_learning + # Whether to use linear decay of learning rate + linear_lr_decay: True + # Configuration of Actor network + actor: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Actor network + lr: 0.0003 + # Configuration of Critic network + critic: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Critic network + lr: 0.0003 ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: # Reward discounted factor @@ -151,6 +151,6 @@ defaults: # Whether to use standardized obs normalized_obs: True # The maximum length of record queue - max_len: 100 + max_ep_len: 1000 # The number of threads used to sample data num_threads: 20 diff --git a/omnisafe/configs/on-policy/PolicyGradient.yaml b/omnisafe/configs/on-policy/PolicyGradient.yaml index 8c1c9e6ab..b7b83b878 100644 --- a/omnisafe/configs/on-policy/PolicyGradient.yaml +++ b/omnisafe/configs/on-policy/PolicyGradient.yaml @@ -46,10 +46,6 @@ defaults: max_ep_len: 1000 # The size of mini batch num_mini_batches: 64 - # The learning rate of Actor network - actor_lr: 0.0003 - # The learning rate of Critic network - critic_lr: 0.0003 # The Address for saving training process data data_dir: "./runs" ## ---------------------------Basic configurations for derived class PPO---------------------- ## @@ -57,6 +53,14 @@ defaults: target_kl: 0.02 # The size of batch for policy update batch_size: 10000 + # The number of parallel environments + num_envs: 32 + # Whether to use standardized reward + reward_normalize: True + # Whether to use standardized cost + cost_normalize: True + # Whether to use standardized obs + obs_normalize: True # ---------------------------------------Optional Configuration-------------------------------- # ## -----------------------------------Configuration For Cost Critic--------------------------- ## @@ -64,10 +68,10 @@ defaults: use_cost: False # Cost discounted factor cost_gamma: 1.0 - # Whether to use linear decay of learning rate - linear_lr_decay: True # Whether to use exploration noise anneal exploration_noise_anneal: False + # std + std: [0.5, 0.1] # The coefficient of reward penalty penalty_param: 0.0 # Whether to use KL early stopping @@ -82,38 +86,28 @@ defaults: critic_norm_coeff: 0.001 ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
weight_initialization_mode: "kaiming_uniform" # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # Output activation function - output_activation: identity - # Whether to scale action. - scale_action: False - # Whether to clip action. - clip_action: False - # Whther to learn the standard deviation of Gaussian noise - std_learning: True - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh + actor_type: gaussian_learning + # Whether to use linear decay of learning rate + linear_lr_decay: True + # Configuration of Actor network + actor: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Actor network + lr: 0.0003 + # Configuration of Critic network + critic: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Critic network + lr: 0.0003 ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: # Reward discounted factor diff --git a/omnisafe/configs/on-policy/RCPO.yaml b/omnisafe/configs/on-policy/RCPO.yaml index 7b331f4ff..7fb671f6a 100644 --- a/omnisafe/configs/on-policy/RCPO.yaml +++ b/omnisafe/configs/on-policy/RCPO.yaml @@ -63,6 +63,14 @@ defaults: cg_iters: 10 # Subsampled observation fvp_obs: None + # The number of parallel environments + num_envs: 32 + # Whether to use standardized reward + reward_normalize: True + # Whether to use standardized cost + cost_normalize: True + # Whether to use standardized obs + obs_normalize: True # ---------------------------------------Optional Configuration-------------------------------- # ## -----------------------------------Configuration For Cost Critic--------------------------- ## @@ -74,6 +82,8 @@ defaults: linear_lr_decay: False # Whether to use exploration noise anneal exploration_noise_anneal: False + # std + std: [0.5, 0.1] # The coefficient of reward penalty penalty_param: 0.0 # Whether to use KL early stopping @@ -90,38 +100,28 @@ defaults: critic_norm_coeff: 0.001 ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". weight_initialization_mode: "kaiming_uniform" # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # Output activation function - output_activation: identity - # Whether to scale action. - scale_action: False - # Whether to clip action. 
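The rewritten `model_cfgs` block above nests the per-network settings, so `hidden_sizes`, `activation` and the learning rate now live under `actor:` and `critic:` instead of the old `ac_kwargs.pi`/`ac_kwargs.val` layout, and `actor_lr`/`critic_lr` disappear from the training section. A minimal sketch of how such a nested block could be consumed; the `build_mlp` helper and the observation/action sizes are placeholders for illustration, not OmniSafe's actual actor/critic builder:

# Illustrative only: turning the nested model_cfgs block above into networks
# and per-network optimizers. build_mlp and the 60/12 sizes are placeholders.
import torch
import torch.nn as nn

model_cfgs = {  # mirrors the YAML block above
    'actor_type': 'gaussian_learning',
    'linear_lr_decay': True,
    'actor': {'hidden_sizes': [64, 64], 'activation': 'tanh', 'lr': 0.0003},
    'critic': {'hidden_sizes': [64, 64], 'activation': 'tanh', 'lr': 0.0003},
}

ACTIVATIONS = {'tanh': nn.Tanh, 'relu': nn.ReLU}

def build_mlp(in_dim, out_dim, hidden_sizes, activation):
    """Stack Linear + activation layers following hidden_sizes."""
    layers, last = [], in_dim
    for size in hidden_sizes:
        layers += [nn.Linear(last, size), ACTIVATIONS[activation]()]
        last = size
    return nn.Sequential(*layers, nn.Linear(last, out_dim))

obs_dim, act_dim = 60, 12  # placeholder dimensions
actor = build_mlp(obs_dim, act_dim, model_cfgs['actor']['hidden_sizes'], model_cfgs['actor']['activation'])
critic = build_mlp(obs_dim, 1, model_cfgs['critic']['hidden_sizes'], model_cfgs['critic']['activation'])
actor_optimizer = torch.optim.Adam(actor.parameters(), lr=model_cfgs['actor']['lr'])
critic_optimizer = torch.optim.Adam(critic.parameters(), lr=model_cfgs['critic']['lr'])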
- clip_action: False - # Whther to learn the standard deviation of Gaussian noise - std_learning: True - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh + actor_type: gaussian_learning + # Whether to use linear decay of learning rate + linear_lr_decay: True + # Configuration of Actor network + actor: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Actor network + lr: 0.0003 + # Configuration of Critic network + critic: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Critic network + lr: 0.0003 ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: # Reward discounted factor diff --git a/omnisafe/configs/on-policy/TRPO.yaml b/omnisafe/configs/on-policy/TRPO.yaml index 71dd6a162..1f359b864 100644 --- a/omnisafe/configs/on-policy/TRPO.yaml +++ b/omnisafe/configs/on-policy/TRPO.yaml @@ -63,6 +63,14 @@ defaults: cg_iters: 10 # Subsampled observation fvp_obs: None + # The number of parallel environments + num_envs: 32 + # Whether to use standardized reward + reward_normalize: True + # Whether to use standardized cost + cost_normalize: True + # Whether to use standardized obs + obs_normalize: True # ---------------------------------------Optional Configuration-------------------------------- # ## -----------------------------------Configuration For Cost Critic--------------------------- ## @@ -74,6 +82,8 @@ defaults: linear_lr_decay: False # Whether to use exploration noise anneal exploration_noise_anneal: False + # std + std: [0.5, 0.1] # The coefficient of reward penalty penalty_param: 0.0 # Whether to use KL early stopping @@ -90,38 +100,28 @@ defaults: critic_norm_coeff: 0.001 ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". weight_initialization_mode: "kaiming_uniform" # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # Output activation function - output_activation: identity - # Whether to scale action. - scale_action: False - # Whether to clip action. 
- clip_action: False - # Whther to learn the standard deviation of Gaussian noise - std_learning: True - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh + actor_type: gaussian_learning + # Whether to use linear decay of learning rate + linear_lr_decay: True + # Configuration of Actor network + actor: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Actor network + lr: 0.0003 + # Configuration of Critic network + critic: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Critic network + lr: 0.0003 ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: # Reward discounted factor diff --git a/omnisafe/configs/on-policy/TRPOLag.yaml b/omnisafe/configs/on-policy/TRPOLag.yaml index 7b331f4ff..7fb671f6a 100644 --- a/omnisafe/configs/on-policy/TRPOLag.yaml +++ b/omnisafe/configs/on-policy/TRPOLag.yaml @@ -63,6 +63,14 @@ defaults: cg_iters: 10 # Subsampled observation fvp_obs: None + # The number of parallel environments + num_envs: 32 + # Whether to use standardized reward + reward_normalize: True + # Whether to use standardized cost + cost_normalize: True + # Whether to use standardized obs + obs_normalize: True # ---------------------------------------Optional Configuration-------------------------------- # ## -----------------------------------Configuration For Cost Critic--------------------------- ## @@ -74,6 +82,8 @@ defaults: linear_lr_decay: False # Whether to use exploration noise anneal exploration_noise_anneal: False + # std + std: [0.5, 0.1] # The coefficient of reward penalty penalty_param: 0.0 # Whether to use KL early stopping @@ -90,38 +100,28 @@ defaults: critic_norm_coeff: 0.001 ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". weight_initialization_mode: "kaiming_uniform" # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # Output activation function - output_activation: identity - # Whether to scale action. - scale_action: False - # Whether to clip action. 
- clip_action: False - # Whther to learn the standard deviation of Gaussian noise - std_learning: True - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh + actor_type: gaussian_learning + # Whether to use linear decay of learning rate + linear_lr_decay: True + # Configuration of Actor network + actor: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Actor network + lr: 0.0003 + # Configuration of Critic network + critic: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Critic network + lr: 0.0003 ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: # Reward discounted factor diff --git a/omnisafe/configs/on-policy/TRPOPid.yaml b/omnisafe/configs/on-policy/TRPOPid.yaml index 98778f0f7..ee26aa807 100644 --- a/omnisafe/configs/on-policy/TRPOPid.yaml +++ b/omnisafe/configs/on-policy/TRPOPid.yaml @@ -63,6 +63,14 @@ defaults: cg_iters: 10 # Subsampled observation fvp_obs: None + # The number of parallel environments + num_envs: 32 + # Whether to use standardized reward + reward_normalize: True + # Whether to use standardized cost + cost_normalize: True + # Whether to use standardized obs + obs_normalize: True # ---------------------------------------Optional Configuration-------------------------------- # ## -----------------------------------Configuration For Cost Critic--------------------------- ## @@ -74,6 +82,8 @@ defaults: linear_lr_decay: False # Whether to use exploration noise anneal exploration_noise_anneal: False + # std + std: [0.5, 0.1] # The coefficient of reward penalty penalty_param: 0.0 # Whether to use KL early stopping @@ -90,38 +100,28 @@ defaults: critic_norm_coeff: 0.001 ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". weight_initialization_mode: "kaiming_uniform" # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # Output activation function - output_activation: identity - # Whether to scale action. - scale_action: False - # Whether to clip action. 
- clip_action: False - # Whther to learn the standard deviation of Gaussian noise - std_learning: True - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh + actor_type: gaussian_learning + # Whether to use linear decay of learning rate + linear_lr_decay: True + # Configuration of Actor network + actor: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Actor network + lr: 0.0003 + # Configuration of Critic network + critic: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Critic network + lr: 0.0003 ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: # Reward discounted factor @@ -152,7 +152,7 @@ defaults: max_len: 100 # The number of threads used to sample data num_threads: 20 - ## --------------------------------------Configuration For PID--------------------------------- ## +## --------------------------------------Configuration For PID--------------------------------- ## PID_cfgs: # KP for PID pid_kp: 0.01 diff --git a/omnisafe/algorithms/model_based/models/__init__.py b/omnisafe/envs/__init__.py similarity index 77% rename from omnisafe/algorithms/model_based/models/__init__.py rename to omnisafe/envs/__init__.py index 13e8f052f..eb2348aee 100644 --- a/omnisafe/algorithms/model_based/models/__init__.py +++ b/omnisafe/envs/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""The model-based dynamics model.""" +"""Environment api for omnisafe.""" -from omnisafe.algorithms.model_based.models.dynamic_model import EnsembleDynamicsModel -from omnisafe.algorithms.model_based.models.virtual_env import VirtualEnv +from omnisafe.envs.core import CMDP, env_register, make, support_envs +from omnisafe.envs.safety_gymnasium_env import SafetyGymnasiumEnv diff --git a/omnisafe/envs/core.py b/omnisafe/envs/core.py new file mode 100644 index 000000000..cd92b3205 --- /dev/null +++ b/omnisafe/envs/core.py @@ -0,0 +1,336 @@ +# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""The core module of the environment.""" + + +import inspect +from abc import ABC, abstractmethod +from typing import Any, Dict, List, Optional, Tuple, Type + +import torch + +from omnisafe.typing import OmnisafeSpace + + +class CMDP(ABC): + """The core class of the environment. + + The CMDP class is the core class of the environment. It defines the basic + interface of the environment. 
The environment should inherit from this class + and implement the abstract methods. + + Attributes: + _support_envs (List[str]): the supported environments. + _action_space (OmnisafeSpace): the action space of the environment. + _observation_space (OmnisafeSpace): the observation space of the environment. + _num_envs (int): the parallel environments, for env that not support parallel, num_envs should be 1 + _time_limit (Optional[int]): the time limit of the environment, if None, the environment is infinite. + """ + + _support_envs: List[str] + _action_space: OmnisafeSpace + _observation_space: OmnisafeSpace + + _num_envs: int + _time_limit: Optional[int] = None + need_time_limit_wrapper: bool + need_auto_reset_wrapper: bool + + @classmethod + def support_envs(cls) -> List[str]: + """The supported environments. + + Returns: + List[str]: the supported environments. + """ + return cls._support_envs + + @abstractmethod + def __init__(self, env_id: str, **kwargs) -> None: + """Initialize the environment. + + Args: + env_id (str): the environment id. + """ + assert ( + env_id in self.support_envs() + ), f'env_id {env_id} is not supported by {self.__class__.__name__}' + + @property + def action_space(self) -> OmnisafeSpace: + """The action space of the environment. + + Returns: + OmnisafeSpace: the action space. + """ + return self._action_space + + @property + def observation_space(self) -> OmnisafeSpace: + """The observation space of the environment. + + Returns: + OmnisafeSpace: the observation space. + """ + return self._observation_space + + @property + def num_envs(self) -> int: + """The parallel environments. + + Returns: + int: the parallel environments. + """ + return self._num_envs + + @property + def time_limit(self) -> Optional[int]: + """The time limit of the environment. + + Returns: + Optional[int]: the time limit of the environment. + """ + return self._time_limit + + @abstractmethod + def step( + self, action: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Dict]: + """Run one timestep of the environment's dynamics using the agent actions. + + Args: + action (torch.Tensor): action. + + Returns: + observation (torch.Tensor): agent's observation of the current environment. + reward (torch.Tensor): amount of reward returned after previous action. + cost (torch.Tensor): amount of cost returned after previous action. + terminated (torch.Tensor): whether the episode has ended, in which case further step() + calls will return undefined results. + truncated (torch.Tensor): whether the episode has been truncated due to a time limit. + info (Dict): contains auxiliary diagnostic information (helpful for debugging, and sometimes learning). + """ + + @abstractmethod + def reset(self, seed: Optional[int] = None) -> Tuple[torch.Tensor, Dict]: + """Resets the environment and returns an initial observation. + + Args: + seed (Optional[int]): seed for the environment. + + Returns: + observation (torch.Tensor): the initial observation of the space. + info (Dict): contains auxiliary diagnostic information (helpful for debugging, and sometimes learning). + """ + + @abstractmethod + def single_reset(self, idx: int, seed: Optional[int] = None) -> Tuple[torch.Tensor, Dict]: + """For parallel env, reset one of the env and returns an initial observation, + if env not support parallel, should be same as reset. + + Args: + seed (Optional[int]): seed for the environment. + + Returns: + observation (torch.Tensor): the initial observation of the space. 
+ info (Dict): contains auxiliary diagnostic information (helpful for debugging, and sometimes learning). + """ + + @abstractmethod + def set_seed(self, seed: int) -> None: + """Sets the seed for this env's random number generator(s). + + Args: + seed (int): the seed to use. + """ + + @abstractmethod + def sample_action(self) -> torch.Tensor: + """Sample an action from the action space. + + Returns: + torch.Tensor: the sampled action. + """ + + @abstractmethod + def render(self) -> Any: + """Compute the render frames as specified by :attr:`render_mode` during the initialization of the environment. + + Returns: + Any: the render frames, we recommend to use `np.ndarray` which could construct video by moviepy. + """ + + @abstractmethod + def close(self) -> None: + """Close the environment.""" + + +class Wrapper(CMDP): + """The wrapper class of the environment. + + The Wrapper class is the wrapper class of the environment. It defines the basic + interface of the environment wrapper. The environment wrapper should inherit + from this class and implement the abstract methods. + + Attributes: + _env (CMDP): the environment. + + """ + + def __init__(self, env: CMDP) -> None: + """Initialize the wrapper. + + Args: + env (CMDP): the environment. + """ + self._env = env + + def __getattr__(self, name: str) -> Any: + """Get the attribute of the environment. + + Args: + name (str): the attribute name. + + Returns: + Any: the attribute. + """ + if name.startswith('_'): + raise AttributeError(f'attempted to get missing private attribute {name}') + return getattr(self._env, name) + + def step( + self, action: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Dict]: + return self._env.step(action) + + def reset(self, seed: Optional[int] = None) -> Tuple[torch.Tensor, Dict]: + return self._env.reset(seed) + + def single_reset(self, idx: int, seed: Optional[int] = None) -> Tuple[torch.Tensor, Dict]: + return self._env.single_reset(idx, seed) + + def set_seed(self, seed: int) -> None: + self._env.set_seed(seed) + + def sample_action(self) -> torch.Tensor: + return self._env.sample_action() + + def render(self) -> Any: + return self._env.render() + + def close(self) -> None: + self._env.close() + + +class EnvRegister: + """The environment register. + + The EnvRegister is used to register the environment class. It provides the + method to get the environment class by the environment id. + + """ + + def __init__(self) -> None: + self._class: Dict[str, Type[CMDP]] = {} + self._support_envs: Dict[str, List[str]] = {} + + def _register(self, env_class: Type[CMDP]) -> None: + """Register the environment class. + + Args: + env_class (Type[CMDP]): the environment class. + """ + if not inspect.isclass(env_class): + raise TypeError(f'{env_class} must be a class') + class_name = env_class.__name__ + if not issubclass(env_class, CMDP): + raise TypeError(f'{class_name} must be subclass of CMDP') + if class_name in self._class: + raise ValueError(f'{class_name} has been registered') + env_ids = env_class.support_envs() + self._class[class_name] = env_class + self._support_envs[class_name] = env_ids + + def register(self, env_class: Type[CMDP]) -> Type[CMDP]: + """Register the environment class. + + Args: + env_class (Type[CMDP]): the environment class. + + Returns: + Type[CMDP]: the environment class. + """ + self._register(env_class) + return env_class + + def get_class(self, env_id: str, class_name: Optional[str]) -> Type[CMDP]: + """Get the environment class. 
+ + Args: + env_id (str): the environment id. + class_name (Optional[str]): the environment class name. + + Returns: + Type[CMDP]: the environment class. + """ + if class_name is not None: + assert class_name in self._class, f'{class_name} is not registered' + assert ( + env_id in self._support_envs[class_name] + ), f'{env_id} is not supported by {class_name}' + return self._class[class_name] + + for cls_name, env_ids in self._support_envs.items(): + if env_id in env_ids: + return self._class[cls_name] + raise ValueError(f'{env_id} is not supported by any environment class') + + def support_envs(self) -> List[str]: + """The supported environments. + + Returns: + List[str]: the supported environments. + """ + return list({env_id for env_ids in self._support_envs.values() for env_id in env_ids}) + + +ENV_REGISTRY = EnvRegister() + +env_register = ENV_REGISTRY.register +support_envs = ENV_REGISTRY.support_envs + + +def make(env_id: str, class_name: Optional[str] = None, **kwargs) -> CMDP: + """Create an environment. + + Args: + env_id (str): the environment id. + class_name (Optional[str]): the environment class name. + **kwargs: the keyword arguments for the environment initialization. + + Returns: + CMDP: the environment. + """ + env_class = ENV_REGISTRY.get_class(env_id, class_name) + return env_class(env_id, **kwargs) + + +__all__ = [ + 'CMDP', + 'Wrapper', + 'env_register', + 'support_envs', + 'make', +] diff --git a/omnisafe/envs/safety_gymnasium_env.py b/omnisafe/envs/safety_gymnasium_env.py new file mode 100644 index 000000000..f35e13afd --- /dev/null +++ b/omnisafe/envs/safety_gymnasium_env.py @@ -0,0 +1,117 @@ +# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
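Any environment backend is exposed to the rest of OmniSafe by subclassing `CMDP` and registering it: `env_register` records the class together with its `support_envs()`, and `make` then resolves an environment id to the registered class. A minimal sketch of that flow with a toy environment; `MyGridEnv`, its id and its spaces are invented here purely for illustration:

# Illustrative only: a toy CMDP subclass going through env_register and make.
# 'MyGrid-v0', MyGridEnv and its spaces are invented for this sketch.
from typing import Any, Dict, Optional, Tuple

import torch
from gymnasium import spaces

from omnisafe.envs.core import CMDP, env_register, make


@env_register
class MyGridEnv(CMDP):
    _support_envs = ['MyGrid-v0']
    need_auto_reset_wrapper = True
    need_time_limit_wrapper = True

    def __init__(self, env_id: str, **kwargs) -> None:
        super().__init__(env_id)
        self._observation_space = spaces.Box(low=-1.0, high=1.0, shape=(4,))
        self._action_space = spaces.Box(low=-1.0, high=1.0, shape=(2,))
        self._num_envs = 1

    def step(self, action: torch.Tensor):
        # A trivial transition: zero observation, unit reward, zero cost.
        obs = torch.zeros(4)
        reward, cost = torch.tensor(1.0), torch.tensor(0.0)
        terminated, truncated = torch.tensor(False), torch.tensor(False)
        return obs, reward, cost, terminated, truncated, {}

    def reset(self, seed: Optional[int] = None) -> Tuple[torch.Tensor, Dict]:
        return torch.zeros(4), {}

    def single_reset(self, idx: int, seed: Optional[int] = None) -> Tuple[torch.Tensor, Dict]:
        # Single (non-vectorized) env, so this is the same as reset.
        return self.reset(seed)

    def set_seed(self, seed: int) -> None:
        torch.manual_seed(seed)

    def sample_action(self) -> torch.Tensor:
        return torch.as_tensor(self._action_space.sample(), dtype=torch.float32)

    def render(self) -> Any:
        return None

    def close(self) -> None:
        pass


env = make('MyGrid-v0')  # resolved through ENV_REGISTRY, no class_name needed
obs, info = env.reset(seed=0)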
+# ============================================================================== +"""Environments in the Safety Gymnasium.""" + + +from typing import Any, Dict, Optional, Tuple + +import safety_gymnasium +import torch + +from omnisafe.envs.core import CMDP, env_register + + +@env_register +class SafetyGymnasiumEnv(CMDP): + """Safety Gymnasium environment.""" + + _support_envs = [ + 'SafetyPointGoal0-v0', + 'SafetyPointGoal1-v0', + 'SafetyPointGoal2-v0', + 'SafetyPointButton0-v0', + 'SafetyPointButton1-v0', + 'SafetyPointButton2-v0', + 'SafetyPointPush0-v0', + 'SafetyPointPush1-v0', + 'SafetyPointPush2-v0', + 'SafetyPointCircle0-v0', + 'SafetyPointCircle1-v0', + 'SafetyPointCircle2-v0', + 'SafetyCarGoal0-v0', + 'SafetyCarGoal1-v0', + 'SafetyCarGoal2-v0', + 'SafetyCarButton0-v0', + 'SafetyCarButton1-v0', + 'SafetyCarButton2-v0', + 'SafetyCarPush0-v0', + 'SafetyCarPush1-v0', + 'SafetyCarPush2-v0', + 'SafetyCarCircle0-v0', + 'SafetyCarCircle1-v0', + 'SafetyCarCircle2-v0', + 'SafetyAntGoal0-v0', + 'SafetyAntGoal1-v0', + 'SafetyAntGoal2-v0', + 'SafetyAntButton0-v0', + 'SafetyAntButton1-v0', + 'SafetyAntButton2-v0', + 'SafetyAntPush0-v0', + 'SafetyAntPush1-v0', + 'SafetyAntPush2-v0', + 'SafetyAntCircle0-v0', + 'SafetyAntCircle1-v0', + 'SafetyAntCircle2-v0', + 'SafetyHalfCheetahVelocity-v4', + 'SafetyHopperVelocity-v4', + 'SafetySwimmerVelocity-v4', + 'SafetyWalker2dVelocity-v4', + 'SafetyAntVelocity-v4', + 'SafetyHumanoidVelocity-v4', + ] + need_auto_reset_wrapper = False + need_time_limit_wrapper = False + + def __init__(self, env_id: str, num_envs: int = 1, **kwargs) -> None: + if num_envs > 1: + self._env = safety_gymnasium.vector.make(env_id=env_id, num_envs=num_envs, **kwargs) + self._action_space = self._env.single_action_space + self._observation_space = self._env.single_observation_space + else: + self._env = safety_gymnasium.make(id=env_id, autoreset=True, **kwargs) + self._action_space = self._env.action_space + self._observation_space = self._env.observation_space + + self._num_envs = num_envs + + def step( + self, action: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Dict]: + obs, reward, cost, terminated, truncated, info = self._env.step(action) + obs, reward, cost, terminated, truncated = map( + lambda x: torch.as_tensor(x, dtype=torch.float32), + (obs, reward, cost, terminated, truncated), + ) + return obs, reward, cost, terminated, truncated, info + + def reset(self, seed: Optional[int] = None) -> Tuple[torch.Tensor, Dict]: + obs, info = self._env.reset(seed=seed) + return torch.as_tensor(obs, dtype=torch.float32), info + + def single_reset(self, idx: int, seed: Optional[int] = None) -> Tuple[torch.Tensor, Dict]: + obs, info = self.reset(seed=seed) + return obs[idx], info + + def set_seed(self, seed: int) -> None: + self.reset(seed=seed) + + def sample_action(self) -> torch.Tensor: + return torch.as_tensor(self._env.action_space.sample(), torch.float32) + + def render(self) -> Any: + return self._env.render() + + def close(self) -> None: + self._env.close() diff --git a/omnisafe/envs/wrapper.py b/omnisafe/envs/wrapper.py new file mode 100644 index 000000000..632819020 --- /dev/null +++ b/omnisafe/envs/wrapper.py @@ -0,0 +1,288 @@ +# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
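A registered environment such as the Safety Gymnasium tasks above is then obtained through `make` and driven entirely with tensors; `step` returns the `(obs, reward, cost, terminated, truncated, info)` tuple required by `CMDP`, and the wrappers defined below (`ObsNormalize`, `RewardNormalize`, `CostNormalize`, ...) compose around the same object, mirroring the new `obs_normalize`/`reward_normalize`/`cost_normalize` config flags. A short usage sketch; the task id and step count are arbitrary choices and `safety_gymnasium` must be installed:

# Usage sketch: drive a registered Safety-Gymnasium task through the CMDP API.
from omnisafe.envs.core import make

env = make('SafetyPointGoal1-v0', num_envs=1)
obs, info = env.reset(seed=0)
ep_ret, ep_cost = 0.0, 0.0
for _ in range(8):  # a handful of steps, just to show the interface
    action = env.sample_action()
    obs, reward, cost, terminated, truncated, info = env.step(action)
    ep_ret += float(reward)
    ep_cost += float(cost)
env.close()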
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Wrapper for the environment.""" + + +from typing import Dict, Optional, Tuple, Union + +import numpy as np +import torch +from gymnasium import spaces + +from omnisafe.common import Normalizer +from omnisafe.envs.core import CMDP, Wrapper + + +class TimeLimit(Wrapper): + """Time limit wrapper for the environment. + + Example: + >>> env = TimeLimit(env, time_limit=100) + """ + + def __init__(self, env: CMDP, time_limit: int) -> None: + """Initialize the time limit wrapper. + + Args: + env (CMDP): The environment to wrap. + time_limit (int): The time limit for each episode. + """ + super().__init__(env) + self._time_limit: int = time_limit + self._time: Union[int, np.ndarray] = ( + 0 if self.num_envs == 1 else np.array([0] * self.num_envs) + ) + + def reset(self, seed: Optional[int] = None) -> Tuple[torch.Tensor, Dict]: + self._time = 0 if self.num_envs == 1 else np.array([0] * self.num_envs) + return super().reset(seed) + + def single_reset(self, idx: int, seed: Optional[int] = None) -> Tuple[torch.Tensor, Dict]: + if isinstance(self._time, np.ndarray): + self._time[idx] = 0 + else: + self._time = 0 + return super().single_reset(idx, seed) + + def step( + self, action: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Dict]: + obs, reward, cost, terminated, truncated, info = super().step(action) + + self._time += 1 + truncated = torch.tensor(self._time >= self._time_limit, dtype=torch.bool) + + return obs, reward, cost, terminated, truncated, info + + +class AutoReset(Wrapper): + """Auto reset the environment when the episode is terminated. + + Example: + >>> env = AutoReset(env) + + """ + + def step( + self, action: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Dict]: + obs, reward, cost, terminated, truncated, info = super().step(action) + + if self.num_envs == 1: + if terminated or truncated: + obs, _ = self.reset() + else: + dones = terminated | truncated + for idx, done in enumerate(dones): + if done: + obs[idx], _ = self.single_reset(idx) + + return obs, reward, cost, terminated, truncated, info + + +class ObsNormalize(Wrapper): + """Normalize the observation. 
+
+    Example:
+        >>> env = ObsNormalize(env)
+
+        >>> norm = Normalizer(env.observation_space.shape)  # load saved normalizer
+        >>> env = ObsNormalize(env, norm)
+
+    """
+
+    def __init__(self, env: CMDP, norm: Optional[Normalizer] = None) -> None:
+        super().__init__(env)
+        assert isinstance(self.observation_space, spaces.Box), 'Observation space must be Box'
+
+        if norm is not None:
+            self._obs_normalizer = norm
+        else:
+            self._obs_normalizer = Normalizer(self.observation_space.shape, clip=5)
+
+    def step(
+        self, action: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Dict]:
+        obs, reward, cost, terminated, truncated, info = super().step(action)
+        info['original_obs'] = obs
+        obs = self._obs_normalizer.normalize(obs)
+        return obs, reward, cost, terminated, truncated, info
+
+    def reset(self, seed: Optional[int] = None) -> Tuple[torch.Tensor, Dict]:
+        obs, info = super().reset(seed)
+        info['original_obs'] = obs
+        obs = self._obs_normalizer.normalize(obs)
+        return obs, info
+
+    def single_reset(self, idx: int, seed: Optional[int] = None) -> Tuple[torch.Tensor, Dict]:
+        obs, info = super().single_reset(idx, seed)
+        info['original_obs'] = obs
+        obs = self._obs_normalizer.normalize(obs.unsqueeze(0)).squeeze(0)
+        return obs, info
+
+
+class RewardNormalize(Wrapper):
+    """Normalize the reward.
+
+    Example:
+        >>> env = RewardNormalize(env)
+
+        >>> norm = Normalizer(())  # load saved normalizer
+        >>> env = RewardNormalize(env, norm)
+
+    """
+
+    def __init__(self, env: CMDP, norm: Optional[Normalizer] = None) -> None:
+        """Initialize the reward normalizer.
+
+        Args:
+            env (CMDP): The environment to wrap.
+            norm (Optional[Normalizer], optional): The normalizer to use. Defaults to None.
+
+        """
+        super().__init__(env)
+        if norm is not None:
+            self._reward_normalizer = norm
+        else:
+            self._reward_normalizer = Normalizer((), clip=5)
+
+    def step(
+        self, action: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Dict]:
+        obs, reward, cost, terminated, truncated, info = super().step(action)
+        info['original_reward'] = reward
+        reward = self._reward_normalizer.normalize(reward)
+        return obs, reward, cost, terminated, truncated, info
+
+
+class CostNormalize(Wrapper):
+    """Normalize the cost.
+
+    Example:
+        >>> env = CostNormalize(env)
+
+        >>> norm = Normalizer(())  # load saved normalizer
+        >>> env = CostNormalize(env, norm)
+    """
+
+    def __init__(self, env: CMDP, norm: Optional[Normalizer] = None) -> None:
+        """Initialize the cost normalizer.
+
+        Args:
+            env (CMDP): The environment to wrap.
+            norm (Normalizer, optional): The normalizer to use. Defaults to None.
+        """
+        super().__init__(env)
+        if norm is not None:
+            self._cost_normalizer = norm
+        else:
+            self._cost_normalizer = Normalizer((), clip=5)
+
+    def step(
+        self, action: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Dict]:
+        obs, reward, cost, terminated, truncated, info = super().step(action)
+        info['original_cost'] = cost
+        cost = self._cost_normalizer.normalize(cost)
+        return obs, reward, cost, terminated, truncated, info
+
+
+class ActionScale(Wrapper):
+    """Scale the action space to a given range.
+
+    Example:
+        >>> env = ActionScale(env, low=-1, high=1)
+        >>> env.action_space
+        Box(-1.0, 1.0, (1,), float32)
+    """
+
+    def __init__(
+        self,
+        env: CMDP,
+        low: Union[int, float],
+        high: Union[int, float],
+    ) -> None:
+        """Initialize the wrapper.
+
+        Args:
+            env: The environment to wrap.
+ low: The lower bound of the action space. + high: The upper bound of the action space. + """ + super().__init__(env) + assert isinstance(self.action_space, spaces.Box), 'Action space must be Box' + + self._old_min_action = torch.tensor(self.action_space.low, dtype=torch.float32) + self._old_max_action = torch.tensor(self.action_space.high, dtype=torch.float32) + + min_action = np.zeros(self.action_space.shape, dtype=self.action_space.dtype) + low + max_action = np.zeros(self.action_space.shape, dtype=self.action_space.dtype) + high + self._action_space = spaces.Box( + low=min_action, + high=max_action, + shape=self.action_space.shape, + dtype=self.action_space.dtype, # type: ignore + ) + + self._min_action = torch.tensor(min_action, dtype=torch.float32) + self._max_action = torch.tensor(max_action, dtype=torch.float32) + + def step( + self, action: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Dict]: + action = self._old_min_action + (self._old_max_action - self._old_min_action) * ( + action - self._min_action + ) / (self._max_action - self._min_action) + return super().step(action) + + +class Unsqueeze(Wrapper): + """Unsqueeze the observation, reward, cost, terminated, truncated and info. + + Example: + >>> env = Unsqueeze(env) + """ + + def __init__(self, env: CMDP) -> None: + """Initialize the wrapper. + + Args: + env: The environment to wrap. + """ + super().__init__(env) + assert self.num_envs == 1, 'Unsqueeze only works with single environment' + assert isinstance(self.observation_space, spaces.Box), 'Observation space must be Box' + + def step( + self, action: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Dict]: + obs, reward, cost, terminated, truncated, info = super().step(action) + obs, reward, cost, terminated, truncated = map( + lambda x: x.unsqueeze(0), (obs, reward, cost, terminated, truncated) + ) + for k, v in info.items(): + if isinstance(v, torch.Tensor): + info[k] = v.unsqueeze(0) + + return obs, reward, cost, terminated, truncated, info + + def reset(self, seed: Optional[int] = None) -> Tuple[torch.Tensor, Dict]: + obs, info = super().reset(seed) + obs = obs.unsqueeze(0) + for k, v in info.items(): + if isinstance(v, torch.Tensor): + info[k] = v.unsqueeze(0) + + return obs, info diff --git a/omnisafe/evaluator.py b/omnisafe/evaluator.py deleted file mode 100644 index d3f3fc62e..000000000 --- a/omnisafe/evaluator.py +++ /dev/null @@ -1,318 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
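Taken together, the wrappers added in ``omnisafe/envs/wrapper.py`` are meant to be stacked around a ``CMDP``. A minimal sketch of one plausible composition follows; it is illustrative only, since the adapters added elsewhere in this patch do the real wiring, and ``SafetyGymnasiumEnv`` already opts out of ``TimeLimit``/``AutoReset`` via its ``need_*_wrapper`` flags:

    import torch

    from omnisafe.envs.core import make
    from omnisafe.envs.wrapper import ActionScale, ObsNormalize, Unsqueeze

    env = make('SafetyPointGoal1-v0', num_envs=1)
    # TimeLimit(env, time_limit=...) and AutoReset(env) would be applied here for
    # environment classes that set need_time_limit_wrapper / need_auto_reset_wrapper.
    env = ObsNormalize(env)                     # running mean/std on observations
    env = ActionScale(env, low=-1.0, high=1.0)  # expose a [-1, 1] action space
    env = Unsqueeze(env)                        # add a batch dimension when num_envs == 1

    obs, info = env.reset(seed=0)
    action = torch.as_tensor(env.action_space.sample(), dtype=torch.float32)
    obs, reward, cost, terminated, truncated, info = env.step(action)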
-# ============================================================================== -"""Implementation of Evaluator.""" - -import dataclasses -import json -import os - -import numpy as np -import torch -from gymnasium.spaces import Box, Discrete -from gymnasium.utils.save_video import save_video - -from omnisafe.models.actor import ActorBuilder -from omnisafe.utils.config import Config -from omnisafe.wrappers.cmdp_wrapper import CMDPWrapper as EnvWrapper -from omnisafe.wrappers.saute_wrapper import SauteWrapper -from omnisafe.wrappers.simmer_wrapper import SimmerWrapper - - -class Evaluator: # pylint: disable=too-many-instance-attributes - """This class includes common evaluation methods for safe RL algorithms.""" - - # pylint: disable-next=too-many-arguments - def __init__( - self, - env=None, - actor=None, - obs_normalize=None, - play=True, - save_replay=True, - ): - """Initialize the evaluator. - - Args: - env (gymnasium.Env): the environment. if None, the environment will be created from the config. - pi (omnisafe.algos.models.actor.Actor): the policy. if None, the policy will be created from the config. - obs_normalize (omnisafe.algos.models.obs_normalize): the observation Normalize. - """ - # set the attributes - self.env = env - self.actor = actor - self.obs_normalizer = obs_normalize if obs_normalize is not None else lambda x: x - self.env_wrapper_class = type(env) if env is not None else None - - # used when load model from saved file. - self.cfg = None - self.save_dir = None - self.model_name = None - self.algo_name = None - self.model_params = None - - # set the render mode - self.play = play - self.save_replay = save_replay - if play and save_replay: - self.render_mode = 'rgb_array' - elif play and not save_replay: - self.render_mode = 'human' - elif not play and save_replay: - self.render_mode = 'rgb_array_list' - else: - self.render_mode = None - - # pylint: disable-next=too-many-locals - def load_saved_model(self, save_dir: str, model_name: str): - """Load a saved model. - - Args: - save_dir (str): directory where the model is saved. - model_name (str): name of the model. - """ - # load the config - self.save_dir = save_dir - self.model_name = model_name - cfg_path = os.path.join(save_dir, 'config.json') - try: - with open(cfg_path, encoding='utf-8') as file: - self.cfg = json.load(file) - except FileNotFoundError as error: - raise FileNotFoundError( - 'The config file is not found in the save directory.' 
- ) from error - - # load the saved model - model_path = os.path.join(save_dir, 'torch_save', model_name) - try: - self.model_params = torch.load(model_path) - except FileNotFoundError as error: - raise FileNotFoundError('The model is not found in the save directory.') from error - - self.algo_name = self.cfg['exp_name'].split('/')[1] - - # make the environment - env_id = self.cfg['env_id'] - self.env = self._make_env(env_id, render_mode=self.render_mode) - - # make the actor - observation_space = self.env.observation_space - action_space = self.env.action_space - - act_space_type = 'discrete' if isinstance(action_space, Discrete) else 'continuous' - actor_type = self.cfg['model_cfgs']['actor_type'] - if isinstance(action_space, Box): - act_dim = action_space.shape[0] - elif isinstance(action_space, Discrete): - act_dim = action_space.n - else: - raise ValueError - - obs_dim = observation_space.shape[0] - pi_cfg = self.cfg['model_cfgs']['ac_kwargs']['pi'] - weight_initialization_mode = self.cfg['model_cfgs']['weight_initialization_mode'] - actor_builder = ActorBuilder( - obs_dim=obs_dim, - act_dim=act_dim, - hidden_sizes=pi_cfg['hidden_sizes'], - activation=pi_cfg['activation'], - weight_initialization_mode=weight_initialization_mode, - shared=None, - ) - if act_space_type == 'discrete': - self.actor = actor_builder.build_actor('categorical') - else: - act_max = torch.as_tensor(action_space.high) - act_min = torch.as_tensor(action_space.low) - self.actor = actor_builder.build_actor(actor_type, act_max=act_max, act_min=act_min) - self.actor.load_state_dict(self.model_params['pi']) - - # pylint: disable-next=too-many-locals - def evaluate( - self, - num_episodes: int = 10, - cost_criteria: float = 1.0, - ): - """Evaluate the agent for num_episodes episodes. - - Args: - num_episodes (int): number of episodes to evaluate the agent. - cost_criteria (float): the cost criteria for the evaluation. - - Returns: - episode_rewards (list): list of episode rewards. - episode_costs (list): list of episode costs. - episode_lengths (list): list of episode lengths. - """ - if self.env is None or self.actor is None: - raise ValueError( - 'The environment and the policy must be provided or created before evaluating the agent.' 
- ) - - episode_rewards = [] - episode_costs = [] - episode_lengths = [] - horizon = self.env.rollout_data.max_ep_len - - for _ in range(num_episodes): - obs, _ = self.env.reset() - ep_ret, ep_cost = 0.0, 0.0 - - for step in range(horizon): - with torch.no_grad(): - if self.env.obs_normalizer is not None: - obs = self.env.obs_normalizer.normalize(obs) - _, act = self.actor.predict( - torch.as_tensor(obs, dtype=torch.float32), - deterministic=True, - need_log_prob=False, - ) - [obs, rew, cost], done, truncated, _ = self.env.step(act) - ep_ret += rew - ep_cost += (cost_criteria**step) * cost - - if done or truncated: - episode_rewards.append(ep_ret) - episode_costs.append(ep_cost) - episode_lengths.append(step + 1) - break - - print('Evaluation results:') - print(f'Average episode reward: {np.mean(episode_rewards):.3f}') - print(f'Average episode cost: {np.mean(episode_costs):.3f}') - print(f'Average episode length: {np.mean(episode_lengths):.3f}') - return ( - episode_rewards, - episode_costs, - ) - - def render( # pylint: disable=too-many-locals,too-many-arguments,too-many-branches,too-many-statements - self, - num_episode: int = 0, - play=True, - save_replay_path: str = None, - camera_name: str = None, - camera_id: str = None, - width: int = None, - height: int = None, - ): - """Render the environment for one episode. - - Args: - seed (int): seed for the environment. If None, the environment will be reset with a random seed. - save_replay_path (str): path to save the replay. If None, no replay is saved. - """ - - if save_replay_path is None: - save_replay_path = os.path.join(self.save_dir, 'video', self.model_name.split('.')[0]) - - # remake the environment if the render mode can not support needed play or save_replay - if self.env is None or self.actor is None: - raise ValueError( - 'The environment and the policy must be provided or created before evaluating the agent.' - ) - - width = self.env.width if width is None else width - height = self.env.height if height is None else height - env_kwargs = dataclasses.asdict(self.env.render_data) - if env_kwargs.get('render_mode') is None: - print("Remake the environment with render_mode='rgb_array' to render the environment.") - self.env = self._make_env(**env_kwargs) - self.render_mode = 'rgb_array' - - if env_kwargs.get('render_mode') == 'human' and save_replay_path is not None: - print("Remake the environment with render_mode='rgb_array' to save the replay.") - self.env = self._make_env(**env_kwargs) - self.render_mode = 'rgb_array' - - if env_kwargs.get('render_mode') == 'rgb_array_list' and play: - print("Remake the environment with render_mode='rgb_array' to render the environment.") - self.env = self._make_env(**env_kwargs) - self.render_mode = 'rgb_array' - - if env_kwargs.get('camara_id') != camera_id or env_kwargs.get('camera_name') != camera_name: - print("Remake the environment with render_mode='rgb_array' to change the camera.") - env_kwargs['camera_id'] = camera_id - env_kwargs['camera_name'] = camera_name - self.env = self._make_env(**env_kwargs) - self.render_mode = 'rgb_array' - - if env_kwargs.get('height') != height or env_kwargs.get('width') != width: - print( - "Remake the environment with render_mode='rgb_array' to change the camera width or height." 
- ) - self.env = self._make_env(**env_kwargs) - self.render_mode = 'rgb_array' - - horizon = self.env.rollout_data.max_ep_len - frames = [] - obs, _ = self.env.reset() - self.actor.to(self.env.cfgs.device) - if self.render_mode == 'human': - self.env.render() - elif self.render_mode == 'rgb_array': - frames.append(self.env.render()) - if self.env.obs_normalizer is not None: - self.env.obs_normalizer.load_state_dict(self.model_params['obs_normalizer']) - for episode_idx in range(num_episode): - for _ in range(horizon): - with torch.no_grad(): - if self.env.obs_normalizer is not None: - obs = self.env.obs_normalizer.normalize(obs) - _, act = self.actor.predict(obs, deterministic=True) - [obs, _, _], done, truncated, _ = self.env.step(act.cpu().squeeze()) - if done[0] or truncated[0]: - break - if self.render_mode == 'rgb_array': - frames.append(self.env.render()) - - if self.render_mode == 'rgb_array_list': - frames = self.env.render() - if save_replay_path is not None: - save_video( - frames, - save_replay_path, - fps=self.env.env.metadata['render_fps'], - episode_trigger=lambda x: True, - episode_index=episode_idx, - name_prefix='eval', - ) - self.env.reset() - frames = [] - - def _make_env(self, env_id, **env_kwargs): - """Make wrapped environment.""" - env_cfgs = { - 'num_envs': 1, - 'seed': 0, - 'normalized_obs': False, - 'normalized_rew': False, - 'normalized_cost': False, - 'device': 'cpu', - 'num_threads': 20, - 'max_len': 100, - 'async_env': True, - } - env_cfgs = Config(**env_cfgs) - if self.cfg is not None and 'env_cfgs' in self.cfg: - self.cfg['env_cfgs']['device'] = 'cpu' - self.cfg['env_cfgs']['seed'] = 0 - env_cfgs = Config(**self.cfg['env_cfgs']) - - if self.algo_name in ['PPOSimmerPid', 'PPOSimmerQ', 'PPOLagSimmerQ', 'PPOLagSimmerPid']: - return SimmerWrapper(env_id, env_cfgs, **env_kwargs) - if self.algo_name in ['PPOSaute', 'PPOLagSaute']: - return SauteWrapper(env_id, env_cfgs, **env_kwargs) - return EnvWrapper(env_id, env_cfgs, **env_kwargs) diff --git a/omnisafe/models/__init__.py b/omnisafe/models/__init__.py index 6fec5506f..b1b9a049d 100644 --- a/omnisafe/models/__init__.py +++ b/omnisafe/models/__init__.py @@ -15,15 +15,12 @@ """This module contains the model for all methods.""" from omnisafe.models.actor import ActorBuilder -from omnisafe.models.actor.categorical_actor import CategoricalActor -from omnisafe.models.actor.cholesky_actor import MLPCholeskyActor from omnisafe.models.actor.gaussian_actor import GaussianActor -from omnisafe.models.actor.gaussian_stdnet_actor import GaussianStdNetActor -from omnisafe.models.actor_critic import ActorCritic -from omnisafe.models.actor_q_critic import ActorQCritic +from omnisafe.models.actor.gaussian_learning_actor import GaussianLearningActor +from omnisafe.models.actor.gaussian_sac_actor import GaussianSACActor +from omnisafe.models.actor_critic.actor_critic import ActorCritic +from omnisafe.models.actor_critic.constraint_actor_critic import ConstraintActorCritic from omnisafe.models.base import Actor, Critic -from omnisafe.models.constraint_actor_critic import ConstraintActorCritic -from omnisafe.models.constraint_actor_q_critic import ConstraintActorQCritic from omnisafe.models.critic import CriticBuilder from omnisafe.models.critic.q_critic import QCritic from omnisafe.models.critic.v_critic import VCritic diff --git a/omnisafe/models/actor/__init__.py b/omnisafe/models/actor/__init__.py index dbb8b6301..191befa5a 100644 --- a/omnisafe/models/actor/__init__.py +++ b/omnisafe/models/actor/__init__.py @@ -15,7 +15,6 @@ 
"""The abstract interfaces of Actor networks for the Actor-Critic algorithm.""" from omnisafe.models.actor.actor_builder import ActorBuilder -from omnisafe.models.actor.categorical_actor import CategoricalActor -from omnisafe.models.actor.cholesky_actor import MLPCholeskyActor from omnisafe.models.actor.gaussian_actor import GaussianActor -from omnisafe.models.actor.gaussian_stdnet_actor import GaussianStdNetActor +from omnisafe.models.actor.gaussian_learning_actor import GaussianLearningActor +from omnisafe.models.actor.gaussian_sac_actor import GaussianSACActor diff --git a/omnisafe/models/actor/actor_builder.py b/omnisafe/models/actor/actor_builder.py index 13dff87ec..17ea33efd 100644 --- a/omnisafe/models/actor/actor_builder.py +++ b/omnisafe/models/actor/actor_builder.py @@ -15,135 +15,54 @@ """Implementation of ActorBuilder.""" import difflib -from dataclasses import dataclass -from typing import Optional, Union +from typing import List -import torch.nn as nn - -from omnisafe.models.actor.categorical_actor import CategoricalActor -from omnisafe.models.actor.cholesky_actor import MLPCholeskyActor -from omnisafe.models.actor.gaussian_actor import GaussianActor -from omnisafe.models.actor.gaussian_stdnet_actor import GaussianStdNetActor -from omnisafe.utils.model_utils import Activation, InitFunction - - -@dataclass -class NetworkConfig: - """Class for storing network configurations.""" - - obs_dim: int - act_dim: int - hidden_sizes: list - activation: Activation = 'tanh' - weight_initialization_mode: InitFunction = 'kaiming_uniform' - shared: nn.Module = None - output_activation: Optional[Activation] = None - - -@dataclass -class ActionConfig: - """Class for storing action configurations.""" - - scale_action: bool = False - clip_action: bool = False - std_learning: bool = True - std_init: float = 1.0 +from omnisafe.models.actor.gaussian_learning_actor import GaussianLearningActor +from omnisafe.models.actor.gaussian_sac_actor import GaussianSACActor +from omnisafe.models.base import Actor +from omnisafe.typing import Activation, ActorType, InitFunction, OmnisafeSpace # pylint: disable-next=too-few-public-methods class ActorBuilder: """Class for building actor networks.""" - # pylint: disable-next=too-many-arguments def __init__( self, - obs_dim: int, - act_dim: int, - hidden_sizes: list, - activation: Activation = 'tanh', + obs_space: OmnisafeSpace, + act_space: OmnisafeSpace, + hidden_sizes: List[int], + activation: Activation = 'relu', weight_initialization_mode: InitFunction = 'kaiming_uniform', - shared: nn.Module = None, - scale_action: bool = False, - clip_action: bool = False, - output_activation: Optional[Activation] = 'identity', - std_learning: bool = True, - std_init: float = 1.0, ) -> None: """Initialize ActorBuilder.""" - self.network_config = NetworkConfig( - obs_dim=obs_dim, - act_dim=act_dim, - hidden_sizes=hidden_sizes, - activation=activation, - output_activation=output_activation, - weight_initialization_mode=weight_initialization_mode, - shared=shared, - ) - self.action_config = ActionConfig( - scale_action=scale_action, - clip_action=clip_action, - std_learning=std_learning, - std_init=std_init, - ) + self._obs_space = obs_space + self._act_space = act_space + self._weight_initialization_mode = weight_initialization_mode + self._activation = activation + self._hidden_sizes = hidden_sizes # pylint: disable-next=too-many-return-statements - def build_actor( - self, actor_type: str, **kwargs - ) -> Union[ - CategoricalActor, - GaussianStdNetActor, - MLPCholeskyActor, 
- GaussianActor, - NotImplementedError, - ]: + def build_actor(self, actor_type: ActorType) -> Actor: """Build actor network.""" - if actor_type == 'categorical': - return CategoricalActor( - obs_dim=self.network_config.obs_dim, - act_dim=self.network_config.act_dim, - hidden_sizes=self.network_config.hidden_sizes, - activation=self.network_config.activation, - weight_initialization_mode=self.network_config.weight_initialization_mode, - shared=self.network_config.shared, - **kwargs, - ) - if actor_type == 'gaussian_stdnet': - return GaussianStdNetActor( - obs_dim=self.network_config.obs_dim, - act_dim=self.network_config.act_dim, - hidden_sizes=self.network_config.hidden_sizes, - activation=self.network_config.activation, - weight_initialization_mode=self.network_config.weight_initialization_mode, - shared=self.network_config.shared, - scale_action=self.action_config.scale_action, - **kwargs, - ) - if actor_type == 'cholesky': - return MLPCholeskyActor( - obs_dim=self.network_config.obs_dim, - act_dim=self.network_config.act_dim, - hidden_sizes=self.network_config.hidden_sizes, - activation=self.network_config.activation, - weight_initialization_mode=self.network_config.weight_initialization_mode, - **kwargs, + if actor_type == 'gaussian_learning': + return GaussianLearningActor( + self._obs_space, + self._act_space, + self._hidden_sizes, + activation=self._activation, + weight_initialization_mode=self._weight_initialization_mode, ) - if actor_type == 'gaussian': - return GaussianActor( - obs_dim=self.network_config.obs_dim, - act_dim=self.network_config.act_dim, - hidden_sizes=self.network_config.hidden_sizes, - activation=self.network_config.activation, - weight_initialization_mode=self.network_config.weight_initialization_mode, - scale_action=self.action_config.scale_action, - clip_action=self.action_config.clip_action, - output_activation=self.network_config.output_activation, - std_learning=self.action_config.std_learning, - std_init=self.action_config.std_init, - shared=self.network_config.shared, - **kwargs, + if actor_type == 'gaussian_sac': + return GaussianSACActor( + self._obs_space, + self._act_space, + self._hidden_sizes, + activation=self._activation, + weight_initialization_mode=self._weight_initialization_mode, ) raise NotImplementedError( f'Actor type {actor_type} is not implemented! ' - f'Did you mean {difflib.get_close_matches(actor_type, ["categorical", "gaussian_stdnet", "cholesky", "gaussian"], n=1)[0]}?' # pylint: disable=line-too-long + f'Did you mean {difflib.get_close_matches(actor_type, ["gaussian_learning", "gaussian_sac"])[0]}?' ) diff --git a/omnisafe/models/actor/categorical_actor.py b/omnisafe/models/actor/categorical_actor.py deleted file mode 100644 index 0132a128a..000000000 --- a/omnisafe/models/actor/categorical_actor.py +++ /dev/null @@ -1,134 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
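With the rewritten ``ActorBuilder``, actors are now built from gymnasium spaces rather than raw dimensions. A hedged usage sketch (it assumes the base ``Actor`` derives its input/output dimensions from ``Box`` spaces; the sizes and activation here are arbitrary choices):

    import torch
    from gymnasium import spaces

    from omnisafe.models.actor import ActorBuilder

    builder = ActorBuilder(
        obs_space=spaces.Box(low=-1.0, high=1.0, shape=(8,)),
        act_space=spaces.Box(low=-1.0, high=1.0, shape=(2,)),
        hidden_sizes=[64, 64],
        activation='tanh',
    )
    actor = builder.build_actor('gaussian_learning')  # learned, state-independent std
    sac_actor = builder.build_actor('gaussian_sac')   # tanh-squashed, state-dependent std

    obs = torch.randn(4, 8)
    action = actor.predict(obs, deterministic=False)
    log_prob = actor.log_prob(action)  # only valid immediately after predict() or forward()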
-# ============================================================================== -"""Implementation of categorical actor.""" - -from typing import Tuple, Union - -import torch -import torch.nn as nn -from torch.distributions.categorical import Categorical - -from omnisafe.models.base import Actor -from omnisafe.utils.model_utils import Activation, InitFunction, build_mlp_network - - -class CategoricalActor(Actor): - """Implementation of CategoricalActor. - - A Categorical policy that uses a MLP to map observations to actions distributions. - :class:`CategoricalActor` uses a single headed MLP, - to predict the logits of the Categorical distribution. - This class is an inherit class of :class:`Actor`. - You can design your own Categorical policy by inheriting this class or :class:`Actor`. - """ - - # pylint: disable-next=too-many-arguments - def __init__( - self, - obs_dim: int, - act_dim: int, - hidden_sizes: list, - activation: Activation = 'relu', - weight_initialization_mode: InitFunction = 'xavier_uniform', - shared: nn.Module = None, - ) -> None: - """Initialize CategoricalActor. - - Args: - obs_dim (int): Observation dimension. - act_dim (int): Action dimension. - hidden_sizes (list): Hidden layer sizes. - activation (Activation): Activation function. - weight_initialization_mode (InitFunction): Weight initialization mode. - shared (nn.Module): Shared network. - """ - super().__init__( - obs_dim, act_dim, hidden_sizes, activation, weight_initialization_mode, shared=shared - ) - if shared is not None: - action_head = build_mlp_network( - sizes=[hidden_sizes[-1], act_dim], - activation=activation, - weight_initialization_mode=weight_initialization_mode, - ) - self.net = nn.Sequential(shared, action_head) - else: - self.net = build_mlp_network( - [obs_dim] + list(hidden_sizes) + [act_dim], - activation=activation, - weight_initialization_mode=weight_initialization_mode, - ) - - def _distribution(self, obs: torch.Tensor) -> Categorical: - """Get distribution of the action. - - .. note:: - This function is used to get the distribution of the action. - It is used to sample actions and compute log probabilities. - - Args: - obs (torch.Tensor): Observation. - """ - logits = self.net(obs) - return Categorical(logits=logits) - - def predict( - self, - obs: torch.Tensor, - deterministic: bool = False, - need_log_prob: bool = False, - ) -> Union[Tuple[torch.Tensor, torch.Tensor], torch.Tensor]: - r"""Predict deterministic or stochastic action based on observation. - - - ``deterministic`` = ``True`` or ``False`` - - When training the actor, - one important trick to avoid local minimum is to use stochastic actions, - which can simply be achieved by sampling actions from the distribution - (set ``deterministic`` = ``False``). - - When testing the actor, - we want to know the actual action that the agent will take, - so we should use deterministic actions (set ``deterministic`` = ``True``). - - - ``need_log_prob`` = ``True`` or ``False`` - - In some cases, we need to calculate the log probability of the action, - which is used to calculate the loss of the actor. - For example, in the case of continuous action space, - the loss can be calculated as: - - .. math:: - L = -\mathbb{E}_{s \sim p(s)} [\log p(a | s) A^R (s, a)] - - where :math:`p(s)` is the distribution of observation, - :math:`p(a | s)` is the distribution of action, - and :math:`\log p(a | s)` is the log probability of action under the distribution. - - Args: - obs (torch.Tensor): observation. 
- deterministic (bool, optional): whether to predict deterministic action. Defaults to False. - need_log_prob (bool, optional): whether to return log probability of action. Defaults to False. - """ - dist = self._distribution(obs) - if deterministic: - action = dist.probs.argmax(dim=-1) - else: - action = dist.sample() - action = action.unsqueeze(0) - if need_log_prob: - logp_a = dist.log_prob(action) - return action, action, logp_a - return action, action diff --git a/omnisafe/models/actor/cholesky_actor.py b/omnisafe/models/actor/cholesky_actor.py deleted file mode 100644 index 21df20a1f..000000000 --- a/omnisafe/models/actor/cholesky_actor.py +++ /dev/null @@ -1,160 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Implementation of CholeskyActor.""" - -from typing import Tuple, Union - -import torch -import torch.nn.functional as F -from torch import nn -from torch.distributions import MultivariateNormal - -from omnisafe.utils.model_utils import Activation, InitFunction, build_mlp_network, initialize_layer - - -# pylint: disable-next=too-many-instance-attributes -class MLPCholeskyActor(nn.Module): - r"""Implementation of CholeskyActor. - - A Gaussian policy that uses a MLP to map observations to actions distributions. - :class:`MLPCholeskyActor` uses a double headed MLP , - to predict the mean and Cholesky decomposition of the Gaussian distribution. - - .. note:: - The Cholesky decomposition is a lower triangular matrix L with positive diagonal entries, - such that :math:`L^T L = \Sigma`, where :math:`\Sigma` is the covariance matrix of the Gaussian distribution. - The Cholesky decomposition is a convenient way to represent a covariance matrix, - and it is more numerically stable than the standard representation of the covariance matrix. - - This class is an inherit class of :class:`Actor`. - You can design your own Gaussian policy by inheriting this class or :class:`Actor`. - """ - - # pylint: disable-next=too-many-arguments - def __init__( - self, - obs_dim: int, - act_dim: int, - act_max: torch.Tensor, - act_min: torch.Tensor, - hidden_sizes: list, - cov_min: float, - mu_clamp_min: float, - mu_clamp_max: float, - cov_clamp_min: float, - cov_clamp_max: float, - activation: Activation = 'relu', - weight_initialization_mode: InitFunction = 'xavier_uniform', - ) -> None: - """Initialize MLPCholeskyActor. - - Args: - obs_dim (int): observation dimension. - act_dim (int): action dimension. - act_max (torch.Tensor): maximum value of the action. - act_min (torch.Tensor): minimum value of the action. - hidden_sizes (list): list of hidden layer sizes. - activation (str): activation function. - cov_min (float): minimum value of the covariance matrix. - mu_clamp_min (float): minimum value of the mean. - mu_clamp_max (float): maximum value of the mean. - cov_clamp_min (float): minimum value of the covariance matrix. 
- cov_clamp_max (float): maximum value of the covariance matrix. - weight_initialization_mode (str): weight initialization mode. - """ - super().__init__() - pi_sizes = [obs_dim] + hidden_sizes - self.act_limit = act_max - self.act_low = torch.nn.Parameter( - torch.as_tensor(act_min), requires_grad=False - ) # (1, act_dim) - self.act_high = torch.nn.Parameter( - torch.as_tensor(act_max), requires_grad=False - ) # (1, act_dim) - self.act_dim = act_dim - self.obs_dim = obs_dim - self.cov_min = cov_min - self.mu_clamp_min = mu_clamp_min - self.mu_clamp_max = mu_clamp_max - self.cov_clamp_min = cov_clamp_min - self.cov_clamp_max = cov_clamp_max - - self.net = build_mlp_network(pi_sizes, activation, activation) - self.mu_layer = nn.Linear(hidden_sizes[-1], act_dim) - self.cholesky_layer = nn.Linear(hidden_sizes[-1], (self.act_dim * (self.act_dim + 1)) // 2) - initialize_layer(weight_initialization_mode, self.mu_layer) - # initialize_layer(weight_initialization_mode,self.cholesky_layer) - nn.init.constant_(self.mu_layer.bias, 0.0) - nn.init.constant_(self.cholesky_layer.bias, 0.0) - - def predict( - self, obs: torch.Tensor, deterministic: bool = False, need_log_prob: bool = False - ) -> Union[Tuple[torch.Tensor, torch.Tensor], torch.Tensor]: - r"""Predict action given observation. - - .. note:: - - Compute the mean and Cholesky decomposition of the Gaussian distribution. - - Compute logprob from Gaussian, and then apply correction for Tanh squashing. - For details of the correction formula, - please refer to the original `SAC paper `_. - - Get action from Multi-variate Gaussian distribution. - - Args: - obs (torch.Tensor): Observation. - deterministic (bool): Whether to use deterministic policy. - """ - if len(obs.shape) == 1: - obs = torch.unsqueeze(obs, dim=0) - obs_length = obs.size(0) - - net_out = self.net(obs) - - clamped_mu = torch.clamp(self.mu_layer(net_out), self.mu_clamp_min, self.mu_clamp_max) - mean = torch.sigmoid(clamped_mu) # (B, act_dim) - - mean = self.act_low + (self.act_high - self.act_low) * mean - cholesky_vector = torch.clamp( - self.cholesky_layer(net_out), self.cov_clamp_min, self.cov_clamp_max - ) - cholesky_diag_index = torch.arange(self.act_dim, dtype=torch.long) + 1 - cholesky_diag_index = ( - torch.div(cholesky_diag_index * (cholesky_diag_index + 1), 2, rounding_mode='floor') - 1 - ) - cholesky_vector[:, cholesky_diag_index] = ( - F.softplus(cholesky_vector[:, cholesky_diag_index]) + self.cov_min - ) - tril_indices = torch.tril_indices(row=self.act_dim, col=self.act_dim, offset=0) - cholesky = torch.zeros(size=(obs_length, self.act_dim, self.act_dim), dtype=torch.float32) - cholesky[:, tril_indices[0], tril_indices[1]] = cholesky_vector - pi_distribution = MultivariateNormal(mean.to(torch.float32), scale_tril=cholesky) - - if deterministic: - pi_action = mean - else: - pi_action = pi_distribution.rsample() - - pi_action = torch.tanh(pi_action) - pi_action = self.act_limit * pi_action - - if need_log_prob: - return ( - pi_action.to(torch.float32), - pi_action.to(torch.float32), - cholesky.to(torch.float32), - ) - return pi_action.to(torch.float32), pi_action.to(torch.float32) - - def forward(self, obs, deterministic=False): - """Forward.""" diff --git a/omnisafe/models/actor/gaussian_actor.py b/omnisafe/models/actor/gaussian_actor.py index a969320da..1baff6fc7 100644 --- a/omnisafe/models/actor/gaussian_actor.py +++ b/omnisafe/models/actor/gaussian_actor.py @@ -12,222 +12,29 @@ # See the License for the specific language governing permissions and # limitations under 
the License. # ============================================================================== -"""Implementation of GaussianStdNetActor.""" +"""This module contains some base normal distribution agent for the models.""" -from typing import Optional, Tuple, Union - -import torch -import torch.nn as nn -from torch.distributions.normal import Normal +from abc import ABC, abstractmethod from omnisafe.models.base import Actor -from omnisafe.utils.model_utils import Activation, InitFunction, build_mlp_network - - -# pylint: disable-next=too-many-instance-attributes -class GaussianActor(Actor): - """Implementation of GaussianStdNetActor.""" - - # pylint: disable-next=too-many-arguments, too-many-locals - def __init__( - self, - obs_dim: int, - act_dim: int, - act_max: torch.Tensor, - act_min: torch.Tensor, - hidden_sizes: list, - activation: Activation = 'tanh', - output_activation: Activation = 'identity', - weight_initialization_mode: InitFunction = 'kaiming_uniform', - shared: nn.Module = None, - scale_action: bool = False, - clip_action: bool = False, - std_learning: bool = True, - std_init: float = 1.0, - std_end: float = 1.0, - std_annealing: bool = False, - ) -> None: - """Initialize GaussianStdNetActor. - - Args: - obs_dim (int): Observation dimension. - act_dim (int): Action dimension. - act_max (torch.Tensor): Maximum value of the action. - act_min (torch.Tensor): Minimum value of the action. - hidden_sizes (list): List of hidden layer sizes. - activation (Activation): Activation function. - output_activation (Activation): Activation function for the output layer. - weight_initialization_mode (InitFunction): Weight initialization mode. - shared (nn.Module): Shared module. - scale_action (bool): Whether to scale the action. - clip_action (bool): Whether to clip the action. - std_learning (bool): Whether to learn the standard deviation. - std_init (float): Initial value of the standard deviation. - std_end (float): Final value of the standard deviation. - std_annealing (bool): Whether to anneal the standard deviation. - """ - super().__init__( - obs_dim, act_dim, hidden_sizes, activation, weight_initialization_mode, shared - ) - self.act_min = act_min - self.act_max = act_max - self.scale_action = scale_action - self.clip_action = clip_action - self.std_init = std_init - self._std = std_init - self.std_end = std_end - self.std_annealing = std_annealing - assert ( - self.act_min.size() == self.act_max.size() - ), f'The size of act_min {self.act_min} and act_max {self.act_max} should be the same.' - if std_annealing: - assert ( - std_init > std_end - ), 'If std_annealing is True, std_init should be greater than std_end.' - assert not std_learning, 'If std_annealing is True, std_learning should be False.' - if std_learning: - assert not std_annealing, 'If std_learning is True, std_annealing should be False.' - - if shared is not None: - mean_head = build_mlp_network( - sizes=[hidden_sizes[-1], act_dim], - activation=activation, - weight_initialization_mode=weight_initialization_mode, - ) - self.net = nn.Sequential(shared, mean_head) - else: - self.net = build_mlp_network( - [obs_dim] + list(hidden_sizes) + [act_dim], - activation=activation, - output_activation=output_activation, - weight_initialization_mode=weight_initialization_mode, - ) - self.logstd_layer = nn.Parameter(torch.zeros(1, act_dim), requires_grad=std_learning) - - def _distribution(self, obs: torch.Tensor) -> Normal: - """Get distribution of the action. - - .. 
note:: - The term ``log_std`` is used to control the noise level of the policy, - which is a trainable parameter. - To avoid the policy to be too explorative, - we use ``torch.clamp`` to limit the range of ``log_std``. - - Args: - obs (torch.Tensor): Observation. - """ - mean, std = self.get_mean_std(obs) - return Normal(mean, std) - - def get_mean_std(self, obs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - """Get mean and std of the action. - - Args: - obs (torch.Tensor): Observation. - - """ - mean = self.net(obs) - if len(mean.size()) == 1: - mean = mean.view(1, -1) - log_std = self.logstd_layer.expand_as(mean) - std = torch.exp(log_std) * self._std - - return mean, std - - def get_log_prob(self, obs: torch.Tensor, action: torch.Tensor) -> torch.Tensor: - """Get log probability of the action. - - Args: - obs (torch.Tensor): Observation. - action (torch.Tensor): Action. - """ - dist = self._distribution(obs) - return dist.log_prob(action).sum(axis=-1) - - def predict( - self, - obs: torch.Tensor, - deterministic: bool = False, - need_log_prob: bool = False, - ) -> Union[Tuple[torch.Tensor, torch.Tensor], torch.Tensor]: - r"""Predict action given observation. - - .. note:: - The action is scaled to the action space by: - - .. math:: - a = a_{min} + \frac{a + 1}{2} \times (a_{max} - a_{min}) - - where :math:`a` is the action predicted by the policy, - :math:`a_{min}` and :math:`a_{max}` are the minimum and maximum values of the action space. - After scaling, the action is clipped to the range of :math:`[a_{min}, a_{max}]`. - - Args: - obs (torch.Tensor): Observation. - deterministic (bool): Whether to use deterministic policy. - """ - mean, std = self.get_mean_std(obs) - dist = Normal(mean, std) - if deterministic: - out = mean.to(torch.float64) - else: - out = dist.rsample().to(torch.float64) - - if self.scale_action: - # If the action scale is inf, stop scaling the action - assert ( - not torch.isinf(self.act_min).any() and not torch.isinf(self.act_max).any() - ), 'The action scale is inf, stop scaling the action.' - self.act_min = self.act_min.to(mean.device) - self.act_max = self.act_max.to(mean.device) - action = self.act_min + (out + 1) / 2 * (self.act_max - self.act_min) - else: - action = out - - if self.clip_action: - action = torch.clamp(action, self.act_min, self.act_max) - - if need_log_prob: - log_prob = dist.log_prob(out).sum(axis=-1) - return out.to(torch.float32), action.to(torch.float32), log_prob.to(torch.float32) - return out.to(torch.float32), action.to(torch.float32) - - def forward( - self, - obs: torch.Tensor, - act: Optional[torch.Tensor] = None, - ) -> Union[Tuple[torch.Tensor, torch.Tensor], torch.Tensor]: - """Forward function for actor. - - .. note:: - This forward function has two modes: - - If ``act`` is not None, it will return the distribution and the log probability of action. - - If ``act`` is None, it will return the distribution. - Args: - obs (torch.Tensor): observation. - act (torch.Tensor, optional): action. Defaults to None. - """ - dist = self._distribution(obs) - if act is not None: - log_prob = dist.log_prob(act).sum(axis=-1) - return dist, log_prob - return dist +class GaussianActor(Actor, ABC): + """A abstract class for normal distribution actor. - def get_distribution(self, obs: torch.Tensor) -> Normal: - """Get distribution of the action. - Args: - obs (torch.Tensor): Observation. - """ - return self._distribution(obs) + AN NormalActor inherits from Actor and use Normal distribution to approximate + the policy function. 
- def set_std(self, proportion: float) -> float: - """To support annealing exploration noise. + .. note:: + You can use this class to implement your own actor by inheriting it. + """ - Proportion is annealing from 1. to 0 over course of training. + @property + @abstractmethod + def std(self) -> float: + """Get the standard deviation of the normal distribution.""" - Args: - proportion (float): proportion of annealing. - """ - self._std = self.std_init * proportion + self.std_end * (1 - proportion) + @std.setter + @abstractmethod + def std(self, std: float) -> None: + """Set the standard deviation of the normal distribution.""" diff --git a/omnisafe/models/actor/gaussian_learning_actor.py b/omnisafe/models/actor/gaussian_learning_actor.py new file mode 100644 index 000000000..cec6b8354 --- /dev/null +++ b/omnisafe/models/actor/gaussian_learning_actor.py @@ -0,0 +1,87 @@ +# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementation of GaussianStdNetActor.""" + +from typing import List + +import torch +import torch.nn as nn +from torch.distributions import Distribution, Normal + +from omnisafe.models.actor.gaussian_actor import GaussianActor +from omnisafe.typing import Activation, InitFunction, OmnisafeSpace +from omnisafe.utils.model import build_mlp_network + + +# pylint: disable-next=too-many-instance-attributes +class GaussianLearningActor(GaussianActor): + """Implementation of GaussianLearningActor.""" + + def __init__( + self, + obs_space: OmnisafeSpace, + act_space: OmnisafeSpace, + hidden_sizes: List[int], + activation: Activation = 'relu', + weight_initialization_mode: InitFunction = 'kaiming_uniform', + ) -> None: + """Initialize GaussianLearningActor. + + Args: + obs_space (OmnisafeSpace): Observation space. + act_space (OmnisafeSpace): Action space. + hidden_sizes (list): List of hidden layer sizes. + activation (Activation): Activation function. + weight_initialization_mode (InitFunction): Weight initialization mode. + shared (nn.Module): Shared module. 
+ """ + super().__init__(obs_space, act_space, hidden_sizes, activation, weight_initialization_mode) + self.mean = build_mlp_network( + sizes=[self._obs_dim, *self._hidden_sizes, self._act_dim], + activation=activation, + weight_initialization_mode=weight_initialization_mode, + ) + self.log_std = nn.Parameter(torch.zeros(self._act_dim), requires_grad=True) + + def _distribution(self, obs: torch.Tensor) -> Distribution: + mean = self.mean(obs) + std = torch.exp(self.log_std) + return Normal(mean, std) + + def predict(self, obs: torch.Tensor, deterministic: bool = False) -> torch.Tensor: + self._current_dist = self._distribution(obs) + self._after_inference = True + if deterministic: + return self._current_dist.mean + return self._current_dist.rsample() + + def forward(self, obs: torch.Tensor) -> Distribution: + self._current_dist = self._distribution(obs) + self._after_inference = True + return self._current_dist + + def log_prob(self, act: torch.Tensor) -> torch.Tensor: + assert self._after_inference, 'log_prob() should be called after predict() or forward()' + self._after_inference = False + return self._current_dist.log_prob(act).sum(axis=-1) + + @property + def std(self) -> float: + return torch.exp(self.log_std).mean().item() + + @std.setter + def std(self, std: float) -> None: + device = self.log_std.device + self.log_std.data.fill_(torch.log(torch.tensor(std, device=device))) diff --git a/omnisafe/models/actor/gaussian_sac_actor.py b/omnisafe/models/actor/gaussian_sac_actor.py new file mode 100644 index 000000000..00bcb346f --- /dev/null +++ b/omnisafe/models/actor/gaussian_sac_actor.py @@ -0,0 +1,76 @@ +# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
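``GaussianLearningActor`` caches the distribution produced by ``predict()``/``forward()`` and exposes a writable ``std`` property for exploration control. A small sketch of that contract (illustrative only; it assumes the base ``Actor`` accepts ``Box`` spaces):

    import torch
    from gymnasium import spaces

    from omnisafe.models.actor.gaussian_learning_actor import GaussianLearningActor

    actor = GaussianLearningActor(
        obs_space=spaces.Box(low=-1.0, high=1.0, shape=(8,)),
        act_space=spaces.Box(low=-1.0, high=1.0, shape=(2,)),
        hidden_sizes=[64, 64],
    )

    dist = actor(torch.randn(4, 8))           # forward() caches the current Normal
    log_prob = actor.log_prob(dist.sample())  # must directly follow forward()/predict()

    actor.std = 0.5                           # rewrites log_std in place
    assert abs(actor.std - 0.5) < 1e-6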
+# ==============================================================================
+"""Implementation of GaussianSACActor."""
+
+from typing import List
+
+import torch
+from torch.distributions import Distribution
+
+from omnisafe.models.base import Actor
+from omnisafe.typing import Activation, InitFunction, OmnisafeSpace
+from omnisafe.utils.math import TanhNormal
+from omnisafe.utils.model import build_mlp_network
+
+
+class GaussianSACActor(Actor):
+    """Implementation of GaussianSACActor."""
+
+    def __init__(
+        self,
+        obs_space: OmnisafeSpace,
+        act_space: OmnisafeSpace,
+        hidden_sizes: List[int],
+        activation: Activation = 'relu',
+        weight_initialization_mode: InitFunction = 'kaiming_uniform',
+    ) -> None:
+        super().__init__(obs_space, act_space, hidden_sizes, activation, weight_initialization_mode)
+        self.net = build_mlp_network(
+            sizes=[self._obs_dim, *self._hidden_sizes, self._act_dim * 2],
+            activation=activation,
+            weight_initialization_mode=weight_initialization_mode,
+        )
+
+    def _distribution(self, obs: torch.Tensor) -> Distribution:
+        mean, log_std = self.net(obs).chunk(2, dim=-1)
+        log_std = torch.clamp(log_std, min=-20, max=2)
+        std = log_std.exp()
+        return TanhNormal(mean, std)
+
+    def predict(self, obs: torch.Tensor, deterministic: bool = False) -> torch.Tensor:
+        self._current_dist = self._distribution(obs)
+        self._after_inference = True
+        if deterministic:
+            return self._current_dist.mean
+        return self._current_dist.rsample()
+
+    def forward(self, obs: torch.Tensor) -> Distribution:
+        self._current_dist = self._distribution(obs)
+        self._after_inference = True
+        return self._current_dist
+
+    def log_prob(self, act: torch.Tensor) -> torch.Tensor:
+        assert self._after_inference, 'log_prob() should be called after predict() or forward()'
+        self._after_inference = False
+        return self._current_dist.log_prob(act).sum(axis=-1)
+
+    @property
+    def std(self) -> float:
+        """Get the standard deviation of the normal distribution."""
+        return self._current_dist.stddev.mean().item()
+
+    @std.setter
+    def std(self, std: float) -> None:
+        raise NotImplementedError('GaussianSACActor does not support setting std.')
diff --git a/omnisafe/models/actor/gaussian_stdnet_actor.py b/omnisafe/models/actor/gaussian_stdnet_actor.py
deleted file mode 100644
index 7b7189f6a..000000000
--- a/omnisafe/models/actor/gaussian_stdnet_actor.py
+++ /dev/null
@@ -1,166 +0,0 @@
-# Copyright 2022-2023 OmniSafe Team. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================== -"""Implementation of GaussianStdNetActor.""" - -import torch -import torch.nn as nn -from torch.distributions.normal import Normal - -from omnisafe.models.base import Actor -from omnisafe.utils.model_utils import Activation, build_mlp_network - - -class GaussianStdNetActor(Actor): - """Implementation of GaussianStdNetActor.""" - - # pylint: disable-next=too-many-arguments - def __init__( - self, - obs_dim, - act_dim, - act_max: torch.Tensor, - act_min: torch.Tensor, - hidden_sizes: list, - activation: Activation = 'relu', - output_activation: Activation = 'tanh', - weight_initialization_mode: Activation = 'kaiming_uniform', - shared=None, - scale_action=False, - clip_action: bool = False, - ): - """Initialize GaussianStdNetActor.""" - super().__init__( - obs_dim, act_dim, hidden_sizes, activation, weight_initialization_mode, shared - ) - self.act_min = act_min - self.act_max = act_max - self.scale_action = scale_action - self.clip_action = clip_action - - if shared is not None: - mean_head = build_mlp_network( - sizes=[hidden_sizes[-1], act_dim], - activation=activation, - weight_initialization_mode=weight_initialization_mode, - ) - std_head = build_mlp_network( - sizes=[hidden_sizes[-1], act_dim], - activation=activation, - weight_initialization_mode=weight_initialization_mode, - ) - self.mean = nn.Sequential(shared, mean_head) - self.log_std = nn.Sequential(shared, std_head) - else: - net = build_mlp_network( - [obs_dim] + list(hidden_sizes), - activation=activation, - output_activation=output_activation, - weight_initialization_mode=weight_initialization_mode, - ) - mean_head = build_mlp_network( - sizes=[hidden_sizes[-1], act_dim], - activation=activation, - output_activation=output_activation, - weight_initialization_mode=weight_initialization_mode, - ) - std_head = build_mlp_network( - sizes=[hidden_sizes[-1], act_dim], - activation=activation, - output_activation=output_activation, - weight_initialization_mode=weight_initialization_mode, - ) - self.mean = nn.Sequential(net, mean_head) - self.log_std = nn.Sequential(net, std_head) - self.net = nn.ModuleList([self.mean, self.log_std]) - - def _distribution(self, obs): - """Get distribution of the action. - - .. note:: - The term ``log_std`` is used to control the noise level of the policy, - which is a trainable parameter. - To avoid the policy to be too explorative, - we use ``torch.clamp`` to limit the range of ``log_std``. - - Args: - obs (torch.Tensor): Observation. - """ - mean = self.mean(obs) - log_std = self.log_std(obs) - log_std = torch.clamp(log_std, -20, 2) - std = torch.exp(log_std) - return Normal(mean, std) - - def predict(self, obs, deterministic=False, need_log_prob=False): - r"""Predict action given observation. - - .. note:: - The action is scaled to the action space by: - - .. math:: - a = a_{min} + \frac{a + 1}{2} \times (a_{max} - a_{min}) - - where :math:`a` is the action predicted by the policy, - :math:`a_{min}` and :math:`a_{max}` are the minimum and maximum values of the action space. - After scaling, the action is clipped to the range of :math:`[a_{min}, a_{max}]`. - - Args: - obs (torch.Tensor): Observation. - deterministic (bool): Whether to use deterministic policy. 
- """ - dist = self._distribution(obs) - if deterministic: - out = dist.mean - else: - out = dist.rsample() - - if self.scale_action: - # If the action scale is inf, stop scaling the action - assert ( - not torch.isinf(self.act_min).any() and not torch.isinf(self.act_max).any() - ), 'The action scale is inf, stop scaling the action.' - self.act_min = self.act_min.to(out.device) - self.act_max = self.act_max.to(out.device) - action = self.act_min + (out + 1) / 2 * (self.act_max - self.act_min) - else: - action = out - - if self.clip_action: - action = torch.clamp(action, self.act_min, self.act_max) - - if need_log_prob: - log_prob = dist.log_prob(out).sum(axis=-1) - log_prob -= torch.log(1.00001 - torch.tanh(out) ** 2).sum(axis=-1) - return out.to(torch.float32), action.to(torch.float32), log_prob.to(torch.float32) - return out.to(torch.float32), action.to(torch.float32) - - def forward(self, obs, act=None): - """Forward function for actor. - - .. note:: - This forward function has two modes: - - - If ``act`` is not None, it will return the distribution and the log probability of action. - - If ``act`` is None, it will return the distribution. - - Args: - obs (torch.Tensor): observation. - act (torch.Tensor, optional): action. Defaults to None. - """ - dist = self._distribution(obs) - if act is not None: - log_prob = dist.log_prob(act).sum(axis=-1) - return dist, log_prob - return dist diff --git a/omnisafe/models/actor_critic.py b/omnisafe/models/actor_critic.py deleted file mode 100644 index 60108aead..000000000 --- a/omnisafe/models/actor_critic.py +++ /dev/null @@ -1,164 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Implementation of ActorCritic.""" - -from typing import NamedTuple, Tuple - -import numpy as np -import torch -import torch.nn as nn -from gymnasium.spaces import Box, Discrete - -from omnisafe.models.actor import ActorBuilder -from omnisafe.models.critic import CriticBuilder -from omnisafe.utils.model_utils import build_mlp_network - - -# pylint: disable-next=too-many-instance-attributes -class ActorCritic(nn.Module): - """Class for ActorCritic. - - In ``omnisafe``, we combine the actor and critic into one this class. - - .. list-table:: - - * - Model - - Description - - Function - * - Actor - - The policy network, input is observation, output is action. - Choose the actor from the following options: - :class:`MLPActor`, :class:`CategoricalActor`, :class:`GaussianAnnealingActor`, - :class:`GaussianLearningActor`, :class:`GaussianStdNetActor`, :class:`MLPCholeskyActor`. - - Choose the action based on the observation. - * - Value Critic - - The value network, input is observation, output is reward value. - Choose the critic from the following options: - :class:`QCritic`, :class:`VCritic`. - - Estimate the reward value of the observation. 
- """ - - # pylint: disable-next=too-many-arguments - def __init__( - self, - observation_space: Box, - action_space: Box, - model_cfgs: NamedTuple, - ) -> None: - """Initialize ActorCritic - - .. note:: - Instead of creating the actor or critic directly, we use the builder to create them. - The advantage of this is that, - each type of critic has a uniform way of passing parameters. - This makes it easy for users to use existing critics, - and also facilitates the extension of new critic types. - - Args: - observation_space (Box): Observation space. - action_space (Box): Action space. - standardized_obs (bool): Whether to standardize the observation. - scale_rewards (bool): Whether to scale the rewards. - model_cfgs (NamedTuple): Model configurations. - """ - super().__init__() - - self.obs_shape = observation_space.shape - self.obs_dim = observation_space.shape[-1] - - self.act_space_type = 'discrete' if isinstance(action_space, Discrete) else 'continuous' - self.act_dim = action_space.shape[-1] if isinstance(action_space, Box) else action_space.n - - self.model_cfgs = model_cfgs - self.ac_kwargs = model_cfgs.ac_kwargs - - # use for shared weights - layer_units = [self.obs_dim] + self.ac_kwargs.pi.hidden_sizes - activation = self.ac_kwargs.pi.activation - if model_cfgs.shared_weights: - self.shared = build_mlp_network( - layer_units, - activation=activation, - weight_initialization_mode=model_cfgs.weight_initialization_mode, - output_activation=activation, - ) - else: - self.shared = None - - # build actor - actor_builder = ActorBuilder( - obs_dim=self.obs_dim, - act_dim=self.act_dim, - weight_initialization_mode=model_cfgs.weight_initialization_mode, - shared=self.shared, - **self.ac_kwargs.pi, - ) - if self.act_space_type == 'discrete': - self.actor = actor_builder.build_actor('categorical') - else: - act_max = torch.as_tensor(action_space.high) - act_min = torch.as_tensor(action_space.low) - self.actor = actor_builder.build_actor( - model_cfgs.actor_type, act_max=act_max, act_min=act_min - ) - - # build critic - critic_builder = CriticBuilder( - obs_dim=self.obs_dim, - act_dim=self.act_dim, - hidden_sizes=self.ac_kwargs.val.hidden_sizes, - activation=self.ac_kwargs.val.activation, - weight_initialization_mode=model_cfgs.weight_initialization_mode, - shared=self.shared, - ) - self.reward_critic = critic_builder.build_critic('v') - - def forward(self, obs: torch.Tensor) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: - """Forward pass of the actor-critic model""" - return self.step(obs) - - def step( - self, obs: torch.Tensor, deterministic: bool = False - ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: - """Step function of the actor-critic model - - Input observation, output value (from :class:`Critic`) action, - and its log probability (from :class`Actor`). - - .. note:: - The observation is standardized by the running mean and standard deviation. - - Args: - obs (torch.Tensor): Observation. - deterministic (bool, optional): Whether to use deterministic action. - """ - with torch.no_grad(): - value = self.reward_critic(obs) - raw_action, action, logp_a = self.actor.predict( - obs, deterministic=deterministic, need_log_prob=True - ) - - return raw_action, action, value, logp_a - - def anneal_exploration(self, frac: float) -> None: - """Update internals of actors - - Updates exploration parameters for Gaussian actors update log_std - - Args: - frac: progress of epochs. 1.0 is the end of training. 
- """ - if hasattr(self.actor, 'set_std'): - self.actor.set_std(1 - frac) diff --git a/omnisafe/models/actor_critic/actor_critic.py b/omnisafe/models/actor_critic/actor_critic.py new file mode 100644 index 000000000..f920e27a9 --- /dev/null +++ b/omnisafe/models/actor_critic/actor_critic.py @@ -0,0 +1,154 @@ +# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementation of ActorCritic.""" + +from typing import List, Tuple + +import torch +from torch import nn, optim +from torch.optim.lr_scheduler import ConstantLR, LinearLR, _LRScheduler + +from omnisafe.models.actor.actor_builder import ActorBuilder +from omnisafe.models.actor.gaussian_learning_actor import GaussianLearningActor +from omnisafe.models.critic.critic_builder import CriticBuilder +from omnisafe.typing import OmnisafeSpace +from omnisafe.utils.config import ModelConfig +from omnisafe.utils.schedule import PiecewiseSchedule, Schedule + + +class ActorCritic(nn.Module): + """Class for ActorCritic. + + In ``omnisafe``, we combine the actor and critic into one this class. + + .. list-table:: + + * - Model + - Description + - Function + * - Actor + - The policy network, input is observation, output is action. + Choose the actor from the following options: + :class:`MLPActor`, :class:`CategoricalActor`, :class:`GaussianAnnealingActor`, + :class:`GaussianLearningActor`, :class:`GaussianStdNetActor`, :class:`MLPCholeskyActor`. + - Choose the action based on the observation. + * - Value Critic + - The value network, input is observation, output is reward value. + Choose the critic from the following options: + :class:`QCritic`, :class:`VCritic`. + - Estimate the reward value of the observation. 
+ """ + + # pylint: disable-next=too-many-arguments + def __init__( + self, + obs_space: OmnisafeSpace, + act_space: OmnisafeSpace, + model_cfgs: ModelConfig, + epochs: int, + ) -> None: + """Initialize ActorCritic.""" + super().__init__() + self.actor = ActorBuilder( + obs_space=obs_space, + act_space=act_space, + hidden_sizes=model_cfgs.actor.hidden_sizes, + activation=model_cfgs.actor.activation, + weight_initialization_mode=model_cfgs.weight_initialization_mode, + ).build_actor(actor_type=model_cfgs.actor_type) + self.reward_critic = CriticBuilder( + obs_space=obs_space, + act_space=act_space, + hidden_sizes=model_cfgs.critic.hidden_sizes, + activation=model_cfgs.critic.activation, + weight_initialization_mode=model_cfgs.weight_initialization_mode, + num_critics=1, + use_obs_encoder=False, + ).build_critic(critic_type='v') + self.add_module('actor', self.actor) + self.add_module('reward_critic', self.reward_critic) + + self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=model_cfgs.actor.lr) + self.reward_critic_optimizer = optim.Adam( + self.reward_critic.parameters(), lr=model_cfgs.critic.lr + ) + + self.actor_scheduler: _LRScheduler + if model_cfgs.linear_lr_decay: + self.actor_scheduler = LinearLR( + self.actor_optimizer, + start_factor=1.0, + end_factor=0.0, + total_iters=epochs, + verbose=True, + ) + else: + self.actor_scheduler = ConstantLR( + self.actor_optimizer, factor=1.0, total_iters=epochs, verbose=True + ) + + self.std_schedule: Schedule + + def step(self, obs: torch.Tensor, deterministic: bool = False) -> Tuple[torch.Tensor, ...]: + """Choose the action based on the observation. used in rollout without gradient. + + Args: + obs: The observation. + deterministic: Whether to use deterministic action. default: False. + + Returns: + The action, value_r, and log_prob. + """ + with torch.no_grad(): + value_r = self.reward_critic(obs) + act = self.actor.predict(obs, deterministic=deterministic) + log_prob = self.actor.log_prob(act) + return act, value_r[0], log_prob + + def forward(self, obs: torch.Tensor, deterministic: bool = False) -> Tuple[torch.Tensor, ...]: + """Choose the action based on the observation. used in training with gradient. + + Args: + obs: The observation. + deterministic: Whether to use deterministic action. default: False. + + Returns: + The action, value_r, and log_prob. + """ + return self.step(obs, deterministic=deterministic) + + def set_annealing(self, epochs: List[float], std: List[float]) -> None: + """Set the annealing mode for the actor. + + Args: + annealing: Whether to use annealing mode. + """ + assert isinstance( + self.actor, GaussianLearningActor + ), 'Only GaussianLearningActor support annealing.' + self.std_schedule = PiecewiseSchedule( + endpoints=list(zip(epochs, std)), outside_value=std[-1] + ) + + def annealing(self, epoch: int) -> None: + """Set the annealing mode for the actor. + + Args: + epoch: The current epoch. + """ + assert isinstance( + self.actor, GaussianLearningActor + ), 'Only GaussianLearningActor support annealing.' + self.actor.std = self.std_schedule.value(epoch) diff --git a/omnisafe/models/actor_critic/constraint_actor_critic.py b/omnisafe/models/actor_critic/constraint_actor_critic.py new file mode 100644 index 000000000..f69cd6d5e --- /dev/null +++ b/omnisafe/models/actor_critic/constraint_actor_critic.py @@ -0,0 +1,117 @@ +# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementation of ConstraintActorCritic.""" + +from typing import Tuple + +import torch +from torch import optim + +from omnisafe.models.actor_critic.actor_critic import ActorCritic +from omnisafe.models.critic.critic_builder import CriticBuilder +from omnisafe.typing import OmnisafeSpace +from omnisafe.utils.config import ModelConfig + + +class ConstraintActorCritic(ActorCritic): + """ConstraintActorCritic is a wrapper around ActorCritic that adds a cost critic to the model. + + In ``omnisafe``, we combine the actor and critic into one this class. + + .. list-table:: + + * - Model + - Description + - Function + * - Actor + - The policy network, input is observation, output is action. + Choose the actor from the following options: + :class:`MLPActor`, :class:`CategoricalActor`, :class:`GaussianAnnealingActor`, + :class:`GaussianLearningActor`, :class:`GaussianStdNetActor`, :class:`MLPCholeskyActor`. + - Choose the action based on the observation. + * - Reward Critic + - The value network, input is observation, + output is reward value. + Choose the critic from the following options: + :class:`QCritic`, :class:`VCritic`. + - Estimate the reward value of the observation. + * - Cost Critic + - The value network, input is observation, + output is cost value. + Choose the critic from the following options: + :class:`QCritic`, :class:`VCritic`. + - Estimate the cost value of the observation. + """ + + def __init__( + self, + obs_space: OmnisafeSpace, + act_space: OmnisafeSpace, + model_cfgs: ModelConfig, + epochs: int, + ) -> None: + """Initialize ConstraintActorCritic.""" + super().__init__(obs_space, act_space, model_cfgs, epochs) + self.cost_critic = CriticBuilder( + obs_space=obs_space, + act_space=act_space, + hidden_sizes=model_cfgs.critic.hidden_sizes, + activation=model_cfgs.critic.activation, + weight_initialization_mode=model_cfgs.weight_initialization_mode, + num_critics=1, + use_obs_encoder=False, + ).build_critic('v') + self.add_module('cost_critic', self.cost_critic) + + self.cost_critic_optimizer = optim.Adam( + self.cost_critic.parameters(), lr=model_cfgs.critic.lr + ) + + def step(self, obs: torch.Tensor, deterministic: bool = False) -> Tuple[torch.Tensor, ...]: + """Choose action based on observation. + + Args: + obs (torch.Tensor): Observation. + deterministic (bool): Whether to use deterministic policy. + + Returns: + action (torch.Tensor): Action. + value_r (torch.Tensor): Reward value. + value_c (torch.Tensor): Cost value. + log_prob (torch.Tensor): Log probability of action. 
+ """ + with torch.no_grad(): + value_r = self.reward_critic(obs) + value_c = self.cost_critic(obs) + + action = self.actor.predict(obs, deterministic=deterministic) + log_prob = self.actor.log_prob(action) + + return action, value_r[0], value_c[0], log_prob + + def forward(self, obs: torch.Tensor, deterministic: bool = False) -> Tuple[torch.Tensor, ...]: + """Choose action based on observation. + + Args: + obs (torch.Tensor): Observation. + deterministic (bool): Whether to use deterministic policy. + + Returns: + action (torch.Tensor): Action. + value_r (torch.Tensor): Reward value. + value_c (torch.Tensor): Cost value. + log_prob (torch.Tensor): Log probability of action. + """ + return self.step(obs, deterministic=deterministic) diff --git a/omnisafe/models/actor_q_critic.py b/omnisafe/models/actor_q_critic.py deleted file mode 100644 index 92995a7e0..000000000 --- a/omnisafe/models/actor_q_critic.py +++ /dev/null @@ -1,178 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Implementation of ActorQCritic.""" - -from typing import NamedTuple, Tuple - -import numpy as np -import torch -import torch.nn as nn -from gymnasium.spaces import Box, Discrete - -from omnisafe.models.actor import ActorBuilder -from omnisafe.models.critic.q_critic import QCritic -from omnisafe.utils.model_utils import build_mlp_network - - -# pylint: disable-next=too-many-instance-attributes -class ActorQCritic(nn.Module): - """Class for ActorCritic. - - In ``omnisafe``, we combine the actor and critic into one this class. - - .. list-table:: - - * - Model - - Description - - Function - * - Actor - - The policy network, input is observation, output is action. - Choose the actor from the following options: - :class:`MLPActor`, :class:`CategoricalActor`, :class:`GaussianAnnealingActor`, - :class:`GaussianLearningActor`, :class:`GaussianStdNetActor`, :class:`MLPCholeskyActor`. - - Choose the action based on the observation. - * - Value Q Critic - - The value network, input is observation-action pair, - output is reward value. - Choose the critic from the following options: - :class:`QCritic`, :class:`VCritic`. - - Estimate the reward value of the observation. - """ - - # pylint: disable-next=too-many-arguments - def __init__( - self, - observation_space: Box, - action_space: Box, - model_cfgs: NamedTuple, - ) -> None: - """Initialize ActorQCritic - - .. note:: - Instead of creating the actor or critic directly, we use the builder to create them. - The advantage of this is that, - each type of critic has a uniform way of passing parameters. - This makes it easy for users to use existing critics, - and also facilitates the extension of new critic types. 
- - Args: - observation_space: observation space - action_space: action space - standardized_obs: whether to standardize observation - shared_weights: whether to share weights between actor and critic - model_cfgs: model configurations - weight_initialization_mode: weight initialization mode - device: device, cpu or cuda - """ - super().__init__() - - self.obs_shape = observation_space.shape - self.act_dim = action_space.shape[-1] if isinstance(action_space, Box) else action_space.n - self.ac_kwargs = model_cfgs.ac_kwargs - # build policy and value functions - self.act_space_type = 'discrete' if isinstance(action_space, Discrete) else 'continuous' - self.obs_dim = observation_space.shape[0] - - # Use for shared weights - layer_units = [self.obs_dim] + model_cfgs.ac_kwargs.pi.hidden_sizes - - activation = model_cfgs.ac_kwargs.pi.activation - if model_cfgs.shared_weights: - shared = build_mlp_network( - layer_units, - activation=activation, - weight_initialization_mode=model_cfgs.weight_initialization_mode, - output_activation=activation, - ) - else: - shared = None - actor_builder = ActorBuilder( - obs_dim=self.obs_dim, - act_dim=self.act_dim, - weight_initialization_mode=model_cfgs.weight_initialization_mode, - shared=shared, - **model_cfgs.ac_kwargs.pi, - ) - - if model_cfgs.actor_type == 'cholesky': - self.actor = actor_builder.build_actor( - model_cfgs.actor_type, - act_max=torch.as_tensor(action_space.high), - act_min=torch.as_tensor(action_space.low), - cov_min=model_cfgs.cov_min, - mu_clamp_min=model_cfgs.mu_clamp_min, - mu_clamp_max=model_cfgs.mu_clamp_max, - cov_clamp_min=model_cfgs.cov_clamp_min, - cov_clamp_max=model_cfgs.cov_clamp_max, - ) - elif self.act_space_type == 'discrete': - self.actor = actor_builder.build_actor('categorical') - else: - act_max = torch.as_tensor(action_space.high) - act_min = torch.as_tensor(action_space.low) - self.actor = actor_builder.build_actor( - model_cfgs.actor_type, - act_max=act_max, - act_min=act_min, - ) - - self.critic = QCritic( - self.obs_dim, - self.act_dim, - hidden_sizes=model_cfgs.ac_kwargs.val.hidden_sizes, - activation=model_cfgs.ac_kwargs.val.activation, - weight_initialization_mode=model_cfgs.weight_initialization_mode, - shared=shared, - num_critics=model_cfgs.ac_kwargs.val.num_critics, - action_type='continuous' if isinstance(action_space, Box) else 'discrete', - ) - - def forward(self, obs: torch.Tensor) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: - """Forward pass of the actor-critic model""" - return self.step(obs) - - def step( - self, obs: torch.Tensor, deterministic: bool = False - ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: - """Step function of the actor-critic model - - Input observation, output value (from :class:`Critic`) action, - and its log probability (from :class`Actor`). - - .. note:: - The observation is standardized by the running mean and standard deviation. - - Args: - obs (torch.Tensor): Observation. - deterministic (bool, optional): Whether to use deterministic action. - """ - with torch.no_grad(): - raw_action, action, logp_a = self.actor.predict( - obs, deterministic=deterministic, need_log_prob=True - ) - value = self.critic(obs, action)[0] - - return raw_action, action, value, logp_a - - def anneal_exploration(self, frac: float) -> None: - """Update internals of actors - - Updates exploration parameters for Gaussian actors update log_std - - Args: - frac: progress of epochs. 1.0 is the end of training. 
- """ - if hasattr(self.actor, 'set_std'): - self.actor.set_std(1 - frac) diff --git a/omnisafe/models/base.py b/omnisafe/models/base.py index da5fb4511..784c7723a 100644 --- a/omnisafe/models/base.py +++ b/omnisafe/models/base.py @@ -14,17 +14,18 @@ # ============================================================================== """This module contains some base abstract classes for the models.""" -import abc -from typing import List, Tuple, Union +from abc import ABC, abstractmethod +from typing import List import torch import torch.nn as nn -from torch.distributions.normal import Normal +from gymnasium import spaces +from torch.distributions import Distribution -from omnisafe.utils.model_utils import Activation, InitFunction +from omnisafe.typing import Activation, InitFunction, OmnisafeSpace -class Actor(abc.ABC, nn.Module): +class Actor(ABC, nn.Module): """A abstract class for actor. An actor approximates the policy function that maps observations to actions. @@ -38,34 +39,45 @@ class Actor(abc.ABC, nn.Module): # pylint: disable-next=too-many-arguments def __init__( self, - obs_dim: int, - act_dim: int, - hidden_sizes: list, + obs_space: OmnisafeSpace, + act_space: OmnisafeSpace, + hidden_sizes: List[int], activation: Activation = 'relu', - weight_initialization_mode: InitFunction = 'xavier_uniform', - shared: nn.Module = None, + weight_initialization_mode: InitFunction = 'kaiming_uniform', ) -> None: """Initialize the base actor. Args: - obs_dim (int): observation dimension. - act_dim (int): action dimension. + obs_space (OmnisafeSpace): observation space. + act_space (OmnisafeSpace): action space. hidden_sizes (list): hidden layer sizes. activation (Activation): activation function. weight_initialization_mode (InitFunction, optional): weight initialization mode. - Defaults to ``xavier_uniform``. + Defaults to ``kaiming_uniform``. shared (nn.Module, optional): shared module. Defaults to None. """ nn.Module.__init__(self) - self.obs_dim = obs_dim - self.act_dim = act_dim - self.shared = shared - self.weight_initialization_mode = weight_initialization_mode - self.activation = activation - self.hidden_sizes = hidden_sizes - - @abc.abstractmethod - def _distribution(self, obs) -> Normal: + self._obs_space = obs_space + self._act_space = act_space + self._weight_initialization_mode = weight_initialization_mode + self._activation = activation + self._hidden_sizes = hidden_sizes + + self._current_dist: Distribution + self._after_inference: bool = False + + if isinstance(self._obs_space, spaces.Box) and len(self._obs_space.shape) == 1: + self._obs_dim = self._obs_space.shape[0] + else: + raise NotImplementedError + + if isinstance(self._act_space, spaces.Box) and len(self._act_space.shape) == 1: + self._act_dim = self._act_space.shape[0] + else: + raise NotImplementedError + + @abstractmethod + def _distribution(self, obs: torch.Tensor) -> Distribution: r"""Return the distribution of action. An actor generates a distribution, which is used to sample actions during training. @@ -86,15 +98,28 @@ def _distribution(self, obs) -> Normal: Args: obs (torch.Tensor): observation. + + Returns: + Distribution: the distribution of action. + """ + + @abstractmethod + def forward(self, obs: torch.Tensor) -> Distribution: + r"""Return the distribution of action. + + Args: + obs (torch.Tensor): observation. + + Returns: + Distribution: the distribution of action. 
""" - @abc.abstractmethod + @abstractmethod def predict( self, obs: torch.Tensor, deterministic: bool = False, - need_log_prob: bool = False, - ) -> Union[Tuple[torch.Tensor, torch.Tensor], torch.Tensor]: + ) -> torch.Tensor: r"""Predict deterministic or stochastic action based on observation. - ``deterministic`` = ``True`` or ``False`` @@ -108,13 +133,6 @@ def predict( we want to know the actual action that the agent will take, so we should use deterministic actions (set ``deterministic`` = ``True``). - - ``need_log_prob`` = ``True`` or ``False`` - - In some cases, we need to calculate the log probability of the action, - which is used to calculate the loss of the actor. - For example, in the case of Policy Gradient, - the loss is defined as - .. math:: L = -\mathbb{E}_{s \sim p(s)} [\log p(a | s) A^R (s, a)] @@ -126,11 +144,24 @@ def predict( Args: obs (torch.Tensor): observation. deterministic (bool, optional): whether to predict deterministic action. Defaults to False. - need_log_prob (bool, optional): whether to return log probability of action. Defaults to False. + """ + + @abstractmethod + def log_prob(self, act: torch.Tensor) -> torch.Tensor: + r"""Return the log probability of action under the distribution. + + log_prob only can be called after calling ``predict`` or ``forward``. + + Args: + obs (torch.Tensor): observation. + act (torch.Tensor): action. + + Returns: + torch.Tensor: the log probability of action under the distribution. """ -class Critic(abc.ABC, nn.Module): +class Critic(ABC, nn.Module): """A abstract class for critic. A critic approximates the value function that maps observations to values. @@ -147,46 +178,40 @@ class Critic(abc.ABC, nn.Module): # pylint: disable-next=too-many-arguments def __init__( self, - obs_dim: int, - act_dim: int, - hidden_sizes: list, + obs_space: OmnisafeSpace, + act_space: OmnisafeSpace, + hidden_sizes: List[int], activation: Activation = 'relu', - weight_initialization_mode: InitFunction = 'xavier_uniform', - shared: nn.Module = None, + weight_initialization_mode: InitFunction = 'kaiming_uniform', + num_critics: int = 1, + use_obs_encoder: bool = False, ) -> None: """Initialize the base critic. Args: - obs_dim (int): observation dimension. - act_dim (int): action dimension. + obs_space (OmnisafeSpace): observation space. + act_space (OmnisafeSpace): action space. hidden_sizes (list): hidden layer sizes. activation (Activation, optional): activation function. Defaults to 'relu'. weight_initialization_mode (InitFunction, optional): weight initialization mode. - Defaults to 'xavier_uniform'. + Defaults to 'kaiming_uniform'. shared (nn.Module, optional): shared module. Defaults to None. """ nn.Module.__init__(self) - self.obs_dim = obs_dim - self.act_dim = act_dim - self.shared = shared - self.weight_initialization_mode = weight_initialization_mode - self.activation = activation - self.hidden_sizes = hidden_sizes - - @abc.abstractmethod - def forward( - self, - obs: torch.Tensor, - act: torch.Tensor = None, - ) -> Union[torch.Tensor, List]: - """Forward function for critic. - - .. note:: - This forward function has two modes: - - If ``act`` is not None, it will return the value of the observation-action pair. - - If ``act`` is None, it will return the value of the observation. - - Args: - obs (torch.Tensor): observation. - act (torch.Tensor, optional): action. Defaults to None. 
- """ + self._obs_space = obs_space + self._act_space = act_space + self._weight_initialization_mode = weight_initialization_mode + self._activation = activation + self._hidden_sizes = hidden_sizes + self._num_critics = num_critics + self._use_obs_encoder = use_obs_encoder + + if isinstance(self._obs_space, spaces.Box) and len(self._obs_space.shape) == 1: + self._obs_dim = self._obs_space.shape[0] + else: + raise NotImplementedError + + if isinstance(self._act_space, spaces.Box) and len(self._act_space.shape) == 1: + self._act_dim = self._act_space.shape[0] + else: + raise NotImplementedError diff --git a/omnisafe/models/constraint_actor_critic.py b/omnisafe/models/constraint_actor_critic.py deleted file mode 100644 index 8c12dc0b6..000000000 --- a/omnisafe/models/constraint_actor_critic.py +++ /dev/null @@ -1,113 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Implementation of ConstraintActorCritic.""" - -from typing import NamedTuple, Tuple - -import numpy as np -import torch -from gymnasium.spaces import Box - -from omnisafe.models.actor_critic import ActorCritic -from omnisafe.models.critic import CriticBuilder - - -class ConstraintActorCritic(ActorCritic): - """ConstraintActorCritic is a wrapper around ActorCritic that adds a cost critic to the model. - - In ``omnisafe``, we combine the actor and critic into one this class. - - .. list-table:: - - * - Model - - Description - - Function - * - Actor - - The policy network, input is observation, output is action. - Choose the actor from the following options: - :class:`MLPActor`, :class:`CategoricalActor`, :class:`GaussianAnnealingActor`, - :class:`GaussianLearningActor`, :class:`GaussianStdNetActor`, :class:`MLPCholeskyActor`. - - Choose the action based on the observation. - * - Reward Critic - - The value network, input is observation, - output is reward value. - Choose the critic from the following options: - :class:`QCritic`, :class:`VCritic`. - - Estimate the reward value of the observation. - * - Cost Critic - - The value network, input is observation, - output is cost value. - Choose the critic from the following options: - :class:`QCritic`, :class:`VCritic`. - - Estimate the cost value of the observation. - """ - - # pylint: disable-next=too-many-arguments - def __init__( - self, - observation_space: Box, - action_space: Box, - model_cfgs: NamedTuple, - ) -> None: - """Initialize ConstraintActorCritic - - Args: - observation_space (Box): Observation space. - action_space (Box): Action space. - standardized_obs (bool): Whether to standardize the observation. - scale_rewards (bool): Whether to scale the rewards. - model_cfgs (NamedTuple): Model configurations. 
- """ - ActorCritic.__init__( - self, - observation_space, - action_space, - model_cfgs, - ) - - critic_builder = CriticBuilder( - obs_dim=self.obs_dim, - act_dim=self.act_dim, - hidden_sizes=self.ac_kwargs.val.hidden_sizes, - activation=self.ac_kwargs.val.activation, - weight_initialization_mode=self.model_cfgs.weight_initialization_mode, - shared=self.shared, - ) - self.cost_critic = critic_builder.build_critic('v') - - def step( - self, obs: torch.Tensor, deterministic: bool = False - ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray,]: - """Step function of the actor-critic model - - Input observation, output reward and cost value (from :class:`Critic`) action, - and its log probability (from :class`Actor`). - - .. note:: - The observation is standardized by the running mean and standard deviation. - - Args: - obs (torch.Tensor): Observation. - deterministic (bool, optional): Whether to use deterministic action. - """ - with torch.no_grad(): - value = self.reward_critic(obs) - cost_value = self.cost_critic(obs) - - raw_action, action, logp_a = self.actor.predict( - obs, deterministic=deterministic, need_log_prob=True - ) - - return raw_action, action, value, cost_value, logp_a diff --git a/omnisafe/models/constraint_actor_q_critic.py b/omnisafe/models/constraint_actor_q_critic.py deleted file mode 100644 index 7050f0840..000000000 --- a/omnisafe/models/constraint_actor_q_critic.py +++ /dev/null @@ -1,109 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Implementation of ConstraintActorQCritic.""" - -from typing import NamedTuple, Tuple - -import numpy as np -import torch -from gymnasium.spaces import Box - -from omnisafe.models.actor_q_critic import ActorQCritic -from omnisafe.models.critic.q_critic import QCritic - - -class ConstraintActorQCritic(ActorQCritic): - """ConstraintActorCritic is a wrapper around ActorCritic that adds a cost critic to the model. - - In ``omnisafe``, we combine the actor and critic into one this class. - - .. list-table:: - - * - Model - - Description - - Function - * - Actor - - The policy network, input is observation, output is action. - Choose the actor from the following options: - :class:`MLPActor`, :class:`CategoricalActor`, :class:`GaussianAnnealingActor`, - :class:`GaussianLearningActor`, :class:`GaussianStdNetActor`, :class:`MLPCholeskyActor`. - - Choose the action based on the observation. - * - Reward Q-Critic - - The value network, input is observation-action pair, - output is reward value. - Choose the critic from the following options: - :class:`QCritic`, :class:`VCritic`. - - Estimate the reward value of the observation. - * - Cost Q-Critic - - The value network, input is observation-action pair, - output is cost value. - Choose the critic from the following options: - :class:`QCritic`, :class:`VCritic`. - - Estimate the cost value of the observation. 
- """ - - # pylint: disable-next=too-many-arguments - def __init__( - self, - observation_space: Box, - action_space: Box, - model_cfgs: NamedTuple, - ) -> None: - """Initialize ConstraintActorQCritic. - - Args: - observation_space: The observation space. - action_space: The action space. - standardized_obs: Whether to standardize the observation. - model_cfgs: The model configurations. - """ - - super().__init__( - observation_space=observation_space, - action_space=action_space, - model_cfgs=model_cfgs, - ) - self.cost_critic = QCritic( - obs_dim=self.obs_dim, - act_dim=self.act_dim, - hidden_sizes=self.ac_kwargs.val.hidden_sizes, - activation=self.ac_kwargs.val.activation, - weight_initialization_mode=model_cfgs.weight_initialization_mode, - shared=model_cfgs.shared_weights, - ) - - def step( - self, obs: torch.Tensor, deterministic: bool = False - ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray,]: - """Step function of the actor-critic model - - Input observation-action pair, output reward and cost value (from :class:`QCritic`) action, - and its log probability (from :class`Actor`). - - .. note:: - The observation is standardized by the running mean and standard deviation. - - Args: - obs (torch.Tensor): Observation. - deterministic (bool, optional): Whether to use deterministic action. - """ - with torch.no_grad(): - raw_action, action, logp_a = self.actor.predict( - obs, deterministic=deterministic, need_log_prob=True - ) - value = self.critic(obs, action)[0] - cost_value = self.cost_critic(obs, action)[0] - - return raw_action, action, value, cost_value, logp_a diff --git a/omnisafe/models/critic/critic_builder.py b/omnisafe/models/critic/critic_builder.py index 733e99478..c3aa69f7f 100644 --- a/omnisafe/models/critic/critic_builder.py +++ b/omnisafe/models/critic/critic_builder.py @@ -14,13 +14,13 @@ # ============================================================================== """Implementation of CriticBuilder.""" -from typing import Union - -import torch.nn as nn +import difflib +from typing import List +from omnisafe.models.base import Critic from omnisafe.models.critic.q_critic import QCritic from omnisafe.models.critic.v_critic import VCritic -from omnisafe.utils.model_utils import Activation, InitFunction +from omnisafe.typing import Activation, CriticType, InitFunction, OmnisafeSpace # pylint: disable-next=too-few-public-methods @@ -40,35 +40,37 @@ class CriticBuilder: # pylint: disable-next=too-many-arguments def __init__( self, - obs_dim: int, - act_dim: int, - hidden_sizes: list, + obs_space: OmnisafeSpace, + act_space: OmnisafeSpace, + hidden_sizes: List[int], activation: Activation = 'relu', weight_initialization_mode: InitFunction = 'kaiming_uniform', - shared: nn.Module = None, + num_critics: int = 1, + use_obs_encoder: bool = False, ) -> None: """Initialize CriticBuilder. Args: - obs_dim (int): Observation dimension. - act_dim (int): Action dimension. - hidden_sizes (list): Hidden layer sizes. + obs_space (OmnisafeSpace): Observation space. + act_space (OmnisafeSpace): Action space. + hidden_sizes (List[int]): Hidden sizes of the critic network. activation (Activation): Activation function. weight_initialization_mode (InitFunction): Weight initialization mode. - shared (nn.Module): Shared network. + num_critics (int): Number of critics. + use_obs_encoder (bool): Whether to use observation encoder, only used in q critic. 
""" - self.obs_dim = obs_dim - self.act_dim = act_dim - self.hidden_sizes = hidden_sizes - self.activation = activation - self.weight_initialization_mode = weight_initialization_mode - self.shared = shared + self._obs_space = obs_space + self._act_space = act_space + self._weight_initialization_mode = weight_initialization_mode + self._activation = activation + self._hidden_sizes = hidden_sizes + self._num_critics = num_critics + self._use_obs_encoder = use_obs_encoder def build_critic( self, - critic_type: str, - use_obs_encoder: bool = True, - ) -> Union[QCritic, VCritic, NotImplementedError]: + critic_type: CriticType, + ) -> Critic: """Build critic. Currently, we support two types of critics: ``q`` and ``v``. @@ -79,22 +81,25 @@ def build_critic( """ if critic_type == 'q': return QCritic( - obs_dim=self.obs_dim, - act_dim=self.act_dim, - hidden_sizes=self.hidden_sizes, - activation=self.activation, - weight_initialization_mode=self.weight_initialization_mode, - shared=self.shared, - use_obs_encoder=use_obs_encoder, + obs_space=self._obs_space, + act_space=self._act_space, + hidden_sizes=self._hidden_sizes, + activation=self._activation, + weight_initialization_mode=self._weight_initialization_mode, + num_critics=self._num_critics, + use_obs_encoder=self._use_obs_encoder, ) if critic_type == 'v': return VCritic( - obs_dim=self.obs_dim, - act_dim=self.act_dim, - hidden_sizes=self.hidden_sizes, - activation=self.activation, - weight_initialization_mode=self.weight_initialization_mode, - shared=self.shared, + obs_space=self._obs_space, + act_space=self._act_space, + hidden_sizes=self._hidden_sizes, + activation=self._activation, + weight_initialization_mode=self._weight_initialization_mode, + num_critics=self._num_critics, ) - raise NotImplementedError(f'critic_type "{critic_type}" is not implemented.') + raise NotImplementedError( + f'critic_type "{critic_type}" is not implemented.' + f'Did you mean one of {difflib.get_close_matches(critic_type, ["q", "v"])[0]}?' + ) diff --git a/omnisafe/models/critic/q_critic.py b/omnisafe/models/critic/q_critic.py index 06f95ef48..5c14f5056 100644 --- a/omnisafe/models/critic/q_critic.py +++ b/omnisafe/models/critic/q_critic.py @@ -13,13 +13,15 @@ # limitations under the License. # ============================================================================== """Implementation of QCritic.""" -from typing import List, Optional + +from typing import List import torch import torch.nn as nn from omnisafe.models.base import Critic -from omnisafe.utils.model_utils import Activation, InitFunction, build_mlp_network +from omnisafe.typing import Activation, InitFunction, OmnisafeSpace +from omnisafe.utils.model import build_mlp_network class QCritic(Critic): @@ -33,24 +35,22 @@ class QCritic(Critic): # pylint: disable-next=too-many-arguments def __init__( self, - obs_dim: int, - act_dim: int, - hidden_sizes: list, + obs_space: OmnisafeSpace, + act_space: OmnisafeSpace, + hidden_sizes: List[int], activation: Activation = 'relu', - weight_initialization_mode: InitFunction = 'xavier_uniform', - shared: nn.Module = None, + weight_initialization_mode: InitFunction = 'kaiming_uniform', num_critics: int = 1, - use_obs_encoder: bool = True, - action_type: str = 'continuous', + use_obs_encoder: bool = False, ) -> None: """Initialize the critic network. The Q critic network has two modes: - ``use_obs_encoder`` = ``False`` : - The input of the network is the concatenation of the observation and action. 
+ The input of the network is the concatenation of the observation and action. - ``use_obs_encoder`` = ``True`` : - The input of the network is the concatenation of the output of the observation encoder and action. + The input of the network is the concatenation of the output of the observation encoder and action. For example, in :class:`DDPG`, the action is not directly concatenated with the observation, @@ -63,56 +63,55 @@ def __init__( you need to use the index to get it. Args: - obs_dim (int): Observation dimension. - act_dim (int): Action dimension. - hidden_sizes (list): Hidden layer sizes. - activation (Activation): Activation function. - weight_initialization_mode (InitFunction): Weight initialization mode. - shared (nn.Module): Shared network. - num_critics (int): Number of critics. - use_obs_encoder (bool): Whether to use observation encoder. + obs_space (OmnisafeSpace): observation space. + act_space (OmnisafeSpace): action space. + hidden_sizes (list): list of hidden layer sizes. + activation (Activation): activation function. + weight_initialization_mode (InitFunction): weight initialization mode. + shared (nn.Module): shared network. + num_critics (int): number of critics. + use_obs_encoder (bool): whether to use observation encoder. + """ - self.use_obs_encoder = use_obs_encoder - Critic.__init__( - self, - obs_dim=obs_dim, - act_dim=act_dim, - hidden_sizes=hidden_sizes, - activation=activation, - weight_initialization_mode=weight_initialization_mode, - shared=shared, + super().__init__( + obs_space, + act_space, + hidden_sizes, + activation, + weight_initialization_mode, + num_critics, + use_obs_encoder, ) - self.critic_list = [] - expand_dim = act_dim if action_type == 'continuous' else 1 - for idx in range(num_critics): - if self.use_obs_encoder: + self.net_lst: List[nn.Module] = [] + for idx in range(self._num_critics): + if self._use_obs_encoder: obs_encoder = build_mlp_network( - [obs_dim, hidden_sizes[0]], + [self._obs_dim, hidden_sizes[0]], activation=activation, output_activation=activation, weight_initialization_mode=weight_initialization_mode, ) net = build_mlp_network( - [hidden_sizes[0] + expand_dim] + hidden_sizes[1:] + [1], + [hidden_sizes[0] + self._act_dim] + hidden_sizes[1:] + [1], activation=activation, weight_initialization_mode=weight_initialization_mode, ) critic = nn.Sequential(obs_encoder, net) else: net = build_mlp_network( - [obs_dim + act_dim] + hidden_sizes[:] + [1], + [self._obs_dim + self._act_dim] + hidden_sizes + [1], activation=activation, weight_initialization_mode=weight_initialization_mode, ) critic = nn.Sequential(net) - self.critic_list.append(critic) + self.net_lst.append(critic) self.add_module(f'critic_{idx}', critic) def forward( self, obs: torch.Tensor, - act: Optional[torch.Tensor] = None, - ) -> List: + act: torch.Tensor, + ) -> List[torch.Tensor]: """Forward function. As a multi-critic network, the output of the network is a list of Q-values. @@ -125,10 +124,6 @@ def forward( act (torch.Tensor): Action. 
""" res = [] - for critic in self.critic_list: - if self.use_obs_encoder: - encodered_obs = critic[0](obs) - res.append(torch.squeeze(critic[1](torch.cat([encodered_obs, act], dim=-1)), -1)) - else: - res.append(torch.squeeze(critic[0](torch.cat([obs, act], dim=-1)), -1)) + for critic in self.net_lst: + res.append(torch.squeeze(critic(torch.cat([obs, act], dim=-1)), -1)) return res diff --git a/omnisafe/models/critic/v_critic.py b/omnisafe/models/critic/v_critic.py index 19c6aa022..5e8d859ba 100644 --- a/omnisafe/models/critic/v_critic.py +++ b/omnisafe/models/critic/v_critic.py @@ -14,11 +14,14 @@ # ============================================================================== """Implementation of VCritic.""" +from typing import List + import torch import torch.nn as nn from omnisafe.models.base import Critic -from omnisafe.utils.model_utils import Activation, InitFunction, build_mlp_network +from omnisafe.typing import Activation, InitFunction, OmnisafeSpace +from omnisafe.utils.model import build_mlp_network class VCritic(Critic): @@ -29,15 +32,14 @@ class VCritic(Critic): You can design your own V-function approximator by inheriting this class or :class:`Critic`. """ - # pylint: disable-next=too-many-arguments def __init__( self, - obs_dim: int, - act_dim: int, - hidden_sizes: list, + obs_space: OmnisafeSpace, + act_space: OmnisafeSpace, + hidden_sizes: List[int], activation: Activation = 'relu', - weight_initialization_mode: InitFunction = 'xavier_uniform', - shared: nn.Module = None, + weight_initialization_mode: InitFunction = 'kaiming_uniform', + num_critics: int = 1, ) -> None: """Initialize the critic network. @@ -49,41 +51,37 @@ def __init__( weight_initialization_mode (InitFunction): Weight initialization mode. shared (nn.Module): Shared network. """ - Critic.__init__( - self, - obs_dim=obs_dim, - act_dim=act_dim, - hidden_sizes=hidden_sizes, - activation=activation, - weight_initialization_mode=weight_initialization_mode, - shared=shared, + super().__init__( + obs_space, + act_space, + hidden_sizes, + activation, + weight_initialization_mode, + num_critics, + use_obs_encoder=False, ) - if shared is not None: - value_head = build_mlp_network( - sizes=[hidden_sizes[-1], 1], - activation=activation, - weight_initialization_mode=weight_initialization_mode, - ) - self.net = nn.Sequential(shared, value_head) - else: - self.net = build_mlp_network( - [obs_dim] + list(hidden_sizes) + [1], - activation=activation, - weight_initialization_mode=weight_initialization_mode, + self.net_lst: List[nn.Module] = [] + for idx in range(self._num_critics): + net = build_mlp_network( + sizes=[self._obs_dim, *self._hidden_sizes, 1], + activation=self._activation, + weight_initialization_mode=self._weight_initialization_mode, ) - self.add_module('critic', self.net) + self.net_lst.append(net) + self.add_module(f'critic_{idx}', net) def forward( self, obs: torch.Tensor, - act: torch.Tensor = None, - ) -> torch.Tensor: + ) -> List[torch.Tensor]: """Forward function. Specifically, V function approximator maps observations to V-values. Args: obs (torch.Tensor): Observations. - act (torch.Tensor): Actions. 
""" - return torch.squeeze(self.net(obs), -1) + res = [] + for critic in self.net_lst: + res.append(torch.squeeze(critic(obs), -1)) + return res diff --git a/omnisafe/typing.py b/omnisafe/typing.py index a5369fcbb..48e81c4eb 100644 --- a/omnisafe/typing.py +++ b/omnisafe/typing.py @@ -36,6 +36,8 @@ Activation = Literal['identity', 'relu', 'sigmoid', 'softplus', 'tanh'] AdvatageEstimator = Literal['gae', 'gae-rtg', 'vtrace', 'plain'] InitFunction = Literal['kaiming_uniform', 'xavier_normal', 'glorot', 'xavier_uniform', 'orthogonal'] +CriticType = Literal['v', 'q'] +ActorType = Literal['gaussian_learning', 'gaussian_sac'] __all__ = [ 'Activation', diff --git a/omnisafe/utils/config.py b/omnisafe/utils/config.py index 34ff4acaa..5eed962a6 100644 --- a/omnisafe/utils/config.py +++ b/omnisafe/utils/config.py @@ -20,7 +20,7 @@ import yaml -from omnisafe.typing import Activation, AdvatageEstimator, InitFunction +from omnisafe.typing import Activation, ActorType, AdvatageEstimator, InitFunction class Config(dict): @@ -54,21 +54,7 @@ class Config(dict): max_grad_norm: float use_critic_norm: bool critic_norm_coeff: bool - model_cfgs: 'Config' - shared_weights: bool - weight_initialization_mode: InitFunction - actor_type: str - ac_kwargs: 'Config' - pi: 'Config' - hidden_sizes: List[int] - activation: Activation - output_activation: Activation - scale_action: bool - clip_action: bool - std_learning: bool - std_init: float - val: 'Config' - num_critics: int + model_cfgs: 'ModelConfig' buffer_cfgs: 'Config' gamma: float lam: float @@ -100,6 +86,10 @@ def __getattr__(self, name: str) -> Any: except KeyError: return super().__getattribute__(name) + def __setattr__(self, name: str, value: Any) -> None: + """Set attribute.""" + self[name] = value + def todict(self) -> dict: """Convert Config to dictionary.""" config_dict = {} @@ -145,6 +135,20 @@ def recurisve_update(self, update_args: Dict[str, Any]) -> None: self[key] = value +class ModelConfig(Config): + """Model config.""" + + weight_initialization_mode: InitFunction + actor_type: ActorType + actor: 'ModelConfig' + critic: 'ModelConfig' + hidden_sizes: List[int] + activation: Activation + std: List[float] + use_obs_encoder: bool + lr: float + + def get_default_kwargs_yaml(algo: str, env_id: str, algo_type: str) -> Config: """Get the default kwargs from ``yaml`` file. diff --git a/omnisafe/utils/core.py b/omnisafe/utils/core.py deleted file mode 100644 index 3329abcf4..000000000 --- a/omnisafe/utils/core.py +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Some Core Functions""" - -from typing import Union - -import torch - -from omnisafe.models import ConstraintActorCritic, ConstraintActorQCritic - - -def set_optimizer( - opt: str, module: Union[ConstraintActorCritic, ConstraintActorQCritic], learning_rate: float -) -> torch.optim.Optimizer: - """Returns an initialized optimizer from PyTorch. - - .. note:: - - The optimizer can be chosen from the following list: - - - Adam - - AdamW - - Adadelta - - Adagrad - - Adamax - - ASGD - - LBFGS - - RMSprop - - Rprop - - SGD - - Args: - opt (str): optimizer name. - module (torch.nn.Module): module to be optimized. - learning_rate (float): learning rate. - """ - assert hasattr(torch.optim, opt), f'Optimizer={opt} not found in torch.' - optimizer = getattr(torch.optim, opt) - - return optimizer(module.parameters(), lr=learning_rate, eps=1e-5) - - -def discount_cumsum_torch(x_vector: torch.Tensor, discount: float) -> torch.Tensor: - """Compute the discounted cumulative sum of vectors.""" - length = x_vector.shape[0] - x_vector = x_vector.type(torch.float64) - for idx in reversed(range(length)): - if idx == length - 1: - cumsum = x_vector[idx] - else: - cumsum = x_vector[idx] + discount * cumsum - x_vector[idx] = cumsum - return x_vector diff --git a/omnisafe/utils/distributed_utils.py b/omnisafe/utils/distributed.py similarity index 64% rename from omnisafe/utils/distributed_utils.py rename to omnisafe/utils/distributed.py index cf2e88000..615784566 100644 --- a/omnisafe/utils/distributed_utils.py +++ b/omnisafe/utils/distributed.py @@ -17,7 +17,7 @@ import os import subprocess import sys -from typing import Tuple +from typing import Any, Tuple, Union import numpy as np import torch @@ -25,57 +25,54 @@ from torch.distributed import ReduceOp -def setup_torch_for_mpi(): +def setup_distributed() -> None: """Avoid slowdowns caused by each separate process's PyTorch, using more than its fair share of CPU resources. """ old_num_threads = torch.get_num_threads() # decrease number of torch threads for MPI - if old_num_threads > 1 and num_procs() > 1: - fair_num_threads = max(int(torch.get_num_threads() / num_procs()), 1) + if old_num_threads > 1 and world_size() > 1: + fair_num_threads = max(int(torch.get_num_threads() / world_size()), 1) torch.set_num_threads(fair_num_threads) print( - f'Proc {proc_id()}: Decreased number of Torch threads from ' + f'Proc {get_rank()}: Decreased number of Torch threads from ' f'{old_num_threads} to {torch.get_num_threads()}', flush=True, ) -def mpi_avg_grads(module: torch.nn.Module) -> None: - """Average contents of gradient buffers across MPI processes. +def get_rank() -> int: + """Get rank of calling process.""" + if os.getenv('MASTER_ADDR') is None: + return 0 + return dist.get_rank() - Args: - module (torch.nn.Module): module to be averaged. - """ - if num_procs() > 1: - for parameter in module.parameters(): - p_grad_numpy = parameter.grad - avg_p_grad = mpi_avg(parameter.grad) - p_grad_numpy[:] = avg_p_grad[:] +def is_master() -> bool: + """Test whether the process is the root process.""" + return bool(get_rank() == 0) -def sync_params(module: torch.nn.Module) -> None: - """Sync all parameters of module across all MPI processes. - .. note:: +def world_size() -> int: + """Count active MPI processes.""" + if os.getenv('MASTER_ADDR') is None: + return 1 + return dist.get_world_size() - This function only works when the training is multi-processing. 
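The removed ``set_optimizer`` helper resolved optimizers by name with ``getattr``; the same pattern in standalone form::

    import torch
    from torch import nn

    opt_name, lr = 'Adam', 3e-4
    module = nn.Linear(4, 2)
    assert hasattr(torch.optim, opt_name), f'Optimizer={opt_name} not found in torch.'
    optimizer = getattr(torch.optim, opt_name)(module.parameters(), lr=lr, eps=1e-5)
    print(type(optimizer).__name__)   # Adam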
- Args: - module (torch.nn.Module): module to be synchronized. - """ - if num_procs() > 1: - for parameter in module.parameters(): - p_numpy = parameter.data - broadcast(p_numpy) +reduce = dist.reduce +allreduce = dist.all_reduce +gather = dist.gather +allgather = dist.all_gather +broadcast = dist.broadcast +scatter = dist.scatter -def mpi_fork( +def fork( parallel: int, bind_to_core: bool = False, use_number_of_threads: bool = False, device: str = 'cpu', - test_message: list = None, ) -> bool: """The entrance of multi-processing. @@ -93,10 +90,10 @@ def mpi_fork( bind_to_core (bool, optional): Defaults to False. use_number_of_threads (bool, optional): Defaults to False. """ - is_parent = False - back_end = 'gloo' if device == 'cpu' else 'nccl' + is_parent: bool = False + backend = 'gloo' if device == 'cpu' else 'nccl' if os.getenv('MASTER_ADDR') is not None and os.getenv('IN_DIST') is None: - dist.init_process_group(backend=back_end) + dist.init_process_group(backend=backend) os.environ['IN_DIST'] = '1' # check if MPI is already setup.. if parallel > 1 and os.getenv('MASTER_ADDR') is None: @@ -116,36 +113,15 @@ def mpi_fork( args += ['-bind-to', 'core'] if use_number_of_threads: args += ['--use-hwthread-cpus'] - args += test_message or sys.argv + args += sys.argv + print(sys.argv) # this is the parent process, spawn sub-processes.. subprocess.check_call(args, env=env) is_parent = True return is_parent -def is_root_process() -> bool: - """Test whether the process is the root process.""" - return bool(dist.get_rank() == 0) - - -def proc_id() -> int: - """Get rank of calling process.""" - if os.getenv('MASTER_ADDR') is None: - return 0 - return dist.get_rank() - - -def allreduce(*args, **kwargs) -> torch.Tensor: - """Allreduce operation.""" - return dist.all_reduce(*args, **kwargs) - - -def gather(*args, **kwargs) -> torch.Tensor: - """Gather operation.""" - return dist.gather(*args, **kwargs) - - -def mpi_avg_torch_tensor(value: torch.Tensor) -> None: +def avg_tensor(value: torch.Tensor) -> None: """Average a torch tensor over MPI processes. Since torch and numpy share same memory space, tensors of dim > 0 can be be manipulated through call by reference, @@ -154,40 +130,76 @@ def mpi_avg_torch_tensor(value: torch.Tensor) -> None: value (torch.Tensor): value to be averaged. """ assert isinstance(value, torch.Tensor) - if num_procs() > 1: + if world_size() > 1: assert len(value.shape) > 0 - avg_x = mpi_avg(value) + avg_x = dist_avg(value) value[:] = avg_x[:] -def num_procs() -> int: - """Count active MPI processes.""" - if os.getenv('MASTER_ADDR') is None: - return 1 - return dist.get_world_size() +def avg_grads(module: torch.nn.Module) -> None: + """Average contents of gradient buffers across MPI processes. + + Args: + module (torch.nn.Module): module to be averaged. + """ + if world_size() > 1: + for parameter in module.parameters(): + if parameter.grad is not None: + p_grad = parameter.grad + avg_p_grad = dist_avg(parameter.grad) + p_grad[:] = avg_p_grad[:] + + +def sync_params(module: torch.nn.Module) -> None: + """Sync all parameters of module across all MPI processes. + + .. note:: + This function only works when the training is multi-processing. -def broadcast(value: torch.Tensor, src: int = 0) -> torch.Tensor: - """Broadcast.""" - dist.broadcast(value, src=src) + Args: + module (torch.nn.Module): module to be synchronized. 
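A usage sketch for the renamed distributed helpers, assuming the ``omnisafe.utils.distributed`` module as introduced by this patch; with no ``MASTER_ADDR`` set it runs as a single process and the collective calls are no-ops::

    import torch
    from torch import nn

    from omnisafe.utils import distributed

    model = nn.Linear(4, 2)
    distributed.sync_params(model)      # broadcast initial weights from rank 0

    loss = model(torch.randn(4)).sum()
    loss.backward()
    distributed.avg_grads(model)        # average gradients across processes

    if distributed.is_master():
        print(f'rank {distributed.get_rank()} of {distributed.world_size()}')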
+ """ + if world_size() > 1: + for parameter in module.parameters(): + p_numpy = parameter.data + broadcast(p_numpy, src=0) -def mpi_avg(value: torch.Tensor) -> torch.Tensor: - """Average a scalar or numpy vector over MPI processes.""" - return mpi_sum(value) / num_procs() +def avg_params(module: torch.nn.Module) -> None: + """Average contents of all parameters across MPI processes. + + Args: + module (torch.nn.Module): module to be averaged. + """ + if world_size() > 1: + for parameter in module.parameters(): + param_tensor = parameter.data + avg_param_tensor = dist_avg(param_tensor) + param_tensor[:] = avg_param_tensor[:] + + +def dist_avg(value: Union[np.ndarray, torch.Tensor, int, float]) -> torch.Tensor: + """Average a tensor over distributed processes.""" + return dist_sum(value) / world_size() -def mpi_max(value: torch.Tensor) -> torch.Tensor: - """Determine global maximum of scalar or numpy array over MPI processes.""" - return mpi_op(value, ReduceOp.MAX) +def dist_max(value: Union[np.ndarray, torch.Tensor, int, float]) -> torch.Tensor: + """Determine global maximum of tensor over distributed processes.""" + return dist_op(value, ReduceOp.MAX) -def mpi_min(value: torch.Tensor) -> torch.Tensor: - """Determine global minimum of scalar or numpy array over MPI processes.""" - return mpi_op(value, ReduceOp.MIN) +def dist_min(value: Union[np.ndarray, torch.Tensor, int, float]) -> torch.Tensor: + """Determine global minimum of tensor over distributed processes.""" + return dist_op(value, ReduceOp.MIN) -def mpi_op(value: torch.Tensor, operation: ReduceOp) -> torch.Tensor: +def dist_sum(value: Union[np.ndarray, torch.Tensor, int, float]) -> torch.Tensor: + """Sum a tensor over distributed processes.""" + return dist_op(value, ReduceOp.SUM) + + +def dist_op(value: Union[np.ndarray, torch.Tensor, int, float], operation: Any) -> torch.Tensor: """Multi-processing operation. .. note:: @@ -199,20 +211,15 @@ def mpi_op(value: torch.Tensor, operation: ReduceOp) -> torch.Tensor: value (torch.Tensor): value to be operated. operation (ReduceOp): operation type. """ - if num_procs() == 1: - return value - value, scalar = ([value], True) if np.isscalar(value) else (value, False) - value = torch.as_tensor(value, dtype=torch.float32) + if world_size() == 1: + return torch.as_tensor(value, dtype=torch.float32) + value_, scalar = ([value], True) if np.isscalar(value) else (value, False) + value = torch.as_tensor(value_, dtype=torch.float32) allreduce(value, op=operation) return value[0] if scalar else value -def mpi_sum(value: torch.Tensor) -> torch.Tensor: - """Sum a scalar or numpy vector over MPI processes.""" - return mpi_op(value, ReduceOp.SUM) - - -def mpi_statistics_scalar( +def dist_statistics_scalar( value: torch.Tensor, with_min_and_max: bool = False ) -> Tuple[torch.Tensor, ...]: """Get mean/std and optional min/max of scalar x across MPI processes. @@ -221,14 +228,15 @@ def mpi_statistics_scalar( value (torch.Tensor): value to be operated. with_min_and_max (bool): whether to return min and max. 
""" - global_sum, global_n = mpi_sum([torch.sum(value), len(value)]) + global_sum = dist_sum(torch.sum(value)) + global_n = dist_sum(len(value)) mean = global_sum / global_n - global_sum_sq = mpi_sum(torch.sum((value - mean) ** 2)) + global_sum_sq = dist_sum(torch.sum((value - mean) ** 2)) # compute global std std = torch.sqrt(global_sum_sq / global_n) if with_min_and_max: - global_min = mpi_min(value) - global_max = mpi_max(value) + global_min = dist_min(value) + global_max = dist_max(value) return mean, std, global_min, global_max return mean, std diff --git a/omnisafe/utils/exp_grid_tools.py b/omnisafe/utils/exp_grid_tools.py index 027e1e949..9b7cc922d 100644 --- a/omnisafe/utils/exp_grid_tools.py +++ b/omnisafe/utils/exp_grid_tools.py @@ -15,6 +15,7 @@ """Tools for Experiment Grid.""" import string +from typing import List, Union def all_bools(vals: list) -> bool: @@ -22,7 +23,7 @@ def all_bools(vals: list) -> bool: return all(isinstance(v, bool) for v in vals) -def valid_str(vals: list or str or type) -> str: +def valid_str(vals: Union[List, str]) -> str: r"""Convert a value or values to a string which could go in a path of file. Partly based on `this gist`_. diff --git a/omnisafe/utils/algo_utils.py b/omnisafe/utils/math.py similarity index 50% rename from omnisafe/utils/algo_utils.py rename to omnisafe/utils/math.py index 0af602658..b8e936f5f 100644 --- a/omnisafe/utils/algo_utils.py +++ b/omnisafe/utils/math.py @@ -13,9 +13,10 @@ # limitations under the License. # ============================================================================== """Implementation of the algo utils.""" -from typing import Tuple +from typing import Callable, Tuple import torch +from torch.distributions import Normal, TanhTransform, TransformedDistribution def get_transpose(tensor: torch.Tensor) -> torch.Tensor: @@ -105,3 +106,124 @@ def gaussian_kl( c_mean_q = 0.5 * torch.mean(inner_mean_q) c_sigma_q = 0.5 * torch.mean(inner_sigma_q) return c_mean_q, c_sigma_q, torch.mean(sigma_p_det), torch.mean(sigma_q_det) + + +def discount_cumsum(x_vector: torch.Tensor, discount: float) -> torch.Tensor: + """Compute the discounted cumulative sum of vectors.""" + length = x_vector.shape[0] + x_vector = x_vector.type(torch.float64) + for idx in reversed(range(length)): + if idx == length - 1: + cumsum = x_vector[idx] + else: + cumsum = x_vector[idx] + discount * cumsum + x_vector[idx] = cumsum + return x_vector + + +def conjugate_gradients( + Avp: Callable[[torch.Tensor], torch.Tensor], + b_vector: torch.Tensor, + num_steps: int = 10, + residual_tol: float = 1e-10, + eps: float = 1e-6, +): # pylint: disable=invalid-name,too-many-locals + """Implementation of Conjugate gradient algorithm. + + Conjugate gradient algorithm is used to solve the linear system of equations :math:`Ax = b`. + The algorithm is described in detail in the paper `Conjugate Gradient Method`_. + + .. _Conjugate Gradient Method: https://en.wikipedia.org/wiki/Conjugate_gradient_method + + .. note:: + Increasing ``num_steps`` will lead to a more accurate approximation + to :math:`A^{-1} b`, and possibly slightly-improved performance, + but at the cost of slowing things down. + Also probably don't play with this hyperparameter. + + Args: + num_steps (int): Number of iterations of conjugate gradient to perform. 
+ """ + + x = torch.zeros_like(b_vector) + r = b_vector - Avp(x) + p = r.clone() + rdotr = torch.dot(r, r) + + for _ in range(num_steps): + z = Avp(p) + alpha = rdotr / (torch.dot(p, z) + eps) + x += alpha * p + r -= alpha * z + new_rdotr = torch.dot(r, r) + if torch.sqrt(new_rdotr) < residual_tol: + break + mu = new_rdotr / (rdotr + eps) + p = r + mu * p + rdotr = new_rdotr + return x + + +class SafeTanhTransformer(TanhTransform): + """Safe Tanh Transformer.""" + + def _call(self, x: torch.Tensor) -> torch.Tensor: + return torch.clamp(torch.tanh(x), min=-0.999999, max=0.999999) + + def _inverse(self, y: torch.Tensor) -> torch.Tensor: + if y.dtype.is_floating_point: + eps = torch.finfo(y.dtype).eps + else: + raise ValueError('Expected floating point type') + y = y.clamp(min=-1 + eps, max=1 - eps) + x = super()._inverse(y) + return x + + +class TanhNormal(TransformedDistribution): # pylint: disable=abstract-method + r""" + Creates a tanh-normal distribution. + + X ~ Normal(loc, scale) + Y = tanh(X) ~ TanhNormal(loc, scale) + + Example:: + + >>> m = TanhNormal(torch.tensor([0.0]), torch.tensor([1.0])) + >>> m.sample() # tanh-normal distributed with mean=0 and stddev=1 + tensor([-0.7616]) + + Args: + loc (float or Tensor): mean of the underlying normal distribution + scale (float or Tensor): standard deviation of the underlying normal distribution + """ + + arg_constraints = { + 'loc': Normal.arg_constraints['loc'], + 'scale': Normal.arg_constraints['scale'], + } + support = TransformedDistribution.support + has_rsample = True + + def __init__(self, loc, scale, validate_args=None): + base_dist = Normal(loc, scale, validate_args=validate_args) + super().__init__(base_dist, SafeTanhTransformer(), validate_args=validate_args) + + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(TanhNormal, _instance) + return super().expand(batch_shape, _instance=new) + + @property + def mean(self): + return SafeTanhTransformer()(self.base_dist.mean) + + @property + def stddev(self): + return self.base_dist.stddev + + def entropy(self): + return self.base_dist.entropy() + + @property + def variance(self): + return self.base_dist.variance diff --git a/omnisafe/utils/model_utils.py b/omnisafe/utils/model.py similarity index 67% rename from omnisafe/utils/model_utils.py rename to omnisafe/utils/model.py index 5361c744b..9ec8c75ec 100644 --- a/omnisafe/utils/model_utils.py +++ b/omnisafe/utils/model.py @@ -14,14 +14,13 @@ # ============================================================================== """This module contains the helper functions for the model.""" -from typing import List, Literal, Union +from typing import List, Type, Union import numpy as np +import torch from torch import nn - -Activation = Literal['identity', 'relu', 'sigmoid', 'softplus', 'tanh'] -InitFunction = Literal['kaiming_uniform', 'xavier_normal', 'glorot', 'xavier_uniform', 'orthogonal'] +from omnisafe.typing import Activation, InitFunction def initialize_layer(init_function: InitFunction, layer: nn.Linear) -> None: @@ -49,7 +48,7 @@ def initialize_layer(init_function: InitFunction, layer: nn.Linear) -> None: def get_activation( activation: Activation, -) -> Union[nn.Identity, nn.ReLU, nn.Sigmoid, nn.Softplus, nn.Tanh]: +) -> Union[Type[nn.Identity], Type[nn.ReLU], Type[nn.Sigmoid], Type[nn.Softplus], Type[nn.Tanh]]: """Get the activation function. The ``activation`` can be chosen from: @@ -83,12 +82,47 @@ def build_mlp_network( output_activation (Activation): The output activation function. 
weight_initialization_mode (InitFunction): The initialization function. """ - activation = get_activation(activation) - output_activation = get_activation(output_activation) + activation_fn = get_activation(activation) + output_activation_fn = get_activation(output_activation) layers = [] for j in range(len(sizes) - 1): - act = activation if j < len(sizes) - 2 else output_activation + act_fn = activation_fn if j < len(sizes) - 2 else output_activation_fn affine_layer = nn.Linear(sizes[j], sizes[j + 1]) initialize_layer(weight_initialization_mode, affine_layer) - layers += [affine_layer, act()] + layers += [affine_layer, act_fn()] return nn.Sequential(*layers) + + +def set_optimizer( + opt: str, module: Union[nn.Module, List[nn.Parameter]], learning_rate: float +) -> torch.optim.Optimizer: + """Returns an initialized optimizer from PyTorch. + + .. note:: + + The optimizer can be chosen from the following list: + + - Adam + - AdamW + - Adadelta + - Adagrad + - Adamax + - ASGD + - LBFGS + - RMSprop + - Rprop + - SGD + + Args: + opt (str): optimizer name. + module (Union[nn.Module, List[nn.Parameter]]): module or parameters. + learning_rate (float): learning rate. + """ + assert hasattr(torch.optim, opt), f'Optimizer={opt} not found in torch.' + optimizer = getattr(torch.optim, opt) + + if isinstance(module, list): + return optimizer(module, lr=learning_rate) + if isinstance(module, nn.Module): + return optimizer(module.parameters(), lr=learning_rate) + raise TypeError(f'Invalid module type: {type(module)}') diff --git a/omnisafe/utils/online_mean_std.py b/omnisafe/utils/online_mean_std.py deleted file mode 100644 index 7b3bf7e1b..000000000 --- a/omnisafe/utils/online_mean_std.py +++ /dev/null @@ -1,116 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Implementation of the online mean and standard deviation.""" - -import numpy as np -import torch - -from omnisafe.utils import distributed_utils - - -class OnlineMeanStd(torch.nn.Module): - """ - Track mean and standard deviation of inputs with incremental formula. 
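``build_mlp_network`` and ``set_optimizer`` shown above are meant to be the single entry points for building networks and optimizers from config strings. A usage sketch, assuming this patch is applied so both helpers live in ``omnisafe.utils.model``; the layer sizes and hyperparameters below are made up for illustration::

    import torch
    from torch import nn

    from omnisafe.utils.model import build_mlp_network, set_optimizer

    # 8 -> 64 -> 64 -> 2 MLP with tanh hidden activations and orthogonal init
    policy_net = build_mlp_network(
        sizes=[8, 64, 64, 2],
        activation='tanh',
        output_activation='identity',
        weight_initialization_mode='orthogonal',
    )
    assert isinstance(policy_net, nn.Sequential)

    # resolved via getattr(torch.optim, 'Adam')
    optimizer = set_optimizer('Adam', policy_net, learning_rate=3e-4)
    assert isinstance(optimizer, torch.optim.Adam)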
- """ - - def __init__(self, epsilon=1e-5, shape=()): - super().__init__() - self.mean = torch.nn.Parameter(torch.zeros(*shape), requires_grad=False) - self.std = torch.nn.Parameter(torch.ones(*shape), requires_grad=False) - self.count = torch.nn.Parameter(torch.zeros(1), requires_grad=False) - self.eps = epsilon - self.bound = 10 - self.shape = shape - - @property - def var(self): - """Return variance.""" - return torch.square(self.std) - - @staticmethod - def _convert_to_torch(params, dtype=torch.float32) -> torch.Tensor: - if isinstance(params, np.ndarray): - params = torch.from_numpy(params).float() - if isinstance(params, float): - params = torch.tensor([params], dtype=dtype) # use [] to make tensor torch.Size([1]) - if isinstance(params, np.floating): - params = torch.tensor([params], dtype=dtype) # use [] to make tensor torch.Size([1]) - return params - - def forward(self, data, subtract_mean=True, clip=False): - """Make input average free and scale to standard deviation.""" - # sanity checks - if len(data.shape) >= 2: - assert ( - data.shape[-1] == self.mean.shape[-1] - ), f'got shape={data.shape} but expected: {self.mean.shape}' - - is_numpy = isinstance(data, np.ndarray) - data = self._convert_to_torch(data) - if subtract_mean: - data_new = (data - self.mean) / (self.std + self.eps) - else: - data_new = data / (self.std + self.eps) - if clip: - data_new = torch.clamp(data_new, -self.bound, self.bound) - data_new = data_new.numpy() if is_numpy else data_new - return data_new - - # pylint: disable-next=too-many-locals - def update(self, data) -> None: - """Update internals incrementally. - Note: works for both vector and matrix inputs. - MPI implementation according to Chan et al.[10]; see: - https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm - """ - data = self._convert_to_torch(data) - - # ==== Input checks - msg = f'Expected dim in [1, 2], but got dim={len(data.shape)}.' - assert len(data.shape) == 2 or len(data.shape) == 1, msg - if self.shape[0] > 1: # expect matrix inputs - msg = f'Expected obs_dim={self.shape[0]} but got: {data.shape[1]}' - assert len(data.shape) == 2 and data.shape[1] == self.shape[0], msg - if self.shape[0] == 1: - assert len(data.shape) == 1, f'Expected dim=1 but got: {data.shape}' - # reshape is necessary since mean operator reduces vector dim by one - data = data.view((-1, 1)) - - n_b = data.shape[0] * distributed_utils.num_procs() # get batch size - n_a = self.count.clone() - n_a_b = self.count + n_b - batch_mean = torch.mean(data, dim=0) - - # 1) Calculate mean and average batch mean across processes - distributed_utils.mpi_avg_torch_tensor(batch_mean) - delta = batch_mean - self.mean - mean_new = self.mean + delta * n_b / n_a_b - - # 2) Determine variance and sync across processes - diff = data - mean_new - batch_var = torch.mean(diff**2, dim=0) - distributed_utils.mpi_avg_torch_tensor(batch_var) - - # Update running terms - m2_a = n_a * self.var - m2_b = n_b * batch_var - ratio = n_a * n_b / n_a_b - m2_a_b = m2_a + m2_b + delta**2 * ratio - - # 3) Update parameters - access internal values with data attribute - self.mean.data = mean_new - self.count.data = n_a_b - new_var = m2_a_b / n_a_b - self.std.data = torch.sqrt(new_var) diff --git a/omnisafe/utils/schedule.py b/omnisafe/utils/schedule.py new file mode 100644 index 000000000..679527ee0 --- /dev/null +++ b/omnisafe/utils/schedule.py @@ -0,0 +1,93 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""helper class to generate scheduling params""" + +from abc import ABC, abstractmethod +from typing import List, Optional, Tuple, Union + + +def _linear_interpolation(l, r, alpha): # pylint: disable=invalid-name + return l + alpha * (r - l) + + +class Schedule(ABC): + """Schedule for a value based on the step""" + + @abstractmethod + def value(self, time: Union[int, float]) -> Union[int, float]: + """Value at time t. + + Args: + t (float): Time. + + Returns: + float: Value at time t. + """ + + +# pylint: disable=too-few-public-methods +class PiecewiseSchedule(Schedule): + """Piece-wise schedule for a value based on the step""" + + def __init__( + self, + endpoints: List[Tuple[Union[int, float], Union[int, float]]], + outside_value=Optional[Union[int, float]], + ) -> None: + """From OpenAI baselines""" + idxes = [e[0] for e in endpoints] + assert idxes == sorted(idxes) + self._interpolation = _linear_interpolation + self._outside_value = outside_value + self._endpoints = endpoints + + def value(self, time: Union[int, float]) -> Union[int, float]: + """Value at time t. + + Args: + t (float): Time. + + Returns: + float: Value at time t. + """ + # pylint: disable=invalid-name + for (l_t, l), (r_t, r) in zip(self._endpoints[:-1], self._endpoints[1:]): + if l_t <= time < r_t: + alpha = float(time - l_t) / (r_t - l_t) + return self._interpolation(l, r, alpha) + + # t does not belong to any of the pieces, so doom. + assert self._outside_value is not None + return self._outside_value + + +class ConstantSchedule(Schedule): + """Constant schedule for a value""" + + def __init__(self, value): + """Value remains constant over time. + Parameters + ---------- + value: float + Constant value of the schedule + """ + self._v = value + + def value( + self, time: Union[int, float] + ) -> Union[int, float]: # pylint: disable=unused-argument + """See Schedule.value""" + return self._v diff --git a/omnisafe/utils/tools.py b/omnisafe/utils/tools.py index b7f831370..b49cd7ce5 100644 --- a/omnisafe/utils/tools.py +++ b/omnisafe/utils/tools.py @@ -14,11 +14,12 @@ # ============================================================================== """tool_function_packages""" +import os +import random + import numpy as np import torch -from omnisafe.typing import Any, Callable, Union - def get_flat_params_from(model: torch.nn.Module) -> torch.Tensor: """This function is used to get the flattened parameters from the model. @@ -61,49 +62,6 @@ def get_flat_gradients_from(model: torch.nn.Module) -> torch.Tensor: return torch.cat(grads) -def conjugate_gradients( - Avp: Callable[[torch.Tensor], torch.Tensor], - b_vector: torch.Tensor, - num_steps: int = 10, - residual_tol: float = 1e-10, - eps: float = 1e-6, -): # pylint: disable=invalid-name,too-many-locals - """Implementation of Conjugate gradient algorithm. 
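The new ``PiecewiseSchedule`` linearly interpolates between ``(time, value)`` endpoints and falls back to ``outside_value`` beyond the last one; ``ConstantSchedule`` simply returns a fixed value. A usage sketch for annealing a coefficient, assuming this patch is applied so the classes live in ``omnisafe.utils.schedule`` (the annealed quantity and horizon are made up)::

    from omnisafe.utils.schedule import ConstantSchedule, PiecewiseSchedule

    # anneal a coefficient from 0.2 to 0.05 over the first 1e6 steps, then hold it
    clip_schedule = PiecewiseSchedule(
        endpoints=[(0, 0.2), (1_000_000, 0.05)],
        outside_value=0.05,
    )
    assert abs(clip_schedule.value(500_000) - 0.125) < 1e-9
    assert clip_schedule.value(2_000_000) == 0.05

    lr_schedule = ConstantSchedule(3e-4)
    assert lr_schedule.value(10_000) == 3e-4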
- - Conjugate gradient algorithm is used to solve the linear system of equations :math:`Ax = b`. - The algorithm is described in detail in the paper `Conjugate Gradient Method`_. - - .. _Conjugate Gradient Method: https://en.wikipedia.org/wiki/Conjugate_gradient_method - - .. note:: - Increasing ``num_steps`` will lead to a more accurate approximation - to :math:`A^{-1} b`, and possibly slightly-improved performance, - but at the cost of slowing things down. - Also probably don't play with this hyperparameter. - - Args: - num_steps (int): Number of iterations of conjugate gradient to perform. - """ - - x = torch.zeros_like(b_vector) - r = b_vector - Avp(x) - p = r.clone() - rdotr = torch.dot(r, r) - - for _ in range(num_steps): - z = Avp(p) - alpha = rdotr / (torch.dot(p, z) + eps) - x += alpha * p - r -= alpha * z - new_rdotr = torch.dot(r, r) - if torch.sqrt(new_rdotr) < residual_tol: - break - mu = new_rdotr / (rdotr + eps) - p = r + mu * p - rdotr = new_rdotr - return x - - def set_param_values_to_model(model: torch.nn.Module, vals: torch.Tensor) -> None: """This function is used to set the parameters to the model. @@ -116,70 +74,19 @@ def set_param_values_to_model(model: torch.nn.Module, vals: torch.Tensor) -> Non vals (torch.Tensor): parameters to be set. """ assert isinstance(vals, torch.Tensor) - i = 0 + i: int = 0 for _, param in model.named_parameters(): if param.requires_grad: # param has grad and, hence, must be set orig_size = param.size() size = np.prod(list(param.size())) - new_values = vals[i : i + size] + new_values = vals[i : int(i + size)] # set new param values new_values = new_values.view(orig_size) param.data = new_values - i += size # increment array position + i += int(size) # increment array position assert i == len(vals), f'Lengths do not match: {i} vs. {len(vals)}' -# pylint: disable-next=too-many-branches,too-many-return-statements -def to_ndarray(item: Any, dtype: np.dtype = None) -> Union[np.ndarray, TypeError, None]: - """This function is used to convert the data type to ndarray. - - Change `torch.Tensor`, sequence of scalars to ndarray, and keep other data types unchanged. - - .. note: - Now supports item type: :obj:`torch.Tensor`, :obj:`dict`, :obj:`list`, :obj:`tuple` and :obj:`None` - - Args: - item (Any): item to be converted. - dtype (np.dtype): data type of the output ndarray. Default to None. - """ - - if isinstance(item, dict): - new_data = {} - for key, value in item.items(): - new_data[key] = to_ndarray(value, dtype) - return new_data - - if isinstance(item, (list, tuple)): - if len(item) == 0: - return None - if hasattr(item, '_fields'): # namedtuple - return type(item)(*[to_ndarray(t, dtype) for t in item]) - new_data = [] - for data in item: - new_data.append(to_ndarray(data, dtype)) - return new_data - - if isinstance(item, torch.Tensor): - if item.device != 'cpu': - item = item.detach().cpu() - if dtype is None: - return item.numpy() - return item.numpy().astype(dtype) - - if isinstance(item, np.ndarray): - if dtype is None: - return item - return item.astype(dtype) - - if np.isscalar(item): - return np.array(item) - - if item is None: - return None - - raise TypeError(f'not support item type: {item}') - - def expand_dims(*args): """This function is used to expand the dimensions of the input data. 
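``conjugate_gradients`` has moved from ``omnisafe.utils.tools`` (deleted above) into ``omnisafe.utils.math`` (added earlier in this patch). Its key property is that it only touches :math:`A` through the ``Avp`` matrix-vector closure, which is what lets TRPO/CPO-style updates pass a Fisher-vector product instead of a dense matrix. A small usage sketch on an explicit SPD system::

    import torch

    from omnisafe.utils.math import conjugate_gradients

    # small symmetric positive-definite system A x = b
    A = torch.tensor([[4.0, 1.0], [1.0, 3.0]])
    b = torch.tensor([1.0, 2.0])

    # the solver only sees A through the matrix-vector product closure
    x = conjugate_gradients(lambda v: A @ v, b, num_steps=10)

    assert torch.allclose(A @ x, b, atol=1e-4)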
@@ -195,7 +102,7 @@ def expand_dims(*args): return [np.expand_dims(item, axis=0) for item in args] -def as_tensor(*args, device: torch.device = 'cpu'): +def as_tensor(*args, device: torch.device = torch.device('cpu')): """This function is used to convert the input data to tensor. .. note:: @@ -208,3 +115,20 @@ def as_tensor(*args, device: torch.device = 'cpu'): if len(args) == 1: return torch.as_tensor(args[0], dtype=torch.float32) return [torch.as_tensor(item, dtype=torch.float32, device=device) for item in args] + + +def seed_all(seed: int): + """This function is used to set the random seed for all the packages.""" + + os.environ['PYTHONHASHSEED'] = str(seed) + + random.seed(seed) + np.random.seed(seed) + + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + try: + torch.use_deterministic_algorithms(True) + except AttributeError: + pass diff --git a/omnisafe/utils/vtrace.py b/omnisafe/utils/vtrace.py deleted file mode 100644 index 3d479711e..000000000 --- a/omnisafe/utils/vtrace.py +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""vtrace""" - -from typing import Tuple - -import torch - - -# pylint: disable-next=too-many-arguments,too-many-locals -def calculate_v_trace( - policy_action_probs: torch.Tensor, - values: torch.Tensor, # including bootstrap - rewards: torch.Tensor, # including bootstrap - behavior_action_probs: torch.Tensor, - gamma: float = 0.99, - rho_bar: float = 1.0, - c_bar: float = 1.0, -) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor,]: - r"""This function is used to calculate V-trace targets. - - .. math:: - A_t = \sum_{k=0}^{n-1} (\lambda \gamma)^k \delta_{t+k} + - (\lambda \gamma)^n * \rho_{t+n} * (1 - d_{t+n}) * (V(x_{t+n}) - b_{t+n}) - - Calculate V-trace targets for off-policy actor-critic learning recursively. - For more details, - please refer to the paper: `Espeholt et al. 2018, IMPALA `_. 
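The new ``seed_all`` seeds Python's ``random``, NumPy, and torch (CPU and all CUDA devices) and opportunistically enables ``torch.use_deterministic_algorithms``. A usage sketch showing that re-seeding reproduces the same draws; note that full GPU determinism may additionally require environment settings such as ``CUBLAS_WORKSPACE_CONFIG``, which this helper does not set::

    import numpy as np
    import torch

    from omnisafe.utils.tools import seed_all

    seed_all(42)
    first = (np.random.rand(3), torch.rand(3))

    seed_all(42)
    second = (np.random.rand(3), torch.rand(3))

    assert np.allclose(first[0], second[0])
    assert torch.equal(first[1], second[1])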
- - Args: - policy_action_probs (torch.Tensor): action probabilities of policy network, shape=(sequence_length,) - values (torch.Tensor): state values, shape=(sequence_length+1,) - rewards (torch.Tensor): rewards, shape=(sequence_length+1,) - behavior_action_probs (torch.Tensor): action probabilities of behavior network, shape=(sequence_length,) - gamma (float): discount factor - rho_bar (float): clip rho - c_bar (float): clip c - - Returns: - tuple: V-trace targets, shape=(batch_size, sequence_length) - """ - assert values.ndim == 1, 'Please provide 1d-arrays' - assert rewards.ndim == 1 - assert policy_action_probs.ndim == 1 - assert behavior_action_probs.ndim == 1 - assert c_bar <= rho_bar - - sequence_length = policy_action_probs.shape[0] - # pylint: disable-next=assignment-from-no-return - rhos = torch.div(policy_action_probs, behavior_action_probs) - clip_rhos = torch.min( - rhos, torch.as_tensor(rho_bar) - ) # pylint: disable=assignment-from-no-return - clip_cs = torch.min(rhos, torch.as_tensor(c_bar)) # pylint: disable=assignment-from-no-return - v_s = values[:-1].clone() # copy all values except bootstrap value - last_v_s = values[-1] # bootstrap from last state - - # calculate v_s - for index in reversed(range(sequence_length)): - delta = clip_rhos[index] * (rewards[index] + gamma * values[index + 1] - values[index]) - v_s[index] += delta + gamma * clip_cs[index] * (last_v_s - values[index + 1]) - last_v_s = v_s[index] # accumulate current v_s for next iteration - - # calculate q_targets - v_s_plus_1 = torch.cat((v_s[1:], values[-1:])) - policy_advantage = clip_rhos * (rewards[:-1] + gamma * v_s_plus_1 - values[:-1]) - - return v_s, policy_advantage, clip_rhos diff --git a/omnisafe/wrappers/__init__.py b/omnisafe/wrappers/__init__.py deleted file mode 100644 index 9615479e9..000000000 --- a/omnisafe/wrappers/__init__.py +++ /dev/null @@ -1,44 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Environment wrappers.""" - -import itertools -from types import MappingProxyType - -from omnisafe.wrappers.cmdp_wrapper import CMDPWrapper -from omnisafe.wrappers.early_terminated_wrapper import EarlyTerminatedWrapper -from omnisafe.wrappers.saute_wrapper import SauteWrapper -from omnisafe.wrappers.simmer_wrapper import PidController, QController, SimmerWrapper - - -ENVWRAPPERS = { - 'cmdp-wrapper': CMDPWrapper, - 'saute-wrapper': SauteWrapper, - 'simmer-wrapper': SimmerWrapper, - 'early-terminated-wrapper': EarlyTerminatedWrapper, -} - -ENVWRAPPERS2TYPE = { - env_wrapper: env_wrapper_type for env_wrapper_type, env_wrapper in ENVWRAPPERS.items() -} - -__all__ = ENVWRAPPERS['all'] = tuple(itertools.chain(ENVWRAPPERS.values())) - -assert len(ENVWRAPPERS2TYPE) == len(__all__), 'Duplicate environment wrappers found.' 
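For reference, the backward loop of the deleted ``calculate_v_trace`` (in ``omnisafe/utils/vtrace.py``, removed above) implements the standard V-trace recursion of Espeholt et al. (2018): with clipped importance ratios :math:`\rho_t = \min(\bar{\rho}, \pi(a_t \mid x_t) / \mu(a_t \mid x_t))` and :math:`c_t = \min(\bar{c}, \pi(a_t \mid x_t) / \mu(a_t \mid x_t))`,

.. math::
    \delta_t = \rho_t \bigl(r_t + \gamma V(x_{t+1}) - V(x_t)\bigr), \qquad
    v_t = V(x_t) + \delta_t + \gamma c_t \bigl(v_{t+1} - V(x_{t+1})\bigr),

and the returned policy advantage is :math:`\rho_t \bigl(r_t + \gamma v_{t+1} - V(x_t)\bigr)`.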
- -ENVWRAPPERS = MappingProxyType(ENVWRAPPERS) -ENVWRAPPERS2TYPE = MappingProxyType(ENVWRAPPERS2TYPE) - -del itertools, MappingProxyType diff --git a/omnisafe/wrappers/early_terminated_wrapper.py b/omnisafe/wrappers/early_terminated_wrapper.py deleted file mode 100644 index 5b8a18347..000000000 --- a/omnisafe/wrappers/early_terminated_wrapper.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Early terminated wrapper.""" - -from typing import Dict, Tuple, TypeVar - -import numpy as np - -from omnisafe.utils.tools import as_tensor, expand_dims -from omnisafe.wrappers.cmdp_wrapper import CMDPWrapper -from omnisafe.wrappers.wrapper_registry import WRAPPER_REGISTRY - - -RenderFrame = TypeVar('RenderFrame') - - -@WRAPPER_REGISTRY.register -# pylint: disable-next=too-many-instance-attributes -class EarlyTerminatedWrapper(CMDPWrapper): - """Implementation of the environment wrapper for early-terminated algorithms. - - ``omnisafe`` use different environment wrappers for different kinds of algorithms. - This is the environment wrapper for early-terminated algorithms. - - .. note:: - The only difference between this wrapper and :class:`OnPolicyEnvWrapper` is that, - this wrapper terminates the episode when the cost is unequal to 0. - Any on-policy algorithm can use this wrapper, - to convert itself into an early-terminated algorithm. - ``omnisafe`` provides a implementation of :class:`PPOEarlyTerminated`, - and :class:`PPOLagarlyTerminated`. - """ - - def step(self, action: np.ndarray) -> Tuple[np.ndarray, np.ndarray, bool, bool, Dict]: - """Step the environment. - - The environment will be stepped by the action from the agent. - Corresponding to the Markov Decision Process, - the environment will return the ``next observation``, - ``reward``, ``cost``, ``terminated``, ``truncated`` and ``info``. - - Args: - action (np.ndarray): action. - """ - next_obs, reward, cost, terminated, truncated, info = self.env.step( - action.cpu().numpy().squeeze() - ) - if self.cfgs.num_envs == 1: - next_obs, reward, cost, terminated, truncated, info = expand_dims( - next_obs, reward, cost, terminated, truncated, info - ) - if terminated | truncated: - next_obs, info = self.reset() - for idx, single_cost in enumerate(cost): - if single_cost: - terminated[idx] = True - self.rollout_data.rollout_log.ep_ret += reward - self.rollout_data.rollout_log.ep_costs += cost - self.rollout_data.rollout_log.ep_len += np.ones(self.cfgs.num_envs) - return ( - as_tensor(next_obs, reward, cost, device=self.cfgs.device), - terminated, - truncated, - info, - ) diff --git a/omnisafe/wrappers/model_based_wrapper.py b/omnisafe/wrappers/model_based_wrapper.py deleted file mode 100644 index c5f8810db..000000000 --- a/omnisafe/wrappers/model_based_wrapper.py +++ /dev/null @@ -1,455 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Environment wrapper for model-based algorithms.""" - -import gymnasium -import numpy as np -import safety_gymnasium -import torch - -from omnisafe.wrappers.wrapper_registry import WRAPPER_REGISTRY - - -# ---------------------------------------------------------------------------------------------------------- -ROBOTS = ['Point', 'Car', 'Doggo'] -TASKS = ['Goal', 'Button'] - -XYZ_SENSORS = { - 'Point': ['velocimeter'], - 'Car': ['velocimeter'], - 'Doggo': ['velocimeter', 'accelerometer'], -} - -ANGLE_SENSORS = { - 'Point': ['gyro', 'magnetometer'], - 'Car': ['magnetometer', 'gyro'], - 'Doggo': ['magnetometer', 'gyro'], -} - -CONSTRAINTS_SAFELOOP = { - 'Goal': ['vases', 'hazards'], - 'Button': ['hazards', 'gremlins', 'buttons'], -} -CONSTRAINTS_MBPPO = { - 'Goal': ['hazards'], - 'Button': ['hazards', 'gremlins', 'buttons'], -} - - -@WRAPPER_REGISTRY.register -class ModelBasedEnvWrapper: # pylint: disable=too-many-instance-attributes - """Model-based Environment""" - - def __init__(self, algo, env_id, render_mode=None): - self.algo = algo - self.env_id = env_id # safety gym not use this attribute - self.render_mode = render_mode - self.timestep = 0 - self.num_steps = 1000 - self.goal_distance = 0 - self.modelbased_safetygym = [ - 'SafetyPointGoal3-v0', - 'SafetyCarGoal3-v0', - 'SafetyPointGoal1-v0', - 'SafetyCarGoal1-v0', - ] - self.modelbased_mujoco_velocity = [ - 'Ant-v4', - 'Swimmer-v4', - 'HalfCheetah-v4', - 'Hopper-v4', - 'Humanoid-v4', - 'Walker2d-v4', - 'Ant-v3', - 'Swimmer-v3', - 'HalfCheetah-v3', - 'Hopper-v3', - 'Humanoid-v3', - 'Walker2d-v3', - ] - assert ( - env_id in self.modelbased_safetygym + self.modelbased_mujoco_velocity - ), f'not support {env_id}' - if env_id in self.modelbased_safetygym: - self.robot = 'Point' if 'Point' in env_id else 'Car' - self.task = 'Goal' - self.env_type = 'gym' - self.hazards_size = 0.2 - self.robot = self.robot.capitalize() # mujoco not use this attribute - self.task = self.task.capitalize() # mujoco not use this attribute - assert self.robot in ROBOTS, f'can not recognize the robot type {self.robot}' - assert self.task in TASKS, f'can not recognize the task type {self.task}' - self.env = safety_gymnasium.make(env_id, render_mode=render_mode) - self.init_sensor() - self.observation_space = gymnasium.spaces.Box( - -np.inf, np.inf, (self.ac_state_size,), dtype=np.float32 - ) - self.action_space = gymnasium.spaces.Box( - -1, 1, (self.env.action_space.shape[0],), dtype=np.float32 - ) - self.goal_position = self.env.task.goal_pos[0][:2] - self.robot_position = self.env.task.robot_pos - self.hazards_position = self.env.task.hazards_pos - elif env_id in self.modelbased_mujoco_velocity: - self.env_type = 'mujoco-velocity' - self.env = gymnasium.make(env_id) - self.observation_space = self.env.observation_space - self.action_space = self.env.action_space - self.dynamics_state_size = self.observation_space.shape[0] - 
self.ac_state_size = self.observation_space.shape[0] - - def set_eplen(self, eplen): - """Set episode length""" - self.num_steps = eplen - - def get_observation_cost(self, obs): - """Get batch cost from batch observation""" - if torch.is_tensor(obs): - obs = obs.cpu().detach().numpy() - batch_size = obs.shape[0] - hazards_key = self.key_to_slice['hazards'] - hazard_obs = obs[:, hazards_key].reshape(batch_size, -1, 2) - hazards_dist = np.sqrt(np.sum(np.square(hazard_obs), axis=2)).reshape(batch_size, -1) - cost = ((hazards_dist < self.hazards_size) * (self.hazards_size - hazards_dist)).sum(1) * 10 - - return cost - - def init_sensor(self): - """Initialize sensor observation""" - self.xyz_sensors = XYZ_SENSORS[self.robot] - self.angle_sensors = ANGLE_SENSORS[self.robot] - self.constraints_safeloop = CONSTRAINTS_SAFELOOP[self.task] - self.constraints_mbppo = CONSTRAINTS_MBPPO[self.task] - self.base_state_name = self.xyz_sensors + self.angle_sensors + ['goal'] - self.env.reset() - obs = self.get_obs() - self.obs_flat_size = sum(np.prod(i.shape) for i in list(obs.values())) - if self.algo == 'MBPPOLag': - self.flatten_order = ( - self.base_state_name + self.constraints_mbppo + ['robot_m'] + ['robot'] - ) - elif self.algo in ['SafeLOOP', 'CAP']: - self.flatten_order = self.base_state_name + self.constraints_safeloop - - self.key_to_slice = {} - offset = 0 - for k in self.flatten_order: - k_size = np.prod(obs[k].shape) - self.key_to_slice[k] = slice(offset, offset + k_size) - - offset += k_size - self.base_state_dim = sum(np.prod(obs[k].shape) for k in list(self.base_state_name)) - self.action_dim = self.env.action_space.shape[0] - self.key_to_slice['base_state'] = slice(0, self.base_state_dim) - - self.reset() - obs_flat = self.get_obs_flatten() - if self.algo == 'MBPPOLag': - self.dynamics_state_size = obs_flat.shape[0] # 42 - self.ac_state_size = np.array(self.generate_lidar(obs_flat)).shape[0] # 26 - - elif self.algo in ['SafeLOOP', 'CAP']: - self.dynamics_state_size = obs_flat.shape[0] # 42 - self.ac_state_size = obs_flat.shape[0] # 42 - - def reset(self): - """Reset Environment""" - self.timestep = 0 # Reset internal timer - - if self.env_type == 'mujoco-velocity': - obs, _ = self.env.reset() - return obs - - self.env.reset() - obs = self.get_obs_flatten() - if self.algo == 'MBPPOLag': - self.goal_position = self.env.task.goal_pos[0][:2] - self.robot_position = self.env.task.robot_pos - self.hazards_position = self.env.task.hazards_pos - self.goal_distance = self.dist_xy(self.robot_position, self.goal_position) - - return obs - - def step(self, action, num_repeat): # pylint: disable=too-many-locals - """Simulate Environment""" - reward = 0 - cost = 0 - step_num = 0 - if self.env_type == 'gym': - for _ in range(num_repeat): - control = action - _, reward_k, cost_k, terminated, truncated, info = self.env.step(control) - terminated = False # not used now - step_num += 1 - reward += reward_k - cost += cost_k - self.timestep += 1 # Increment internal timer - if self.timestep >= self.num_steps: - truncated = True - observation = self.get_obs_flatten() - goal_met = 'goal_met' in info.keys() # reach the goal - if terminated or truncated or goal_met: - # the action is not related to next state, so break - break - if self.algo in ['MBPPOLag', 'SafeLOOP', 'CAP']: - info = {'cost': cost, 'goal_met': goal_met, 'step_num': step_num} - elif self.env_type == 'mujoco-velocity': - for _ in range(num_repeat): - control = action - state_k, reward_k, terminated, truncated, info = self.env.step(control) - 
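``get_observation_cost`` above turns the egocentric hazard coordinates in a batch of observations into a scalar cost: each hazard closer than ``hazards_size`` contributes its penetration depth, and the sum is scaled by 10. A hedged numpy sketch with a toy batch (the hazard layout is made up; the real slice comes from ``key_to_slice['hazards']``)::

    import numpy as np

    hazards_size = 0.2

    # toy batch of 2 observations, each with 3 hazards as egocentric (x, y) pairs
    hazard_obs = np.array([
        [[0.10, 0.00], [0.50, 0.50], [0.05, 0.05]],   # two hazards inside the radius
        [[1.00, 1.00], [0.30, 0.40], [0.25, 0.00]],   # none inside the radius
    ])

    hazards_dist = np.sqrt(np.square(hazard_obs).sum(axis=2))            # (batch, n_hazards)
    cost = ((hazards_dist < hazards_size) * (hazards_size - hazards_dist)).sum(axis=1) * 10

    assert cost[0] > 0.0 and cost[1] == 0.0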
step_num += 1 - reward += reward_k - if 'y_velocity' not in info: - cost_k = np.abs(info['x_velocity']) - else: - cost_k = np.sqrt(info['x_velocity'] ** 2 + info['y_velocity'] ** 2) - cost += cost_k - self.timestep += 1 # Increment internal timer - if self.timestep >= self.num_steps: - truncated = True - if terminated or truncated: - # the action is not related to next state, so break - break - info = {'cost': cost, 'goal_met': False, 'step_num': step_num} - observation = state_k - return observation, reward, cost, terminated, truncated, info - - def render(self): - """render environment""" - return self.env.render() - - def close(self): - """close environment""" - self.env.close() - - def recenter(self, pos): - '''Return the egocentric XY vector to a position from the robot''' - return self.env.task.ego_xy(pos) - - def get_obs(self): - ''' - We will ignore the z-axis coordinates in every poses. - The returned obs coordinates are all in the robot coordinates. - ''' - obs = {} - robot_pos = self.env.task.robot_pos - goal_pos = self.env.task.goal_pos[0] - vases_pos_list = self.env.task.vases_pos # list of shape (3,) ndarray - hazards_pos_list = self.env.task.hazards_pos # list of shape (3,) ndarray - ego_goal_pos = self.recenter(np.array(goal_pos[:2])) - ego_vases_pos_list = [ - self.env.task.ego_xy(pos[:2]) for pos in vases_pos_list - ] # list of shape (2,) ndarray - ego_hazards_pos_list = [ - self.env.task.ego_xy(pos[:2]) for pos in hazards_pos_list - ] # list of shape (2,) ndarray - - # append obs to the dict - for sensor in self.xyz_sensors: # Explicitly listed sensors - if sensor == 'accelerometer': - obs[sensor] = self.env.task.world.get_sensor(sensor)[:1] # only x axis matters - elif sensor == 'ballquat_rear': - obs[sensor] = self.env.task.world.get_sensor(sensor) - else: - obs[sensor] = self.env.task.world.get_sensor(sensor)[:2] # only x,y axis matters - - for sensor in self.angle_sensors: - if sensor == 'gyro': - obs[sensor] = self.env.task.world.get_sensor(sensor)[ - 2: - ] # [2:] # only z axis matters - # pass # gyro does not help - else: - obs[sensor] = self.env.task.world.get_sensor(sensor) - if self.algo == 'MBPPOLag': - # --------modification----------------- - obs['robot'] = np.array(robot_pos[:2]) - obs['hazards'] = np.array(ego_hazards_pos_list) # (hazard_num, 2) - robot_matrix = self.env.task.world.robot_mat() - obs['robot_m'] = np.array(robot_matrix[0][:2]) - obs['goal'] = ego_goal_pos # (2,) - elif self.algo in ['CAP', 'SafeLOOP']: - obs['vases'] = np.array(ego_vases_pos_list) # (vase_num, 2) - obs['hazards'] = np.array(ego_hazards_pos_list) # (hazard_num, 2) - obs['goal'] = ego_goal_pos # (2,) - return obs - - def get_obs_flatten(self): - '''get the flattened obs.''' - obs = self.get_obs() - flat_obs = np.zeros(self.obs_flat_size) - for k in self.flatten_order: - idx = self.key_to_slice[k] - flat_obs[idx] = obs[k].flat - return flat_obs - - def get_dist_reward(self): - ''' - @return reward: negative distance from robot to the goal - ''' - return -self.env.task.dist_goal() - - @property - def action_range(self): - """Get action range""" - return float(self.env.action_space.low[0]), float(self.env.action_space.high[0]) - - def sample_random_action(self): - '''Sample an action randomly from a uniform distribution over all valid actions.''' - return self.env.action_space.sample() - - def dist_xy(self, pos1, pos2): - '''Return the distance from the robot to an XY position.''' - pos1 = np.asarray(pos1) - pos2 = np.asarray(pos2) - if pos1.shape == (3,): - pos1 = pos1[:2] - if 
pos2.shape == (3,): - pos2 = pos2[:2] - return np.sqrt(np.sum(np.square(pos1 - pos2))) - - def get_reward_cost(self, state): - '''Assuming we have reward & cost function. available with us in closed form.''' - last_dist_goal = self.goal_distance - robot_pos = state[self.key_to_slice['robot']] - # ----cost---- - cost = 0 - hazards_cost = 1.0 - for h_pos in self.hazards_position: - h_dist = self.dist_xy(h_pos, robot_pos) - if h_dist <= self.hazards_size: - cost += hazards_cost * (self.hazards_size - h_dist) - if cost > 0: - cost = 1 - else: - cost = 0 - # ----reward---- - - reward = 0 - reward_distance = 1.0 - reward_goal = 1.0 - goal_size = 0.3 - - dist_goal = self.dist_xy(robot_pos, self.goal_position) - reward += (last_dist_goal - dist_goal) * reward_distance - last_dist_goal = dist_goal - goal_flag = False - if dist_goal < goal_size: - reward += reward_goal - goal_flag = True - # clip reward - if reward < -10: - reward = -10 - elif reward > 10: - reward = 10 - self.goal_distance = last_dist_goal - return reward, cost, goal_flag - - def get_goal_flag(self, robot_pos, goal_pos): - """Get goal flat""" - dist_goal = self.dist_xy(robot_pos, goal_pos) - goal_size = 0.3 - return dist_goal < goal_size - - def ego_xy(self, robot_matrix, robot_pos, pos): - '''Return the egocentric XY vector to a position from the robot''' - assert pos.shape == (2,), f'Bad pos {pos}' - robot_3vec = robot_pos - robot_mat = robot_matrix - - pos_3vec = np.concatenate([pos, [0]]) # Add a zero z-coordinate - robot_3vec = np.concatenate([robot_3vec, [0]]) - world_3vec = pos_3vec - robot_3vec - return np.matmul(world_3vec, robot_mat)[:2] - - def obs_lidar_pseudo( - self, robot_matrix, robot_pos, positions - ): # pylint: disable=too-many-locals - ''' - Return a robot-centric lidar observation of a list of positions. - - Lidar is a set of bins around the robot (divided evenly in a circle). - The detection directions are exclusive and exhaustive for a full 360 view. - Each bin reads 0 if there are no objects in that direction. - If there are multiple objects, the distance to the closest one is used. - Otherwise the bin reads the fraction of the distance towards the robot. - - E.g. if the object is 90% of lidar_max_dist away, the bin will read 0.1, - and if the object is 10% of lidar_max_dist away, the bin will read 0.9. 
- (The reading can be thought of as "closeness" or inverse distance) - - This encoding has some desirable properties: - - bins read 0 when empty - - bins smoothly increase as objects get close - - maximum reading is 1.0 (where the object overlaps the robot) - - close objects occlude far objects - - constant size observation with variable numbers of objects - ''' - lidar_num_bins = 16 - lidar_max_dist = 3 - obs = np.zeros(lidar_num_bins) - lidar_exp_gain = 1.0 - lidar_alias = True - for pos in positions: - pos = np.asarray(pos) - if pos.shape == (3,): - pos = pos[:2] # Truncate Z coordinate - position_z = np.complex( - *self.ego_xy(robot_matrix, robot_pos, pos) - ) # X, Y as real, imaginary components - dist = np.abs(position_z) - angle = np.angle(position_z) % (np.pi * 2) - bin_size = (np.pi * 2) / lidar_num_bins - sensor_bin = int(angle / bin_size) - bin_angle = bin_size * sensor_bin - if lidar_max_dist is None: - sensor = np.exp(-lidar_exp_gain * dist) - else: - sensor = max(0, lidar_max_dist - dist) / lidar_max_dist - obs[sensor_bin] = max(obs[sensor_bin], sensor) - # Aliasing - if lidar_alias: - alias = (angle - bin_angle) / bin_size - assert ( - 0 <= alias <= 1 - ), f'bad alias {alias}, dist {dist}, angle {angle}, bin {sensor_bin}' - bin_plus = (sensor_bin + 1) % lidar_num_bins - bin_minus = (sensor_bin - 1) % lidar_num_bins - obs[bin_plus] = max(obs[bin_plus], alias * sensor) - obs[bin_minus] = max(obs[bin_minus], (1 - alias) * sensor) - return obs - - def make_observation(self, state, lidar): - """Get observation""" - state = list(state) - lidar = list(lidar) - base_state = state[self.key_to_slice['base_state']] - obs = base_state + lidar + state[self.key_to_slice['robot']] - - return obs - - def generate_lidar(self, obs): - """Get lidar observation""" - robot_matrix_x_y = obs[self.key_to_slice['robot_m']] - robot_matrix_x = robot_matrix_x_y[0] - robot_matrix_y = robot_matrix_x_y[1] - first_row = [robot_matrix_x, robot_matrix_y, 0] - second_row = [-robot_matrix_y, robot_matrix_x, 0] - third_row = [0, 0, 1] - robot_matrix = [first_row, second_row, third_row] - robot_pos = obs[self.key_to_slice['robot']] - lidar_vec = self.obs_lidar_pseudo(robot_matrix, robot_pos, self.hazards_position) - obs_vec = self.make_observation(obs, lidar_vec) - return obs_vec diff --git a/omnisafe/wrappers/saute_wrapper.py b/omnisafe/wrappers/saute_wrapper.py deleted file mode 100644 index faffdcf38..000000000 --- a/omnisafe/wrappers/saute_wrapper.py +++ /dev/null @@ -1,282 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
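The pseudo-lidar encoding described above maps each object to an angular bin and stores a "closeness" reading ``max(0, lidar_max_dist - dist) / lidar_max_dist``, keeping the closest object per bin. A minimal numpy sketch of just that part (the aliasing into neighbouring bins done by ``obs_lidar_pseudo`` is omitted; ``pseudo_lidar`` is an illustrative name)::

    import numpy as np

    lidar_num_bins = 16
    lidar_max_dist = 3.0


    def pseudo_lidar(ego_positions: np.ndarray) -> np.ndarray:
        """One closeness reading per angular bin; no aliasing."""
        obs = np.zeros(lidar_num_bins)
        bin_size = 2 * np.pi / lidar_num_bins
        for x, y in ego_positions:
            dist = np.hypot(x, y)
            angle = np.arctan2(y, x) % (2 * np.pi)
            closeness = max(0.0, lidar_max_dist - dist) / lidar_max_dist
            idx = int(angle / bin_size)
            obs[idx] = max(obs[idx], closeness)        # closest object wins per bin
        return obs


    readings = pseudo_lidar(np.array([[0.3, 0.0], [2.0, 2.0], [-1.0, 0.1]]))
    assert readings.max() <= 1.0
    assert np.isclose(readings[0], (3.0 - 0.3) / 3.0)  # object straight ahead, 0.3 away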
-# ============================================================================== -"""Environment wrapper for saute algorithms.""" - -from dataclasses import dataclass - -import numpy as np -import torch -from gymnasium import spaces - -from omnisafe.common.normalizer import Normalizer -from omnisafe.common.record_queue import RecordQueue -from omnisafe.typing import NamedTuple, Optional -from omnisafe.utils.tools import as_tensor, expand_dims -from omnisafe.wrappers.cmdp_wrapper import CMDPWrapper -from omnisafe.wrappers.wrapper_registry import WRAPPER_REGISTRY - - -@dataclass -class RolloutLog: - """Log for roll out.""" - - ep_ret: np.ndarray - ep_costs: np.ndarray - ep_len: np.ndarray - ep_budget: np.ndarray - - -@dataclass -class SauteData: - """Data for Saute RL.""" - - safety_budget: float - unsafe_reward: float - safety_obs: np.ndarray - - -@dataclass -class RolloutData: - """Data for roll out.""" - - local_steps_per_epoch: int - max_ep_len: int - use_cost: bool - current_obs: torch.Tensor - rollout_log: RolloutLog - saute_data: SauteData - - -@WRAPPER_REGISTRY.register -class SauteWrapper(CMDPWrapper): - r"""SauteEnvWrapper. - - Saute is a safe RL algorithm that uses state augmentation to ensure safety. - The state augmentation is the concatenation of the original state and the safety state. - The safety state is the safety budget minus the cost divided by the safety budget. - - .. note:: - - If the safety state is greater than 0, the reward is the original reward. - - If the safety state is less than 0, the reward is the unsafe reward (always 0 or less than 0). - - ``omnisafe`` provides two implementations of Saute RL: :class:`PPOSaute` and :class:`PPOLagSaute`. - - References: - - - Title: Saute RL: Almost Surely Safe Reinforcement Learning Using State Augmentation - - Authors: Aivar Sootla, Alexander I. Cowen-Rivers, Taher Jafferjee, Ziyan Wang, - David Mguni, Jun Wang, Haitham Bou-Ammar. - - URL: https://arxiv.org/abs/2202.06558 - """ - - def __init__(self, env_id, cfgs: Optional[NamedTuple] = None, **env_kwargs) -> None: - """Initialize environment wrapper. - - Args: - env_id (str): environment id. - cfgs (collections.namedtuple): configs. - env_kwargs (dict): The additional parameters of environments. 
- """ - super().__init__(env_id, cfgs, **env_kwargs) - if hasattr(self.env, '_max_episode_steps'): - max_ep_len = self.env._max_episode_steps - else: - max_ep_len = 1000 - if cfgs.scale_safety_budget: - safety_budget = ( - cfgs.safety_budget - * (1 - self.cfgs.saute_gamma**max_ep_len) - / (1 - self.cfgs.saute_gamma) - / np.float32(max_ep_len) - * np.ones((self.cfgs.num_envs, 1)) - ) - else: - safety_budget = cfgs.safety_budget * np.ones((self.cfgs.num_envs, 1)) - safety_obs = np.ones((self.cfgs.num_envs, 1), dtype=np.float32) - self.rollout_data = RolloutData( - 0.0, - max_ep_len, - False, - None, - RolloutLog( - np.zeros(self.cfgs.num_envs), - np.zeros(self.cfgs.num_envs), - np.zeros(self.cfgs.num_envs), - np.zeros((self.cfgs.num_envs, 1)), - ), - SauteData( - safety_budget=safety_budget, - unsafe_reward=cfgs.unsafe_reward, - safety_obs=safety_obs, - ), - ) - high = np.array(np.hstack([self.observation_space.high, np.inf]), dtype=np.float32) - low = np.array(np.hstack([self.observation_space.low, np.inf]), dtype=np.float32) - self.observation_space = spaces.Box(high=high, low=low) - self.obs_normalizer = ( - Normalizer(shape=(self.cfgs.num_envs, self.observation_space.shape[0]), clip=5).to( - self.cfgs.device - ) - if self.cfgs.normalized_obs - else None - ) - self.record_queue = RecordQueue( - 'ep_ret', 'ep_cost', 'ep_len', 'ep_budget', maxlen=self.cfgs.max_len - ) - self.rollout_data.current_obs = self.reset()[0] - - def augment_obs(self, obs: np.ndarray) -> np.ndarray: - """Augmenting the obs with the safety obs. - - Detailedly, the augmented obs is the concatenation of the original obs and the safety obs. - The safety obs is the safety budget minus the cost divided by the safety budget. - - Args: - obs (np.ndarray): observation. - safety_obs (np.ndarray): safety observation. - """ - augmented_obs = np.hstack([obs, self.rollout_data.saute_data.safety_obs]) - return augmented_obs - - def safety_step(self, cost: np.ndarray, done: bool) -> np.ndarray: - """Update the normalized safety obs. - - Args: - cost (np.ndarray): cost. - """ - if done: - self.rollout_data.saute_data.safety_obs = np.ones( - (self.cfgs.num_envs, 1), dtype=np.float32 - ) - else: - self.rollout_data.saute_data.safety_obs -= ( - cost / self.rollout_data.saute_data.safety_budget - ) - self.rollout_data.saute_data.safety_obs /= self.cfgs.saute_gamma - - def safety_reward(self, reward: np.ndarray) -> np.ndarray: - """Update the reward. - - Args: - reward (np.ndarray): reward. - next_safety_obs (np.ndarray): next safety observation. - """ - for idx, safety_obs in enumerate(self.rollout_data.saute_data.safety_obs): - if safety_obs <= 0: - reward[idx] = self.rollout_data.saute_data.unsafe_reward - return reward - - def reset(self) -> tuple((torch.Tensor, dict)): - """Reset environment. - - .. note:: - The safety obs is initialized to 1.0. - - Args: - seed (int): seed for environment reset. - """ - obs, info = self.env.reset() - if self.cfgs.num_envs == 1: - obs = expand_dims(obs) - info = [info] - self.rollout_data.saute_data.safety_obs = np.ones((self.cfgs.num_envs, 1), dtype=np.float32) - obs = self.augment_obs(obs) - return torch.as_tensor(obs, dtype=torch.float32, device=self.cfgs.device), info - - def step( - self, action: torch.Tensor - ) -> tuple((torch.Tensor, torch.Tensor, torch.Tensor, bool, dict)): - """Step environment. - - .. note:: - The safety obs is updated by the cost. - The reward is updated by the safety obs. 
- Detailedly, the reward is the original reward if the safety obs is greater than 0, - otherwise the reward is the unsafe reward. - - Args: - action (torch.Tensor): action. - """ - next_obs, reward, cost, terminated, truncated, info = self.env.step( - action.cpu().numpy().squeeze() - ) - if self.cfgs.num_envs == 1: - next_obs, reward, cost, terminated, truncated, info = expand_dims( - next_obs, reward, cost, terminated, truncated, info - ) - self.safety_step(cost, done=terminated | truncated) - if terminated | truncated: - augmented_obs, info = self.reset() - else: - augmented_obs = self.augment_obs(next_obs) - else: - augmented_obs = self.augment_obs(next_obs) - self.rollout_data.rollout_log.ep_ret += reward - self.rollout_data.rollout_log.ep_costs += cost - self.rollout_data.rollout_log.ep_len += np.ones(self.cfgs.num_envs) - self.rollout_data.rollout_log.ep_budget += self.rollout_data.saute_data.safety_obs - reward = self.safety_reward(reward) - return ( - as_tensor(augmented_obs, reward, cost, device=self.cfgs.device), - terminated, - truncated, - info, - ) - - def reset_log( - self, - idx, - ) -> None: - ( - self.rollout_data.rollout_log.ep_ret[idx], - self.rollout_data.rollout_log.ep_costs[idx], - self.rollout_data.rollout_log.ep_len[idx], - self.rollout_data.rollout_log.ep_budget[idx], - ) = (0.0, 0.0, 0.0, 0.0) - - def rollout_log( - self, - logger, - idx, - is_train: bool = True, - ) -> None: - """Log the information of the rollout.""" - self.record_queue.append( - ep_ret=self.rollout_data.rollout_log.ep_ret[idx], - ep_cost=self.rollout_data.rollout_log.ep_costs[idx], - ep_len=self.rollout_data.rollout_log.ep_len[idx], - ep_budget=self.rollout_data.rollout_log.ep_budget[idx], - ) - avg_ep_ret, avg_ep_cost, avg_ep_len, avg_ep_budget = self.record_queue.get_mean( - 'ep_ret', 'ep_cost', 'ep_len', 'ep_budget' - ) - if is_train: - logger.store( - **{ - 'Metrics/EpRet': avg_ep_ret, - 'Metrics/EpCost': avg_ep_cost, - 'Metrics/EpLen': avg_ep_len, - 'Metrics/EpBudget': avg_ep_budget, - } - ) - else: - logger.store( - **{ - 'Test/EpRet': avg_ep_ret, - 'Test/EpCost': avg_ep_cost, - 'Test/EpLen': avg_ep_len, - 'Test/EpBudget': avg_ep_budget, - } - ) diff --git a/omnisafe/wrappers/simmer_wrapper.py b/omnisafe/wrappers/simmer_wrapper.py deleted file mode 100644 index 068f45137..000000000 --- a/omnisafe/wrappers/simmer_wrapper.py +++ /dev/null @@ -1,688 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY pid_kiND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Environment wrapper for Simmer algorithm.""" - -import copy -from dataclasses import dataclass -from typing import Dict, Tuple - -import numpy as np -import torch -from gymnasium import spaces - -from omnisafe.common.normalizer import Normalizer -from omnisafe.common.record_queue import RecordQueue -from omnisafe.typing import NamedTuple, Optional -from omnisafe.utils.tools import as_tensor, expand_dims -from omnisafe.wrappers.cmdp_wrapper import CMDPWrapper -from omnisafe.wrappers.wrapper_registry import WRAPPER_REGISTRY - - -@dataclass -class RolloutLog: - """Log for roll out.""" - - ep_ret: np.ndarray = 0.0 - ep_costs: np.ndarray = 0.0 - ep_len: np.ndarray = 0.0 - ep_budget: np.ndarray = 0.0 - - -@dataclass -class SimmerData: - """Data for Simmer RL.""" - - safety_budget: float = 0.0 - upper_budget: float = 0.0 - lower_budget: float = 0.0 - relative_budget: float = 0.0 - unsafe_reward: float = 0.0 - safety_obs: np.ndarray = None - - -@dataclass -class RolloutData: - """Data for roll out.""" - - local_steps_per_epoch: int = 0 - max_ep_len: int = 0 - use_cost: bool = False - current_obs: np.ndarray = 0.0 - rollout_log: RolloutLog = None - simmer_data: SimmerData = None - - -@dataclass -class PidData: - """Data for PID controller.""" - - pid_kp: float - pid_ki: float - pid_kd: float - tau: float - step_size: float - - -@dataclass -class QData: - """Data for Q controller.""" - - state_dim: int - action_dim: int - tau: float - threshold: float - learning_rate: float - epsilon: float - - -@dataclass -class QTable: - """Q table for Q controller.""" - - action_space: np.ndarray - q_function: np.ndarray - state_space: np.ndarray - - -# pylint: disable-next=too-many-instance-attributes -class PidController: - """Using PID controller to control the safety budget in Simmer environment.""" - - def __init__( - self, - cfgs: NamedTuple, - safety_budget: float = 25.0, - lower_budget: float = 1.0, - upper_budget: float = 25.0, - ) -> None: - """Initialize the PID controller. - - Args: - cfgs (CfgNode): Configurations. - safety_budget (float): The initial safety budget. - lower_budget (float): The lower bound of safety budget. - upper_budget (float): The upper bound of safety budget. - """ - self.pid_data = PidData( - pid_kp=cfgs.pid_kp, - pid_ki=cfgs.pid_ki, - pid_kd=cfgs.pid_kd, - tau=cfgs.tau, - step_size=cfgs.step_size, - ) - self.simmer_data = SimmerData( - safety_budget=safety_budget, - upper_budget=upper_budget, - lower_budget=lower_budget, - ) - - # initialize the PID controller. - self.error = 0.0 - self.error_i = 0.0 - self.prev_action = 0 - self.prev_raw_action = 0 - self._init_check() - - def _init_check(self) -> None: - """Check the initial value of PID controller.""" - assert self.pid_data.pid_kp >= 0, 'pid_kp should be non-negative.' - assert self.pid_data.pid_ki >= 0, 'pid_ki should be non-negative.' - assert self.pid_data.pid_kd >= 0, 'pid_kd should be non-negative.' - assert self.pid_data.tau >= 0 and self.pid_data.tau <= 1, 'tau should be in [0, 1].' - assert self.pid_data.step_size > 0, 'step_size should be positive.' - assert ( - self.simmer_data.safety_budget >= self.simmer_data.lower_budget - ), 'safety_budget should be larger than lower_budget.' - assert ( - self.simmer_data.safety_budget <= self.simmer_data.upper_budget - ), 'safety_budget should be smaller than upper_budget.' - - def compute_raw_action(self, obs: float) -> float: - r"""Compute the raw action based on current obs. 
- - Detailedly, the raw action is computed by the PID controller. - - .. math:: - a = K_p e_p + K_i \int e_p dt + K_d \frac{de_p}{dt} - - where :math:`e_p` is the error of the PID controller. - - Args: - obs (float): The current observation. - """ - # low pass filter. - error_p = self.pid_data.tau * self.error + (1 - self.pid_data.tau) * ( - self.simmer_data.safety_budget - obs - ) - self.error_i += self.error - error_d = self.pid_data.pid_kd * (self.prev_action - self.prev_raw_action) - - # compute PID error. - curr_raw_action = ( - self.pid_data.pid_kp * error_p - + self.pid_data.pid_ki * self.error_i - + self.pid_data.pid_kd * error_d - ) - return curr_raw_action - - def act(self, obs: float) -> float: - """Compute the safety budget based on the observation ``Jc``, following the several steps: - - - Compute the raw action based on the observation ``Jc``. - - Clip the raw action. - - Compute the safety budget. - - Args: - obs (float): The current observation. - """ - curr_raw_action = self.compute_raw_action(obs) - - # clip the raw action. - curr_action = np.clip(curr_raw_action, -self.pid_data.step_size, self.pid_data.step_size) - self.prev_action = curr_action - self.prev_raw_action = curr_raw_action - raw_budget = self.simmer_data.safety_budget + curr_action - - # clip the safety budget. - self.simmer_data.safety_budget = np.clip( - raw_budget, self.simmer_data.lower_budget, self.simmer_data.upper_budget - ) - - return self.simmer_data.safety_budget - - -# pylint: disable-next=too-many-instance-attributes -class QController: - """Using Q-learning to control the safety budget in Simmer environment.""" - - def __init__( - self, - cfgs, - safety_budget: float = 25.0, - lower_budget: float = 1.0, - upper_budget: float = 25.0, - ) -> None: - """ " - Initialize the Q-learning controller. - - Args: - cfgs (CfgNode): The config file. - safety_budget (float): The initial safety budget. - lower_budget (float): The lower bound of the safety budget. - upper_budget (float): The upper bound of the safety budget. - """ - self.safety_budget = safety_budget - self.q_data = QData( - state_dim=cfgs.state_dim, - action_dim=cfgs.act_dim, - tau=cfgs.tau, - threshold=cfgs.threshold, - learning_rate=cfgs.q_lr, - epsilon=cfgs.epsilon, - ) - self.q_table = QTable( - action_space=np.linspace(-1, 1, cfgs.act_dim, dtype=int), - q_function=np.zeros((cfgs.state_dim, cfgs.act_dim)), - state_space=np.linspace(lower_budget, upper_budget, cfgs.state_dim), - ) - self.action = 0 - self.step(self.action) - - # initialize the observation (Cost value per epoch) buffer. - self.prev_obs = copy.copy(self.safety_budget) - self.filtered_obs_buffer = [] - self.filtered_obs = 0 - self._init_check() - - def _init_check(self) -> None: - """Check the initial value of Q-learning controller.""" - assert self.q_data.state_dim > 0, 'state_dim should be positive.' - assert self.q_data.action_dim > 0, 'action_dim should be positive.' - assert self.q_data.tau >= 0 and self.q_data.tau <= 1, 'tau should be in [0, 1].' - assert self.q_data.threshold >= 0, 'threshold should be non-negative.' - assert self.q_data.learning_rate > 0, 'learning_rate should be positive.' - assert self.q_data.epsilon >= 0 and self.q_data.epsilon <= 1, 'epsilon should be in [0, 1].' - - def get_state_idx(self, state: float) -> int: - """Get the state index. - - Args: - state (float): The current state. 
- """ - state_idx = np.argwhere(self.q_table.state_space == state)[0][0] - return state_idx - - def get_action_idx(self, action: float) -> int: - """Get the action index. - - Args: - action (float): The current action. - """ - action_idx = np.argwhere(self.q_table.action_space == action) - return action_idx - - def get_random_action(self) -> float: - """Get the random action. - - Returns: - float: The random action. - """ - action_idx = np.random.randint(0, self.q_data.action_dim) - return self.q_table.action_space[action_idx] - - def get_greedy_action(self, state: float) -> float: - """Get the greedy action. - - Args: - state (float): The current state(``cost_limit``). - """ - state_idx = self.get_state_idx(state) - action_idx = np.argmax(self.q_table.q_function[state_idx, :]) - action = self.q_table.action_space[action_idx] - return action - - def update_q_function( - self, state: float, action: float, reward: float, next_state: float - ) -> None: - """Update the Q function using the Bellman equation. - - Detailedly, the Q function is updated as follows: - - .. math:: - Q(s, a) = (1 - \\alpha) Q(s, a) + \\alpha (r + \\tau \\max_{a'} Q(s', a')) - - where :math:`s` is the current state, :math:`a` is the current action, - :math:`r` is the reward, :math:`s'` is the next state, - :math:`\\alpha` is the learning rate, - and :math:`\\tau` is the discount factor. - - Args: - state (float): The current state. - action (float): The current action. - reward (float): The reward. - next_state (float): The next state. - """ - state_idx = self.get_state_idx(state) - action_idx = self.get_action_idx(action) - next_state_idx = self.get_state_idx(next_state) - self.q_table.q_function[state_idx, action_idx] = ( - 1 - self.q_data.learning_rate - ) * self.q_table.q_function[state_idx, action_idx] + self.q_data.learning_rate * ( - reward + self.q_data.tau * np.max(self.q_table.q_function[next_state_idx, :]) - ) - - def step(self, action: float) -> float: - """Step the environment. - - Args: - action (float): The current action. - """ - state_idx = self.get_state_idx(self.safety_budget) - state_idx = np.clip(state_idx + action, 0, self.q_data.state_dim - 1, dtype=int) - self.safety_budget = self.q_table.state_space[state_idx] - return self.safety_budget - - def reward(self, state: float, action: float, obs: float) -> float: - r"""Get the reward function based on whether the observation is within the threshold. - - Detailedly, the reward function is defined as follows: - - .. list-table:: - - * - States - - Increase - - No change - - Decrease - * - Unsafe - - -1 - - -1 - - 2 - * - Safe - - 0.5 - - 1 - - -1 - * - Very Safe - - 0.5 - - 1 - - -1 - - Args: - state (float): The current state. - action (float): The current action. - obs (float): The observation. - """ - action_idx = self.get_action_idx(action) - if int(self.q_data.threshold > obs - state and obs - state > -self.q_data.threshold): - reward = np.array([-1, 1, 0.5])[action_idx] - elif int(obs - state <= -self.q_data.threshold): - reward = np.array([-1, 0, 2])[action_idx] - elif int(obs - state >= self.q_data.threshold): - reward = np.array([2, -1, -1])[action_idx] - return reward[0] - - def act(self, obs: float) -> float: - """Compute the safety budget based on the observation ``Jc``, following the several steps: - - - Filter the observation using a low-pass filter. - - Use epsilon greedy to explore the environment. - - Update the Q function by calling :meth:`update_q_function`. - - Return the safety budget. 
- - Args: - obs (float): The current observation. - - """ - prev_obs = self.filtered_obs - self.filtered_obs = self.q_data.tau * prev_obs + (1 - self.q_data.tau) * obs - self.filtered_obs_buffer.append(self.filtered_obs) - state = self.safety_budget - - # use epsilon greedy to explore the environment - epsilon = np.random.random() - if epsilon > self.q_data.epsilon: - action = self.get_random_action() - else: - action = self.get_greedy_action(state) - reward = self.reward(state, action, self.filtered_obs) - next_state = self.step(action) - safety_budget = next_state - - # update the Q function - self.update_q_function(state, action, reward, next_state) - return safety_budget - - -@WRAPPER_REGISTRY.register -# pylint: disable-next=too-many-instance-attributes -class SimmerWrapper(CMDPWrapper): - r"""SimmerEnvWrapper. - - Simmer is a safe RL algorithm that uses a safety budget to control the exploration of the RL agent. - Similar to :class:`SauteEnvWrapper`, Simmer uses state augmentation to ensure safety. - Additionally, Simmer uses PID controller and Q learning controller to control the safety budget. - - .. note:: - - - If the safety state is greater than 0, the reward is the original reward. - - If the safety state is less than 0, the reward is the unsafe reward (always 0 or less than 0). - - ``omnisafe`` provides two implementations of Simmer RL: :class:`PPOSimmer` and :class:`PPOLagSimmer`. - - References: - - - Title: Saute RL: Almost Surely Safe Reinforcement Learning Using State Augmentation - - Authors: Aivar Sootla, Alexander I. Cowen-Rivers, Taher Jafferjee, Ziyan Wang, - David Mguni, Jun Wang, Haitham Bou-Ammar. - - URL: https://arxiv.org/abs/2202.06558 - - """ - - def __init__(self, env_id, cfgs: Optional[NamedTuple] = None, **env_kwargs) -> None: - """Initialize environment wrapper. - - Args: - env_id (str): environment id. - cfgs (collections.namedtuple): configs. - env_kwargs (dict): The additional parameters of environments. 
- """ - super().__init__(env_id, cfgs, **env_kwargs) - if hasattr(self.env, '_max_episode_steps'): - max_ep_len = self.env._max_episode_steps - else: - max_ep_len = 1000 - if cfgs.scale_safety_budget: - safety_budget = ( - cfgs.lower_budget - * (1 - cfgs.simmer_gamma**max_ep_len) - / (1 - cfgs.simmer_gamma) - / np.float32(max_ep_len) - ) - lower_budget = ( - cfgs.lower_budget - * (1 - cfgs.simmer_gamma**max_ep_len) - / (1 - cfgs.simmer_gamma) - / np.float32(max_ep_len) - ) - upper_budget = ( - cfgs.upper_budget - * (1 - cfgs.simmer_gamma**max_ep_len) - / (1 - cfgs.simmer_gamma) - / np.float32(max_ep_len) - ) - else: - safety_budget = cfgs.lower_budget - lower_budget = cfgs.lower_budget - upper_budget = cfgs.upper_budget - self.rollout_data = RolloutData( - 0.0, - max_ep_len, - False, - None, - RolloutLog( - np.zeros(self.cfgs.num_envs), - np.zeros(self.cfgs.num_envs), - np.zeros(self.cfgs.num_envs), - np.zeros((self.cfgs.num_envs, 1)), - ), - SimmerData( - safety_budget=safety_budget, - upper_budget=upper_budget, - lower_budget=lower_budget, - relative_budget=safety_budget / upper_budget, - unsafe_reward=cfgs.unsafe_reward, - safety_obs=safety_budget / upper_budget, - ), - ) - high = np.array(np.hstack([self.observation_space.high, np.inf]), dtype=np.float32) - low = np.array(np.hstack([self.observation_space.low, np.inf]), dtype=np.float32) - self.observation_space = spaces.Box(high=high, low=low) - self.obs_normalizer = ( - Normalizer(shape=(self.cfgs.num_envs, self.observation_space.shape[0]), clip=5).to( - device=self.cfgs.device - ) - if self.cfgs.normalized_obs - else None - ) - self.record_queue = RecordQueue( - 'ep_ret', 'ep_cost', 'ep_len', 'ep_budget', maxlen=self.cfgs.max_len - ) - if cfgs.simmer_controller == 'PID': - self.controller = PidController( - cfgs.controller_cfgs, - safety_budget=self.rollout_data.simmer_data.safety_budget, - lower_budget=self.rollout_data.simmer_data.lower_budget, - upper_budget=self.rollout_data.simmer_data.upper_budget, - ) - elif cfgs.simmer_controller == 'Q': - self.controller = QController( - cfgs.controller_cfgs, - safety_budget=self.rollout_data.simmer_data.safety_budget, - lower_budget=self.rollout_data.simmer_data.lower_budget, - upper_budget=self.rollout_data.simmer_data.upper_budget, - ) - else: - raise NotImplementedError( - f'Controller type {cfgs.simmer_controller} is not implemented.' - ) - self.rollout_data.current_obs = self.reset()[0] - - def _init_check(self) -> None: - super()._init_check() - assert ( - self.cfgs.simmer_gamma >= 0 and self.cfgs.simmer_gamma <= 1 - ), 'The simmer gamma should be in [0, 1].' - - def augment_obs(self, obs: np.ndarray) -> np.ndarray: - """Augmenting the obs with the safety obs. - - Detailedly, the augmented obs is the concatenation of the original obs and the safety obs. - The safety obs is the safety budget minus the cost divided by the safety budget. - - Args: - obs (np.ndarray): observation. - safety_obs (np.ndarray): safety observation. - """ - augmented_obs = np.hstack([obs, self.rollout_data.simmer_data.safety_obs]) - return augmented_obs - - def safety_step(self, cost: np.ndarray, done: bool) -> np.ndarray: - """Update the normalized safety obs. - - Args: - cost (np.ndarray): cost. 
- """ - if done: - self.rollout_data.simmer_data.safety_obs = np.ones( - (self.cfgs.num_envs, 1), dtype=np.float32 - ) - else: - self.rollout_data.simmer_data.safety_obs -= ( - cost / self.rollout_data.simmer_data.upper_budget - ) - self.rollout_data.simmer_data.safety_obs /= self.cfgs.simmer_gamma - - def safety_reward(self, reward: np.ndarray) -> np.ndarray: - """Update the reward. - - Args: - reward (np.ndarray): reward. - next_safety_obs (np.ndarray): next safety observation. - """ - for idx, safety_obs in enumerate(self.rollout_data.simmer_data.safety_obs): - if safety_obs <= 0: - reward[idx] = self.rollout_data.simmer_data.unsafe_reward - return reward - - def reset(self) -> Tuple[torch.Tensor, Dict]: - r"""Reset environment. - - .. note:: - The safety obs is initialized to ``rel_safety_budget``, - which is the safety budget divided by the upper budget. - The safety budget is controlled by the controller. - - Args: - seed (int): seed for environment reset. - """ - obs, info = self.env.reset() - if self.cfgs.num_envs == 1: - obs = expand_dims(obs) - info = [info] - self.rollout_data.simmer_data.relative_budget = ( - self.rollout_data.simmer_data.safety_budget / self.rollout_data.simmer_data.upper_budget - ) - self.rollout_data.simmer_data.safety_obs = ( - self.rollout_data.simmer_data.relative_budget - * np.ones((self.cfgs.num_envs, 1), dtype=np.float32) - ) - obs = self.augment_obs(obs) - return torch.as_tensor(obs, dtype=torch.float32, device=self.cfgs.device), info - - def step( - self, action: torch.Tensor - ) -> tuple((torch.Tensor, torch.Tensor, torch.Tensor, bool, dict)): - """Step environment. - - .. note:: - The safety obs is updated by the cost. - The reward is updated by the safety obs. - Detailedly, the reward is the original reward if the safety obs is greater than 0, - otherwise the reward is the unsafe reward. - - Args: - action (torch.Tensor): action. - """ - next_obs, reward, cost, terminated, truncated, info = self.env.step( - action.cpu().numpy().squeeze() - ) - if self.cfgs.num_envs == 1: - next_obs, reward, cost, terminated, truncated, info = expand_dims( - next_obs, reward, cost, terminated, truncated, info - ) - self.safety_step(cost, done=terminated | truncated) - if terminated | truncated: - augmented_obs, info = self.reset() - else: - augmented_obs = self.augment_obs(next_obs) - else: - augmented_obs = self.augment_obs(next_obs) - self.rollout_data.rollout_log.ep_ret += reward - self.rollout_data.rollout_log.ep_costs += cost - self.rollout_data.rollout_log.ep_len += np.ones(self.cfgs.num_envs) - self.rollout_data.rollout_log.ep_budget += self.rollout_data.simmer_data.safety_obs - reward = self.safety_reward(reward) - return ( - as_tensor(augmented_obs, reward, cost, device=self.cfgs.device), - terminated, - truncated, - info, - ) - - def set_budget(self, Jc): - """Set the safety budget by the controller. - - Args: - Jc (np.ndarray): The safety budget. 
- """ - self.rollout_data.simmer_data.safety_budget = self.controller.act(Jc) - - def rollout_log( - self, - logger, - idx, - is_train: bool = True, - ) -> None: - """Log the information of the rollout.""" - self.record_queue.append( - ep_ret=self.rollout_data.rollout_log.ep_ret[idx], - ep_cost=self.rollout_data.rollout_log.ep_costs[idx], - ep_len=self.rollout_data.rollout_log.ep_len[idx], - ep_budget=self.rollout_data.rollout_log.ep_budget[idx], - ) - avg_ep_ret, avg_ep_cost, avg_ep_len, avg_ep_budget = self.record_queue.get_mean( - 'ep_ret', 'ep_cost', 'ep_len', 'ep_budget' - ) - if is_train: - logger.store( - **{ - 'Metrics/EpRet': avg_ep_ret, - 'Metrics/EpCost': avg_ep_cost, - 'Metrics/EpLen': avg_ep_len, - 'Metrics/EpBudget': avg_ep_budget, - 'Metrics/SafetyBudget': self.rollout_data.simmer_data.safety_budget, - } - ) - self.set_budget(avg_ep_cost) - else: - logger.store( - **{ - 'Test/EpRet': avg_ep_ret, - 'Test/EpCost': avg_ep_cost, - 'Test/EpLen': avg_ep_len, - 'Test/EpBudget': avg_ep_budget, - 'Test/SafetyBudget': self.rollout_data.simmer_data.safety_budget, - } - ) - - def reset_log( - self, - idx, - ) -> None: - ( - self.rollout_data.rollout_log.ep_ret[idx], - self.rollout_data.rollout_log.ep_costs[idx], - self.rollout_data.rollout_log.ep_len[idx], - self.rollout_data.rollout_log.ep_budget[idx], - ) = (0.0, 0.0, 0.0, 0.0) diff --git a/omnisafe/wrappers/wrapper_registry.py b/omnisafe/wrappers/wrapper_registry.py deleted file mode 100644 index 7ff4e47c1..000000000 --- a/omnisafe/wrappers/wrapper_registry.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Registry for algorithms.""" - -import inspect - - -class WrapperRegistry: - """A registry to map strings to classes. - Args: - name (str): Registry name. - """ - - def __init__(self, name): - self._name = name - self._module_dict = {} - - def __repr__(self): - format_str = ( - self.__class__.__name__ + f'(name={self._name}, items={list(self._module_dict.keys())})' - ) - return format_str - - @property - def name(self): - """Return the name of the registry.""" - return self._name - - @property - def module_dict(self): - """Return a dict mapping names to classes.""" - return self._module_dict - - def get(self, key): - """Get the class that has been registered under the given key.""" - return self._module_dict.get(key, None) - - def _register_module(self, module_class): - """Register a module. - Args: - module (:obj:`nn.Module`): Module to be registered. 
- """ - if not inspect.isclass(module_class): - raise TypeError(f'module must be a class, but got {type(module_class)}') - module_name = module_class.__name__ - if module_name in self._module_dict: - raise KeyError(f'{module_name} is already registered in {self.name}') - self._module_dict[module_name] = module_class - - def register(self, cls): - """Register a module class.""" - self._register_module(cls) - return cls - - -WRAPPER_REGISTRY = WrapperRegistry('OmniSafe-Wrappers') - - -register = WRAPPER_REGISTRY.register -get = WRAPPER_REGISTRY.get diff --git a/pyproject.toml b/pyproject.toml index b2dfc8437..23502074b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,6 +34,7 @@ dependencies = [ "scipy >= 1.7.0", "joblib >= 1.2.0", "pyyaml >= 6.0", + "types-pyyaml >= 6.0", "xmltodict >= 0.13.0", "moviepy >= 1.0.0", "typing-extensions >= 4.0.0", diff --git a/tests/test_model.py b/tests/test_model.py index 98cc2d482..e29946407 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -25,8 +25,8 @@ from omnisafe.models import ActorBuilder, CriticBuilder from omnisafe.models.actor_critic import ActorCritic from omnisafe.models.actor_q_critic import ActorQCritic +from omnisafe.typing import Activation, InitFunction from omnisafe.utils.config import Config -from omnisafe.utils.model_utils import Activation, InitFunction @helpers.parametrize( diff --git a/tests/test_utils.py b/tests/test_utils.py deleted file mode 100644 index 533cd3ddf..000000000 --- a/tests/test_utils.py +++ /dev/null @@ -1,124 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Test Utils""" - -import os -import sys - -import numpy as np -import torch - -import helpers -import omnisafe -from omnisafe.common.experiment_grid import ExperimentGrid -from omnisafe.typing import NamedTuple, Tuple -from omnisafe.utils.core import discount_cumsum_torch -from omnisafe.utils.distributed_utils import mpi_fork, mpi_statistics_scalar -from omnisafe.utils.tools import to_ndarray - - -@helpers.parametrize(item=[1, 1.0, [1, 2, 3], (1, 2, 3), {'a': 1, 'b': 2}, torch.tensor([1, 2, 3])]) -def test_to_ndarray(item): - """Test to_ndarray.""" - if isinstance(item, torch.Tensor): - assert isinstance(to_ndarray(item), np.ndarray) - elif isinstance(item, list): - out_list = to_ndarray(item) - for val in out_list: - assert isinstance(val, np.ndarray) - elif isinstance(item, tuple): - out_tuple = to_ndarray(item) - for val in out_tuple: - assert isinstance(val, np.ndarray) - elif isinstance(item, dict): - out_dict = to_ndarray(item) - for val in out_dict.values(): - assert isinstance(val, np.ndarray) - else: - assert isinstance(to_ndarray(item), np.ndarray) - - -def get_answer(gamma: float) -> torch.Tensor: - """Input gamma and return the answer.""" - if gamma == 0.9: - return torch.tensor([11.4265, 11.5850, 10.6500, 8.5000, 5.0000], dtype=torch.float64) - elif gamma == 0.99: - return torch.tensor([14.6045, 13.7419, 11.8605, 8.9500, 5.0000], dtype=torch.float64) - elif gamma == 0.999: - return torch.tensor([14.9600, 13.9740, 11.9860, 8.9950, 5.0000], dtype=torch.float64) - - -@helpers.parametrize( - discount=[0.9, 0.99, 0.999], -) -def test_discount_cumsum_torch( - discount: float, -): - """Test discount_cumsum_torch.""" - x1 = torch.tensor([1.0, 2.0, 3.0, 4.0, 5.0], dtype=torch.float64) - y1 = get_answer(discount) - assert torch.allclose( - discount_cumsum_torch(x1, discount), y1 - ), 'discount_cumsum_torch is not correct' - - -def test_distributed_tools(): - """Test mpi_fork.""" - mpi_fork(2, test_message=['examples/train_from_custom_dict.py', '--parallel', '2']) - - -def train( - exp_id: str, algo: str, env_id: str, custom_cfgs: NamedTuple, num_threads: int = 6 -) -> Tuple[float, float, float]: - """Train a policy from exp-x config with OmniSafe. - - Args: - exp_id (str): Experiment ID. - algo (str): Algorithm to train. - env_id (str): The name of test environment. - custom_cfgs (NamedTuple): Custom configurations. - num_threads (int, optional): Number of threads. Defaults to 6. - """ - torch.set_num_threads(num_threads) - sys.stdout = sys.__stdout__ - sys.stderr = sys.__stderr__ - print(f'exp-x: {exp_id} is training...') - USE_REDIRECTION = True - if USE_REDIRECTION: - if not os.path.exists(custom_cfgs['data_dir']): - os.makedirs(custom_cfgs['data_dir']) - sys.stdout = open(f'{custom_cfgs["data_dir"]}terminal.log', 'w', encoding='utf-8') - sys.stderr = open(f'{custom_cfgs["data_dir"]}error.log', 'w', encoding='utf-8') - agent = omnisafe.Agent(algo, env_id, custom_cfgs=custom_cfgs) - reward, cost, ep_len = agent.learn() - return reward, cost, ep_len - - -def test_train( - exp_name='Safety_Gymnasium_Goal', - algo='CPO', - env_id='SafetyHalfCheetahVelocity-v4', - epochs=1, - steps_per_epoch=1000, - num_envs=1, -): - """Test train.""" - eg = ExperimentGrid(exp_name=exp_name) - eg.add('algo', [algo]) - eg.add('env_id', [env_id]) - eg.add('epochs', [epochs]) - eg.add('steps_per_epoch', [steps_per_epoch]) - eg.add('env_cfgs', [{'num_envs': num_envs}]) - eg.run(train, num_pool=1, is_test=True)
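The deleted `test_discount_cumsum_torch` above pins down the behaviour of the removed `discount_cumsum_torch` helper: a right-to-left discounted cumulative sum, y[t] = x[t] + discount * y[t + 1]. A minimal standalone sketch consistent with the expected values hard-coded in that test follows; the name `discount_cumsum` and the explicit reverse loop are illustrative assumptions, not the implementation that was removed.

import torch


def discount_cumsum(x: torch.Tensor, discount: float) -> torch.Tensor:
    """Right-to-left discounted cumulative sum: y[t] = x[t] + discount * y[t + 1]."""
    out = torch.zeros_like(x)
    running = 0.0
    for t in reversed(range(x.shape[0])):
        # accumulate from the end of the sequence towards the front
        running = x[t] + discount * running
        out[t] = running
    return out


if __name__ == '__main__':
    x = torch.tensor([1.0, 2.0, 3.0, 4.0, 5.0], dtype=torch.float64)
    # matches the deleted test's expectation for discount=0.9:
    # tensor([11.4265, 11.5850, 10.6500, 8.5000, 5.0000], dtype=torch.float64)
    print(discount_cumsum(x, 0.9))

Running the sketch with discount=0.99 or 0.999 likewise reproduces the other expected vectors from the deleted test, which is how the recurrence above was inferred.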