From c2fcffa684041e164ffd1c6cfc6d624bdc178e1e Mon Sep 17 00:00:00 2001 From: Ruiyang Sun Date: Fri, 24 Feb 2023 01:58:00 +0800 Subject: [PATCH] refactor: change architecture of omnisafe (#121) --- .github/workflows/ci.yml | 7 +- .pylintrc | 14 +- docs/source/spelling_wordlist.txt | 9 + omnisafe/__init__.py | 4 +- .../model_based => adapter}/__init__.py | 16 +- omnisafe/adapter/early_terminated_adapter.py | 49 + omnisafe/adapter/online_adapter.py | 125 +++ omnisafe/adapter/onpolicy_adapter.py | 136 +++ omnisafe/adapter/saute_adapter.py | 127 +++ omnisafe/adapter/simmer_adapter.py | 62 ++ omnisafe/algorithms/__init__.py | 26 +- omnisafe/algorithms/algo_wrapper.py | 61 +- omnisafe/algorithms/base_algo.py | 67 ++ omnisafe/algorithms/model_based/cap.py | 140 --- omnisafe/algorithms/model_based/mbppo_lag.py | 445 --------- .../model_based/models/dynamic_model.py | 405 -------- .../model_based/models/virtual_env.py | 249 ----- omnisafe/algorithms/model_based/planner.py | 919 ------------------ .../algorithms/model_based/policy_gradient.py | 304 ------ omnisafe/algorithms/model_based/safeloop.py | 311 ------ omnisafe/algorithms/off_policy/__init__.py | 44 - omnisafe/algorithms/off_policy/cvpo.py | 189 ---- omnisafe/algorithms/off_policy/ddpg_lag.py | 103 -- omnisafe/algorithms/on_policy/__init__.py | 19 +- .../algorithms/on_policy/base/natural_pg.py | 174 +--- .../on_policy/base/policy_gradient.py | 861 +++++----------- omnisafe/algorithms/on_policy/base/ppo.py | 51 +- omnisafe/algorithms/on_policy/base/trpo.py | 179 ++-- .../early_terminated/ppo_early_terminated.py | 16 +- .../ppo_lag_early_terminated.py | 15 +- .../algorithms/on_policy/first_order/cup.py | 235 ++--- .../on_policy/first_order/focops.py | 257 ++--- .../on_policy/naive_lagrange/crpo.py | 72 +- .../on_policy/naive_lagrange/pdo.py | 81 +- .../on_policy/naive_lagrange/ppo_lag.py | 84 +- .../on_policy/naive_lagrange/rcpo.py | 83 +- .../on_policy/naive_lagrange/trpo_lag.py | 83 +- .../on_policy/penalty_function/ipo.py | 42 +- .../on_policy/penalty_function/p3o.py | 97 +- .../on_policy/pid_lagrange/cppo_pid.py | 161 +-- .../on_policy/pid_lagrange/trpo_pid.py | 150 +-- .../on_policy/saute/ppo_lag_saute.py | 31 +- .../algorithms/on_policy/saute/ppo_saute.py | 30 +- .../algorithms/on_policy/second_order/cpo.py | 400 ++++---- .../algorithms/on_policy/second_order/pcpo.py | 330 ++----- .../algorithms/on_policy/simmer/__init__.py | 28 - .../on_policy/simmer/ppo_lag_simmer_pid.py | 48 - .../on_policy/simmer/ppo_lag_simmer_q.py | 48 - .../on_policy/simmer/ppo_simmer_pid.py | 47 - .../on_policy/simmer/ppo_simmer_q.py | 47 - omnisafe/common/buffer/onpolicy_buffer.py | 93 +- .../common/buffer/vector_onpolicy_buffer.py | 6 +- omnisafe/common/experiment_grid.py | 13 +- omnisafe/common/lagrange.py | 4 +- omnisafe/common/logger.py | 73 +- omnisafe/common/normalizer.py | 118 ++- omnisafe/common/pid_lagrange.py | 11 +- omnisafe/common/record_queue.py | 62 -- omnisafe/configs/on-policy/CPO.yaml | 58 +- omnisafe/configs/on-policy/CPPOPid.yaml | 62 +- omnisafe/configs/on-policy/CUP.yaml | 58 +- omnisafe/configs/on-policy/FOCOPS.yaml | 58 +- omnisafe/configs/on-policy/IPO.yaml | 58 +- omnisafe/configs/on-policy/NaturalPG.yaml | 58 +- omnisafe/configs/on-policy/OnCRPO.yaml | 58 +- omnisafe/configs/on-policy/P3O.yaml | 58 +- omnisafe/configs/on-policy/PCPO.yaml | 58 +- omnisafe/configs/on-policy/PDO.yaml | 63 +- omnisafe/configs/on-policy/PPO.yaml | 60 +- .../configs/on-policy/PPOEarlyTerminated.yaml | 60 +- omnisafe/configs/on-policy/PPOLag.yaml | 58 +- 
.../on-policy/PPOLagEarlyTerminated.yaml | 60 +- omnisafe/configs/on-policy/PPOLagSaute.yaml | 60 +- omnisafe/configs/on-policy/PPOSaute.yaml | 60 +- .../configs/on-policy/PolicyGradient.yaml | 64 +- omnisafe/configs/on-policy/RCPO.yaml | 58 +- omnisafe/configs/on-policy/TRPO.yaml | 58 +- omnisafe/configs/on-policy/TRPOLag.yaml | 58 +- omnisafe/configs/on-policy/TRPOPid.yaml | 60 +- .../model_based/models => envs}/__init__.py | 6 +- omnisafe/envs/core.py | 336 +++++++ omnisafe/envs/safety_gymnasium_env.py | 117 +++ omnisafe/envs/wrapper.py | 288 ++++++ omnisafe/evaluator.py | 318 ------ omnisafe/models/__init__.py | 11 +- omnisafe/models/actor/__init__.py | 5 +- omnisafe/models/actor/actor_builder.py | 141 +-- omnisafe/models/actor/categorical_actor.py | 134 --- omnisafe/models/actor/cholesky_actor.py | 160 --- omnisafe/models/actor/gaussian_actor.py | 227 +---- .../models/actor/gaussian_learning_actor.py | 87 ++ omnisafe/models/actor/gaussian_sac_actor.py | 76 ++ .../models/actor/gaussian_stdnet_actor.py | 166 ---- omnisafe/models/actor_critic.py | 164 ---- omnisafe/models/actor_critic/actor_critic.py | 154 +++ .../actor_critic/constraint_actor_critic.py | 117 +++ omnisafe/models/actor_q_critic.py | 178 ---- omnisafe/models/base.py | 157 +-- omnisafe/models/constraint_actor_critic.py | 113 --- omnisafe/models/constraint_actor_q_critic.py | 109 --- omnisafe/models/critic/critic_builder.py | 75 +- omnisafe/models/critic/q_critic.py | 83 +- omnisafe/models/critic/v_critic.py | 62 +- omnisafe/typing.py | 2 + omnisafe/utils/config.py | 36 +- omnisafe/utils/core.py | 65 -- .../{distributed_utils.py => distributed.py} | 186 ++-- omnisafe/utils/exp_grid_tools.py | 3 +- omnisafe/utils/{algo_utils.py => math.py} | 124 ++- omnisafe/utils/{model_utils.py => model.py} | 52 +- omnisafe/utils/online_mean_std.py | 116 --- omnisafe/utils/schedule.py | 93 ++ omnisafe/utils/tools.py | 124 +-- omnisafe/utils/vtrace.py | 80 -- omnisafe/wrappers/__init__.py | 44 - omnisafe/wrappers/early_terminated_wrapper.py | 77 -- omnisafe/wrappers/model_based_wrapper.py | 455 --------- omnisafe/wrappers/saute_wrapper.py | 282 ------ omnisafe/wrappers/simmer_wrapper.py | 688 ------------- omnisafe/wrappers/wrapper_registry.py | 72 -- pyproject.toml | 1 + tests/test_model.py | 2 +- tests/test_utils.py | 124 --- 123 files changed, 4361 insertions(+), 10655 deletions(-) rename omnisafe/{algorithms/model_based => adapter}/__init__.py (69%) create mode 100644 omnisafe/adapter/early_terminated_adapter.py create mode 100644 omnisafe/adapter/online_adapter.py create mode 100644 omnisafe/adapter/onpolicy_adapter.py create mode 100644 omnisafe/adapter/saute_adapter.py create mode 100644 omnisafe/adapter/simmer_adapter.py create mode 100644 omnisafe/algorithms/base_algo.py delete mode 100644 omnisafe/algorithms/model_based/cap.py delete mode 100644 omnisafe/algorithms/model_based/mbppo_lag.py delete mode 100644 omnisafe/algorithms/model_based/models/dynamic_model.py delete mode 100644 omnisafe/algorithms/model_based/models/virtual_env.py delete mode 100644 omnisafe/algorithms/model_based/planner.py delete mode 100644 omnisafe/algorithms/model_based/policy_gradient.py delete mode 100644 omnisafe/algorithms/model_based/safeloop.py delete mode 100644 omnisafe/algorithms/off_policy/__init__.py delete mode 100644 omnisafe/algorithms/off_policy/cvpo.py delete mode 100644 omnisafe/algorithms/off_policy/ddpg_lag.py delete mode 100644 omnisafe/algorithms/on_policy/simmer/__init__.py delete mode 100644 
omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_pid.py delete mode 100644 omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_q.py delete mode 100644 omnisafe/algorithms/on_policy/simmer/ppo_simmer_pid.py delete mode 100644 omnisafe/algorithms/on_policy/simmer/ppo_simmer_q.py delete mode 100644 omnisafe/common/record_queue.py rename omnisafe/{algorithms/model_based/models => envs}/__init__.py (77%) create mode 100644 omnisafe/envs/core.py create mode 100644 omnisafe/envs/safety_gymnasium_env.py create mode 100644 omnisafe/envs/wrapper.py delete mode 100644 omnisafe/evaluator.py delete mode 100644 omnisafe/models/actor/categorical_actor.py delete mode 100644 omnisafe/models/actor/cholesky_actor.py create mode 100644 omnisafe/models/actor/gaussian_learning_actor.py create mode 100644 omnisafe/models/actor/gaussian_sac_actor.py delete mode 100644 omnisafe/models/actor/gaussian_stdnet_actor.py delete mode 100644 omnisafe/models/actor_critic.py create mode 100644 omnisafe/models/actor_critic/actor_critic.py create mode 100644 omnisafe/models/actor_critic/constraint_actor_critic.py delete mode 100644 omnisafe/models/actor_q_critic.py delete mode 100644 omnisafe/models/constraint_actor_critic.py delete mode 100644 omnisafe/models/constraint_actor_q_critic.py delete mode 100644 omnisafe/utils/core.py rename omnisafe/utils/{distributed_utils.py => distributed.py} (64%) rename omnisafe/utils/{algo_utils.py => math.py} (50%) rename omnisafe/utils/{model_utils.py => model.py} (67%) delete mode 100644 omnisafe/utils/online_mean_std.py create mode 100644 omnisafe/utils/schedule.py delete mode 100644 omnisafe/utils/vtrace.py delete mode 100644 omnisafe/wrappers/__init__.py delete mode 100644 omnisafe/wrappers/early_terminated_wrapper.py delete mode 100644 omnisafe/wrappers/model_based_wrapper.py delete mode 100644 omnisafe/wrappers/saute_wrapper.py delete mode 100644 omnisafe/wrappers/simmer_wrapper.py delete mode 100644 omnisafe/wrappers/wrapper_registry.py delete mode 100644 tests/test_utils.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 37eeefea3..2933289bc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -61,10 +61,9 @@ jobs: run: | make addlicense - # TODO: enable this when ready - # - name: mypy - # run: | - # make mypy + - name: mypy + run: | + make mypy - name: Install dependencies run: | diff --git a/.pylintrc b/.pylintrc index 68ca39503..13c1bd408 100644 --- a/.pylintrc +++ b/.pylintrc @@ -289,10 +289,10 @@ exclude-too-few-public-methods= ignored-parents= # Maximum number of arguments for function / method. -max-args=5 +max-args=8 # Maximum number of attributes for a class (see R0902). -max-attributes=7 +max-attributes=12 # Maximum number of boolean expressions in an if statement (see R0916). max-bool-expr=5 @@ -301,22 +301,22 @@ max-bool-expr=5 max-branches=12 # Maximum number of locals for function / method body. -max-locals=15 +max-locals=20 # Maximum number of parents for a class (see R0901). -max-parents=7 +max-parents=12 # Maximum number of public methods for a class (see R0904). max-public-methods=20 # Maximum number of return / yield for function / method body. -max-returns=6 +max-returns=8 # Maximum number of statements in function / method body. -max-statements=50 +max-statements=80 # Minimum number of public methods for a class (see R0903). 
-min-public-methods=2 +min-public-methods=1 [EXCEPTIONS] diff --git a/docs/source/spelling_wordlist.txt b/docs/source/spelling_wordlist.txt index 7f8a2f617..653cf0d8b 100644 --- a/docs/source/spelling_wordlist.txt +++ b/docs/source/spelling_wordlist.txt @@ -369,3 +369,12 @@ noqa hyperparameters json msg +env's +CMDP +api +moviepy +normalizer +Unsqueeze +Golub +logp +loc diff --git a/omnisafe/__init__.py b/omnisafe/__init__.py index 3669e2ac5..71973dc66 100644 --- a/omnisafe/__init__.py +++ b/omnisafe/__init__.py @@ -17,7 +17,9 @@ from omnisafe import algorithms from omnisafe.algorithms import ALGORITHMS from omnisafe.algorithms.algo_wrapper import AlgoWrapper as Agent -from omnisafe.evaluator import Evaluator # from omnisafe.algorithms.env_wrapper import EnvWrapper as Env from omnisafe.version import __version__ + + +# from omnisafe.evaluator import Evaluator diff --git a/omnisafe/algorithms/model_based/__init__.py b/omnisafe/adapter/__init__.py similarity index 69% rename from omnisafe/algorithms/model_based/__init__.py rename to omnisafe/adapter/__init__.py index 54456b5b3..40b122bb1 100644 --- a/omnisafe/algorithms/model_based/__init__.py +++ b/omnisafe/adapter/__init__.py @@ -12,15 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Model-Based algorithms.""" +"""Adapter for the environment and the algorithm.""" -from omnisafe.algorithms.model_based.cap import CAP -from omnisafe.algorithms.model_based.mbppo_lag import MBPPOLag -from omnisafe.algorithms.model_based.safeloop import SafeLOOP - - -__all__ = [ - 'CAP', - 'MBPPOLag', - 'SafeLOOP', -] +from omnisafe.adapter.early_terminated_adapter import EarlyTerminatedAdapter +from omnisafe.adapter.online_adapter import OnlineAdapter +from omnisafe.adapter.onpolicy_adapter import OnPolicyAdapter +from omnisafe.adapter.saute_adapter import SauteAdapter diff --git a/omnisafe/adapter/early_terminated_adapter.py b/omnisafe/adapter/early_terminated_adapter.py new file mode 100644 index 000000000..4674d41a6 --- /dev/null +++ b/omnisafe/adapter/early_terminated_adapter.py @@ -0,0 +1,49 @@ +# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""OnPolicy Adapter for OmniSafe.""" + +from typing import Dict, Tuple + +import torch + +from omnisafe.adapter.onpolicy_adapter import OnPolicyAdapter +from omnisafe.utils.config import Config + + +class EarlyTerminatedAdapter(OnPolicyAdapter): + """OnPolicy Adapter for OmniSafe.""" + + def __init__(self, env_id: str, num_envs: int, seed: int, cfgs: Config) -> None: + assert num_envs == 1, 'EarlyTerminatedAdapter only supports num_envs=1.' 
+ + super().__init__(env_id, num_envs, seed, cfgs) + + self._cost_limit = cfgs.cost_limit + self._cost_logger = torch.zeros(self._env.num_envs) + + def step( + self, action: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Dict]: + next_obs, reward, cost, terminated, truncated, info = super().step(action) + + self._cost_logger += info.get('original_cost', cost) + + if self._cost_logger > self._cost_limit: + reward = torch.zeros(self._env.num_envs) # r_e = 0 + terminated = torch.ones(self._env.num_envs) + next_obs, _ = self._env.reset() + self._cost_logger = torch.zeros(self._env.num_envs) + + return next_obs, reward, cost, terminated, truncated, info diff --git a/omnisafe/adapter/online_adapter.py b/omnisafe/adapter/online_adapter.py new file mode 100644 index 000000000..f2439b508 --- /dev/null +++ b/omnisafe/adapter/online_adapter.py @@ -0,0 +1,125 @@ +# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Online Adapter for OmniSafe.""" + +from typing import Dict, Tuple + +import torch + +from omnisafe.envs.core import make, support_envs +from omnisafe.envs.wrapper import ( + ActionScale, + AutoReset, + CostNormalize, + ObsNormalize, + RewardNormalize, + TimeLimit, + Unsqueeze, +) +from omnisafe.typing import OmnisafeSpace +from omnisafe.utils.config import Config + + +class OnlineAdapter: + """Online Adapter for OmniSafe.""" + + def __init__( # pylint: disable=too-many-arguments + self, + env_id: str, + num_envs: int, + seed: int, + cfgs: Config, + ) -> None: + assert env_id in support_envs(), f'Env {env_id} is not supported.' + + self._env_id = env_id + self._env = make(env_id, num_envs=num_envs) + self._wrapper( + obs_normalize=cfgs.obs_normalize, + reward_normalize=cfgs.reward_normalize, + cost_normalize=cfgs.cost_normalize, + ) + self._env.set_seed(seed) + + self._cfgs = cfgs + + def _wrapper( + self, + obs_normalize: bool = True, + reward_normalize: bool = True, + cost_normalize: bool = True, + ): + if self._env.need_time_limit_wrapper: + self._env = TimeLimit(self._env, time_limit=1000) + if self._env.need_auto_reset_wrapper: + self._env = AutoReset(self._env) + if obs_normalize: + self._env = ObsNormalize(self._env) + if reward_normalize: + self._env = RewardNormalize(self._env) + if cost_normalize: + self._env = CostNormalize(self._env) + self._env = ActionScale(self._env, low=-1.0, high=1.0) + if self._env.num_envs == 1: + self._env = Unsqueeze(self._env) + + @property + def action_space(self) -> OmnisafeSpace: + """The action space of the environment. + + Returns: + OmnisafeSpace: the action space. + """ + return self._env.action_space + + @property + def observation_space(self) -> OmnisafeSpace: + """The observation space of the environment. + + Returns: + OmnisafeSpace: the observation space. 
+ """ + return self._env.observation_space + + def step( + self, action: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Dict]: + """Run one timestep of the environment's dynamics using the agent actions. + + Args: + action (torch.Tensor): action. + + Returns: + observation (torch.Tensor): agent's observation of the current environment. + reward (torch.Tensor): amount of reward returned after previous action. + cost (torch.Tensor): amount of cost returned after previous action. + terminated (torch.Tensor): whether the episode has ended, in which case further step() + calls will return undefined results. + truncated (torch.Tensor): whether the episode has been truncated due to a time limit. + info (Dict): contains auxiliary diagnostic information (helpful for debugging, and sometimes learning). + """ + return self._env.step(action) + + def reset(self) -> Tuple[torch.Tensor, Dict]: + """Resets the environment and returns an initial observation. + + Args: + seed (Optional[int]): seed for the environment. + + Returns: + observation (torch.Tensor): the initial observation of the space. + info (Dict): contains auxiliary diagnostic information (helpful for debugging, and sometimes learning). + """ + return self._env.reset() diff --git a/omnisafe/adapter/onpolicy_adapter.py b/omnisafe/adapter/onpolicy_adapter.py new file mode 100644 index 000000000..f816e20d4 --- /dev/null +++ b/omnisafe/adapter/onpolicy_adapter.py @@ -0,0 +1,136 @@ +# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""OnPolicy Adapter for OmniSafe.""" + +from typing import Dict, Optional + +import torch + +from omnisafe.adapter.online_adapter import OnlineAdapter +from omnisafe.common.buffer import VectorOnPolicyBuffer +from omnisafe.common.logger import Logger +from omnisafe.models.actor_critic.constraint_actor_critic import ConstraintActorCritic +from omnisafe.utils.config import Config + + +class OnPolicyAdapter(OnlineAdapter): + """OnPolicy Adapter for OmniSafe.""" + + def __init__( # pylint: disable=too-many-arguments + self, env_id: str, num_envs: int, seed: int, cfgs: Config + ) -> None: + super().__init__(env_id, num_envs, seed, cfgs) + + self._ep_ret: torch.Tensor + self._ep_cost: torch.Tensor + self._ep_len: torch.Tensor + self._reset_log() + + def roll_out( # pylint: disable=too-many-locals + self, + steps_per_epoch: int, + agent: ConstraintActorCritic, + buffer: VectorOnPolicyBuffer, + logger: Logger, + ) -> None: + """Roll out the environment and store the data in the buffer. + + Args: + steps_per_epoch (int): Number of steps per epoch. + agent (ConstraintActorCritic): Agent. + buf (VectorOnPolicyBuffer): Buffer. + logger (Logger): Logger. 
+ """ + self._reset_log() + + obs, _ = self.reset() + for step in range(steps_per_epoch): + act, value_r, value_c, logp = agent.step(obs) + next_obs, reward, cost, terminated, truncated, info = self.step(act) + + self._log_value(reward=reward, cost=cost, info=info) + + if self._cfgs.use_cost: + logger.store(**{'Value/cost': value_c}) + logger.store(**{'Value/reward': value_r}) + + buffer.store( + obs=obs, + act=act, + reward=reward, + cost=cost, + value_r=value_r, + value_c=value_c, + logp=logp, + ) + + obs = next_obs + dones = torch.logical_or(terminated, truncated) + epoch_end = step >= steps_per_epoch - 1 + for idx, done in enumerate(dones): + if epoch_end or done: + if epoch_end and not done: + logger.log( + f'Warning: trajectory cut off when rollout by epoch at {self._ep_len[idx]} steps.' + ) + _, last_value_r, last_value_c, _ = agent.step(obs[idx]) + last_value_r = last_value_r.unsqueeze(0) + last_value_c = last_value_c.unsqueeze(0) + elif done: + last_value_r = torch.zeros(1) + last_value_c = torch.zeros(1) + + self._log_metrics(logger, idx) + self._reset_log(idx) + + self._ep_ret[idx] = 0.0 + self._ep_cost[idx] = 0.0 + self._ep_len[idx] = 0.0 + + buffer.finish_path(last_value_r, last_value_c, idx) + + def _log_value( + self, + reward: torch.Tensor, + cost: torch.Tensor, + info: Dict, + **kwargs, # pylint: disable=unused-argument + ) -> None: # pylint: disable=unused-argument + """Log value.""" + self._ep_ret += info.get('original_reward', reward) + self._ep_cost += info.get('original_cost', cost) + self._ep_len += 1 + + def _log_metrics(self, logger: Logger, idx: int) -> None: + """Log metrics.""" + + logger.store( + **{ + 'Metrics/EpRet': self._ep_ret[idx], + 'Metrics/EpCost': self._ep_cost[idx], + 'Metrics/EpLen': self._ep_len[idx], + } + ) + + def _reset_log(self, idx: Optional[int] = None) -> None: + """Reset log.""" + if idx is None: + self._ep_ret = torch.zeros(self._env.num_envs) + self._ep_cost = torch.zeros(self._env.num_envs) + self._ep_len = torch.zeros(self._env.num_envs) + else: + self._ep_ret[idx] = 0.0 + self._ep_cost[idx] = 0.0 + self._ep_len[idx] = 0.0 diff --git a/omnisafe/adapter/saute_adapter.py b/omnisafe/adapter/saute_adapter.py new file mode 100644 index 000000000..1b65d60a8 --- /dev/null +++ b/omnisafe/adapter/saute_adapter.py @@ -0,0 +1,127 @@ +# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""OnPolicy Adapter for OmniSafe.""" + +from typing import Dict, Optional, Tuple + +import numpy as np +import torch +from gymnasium.spaces import Box + +from omnisafe.adapter.onpolicy_adapter import OnPolicyAdapter +from omnisafe.common.logger import Logger +from omnisafe.envs.wrapper import ActionScale, AutoReset, ObsNormalize, TimeLimit, Unsqueeze +from omnisafe.utils.config import Config + + +class SauteAdapter(OnPolicyAdapter): + """OnPolicy Adapter for OmniSafe.""" + + def __init__(self, env_id: str, num_envs: int, seed: int, cfgs: Config) -> None: + super().__init__(env_id, num_envs, seed, cfgs) + + self._safety_budget: torch.Tensor + self._safety_obs: torch.Tensor + + if self._cfgs.env_cfgs.scale_safety_budget: + self._safety_budget = ( + self._cfgs.env_cfgs.safety_budget + * (1 - self._cfgs.env_cfgs.saute_gamma**self._cfgs.env_cfgs.max_ep_len) + / (1 - self._cfgs.env_cfgs.saute_gamma) + / self._cfgs.env_cfgs.max_ep_len + * torch.ones(num_envs, 1) + ) + else: + self._safety_budget = self._cfgs.env_cfgs.safety_budget * torch.ones(num_envs, 1) + + self._ep_budget: torch.Tensor + + assert isinstance(self._env.observation_space, Box), 'Observation space must be Box' + self._observation_space = Box( + low=-np.inf, + high=np.inf, + shape=(self._env.observation_space.shape[0] + 1,), + ) + + @property + def observation_space(self) -> Box: + return self._observation_space + + def _wrapper( + self, + obs_normalize: bool = True, + reward_normalize: bool = False, + cost_normalize: bool = False, + ): + if self._env.need_time_limit_wrapper: + self._env = TimeLimit(self._env, time_limit=1000) + if self._env.need_auto_reset_wrapper: + self._env = AutoReset(self._env) + if obs_normalize: + self._env = ObsNormalize(self._env) + assert reward_normalize is False, 'Reward normalization is not supported' + assert cost_normalize is False, 'Cost normalization is not supported' + self._env = ActionScale(self._env, low=-1.0, high=1.0) + if self._env.num_envs == 1: + self._env = Unsqueeze(self._env) + + def reset(self) -> Tuple[torch.Tensor, Dict]: + obs, info = self._env.reset() + self._safety_obs = torch.ones(self._env.num_envs, 1) + obs = self._augment_obs(obs) + return obs, info + + def step( + self, action: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Dict]: + next_obs, reward, cost, terminated, truncated, info = self._env.step(action) + + self._safety_step(cost) + reward = self._safety_reward(reward) + + # autoreset the environment + done = torch.logical_or(terminated, truncated).float().unsqueeze(-1) + self._safety_obs = self._safety_obs * (1 - done) + done + + augmented_obs = self._augment_obs(next_obs) + + return augmented_obs, reward, cost, terminated, truncated, info + + def _safety_step(self, cost: torch.Tensor) -> None: + self._safety_obs -= cost.unsqueeze(-1) / self._safety_budget + self._safety_obs /= self._safety_budget + + def _safety_reward(self, reward: torch.Tensor) -> torch.Tensor: + safe = torch.as_tensor(self._safety_obs > 0, dtype=reward.dtype).squeeze(-1) + reward = safe * reward + (1 - safe) * self._cfgs.env_cfgs.unsafe_reward + return reward + + def _augment_obs(self, obs: torch.Tensor) -> torch.Tensor: + return torch.cat([obs, self._safety_obs], dim=-1) + + def _log_value(self, reward: torch.Tensor, cost: torch.Tensor, info: Dict, **kwargs) -> None: + super()._log_value(reward, cost, info, **kwargs) + self._ep_budget += self._safety_obs.squeeze(-1) 
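SauteAdapter implements the Sauté RL idea (Sootla et al., 2022): the remaining safety budget becomes part of the observation, is depleted by the normalized cost each step, and the reward is replaced by unsafe_reward once the budget runs out. A self-contained restatement of the budget bookkeeping on plain tensors, using the paper's update z' = (z - c/d) / gamma with gamma the saute discount (all numbers are illustrative, not this adapter's config):

    # Sketch of Saute-style budget bookkeeping (illustrative values only).
    import torch

    safety_budget, saute_gamma, unsafe_reward = 25.0, 0.999, -1.0
    obs = torch.zeros(1, 4)                      # stand-in for the raw observation
    z = torch.ones(1, 1)                         # normalized remaining budget, starts at 1
    reward, cost = torch.tensor([1.0]), torch.tensor([3.0])

    z = (z - cost.unsqueeze(-1) / safety_budget) / saute_gamma
    safe = (z > 0).to(reward.dtype).squeeze(-1)  # 1 while budget remains, else 0
    shaped_reward = safe * reward + (1 - safe) * unsafe_reward
    augmented_obs = torch.cat([obs, z], dim=-1)  # budget appended as an extra feature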
+ + def _reset_log(self, idx: Optional[int] = None) -> None: + super()._reset_log(idx) + if idx is None: + self._ep_budget = torch.zeros(self._env.num_envs) + else: + self._ep_budget[idx] = 0 + + def _log_metrics(self, logger: Logger, idx: int) -> None: + super()._log_metrics(logger, idx) + logger.store(**{'Metrics/EpBudget': self._ep_budget[idx]}) diff --git a/omnisafe/adapter/simmer_adapter.py b/omnisafe/adapter/simmer_adapter.py new file mode 100644 index 000000000..65f062bd6 --- /dev/null +++ b/omnisafe/adapter/simmer_adapter.py @@ -0,0 +1,62 @@ +# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""OnPolicy Adapter for OmniSafe.""" + +import numpy as np +import torch +from gymnasium.spaces import Box + +from omnisafe.adapter.onpolicy_adapter import OnPolicyAdapter +from omnisafe.adapter.saute_adapter import SauteAdapter +from omnisafe.utils.config import Config + + +class SimmerAdapter(SauteAdapter, OnPolicyAdapter): + """OnPolicy Adapter for OmniSafe.""" + + def __init__(self, env_id: str, num_envs: int, seed: int, cfgs: Config) -> None: + """Initialize the adapter.""" + super(OnPolicyAdapter, self).__init__(env_id, num_envs, seed, cfgs) + + self._safety_budget: torch.Tensor + self._safety_obs: torch.Tensor + + if self._cfgs.env_cfgs.scale_safety_budget: + self._safety_budget = ( + self._cfgs.env_cfgs.lower_budget + * (1 - self._cfgs.env_cfgs.saute_gamma**self._cfgs.env_cfgs.max_ep_len) + / (1 - self._cfgs.env_cfgs.saute_gamma) + / self._cfgs.env_cfgs.max_ep_len + ) + self._lower_budget = self._safety_budget + self._upper_budget = ( + self._cfgs.env_cfgs.upper_budget + * (1 - self._cfgs.env_cfgs.saute_gamma**self._cfgs.env_cfgs.max_ep_len) + / (1 - self._cfgs.env_cfgs.saute_gamma) + / self._cfgs.env_cfgs.max_ep_len + ) + else: + self._safety_budget = self._cfgs.env_cfgs.lower_budget + self._lower_budget = self._safety_budget + self._upper_budget = self._cfgs.env_cfgs.upper_budget + + self._ep_budget: torch.Tensor + + assert isinstance(self._env.observation_space, Box), 'Observation space must be Box' + self._observation_space = Box( + low=-np.inf, + high=np.inf, + shape=(self._env.observation_space.shape[0] + 1,), + ) diff --git a/omnisafe/algorithms/__init__.py b/omnisafe/algorithms/__init__.py index 5231479b6..9c74117ac 100644 --- a/omnisafe/algorithms/__init__.py +++ b/omnisafe/algorithms/__init__.py @@ -17,16 +17,11 @@ import itertools from types import MappingProxyType -from omnisafe.algorithms import model_based, off_policy, on_policy - -# Model-based Safe -from omnisafe.algorithms.model_based import CAP, MBPPOLag, SafeLOOP - -# Off-Policy Safe -from omnisafe.algorithms.off_policy import DDPG, SAC, SDDPG, TD3, DDPGLag, SACLag, TD3Lag +from omnisafe.algorithms import on_policy +from omnisafe.algorithms.base_algo import BaseAlgo # On-Policy Safe -from omnisafe.algorithms.on_policy import ( +from omnisafe.algorithms.on_policy import ( # 
PPOLagSimmerPid,; PPOLagSimmerQ,; PPOSimmerPid,; PPOSimmerQ, CPO, CUP, FOCOPS, @@ -43,20 +38,23 @@ PPOLag, PPOLagEarlyTerminated, PPOLagSaute, - PPOLagSimmerPid, - PPOLagSimmerQ, PPOSaute, - PPOSimmerPid, - PPOSimmerQ, TRPOLag, TRPOPid, ) +# Model-based Safe +# from omnisafe.algorithms.model_based import CAP, MBPPOLag, SafeLOOP + +# Off-Policy Safe +# from omnisafe.algorithms.off_policy import DDPG, SAC, SDDPG, TD3, DDPGLag, SACLag, TD3Lag + + ALGORITHMS = { - 'off-policy': tuple(off_policy.__all__), + # 'off-policy': tuple(off_policy.__all__), 'on-policy': tuple(on_policy.__all__), - 'model-based': tuple(model_based.__all__), + # 'model-based': tuple(model_based.__all__), } ALGORITHM2TYPE = { diff --git a/omnisafe/algorithms/algo_wrapper.py b/omnisafe/algorithms/algo_wrapper.py index d0ed9b2b1..1a75fc308 100644 --- a/omnisafe/algorithms/algo_wrapper.py +++ b/omnisafe/algorithms/algo_wrapper.py @@ -17,24 +17,32 @@ import difflib import os import sys +from typing import Any, Dict, Optional import psutil +import torch from safety_gymnasium.utils.registration import safe_registry from omnisafe.algorithms import ALGORITHM2TYPE, ALGORITHMS, registry -from omnisafe.utils import distributed_utils -from omnisafe.utils.config import check_all_configs, get_default_kwargs_yaml +from omnisafe.utils import distributed +from omnisafe.utils.config import get_default_kwargs_yaml class AlgoWrapper: """Algo Wrapper for algo.""" - def __init__(self, algo, env_id, parallel=1, custom_cfgs=None): + def __init__( + self, + algo: str, + env_id: str, + parallel: int = 1, + custom_cfgs: Optional[Dict[str, Any]] = None, + ): self.algo = algo self.parallel = parallel self.env_id = env_id # algo_type will set in _init_checks() - self.algo_type = None + self.algo_type: str self.custom_cfgs = custom_cfgs self.evaluator = None self._init_checks() @@ -55,7 +63,7 @@ def _init_checks(self): f"{self.env_id} doesn't exist. " f'Did you mean {difflib.get_close_matches(self.env_id, safe_registry, n=1)[0]}?' ) - self.algo_type = ALGORITHM2TYPE.get(self.algo, None) + self.algo_type = ALGORITHM2TYPE.get(self.algo, '') if self.algo_type is None or self.algo_type == '': raise ValueError(f'{self.algo} is not supported!') if self.algo_type in ['off-policy', 'model-based']: @@ -69,15 +77,17 @@ def learn(self): physical_cores = psutil.cpu_count(logical=False) use_number_of_threads = bool(self.parallel > physical_cores) + torch.set_num_threads(5) + cfgs = get_default_kwargs_yaml(self.algo, self.env_id, self.algo_type) exp_name = os.path.join(self.env_id, self.algo) cfgs.recurisve_update({'exp_name': exp_name, 'env_id': self.env_id}) if self.custom_cfgs is not None: cfgs.recurisve_update(self.custom_cfgs) - check_all_configs(cfgs, self.algo_type) + # check_all_configs(cfgs, self.algo_type) - if distributed_utils.mpi_fork( + if distributed.fork( self.parallel, use_number_of_threads=use_number_of_threads, device=cfgs.device ): # Re-launches the current script with workers linked by MPI @@ -87,22 +97,25 @@ def learn(self): cfgs=cfgs, ) agent.learn() - return agent.env.record_queue.get_mean('ep_ret', 'ep_cost', 'ep_len') + ep_ret = agent.logger.get_stats('Metrics/EpRet') + ep_len = agent.logger.get_stats('Metrics/EpLen') + ep_cost = agent.logger.get_stats('Metrics/EpCost') + return ep_ret, ep_len, ep_cost - def evaluate(self, num_episodes: int = 10, horizon: int = 1000, cost_criteria: float = 1.0): - """Agent Evaluation.""" - assert self.evaluator is not None, 'Please run learn() first!' 
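Even with the evaluator calls commented out for now, the public entry point stays the same: AlgoWrapper is exported as omnisafe.Agent, and learn() now reads its summary statistics back from the logger. A typical call, assuming safety-gymnasium is installed and using an example environment id:

    import omnisafe

    agent = omnisafe.Agent('PPOLag', 'SafetyPointGoal1-v0')  # algo name, then env id
    ep_ret, ep_len, ep_cost = agent.learn()                  # stats pulled from the Logger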
- self.evaluator.evaluate(num_episodes, horizon, cost_criteria) + # def evaluate(self, num_episodes: int = 10, horizon: int = 1000, cost_criteria: float = 1.0): + # """Agent Evaluation.""" + # assert self.evaluator is not None, 'Please run learn() first!' + # self.evaluator.evaluate(num_episodes, horizon, cost_criteria) - # pylint: disable-next=too-many-arguments - def render( - self, - num_episode: int = 0, - horizon: int = 1000, - seed: int = None, - play=True, - save_replay_path: str = None, - ): - """Render the environment.""" - assert self.evaluator is not None, 'Please run learn() first!' - self.evaluator.render(num_episode, horizon, seed, play, save_replay_path) + # # pylint: disable-next=too-many-arguments + # def render( + # self, + # num_episode: int = 0, + # horizon: int = 1000, + # seed: int = None, + # play=True, + # save_replay_path: Optional[str] = None, + # ): + # """Render the environment.""" + # assert self.evaluator is not None, 'Please run learn() first!' + # self.evaluator.render(num_episode, horizon, seed, play, save_replay_path) diff --git a/omnisafe/algorithms/base_algo.py b/omnisafe/algorithms/base_algo.py new file mode 100644 index 000000000..d28282115 --- /dev/null +++ b/omnisafe/algorithms/base_algo.py @@ -0,0 +1,67 @@ +# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementation of the Policy Gradient algorithm.""" + +from abc import ABC, abstractmethod + +import torch + +from omnisafe.utils import distributed +from omnisafe.utils.config import Config +from omnisafe.utils.tools import seed_all + + +class BaseAlgo(ABC): # pylint: disable=too-few-public-methods + """Base class for all algorithms.""" + + def __init__(self, env_id: str, cfgs: Config) -> None: + self._env_id = env_id + self._cfgs = cfgs + + assert hasattr(cfgs, 'seed'), 'Please specify the seed in the config file.' + self._seed = cfgs.seed + distributed.get_rank() * 1000 + seed_all(self._seed) + + assert hasattr(cfgs, 'device'), 'Please specify the device in the config file.' + self._device = torch.device(self._cfgs.device) + + distributed.setup_distributed() + + self._init_env() + self._init_model() + + self._init() + + self._init_log() + + @abstractmethod + def _init(self) -> None: + """Initialize the algorithm.""" + + @abstractmethod + def _init_env(self) -> None: + """Initialize the environment.""" + + @abstractmethod + def _init_model(self) -> None: + """Initialize the model.""" + + @abstractmethod + def _init_log(self) -> None: + """Initialize the logger.""" + + @abstractmethod + def learn(self) -> None: + """Learn the policy.""" diff --git a/omnisafe/algorithms/model_based/cap.py b/omnisafe/algorithms/model_based/cap.py deleted file mode 100644 index 195466ecb..000000000 --- a/omnisafe/algorithms/model_based/cap.py +++ /dev/null @@ -1,140 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. 
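base_algo.py pins down the construction order every algorithm now follows: seed the RNGs, pick the device, set up distributed training, then _init_env, _init_model, _init and _init_log, with learn() left to the subclass. A hypothetical subclass (not part of this patch) only fills in those hooks:

    from omnisafe.algorithms.base_algo import BaseAlgo


    class MyAlgo(BaseAlgo):
        """Hypothetical example showing which hooks a concrete algorithm implements."""

        def _init_env(self) -> None:
            ...  # e.g. build an OnPolicyAdapter from self._env_id and self._cfgs

        def _init_model(self) -> None:
            ...  # e.g. build a ConstraintActorCritic on self._device

        def _init(self) -> None:
            ...  # buffers, Lagrange multiplier, optimizers

        def _init_log(self) -> None:
            ...  # create the Logger and register keys

        def learn(self) -> None:
            ...  # roll out with the adapter, update the models, log metrics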
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Implementation of the CAP algorithm. The CAP in safety-gym may unable to converge.""" - -import numpy as np - -from omnisafe.algorithms import registry -from omnisafe.algorithms.model_based.planner import CCEPlanner -from omnisafe.algorithms.model_based.policy_gradient import PolicyGradientModelBased -from omnisafe.common.lagrange import Lagrange - - -@registry.register -class CAP( - PolicyGradientModelBased, CCEPlanner, Lagrange -): # pylint: disable=too-many-instance-attributes - """The Conservative and Adaptive Penalty (CAP) algorithm. - - References: - Title: Conservative and Adaptive Penalty for Model-Based Safe Reinforcement Learning - Authors: Yecheng Jason Ma, Andrew Shen, Osbert Bastani, Dinesh Jayaraman. - URL: https://arxiv.org/abs/2112.07701 - """ - - def __init__(self, env_id, cfgs) -> None: - PolicyGradientModelBased.__init__( - self, - env_id=env_id, - cfgs=cfgs, - ) - Lagrange.__init__(self, **self.cfgs.lagrange_cfgs, device=self.cfgs.device) - CCEPlanner.__init__( - self, - algo=self.algo, - cfgs=self.cfgs, - device=self.device, - env=self.env, - models=self.virtual_env, - **self.cfgs.mpc_config, - lagrangian_multiplier=self.lagrangian_multiplier, - ) - # Set up model saving - what_to_save = { - 'dynamics': self.dynamics, - } - self.logger.setup_torch_saver(what_to_save=what_to_save) - self.logger.torch_save() - - def _specific_init_logs(self): - self.logger.register_key('Loss/DynamicsTrainMseLoss') - self.logger.register_key('Loss/DynamicsValMseLoss') - self.logger.register_key('Penalty') - - def algorithm_specific_logs(self, time_step): - """Log algo parameter""" - super().algorithm_specific_logs(time_step) - self.logger.store( - **{'Penalty': self.lambda_range_projection(self.lagrangian_multiplier).item()} - ) - - def update_dynamics_model(self): - """Update dynamics.""" - state = self.off_replay_buffer.data['obs'][: self.off_replay_buffer.size, :] - action = self.off_replay_buffer.data['act'][: self.off_replay_buffer.size, :] - reward = self.off_replay_buffer.data['reward'][: self.off_replay_buffer.size] - cost = self.off_replay_buffer.data['cost'][: self.off_replay_buffer.size] - next_state = self.off_replay_buffer.data['next_obs'][: self.off_replay_buffer.size, :] - delta_state = next_state - state - inputs = np.concatenate((state, action), axis=-1) - if self.env.env_type == 'mujoco-velocity': - labels = np.concatenate( - ( - np.reshape(reward, (reward.shape[0], -1)), - np.reshape(cost, (cost.shape[0], -1)), - delta_state, - ), - axis=-1, - ) - elif self.env.env_type == 'gym': - labels = np.concatenate( - (np.reshape(reward, (reward.shape[0], -1)), delta_state), axis=-1 - ) - train_mse_losses, val_mse_losses = self.dynamics.train( - inputs, labels, batch_size=256, holdout_ratio=0.2 - ) - - ep_costs = self.logger.get_stats('Metrics/EpCost')[0] - # update Lagrange multiplier parameter - self.update_lagrange_multiplier(ep_costs) - - 
self.logger.store( - **{ - 'Loss/DynamicsTrainMseLoss': train_mse_losses, - 'Loss/DynamicsValMseLoss': val_mse_losses, - } - ) - - def select_action(self, time_step, state, env): - """action selection""" - action = self.get_action(np.array(state)) - return action, None - - def store_real_data( - self, - time_step, - ep_len, - state, - action_info, - action, - reward, - cost, - terminated, - truncated, - next_state, - info, - ): # pylint: disable=too-many-arguments - """store real data""" - if not terminated and not truncated and not info['goal_met']: - # Current goal position is not related to the last goal position, so do not store. - self.off_replay_buffer.store( - obs=state, act=action, reward=reward, cost=cost, next_obs=next_state, done=truncated - ) - - def algo_reset(self): - """reset planner""" - - def set_algorithm_specific_actor_critic(self): - """Initialize Soft Actor-Critic""" diff --git a/omnisafe/algorithms/model_based/mbppo_lag.py b/omnisafe/algorithms/model_based/mbppo_lag.py deleted file mode 100644 index 1f3e04d37..000000000 --- a/omnisafe/algorithms/model_based/mbppo_lag.py +++ /dev/null @@ -1,445 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Implementation of the Model-based PPO-Lag algorithm.""" - -import numpy as np -import torch - -from omnisafe.algorithms import registry -from omnisafe.algorithms.model_based.policy_gradient import PolicyGradientModelBased -from omnisafe.common.buffer import OnPolicyBuffer -from omnisafe.common.lagrange import Lagrange -from omnisafe.models.constraint_actor_critic import ConstraintActorCritic -from omnisafe.utils import core -from omnisafe.wrappers import wrapper_registry - - -@registry.register -# pylint: disable-next=too-many-instance-attributes -class MBPPOLag(PolicyGradientModelBased, Lagrange): - """The Model-based PPO-Lag algorithm. - - References: - Title: Model-based Safe Deep Reinforcement Learning via a Constrained Proximal Policy Optimization Algorithm - Authors: Ashish Kumar Jayant, Shalabh Bhatnagar. 
- URL: https://arxiv.org/abs/2210.07573 - """ - - def __init__(self, env_id, cfgs) -> None: - PolicyGradientModelBased.__init__( - self, - env_id=env_id, - cfgs=cfgs, - ) - Lagrange.__init__(self, **self.cfgs.lagrange_cfgs, device=self.cfgs.device) - self.clip = self.cfgs.clip - self.loss_pi_before = 0.0 - self.loss_v_before = 0.0 - self.loss_c_before = 0.0 - self.env_auxiliary = wrapper_registry.get(self.wrapper_type)(self.algo, self.env_id) - # Initialize Actor-Critic - self.actor_critic = self.set_algorithm_specific_actor_critic() - self.buf = OnPolicyBuffer( - obs_space=self.env.observation_space, - act_space=self.env.action_space, - size=self.cfgs.imaging_steps_per_policy_update, - gamma=self.cfgs.buffer_cfgs.gamma, - lam=self.cfgs.buffer_cfgs.lam, - lam_c=self.cfgs.buffer_cfgs.lam_c, - advantage_estimator=self.cfgs.buffer_cfgs.advantage_estimator, - penalty_coefficient=0, - standardized_adv_r=self.cfgs.buffer_cfgs.standardized_reward, - standardized_adv_c=self.cfgs.buffer_cfgs.standardized_cost, - device=self.device, - ) - # Set up model saving - what_to_save = { - 'pi': self.actor_critic.actor, - 'dynamics': self.dynamics, - } - self.logger.setup_torch_saver(what_to_save=what_to_save) - self.logger.torch_save() - - def _specific_init_logs(self): - self.logger.register_key('DynaMetrics/EpRet') - self.logger.register_key('DynaMetrics/EpLen') - self.logger.register_key('DynaMetrics/EpCost') - self.logger.register_key('Loss/DynamicsTrainMseLoss') - self.logger.register_key('Loss/DynamicsValMseLoss') - self.logger.register_key('Loss/Pi') - self.logger.register_key('Loss/Value') - self.logger.register_key('Loss/DeltaPi') - self.logger.register_key('Loss/DeltaValue') - self.logger.register_key('Loss/CValue') - self.logger.register_key('Loss/DeltaCValue') - self.logger.register_key('Penalty') - self.logger.register_key('Values/Adv') - self.logger.register_key('Values/Adv_C') - self.logger.register_key('Megaiter') - self.logger.register_key('Entropy') - self.logger.register_key('KL') - self.logger.register_key('Misc/StopIter') - self.logger.register_key('PolicyRatio') - - def algorithm_specific_logs(self, time_step): - """log algo parameter""" - super().algorithm_specific_logs(time_step) - self.logger.store( - **{'Penalty': self.lambda_range_projection(self.lagrangian_multiplier).item()} - ) - - def update_actor_critic(self, time_step): # pylint: disable=unused-argument - """update actor critic""" - megaiter = 0 - last_valid_rets = np.zeros(self.cfgs.dynamics_cfgs.elite_size) - while True: - self.roll_out_in_imaginary(megaiter) - # validation - if megaiter > 0: - old_actor = self.get_param_values(self.actor_critic.actor) - old_reward_critic = self.get_param_values(self.actor_critic.reward_critic) - old_cost_critic = self.get_param_values(self.actor_critic.cost_critic) - self.update() - result, valid_rets = self.validation(last_valid_rets) - if result is True: - # backtrack - self.set_param_values(old_actor, self.actor_critic.actor) - self.set_param_values(old_reward_critic, self.actor_critic.reward_critic) - self.set_param_values(old_cost_critic, self.actor_critic.cost_critic) - megaiter += 1 - break - megaiter += 1 - last_valid_rets = valid_rets - else: - megaiter += 1 - self.update() - - self.logger.store(Megaiter=megaiter) - - def update(self): - """Get data from buffer and update Lagrange multiplier, actor, critic""" - data = self.buf.get() - # Note that logger already uses MPI statistics across all processes.. 
- ep_costs = self.logger.get_stats('DynaMetrics/EpCost')[0] - # First update Lagrange multiplier parameter - self.update_lagrange_multiplier(ep_costs) - # now update policy and value network - self.update_policy_net(data=data) - self.update_value_net(data=data) - - def compute_loss_v(self, data): - """compute the loss of value function""" - obs, ret, cret = data['obs'], data['target_v'], data['target_c'] - return ((self.actor_critic.reward_critic(obs) - ret) ** 2).mean(), ( - (self.actor_critic.cost_critic(obs) - cret) ** 2 - ).mean() - - def compute_loss_pi(self, data): - """compute the loss of policy""" - dist, _log_p = self.actor_critic.actor(data['obs'], data['act']) - ratio = torch.exp(_log_p - data['log_p']) - ratio_clip = torch.clamp(ratio, 1 - self.clip, 1 + self.clip) - loss_pi = -(torch.min(ratio * data['adv'], ratio_clip * data['adv'])).mean() - - # ensure that Lagrange multiplier is positive - penalty = self.lambda_range_projection(self.lagrangian_multiplier).item() - loss_pi += penalty * ((ratio * data['cost_adv']).mean()) - loss_pi /= 1 + penalty - - # Useful extra info - approx_kl = (data['log_p'] - _log_p).mean().item() - ent = dist.entropy().mean().item() - clipped = ratio.gt(1 + self.clip) | ratio.lt(1 - self.clip) - clipfrac = torch.as_tensor(clipped, device=self.device, dtype=torch.float32).mean().item() - pi_info = {'kl': approx_kl, 'ent': ent, 'cf': clipfrac} - return loss_pi, pi_info - - def update_dynamics_model(self): - """compute the loss of dynamics""" - state = self.off_replay_buffer.data['obs'][: self.off_replay_buffer.size, :] - action = self.off_replay_buffer.data['act'][: self.off_replay_buffer.size, :] - reward = self.off_replay_buffer.data['reward'][: self.off_replay_buffer.size] - cost = self.off_replay_buffer.data['cost'][: self.off_replay_buffer.size] - next_state = self.off_replay_buffer.data['next_obs'][: self.off_replay_buffer.size, :] - delta_state = next_state - state - inputs = np.concatenate((state, action), axis=-1) - if self.env.env_type == 'mujoco-velocity': - labels = np.concatenate( - ( - np.reshape(reward, (reward.shape[0], -1)), - np.reshape(cost, (cost.shape[0], -1)), - delta_state, - ), - axis=-1, - ) - elif self.env.env_type == 'gym': - labels = delta_state - train_mse_losses, val_mse_losses = self.dynamics.train( - inputs, labels, batch_size=256, holdout_ratio=0.2 - ) - self.logger.store( - **{ - 'Loss/DynamicsTrainMseLoss': train_mse_losses, - 'Loss/DynamicsValMseLoss': val_mse_losses, - } - ) - - def update_policy_net(self, data): - """update policy""" - # Get prob. 
distribution before updates: used to measure KL distance - pi_l_old, pi_info_old = self.compute_loss_pi(data) - self.loss_pi_before = pi_l_old.item() - # Train policy with multiple steps of gradient descent - for i in range(self.cfgs.pi_iters): - loss_pi, pi_info = self.compute_loss_pi(data) - kl_div = pi_info['kl'] - if self.cfgs.kl_early_stopping: - if kl_div > self.cfgs.target_kl: - self.logger.log(f'Reached ES criterion after {i+1} steps.') - break - self.actor_optimizer.zero_grad() - loss_pi.backward() - self.actor_optimizer.step() - self.logger.store( - **{ - 'Loss/Pi': self.loss_pi_before, - 'Loss/DeltaPi': loss_pi.item() - self.loss_pi_before, - 'Misc/StopIter': i + 1, - 'Values/Adv': data['adv'].cpu().numpy(), - 'Values/Adv_C': data['cost_adv'].cpu().numpy(), - 'Entropy': pi_info_old['ent'], - 'KL': pi_info['kl'], - 'PolicyRatio': pi_info['cf'], - } - ) - - def update_value_net(self, data): - """Value function learning""" - v_l_old, cv_l_old = self.compute_loss_v(data) - self.loss_v_before, self.loss_c_before = v_l_old.item(), cv_l_old.item() - - for _ in range(self.cfgs.critic_iters): - loss_v, loss_vc = self.compute_loss_v(data) - self.reward_critic_optimizer.zero_grad() - loss_v.backward() - self.reward_critic_optimizer.step() - - self.cost_critic_optimizer.zero_grad() - loss_vc.backward() - self.cost_critic_optimizer.step() - - self.logger.store( - **{ - 'Loss/DeltaValue': loss_v.item() - self.loss_v_before, - 'Loss/Value': self.loss_v_before, - 'Loss/DeltaCValue': loss_vc.item() - self.loss_c_before, - 'Loss/CValue': self.loss_c_before, - } - ) - - def get_param_values(self, model): - """get the dynamics parameters""" - trainable_params = list(model.parameters()) - params = np.concatenate( - [p.contiguous().view(-1).data.cpu().numpy() for p in trainable_params] - ) - return params.copy() - - def set_param_values(self, new_params, model, set_new=True): - """set the dynamics parameters""" - trainable_params = list(model.parameters()) - param_shapes = [p.data.cpu().numpy().shape for p in trainable_params] - param_sizes = [p.data.cpu().numpy().size for p in trainable_params] - if set_new: - current_idx = 0 - for idx, param in enumerate(trainable_params): - vals = new_params[current_idx : current_idx + param_sizes[idx]] - vals = vals.reshape(param_shapes[idx]) - param.data = torch.from_numpy(vals).float().to(self.device) - current_idx += param_sizes[idx] - - def roll_out_in_imaginary(self, megaiter): # pylint: disable=too-many-locals - """collect data and store to experience buffer.""" - state = self.env_auxiliary.reset() - dep_ret, dep_cost, dep_len = 0, 0, 0 - mix_real = self.cfgs.mixed_real_time_steps if megaiter == 0 else 0 - - for time_step in range(self.cfgs.imaging_steps_per_policy_update - mix_real): - action, action_info = self.select_action(time_step, state, self.env_auxiliary) - next_state, reward, cost, info = self.virtual_step(state, action) - - dep_ret += reward - dep_cost += (self.cost_gamma**dep_len) * cost - dep_len += 1 - - self.buf.store( - obs=action_info['state_vec'], - act=action, - rew=reward, - val=action_info['val'], - logp=action_info['logp'], - cost=cost, - cost_val=action_info['cval'], - ) - state = next_state - - timeout = dep_len == self.cfgs.horizon - truncated = timeout - epoch_ended = time_step == self.cfgs.imaging_steps_per_policy_update - 1 - if truncated or epoch_ended or info['goal_flag']: - if timeout or epoch_ended or info['goal_flag']: - state_tensor = torch.as_tensor( - action_info['state_vec'], device=self.device, dtype=torch.float32 - ) - 
_, val, cval, _ = self.actor_critic.step(state_tensor) - del state_tensor - else: - # this means episode is terminated, - # and this will be triggered only in robots fall down case - val = 0 - cval = 0 - self.buf.finish_path(val, cval) - if timeout: - # only save EpRet / EpLen if trajectory finished - self.logger.store( - **{ - 'DynaMetrics/EpRet': dep_ret, - 'DynaMetrics/EpLen': dep_len, - 'DynaMetrics/EpCost': dep_cost, - } - ) - state = self.env_auxiliary.reset() - dep_ret, dep_len, dep_cost = 0, 0, 0 - - def validation(self, last_valid_rets): - """policy validation""" - valid_rets = np.zeros(self.cfgs.validation_num) - winner = 0 - for valid_id in range(len(valid_rets)): # pylint:disable=consider-using-enumerate - state = self.env_auxiliary.reset() - for step in range(self.cfgs.validation_horizon): - action, _ = self.select_action(step, state, self.env_auxiliary) - next_state, reward, _, info = self.virtual_step(state, action, idx=valid_id) - valid_rets[valid_id] += reward - state = next_state - if info['goal_flag']: - state = self.env_auxiliary.reset() - if valid_rets[valid_id] > last_valid_rets[valid_id]: - winner += 1 - performance_ratio = winner / self.cfgs.validation_num - threshold = self.cfgs.validation_threshold_num / self.cfgs.validation_num - result = performance_ratio < threshold - return result, valid_rets - - # pylint: disable-next=too-many-arguments - def store_real_data( - self, - time_step, - ep_len, - state, - action_info, - action, - reward, - cost, - terminated, - truncated, - next_state, - info, - ): - """store real data""" - if not terminated and not truncated and not info['goal_met']: - self.off_replay_buffer.store( - obs=state, act=action, reward=reward, cost=cost, next_obs=next_state, done=truncated - ) - if ( - time_step % self.cfgs.update_policy_freq <= self.cfgs.mixed_real_time_steps - and self.buf.ptr < self.cfgs.mixed_real_time_steps - ): - self.buf.store( - obs=action_info['state_vec'], - act=action, - rew=reward, - val=action_info['val'], - logp=action_info['logp'], - cost=cost, - cost_val=action_info['cval'], - ) - if terminated: - # this means episode is terminated, - # which will be triggered only in robots fall down case - val = 0 - cval = 0 - self.buf.finish_path(val, cval) - - # reached max imaging horizon, mixed real timestep, real max timestep , or episode truncated. 
- elif ( - time_step % self.cfgs.horizon < self.cfgs.action_repeat - or self.buf.ptr == self.cfgs.mixed_real_time_steps - or time_step >= self.cfgs.max_real_time_steps - or truncated - ): - state_tensor = torch.as_tensor( - action_info['state_vec'], device=self.device, dtype=torch.float32 - ) - _, val, cval, _ = self.actor_critic.step(state_tensor) - del state_tensor - self.buf.finish_path(val, cval) - - def algo_reset(self): - """reset algo parameters""" - - def virtual_step(self, state, action, idx=None): - """use virtual environment to predict next state, reward, cost""" - if self.env.env_type == 'gym': - next_state, _, _, _ = self.virtual_env.mbppo_step(state, action, idx) - next_state = np.nan_to_num(next_state) - next_state = np.clip(next_state, -self.cfgs.obs_clip, self.cfgs.obs_clip) - reward, cost, goal_flag = self.env_auxiliary.get_reward_cost(next_state) - info = {'goal_flag': goal_flag} - elif self.env.env_type == 'mujoco-velocity': - next_state, reward, cost, _ = self.virtual_env.mbppo_step(state, action, idx) - next_state = np.nan_to_num(next_state) - reward = np.nan_to_num(reward) - cost = np.nan_to_num(cost) - next_state = np.clip(next_state, -self.cfgs.obs_clip, self.cfgs.obs_clip) - info = {'goal_flag': False} - return next_state, reward, cost, info - - def set_algorithm_specific_actor_critic(self): - """ - Use this method to initialize network. - e.g. Initialize Soft Actor Critic - - Returns: - Actor_critic - """ - self.actor_critic = ConstraintActorCritic( - observation_space=self.env.observation_space, - action_space=self.env.action_space, - model_cfgs=self.cfgs.model_cfgs, - ).to(self.device) - # Set up optimizer for policy and value function - - self.actor_optimizer = core.set_optimizer( - 'Adam', module=self.actor_critic.actor, learning_rate=self.cfgs.actor_lr - ) - self.reward_critic_optimizer = core.set_optimizer( - 'Adam', module=self.actor_critic.reward_critic, learning_rate=self.cfgs.critic_lr - ) - self.cost_critic_optimizer = core.set_optimizer( - 'Adam', module=self.actor_critic.cost_critic, learning_rate=self.cfgs.critic_lr - ) - - return self.actor_critic diff --git a/omnisafe/algorithms/model_based/models/dynamic_model.py b/omnisafe/algorithms/model_based/models/dynamic_model.py deleted file mode 100644 index 37ae6d9ef..000000000 --- a/omnisafe/algorithms/model_based/models/dynamic_model.py +++ /dev/null @@ -1,405 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -# Modified version of model.py from https://github.com/Xingyu-Lin/mbpo_pytorch/blob/main/model.py -# original version doesn't validate model error batch-wise and is highly memory intensive. 
-# ============================================================================== -"""Dynamics Model""" - -import itertools - -import numpy as np -import torch -import torch.nn as nn -import torch.nn.functional as F - - -def swish(data): - """Transform data using sigmoid function.""" - return data * torch.sigmoid(data) - - -class StandardScaler: - """Normalize data""" - - def __init__(self, device=torch.device('cpu')): - self.mean = 0.0 - self.std = 1.0 - self.mean_t = torch.tensor(self.mean).to(device) - self.std_t = torch.tensor(self.std).to(device) - self.device = device - - def fit(self, data): - """Runs two ops, one for assigning the mean of the data to the internal mean, and - another for assigning the standard deviation of the data to the internal standard deviation. - This function must be called within a 'with .as_default()' block. - - Arguments: - data (np.ndarray): A numpy array containing the input - - Returns: None. - """ - self.mean = np.mean(data, axis=0, keepdims=True) - self.std = np.std(data, axis=0, keepdims=True) - self.std[self.std < 1e-12] = 1.0 - self.mean_t = torch.FloatTensor(self.mean).to(self.device) - self.std_t = torch.FloatTensor(self.std).to(self.device) - - def transform(self, data): - """Transforms the input matrix data using the parameters of this scaler. - - Arguments: - data (np.array): A numpy array containing the points to be transformed. - - Returns: (np.array) The transformed dataset. - """ - if torch.is_tensor(data): - return (data - self.mean_t) / self.std_t - return (data - self.mean) / self.std - - -def init_weights(layer): - """Initialize network weight""" - - def truncated_normal_init(weight, mean=0.0, std=0.01): - """Initialize network weight""" - torch.nn.init.normal_(weight, mean=mean, std=std) - while True: - cond = torch.logical_or(weight < mean - 2 * std, weight > mean + 2 * std) - if not torch.sum(cond): - break - weight = torch.where( - cond, torch.nn.init.normal_(torch.ones(weight.shape), mean=mean, std=std), weight - ) - return weight - - if isinstance(layer, (nn.Linear, EnsembleFC)): - input_dim = layer.in_features - truncated_normal_init(layer.weight, std=1 / (2 * np.sqrt(input_dim))) - layer.bias.data.fill_(0.0) - - -class EnsembleFC(nn.Module): - """Ensemble fully connected network""" - - __constants__ = ['in_features', 'out_features'] - in_features: int - out_features: int - ensemble_size: int - weight: torch.Tensor - - # pylint: disable-next=too-many-arguments - def __init__( - self, - in_features: int, - out_features: int, - ensemble_size: int, - weight_decay: float = 0.0, - bias: bool = True, - ) -> None: - super().__init__() - self.in_features = in_features - self.out_features = out_features - self.ensemble_size = ensemble_size - self.weight = nn.Parameter(torch.Tensor(ensemble_size, in_features, out_features)) - self.weight_decay = weight_decay - if bias: - self.bias = nn.Parameter(torch.Tensor(ensemble_size, out_features)) - else: - self.register_parameter('bias', None) - self.reset_parameters() - - def reset_parameters(self) -> None: - """reset parameters""" - - def forward(self, input_data: torch.Tensor) -> torch.Tensor: - """forward""" - w_times_x = torch.bmm(input_data, self.weight) - return torch.add(w_times_x, self.bias[:, None, :]) # w times x + b - - -# pylint: disable-next=too-many-instance-attributes -class EnsembleModel(nn.Module): - """Ensemble dynamics model""" - - # pylint: disable-next=too-many-arguments - def __init__( - self, - algo, - env_type, - state_size, - action_size, - reward_size, - cost_size, - 
ensemble_size, - hidden_size=200, - learning_rate=1e-3, - use_decay=False, - ): - super().__init__() - self.algo = algo - self.env_type = env_type - - self.state_size = state_size - self.reward_size = reward_size - self.cost_size = cost_size - if self.algo == 'MBPPOLag' and self.env_type == 'gym': - self.output_dim = state_size - elif self.algo == 'SafeLOOP' and self.env_type == 'gym': - self.output_dim = state_size + reward_size - elif self.algo == 'CAP' and self.env_type == 'gym': - self.output_dim = state_size + reward_size - elif self.env_type == 'mujoco-velocity': - self.output_dim = state_size + reward_size + cost_size - self.hidden_size = hidden_size - self.use_decay = use_decay - - self.nn1 = EnsembleFC( - state_size + action_size, hidden_size, ensemble_size, weight_decay=0.000025 - ) - self.nn2 = EnsembleFC(hidden_size, hidden_size, ensemble_size, weight_decay=0.00005) - self.nn3 = EnsembleFC(hidden_size, hidden_size, ensemble_size, weight_decay=0.000075) - self.nn4 = EnsembleFC(hidden_size, hidden_size, ensemble_size, weight_decay=0.000075) - self.nn5 = EnsembleFC(hidden_size, self.output_dim * 2, ensemble_size, weight_decay=0.0001) - - self.register_buffer('max_logvar', (torch.ones((1, self.output_dim)).float() / 2)) - self.register_buffer('min_logvar', (-torch.ones((1, self.output_dim)).float() * 10)) - self.optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate) - self.apply(init_weights) - - # pylint: disable-next=too-many-locals - def forward(self, data, ret_log_var=False): - """Compute next state, reward, cost""" - nn1_output = swish(self.nn1(data)) - nn2_output = swish(self.nn2(nn1_output)) - nn3_output = swish(self.nn3(nn2_output)) - nn4_output = swish(self.nn4(nn3_output)) - nn5_output = self.nn5(nn4_output) - mean = nn5_output[:, :, : self.output_dim] - logvar = self.max_logvar - F.softplus(self.max_logvar - nn5_output[:, :, self.output_dim :]) - logvar = self.min_logvar + F.softplus(logvar - self.min_logvar) - var = torch.exp(logvar) - if ret_log_var: - return mean, logvar - return mean, var - - def get_decay_loss(self): - """Get decay loss""" - decay_loss = 0.0 - for layer in self.children(): - if isinstance(layer, EnsembleFC): - decay_loss += layer.weight_decay * torch.sum(torch.square(layer.weight)) / 2.0 - return decay_loss - - def loss(self, mean, logvar, labels, inc_var_loss=True): - """ - mean, logvar: Ensemble_size x N x dim - labels: N x dim - """ - assert len(mean.shape) == len(logvar.shape) == len(labels.shape) == 3 - inv_var = torch.exp(-logvar) - if inc_var_loss: - # Average over batch and dim, sum over ensembles. 
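# Per ensemble member i, the loss below is the diagonal-Gaussian negative log-likelihood
# up to constants: the mean over batch and output dim of (mu_i - y)^2 * exp(-logvar_i),
# plus the mean of logvar_i; the per-member losses are then summed, so every member is
# trained independently on the same targets.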
- mse_loss = torch.mean(torch.mean(torch.pow(mean - labels, 2) * inv_var, dim=-1), dim=-1) - var_loss = torch.mean(torch.mean(logvar, dim=-1), dim=-1) - total_loss = torch.sum(mse_loss) + torch.sum(var_loss) - else: - mse_loss = torch.mean(torch.pow(mean - labels, 2), dim=(1, 2)) - total_loss = torch.sum(mse_loss) - return total_loss, mse_loss - - def train_ensemble(self, loss): - """Train the dynamics model""" - self.optimizer.zero_grad() - loss += 0.01 * torch.sum(self.max_logvar) - 0.01 * torch.sum(self.min_logvar) - if self.use_decay: - loss += self.get_decay_loss() - loss.backward() - self.optimizer.step() - - -# pylint: disable-next=too-many-instance-attributes -class EnsembleDynamicsModel: - """Dynamics model for predict next state, reward and cost""" - - # pylint: disable-next=too-many-arguments - def __init__( - self, - algo, - env_type, - device, - network_size, - elite_size, - hidden_size, - use_decay, - state_size, - action_size, - reward_size, - cost_size, - ): - self.algo = algo - self.network_size = network_size - self.elite_size = elite_size - self.model_list = [] - self.state_size = state_size - self.action_size = action_size - self.reward_size = reward_size - self.cost_size = cost_size - self.network_size = network_size - self.device = device - if self.algo == 'MBPPOLag': - self.elite_model_idxes = [] - elif self.algo in ['SafeLOOP', 'CAP']: - self.elite_model_idxes = list(range(self.elite_size)) - self.env_type = env_type - self.ensemble_model = EnsembleModel( - algo, - env_type, - state_size, - action_size, - reward_size, - cost_size, - network_size, - hidden_size, - use_decay=use_decay, - ) - self.ensemble_model.to(self.device) - self.scaler = StandardScaler(self.device) - self._max_epochs_since_update = 5 - self._epochs_since_update = 0 - self._state = {} - self._snapshots = {i: (None, 1e10) for i in range(self.network_size)} - - # pylint: disable-next=too-many-locals, too-many-arguments - def train(self, inputs, labels, batch_size=256, holdout_ratio=0.0, max_epochs_since_update=5): - """train dynamics, holdout_ratio is the data ratio hold out for validation""" - self._max_epochs_since_update = max_epochs_since_update - self._epochs_since_update = 0 - self._state = {} - self._snapshots = {i: (None, 1e10) for i in range(self.network_size)} - - num_holdout = int(inputs.shape[0] * holdout_ratio) - permutation = np.random.permutation(inputs.shape[0]) - inputs, labels = inputs[permutation], labels[permutation] - - # split training and testing dataset - train_inputs, train_labels = inputs[num_holdout:], labels[num_holdout:] - holdout_inputs, holdout_labels = inputs[:num_holdout], labels[:num_holdout] - self.scaler.fit(train_inputs) - train_inputs = self.scaler.transform(train_inputs) - holdout_inputs = self.scaler.transform(holdout_inputs) - - for epoch in itertools.count(): - train_mse_losses = [] - # training - train_idx = np.vstack( - [np.random.permutation(train_inputs.shape[0]) for _ in range(self.network_size)] - ) - # shape: [train_inputs.shape[0],network_size] - - for start_pos in range(0, train_inputs.shape[0], batch_size): - idx = train_idx[:, start_pos : start_pos + batch_size] - train_input = torch.from_numpy(train_inputs[idx]).float().to(self.device) - train_label = torch.from_numpy(train_labels[idx]).float().to(self.device) - mean, logvar = self.ensemble_model(train_input, ret_log_var=True) - total_loss, mse_loss = self.ensemble_model.loss(mean, logvar, train_label) - self.ensemble_model.train_ensemble(total_loss) - 
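# Note: train_ensemble() also adds the soft penalty that keeps max_logvar/min_logvar
# tight (and optional weight decay) before stepping the shared Adam optimizer.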
train_mse_losses.append(mse_loss.detach().cpu().numpy().mean()) - - # validation - val_idx = np.vstack( - [np.random.permutation(holdout_inputs.shape[0]) for _ in range(self.network_size)] - ) - val_batch_size = 512 - val_losses_list = [] - len_valid = 0 - for start_pos in range(0, holdout_inputs.shape[0], val_batch_size): - with torch.no_grad(): - idx = val_idx[:, start_pos : start_pos + val_batch_size] - val_input = torch.from_numpy(holdout_inputs[idx]).float().to(self.device) - val_label = torch.from_numpy(holdout_labels[idx]).float().to(self.device) - holdout_mean, holdout_logvar = self.ensemble_model(val_input, ret_log_var=True) - _, holdout_mse_losses = self.ensemble_model.loss( - holdout_mean, holdout_logvar, val_label, inc_var_loss=False - ) - holdout_mse_losses = holdout_mse_losses.detach().cpu().numpy() - val_losses_list.append(holdout_mse_losses) - len_valid += 1 - val_losses = np.array(val_losses_list) - val_losses = np.sum(val_losses, axis=0) / len_valid - sorted_loss_idx = np.argsort(val_losses) - self.elite_model_idxes = sorted_loss_idx[: self.elite_size].tolist() - break_train = self._save_best(epoch, val_losses) - if break_train: - break - - train_mse_losses = np.array(train_mse_losses).mean() - val_mse_losses = val_losses - return train_mse_losses, val_mse_losses - - def _save_best(self, epoch, holdout_losses): - updated = False - for i, current_loss in enumerate(holdout_losses): - _, best = self._snapshots[i] - improvement = (best - current_loss) / best - if improvement > 0.01: - self._snapshots[i] = (epoch, current_loss) - updated = True - - if updated: - self._epochs_since_update = 0 - else: - self._epochs_since_update += 1 - return self._epochs_since_update > self._max_epochs_since_update - - def predict_t(self, inputs, batch_size=1024, repeat_network=False): - """Input type and output type both are tensor, used for planning loop""" - inputs = self.scaler.transform(inputs) - # input shape: [networ_size, (num_gaus+num_actor)*paritcle ,state_dim + action_dim] - ensemble_mean, ensemble_var = [], [] - for i in range(0, inputs.shape[0], batch_size): - model_input = inputs[i : min(i + batch_size, inputs.shape[0])].float().to(self.device) - # input shape: [networ_size, (num_gaus+num_actor)*paritcle ,state_dim + action_dim] - if repeat_network: - b_mean, b_var = self.ensemble_model( - model_input[None, :, :].repeat([self.network_size, 1, 1]), ret_log_var=False - ) - else: - b_mean, b_var = self.ensemble_model(model_input, ret_log_var=False) - - ensemble_mean.append(b_mean) - ensemble_var.append(b_var) - ensemble_mean = torch.cat(ensemble_mean, dim=1) - ensemble_var = torch.cat(ensemble_var, dim=1) - - return ensemble_mean, ensemble_var - - def predict(self, inputs, batch_size=1024): - """Input type and output type both are numpy""" - inputs = self.scaler.transform(inputs) - ensemble_mean, ensemble_var = [], [] - for i in range(0, inputs.shape[0], batch_size): - model_input = ( - torch.from_numpy(inputs[i : min(i + batch_size, inputs.shape[0])]) - .float() - .to(self.device) - ) - b_mean, b_var = self.ensemble_model( - model_input[None, :, :].repeat([self.network_size, 1, 1]), ret_log_var=False - ) - ensemble_mean.append(b_mean.detach().cpu().numpy()) - ensemble_var.append(b_var.detach().cpu().numpy()) - ensemble_mean = np.hstack(ensemble_mean) - ensemble_var = np.hstack(ensemble_var) - return ensemble_mean, ensemble_var diff --git a/omnisafe/algorithms/model_based/models/virtual_env.py b/omnisafe/algorithms/model_based/models/virtual_env.py deleted file mode 100644 index 
8c1b71b7d..000000000 --- a/omnisafe/algorithms/model_based/models/virtual_env.py +++ /dev/null @@ -1,249 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Virtual Environment""" - -import numpy as np -import torch - - -class VirtualEnv: - """Virtual environment for generating data or planning""" - - def __init__(self, algo, model, env_name, device=torch.device('cpu')): - self.algo = algo - self.model = model - self.env_name = env_name - self.device = device - if self.model.env_type == 'gym' and self.algo in ['MBPPOLag']: - self.state_start_dim = 0 - elif self.model.env_type == 'gym' and self.algo in ['CAP', 'SafeLOOP']: - self.state_start_dim = 1 - elif self.model.env_type == 'mujoco-velocity' and self.algo in [ - 'MBPPOLag', - 'CAP', - 'SafeLOOP', - ]: - self.state_start_dim = 2 - - def _termination_fn(self, env_name, obs, act, next_obs): - """Terminal function""" - if env_name == 'Hopper-v2': - assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 - - height = next_obs[:, 0] - angle = next_obs[:, 1] - not_done = ( - np.isfinite(next_obs).all(axis=-1) - * np.abs(next_obs[:, 1:] < 100).all(axis=-1) - * (height > 0.7) - * (np.abs(angle) < 0.2) - ) - - done = ~not_done - done = done[:, None] - return done - if env_name == 'Walker2d-v2': - assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 - - height = next_obs[:, 0] - angle = next_obs[:, 1] - not_done = (height > 0.8) * (height < 2.0) * (angle > -1.0) * (angle < 1.0) - done = ~not_done - done = done[:, None] - return done - if 'walker_' in env_name: - torso_height = next_obs[:, -2] - torso_ang = next_obs[:, -1] - if 'walker_7' in env_name or 'walker_5' in env_name: - offset = 0.0 - else: - offset = 0.26 - not_done = ( - (torso_height > 0.8 - offset) - * (torso_height < 2.0 - offset) - * (torso_ang > -1.0) - * (torso_ang < 1.0) - ) - done = ~not_done - done = done[:, None] - return done - - return False - - def _get_logprob(self, input_data, means, variances): - k = input_data.shape[-1] - log_prob = ( - -1 - / 2 - * ( - k * np.log(2 * np.pi) - + np.log(variances).sum(-1) - + (np.power(input_data - means, 2) / variances).sum(-1) - ) - ) - - # [ batch_size ] - prob = np.exp(log_prob).sum(0) - - # [ batch_size ] - log_prob = np.log(prob) - - stds = np.std(means, 0).mean(-1) - - return log_prob, stds - - # pylint: disable-next=too-many-locals - def mbppo_step(self, obs, act, idx=None, deterministic=False): - # pylint: disable-next=line-too-long - """use numpy input to predict single next state by randomly select one model result or select index model result.""" - if len(obs.shape) == 1: - obs = obs[None] - act = act[None] - return_single = True - else: - return_single = False - - if idx is None: - idx = self.model.elite_model_idxes - else: - idx = [idx] - inputs = np.concatenate((obs, act), axis=-1) - ensemble_model_means, ensemble_model_vars = self.model.predict(inputs) 
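# The ensemble predicts a delta over the current observation for the state dimensions
# (reward/cost outputs, if any, occupy the leading dimensions and stay absolute), so the
# current obs is added back to recover the absolute next state.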
- ensemble_model_means[:, :, self.state_start_dim :] += obs - - ensemble_model_stds = np.sqrt(ensemble_model_vars) - - if deterministic: - ensemble_samples = ensemble_model_means - else: - ensemble_samples = ( - ensemble_model_means - + np.random.normal(size=ensemble_model_means.shape) * ensemble_model_stds - ) - - _, batch_size, _ = ensemble_model_means.shape - model_idxes = np.random.choice(idx, size=batch_size) - batch_idxes = np.arange(0, batch_size) - - samples = ensemble_samples[model_idxes, batch_idxes] - if self.algo == 'MBPPOLag' and self.model.env_type == 'mujoco-velocity': - rewards, cost, next_obs = ( - samples[:, 0], - samples[:, 1], - samples[:, self.state_start_dim :], - ) - terminals = self._termination_fn(self.env_name, obs, act, next_obs) - elif self.algo == 'MBPPOLag' and self.model.env_type == 'gym': - next_obs = samples - rewards = None - cost = None - terminals = None - - if return_single: - next_obs = next_obs[0] - if self.model.env_type == 'mujoco-velocity': - rewards = rewards[0] - cost = cost[0] - - return next_obs, rewards, cost, terminals - - # pylint: disable-next=too-many-arguments,too-many-locals - def safeloop_step(self, obs, act, deterministic=False, all_model=False, repeat_network=False): - """Use tensor input to predict single next state by randomly select elite model result for online planning""" - if len(obs.shape) == 1: - obs = obs[None] - act = act[None] - - inputs = torch.cat((obs, act), dim=-1) - ensemble_model_means, ensemble_model_vars = self.model.predict_t( - inputs, repeat_network=repeat_network - ) - - ensemble_model_means[:, :, self.state_start_dim :] += obs - - ensemble_model_stds = torch.sqrt(ensemble_model_vars) - - if deterministic: - ensemble_samples = ensemble_model_means - else: - ensemble_samples = ( - ensemble_model_means - + torch.randn(size=ensemble_model_means.shape).to(self.device) * ensemble_model_stds - ) - - # use all dynamics model result - if all_model: - samples = ensemble_samples - samples_var = ensemble_model_vars - # only use elite model result - else: - _, batch_size, _ = ensemble_model_means.shape - model_idxes = np.random.choice(self.model.elite_model_idxes, size=batch_size) - batch_idxes = np.arange(0, batch_size) - samples = ensemble_samples[model_idxes, batch_idxes] - samples_var = ensemble_model_vars[model_idxes, batch_idxes] - - return samples, samples_var - - # pylint: disable-next=too-many-arguments, too-many-locals - def cap_step(self, obs, act, deterministic=False, all_model=True, repeat_network=False): - """Use tensor input to predict single next state by randomly select elite model result for online planning""" - if len(obs.shape) == 1: - obs = obs[None] - act = act[None] - - inputs = torch.cat((obs, act), dim=-1) - ensemble_model_means, ensemble_model_vars = self.model.predict_t( - inputs, repeat_network=repeat_network - ) - - ensemble_model_means[:, :, self.state_start_dim :] += obs - - ensemble_model_stds = torch.sqrt(ensemble_model_vars) - - if deterministic: - ensemble_samples = ensemble_model_means - else: - ensemble_samples = ( - ensemble_model_means - + torch.randn(size=ensemble_model_means.shape).to(self.device) * ensemble_model_stds - ) - - # use all dynamics model result - if all_model: - samples = ensemble_samples - samples_var = ensemble_model_vars - # only use elite model result - else: - _, batch_size, _ = ensemble_model_means.shape - model_idxes = np.random.choice(self.model.elite_model_idxes, size=batch_size) - batch_idxes = np.arange(0, batch_size) - samples = ensemble_samples[model_idxes, 
batch_idxes] - samples_var = ensemble_model_vars[model_idxes, batch_idxes] - - rewards, rewards_var = samples[:, :, 0].unsqueeze(2), samples_var[:, :, 0].unsqueeze(2) - next_obs, next_obs_var = ( - samples[:, :, self.state_start_dim :], - samples_var[:, :, self.state_start_dim :], - ) - output = { - 'state': (next_obs, next_obs_var), - 'reward': (rewards, rewards_var), - } - if self.model.env_type == 'mujoco-velocity': - cost, cost_var = samples[:, :, 1].unsqueeze(2), samples_var[:, :, 1].unsqueeze(2) - output['cost'] = (cost, cost_var) - - return output diff --git a/omnisafe/algorithms/model_based/planner.py b/omnisafe/algorithms/model_based/planner.py deleted file mode 100644 index 58a34a168..000000000 --- a/omnisafe/algorithms/model_based/planner.py +++ /dev/null @@ -1,919 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Safe controllers which do a black box optimization incorporating the constraint costs.""" - -import numpy as np -import scipy.stats as stats -import torch - - -class ARCPlanner: # pylint: disable=too-many-instance-attributes - """The Actor Regularized Control (ARC) Planner. - - References: - Title: Learning Off-Policy with Online Planning - Authors: Harshit Sikchi, Wenxuan Zhou, David Held. 
- URL: https://arxiv.org/abs/2008.10066 - """ - - # pylint: disable-next=too-many-locals,too-many-arguments - def __init__( - self, - algo, - cfgs, - device, - env, - models, - actor_critic, - horizon, - popsize, - particles, - max_iters, - alpha, - mixture_coefficient, - kappa, - safety_threshold, - minimal_elites, - obs_clip, - lagrangian_multiplier=None, - ): - self.algo = algo - self.cfgs = cfgs - self.device = device - self.obs_dim = env.observation_space.shape[0] - self.action_dim = env.action_space.shape[0] - self.env = env - self.models = models - self.actor_critic = actor_critic - self.termination_function = default_termination_function - self.horizon = horizon - self.sol_dim = self.env.action_space.shape[0] * horizon - self.action_max = np.repeat(self.env.action_space.high, self.horizon, axis=0) - self.action_min = np.repeat(self.env.action_space.low, self.horizon, axis=0) - self.mean = np.zeros((self.sol_dim,)) - # Shape: [ H * action_dim, 1 ] - self.num_gaussian_traj = popsize - self.mixture_coefficient = mixture_coefficient - self.num_actor_traj = int(self.mixture_coefficient * self.num_gaussian_traj) - - self.particles = particles - self.max_iters = max_iters - self.alpha_plan = alpha - self.kappa = kappa - self.safety_threshold = safety_threshold - self.minimal_elites = minimal_elites - self.state_start_dim = 2 if self.env.env_type == 'mujoco-velocity' else 1 - self.obs_clip = obs_clip - self.lagrangian_multiplier = lagrangian_multiplier - - def planner_reset(self): - """Reset planner when the episode end.""" - self.mean = np.zeros((self.sol_dim,)) - - def generate_actor_action(self, curr_state): - """Generate H steps deterministic and stochastic actor action trajectory using dynamics model.""" - # Set the reward of initial state to zero. 
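# Two model rollouts are generated from the current state: row 0 of actor_action_traj
# follows the deterministic (mean) policy action at every step, while the remaining rows
# repeat a single stochastic policy rollout; both are later mixed into the CEM candidate set.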
- actor_state = np.array( - [np.concatenate(([0] * self.state_start_dim, curr_state.copy()), axis=0)] - * (self.num_actor_traj) - ) - # Shape: [actor_traj, reward_dim (+ cost_dim) + state_dim] - - # Add trajectories using actions suggested by actors - actor_action_traj = np.zeros((self.num_actor_traj, self.sol_dim)) - # Shape: [actor_traj, H * action_dim] - - actor_state = torch.FloatTensor(actor_state).to(self.device) - # Shape: [actor_traj, reward_dim (+ cost_dim) + state_dim] - - actor_state_m = actor_state[0, :].reshape(1, -1) - # Shape: [1, reward_dim (+ cost_dim) + state_dim] - - actor_state_m2 = actor_state[1, :].reshape(1, -1) - # Shape: [1, reward_dim (+ cost_dim) + state_dim] - - for current_horizon in range(self.horizon): - # Use deterministic policy to plan a action trajectory - actor_actions_m, _, _ = self.actor_critic.step( - actor_state_m.reshape(1, -1)[:, self.state_start_dim :], deterministic=True - ) - # Shape: [1, action_dim] - actor_actions_m = torch.tensor(actor_actions_m).to(self.device) - # Use dynamics model to plan - actor_state_m, _ = self.models.safeloop_step( - actor_state_m[:, self.state_start_dim :], - actor_actions_m, - repeat_network=True, - ) - # Shape: [1, reward_dim + state_dim] - - # protection for producing nan - actor_state_m = torch.clamp(actor_state_m, -self.obs_clip, self.obs_clip) - actor_state_m = torch.nan_to_num(actor_state_m) - - # Store a planning action to action buffer - actor_action_traj[ - 0, current_horizon * self.action_dim : (current_horizon + 1) * self.action_dim - ] = (actor_actions_m.detach().cpu().numpy()) - - # Using Stochastic policy to plan a action trajectory - actor_actions, _, _ = self.actor_critic.step(actor_state_m2[:, self.state_start_dim :]) - # Shape: [1, action_dim] - actor_actions = torch.tensor(actor_actions).to(self.device) - - # Use dynamics model to plan - actor_state_m2, _ = self.models.safeloop_step( - actor_state_m2[:, self.state_start_dim :], - actor_actions, - repeat_network=True, - ) - # Shape: [1, reward_dim + state_dim] - - # protection for producing nan - actor_state_m2 = torch.clamp(actor_state_m2, -self.obs_clip, self.obs_clip) - actor_state_m2 = torch.nan_to_num(actor_state_m2) - - # Copy the planning action of stochastic actor (actor_traj-1) times, and store to action buffer - actor_action_traj[ - 1:, current_horizon * self.action_dim : (current_horizon + 1) * self.action_dim - ] = (actor_actions.detach().cpu().numpy()) - return actor_action_traj - - def compute_terminal_reward(self, action_traj, state_traj): - """Compute the terminal reward behind H horizon""" - # This is the final action for evaluating terminated reward and cost - final_action = ( - torch.from_numpy( - action_traj[ - :, - (self.horizon - 1) * self.action_dim : (self.horizon) * self.action_dim, - ].reshape((self.num_gaussian_traj + self.num_actor_traj) * self.particles, -1) - ) - .float() - .to(self.device) - ) - # Shape: [ (num_gau_traj + num_actor_traj) * particles, action_dim ) , - # action_traj Shape: [ (num_gau_traj + num_actor_traj) * particles, H * action_dim] - - final_action = final_action.repeat(self.models.model.network_size, 1) - # Shape: [ (num_gau_traj + num_actor_traj) * particles, network_size , action_dim ) - - # This is the final state for evaluating terminated reward and cost - final_state = ( - torch.from_numpy( - state_traj[self.horizon, :, :, self.state_start_dim :].reshape( - (self.num_gaussian_traj + self.num_actor_traj) - * self.particles - * state_traj.shape[1], - -1, - ) - ) - .float() - .to(self.device) - ) 
- # [ (num_gau_traj + num_actor_traj) * particles, state_dim ] - - terminal_reward = ( - self.actor_critic.critic(final_state, final_action)[0].cpu().detach().numpy() - ) - terminal_reward = terminal_reward.reshape(state_traj.shape[1], -1) - # [ (num_gau_traj + num_actor_traj) * particles, 1] - - return terminal_reward - - def compute_cost_from_state(self, state_traj): - """compute cost from state that dynamics model predict""" - states_flatten = state_traj[:, :, :, self.state_start_dim :].reshape(-1, self.obs_dim) - # [ horizon+1 * network_size * (num_gau_traj + num_actor_traj) * particles, state_dim] - - all_safety_costs = np.zeros((states_flatten.shape[0],)) - # [ horizon+1 * network_size * (num_gau_traj + num_actor_traj) * particles, 1] - - all_safety_costs = self.env.get_observation_cost(states_flatten) - # [ horizon+1 * network_size * (num_gau_traj + num_actor_traj) * particles, 1] - - all_safety_costs = all_safety_costs.reshape( - state_traj.shape[0], state_traj.shape[1], state_traj.shape[2], 1 - ) - # [ horizon+1, network_size, (num_gau_traj + num_actor_traj) * particles, 1] - return all_safety_costs - - # pylint: disable-next=too-many-statements,too-many-locals,too-many-branches - def get_action(self, curr_state): - """Select action when interact with environment.""" - # sample action from actor - if self.num_actor_traj != 0.0: - actor_action_traj = self.generate_actor_action(curr_state) - # Shape: [actor_traj, H * action_dim] - - curr_state = np.array( - [np.concatenate(([0] * self.state_start_dim, curr_state.copy()), axis=0)] - * ((self.num_gaussian_traj + self.num_actor_traj) * self.particles) - ) - # Shape: [(num_gau_traj + num_actor_traj) * particles, reward_dim (+ cost_dim) + state_dim] - - curr_state = np.expand_dims(curr_state, axis=0) - # Shape: [1, (num_gau_traj + num_actor_traj) * particles, reward_dim (+ cost_dim) +state_dim] - - curr_state = np.repeat(curr_state, self.models.model.network_size, 0) - # Shape: [network_size, (num_gau_traj + num_actor_traj) * particles, reward_dim (+ cost_dim) + state_dim] - - # initial mean and var of the sampling normal dist - # shift the current array to the left, clear the used action - self.mean[: -self.action_dim] = self.mean[self.action_dim :] - # Shape: [ H * action_dim, 1 ] - - # fill the last position with the last second action - self.mean[-self.action_dim :] = self.mean[-2 * self.action_dim : -self.action_dim] - mean = self.mean - # Shape: [ H * action_dim, 1 ] - - var = np.tile( - np.square(self.env.action_space.high[0] - self.env.action_space.low[0]) / 16, - [self.sol_dim], - ) - # Shape: [ H * action_dim, 1 ] - - # Create gaussian distribution. 
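# (A unit truncated normal on [-2, 2]; each CEM iteration rescales its samples by the
# current variance, clipped so that two standard deviations stay within the action
# bounds, and shifts them by the current mean.)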
- # mean is the zero vector, var is Unit Matrix - gaussian = stats.truncnorm(-2, 2, loc=np.zeros_like(mean), scale=np.ones_like(mean)) - - current_iter = 0 - while current_iter < self.max_iters: - lb_dist, ub_dist = mean - self.action_min, self.action_max - mean - - constrained_var = np.minimum( - np.minimum(np.square(lb_dist / 2), np.square(ub_dist / 2)), var - ) - - # Generate random normal gaussian variable and multiply by the var, then add to the mean - action_traj = ( - gaussian.rvs(size=(self.num_gaussian_traj, self.sol_dim)) * np.sqrt(constrained_var) - + mean - ).astype(np.float32) - # Shape: [ N , H * action_dim] - - if self.num_actor_traj != 0: - # Combine the actor action with gaussian action - action_traj = np.concatenate((action_traj, actor_action_traj), axis=0) - # Shape: [ num_gau_traj + num_actor_traj, H * action_dim] - - # Multiple particles go through the same action sequence - action_traj = np.repeat(action_traj, self.particles, axis=0) - # Shape: [ particles, num_gau_traj + num_actor_traj, H * action_dim] - - # actions clipped between -1 and 1 - action_traj = np.clip(action_traj, -1, 1) - # Shape: [ particles, num_gau_traj + num_actor_traj, H * action_dim] - - state_traj = ( - torch.from_numpy(np.expand_dims(curr_state.copy(), axis=0)).float().to(self.device) - ) - # Shape: [1, network_size, (num_gau_traj + num_actor_traj) * particles, reward_dim (+ cost_dim) + state_dim] - - var_traj = ( - torch.zeros([1, curr_state.shape[0], curr_state.shape[1], 1]) - .float() - .to(self.device) - ) - # Shape: [1, network_size, (num_gau_traj + num_actor_traj) * particles, 1] - actions = np.repeat( - np.expand_dims(action_traj, axis=0), self.models.model.network_size, axis=0 - ) - # Shape: [ network_size, particles, num_gau_traj + num_actor_traj, H * action_dim] - - actions = torch.FloatTensor(actions).to(self.device) - # Shape: [ network_size, particles, num_gau_traj + num_actor_traj, H * action_dim] - for current_horizon in range(self.horizon): - states_h = state_traj[current_horizon, :, :, self.state_start_dim :] - # [ network_size, (num_gau_traj + num_actor_traj) * particles, state_dim] - # use all dynamics model to predict next state (all_model=True) - next_states, next_var = self.models.safeloop_step( - states_h, - actions[ - :, - :, - current_horizon * self.action_dim : (current_horizon + 1) * self.action_dim, - ], - all_model=True, - repeat_network=False, - ) - # next_states and var shape: - # [ network_size, (num_gau_traj + num_actor_traj) * particles, reward_dim (+ cost_dim) + state_dim] - - # protection for producing nan in rare cases - next_states = torch.clamp(next_states, -self.obs_clip, self.obs_clip) - next_states = torch.nan_to_num(next_states) - - state_traj = torch.cat((state_traj, next_states.unsqueeze(0)), axis=0) - # pylint: disable-next=line-too-long - # [ horizon + 1, network_size, (num_gau_traj + num_actor_traj) * particles, reward_dim (+ cost_dim) + state_dim] - - next_var = next_var[:, :, self.state_start_dim :].sqrt().norm(dim=2).unsqueeze(2) - # [network_size, (num_gau_traj + num_actor_traj) * particles,1] - - var_traj = torch.cat((var_traj, next_var.unsqueeze(0)), axis=0) - # [ horizon + 1, network_size, (num_gau_traj + num_actor_traj) * particles, 1] - - state_traj = state_traj.cpu().detach().numpy() - # pylint: disable-next=line-too-long - # [ horizon + 1, network_size, (num_gau_traj + num_actor_traj) * particles, reward_dim (+ cost_dim) + state_dim] - - var_traj_numpy = var_traj.detach().cpu().numpy() - del var_traj - - if self.env.env_type == 
'mujoco-terminated': - done = np.zeros((state_traj.shape[1], state_traj.shape[2], 1)) - # [network_size, (num_gau_traj + num_actor_traj) * particles, 1] - - # Set the reward of terminated states to zero - for current_horizon in range(1, self.horizon + 1): - for ens in range(state_traj.shape[1]): - # check the state whether terminate - done[ens, :, :] = np.logical_or( - done[ens, :, :], - self.termination_function( - None, - None, - state_traj[current_horizon, ens, :, self.state_start_dim :], - ), - ) - not_done = 1 - done[ens, :, :] - # Set the reward of terminated states to zero - state_traj[current_horizon, ens, :, 0] *= not_done.astype( - np.float32 - ).reshape(-1) - - # Find average cost of each trajectory - returns = np.zeros((self.num_gaussian_traj + self.num_actor_traj,)) - safety_costs = np.zeros((self.num_gaussian_traj + self.num_actor_traj,)) - trajectory_max_vars = np.zeros((self.num_gaussian_traj + self.num_actor_traj,)) - - # Shape: [ num_gau_traj + num_actor_traj, 1 ] - if self.algo == 'SafeLOOP': - terminal_reward = self.compute_terminal_reward(action_traj, state_traj) - # [ (num_gau_traj + num_actor_traj) * particles, 1] - - if self.env.env_type == 'gym': - all_safety_costs = self.compute_cost_from_state(state_traj) - # [ horizon+1, network_size, (num_gau_traj + num_actor_traj) * particles, 1] - - # Calculate the average reward and max cost of N action trajectory, - # each action trajectory have generated (network_size * particles) state-action trajectory - # using (network_size * particles) ensemble models. - for ensemble in self.models.model.elite_model_idxes: - if self.algo == 'SafeLOOP': - if self.env.env_type == 'mujoco-terminated': - done[ensemble, :, :] = np.logical_or( - done[ensemble, :, :], - self.termination_function( - None, - None, - state_traj[self.horizon - 1, ensemble, :, self.state_start_dim :], - ), - ) - not_done = 1 - done[ensemble, :, :] - q_rews = terminal_reward[ensemble, :] * not_done.reshape(-1) - else: - q_rews = terminal_reward[ensemble, :] - - traj_indices = np.arange(0, self.num_gaussian_traj + self.num_actor_traj, 1).astype( - int - ) - for particle in range(self.particles): - returns[traj_indices] += np.sum( - state_traj[ - 1 : self.horizon + 1, - ensemble, - traj_indices * self.particles + particle, - 0, - ], - axis=0, - ) - if self.algo == 'SafeLOOP': - returns[traj_indices] += q_rews.reshape(-1)[ - traj_indices * self.particles + particle - ] - if self.env.env_type == 'gym': - # use state that dynamics predict to compute cost - safety_costs[traj_indices] = np.maximum( - safety_costs, - np.sum( - all_safety_costs[ - 0 : self.horizon, - ensemble, - traj_indices * self.particles + particle, - 0, - ], - axis=0, - ), - ) - elif self.env.env_type == 'mujoco-velocity': - # use cost that dynamics predict at dimension one - safety_costs[traj_indices] += np.sum( - state_traj[ - 1 : self.horizon + 1, - ensemble, - traj_indices * self.particles + particle, - 1, - ], - axis=0, - ) - if self.algo == 'CAP': - trajectory_max_vars[traj_indices] += np.maximum( - trajectory_max_vars, - np.sum( - var_traj_numpy[ - 1 : self.horizon + 1, - ensemble, - traj_indices * self.particles + particle, - 0, - ], - axis=0, - ), - ) - returns /= state_traj.shape[1] * self.particles - # [ num_gau_traj + num_actor_traj, 1] - - if self.algo == 'SafeLOOP': - new_mean, new_var, safety_costs_mean, fail_flag = self.safe_loop_elite_select( - returns, safety_costs, action_traj - ) - if fail_flag is False: - mean = new_mean - else: # rare case for protecting bug - break - elif 
self.algo == 'CAP': - safety_costs /= state_traj.shape[1] * self.particles - if self.cfgs.cost_gamma == 1.0: - c_gamma_discount = self.cfgs.max_ep_len / self.horizon - # Extend the cost to the entire trajectory - else: - c_gamma_discount = (1 / self.horizon) * ( - (1 - self.cfgs.cost_gamma**self.cfgs.max_ep_len) - / (1 - self.cfgs.cost_gamma) - ) - # average the cost, then view it as the starting element of the arithmetic progression - safety_costs = c_gamma_discount * safety_costs - - penalty = torch.nn.ReLU()(self.lagrangian_multiplier).item() - safety_costs = safety_costs + penalty * trajectory_max_vars - mean, new_var, safety_costs_mean = self.cap_elite_select( - returns, safety_costs, action_traj - ) - var = (self.alpha_plan) * var + (1 - self.alpha_plan) * new_var - current_iter += 1 - - del state_traj, action_traj - - # Initialize the var every 6 times - if (current_iter + 1) % 6 == 0: - var = np.tile( - np.square(self.env.action_space.high[0] - self.env.action_space.low[0]) / 16.0, - [self.sol_dim], - ) * (1.5 ** ((current_iter + 1) // 6)) - - # If safe trajectory not enough and t>5 or t>25 ,then break - if ( - ((safety_costs < self.safety_threshold).sum() >= self.minimal_elites) - and current_iter > 5 - ) or current_iter > 25: - break - - # Store the mean and use it in next plan - self.mean = mean - - # Return [1, action_dim], that is the first action of H horizon action mean, which shape is [1, H * action_dim] - return mean[: self.action_dim], safety_costs_mean - - def cap_elite_select(self, returns, safety_costs, action_traj): - """TODO""" - # returns: [ num_gau_traj + num_actor_traj, 1] - # safety_costs: [ num_gau_traj + num_actor_traj, 1] - # action_traj: [ (num_gau_traj + num_actor_traj) * particle, H * action_dim] - safety_costs_mean = np.mean(safety_costs) - if (safety_costs < self.safety_threshold).sum() < self.minimal_elites: - indices = np.argsort(safety_costs) - indices *= self.particles - elites = action_traj[indices][: self.minimal_elites] - else: - costs = ( - -returns * (safety_costs < self.safety_threshold) - + (safety_costs >= self.safety_threshold) * 1e4 - ) - indices = np.argsort(costs) - indices = np.array([idx for idx in indices if costs[idx] < 1e3]) - indices *= self.particles - elites = action_traj[indices][: min(self.minimal_elites, indices.shape[0])] - mean = np.mean(elites, axis=0) - new_var = np.var(elites, axis=0) - return mean, new_var, safety_costs_mean - - def cap_elite_selection(self, returns, safety_costs, action_traj): - """TODO""" - # returns: [ num_gau_traj + num_actor_traj, 1] - # safety_costs: [ num_gau_traj + num_actor_traj, 1] - # action_traj: [ (num_gau_traj + num_actor_traj) * particle, H * action_dim] - all_action = action_traj[ - np.arange(0, self.num_gaussian_traj + self.num_actor_traj, 1).astype(int) - * self.particles, - :, - ] - # all_action is [ num_gau_traj + num_actor_traj, H * action_dim] - - # find the index for safe trajectories - feasible_ids = (safety_costs <= self.safety_threshold).nonzero()[0] - if feasible_ids.shape[0] < self.minimal_elites: - # if safe trajectories not enough - elite_ids = np.argsort(safety_costs)[: self.minimal_elites] - else: - # if have enough safe trajectories - # select the top k reward in safe action trajectories - elite_ids = feasible_ids[np.argsort(-returns[feasible_ids])][: self.minimal_elites] - - elite_action = all_action[elite_ids] - # [ elite_ids, H * action_dim] - - mean = np.mean(elite_action, axis=0) - # [ 1, H * action_dim] - - var = np.var(elite_action, axis=0) - # [ 1, H * action_dim] - 
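# The mean and variance over the elite action sequences parameterize the refined
# sampling distribution for the next planning iteration.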
- return mean, var - - def safe_loop_elite_select(self, returns, safety_costs, action_traj): - """Update mean and var using reward and cost""" - # returns: [ num_gau_traj + num_actor_traj, 1] - # safety_costs: [ num_gau_traj + num_actor_traj, 1] - # action_traj: [ (num_gau_traj + num_actor_traj) * particle, H * action_dim] - safety_costs_mean = np.mean(safety_costs) - - if (safety_costs < self.safety_threshold).sum() < self.minimal_elites: - safety_rewards = -safety_costs - # [ num_gau_traj + num_actor_traj, 1] - - max_safety_reward = np.max(safety_rewards) - # [1,1] - - score = np.exp(self.kappa * (safety_rewards - max_safety_reward)) - # [ num_gau_traj + num_actor_traj, 1] - - indices = np.argsort(safety_costs) - # [ num_gau_traj + num_actor_traj, 1] - - mean = np.sum( - action_traj[ - np.arange(0, self.num_gaussian_traj + self.num_actor_traj, 1).astype(int) - * self.particles, - :, - ] - * score.reshape(-1, 1), - axis=0, - ) / (np.sum(score) + 1e-10) - # mean: [1, H * action_dim], - # action_traj: [ num_gau_traj + num_actor_traj, H * action_dim], - # score: [ num_gau_traj + num_actor_traj, 1] - - new_var = np.average( - ( - action_traj[ - np.arange(0, self.num_gaussian_traj + self.num_actor_traj, 1).astype(int) - * self.particles, - :, - ] - - mean - ) - ** 2, - weights=score.reshape(-1), - axis=0, - ) - # [ 1, H * action_dim] - - else: # if have enough safe trajectory - # safe trajectory's costs is -reward, unsafe trajectory's costs is 1e4 - costs = ( - -returns * (safety_costs < self.safety_threshold) - + (safety_costs >= self.safety_threshold) * 1e4 - ) - # [ num_gau_traj + num_actor_traj, 1] - - # select indices of safe trajectory - indices = np.arange(costs.shape[0]) - indices = np.array([idx for idx in indices if costs[idx] < 1e3]) - # [ num_safe_traj, 1] - - # rare case - if indices.shape[0] == 0 or action_traj.shape[0] == 0: - return False, False, False, True - - safe_action_traj = action_traj[ - np.arange(0, self.num_gaussian_traj + self.num_actor_traj, 1).astype(int) - * self.particles, - :, - ][indices, :] - # [ num_safe_traj, H * action_dim] - - # use safe trajectory and its reward as weight to update - rewards = -costs[indices] - # [ num_safe_traj, 1 ] - - max_reward = np.max(rewards) - # [ 1, 1 ] - - score = np.exp(self.kappa * (rewards - max_reward)) - # [ num_safe_traj, 1 ] - - mean = np.sum(safe_action_traj * score.reshape(-1, 1), axis=0) / (np.sum(score) + 1e-10) - # [1, H * action_dim] = [1, H * action_dim] / [1,1] - - new_var = np.average((safe_action_traj - mean) ** 2, weights=score.reshape(-1), axis=0) - # [ 1, H * action_dim] - return mean, new_var, safety_costs_mean, False - - -def default_termination_function(state, action, next_state): # pylint: disable=unused-argument - '# Default termination function that outputs done=False' - if torch.is_tensor(next_state): - done = torch.zeros((next_state.shape[0], 1)) - else: - done = np.zeros((next_state.shape[0], 1)) - return done - - -# pylint: disable-next=too-many-instance-attributes -class CCEPlanner: - """Constrained Cross-Entropy (CCE) Planner. - - References: - Title: Constrained Cross-Entropy Method for Safe Reinforcement Learning - Authors: Min Wen, Ufuk Topcu. 
- URL: https://proceedings.neurips.cc/paper/2018/hash/34ffeb359a192eb8174b6854643cc046-Abstract.html - """ - - # pylint: disable-next=too-many-locals, too-many-arguments - def __init__( - self, - algo, - cfgs, - device, - env, - models, - horizon, - popsize, - particles, - max_iters, - alpha, - mixture_coefficient, - minimal_elites, - epsilon, - obs_clip, - lagrangian_multiplier, - cost_constrained=True, - penalize_uncertainty=True, - ): - self.algo = algo - self.cfgs = cfgs - self.obs_dim, self.action_dim = env.observation_space.shape[0], env.action_space.shape[0] - self.action_max, self.action_min = env.action_space.high, env.action_space.low - self.gamma = self.cfgs.gamma - self.c_gamma = self.cfgs.cost_gamma - self.cost_limit = self.cfgs.lagrange_cfgs.cost_limit - self.cost_constrained = cost_constrained - self.penalize_uncertainty = penalize_uncertainty - self.device = device - self.obs_clip = obs_clip - self.particles = particles - self.horizon = horizon - self.num_gaussian_traj = popsize - self.minimal_elites = minimal_elites - self.max_iters = max_iters - self.alpha = alpha - self.epsilon = epsilon - self.horizin_action_min = np.tile(self.action_min, [self.horizon]) - self.horizin_action_max = np.tile(self.action_max, [self.horizon]) - self.env = env - self.ac_buf = np.array([]).reshape(0, self.action_dim) - self.prev_sol = np.tile((self.action_min + self.action_max) / 2, [self.horizon]) - self.init_var = np.tile(np.square(self.action_max - self.action_min) / 16, [self.horizon]) - self.state_start_dim = 2 if self.env.env_type == 'mujoco-velocity' else 1 - self.mixture_coefficient = mixture_coefficient - self.lagrangian_multiplier = lagrangian_multiplier - self.models = models - self.elites = None - - def get_action(self, obs): - """Get action from previous solution or planner""" - if self.models is None: - return np.random.uniform(self.action_min, self.action_max, self.action_min.shape) - if self.ac_buf.shape[0] > 0: - action, self.ac_buf = self.ac_buf[0], self.ac_buf[1:] - return action - - soln = self.obtain_solution(obs, self.prev_sol, self.init_var) - self.prev_sol = np.concatenate( - [np.copy(soln)[self.action_dim :], np.zeros(self.action_dim)] - ) - self.ac_buf = soln[: self.action_dim].reshape(-1, self.action_dim) - - return self.get_action(obs) - - # pylint: disable-next=too-many-locals - def obtain_solution(self, obs, init_mean, init_var): - """Get action from planner""" - mean, var, iteration = init_mean, init_var, 0 - gaussian = stats.truncnorm(-2, 2, loc=np.zeros_like(mean), scale=np.ones_like(var)) - - while (iteration < self.max_iters) and np.max(var) > self.epsilon: - lb_dist, ub_dist = mean - self.horizin_action_min, self.horizin_action_max - mean - constrained_var = np.minimum( - np.minimum(np.square(lb_dist / 2), np.square(ub_dist / 2)), var - ) - - noise = gaussian.rvs(size=[self.num_gaussian_traj, self.horizon * self.action_dim]) - - samples = noise * np.sqrt(constrained_var) + mean - samples = samples.astype(np.float32) - - rewards, costs, eps_lens = self.rollout(obs, samples) - epoch_ratio = np.ones_like(eps_lens) * self.cfgs.max_ep_len / self.horizon - terminated = eps_lens != self.horizon - if self.c_gamma == 1: - c_gamma_discount = epoch_ratio - else: - c_gamma_discount = ( - (1 - self.c_gamma ** (epoch_ratio * self.horizon)) - / (1 - self.c_gamma) - / self.horizon - ) - rewards = rewards * epoch_ratio - costs = costs * c_gamma_discount - - feasible_ids = ((costs <= self.cost_limit) & (~terminated)).nonzero()[0] - if self.cost_constrained: - if 
feasible_ids.shape[0] >= self.minimal_elites: - elite_ids = feasible_ids[np.argsort(-rewards[feasible_ids])][ - : self.minimal_elites - ] - else: - elite_ids = np.argsort(costs)[: self.minimal_elites] - else: - elite_ids = np.argsort(-rewards)[: self.minimal_elites] - self.elites = samples[elite_ids] - new_mean = np.mean(self.elites, axis=0) - new_var = np.var(self.elites, axis=0) - mean = self.alpha * mean + (1 - self.alpha) * new_mean - var = self.alpha * var + (1 - self.alpha) * new_var - iteration += 1 - - return mean - - @torch.no_grad() - def rollout(self, obs, ac_seqs): - """Roll out H step to compute reward, cost""" - # obs: [obs_dim,] - # ac_seqs: [num_gaussian_traj, horizon * action_dim] - - ac_seqs = torch.from_numpy(ac_seqs).float().to(self.device) - ac_seqs = ac_seqs.view(-1, self.horizon, self.action_dim) - transposed = ac_seqs.transpose(0, 1) - expanded = transposed[:, :, None] - tiled = expanded.expand(-1, -1, self.particles, -1) - ac_seqs = tiled.contiguous().view(self.horizon, -1, self.action_dim) - - # Expand current observation - cur_obs = torch.from_numpy(obs).float().to(self.device) - cur_obs = cur_obs[None] - cur_obs = cur_obs.expand(self.num_gaussian_traj * self.particles, -1) - # cur_obs: [num_gaussian_traj * particles, obs_dim] - rewards = torch.zeros(self.num_gaussian_traj, self.particles, device=self.device) - costs = torch.zeros(self.num_gaussian_traj, self.particles, device=self.device) - length = torch.zeros(self.num_gaussian_traj, self.particles, device=self.device) - - for horizon in range(self.horizon): - cur_acs = ac_seqs[horizon] - cur_obs, reward, cost = self._predict_next(cur_obs, cur_acs) - # Clip state value - cur_obs = torch.clamp(cur_obs, -self.obs_clip, self.obs_clip) - reward = reward.view(-1, self.particles) - cost = cost.view(-1, self.particles) - - rewards += reward - costs += cost - length += 1 - - # Replace nan with high cost - rewards = rewards.nan_to_num_(-1e6) - costs = costs.nan_to_num_(1e6) - - return ( - rewards.mean(dim=1).detach().cpu().numpy(), - costs.mean(dim=1).detach().cpu().numpy(), - length.mean(dim=1).detach().cpu().numpy(), - ) - - def _predict_next(self, obs, acs): - """Predict next state, reward and cost""" - # obs: [num_gaussian_traj * particles, obs_dim] - proc_obs = self._expand_to_ts_format(obs) - # [network_size, num_gaussian_traj*particles/network_size, state_dim] - proc_acs = self._expand_to_ts_format(acs) - output = self.models.cap_step(proc_obs, proc_acs) - next_obs, var = output['state'] - # [network_size, num_gaussian_traj*particles/network_size, state_dim] - reward, _ = output['reward'] - # [network_size, num_gaussian_traj*particles, 1] - reward = self._flatten_to_matrix(reward) - # [network_size * num_gaussian_traj * particles, 1] - - if self.env.env_type == 'mujoco-velocity': - cost, _ = output['cost'] - cost = self._flatten_to_matrix(cost) - elif self.env.env_type == 'gym': - next_obs_cost = next_obs.unsqueeze(0) - cost = self.compute_cost_from_state(next_obs_cost) - cost = torch.tensor(cost, device=self.device) - # [1, network_size, num_gaussian_traj*particles/network_size, 1] - cost = cost.squeeze(0) - # [network_size, num_gaussian_traj*particles/network_size, 1] - cost = self._flatten_to_matrix(cost) - # [num_gaussian_traj*particles, 1] - - next_obs = self._flatten_to_matrix(next_obs) - - obs = obs.detach().cpu().numpy() - acs = acs.detach().cpu().numpy() - - if self.cost_constrained and self.penalize_uncertainty: - # var: [network_size, num_gaussian_traj*particles/network_size, state_dim] - var_penalty = 
var.sqrt().norm(dim=2).max(0)[0] - # cost_penalty: [num_gaussian_traj*particles/network_size] - var_penalty = var_penalty.repeat_interleave(self.models.model.network_size).view( - cost.shape - ) - # cost_penalty: [num_gaussian_traj*particles, 1] - penalty = torch.nn.ReLU()(self.lagrangian_multiplier).item() - cost += penalty * var_penalty - - return next_obs, reward, cost - - def _expand_to_ts_format(self, mat): - """Expand input to ensemble network input format""" - dim = mat.shape[-1] - # eg:state_dim - reshaped = mat.view( - -1, - self.models.model.network_size, - self.particles // self.models.model.network_size, - dim, - ) - # [num_gaussian_traj, network_size, particles // network_size, state_dim] - transposed = reshaped.transpose(0, 1) - # [network_size, num_gaussian_traj, particles // network_size, state_dim] - reshaped = transposed.contiguous().view(self.models.model.network_size, -1, dim) - # [network_size, num_gaussian_traj * particles / network_size, state_dim] - - return reshaped - - def _flatten_to_matrix(self, ts_fmt_arr): - """Flatten ensemble network output format to matrix""" - - dim = ts_fmt_arr.shape[-1] - reshaped = ts_fmt_arr.view( - self.models.model.network_size, - -1, - self.particles // self.models.model.network_size, - dim, - ) - transposed = reshaped.transpose(0, 1) - reshaped = transposed.contiguous().view(-1, dim) - return reshaped - - def compute_cost_from_state(self, state_traj): - """compute cost from state that dynamics model predict""" - states_flatten = state_traj[:, :, :, :].reshape(-1, self.obs_dim) - # [ horizon+1 * network_size * (num_gau_traj + num_actor_traj) * particles, state_dim] - - all_safety_costs = np.zeros((states_flatten.shape[0],)) - # [ horizon+1 * network_size * (num_gau_traj + num_actor_traj) * particles, 1] - - all_safety_costs = self.env.get_observation_cost(states_flatten) - # [ horizon+1 * network_size * (num_gau_traj + num_actor_traj) * particles, 1] - - all_safety_costs = all_safety_costs.reshape( - state_traj.shape[0], state_traj.shape[1], state_traj.shape[2], 1 - ) - # [ horizon+1, network_size, (num_gau_traj + num_actor_traj) * particles, 1] - return all_safety_costs diff --git a/omnisafe/algorithms/model_based/policy_gradient.py b/omnisafe/algorithms/model_based/policy_gradient.py deleted file mode 100644 index 310a7fb72..000000000 --- a/omnisafe/algorithms/model_based/policy_gradient.py +++ /dev/null @@ -1,304 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""The Policy Gradient algorithm in model-based setting.""" - -import time -from copy import deepcopy - -import numpy as np -import torch - -from omnisafe.algorithms import registry -from omnisafe.algorithms.model_based.models import EnsembleDynamicsModel, VirtualEnv -from omnisafe.common.buffer import OffPolicyBuffer -from omnisafe.common.logger import Logger -from omnisafe.models.constraint_actor_critic import ConstraintActorCritic -from omnisafe.utils import core -from omnisafe.utils.distributed_utils import proc_id -from omnisafe.wrappers import wrapper_registry - - -@registry.register -class PolicyGradientModelBased: # pylint: disable=too-many-instance-attributes - """The Policy Gradient algorithm in Model-Based setting. - - References: - Title: Policy Gradient Methods for Reinforcement Learning with Function Approximation - Authors: Richard S. Sutton, David McAllester, Satinder Singh, Yishay Mansour. - URL: https://proceedings.neurips.cc/paper/1999/file/464d828b85b0bed98e80ade0a5c43b0f-Paper.pdf - """ - - def __init__(self, env_id, cfgs=None) -> None: - self.env_id = env_id - self.cfgs = deepcopy(cfgs) - self.algo = self.__class__.__name__ - self.wrapper_type = self.cfgs.wrapper_type - self.env = wrapper_registry.get(self.wrapper_type)(self.algo, self.env_id) - - self.device = torch.device(self.cfgs.device) - self.cost_gamma = self.cfgs.cost_gamma - # Set up logger and save configuration to disk - # Get local parameters before logger instance to avoid unnecessary print - self.logger = Logger( - output_dir=cfgs.data_dir, - exp_name=cfgs.exp_name, - seed=cfgs.seed, - use_tensorboard=cfgs.use_tensorboard, - use_wandb=cfgs.use_wandb, - config=cfgs, - ) - - # Set seed - seed = int(cfgs.seed) - seed += 10000 * proc_id() - torch.manual_seed(seed) - np.random.seed(seed) - - # Set env - self.env.env.reset(seed=seed) - self.env.set_eplen(int(self.cfgs.max_ep_len)) - - # Initialize dynamics model - self.dynamics = EnsembleDynamicsModel( - self.algo, - self.env.env_type, - self.device, - state_size=self.env.dynamics_state_size, - action_size=self.env.action_space.shape[0], - reward_size=1, - cost_size=1, - **self.cfgs.dynamics_cfgs, - ) - self.virtual_env = VirtualEnv(self.algo, self.dynamics, self.env_id, self.device) - - # Initialize off-policy buffer - self.off_replay_buffer = OffPolicyBuffer( - obs_space=self.env.observation_space, - act_space=self.env.action_space, - size=self.cfgs.replay_size, - batch_size=self.cfgs.batch_size, - device=self.device, - ) - - if self.algo in ['MBPPOLag', 'SafeLOOP']: - self.use_actor = True - self.actor_critic = self.set_algorithm_specific_actor_critic() - else: - self.use_actor = False - - # Setup statistics - self.start_time = time.time() - self.epoch_time = time.time() - - self.logger.log('Start with training.') - - self._init_log() - - def _init_log(self): - self.logger.register_key('TotalEnvSteps3') - self.logger.register_key('Metrics/EpRet') - self.logger.register_key('Metrics/EpCost') - self.logger.register_key('Metrics/EpLen') - self._specific_init_logs() - self.logger.register_key('Time') - - def _specific_init_logs(self): - pass - - def learn(self): # pylint: disable=too-many-locals - """training the policy.""" - self.start_time = time.time() - ep_len, ep_ret, ep_cost = 0, 0, 0 - state = self.env.reset() - time_step = 0 - last_policy_update, last_dynamics_update, last_log = 0, 0, 0 - while time_step < self.cfgs.max_real_time_steps: - # select action - action, 
action_info = self.select_action(time_step, state, self.env) - - next_state, reward, cost, terminated, truncated, info = self.env.step( - action, self.cfgs.action_repeat - ) - - time_step += info['step_num'] - ep_cost += (self.cost_gamma**ep_len) * cost - ep_len += 1 - ep_ret += reward - self.store_real_data( - time_step, - ep_len, - state, - action_info, - action, - reward, - cost, - terminated, - truncated, - next_state, - info, - ) - - state = next_state - if terminated or truncated: - self.logger.store( - **{ - 'Metrics/EpRet': ep_ret, - 'Metrics/EpLen': ep_len * self.cfgs.action_repeat, - 'Metrics/EpCost': ep_cost, - } - ) - ep_ret, ep_cost, ep_len = 0, 0, 0 - state = self.env.reset() - self.algo_reset() - - if ( - time_step % self.cfgs.update_dynamics_freq < self.cfgs.action_repeat - and time_step - last_dynamics_update >= self.cfgs.update_dynamics_freq - ): - self.update_dynamics_model() - last_dynamics_update = time_step - - if ( - self.use_actor - and time_step % self.cfgs.update_policy_freq < self.cfgs.action_repeat - and time_step - last_policy_update >= self.cfgs.update_policy_freq - ): - self.update_actor_critic(time_step) - last_policy_update = time_step - - # Evaluate episode - if ( - time_step % self.cfgs.log_freq < self.cfgs.action_repeat - and time_step - last_log >= self.cfgs.log_freq - ) or time_step == self.cfgs.max_real_time_steps - 1: - self.log(time_step) - self.logger.torch_save() - last_log = time_step - # Close opened files to avoid number of open files overflow - self.logger.close() - - def log(self, time_step: int): - """ - logging data - """ - # Some child classes may add information to logs - self.algorithm_specific_logs(time_step) - - self.logger.store( - **{ - 'TotalEnvSteps3': time_step, - 'Time': int(time.time() - self.start_time), - } - ) - - self.logger.dump_tabular() - - def select_action(self, time_step, state, env): # pylint: disable=unused-argument - """ - Select action when interact with real environment. - - Returns: - action, action_info - """ - if self.env.env_type == 'gym': - state = env.generate_lidar(state) - state_vec = np.array(state) - state_tensor = torch.as_tensor(state_vec, device=self.device, dtype=torch.float32) - action, val, cval, logp = self.actor_critic.step(state_tensor) - action = np.nan_to_num(action) - action_info = {'state_vec': state_vec, 'val': val, 'cval': cval, 'logp': logp} - return action, action_info - - def algorithm_specific_logs(self, time_step): - """ - Use this method to collect log information. - e.g. log lagrangian for lagrangian-base , log q, r, s, c for CPO, etc - - Returns: - No return - """ - - def update_actor_critic(self, time_step): - """ - Use this method to update actor and critic. - - Returns: - No return - """ - - def set_algorithm_specific_actor_critic(self): - """ - Use this method to initialize network. - e.g. 
Initialize Soft Actor Critic - - Returns: - Actor_critic - """ - self.actor_critic = ConstraintActorCritic( - observation_space=self.env.observation_space, - action_space=self.env.action_space, - model_cfgs=self.cfgs.model_cfgs, - ).to(self.device) - # Set up optimizer for policy and value function - - self.actor_optimizer = core.set_optimizer( - 'Adam', module=self.actor_critic.actor, learning_rate=self.cfgs.actor_lr - ) - self.reward_critic_optimizer = core.set_optimizer( - 'Adam', module=self.actor_critic.reward_critic, learning_rate=self.cfgs.critic_lr - ) - self.cost_critic_optimizer = core.set_optimizer( - 'Adam', module=self.actor_critic.cost_critic, learning_rate=self.cfgs.critic_lr - ) - - return self.actor_critic - - def update_dynamics_model(self): - """ - training the dynamics model - - Returns: - No return - """ - - def algo_reset(self): - """ - reset algo parameters - - Returns: - No return - """ - - # pylint: disable-next=too-many-arguments - def store_real_data( - self, - time_step, - ep_len, - state, - action_info, - action, - reward, - cost, - terminated, - truncated, - next_state, - info, - ): - """ - store real env data to buffer - - Returns: - No return - """ diff --git a/omnisafe/algorithms/model_based/safeloop.py b/omnisafe/algorithms/model_based/safeloop.py deleted file mode 100644 index f1d52e4bb..000000000 --- a/omnisafe/algorithms/model_based/safeloop.py +++ /dev/null @@ -1,311 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Implementation of the SafeLOOP algorithm.""" - -from copy import deepcopy - -import numpy as np -import torch - -from omnisafe.algorithms import registry -from omnisafe.algorithms.model_based.planner import ARCPlanner -from omnisafe.algorithms.model_based.policy_gradient import PolicyGradientModelBased -from omnisafe.models.actor_q_critic import ActorQCritic -from omnisafe.utils import core - - -@registry.register -class SafeLOOP( - PolicyGradientModelBased, ARCPlanner -): # pylint: disable=too-many-instance-attributes - """The Safe Learning Off-Policy with Online Planning (SafeLOOP) algorithm. - - References: - Title: Learning Off-Policy with Online Planning - Authors: Harshit Sikchi, Wenxuan Zhou, David Held. 
- URL: https://arxiv.org/abs/2008.10066 - """ - - def __init__(self, env_id, cfgs) -> None: - PolicyGradientModelBased.__init__( - self, - env_id=env_id, - cfgs=cfgs, - ) - # # Initialize Actor-Critic - self.actor_critic = self.set_algorithm_specific_actor_critic() - self.ac_targ = deepcopy(self.actor_critic) - self._ac_training_setup() - - self.alpha = self.cfgs.alpha - self.alpha_gamma = self.cfgs.alpha_gamma - ARCPlanner.__init__( - self, - self.algo, - self.cfgs, - self.device, - self.env, - self.virtual_env, - self.actor_critic, - **self.cfgs.mpc_config, - ) - - # Set up model saving - what_to_save = { - 'pi': self.actor_critic.actor, - 'dynamics': self.dynamics, - } - self.logger.setup_torch_saver(what_to_save=what_to_save) - self.logger.torch_save() - - def _specific_init_logs(self): - self.logger.register_key('Loss/DynamicsTrainMseLoss') - self.logger.register_key('Loss/DynamicsValMseLoss') - self.logger.register_key('Plan/safety_costs_mean') - self.logger.register_key('QVals') - self.logger.register_key('Loss/Pi') - self.logger.register_key('Loss/Value') - - # pylint: disable-next=too-many-locals - def compute_loss_v(self, data): - """Computing value loss. - - Args: - data (dict): data from replay buffer. - - Returns: - torch.Tensor. - """ - obs, act, rew, next_obs, done = ( - data['obs'], - data['act'], - data['reward'], - data['next_obs'], - data['done'], - ) - q_value_list = self.actor_critic.critic(obs, act) - # Bellman backup for Q function - with torch.no_grad(): - act_targ, logp_a_next = self.ac_targ.actor.predict( - obs, deterministic=False, need_log_prob=True - ) - q_targ = torch.min(torch.vstack(self.ac_targ.critic(next_obs, act_targ)), dim=0).values - backup = rew + self.cfgs.gamma * (1 - done) * (q_targ - self.alpha * logp_a_next) - # MSE loss against Bellman backup - loss_q = [] - q_values = [] - for q_value in q_value_list: - loss_q.append(torch.mean((q_value - backup) ** 2)) - q_values.append(torch.mean(q_value)) - - # Useful info for logging - q_info = {'QVals': sum(q_values).cpu().detach().numpy()} - return sum(loss_q), q_info - - def compute_loss_pi(self, data: dict): - """Computing pi/actor loss. - - Args: - data (dict): data from replay buffer. - - Returns: - torch.Tensor. - """ - action, logp_a = self.actor_critic.actor.predict( - data['obs'], deterministic=True, need_log_prob=True - ) - loss_pi = self.actor_critic.critic(data['obs'], action)[0] - self.alpha * logp_a - pi_info = {'LogPi': logp_a.cpu().detach().numpy()} - return -loss_pi.mean(), pi_info - - def update_policy_net(self, data) -> None: - """Update policy network. - - Args: - data (dict): data dictionary. - """ - # Train policy with one steps of gradient descent - self.actor_optimizer.zero_grad() - loss_pi, _ = self.compute_loss_pi(data) - loss_pi.backward() - self.actor_optimizer.step() - self.logger.store(**{'Loss/Pi': loss_pi.item()}) - - def alpha_discount(self): - """Alpha discount.""" - self.alpha *= self.alpha_gamma - - def polyak_update_target(self): - """Polyak update target network.""" - with torch.no_grad(): - for param, param_targ in zip(self.actor_critic.parameters(), self.ac_targ.parameters()): - # Notes: We use an in-place operations "mul_", "add_" to update target - # params, as opposed to "mul" and "add", which would make new tensors. - param_targ.data.mul_(self.cfgs.polyak) - param_targ.data.add_((1 - self.cfgs.polyak) * param.data) - - def update_value_net(self, data: dict) -> None: - """Update value network. 
- - Args: - data (dict): data dictionary - """ - # Train value critic with one steps of gradient descent - self.critic_optimizer.zero_grad() - loss_q, q_info = self.compute_loss_v(data) - loss_q.backward() - self.critic_optimizer.step() - self.logger.store(**{'Loss/Value': loss_q.item(), 'QVals': q_info['QVals']}) - - def set_algorithm_specific_actor_critic(self): - """ - Use this method to initialize network. - e.g. Initialize Soft Actor Critic - - Returns: - Actor_critic - """ - self.actor_critic = ActorQCritic( - observation_space=self.env.observation_space, - action_space=self.env.action_space, - model_cfgs=self.cfgs.model_cfgs, - ).to(self.device) - # Set up optimizer for policy and q-function - self.actor_optimizer = core.set_optimizer( - 'Adam', module=self.actor_critic.actor, learning_rate=self.cfgs.actor_lr - ) - self.critic_optimizer = core.set_optimizer( - 'Adam', module=self.actor_critic.critic, learning_rate=self.cfgs.critic_lr - ) - return self.actor_critic - - def _ac_training_setup(self): - """Set up target network for off_policy training.""" - # Freeze target networks with respect to optimizer (only update via polyak averaging) - for param in self.ac_targ.actor.parameters(): - param.requires_grad = False - for param in self.ac_targ.critic.parameters(): - param.requires_grad = False - - def algorithm_specific_logs(self, time_step): - """Log algo parameter""" - super().algorithm_specific_logs(time_step) - if time_step < self.cfgs.update_policy_start_timesteps: - self.logger.store( - **{ - 'Loss/Pi': 0, - 'Plan/safety_costs_mean': 0, - 'QVals': 0, - 'Loss/Value': 0, - } - ) - - def update_actor_critic(self, time_step): - """update actor and critic""" - if time_step >= self.cfgs.update_policy_start_timesteps: - for _ in range(self.cfgs.update_policy_iters): - data = self.off_replay_buffer.sample_batch() - # First run one gradient descent step for Q. - self.update_value_net(data) - - # Freeze Q-network so you don't waste computational effort - # computing gradients for it during the policy learning step. - for param in self.actor_critic.critic.parameters(): - param.requires_grad = False - - # Next run one gradient descent step for actor. - self.update_policy_net(data) - - # Unfreeze Q-network so you can optimize it at next DDPG step. - for param in self.actor_critic.critic.parameters(): - param.requires_grad = True - - # Finally, update target networks by polyak averaging. 
- self.polyak_update_target() - self.alpha_discount() - - def update_dynamics_model(self): - """Update dynamics.""" - state = self.off_replay_buffer.data['obs'][: self.off_replay_buffer.size, :] - action = self.off_replay_buffer.data['act'][: self.off_replay_buffer.size, :] - reward = self.off_replay_buffer.data['reward'][: self.off_replay_buffer.size] - cost = self.off_replay_buffer.data['cost'][: self.off_replay_buffer.size] - next_state = self.off_replay_buffer.data['next_obs'][: self.off_replay_buffer.size, :] - delta_state = next_state - state - inputs = np.concatenate((state, action), axis=-1) - if self.env.env_type == 'mujoco-velocity': - labels = np.concatenate( - ( - np.reshape(reward, (reward.shape[0], -1)), - np.reshape(cost, (cost.shape[0], -1)), - delta_state, - ), - axis=-1, - ) - elif self.env.env_type == 'gym': - labels = np.concatenate( - (np.reshape(reward, (reward.shape[0], -1)), delta_state), axis=-1 - ) - train_mse_losses, val_mse_losses = self.dynamics.train( - inputs, labels, batch_size=256, holdout_ratio=0.2 - ) - self.logger.store( - **{ - 'Loss/DynamicsTrainMseLoss': train_mse_losses, - 'Loss/DynamicsValMseLoss': val_mse_losses, - } - ) - - def select_action(self, time_step, state, env): - """action selection""" - if time_step < self.cfgs.update_policy_start_timesteps: - action = self.env.action_space.sample() - - else: - action, safety_costs_mean = self.get_action(np.array(state)) - self.logger.store( - **{ - 'Plan/safety_costs_mean': safety_costs_mean, - } - ) - action = action + np.random.normal(action.shape) * self.cfgs.exploration_noise - action = np.clip(action, env.action_space.low, env.action_space.high) - return action, None - - def store_real_data( - self, - time_step, - ep_len, - state, - action_info, - action, - reward, - cost, - terminated, - truncated, - next_state, - info, - ): # pylint: disable=too-many-arguments - """store real data""" - if not terminated and not truncated and not info['goal_met']: - # Current goal position is not related to the last goal position, so do not store. - self.off_replay_buffer.store( - obs=state, act=action, reward=reward, cost=cost, next_obs=next_state, done=truncated - ) - - def algo_reset(self): - """reset planner""" - if self.env.env_type == 'gym': - self.planner_reset() diff --git a/omnisafe/algorithms/off_policy/__init__.py b/omnisafe/algorithms/off_policy/__init__.py deleted file mode 100644 index 120a3bec3..000000000 --- a/omnisafe/algorithms/off_policy/__init__.py +++ /dev/null @@ -1,44 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Off-policy algorithms.""" - -from omnisafe.algorithms.off_policy.crpo import OffCRPO -from omnisafe.algorithms.off_policy.cvpo import CVPO -from omnisafe.algorithms.off_policy.ddpg import DDPG -from omnisafe.algorithms.off_policy.ddpg_lag import DDPGLag -from omnisafe.algorithms.off_policy.ddpg_pid import DDPGPid -from omnisafe.algorithms.off_policy.sac import SAC -from omnisafe.algorithms.off_policy.sac_lag import SACLag -from omnisafe.algorithms.off_policy.sac_pid import SACPid -from omnisafe.algorithms.off_policy.sddpg import SDDPG -from omnisafe.algorithms.off_policy.td3 import TD3 -from omnisafe.algorithms.off_policy.td3_lag import TD3Lag -from omnisafe.algorithms.off_policy.td3_pid import TD3Pid - - -__all__ = [ - 'DDPG', - 'DDPGLag', - 'SAC', - 'SACLag', - 'SDDPG', - 'TD3', - 'TD3Lag', - 'CVPO', - 'DDPGPid', - 'TD3Pid', - 'SACPid', - 'OffCRPO', -] diff --git a/omnisafe/algorithms/off_policy/cvpo.py b/omnisafe/algorithms/off_policy/cvpo.py deleted file mode 100644 index 446ab588f..000000000 --- a/omnisafe/algorithms/off_policy/cvpo.py +++ /dev/null @@ -1,189 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Implementation of the CVPO algorithm.""" - -import numpy as np -import torch -from scipy.optimize import minimize -from torch.distributions import MultivariateNormal -from torch.nn.utils import clip_grad_norm_ - -from omnisafe.algorithms import registry -from omnisafe.algorithms.off_policy.ddpg import DDPG -from omnisafe.utils.algo_utils import gaussian_kl -from omnisafe.utils.tools import to_ndarray - - -@registry.register -# pylint: disable-next=too-many-instance-attributes,too-many-locals -class CVPO(DDPG): - """Constrained Variational Policy Optimization for Safe Reinforcement Learning. - - References: - - - Title: Constrained Variational Policy Optimization for Safe Reinforcement Learning. - - Authors: Zuxin Liu, Zhepeng Cen, Vladislav Isenbaev, - Wei Liu, Zhiwei Steven Wu, Bo Li, Ding Zhao - - URL: https://arxiv.org/abs/2201.11927v2 - """ - - def __init__( - self, - env_id: str, - cfgs, - ) -> None: - """Constrained Variational Policy Optimization. - - Args: - env_id (str): Environment ID. - cfgs (dict): Configuration dictionary. - """ - - super().__init__( - env_id=env_id, - cfgs=cfgs, - ) - self.eta = 0.1 - self.lam = 0.1 - self.alpha_mean = 0.0 - self.alpha_var = 0.0 - self.cost_limit = self.cfgs.cost_limit - - def _specific_init_logs(self): - super()._specific_init_logs() - self.logger.register_key('Loss/Loss_l') - self.logger.register_key('Misc/mean_sigma_det') - self.logger.register_key('Misc/max_kl_sigma') - self.logger.register_key('Misc/max_kl_mu') - self.logger.register_key('Misc/eta') - - # pylint: disable-next=too-many-locals - def update_policy_net(self, obs) -> None: - """Update policy network. 
- - Args: - obs (torch.Tensor): observation. - """ - num_action = self.cfgs.sample_action_num - num_obs = obs.shape[0] - act_dim = self.actor_critic.act_dim - obs_dim = self.actor_critic.obs_shape[0] - - with torch.no_grad(): - # sample N actions per state - b_mean, _, b_var = self.ac_targ.actor.predict( - obs, deterministic=True, need_log_prob=True - ) - b_dist = MultivariateNormal(b_mean, scale_tril=b_var) - sampled_actions = b_dist.sample((num_action,)) - - expanded_states = obs[None, ...].expand(num_action, -1, -1) - target_q = self.ac_targ.critic( - expanded_states.reshape(-1, obs_dim), sampled_actions.reshape(-1, act_dim) - )[0] - target_q = target_q.reshape(num_action, num_obs) - target_q_np = to_ndarray(target_q).T - target_qc = self.ac_targ.cost_critic( - expanded_states.reshape(-1, obs_dim), sampled_actions.reshape(-1, act_dim) - )[0] - target_qc = target_qc.reshape(num_action, num_obs) - target_qc_np = to_ndarray(target_qc).T - - def dual(val): - """Dual function of the non-parametric variational.""" - beta, lam = val - target_q_np_comb = target_q_np - lam * target_qc_np - max_q = np.max(target_q_np_comb, 1) - return ( - beta * self.cfgs.dual_constraint - + lam * self.cost_limit - + np.mean(max_q) - + beta - * np.mean( - np.log(np.mean(np.exp((target_q_np_comb - max_q[:, None]) / beta), axis=1)) - ) - ) - - bounds = [(1e-6, 1e5), (1e-6, 1e5)] - options = {'ftol': 1e-3, 'maxiter': 10} - res = minimize( - dual, - np.array([self.eta, self.lam]), - method='SLSQP', - bounds=bounds, - tol=1e-3, - options=options, - ) - self.eta, self.lam = res.x - - raw_loss = torch.softmax((target_q - self.lam * target_qc) / self.eta, dim=0) - - # M-Step of Policy Improvement - for _ in range(self.cfgs.mstep_iteration_num): - mean, _, var = self.actor_critic.actor.predict( - obs, deterministic=True, need_log_prob=True - ) - - actor = MultivariateNormal(loc=mean, scale_tril=b_var) - actor_ = MultivariateNormal(loc=b_mean, scale_tril=var) - loss_p = torch.mean( - raw_loss - * ( - actor.expand((num_action, num_obs)).log_prob(sampled_actions) - + actor_.expand((num_action, num_obs)).log_prob(sampled_actions) - ) - ) - - kl_mu, kl_sigma, _, sigma_det = gaussian_kl( - mean_p=b_mean, mean_q=mean, var_p=b_var, var_q=var - ) - - if np.isnan(kl_mu.item()): - raise RuntimeError('kl_mu is nan') - if np.isnan(kl_sigma.item()): - raise RuntimeError('kl_sigma is nan') - - # update lagrange multipliers by gradient descent - self.alpha_mean -= ( - self.cfgs.alpha_mean_scale * (self.cfgs.kl_mean_constraint - kl_mu).detach().item() - ) - self.alpha_var -= ( - self.cfgs.alpha_var_scale * (self.cfgs.kl_var_constraint - kl_sigma).detach().item() - ) - - self.alpha_mean = np.clip(self.alpha_mean, 0.0, self.cfgs.alpha_mean_max) - self.alpha_var = np.clip(self.alpha_var, 0.0, self.cfgs.alpha_var_max) - self.actor_optimizer.zero_grad() - loss_l = -( - loss_p - + self.alpha_mean * (self.cfgs.kl_mean_constraint - kl_mu) - + self.alpha_var * (self.cfgs.kl_var_constraint - kl_sigma) - ) - loss_l.backward() - clip_grad_norm_(self.actor_critic.actor.parameters(), 0.01) - self.actor_optimizer.step() - self.logger.store( - **{ - 'Loss/Loss_pi': loss_p.mean().item(), - 'Loss/Loss_l': loss_l.mean().item(), - 'Misc/mean_sigma_det': sigma_det.item(), - 'Misc/max_kl_sigma': kl_sigma.item(), - 'Misc/max_kl_mu': kl_mu.item(), - 'Misc/eta': self.eta, - } - ) - - def algorithm_specific_logs(self): - """Log the CVPO specific information.""" diff --git a/omnisafe/algorithms/off_policy/ddpg_lag.py b/omnisafe/algorithms/off_policy/ddpg_lag.py 
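The CVPO M-step above bounds the mean and covariance changes of the Gaussian policy separately, through the ``kl_mu`` and ``kl_sigma`` terms returned by ``gaussian_kl`` (imported from ``omnisafe.utils.algo_utils``; its body is not shown in this diff). As a rough illustration only, here is one common way to decouple the two KL terms using ``torch.distributions``; the function name and the exact decomposition are assumptions, not omnisafe's implementation.

# Hedged sketch: decoupled Gaussian KL terms in the spirit of MPO-style M-steps.
# Each term is an exact KL with one factor (mean or covariance) held at the old value.
import torch
from torch.distributions import MultivariateNormal, kl_divergence


def decoupled_gaussian_kl(mean_old, scale_tril_old, mean_new, scale_tril_new):
    """Return (kl_mean, kl_cov), each averaged over the batch."""
    # Mean term: move the mean while keeping the old covariance.
    kl_mean = kl_divergence(
        MultivariateNormal(mean_old, scale_tril=scale_tril_old),
        MultivariateNormal(mean_new, scale_tril=scale_tril_old),
    )
    # Covariance term: change the covariance while keeping the old mean.
    kl_cov = kl_divergence(
        MultivariateNormal(mean_old, scale_tril=scale_tril_old),
        MultivariateNormal(mean_old, scale_tril=scale_tril_new),
    )
    return kl_mean.mean(), kl_cov.mean()


if __name__ == '__main__':
    batch, dim = 8, 3
    mean_old, mean_new = torch.zeros(batch, dim), 0.1 * torch.randn(batch, dim)
    scale_old = torch.eye(dim).expand(batch, dim, dim)
    scale_new = 1.1 * torch.eye(dim).expand(batch, dim, dim)
    print(decoupled_gaussian_kl(mean_old, scale_old, mean_new, scale_new))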
deleted file mode 100644 index aee66a7cc..000000000 --- a/omnisafe/algorithms/off_policy/ddpg_lag.py +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Implementation of the Lagrange version of the DDPG algorithm.""" - -from typing import Dict, NamedTuple, Tuple - -import torch -import torch.nn.functional as F - -from omnisafe.algorithms import registry -from omnisafe.algorithms.off_policy.ddpg import DDPG -from omnisafe.common.lagrange import Lagrange - - -@registry.register -# pylint: disable-next=too-many-instance-attributes -class DDPGLag(DDPG, Lagrange): - """The Lagrange version of the DDPG Algorithm. - - References: - - Title: Continuous control with deep reinforcement learning - - Authors: Timothy P. Lillicrap, Jonathan J. Hunt, Alexander Pritzel, Nicolas Heess, Tom Erez, - Yuval Tassa, David Silver, Daan Wierstra. - - URL: `DDPG `_ - """ - - def __init__(self, env_id: str, cfgs: NamedTuple) -> None: - """Initialize DDPG.""" - DDPG.__init__( - self, - env_id=env_id, - cfgs=cfgs, - ) - Lagrange.__init__(self, **self.cfgs.lagrange_cfgs) - - def _specific_init_logs(self): - super()._specific_init_logs() - self.logger.register_key('Metrics/LagrangeMultiplier') - self.logger.register_key('Loss/Loss_pi_c') - self.logger.register_key('Misc/CostLimit') - - def algorithm_specific_logs(self) -> None: - """Log the DDPG Lag specific information. - - .. list-table:: - - * - Things to log - - Description - * - Metrics/LagrangeMultiplier - - The Lagrange multiplier value in current epoch. - * - Loss/Loss_pi_c - - The loss of the critic network. - * - Misc/CostLimit - - The cost limit. - """ - super().algorithm_specific_logs() - self.logger.store( - **{ - 'Metrics/LagrangeMultiplier': self.lagrangian_multiplier.item(), - 'Misc/CostLimit': self.cost_limit, - } - ) - - def compute_loss_pi(self, obs: torch.Tensor) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: - r"""Computing ``pi/actor`` loss. - - In the lagrange version of DDPG, the loss is defined as: - - .. math:: - L=\mathbb{E}_{s \sim \mathcal{D}} [ Q(s, \pi(s))- \lambda C(s, \pi(s))] - - where :math:`\lambda` is the lagrange multiplier. - - Args: - obs (:class:`torch.Tensor`): ``observation`` saved in data. 
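The penalty weight ``lambda`` in the formula above comes from the ``Lagrange`` mixin (``omnisafe.common.lagrange``, imported at the top of this file but not shown in this diff), which supplies ``lagrangian_multiplier``, ``lambda_range_projection`` and ``update_lagrange_multiplier``. As a rough, self-contained illustration of the mechanism only, a projected-gradient multiplier update; the class name, initial value and learning rate below are assumptions, not the actual implementation.

# Hedged sketch: a Lagrange multiplier kept as a learnable scalar and pushed up
# whenever the observed cost exceeds the cost limit.
import torch


class SimpleLagrange:
    def __init__(self, cost_limit: float, lambda_init: float = 0.001, lambda_lr: float = 0.035):
        self.cost_limit = cost_limit
        self.lagrangian_multiplier = torch.nn.Parameter(torch.tensor(lambda_init))
        self.lambda_optimizer = torch.optim.Adam([self.lagrangian_multiplier], lr=lambda_lr)

    def lambda_range_projection(self) -> torch.Tensor:
        # Keep the multiplier non-negative when it is used as a penalty weight.
        return torch.nn.functional.relu(self.lagrangian_multiplier)

    def update_lagrange_multiplier(self, mean_ep_cost: float) -> None:
        # Gradient ascent on lambda * (Jc - d): lambda grows while the constraint is
        # violated and shrinks back toward zero once it is satisfied.
        self.lambda_optimizer.zero_grad()
        lambda_loss = -self.lagrangian_multiplier * (mean_ep_cost - self.cost_limit)
        lambda_loss.backward()
        self.lambda_optimizer.step()


lag = SimpleLagrange(cost_limit=25.0)
lag.update_lagrange_multiplier(mean_ep_cost=40.0)  # constraint violated, multiplier grows
print(float(lag.lambda_range_projection()))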
- """ - _, action = self.actor_critic.actor.predict(obs, deterministic=False, need_log_prob=False) - loss_pi = self.actor_critic.critic(obs, action)[0] - loss_pi_c = self.actor_critic.cost_critic(obs, action)[0] - loss_pi_c = F.relu(loss_pi_c - self.cost_limit) - self.update_lagrange_multiplier(loss_pi_c.mean().item()) - penalty = self.lambda_range_projection(self.lagrangian_multiplier).item() - loss_pi -= penalty * loss_pi_c - loss_pi /= 1 + penalty - pi_info = {} - self.logger.store( - **{ - 'Loss/Loss_pi_c': loss_pi_c.mean().item(), - } - ) - return -loss_pi.mean(), pi_info diff --git a/omnisafe/algorithms/on_policy/__init__.py b/omnisafe/algorithms/on_policy/__init__.py index 7b8582d73..b155319bd 100644 --- a/omnisafe/algorithms/on_policy/__init__.py +++ b/omnisafe/algorithms/on_policy/__init__.py @@ -14,7 +14,7 @@ # ============================================================================== """On-policy algorithms.""" -from omnisafe.algorithms.on_policy import ( +from omnisafe.algorithms.on_policy import ( # simmer, base, early_terminated, first_order, @@ -23,7 +23,6 @@ pid_lagrange, saute, second_order, - simmer, ) from omnisafe.algorithms.on_policy.base import PPO, TRPO, NaturalPG, PolicyGradient from omnisafe.algorithms.on_policy.early_terminated import PPOEarlyTerminated, PPOLagEarlyTerminated @@ -33,12 +32,14 @@ from omnisafe.algorithms.on_policy.pid_lagrange import CPPOPid, TRPOPid from omnisafe.algorithms.on_policy.saute import PPOLagSaute, PPOSaute from omnisafe.algorithms.on_policy.second_order import CPO, PCPO -from omnisafe.algorithms.on_policy.simmer import ( - PPOLagSimmerPid, - PPOLagSimmerQ, - PPOSimmerPid, - PPOSimmerQ, -) + + +# from omnisafe.algorithms.on_policy.simmer import ( +# PPOLagSimmerPid, +# PPOLagSimmerQ, +# PPOSimmerPid, +# PPOSimmerQ, +# ) __all__ = [ @@ -50,5 +51,5 @@ *pid_lagrange.__all__, *saute.__all__, *second_order.__all__, - *simmer.__all__, + # *simmer.__all__, ] diff --git a/omnisafe/algorithms/on_policy/base/natural_pg.py b/omnisafe/algorithms/on_policy/base/natural_pg.py index 6865d7579..be36d8723 100644 --- a/omnisafe/algorithms/on_policy/base/natural_pg.py +++ b/omnisafe/algorithms/on_policy/base/natural_pg.py @@ -14,15 +14,14 @@ # ============================================================================== """Implementation of the Natural Policy Gradient algorithm.""" -from typing import NamedTuple, Tuple - import torch from omnisafe.algorithms import registry from omnisafe.algorithms.on_policy.base.policy_gradient import PolicyGradient -from omnisafe.utils import distributed_utils +from omnisafe.utils import distributed +from omnisafe.utils.config import Config +from omnisafe.utils.math import conjugate_gradients from omnisafe.utils.tools import ( - conjugate_gradients, get_flat_gradients_from, get_flat_params_from, set_param_values_to_model, @@ -43,62 +42,21 @@ class NaturalPG(PolicyGradient): - URL: `Natural PG `_ """ - def __init__(self, env_id: str, cfgs: NamedTuple) -> None: - """Initialize Natural Policy Gradient. + def __init__(self, env_id: str, cfgs: Config) -> None: + super().__init__(env_id, cfgs) - Args: - env_id (str): The environment id. - cfgs (NamedTuple): The configuration of the algorithm. 
- """ - super().__init__(env_id=env_id, cfgs=cfgs) - self.cg_damping = cfgs.cg_damping - self.cg_iters = cfgs.cg_iters - self.target_kl = cfgs.target_kl - self.fvp_obs = cfgs.fvp_obs - - def _specific_init_logs(self): - super()._specific_init_logs() - self.logger.register_key('Misc/AcceptanceStep') - self.logger.register_key('Misc/Alpha') - self.logger.register_key('Misc/FinalStepNorm') - self.logger.register_key('Misc/gradient_norm') - self.logger.register_key('Misc/xHx') - self.logger.register_key('Misc/H_inv_g') - - def search_step_size(self, step_dir: torch.Tensor) -> Tuple[torch.Tensor, int]: - """NPG use full step_size, so we just return 1. + self._fvp_obs: torch.Tensor - Args: - step_dir (torch.Tensor): The step direction. - """ - accept_step = 1 - return step_dir, accept_step - - def algorithm_specific_logs(self) -> None: - r"""Log the Natural Policy Gradient specific information. - - .. list-table:: - - * - Things to log - - Description - * - ``Misc/AcceptanceStep`` - - The acceptance step size. - * - ``Misc/Alpha`` - - :math:`\frac{\delta_{KL}}{xHx}` in original paper. - where :math:`x` is the step direction, :math:`H` is the Hessian matrix, - and :math:`\delta_{KL}` is the target KL divergence. - * - ``Misc/FinalStepNorm`` - - The final step norm. - * - ``Misc/gradient_norm`` - - The gradient norm. - * - ``Misc/xHx`` - - :math:`xHx` in original paper. - * - ``Misc/H_inv_g`` - - :math:`H^{-1}g` in original paper. + def _init_log(self) -> None: + super()._init_log() - """ + self._logger.register_key('Misc/Alpha') + self._logger.register_key('Misc/FinalStepNorm') + self._logger.register_key('Misc/gradient_norm') + self._logger.register_key('Misc/xHx') + self._logger.register_key('Misc/H_inv_g') - def Fvp(self, params: torch.Tensor) -> torch.Tensor: + def _fvp(self, params: torch.Tensor) -> torch.Tensor: """Build the `Hessian-vector product `_ based on an approximation of the KL-divergence. The Hessian-vector product is approximated by the Fisher information matrix, @@ -108,98 +66,64 @@ def Fvp(self, params: torch.Tensor) -> torch.Tensor: Args: params (torch.Tensor): The parameters of the actor network. 
""" - self.actor_critic.actor.zero_grad() - q_dist = self.actor_critic.actor(self.fvp_obs) + self._actor_critic.actor.zero_grad() + q_dist = self._actor_critic.actor(self._fvp_obs) with torch.no_grad(): - p_dist = self.actor_critic.actor(self.fvp_obs) + p_dist = self._actor_critic.actor(self._fvp_obs) kl = torch.distributions.kl.kl_divergence(p_dist, q_dist).mean() - grads = torch.autograd.grad(kl, self.actor_critic.actor.parameters(), create_graph=True) + grads = torch.autograd.grad(kl, self._actor_critic.actor.parameters(), create_graph=True) # type: ignore flat_grad_kl = torch.cat([grad.view(-1) for grad in grads]) kl_p = (flat_grad_kl * params).sum() - grads = torch.autograd.grad(kl_p, self.actor_critic.actor.parameters(), retain_graph=False) - # contiguous indicating, if the memory is contiguously stored or not + grads = torch.autograd.grad(kl_p, self._actor_critic.actor.parameters(), retain_graph=False) # type: ignore + flat_grad_grad_kl = torch.cat([grad.contiguous().view(-1) for grad in grads]) - distributed_utils.mpi_avg_torch_tensor(flat_grad_grad_kl) - return flat_grad_grad_kl + params * self.cg_damping + distributed.avg_tensor(flat_grad_grad_kl) + return flat_grad_grad_kl + params * self._cfgs.cg_damping - # pylint: disable-next=too-many-locals,too-many-arguments - def update_policy_net( + def _update_actor( # pylint: disable=too-many-arguments, too-many-locals self, obs: torch.Tensor, act: torch.Tensor, - log_p: torch.Tensor, - adv: torch.Tensor, - cost_adv: torch.Tensor, + logp: torch.Tensor, + adv_r: torch.Tensor, + adv_c: torch.Tensor, ) -> None: - """Update policy network. + self._fvp_obs = obs[::4] + theta_old = get_flat_params_from(self._actor_critic.actor) + self._actor_critic.actor.zero_grad() + adv = self._compute_adv_surrogate(adv_r, adv_c) + loss, info = self._loss_pi(obs, act, logp, adv) - Natural Policy Gradient (NPG) update policy network using the conjugate gradient algorithm, - following the steps: + loss.backward() + distributed.avg_grads(self._actor_critic.actor) - - Calculate the gradient of the policy network, - - Use the conjugate gradient algorithm to calculate the step direction. - - Use the line search algorithm to find the step size. - - Args: - obs (torch.Tensor): The observation tensor. - act (torch.Tensor): The action tensor. - log_p (torch.Tensor): The log probability of the action. - adv (torch.Tensor): The advantage tensor. - cost_adv (torch.Tensor): The cost advantage tensor. 
- """ - # get loss and info values before update - self.fvp_obs = obs[::4] - theta_old = get_flat_params_from(self.actor_critic.actor) - self.actor_critic.actor.zero_grad() - processed_adv = self.compute_surrogate(adv=adv, cost_adv=cost_adv) - loss_pi, pi_info = self.compute_loss_pi( - obs=obs, - act=act, - log_p=log_p, - adv=processed_adv, - ) - # train policy with multiple steps of gradient descent - loss_pi.backward() - # average grads across MPI processes - distributed_utils.mpi_avg_grads(self.actor_critic.actor) - g_flat = get_flat_gradients_from(self.actor_critic.actor) - g_flat *= -1 - - # pylint: disable-next=invalid-name - x = conjugate_gradients(self.Fvp, g_flat, self.cg_iters) + grad = -get_flat_gradients_from(self._actor_critic.actor) + x = conjugate_gradients(self._fvp, grad, self._cfgs.cg_iters) assert torch.isfinite(x).all(), 'x is not finite' - # note that xHx = g^T x, but calculating xHx is faster than g^T x - xHx = torch.dot(x, self.Fvp(x)) # equivalent to : g^T x - assert xHx.item() >= 0, 'No negative values' - - # perform descent direction - alpha = torch.sqrt(2 * self.target_kl / (xHx + 1e-8)) - step_direction = alpha * x + xHx = torch.dot(x, self._fvp(x)) + assert xHx.item() >= 0, 'xHx is negative' + alpha = torch.sqrt(2 * self._cfgs.target_kl / (xHx + 1e-8)) + step_direction = x * alpha assert torch.isfinite(step_direction).all(), 'step_direction is not finite' - # determine step direction and apply SGD step after grads where set - # TRPO uses custom backtracking line search - final_step_dir, accept_step = self.search_step_size(step_dir=step_direction) - - # update actor network parameters - new_theta = theta_old + final_step_dir - set_param_values_to_model(self.actor_critic.actor, new_theta) + theta_new = theta_old + step_direction + set_param_values_to_model(self._actor_critic.actor, theta_new) with torch.no_grad(): - loss_pi, pi_info = self.compute_loss_pi(obs=obs, act=act, log_p=log_p, adv=adv) - self.loss_record.append(loss_pi=loss_pi.mean().item()) + loss, info = self._loss_pi(obs, act, logp, adv) - self.logger.store( + self._logger.store( **{ - 'Train/Entropy': pi_info['ent'], - 'Train/PolicyRatio': pi_info['ratio'], - 'Misc/AcceptanceStep': accept_step, + 'Train/Entropy': info['entrophy'], + 'Train/PolicyRatio': info['ratio'], + 'Train/PolicyStd': info['std'], + 'Loss/Loss_pi': loss.mean().item(), 'Misc/Alpha': alpha.item(), - 'Misc/FinalStepNorm': torch.norm(final_step_dir).mean().item(), + 'Misc/FinalStepNorm': torch.norm(step_direction).mean().item(), 'Misc/xHx': xHx.item(), - 'Misc/gradient_norm': torch.norm(g_flat).mean().item(), + 'Misc/gradient_norm': torch.norm(grad).mean().item(), 'Misc/H_inv_g': x.norm().item(), } ) diff --git a/omnisafe/algorithms/on_policy/base/policy_gradient.py b/omnisafe/algorithms/on_policy/base/policy_gradient.py index 0ab7925cb..345508ac8 100644 --- a/omnisafe/algorithms/on_policy/base/policy_gradient.py +++ b/omnisafe/algorithms/on_policy/base/policy_gradient.py @@ -15,26 +15,24 @@ """Implementation of the Policy Gradient algorithm.""" import time -from copy import deepcopy from typing import Dict, Tuple import torch import torch.nn as nn +from torch.utils.data import DataLoader, TensorDataset +from omnisafe.adapter import OnPolicyAdapter from omnisafe.algorithms import registry +from omnisafe.algorithms.base_algo import BaseAlgo from omnisafe.common.buffer import VectorOnPolicyBuffer from omnisafe.common.logger import Logger -from omnisafe.common.record_queue import RecordQueue -from omnisafe.models.constraint_actor_critic 
import ConstraintActorCritic -from omnisafe.utils import core, distributed_utils -from omnisafe.utils.config import Config -from omnisafe.utils.tools import get_flat_params_from -from omnisafe.wrappers import wrapper_registry +from omnisafe.models.actor_critic.constraint_actor_critic import ConstraintActorCritic +from omnisafe.utils import distributed @registry.register -# pylint: disable-next=too-many-instance-attributes -class PolicyGradient: +# pylint: disable-next=too-many-instance-attributes, too-few-public-methods +class PolicyGradient(BaseAlgo): """The Policy Gradient algorithm. References: @@ -44,475 +42,153 @@ class PolicyGradient: /1999/file/64d828b85b0bed98e80ade0a5c43b0f-Paper.pdf>`_ """ - def __init__(self, env_id: str, cfgs: Config) -> None: - """Initialize PolicyGradient. - - Args: - env_id (str): The environment id. - cfgs (NamedTuple): The configuration of the algorithm. - """ - self.algo = self.__class__.__name__ - self.cfgs = deepcopy(cfgs) - self.wrapper_type = self.cfgs.wrapper_type - self.device = ( - f'cuda:{self.cfgs.device_id}' - if torch.cuda.is_available() and self.cfgs.device == 'cuda' - else 'cpu' + def _init_env(self) -> None: + self._env = OnPolicyAdapter(self._env_id, self._cfgs.num_envs, self._seed, self._cfgs) + assert self._cfgs.steps_per_epoch % (distributed.world_size() * self._cfgs.num_envs) == 0, ( + 'The number of steps per epoch is not divisible by the number of ' 'environments.' + ) + self._steps_per_epoch = ( + self._cfgs.steps_per_epoch // distributed.world_size() // self._cfgs.num_envs ) - added_cfgs = self._get_added_cfgs() - self.cfgs.env_cfgs.recurisve_update(added_cfgs) - env_cfgs = self.cfgs.env_cfgs - self.env = wrapper_registry.get(self.wrapper_type)(env_id, cfgs=env_cfgs) + def _init_model(self) -> None: + self._actor_critic = ConstraintActorCritic( + obs_space=self._env.observation_space, + act_space=self._env.action_space, + model_cfgs=self._cfgs.model_cfgs, + epochs=self._cfgs.epochs, + ).to(self._device) + + if distributed.world_size() > 1: + distributed.sync_params(self._actor_critic) + + if self._cfgs.exploration_noise_anneal: + self._actor_critic.set_annealing( + epochs=[0, self._cfgs.epochs], + std=self._cfgs.std, + ) - assert self.cfgs.steps_per_epoch % distributed_utils.num_procs() == 0, ( - f'Number of processes ({distributed_utils.num_procs()})' - f'is not a divisor of the number of steps per epoch {self.cfgs.steps_per_epoch}.' - ) - self.steps_per_epoch = self.cfgs.steps_per_epoch - self.local_steps_per_epoch = ( - cfgs.steps_per_epoch // cfgs.env_cfgs.num_envs // distributed_utils.num_procs() - ) + 1 - - # ensure local each local process can experience at least one complete episode - assert self.env.rollout_data.max_ep_len <= self.local_steps_per_epoch, ( - f'Reduce number of cores ({distributed_utils.num_procs()})' - f'or reduce the number of parallel envrionments {self.env.cfgs.num_envs}' - f'or increase batch size {self.cfgs.steps_per_epoch}.' 
+ def _init(self) -> None: + self._buf = VectorOnPolicyBuffer( + obs_space=self._env.observation_space, + act_space=self._env.action_space, + size=self._steps_per_epoch, + gamma=self._cfgs.buffer_cfgs.gamma, + lam=self._cfgs.buffer_cfgs.lam, + lam_c=self._cfgs.buffer_cfgs.lam_c, + advantage_estimator=self._cfgs.buffer_cfgs.adv_estimation_method, + standardized_adv_r=self._cfgs.buffer_cfgs.standardized_rew_adv, + standardized_adv_c=self._cfgs.buffer_cfgs.standardized_cost_adv, + penalty_coefficient=self._cfgs.penalty_param, + num_envs=self._cfgs.num_envs, + device=self._device, ) - # setup actor-critic module - self.actor_critic = ConstraintActorCritic( - observation_space=self.env.observation_space, - action_space=self.env.action_space, - model_cfgs=cfgs.model_cfgs, - ).to(self.device) - self.set_mpi() - - # set up logger and save configuration to disk - self.logger = Logger( - output_dir=cfgs.data_dir, - exp_name=cfgs.exp_name, - seed=cfgs.seed, - use_tensorboard=cfgs.use_tensorboard, - use_wandb=cfgs.use_wandb, - config=cfgs, - models=[self.actor_critic], + def _init_log(self) -> None: + self._logger = Logger( + output_dir=self._cfgs.data_dir, + exp_name=self._cfgs.exp_name, + seed=self._cfgs.seed, + use_tensorboard=self._cfgs.use_tensorboard, + use_wandb=self._cfgs.use_wandb, + config=self._cfgs, ) - # set up experience buffer - self.buf = VectorOnPolicyBuffer( - obs_space=self.env.observation_space, - act_space=self.env.action_space, - size=self.local_steps_per_epoch, - gamma=cfgs.buffer_cfgs.gamma, - lam=cfgs.buffer_cfgs.lam, - lam_c=cfgs.buffer_cfgs.lam_c, - advantage_estimator=cfgs.buffer_cfgs.adv_estimation_method, - standardized_adv_r=cfgs.buffer_cfgs.standardized_rew_adv, - standardized_adv_c=cfgs.buffer_cfgs.standardized_cost_adv, - penalty_coefficient=cfgs.penalty_param, - num_envs=cfgs.env_cfgs.num_envs, - device=self.device, - ) - # set up optimizer for policy and value function - self.actor_optimizer = core.set_optimizer( - 'Adam', module=self.actor_critic.actor, learning_rate=cfgs.actor_lr - ) - self.reward_critic_optimizer = core.set_optimizer( - 'Adam', module=self.actor_critic.reward_critic, learning_rate=cfgs.critic_lr - ) - if cfgs.use_cost: - self.cost_critic_optimizer = core.set_optimizer( - 'Adam', module=self.actor_critic.cost_critic, learning_rate=cfgs.critic_lr - ) - # set up scheduler for policy learning rate decay - self.scheduler = self.set_learning_rate_scheduler() - # set up model saving what_to_save = { - 'pi': self.actor_critic.actor, - 'obs_normalizer': self.env.obs_normalizer, - } - self.logger.setup_torch_saver(what_to_save=what_to_save) - self.logger.torch_save() - # set up statistics - self.start_time = time.time() - self.logger.log('Start with training.') - self.epoch_time = None - self.penalty_param = None - self.critic_loss_fn = nn.MSELoss() - self.loss_record = RecordQueue('loss_pi', 'loss_v', 'loss_c', maxlen=100) - - self._init_log() - - def _init_log(self): - self.logger.register_key('Train/Epoch') - self.logger.register_key('Metrics/EpRet') - self.logger.register_key('Metrics/EpCost') - self.logger.register_key('Metrics/EpLen') - - # log information about actor - self.logger.register_key('Loss/Loss_pi') - self.logger.register_key('Loss/Delta_loss_pi') - self.logger.register_key('Values/Adv') - - # log information about critic - self.logger.register_key('Loss/Loss_reward_critic') - self.logger.register_key('Loss/Delta_loss_reward_critic') - self.logger.register_key('Values/V') - - if self.cfgs.use_cost: - # log information about cost critic - 
self.logger.register_key('Loss/Loss_cost_critic') - self.logger.register_key('Loss/Delta_loss_cost_critic') - self.logger.register_key('Values/C') - - self.logger.register_key('Train/Entropy') - self.logger.register_key('Train/KL') - self.logger.register_key('Train/StopIter') - self.logger.register_key('Train/PolicyRatio') - self.logger.register_key('Train/LR') - - if self.cfgs.env_cfgs.normalized_rew: - self.logger.register_key('Misc/RewScaleMean') - self.logger.register_key('Misc/RewScaleStddev') - - if self.cfgs.exploration_noise_anneal: - self.logger.register_key('Misc/ExplorationNoiseStd') - - if self.cfgs.model_cfgs.actor_type == 'gaussian_learning': - self.logger.register_key('Misc/ExplorationNoiseStd') - - self._specific_init_logs() - - # some sub-classes may add information to logs - self.logger.register_key('TotalEnvSteps') - self.logger.register_key('Time') - self.logger.register_key('FPS') - - def _specific_init_logs(self): - pass - - def _get_added_cfgs(self) -> dict: - """Get additional configurations. - - Returns: - dict: The additional configurations. - """ - added_configs = { - 'device': f'cuda:{self.cfgs.device_id}' - if torch.cuda.is_available() and self.cfgs.device == 'cuda' - else 'cpu', - 'seed': self.cfgs.seed, + 'pi': self._actor_critic.actor, } - return added_configs - - def set_learning_rate_scheduler(self) -> torch.optim.lr_scheduler.LambdaLR: - """Set up learning rate scheduler. - - If use linear learning rate decay, - the learning rate will be annealed linearly. - """ - scheduler = None - if self.cfgs.linear_lr_decay: - # linear anneal - def linear_anneal(epoch): - return 1 - epoch / self.cfgs.epochs - - scheduler = torch.optim.lr_scheduler.LambdaLR( - optimizer=self.actor_optimizer, lr_lambda=linear_anneal - ) - return scheduler - - def set_mpi(self) -> None: - """Initialize MPI specifics. - - Sync parameters of actor and critic across cores, - only once necessary.""" - if distributed_utils.num_procs() > 1: - # avoid slowdowns from PyTorch + MPI combo - distributed_utils.setup_torch_for_mpi() - start = time.time() - self.logger.log('INFO: Sync actor critic parameters') - # sync parameters across cores: only once necessary, grads are averaged! - distributed_utils.sync_params(self.actor_critic) - self.logger.log(f'Done! (took {time.time()-start:0.3f} sec.)') + self._logger.setup_torch_saver(what_to_save) + self._logger.torch_save() - def algorithm_specific_logs(self) -> None: - """Use this method to collect log information. + self._logger.register_key('Metrics/EpRet', window_length=50) + self._logger.register_key('Metrics/EpCost', window_length=50) + self._logger.register_key('Metrics/EpLen', window_length=50) - e.g. log lagrangian for lagrangian-base algorithms, + self._logger.register_key('Train/Epoch') + self._logger.register_key('Train/Entropy') + self._logger.register_key('Train/KL') + self._logger.register_key('Train/StopIter') + self._logger.register_key('Train/PolicyRatio') + self._logger.register_key('Train/LR') + if self._cfgs.model_cfgs.actor_type == 'gaussian_learning': + self._logger.register_key('Train/PolicyStd') - .. 
code-block:: python + self._logger.register_key('TotalEnvSteps') - self.logger.log_tabular('Metrics/LagrangeMultiplier', self.lagrangian_multiplier.item()) - """ - - def check_distributed_parameters(self) -> None: - """Check if parameters are synchronized across all processes.""" - if distributed_utils.num_procs() > 1: - self.logger.log('Check if distributed parameters are synchronous..') - modules = { - 'Policy': self.actor_critic.actor, - 'Value': self.actor_critic.reward_critic, - } - for key, module in modules.items(): - flat_params = get_flat_params_from(module) - global_min = distributed_utils.mpi_min(torch.sum(flat_params)) - global_max = distributed_utils.mpi_max(torch.sum(flat_params)) - assert torch.allclose(global_min, global_max), f'{key} not synced.' - - def compute_surrogate( - self, - adv: torch.Tensor, - cost_adv: torch.Tensor, - ) -> torch.Tensor: - """Compute surrogate loss. - - Policy Gradient only use reward advantage. - - Args: - adv (torch.Tensor): reward advantage - cost_adv (torch.Tensor): cost advantage - """ - return adv - 0.0 * cost_adv - - # pylint: disable-next=too-many-arguments - def compute_loss_pi( - self, - obs: torch.Tensor, - act: torch.Tensor, - log_p: torch.Tensor, - adv: torch.Tensor, - ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: - r"""Computing pi/actor loss. - - In Policy Gradient, the loss is defined as: - - .. math:: - - L = -\mathbb{E}_{s_t \sim \rho_\theta} \left[ - \sum_{t=0}^T \left( \frac{\pi_\theta ^{'}(a_t|s_t)}{\pi_\theta(a_t|s_t)} \right) - \left( \sum_{t'=t}^T \gamma^{t'-t} r_{t'} \right) - \right] - - where :math:`\rho_\theta` is the policy distribution, :math:`\pi_\theta` is the parameters of policy network, - :math:`a_t` is the action at time step :math:`t`, :math:`s_t` is the observation at time step :math:`t`, - :math:`\gamma` is the discount factor, :math:`r_{t'}` is the reward at time step :math:`t'`. - - Args: - obs (torch.Tensor): ``observation`` stored in buffer. - act (torch.Tensor): ``action`` stored in buffer. - log_p (torch.Tensor): ``log probability`` of action stored in buffer. - adv (torch.Tensor): ``advantage`` stored in buffer. - """ - # policy loss - dist, _log_p = self.actor_critic.actor(obs, act) - ratio = torch.exp(_log_p - log_p) - - loss_pi = -(ratio * adv).mean() - # useful extra info - approx_kl = (0.5 * (dist.mean - act) ** 2 / dist.stddev**2).mean().item() + # log information about actor + self._logger.register_key('Loss/Loss_pi', delta=True) + self._logger.register_key('Value/Adv') - # compute policy's entropy - ent = dist.entropy().mean().item() + # log information about critic + self._logger.register_key('Loss/Loss_reward_critic', delta=True) + self._logger.register_key('Value/reward') - pi_info = {'kl': approx_kl, 'ent': ent, 'ratio': ratio.mean().item()} + if self._cfgs.use_cost: + # log information about cost critic + self._logger.register_key('Loss/Loss_cost_critic', delta=True) + self._logger.register_key('Value/cost') - return loss_pi, pi_info + self._logger.register_key('Time/Total') + self._logger.register_key('Time/Rollout') + self._logger.register_key('Time/Update') + self._logger.register_key('Time/Epoch') + self._logger.register_key('Time/FPS') - def learn(self) -> ConstraintActorCritic: + def learn(self) -> None: """This is main function for algorithm update, divided into the following steps: - :meth:`rollout`: collect interactive data from environment. - :meth:`update`: perform actor/critic updates. - :meth:`log`: epoch/update information for visualization and terminal log print. 
""" - # main loop: collect experience in env and update/log each epoch - for epoch in range(self.cfgs.epochs): - self.epoch_time = time.time() - # update internals of AC - if self.cfgs.exploration_noise_anneal: - self.actor_critic.anneal_exploration(frac=epoch / self.cfgs.epochs) - # collect data from environment - self.env.set_rollout_cfgs( - local_steps_per_epoch=self.local_steps_per_epoch, - use_cost=self.cfgs.use_cost, - ) - self.env.on_policy_roll_out( - self.actor_critic, - self.buf, - self.logger, - ) - # update: actor, critic, running statistics - self.update() - # log and store information - self.log(epoch) - # check if all models own the same parameter values - if epoch % self.cfgs.check_freq == 0: - self.check_distributed_parameters() - # save model to disk - if (epoch + 1) % self.cfgs.save_freq == 0: - self.logger.torch_save() - - # close opened files to avoid number of open files overflow - self.logger.close() - return self.actor_critic - - def log(self, epoch: int) -> None: - """Log info about epoch. - - .. list-table:: - - * - Things to log - - Description - * - Train/Epoch - - Current epoch. - * - Metrics/EpCost - - Average cost of the epoch. - * - Metrics/EpCost - - Average cost of the epoch. - * - Metrics/EpRet - - Average return of the epoch. - * - Metrics/EpLen - - Average length of the epoch. - * - Values/V - - Average value in :meth:`roll_out()` (from critic network) of the epoch. - * - Values/C - - Average cost in :meth:`roll_out()` (from critic network) of the epoch. - * - Values/Adv - - Average advantage in :meth:`roll_out()` of the epoch. - * - Loss/Loss_pi - - Loss of the policy network. - * - Loss/Delta_loss_pi - - Delta loss of the policy network. - * - Loss/Loss_reward_critic - - Loss of the value network. - * - Loss/Delta_loss_reward_critic - - Delta loss of the value network. - * - Loss/Loss_cost_critic - - Loss of the cost network. - * - Loss/Delta_loss_cost_critic - - Delta loss of the cost network. - * - Train/Entropy - - Entropy of the policy network. - * - Train/KL - - KL divergence of the policy network. - * - Train/StopIters - - Number of iterations of the policy network. - * - Train/PolicyRatio - - Ratio of the policy network. - * - Train/LR - - Learning rate of the policy network. - * - Misc/Seed - - Seed of the experiment. - * - Misc/RewScaleMean - - Mean of the reward scale. - * - Misc/RewScaleStddev - - Std of the reward scale. - * - Misc/ExplorationNoisestd - - Std of the exploration noise. - * - Misc/TotalEnvSteps - - Total steps of the experiment. - * - Time - - Total time. - * - FPS - - Frames per second of the epoch. - - Args: - epoch (int): current epoch. 
- """ - total_env_steps = (epoch + 1) * self.cfgs.steps_per_epoch - fps = self.cfgs.steps_per_epoch / (time.time() - self.epoch_time) - # step the actor learning rate scheduler if provided - if self.scheduler and self.cfgs.linear_lr_decay: - current_lr = self.scheduler.get_last_lr()[0] - self.scheduler.step() - else: - current_lr = self.cfgs.actor_lr - - self.logger.store( - **{ - 'Train/Epoch': epoch + 1, - 'Train/LR': current_lr, - 'TotalEnvSteps': total_env_steps, - 'Time': (time.time() - self.start_time), - 'FPS': fps, - } - ) + start_time = time.time() + self._logger.log('INFO: Start training') - if self.cfgs.env_cfgs.normalized_rew: - reward_norm_mean = self.env.rew_normalizer.mean.mean().item() - reward_norm_stddev = self.env.rew_normalizer.std.mean().item() - self.logger.store( - **{ - 'Misc/RewScaleMean': reward_norm_mean, - 'Misc/RewScaleStddev': reward_norm_stddev, - } - ) + for epoch in range(self._cfgs.epochs): + epoch_time = time.time() - if self.cfgs.exploration_noise_anneal: - noise_std = self.actor_critic.actor.std - self.logger.store( - **{ - 'Misc/ExplorationNoiseStd': noise_std, - } - ) + # if self._cfgs.exploration_noise_anneal: + # self._actor_critic.anneal_exploration(frac=epoch / self._cfgs.epochs) - if self.cfgs.model_cfgs.actor_type == 'gaussian_learning': - self.logger.store( - **{ - 'Misc/ExplorationNoiseStd': self.actor_critic.actor.std, - } + roll_out_time = time.time() + self._env.roll_out( + steps_per_epoch=self._steps_per_epoch, + agent=self._actor_critic, + buffer=self._buf, + logger=self._logger, ) + self._logger.store(**{'Time/Rollout': time.time() - roll_out_time}) - self.algorithm_specific_logs() - self.logger.dump_tabular() - - # pylint: disable-next=too-many-locals - def update(self) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]]: - r"""Update actor, critic, running statistics, following next steps: - - - Get the ``raw data`` and ``processed data`` from buffer - - .. note:: + update_time = time.time() + self._update() + self._logger.store(**{'Time/Update': time.time() - update_time}) - ``raw data`` is the data from environment, while ``processed data`` is the data after pre-processing. + self._actor_critic.actor_scheduler.step() + if self._cfgs.exploration_noise_anneal: + self._actor_critic.annealing(epoch) - .. list-table:: - - * - obs - - ``observaion`` stored in buffer. - * - act - - ``action`` stored in buffer. - * - target_v - - ``target value`` stored in buffer. - * - target_c - - ``target cost`` stored in buffer. - * - log_p - - ``log probability`` stored in buffer. - * - adv - - ``estimated advantage`` (e.g. **GAE**) stored in buffer. - * - cost_adv - - ``estimated cost advantage`` (e.g. **GAE**) stored in buffer. + self._logger.store( + **{ + 'TotalEnvSteps': (epoch + 1) * self._cfgs.steps_per_epoch, + 'Time/FPS': self._cfgs.steps_per_epoch / (time.time() - epoch_time), + 'Time/Total': (time.time() - start_time), + 'Time/Epoch': (time.time() - epoch_time), + 'Train/Epoch': epoch, + 'Train/LR': self._actor_critic.actor_scheduler.get_last_lr()[0], + } + ) - - Update value net by :meth:`update_value_net()`. - - Update cost net by :meth:`update_cost_net()`. - - Update policy net by :meth:`update_policy_net()`. + self._logger.dump_tabular() - The cost and value critic network will be updated ``critic_iters`` times (always 40), - while the policy network will be updated ``actor_iters`` times (always 80). 
- The basic process of each update is as follows: + # save model to disk + if (epoch + 1) % self._cfgs.save_freq == 0: + self._logger.torch_save() - #. Get the mini-batch data from buffer. - #. Get the loss of network. - #. Update the network by loss. - #. Repeat steps 2, 3 until the number of mini-batch data is used up. + self._logger.close() - """ - # get the data from buffer - data = self.buf.get() - obs, act, log_p, target_v, target_c, adv, cost_adv = ( + def _update(self) -> None: + data = self._buf.get() + obs, act, logp, target_value_r, target_value_c, adv_r, adv_c = ( data['obs'], data['act'], data['logp'], @@ -521,220 +197,135 @@ def update(self) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]]: data['adv_r'], data['adv_c'], ) - # get the loss before - loss_pi_before, loss_v_before = self.loss_record.get_mean('loss_pi', 'loss_v') - if self.cfgs.use_cost: - loss_c_before = self.loss_record.get_mean('loss_c') - self.loss_record.reset('loss_pi', 'loss_v', 'loss_c') - # compute the old distribution of policy net. - old_dist = self.actor_critic.actor(obs) - - # load the data into the data loader. - dataset = torch.utils.data.TensorDataset(obs, act, target_v, target_c, log_p, adv, cost_adv) - loader = torch.utils.data.DataLoader( - dataset, batch_size=self.cfgs.num_mini_batches, shuffle=True + + original_obs = obs + old_distribution = self._actor_critic.actor(obs) + + dataloader = DataLoader( + dataset=TensorDataset(obs, act, logp, target_value_r, target_value_c, adv_r, adv_c), + batch_size=self._cfgs.num_mini_batches, + shuffle=True, ) - # update the value net, cost net and policy net for several times. - for i in range(self.cfgs.actor_iters): - for _, (obs_b, act_b, target_v_b, target_c_b, log_p_b, adv_b, cost_adv_b) in enumerate( - loader - ): - # update the value net. - self.update_value_net(obs_b, target_v_b) - # update the cost net, if use cost. - if self.cfgs.use_cost: - self.update_cost_net(obs_b, target_c_b) - # update the policy net. - self.update_policy_net(obs_b, act_b, log_p_b, adv_b, cost_adv_b) - # compute the new distribution of policy net. - new_dist = self.actor_critic.actor(obs) - # compute the KL divergence between old and new distribution. - torch_kl = ( - torch.distributions.kl.kl_divergence(old_dist, new_dist) + for i in range(self._cfgs.actor_iters): + for ( + obs, + act, + logp, + target_value_r, + target_value_c, + adv_r, + adv_c, + ) in dataloader: + self._update_rewrad_critic(obs, target_value_r) + if self._cfgs.use_cost: + self._update_cost_critic(obs, target_value_c) + self._update_actor(obs, act, logp, adv_r, adv_c) + + new_distribution = self._actor_critic.actor(original_obs) + + kl = ( + torch.distributions.kl.kl_divergence(old_distribution, new_distribution) .sum(-1, keepdim=True) .mean() .item() ) - torch_kl = distributed_utils.mpi_avg(torch_kl) - # if the KL divergence is larger than the target KL divergence, stop the update. - if self.cfgs.kl_early_stopping and torch_kl > self.cfgs.target_kl: - self.logger.log(f'KL early stop at the {i+1} th step.') + kl = distributed.dist_avg(kl) + + if self._cfgs.kl_early_stopping and kl > self._cfgs.target_kl: + self._logger.log(f'Early stopping at iter {i} due to reaching max kl') break - # log the information. 
- loss_pi, loss_v = self.loss_record.get_mean('loss_pi', 'loss_v') - self.logger.store( - **{ - 'Loss/Loss_pi': loss_pi, - 'Loss/Delta_loss_pi': loss_pi - loss_pi_before, - 'Train/StopIter': i + 1, - 'Values/Adv': adv.mean().item(), - 'Train/KL': torch_kl, - 'Loss/Delta_loss_reward_critic': loss_v - loss_v_before, - 'Loss/Loss_reward_critic': loss_v, - } - ) - if self.cfgs.use_cost: - loss_c = self.loss_record.get_mean('loss_c') - self.logger.store( - **{ - 'Loss/Delta_loss_cost_critic': loss_c - loss_c_before, - 'Loss/Loss_cost_critic': loss_c, - } - ) - return data - # pylint: disable-next=too-many-locals,too-many-arguments - def update_policy_net( - self, - obs: torch.Tensor, - act: torch.Tensor, - log_p: torch.Tensor, - adv: torch.Tensor, - cost_adv: torch.Tensor, - ) -> None: - r"""Update policy network under a double for loop. - - The pseudo code is shown below: - - .. code-block:: python - - for _ in range(self.cfgs.actor_iters): - for _ in range(self.cfgs.num_mini_batches): - # Get mini-batch data - # Compute loss - # Update network - - .. warning:: - For some ``KL divergence`` based algorithms (e.g. TRPO, CPO, etc.), - the ``KL divergence`` between the old policy and the new policy is calculated. - And the ``KL divergence`` is used to determine whether the update is successful. - If the ``KL divergence`` is too large, the update will be terminated. - - Args: - obs (torch.Tensor): ``observation`` stored in buffer. - act (torch.Tensor): ``action`` stored in buffer. - log_p (torch.Tensor): ``log_p`` stored in buffer. - adv (torch.Tensor): ``advantage`` stored in buffer. - cost_adv (torch.Tensor): ``cost_advantage`` stored in buffer. - """ - # process the advantage function. - processed_adv = self.compute_surrogate(adv=adv, cost_adv=cost_adv) - # compute the loss of policy net. - loss_pi, pi_info = self.compute_loss_pi(obs=obs, act=act, log_p=log_p, adv=processed_adv) - # log the loss of policy net. - self.loss_record.append(loss_pi=loss_pi.mean().item()) - # update the policy net. - self.actor_optimizer.zero_grad() - # backward the loss of policy net. - loss_pi.backward() - # clip the gradient of policy net. - if self.cfgs.use_max_grad_norm: - torch.nn.utils.clip_grad_norm_( - self.actor_critic.actor.parameters(), self.cfgs.max_grad_norm - ) - # average the gradient of policy net. - distributed_utils.mpi_avg_grads(self.actor_critic.actor) - self.actor_optimizer.step() - self.logger.store( + self._logger.store( **{ - 'Train/Entropy': pi_info['ent'], - 'Train/PolicyRatio': pi_info['ratio'], + 'Train/StopIter': i + 1, + 'Value/Adv': adv_r.mean().item(), + 'Train/KL': kl, } ) - def update_value_net( - self, - obs: torch.Tensor, - target_v: torch.Tensor, - ) -> None: - r"""Update value network under a double for loop. - - The loss function is ``MSE loss``, which is defined in ``torch.nn.MSELoss``. - Specifically, the loss function is defined as: - - .. math:: - L = \frac{1}{N} \sum_{i=1}^N (\hat{V} - V)^2 + def _update_rewrad_critic(self, obs: torch.Tensor, target_value_r: torch.Tensor) -> None: + self._actor_critic.reward_critic_optimizer.zero_grad() + loss = nn.functional.mse_loss(self._actor_critic.reward_critic(obs)[0], target_value_r) - where :math:`\hat{V}` is the predicted cost and :math:`V` is the target cost. - The pseudo code is shown below: + if self._cfgs.use_critic_norm: + for param in self._actor_critic.reward_critic.parameters(): + loss += param.pow(2).sum() * self._cfgs.critic_norm_coeff - .. 
code-block:: python + loss.backward() - for _ in range(self.cfgs.actor_iters): - for _ in range(self.cfgs.num_mini_batches): - # Get mini-batch data - # Compute loss - # Update network - - Args: - obs (torch.Tensor): ``observation`` stored in buffer. - target_v (torch.Tensor): ``target_v`` stored in buffer. - """ - self.reward_critic_optimizer.zero_grad() - # compute the loss of value net. - loss_v = self.critic_loss_fn( - self.actor_critic.reward_critic(obs), - target_v, - ) - # add the norm of critic network parameters to the loss function. - if self.cfgs.use_critic_norm: - for param in self.actor_critic.reward_critic.parameters(): - loss_v += param.pow(2).sum() * self.cfgs.critic_norm_coeff - # log the loss of value net. - self.loss_record.append(loss_v=loss_v.mean().item()) - # backward - loss_v.backward() - # clip the gradient - if self.cfgs.use_max_grad_norm: + if self._cfgs.use_max_grad_norm: torch.nn.utils.clip_grad_norm_( - self.actor_critic.reward_critic.parameters(), self.cfgs.max_grad_norm + self._actor_critic.reward_critic.parameters(), self._cfgs.max_grad_norm ) - distributed_utils.mpi_avg_grads(self.actor_critic.reward_critic) - self.reward_critic_optimizer.step() + distributed.avg_grads(self._actor_critic.reward_critic) + self._actor_critic.reward_critic_optimizer.step() - def update_cost_net(self, obs: torch.Tensor, target_c: torch.Tensor) -> None: - r"""Update cost network under a double for loop. + self._logger.store(**{'Loss/Loss_reward_critic': loss.mean().item()}) - The loss function is ``MSE loss``, which is defined in ``torch.nn.MSELoss``. - Specifically, the loss function is defined as: + def _update_cost_critic(self, obs: torch.Tensor, target_value_c: torch.Tensor) -> None: + self._actor_critic.cost_critic_optimizer.zero_grad() + loss = nn.functional.mse_loss(self._actor_critic.cost_critic(obs)[0], target_value_c) - .. math:: - L = \frac{1}{N} \sum_{i=1}^N (\hat{C} - C)^2 + if self._cfgs.use_critic_norm: + for param in self._actor_critic.cost_critic.parameters(): + loss += param.pow(2).sum() * self._cfgs.critic_norm_coeff - where :math:`\hat{C}` is the predicted cost and :math:`C` is the target cost. - The pseudo code is shown below: + loss.backward() - .. code-block:: python + if self._cfgs.use_max_grad_norm: + torch.nn.utils.clip_grad_norm_( + self._actor_critic.cost_critic.parameters(), self._cfgs.max_grad_norm + ) + distributed.avg_grads(self._actor_critic.cost_critic) + self._actor_critic.cost_critic_optimizer.step() - for _ in range(self.cfgs.actor_iters): - for _ in range(self.cfgs.num_mini_batches): - # Get mini-batch data - # Compute loss - # Update network + self._logger.store(**{'Loss/Loss_cost_critic': loss.mean().item()}) - Args: - obs (torch.Tensor): ``observation`` stored in buffer. - target_c (torch.Tensor): ``target_c`` stored in buffer. - """ - self.cost_critic_optimizer.zero_grad() - # compute the loss of cost net. - loss_c = self.critic_loss_fn( - self.actor_critic.cost_critic(obs), - target_c, - ) - # add the norm of critic network parameters to the loss function. - if self.cfgs.use_critic_norm: - for param in self.actor_critic.cost_critic.parameters(): - loss_c += param.pow(2).sum() * self.cfgs.critic_norm_coeff - # log the loss. - self.loss_record.append(loss_c=loss_c.mean().item()) - # backward. - loss_c.backward() - # clip the gradient. 
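Both critic updates in the hunk above follow the same recipe: MSE loss against the bootstrapped target, an optional L2 penalty on the critic weights (`use_critic_norm`), gradient clipping, gradient averaging across processes, then an optimizer step. A single-process sketch with a toy critic; the network sizes and coefficients are placeholders:

.. code-block:: python

    import torch
    from torch import nn

    critic = nn.Sequential(nn.Linear(8, 64), nn.Tanh(), nn.Linear(64, 1))
    optimizer = torch.optim.Adam(critic.parameters(), lr=3e-4)

    obs = torch.randn(32, 8)
    target_value = torch.randn(32, 1)

    optimizer.zero_grad()
    loss = nn.functional.mse_loss(critic(obs), target_value)

    # optional L2 regularization on the critic parameters
    critic_norm_coeff = 0.001
    for param in critic.parameters():
        loss = loss + param.pow(2).sum() * critic_norm_coeff

    loss.backward()
    torch.nn.utils.clip_grad_norm_(critic.parameters(), max_norm=40.0)
    optimizer.step()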
- if self.cfgs.use_max_grad_norm: + def _update_actor( # pylint: disable=too-many-arguments + self, + obs: torch.Tensor, + act: torch.Tensor, + logp: torch.Tensor, + adv_r: torch.Tensor, + adv_c: torch.Tensor, + ) -> None: + adv = self._compute_adv_surrogate(adv_r, adv_c) + loss, info = self._loss_pi(obs, act, logp, adv) + self._actor_critic.actor_optimizer.zero_grad() + loss.backward() + if self._cfgs.use_max_grad_norm: torch.nn.utils.clip_grad_norm_( - self.actor_critic.cost_critic.parameters(), self.cfgs.max_grad_norm + self._actor_critic.actor.parameters(), self._cfgs.max_grad_norm ) - distributed_utils.mpi_avg_grads(self.actor_critic.cost_critic) - self.cost_critic_optimizer.step() + distributed.avg_grads(self._actor_critic.actor) + self._actor_critic.actor_optimizer.step() + self._logger.store( + **{ + 'Train/Entropy': info['entrophy'], + 'Train/PolicyRatio': info['ratio'], + 'Train/PolicyStd': info['std'], + 'Loss/Loss_pi': loss.mean().item(), + } + ) + + def _compute_adv_surrogate( # pylint: disable=unused-argument + self, adv_r: torch.Tensor, adv_c: torch.Tensor + ) -> torch.Tensor: + return adv_r + + def _loss_pi( + self, + obs: torch.Tensor, + act: torch.Tensor, + logp: torch.Tensor, + adv: torch.Tensor, + ) -> Tuple[torch.Tensor, Dict[str, float]]: + distribution = self._actor_critic.actor(obs) + logp_ = self._actor_critic.actor.log_prob(act) + std = self._actor_critic.actor.std + ratio = torch.exp(logp_ - logp) + loss = -(ratio * adv).mean() + entrophy = distribution.entropy().mean().item() + info = {'entrophy': entrophy, 'ratio': ratio.mean().item(), 'std': std} + return loss, info diff --git a/omnisafe/algorithms/on_policy/base/ppo.py b/omnisafe/algorithms/on_policy/base/ppo.py index 6343903a3..0cb3f6e10 100644 --- a/omnisafe/algorithms/on_policy/base/ppo.py +++ b/omnisafe/algorithms/on_policy/base/ppo.py @@ -14,7 +14,7 @@ # ============================================================================== """Implementation of the PPO algorithm.""" -from typing import NamedTuple, Tuple +from typing import Dict, Tuple import torch @@ -32,31 +32,9 @@ class PPO(PolicyGradient): - URL: `PPO `_ """ - def __init__(self, env_id: str, cfgs: NamedTuple) -> None: - """Initialize Proximal Policy Optimization. - - .. note:: - The ``clip`` parameter is the clip parameter in PPO, - which is used to clip the ratio of the new policy and the old policy. - The ``clip`` parameter is set to 0.2 in the original paper. - - Args: - env_id (str): The environment id. - cfgs (NamedTuple): The configuration of the algorithm. - """ - super().__init__( - env_id=env_id, - cfgs=cfgs, - ) - - # pylint: disable-next=too-many-arguments - def compute_loss_pi( - self, - obs: torch.Tensor, - act: torch.Tensor, - log_p: torch.Tensor, - adv: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: + def _loss_pi( + self, obs: torch.Tensor, act: torch.Tensor, logp: torch.Tensor, adv: torch.Tensor + ) -> Tuple[torch.Tensor, Dict[str, float]]: r"""Computing pi/actor loss. In Proximal Policy Optimization, the loss is defined as: @@ -75,15 +53,14 @@ def compute_loss_pi( adv (torch.Tensor): ``advantage`` stored in buffer. cost_adv (torch.Tensor): ``cost advantage`` stored in buffer. 
""" - dist, _log_p = self.actor_critic.actor(obs, act) - # importance ratio - ratio = torch.exp(_log_p - log_p) - ratio_clip = torch.clamp(ratio, 1 - self.cfgs.clip, 1 + self.cfgs.clip) - loss_pi = -(torch.min(ratio * adv, ratio_clip * adv)) - loss_pi -= self.cfgs.entropy_coef * dist.entropy().mean() + distribution = self._actor_critic.actor(obs) + logp_ = self._actor_critic.actor.log_prob(act) + std = self._actor_critic.actor.std + ratio = torch.exp(logp_ - logp) + ratio_cliped = torch.clamp(ratio, 1 - self._cfgs.clip, 1 + self._cfgs.clip) + loss = -torch.min(ratio * adv, ratio_cliped * adv).mean() + loss += self._cfgs.entropy_coef * distribution.entropy().mean() # useful extra info - approx_kl = (0.5 * (dist.mean - act) ** 2 / dist.stddev**2).mean().item() - ent = dist.entropy().mean().item() - pi_info = {'kl': approx_kl, 'ent': ent, 'ratio': ratio_clip.mean().item()} - - return loss_pi.mean(), pi_info + entrophy = distribution.entropy().mean().item() + info = {'entrophy': entrophy, 'ratio': ratio.mean().item(), 'std': std} + return loss, info diff --git a/omnisafe/algorithms/on_policy/base/trpo.py b/omnisafe/algorithms/on_policy/base/trpo.py index 11d8e4359..ebc19d2b2 100644 --- a/omnisafe/algorithms/on_policy/base/trpo.py +++ b/omnisafe/algorithms/on_policy/base/trpo.py @@ -14,15 +14,16 @@ # ============================================================================== """Implementation of the TRPO algorithm.""" -from typing import NamedTuple, Tuple +from typing import Tuple import torch +from torch.distributions import Distribution from omnisafe.algorithms import registry from omnisafe.algorithms.on_policy.base.natural_pg import NaturalPG -from omnisafe.utils import distributed_utils +from omnisafe.utils import distributed +from omnisafe.utils.math import conjugate_gradients from omnisafe.utils.tools import ( - conjugate_gradients, get_flat_gradients_from, get_flat_params_from, set_param_values_to_model, @@ -39,26 +40,21 @@ class TRPO(NaturalPG): - URL: `TRPO `_ """ - def __init__(self, env_id: str, cfgs: NamedTuple) -> None: - """Initialize Trust Region Policy Optimization. - - Args: - env_id (str): The environment id. - cfgs (NamedTuple): The configuration of the algorithm. 
- """ - super().__init__(env_id=env_id, cfgs=cfgs) + def _init_log(self) -> None: + super()._init_log() + self._logger.register_key('Misc/AcceptanceStep') # pylint: disable-next=too-many-arguments,too-many-locals,arguments-differ - def search_step_size( + def _search_step_size( self, - step_dir: torch.Tensor, - g_flat: torch.Tensor, - p_dist: torch.distributions.Distribution, + step_direction: torch.Tensor, + grad: torch.Tensor, + p_dist: Distribution, obs: torch.Tensor, act: torch.Tensor, - log_p: torch.Tensor, + logp: torch.Tensor, adv: torch.Tensor, - loss_pi_before: float, + loss_before: float, total_steps: int = 15, decay: float = 0.8, ) -> Tuple[torch.Tensor, int]: @@ -86,60 +82,58 @@ def search_step_size( # How far to go in a single update step_frac = 1.0 # Get old parameterized policy expression - _theta_old = get_flat_params_from(self.actor_critic.actor) + theta_old = get_flat_params_from(self._actor_critic.actor) # Change expected objective function gradient = expected_imrpove best this moment - expected_improve = g_flat.dot(step_dir) + expected_improve = grad.dot(step_direction) + expected_improve = torch.dot(grad, step_direction) # While not within_trust_region and not out of total_steps: - for j in range(total_steps): + for step in range(total_steps): # update theta params - new_theta = _theta_old + step_frac * step_dir + new_theta = theta_old + step_frac * step_direction # set new params as params of net - set_param_values_to_model(self.actor_critic.actor, new_theta) - # the stepNo this update accept - acceptance_step = j + 1 + set_param_values_to_model(self._actor_critic.actor, new_theta) with torch.no_grad(): - loss_pi, _ = self.compute_loss_pi(obs=obs, act=act, log_p=log_p, adv=adv) + loss, _ = self._loss_pi(obs, act, logp, adv) # compute KL distance between new and old policy - q_dist = self.actor_critic.actor(obs) + q_dist = self._actor_critic.actor(obs) # KL-distance of old p-dist and new q-dist, applied in KLEarlyStopping - torch_kl = torch.distributions.kl.kl_divergence(p_dist, q_dist).mean().item() + kl = torch.distributions.kl.kl_divergence(p_dist, q_dist).mean().item() + kl = distributed.dist_avg(kl) # real loss improve: old policy loss - new policy loss - loss_improve = loss_pi_before - loss_pi.item() + loss_improve = loss_before - loss.item() # average processes.... multi-processing style like: mpi_tools.mpi_avg(xxx) - torch_kl = distributed_utils.mpi_avg(torch_kl) - loss_improve = distributed_utils.mpi_avg(loss_improve) - menu = (expected_improve, loss_improve) - self.logger.log(f'Expected Improvement: {menu[0]} Actual: {menu[1]}') - if not torch.isfinite(loss_pi): - self.logger.log('WARNING: loss_pi not finite') + loss_improve = distributed.dist_avg(loss_improve) + self._logger.log(f'Expected Improvement: {expected_improve} Actual: {loss_improve}') + if not torch.isfinite(loss): + self._logger.log('WARNING: loss_pi not finite') elif loss_improve < 0: - self.logger.log('INFO: did not improve improve <0') - elif torch_kl > self.target_kl * 1.5: - self.logger.log('INFO: violated KL constraint.') + self._logger.log('INFO: did not improve improve <0') + elif kl > self._cfgs.target_kl * 1.5: + self._logger.log('INFO: violated KL constraint.') else: # step only if surrogate is improved and when within trust reg. 
- self.logger.log(f'Accept step at i={acceptance_step}') + acceptance_step = step + 1 + self._logger.log(f'Accept step at i={acceptance_step}') break step_frac *= decay else: - self.logger.log('INFO: no suitable step found...') - step_dir = torch.zeros_like(step_dir) + self._logger.log('INFO: no suitable step found...') + step_direction = torch.zeros_like(step_direction) acceptance_step = 0 - set_param_values_to_model(self.actor_critic.actor, _theta_old) + set_param_values_to_model(self._actor_critic.actor, theta_old) - return step_frac * step_dir, acceptance_step + return step_frac * step_direction, acceptance_step - # pylint: disable-next=too-many-locals,too-many-arguments - def update_policy_net( + def _update_actor( # pylint: disable=too-many-arguments,too-many-locals self, obs: torch.Tensor, act: torch.Tensor, - log_p: torch.Tensor, - adv: torch.Tensor, - cost_adv: torch.Tensor, + logp: torch.Tensor, + adv_r: torch.Tensor, + adv_c: torch.Tensor, ) -> None: """Update policy network. @@ -155,75 +149,58 @@ def update_policy_net( Args: obs (torch.Tensor): The observation tensor. act (torch.Tensor): The action tensor. - log_p (torch.Tensor): The log probability of the action. - adv (torch.Tensor): The advantage tensor. - cost_adv (torch.Tensor): The cost advantage tensor. + logp (torch.Tensor): The log probability of the action. + adv_r (torch.Tensor): The advantage tensor. + adv_c (torch.Tensor): The cost advantage tensor. """ - # get loss and info values before update - self.fvp_obs = obs[::4] - theta_old = get_flat_params_from(self.actor_critic.actor) - self.actor_critic.actor.zero_grad() - # process the advantage function. - processed_adv = self.compute_surrogate(adv=adv, cost_adv=cost_adv) - # compute the loss of policy net. - loss_pi, pi_info = self.compute_loss_pi(obs=obs, act=act, log_p=log_p, adv=processed_adv) - loss_pi_before = distributed_utils.mpi_avg(loss_pi.item()) - p_dist = self.actor_critic.actor(obs) - # train policy with multiple steps of gradient descent - loss_pi.backward() - # average grads across MPI processes - distributed_utils.mpi_avg_grads(self.actor_critic.actor) - g_flat = get_flat_gradients_from(self.actor_critic.actor) - g_flat *= -1 - - # pylint: disable-next=invalid-name - x = conjugate_gradients(self.Fvp, g_flat, self.cg_iters) + self._fvp_obs = obs[::4] + theta_old = get_flat_params_from(self._actor_critic.actor) + self._actor_critic.actor.zero_grad() + adv = self._compute_adv_surrogate(adv_r, adv_c) + loss, info = self._loss_pi(obs, act, logp, adv) + loss_before = distributed.dist_avg(loss).item() + p_dist = self._actor_critic.actor(obs) + + loss.backward() + distributed.avg_grads(self._actor_critic.actor) + + grad = -get_flat_gradients_from(self._actor_critic.actor) + x = conjugate_gradients(self._fvp, grad, self._cfgs.cg_iters) assert torch.isfinite(x).all(), 'x is not finite' - # note that xHx = g^T x, but calculating xHx is faster than g^T x - xHx = torch.dot(x, self.Fvp(x)) # equivalent to : g^T x - assert xHx.item() >= 0, 'No negative values' - - # perform descent direction - alpha = torch.sqrt(2 * self.target_kl / (xHx + 1e-8)) - step_direction = alpha * x + xHx = torch.dot(x, self._fvp(x)) + assert xHx.item() >= 0, 'xHx is negative' + alpha = torch.sqrt(2 * self._cfgs.target_kl / (xHx + 1e-8)) + step_direction = x * alpha assert torch.isfinite(step_direction).all(), 'step_direction is not finite' - # determine step direction and apply SGD step after grads where set - # TRPO uses custom backtracking line search - final_step_dir, accept_step = 
self.search_step_size( - step_dir=step_direction, - g_flat=g_flat, + step_direction, accept_step = self._search_step_size( + step_direction=step_direction, + grad=grad, p_dist=p_dist, - loss_pi_before=loss_pi_before, obs=obs, act=act, - log_p=log_p, + logp=logp, adv=adv, + loss_before=loss_before, ) - # update actor network parameters - new_theta = theta_old + final_step_dir - set_param_values_to_model(self.actor_critic.actor, new_theta) + theta_new = theta_old + step_direction + set_param_values_to_model(self._actor_critic.actor, theta_new) with torch.no_grad(): - q_dist = self.actor_critic.actor(obs) - kl = torch.distributions.kl.kl_divergence(p_dist, q_dist).mean().item() - loss_pi, pi_info = self.compute_loss_pi( - obs=obs, act=act, log_p=log_p, adv=processed_adv - ) - self.loss_record.append(loss_pi=loss_pi.mean().item()) - - self.logger.store( + loss, info = self._loss_pi(obs, act, logp, adv) + + self._logger.store( **{ - 'Values/Adv': adv.mean().item(), - 'Train/Entropy': pi_info['ent'], - 'Train/KL': kl, - 'Train/PolicyRatio': pi_info['ratio'], - 'Misc/AcceptanceStep': accept_step, + 'Train/Entropy': info['entrophy'], + 'Train/PolicyRatio': info['ratio'], + 'Train/PolicyStd': info['std'], + 'Loss/Loss_pi': loss.mean().item(), 'Misc/Alpha': alpha.item(), - 'Misc/FinalStepNorm': torch.norm(final_step_dir).mean().item(), + 'Misc/FinalStepNorm': torch.norm(step_direction).mean().item(), 'Misc/xHx': xHx.item(), - 'Misc/gradient_norm': torch.norm(g_flat).mean().item(), + 'Misc/gradient_norm': torch.norm(grad).mean().item(), 'Misc/H_inv_g': x.norm().item(), + 'Misc/AcceptanceStep': accept_step, } ) diff --git a/omnisafe/algorithms/on_policy/early_terminated/ppo_early_terminated.py b/omnisafe/algorithms/on_policy/early_terminated/ppo_early_terminated.py index c82f127c7..508773acf 100644 --- a/omnisafe/algorithms/on_policy/early_terminated/ppo_early_terminated.py +++ b/omnisafe/algorithms/on_policy/early_terminated/ppo_early_terminated.py @@ -14,8 +14,7 @@ # ============================================================================== """Implementation of the early terminated algorithm using PPO.""" -from typing import NamedTuple - +from omnisafe.adapter import EarlyTerminatedAdapter from omnisafe.algorithms import registry from omnisafe.algorithms.on_policy.base.ppo import PPO @@ -30,11 +29,8 @@ class PPOEarlyTerminated(PPO): URL: `Safe Exploration by Solving Early Terminated MDP `_ """ - def __init__(self, env_id: str, cfgs: NamedTuple) -> None: - """Initialize PPO_Earyly_Terminated. - - Args: - env_id (str): The environment id. - cfgs (NamedTuple): The configuration of the algorithm. 
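The TRPO hunk above sizes the update from the conjugate-gradient solution ``x ~= H^{-1} g``: the largest step whose quadratic KL model stays inside the trust region is ``sqrt(2 * target_kl / x^T H x) * x``. A sketch with a toy Fisher-vector product; the real ``_fvp`` differentiates the policy KL, here ``H = 2I`` just to make the code run:

.. code-block:: python

    import torch

    def fvp(v: torch.Tensor) -> torch.Tensor:
        """Toy Fisher-vector product: H = 2I, so Hv = 2v."""
        return 2.0 * v

    target_kl = 0.01
    x = torch.tensor([0.3, -0.1, 0.2])          # conjugate-gradient solution H^{-1} g

    xHx = torch.dot(x, fvp(x))                  # quadratic form x^T H x
    alpha = torch.sqrt(2 * target_kl / (xHx + 1e-8))
    step_direction = alpha * x                  # largest step inside the KL trust region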
- """ - super().__init__(env_id=env_id, cfgs=cfgs) + def _init_env(self) -> None: + self._env = EarlyTerminatedAdapter( + self._env_id, self._cfgs.num_envs, self._seed, self._cfgs + ) + self._steps_per_epoch = self._cfgs.steps_per_epoch diff --git a/omnisafe/algorithms/on_policy/early_terminated/ppo_lag_early_terminated.py b/omnisafe/algorithms/on_policy/early_terminated/ppo_lag_early_terminated.py index 54602dfa9..1b546b984 100644 --- a/omnisafe/algorithms/on_policy/early_terminated/ppo_lag_early_terminated.py +++ b/omnisafe/algorithms/on_policy/early_terminated/ppo_lag_early_terminated.py @@ -14,8 +14,8 @@ # ============================================================================== """Implementation of the Lagrange version of the early terminated algorithm using PPOLag.""" -from typing import NamedTuple +from omnisafe.adapter import EarlyTerminatedAdapter from omnisafe.algorithms import registry from omnisafe.algorithms.on_policy.naive_lagrange.ppo_lag import PPOLag @@ -30,11 +30,8 @@ class PPOLagEarlyTerminated(PPOLag): URL: `Safe Exploration by Solving Early Terminated MDP `_ """ - def __init__(self, env_id: str, cfgs: NamedTuple) -> None: - """Initialize PPO_Lag_Earyly_Terminated. - - Args: - env_id (str): The environment id. - cfgs (NamedTuple): The configuration of the algorithm. - """ - super().__init__(env_id=env_id, cfgs=cfgs) + def _init_env(self) -> None: + self._env = EarlyTerminatedAdapter( + self._env_id, self._cfgs.num_envs, self._seed, self._cfgs + ) + self._steps_per_epoch = self._cfgs.steps_per_epoch diff --git a/omnisafe/algorithms/on_policy/first_order/cup.py b/omnisafe/algorithms/on_policy/first_order/cup.py index 2c720ebe0..3f0969685 100644 --- a/omnisafe/algorithms/on_policy/first_order/cup.py +++ b/omnisafe/algorithms/on_policy/first_order/cup.py @@ -14,99 +14,49 @@ # ============================================================================== """Implementation of the CUP algorithm.""" -from typing import Dict, NamedTuple, Tuple - import torch +from torch.distributions import Normal +from torch.utils.data import DataLoader, TensorDataset from omnisafe.algorithms import registry from omnisafe.algorithms.on_policy.base.ppo import PPO from omnisafe.common.lagrange import Lagrange -from omnisafe.common.record_queue import RecordQueue -from omnisafe.utils import distributed_utils +from omnisafe.utils import distributed +from omnisafe.utils.config import Config @registry.register -class CUP(PPO, Lagrange): +class CUP(PPO): """The Constrained Update Projection (CUP) Approach to Safe Policy Optimization. References: - Title: Constrained Update Projection Approach to Safe Policy Optimization - Authors: Long Yang, Jiaming Ji, Juntao Dai, Linrui Zhang, Binbin Zhou, Pengfei Li, - Yaodong Yang, Gang Pan. + Yaodong Yang, Gang Pan. - URL: `CUP `_ """ - def __init__(self, env_id: str, cfgs: NamedTuple) -> None: - """Initialize CUP. - - CUP is a combination of :class:`PPO` and :class:`Lagrange` model. - - Args: - env_id (str): The environment id. - cfgs (NamedTuple): The configuration of the algorithm. 
- """ - PPO.__init__( - self, - env_id=env_id, - cfgs=cfgs, - ) - Lagrange.__init__( - self, - cost_limit=self.cfgs.lagrange_cfgs.cost_limit, - lagrangian_multiplier_init=self.cfgs.lagrange_cfgs.lagrangian_multiplier_init, - lambda_lr=self.cfgs.lagrange_cfgs.lambda_lr, - lambda_optimizer=self.cfgs.lagrange_cfgs.lambda_optimizer, - lagrangian_upper_bound=self.cfgs.lagrange_cfgs.lagrangian_upper_bound, - ) - self.lam = self.cfgs.lam - self.eta = self.cfgs.eta - self.max_ratio = 0 - self.min_ratio = 0 - self.p_dist = None - self.loss_record = RecordQueue('loss_pi', 'loss_v', 'loss_c', 'loss_pi_c', maxlen=100) - - def _specific_init_logs(self): - super()._specific_init_logs() - self.logger.register_key('Metrics/LagrangeMultiplier') - self.logger.register_key('Train/MaxRatio') - self.logger.register_key('Train/MinRatio') - self.logger.register_key('Loss/Loss_pi_c') - self.logger.register_key('Loss/Delta_loss_pi_c') - self.logger.register_key('Train/SecondStepStopIter') - self.logger.register_key('Train/SecondStepEntropy') - self.logger.register_key('Train/SecondStepPolicyRatio') - - def algorithm_specific_logs(self) -> None: - """Log the CUP specific information. - - .. list-table:: - - * - Things to log - - Description - * - Metrics/LagrangeMultiplier - - The Lagrange multiplier value in current epoch. - * - Train/MaxRatio - - The maximum ratio between the current policy and the old policy. - * - Train/MinRatio - - The minimum ratio between the current policy and the old policy. - """ - super().algorithm_specific_logs() - self.logger.store( - **{ - 'Metrics/LagrangeMultiplier': self.lagrangian_multiplier.item(), - 'Train/MaxRatio': self.max_ratio, - 'Train/MinRatio': self.min_ratio, - } - ) - - # pylint: disable-next=too-many-locals - def compute_loss_cost_performance( - self, - obs: torch.Tensor, - act: torch.Tensor, - log_p: torch.Tensor, - cost_adv: torch.Tensor, - ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: + def _init(self) -> None: + super()._init() + self._lagrange = Lagrange(**self._cfgs.lagrange_cfgs) + + def _init_log(self) -> None: + super()._init_log() + self._logger.register_key('Metrics/LagrangeMultiplier') + self._logger.register_key('Train/MaxRatio') + self._logger.register_key('Train/MinRatio') + self._logger.register_key('Loss/Loss_pi_c', delta=True) + self._logger.register_key('Train/SecondStepStopIter') + self._logger.register_key('Train/SecondStepEntropy') + self._logger.register_key('Train/SecondStepPolicyRatio') + + def __init__(self, env_id: str, cfgs: Config) -> None: + super().__init__(env_id, cfgs) + self._p_dist: Normal + self._max_ratio: float = 0.0 + self._min_ratio: float = 0.0 + + def _loss_pi_cost(self, obs, act, logp, adv_c): r"""Compute the performance of cost on this moment. Detailedly, we compute the KL divergence between the current policy and the old policy, @@ -134,107 +84,98 @@ def compute_loss_cost_performance( log_p (torch.Tensor): Log probability. cost_adv (torch.Tensor): Cost advantage. 
""" - dist, _log_p = self.actor_critic.actor(obs, act) - ratio = torch.exp(_log_p - log_p) + distribution = self._actor_critic.actor(obs) + logp_ = self._actor_critic.actor.log_prob(act) + std = self._actor_critic.actor.std + ratio = torch.exp(logp_ - logp) - kl_new_old = torch.distributions.kl.kl_divergence(dist, self.p_dist).sum(-1, keepdim=True) + kl = torch.distributions.kl_divergence(distribution, self._p_dist).sum(-1, keepdim=True) - coef = (1 - self.cfgs.buffer_cfgs.gamma * self.cfgs.buffer_cfgs.lam) / ( - 1 - self.cfgs.buffer_cfgs.gamma + coef = (1 - self._cfgs.buffer_cfgs.gamma * self._cfgs.buffer_cfgs.lam) / ( + 1 - self._cfgs.buffer_cfgs.gamma ) - cost_loss = (self.lagrangian_multiplier * coef * ratio * cost_adv + kl_new_old).mean() - self.loss_record.append(loss_pi_c=cost_loss.item()) + loss = (self._lagrange.lagrangian_multiplier * coef * ratio * adv_c + kl).mean() # useful extra info temp_max = torch.max(ratio).detach().mean().item() temp_min = torch.min(ratio).detach().mean().item() - if temp_max > self.max_ratio: - self.max_ratio = temp_max - if temp_min < self.min_ratio: - self.min_ratio = temp_min - approx_kl = 0.5 * (log_p - _log_p).mean().item() - ent = dist.entropy().mean().item() - pi_info = {'kl': approx_kl, 'ent': ent, 'ratio': ratio.mean().item()} - - return cost_loss, pi_info - - # pylint: disable-next=too-many-locals - def update(self) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]]: + if temp_max > self._max_ratio: + self._max_ratio = temp_max + if temp_min < self._min_ratio: + self._min_ratio = temp_min + entrophy = distribution.entropy().mean().item() + info = {'entrophy': entrophy, 'ratio': ratio.mean().item(), 'std': std} + + self._logger.store(**{'Loss/Loss_pi_c': loss.item()}) + + return loss, info + + def _update(self) -> None: """Update actor, critic, running statistics as we used in the :class:`PolicyGradient`. In addition, we also update the Lagrange multiplier parameter, by calling the :meth:`update_lagrange_multiplier` function. """ # note that logger already uses MPI statistics across all processes.. - Jc = self.logger.get_stats('Metrics/EpCost')[0] + Jc = self._logger.get_stats('Metrics/EpCost')[0] # first update Lagrange multiplier parameter - self.update_lagrange_multiplier(Jc) - # the first stage is to maximize reward. - data = PPO.update(self) - # the second stage is to minimize cost. - # get the loss before - loss_pi_c_before = self.loss_record.get_mean('loss_pi_c') - self.loss_record.reset('loss_pi_c') - obs, act, log_p, cost_adv = ( + self._lagrange.update_lagrange_multiplier(Jc) + + super()._update() + + data = self._buf.get() + obs, act, logp, adv_c = ( data['obs'], data['act'], data['logp'], data['adv_c'], ) + original_obs = obs with torch.no_grad(): - old_dist = self.actor_critic.actor(obs) - old_mean, old_std = old_dist.mean, old_dist.stddev - # load the data into the data loader. 
- dataset = torch.utils.data.TensorDataset(obs, act, log_p, cost_adv, old_mean, old_std) - loader = torch.utils.data.DataLoader( - dataset, batch_size=self.cfgs.num_mini_batches, shuffle=True + old_distribution = self._actor_critic.actor(obs) + old_mean = old_distribution.mean + old_std = old_distribution.stddev + + dataloader = DataLoader( + dataset=TensorDataset(obs, act, logp, adv_c, old_mean, old_std), + batch_size=self._cfgs.num_mini_batches, + shuffle=True, ) - # update the policy net several times - for i in range(self.cfgs.actor_iters): - for _, (obs_b, act_b, log_p_b, cost_adv_b, old_mean_b, old_std_b) in enumerate(loader): - # compute the old distribution of policy net. - self.p_dist = torch.distributions.Normal(old_mean_b, old_std_b) - # compute the loss of cost performance. - loss_pi_c, pi_info_c = self.compute_loss_cost_performance( - obs_b, act_b, log_p_b, cost_adv_b - ) - # update the policy net. - self.actor_optimizer.zero_grad() - # backward - loss_pi_c.backward() - # clip the gradient of policy net. - if self.cfgs.use_max_grad_norm: + for i in range(self._cfgs.actor_iters): + for obs, act, logp, adv_c, old_mean, old_std in dataloader: + self._p_dist = Normal(old_mean, old_std) + loss_cost, info = self._loss_pi_cost(obs, act, logp, adv_c) + self._actor_critic.actor_optimizer.zero_grad() + loss_cost.backward() + if self._cfgs.max_grad_norm is not None: torch.nn.utils.clip_grad_norm_( - self.actor_critic.actor.parameters(), self.cfgs.max_grad_norm + self._actor_critic.actor.parameters(), self._cfgs.max_grad_norm ) - # average the gradient of policy net. - distributed_utils.mpi_avg_grads(self.actor_critic.actor) - self.actor_optimizer.step() - # compute the new distribution of policy net. - new_dist = self.actor_critic.actor(obs) - # compute the KL divergence between old and new distribution. - torch_kl = ( - torch.distributions.kl.kl_divergence(old_dist, new_dist) + distributed.avg_grads(self._actor_critic.actor) + self._actor_critic.actor_optimizer.step() + + new_distribution = self._actor_critic.actor(original_obs) + + kl = ( + torch.distributions.kl.kl_divergence(old_distribution, new_distribution) .sum(-1, keepdim=True) .mean() .item() ) - torch_kl = distributed_utils.mpi_avg(torch_kl) - # if the KL divergence is larger than the target KL divergence, stop the update. - if self.cfgs.kl_early_stopping and torch_kl > self.cfgs.target_kl: - self.logger.log(f'KL early stop at the {i+1} th step in the second stage.') + kl = distributed.dist_avg(kl) + + if self._cfgs.kl_early_stopping and kl > self._cfgs.target_kl: + self._logger.log(f'Early stopping at iter {i} due to reaching max kl') break - loss_pi_c = self.loss_record.get_mean('loss_pi_c') - # log the information. 
- self.logger.store( + self._logger.store( **{ - 'Loss/Loss_pi_c': loss_pi_c, - 'Loss/Delta_loss_pi_c': loss_pi_c - loss_pi_c_before, + 'Metrics/LagrangeMultiplier': self._lagrange.lagrangian_multiplier.item(), + 'Train/MaxRatio': self._max_ratio, + 'Train/MinRatio': self._min_ratio, 'Train/SecondStepStopIter': i + 1, - 'Train/SecondStepEntropy': pi_info_c['ent'], - 'Train/SecondStepPolicyRatio': pi_info_c['ratio'], + 'Train/SecondStepEntropy': info['entrophy'], + 'Train/SecondStepPolicyRatio': info['ratio'], } ) - return data diff --git a/omnisafe/algorithms/on_policy/first_order/focops.py b/omnisafe/algorithms/on_policy/first_order/focops.py index 0f3202c83..0856f04a7 100644 --- a/omnisafe/algorithms/on_policy/first_order/focops.py +++ b/omnisafe/algorithms/on_policy/first_order/focops.py @@ -14,17 +14,21 @@ # ============================================================================== """Implementation of the FOCOPS algorithm.""" -from typing import Dict, NamedTuple, Tuple +from typing import Dict, Tuple import torch +from torch.distributions import Normal +from torch.utils.data import DataLoader, TensorDataset from omnisafe.algorithms import registry from omnisafe.algorithms.on_policy.base.policy_gradient import PolicyGradient from omnisafe.common.lagrange import Lagrange +from omnisafe.utils import distributed +from omnisafe.utils.config import Config @registry.register -class FOCOPS(PolicyGradient, Lagrange): +class FOCOPS(PolicyGradient): """The First Order Constrained Optimization in Policy Space (FOCOPS) algorithm. References: @@ -33,126 +37,55 @@ class FOCOPS(PolicyGradient, Lagrange): - URL: `FOCOPS `_ """ - def __init__(self, env_id: str, cfgs: NamedTuple) -> None: - """Initialize FOCOPS. - - FOCOPS is a combination of :class:`PolicyGradient` and :class:`Lagrange` model. - - Args: - env_id (str): The environment id. - cfgs (NamedTuple): The configuration of the algorithm. - """ - PolicyGradient.__init__( - self, - env_id=env_id, - cfgs=cfgs, + def _init(self) -> None: + super()._init() + self._lagrange = Lagrange(**self._cfgs.lagrange_cfgs) + + def _init_log(self) -> None: + super()._init_log() + self._logger.register_key('Metrics/LagrangeMultiplier') + + def __init__(self, env_id: str, cfgs: Config) -> None: + super().__init__(env_id, cfgs) + self._p_dist: Normal + + def _loss_pi( + self, obs: torch.Tensor, act: torch.Tensor, logp: torch.Tensor, adv: torch.Tensor + ) -> Tuple[torch.Tensor, Dict[str, float]]: + distribution = self._actor_critic.actor(obs) + logp_ = self._actor_critic.actor.log_prob(act) + std = self._actor_critic.actor.std + ratio = torch.exp(logp_ - logp) + + kl = torch.distributions.kl_divergence(distribution, self._p_dist).sum(-1, keepdim=True) + loss = (kl - (1 / self._cfgs.lam) * ratio * adv) * (kl.detach() <= self._cfgs.eta).type( + torch.float32 ) - Lagrange.__init__( - self, - cost_limit=self.cfgs.lagrange_cfgs.cost_limit, - lagrangian_multiplier_init=self.cfgs.lagrange_cfgs.lagrangian_multiplier_init, - lambda_lr=self.cfgs.lagrange_cfgs.lambda_lr, - lambda_optimizer=self.cfgs.lagrange_cfgs.lambda_optimizer, - lagrangian_upper_bound=self.cfgs.lagrange_cfgs.lagrangian_upper_bound, - ) - self.lam = self.cfgs.lam - self.eta = self.cfgs.eta - self.p_dist = None - - def _specific_init_logs(self): - super()._specific_init_logs() - self.logger.register_key('Metrics/LagrangeMultiplier') + loss = loss.mean() + loss -= self._cfgs.entropy_coef * distribution.entropy().mean() - def algorithm_specific_logs(self) -> None: - """Log the FOCOPS specific information. 
+ entrophy = distribution.entropy().mean().item() + info = {'entrophy': entrophy, 'ratio': ratio.mean().item(), 'std': std} + return loss, info - .. list-table:: - - * - Things to log - - Description - * - Metrics/LagrangeMultiplier - - The Lagrange multiplier value in current epoch. - """ - super().algorithm_specific_logs() - self.logger.store( - **{ - 'Metrics/LagrangeMultiplier': self.lagrangian_multiplier.item(), - } + def _compute_adv_surrogate(self, adv_r: torch.Tensor, adv_c: torch.Tensor) -> torch.Tensor: + return (adv_r - self._lagrange.lagrangian_multiplier * adv_c) / ( + 1 + self._lagrange.lagrangian_multiplier ) - # pylint: disable-next=too-many-arguments - def compute_loss_pi( - self, - obs: torch.Tensor, - act: torch.Tensor, - log_p: torch.Tensor, - adv: torch.Tensor, - ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: - r""" - Computing pi/actor loss. - In FOCOPS, the loss is defined as: - - .. math:: - :nowrap: - - \begin{eqnarray} - L = \nabla_\theta D_{K L}\left(\pi_\theta \| \pi_{\theta^{old}}\right)[s] - -\frac{1}{\eta} \underset{a \sim \pi_{\theta^{old}}} - {\mathbb{E}}\left[\frac{\nabla_\theta \pi_\theta(a \mid s)} - {\pi_{\theta^{old}}(a \mid s)}\left(A^{R}_{\pi_{\theta^{old}}}(s, a) - -\lambda A^C_{\pi_{\theta^{old}}}(s, a)\right)\right] - \end{eqnarray} - - where :math:`\eta` is a hyperparameter, :math:`\lambda` is the Lagrange multiplier, - :math:`A_{\pi_{\theta_k}}(s, a)` is the advantage function, - :math:`A^C_{\pi_{\theta_k}}(s, a)` is the cost advantage function, - :math:`\pi^*` is the optimal policy, and :math:`\pi_{\theta_k}` is the current policy. - """ - dist, _log_p = self.actor_critic.actor(obs, act) - ratio = torch.exp(_log_p - log_p) - - kl_new_old = torch.distributions.kl.kl_divergence(dist, self.p_dist).sum(-1, keepdim=True) - loss_pi = (kl_new_old - (1 / self.lam) * ratio * adv) * ( - kl_new_old.detach() <= self.eta - ).type(torch.float32) - loss_pi = loss_pi.mean() - loss_pi -= self.cfgs.entropy_coef * dist.entropy().mean() - - # useful extra info - approx_kl = 0.5 * (log_p - _log_p).mean().item() - ent = dist.entropy().mean().item() - pi_info = {'kl': approx_kl, 'ent': ent, 'ratio': ratio.mean().item()} - - return loss_pi, pi_info - - def compute_surrogate( - self, - adv: torch.Tensor, - cost_adv: torch.Tensor, - ) -> torch.Tensor: - """Compute surrogate loss. - - Policy Gradient only use reward advantage. - - Args: - adv (torch.Tensor): reward advantage - cost_adv (torch.Tensor): cost advantage - """ - return (adv - self.lagrangian_multiplier * cost_adv) / (1 + self.lagrangian_multiplier) - - # pylint: disable-next=too-many-locals - def update(self) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]]: + def _update(self) -> None: """Update actor, critic, running statistics as we used in the :class:`PolicyGradient`. In addition, we also update the Lagrange multiplier parameter, by calling the :meth:`update_lagrange_multiplier` function. """ # note that logger already uses MPI statistics across all processes.. 
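Putting the two FOCOPS pieces above together: the penalty-combined advantage ``(adv_r - nu * adv_c) / (1 + nu)`` feeds a loss that only acts where the new policy is still within ``eta`` (in KL) of the old one, scaled by the temperature ``lam``. A standalone sketch with invented batch data and coefficients:

.. code-block:: python

    import torch
    from torch.distributions import Normal, kl_divergence

    lam, eta, nu = 1.5, 0.02, 0.3       # temperature, KL threshold, Lagrange multiplier

    old_dist = Normal(torch.zeros(64, 2), torch.ones(64, 2))
    new_dist = Normal(0.05 * torch.ones(64, 2), torch.ones(64, 2))

    logp_old = torch.randn(64)
    logp_new = logp_old + 0.02 * torch.randn(64)
    adv_r, adv_c = torch.randn(64), torch.randn(64)

    # penalty-combined advantage, as in ``_compute_adv_surrogate``
    adv = (adv_r - nu * adv_c) / (1 + nu)

    ratio = torch.exp(logp_new - logp_old)
    kl = kl_divergence(new_dist, old_dist).sum(-1)

    # masked loss: only samples still inside the KL region contribute
    loss = ((kl - (1 / lam) * ratio * adv) * (kl.detach() <= eta).float()).mean()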
- Jc = self.logger.get_stats('Metrics/EpCost')[0] + Jc = self._logger.get_stats('Metrics/EpCost')[0] # first update Lagrange multiplier parameter - self.update_lagrange_multiplier(Jc) - data = self.buf.get() - obs, act, log_p, target_v, target_c, adv, cost_adv = ( + self._lagrange.update_lagrange_multiplier(Jc) + + data = self._buf.get() + obs, act, logp, target_value_r, target_value_c, adv_r, adv_c = ( data['obs'], data['act'], data['logp'], @@ -161,76 +94,58 @@ def update(self) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]]: data['adv_r'], data['adv_c'], ) - # get the loss before - loss_pi_before, loss_v_before = self.loss_record.get_mean('loss_pi', 'loss_v') - if self.cfgs.use_cost: - loss_c_before = self.loss_record.get_mean('loss_c') - self.loss_record.reset('loss_pi', 'loss_v', 'loss_c') + original_obs = obs with torch.no_grad(): - old_dist = self.actor_critic.actor(obs) - old_mean, old_std = old_dist.mean, old_dist.stddev - - # load the data into the data loader. - dataset = torch.utils.data.TensorDataset( - obs, act, target_v, target_c, log_p, adv, cost_adv, old_mean, old_std - ) - loader = torch.utils.data.DataLoader( - dataset, batch_size=self.cfgs.num_mini_batches, shuffle=True + old_distribution = self._actor_critic.actor(obs) + old_mean = old_distribution.mean + old_std = old_distribution.stddev + + dataloader = DataLoader( + dataset=TensorDataset( + obs, act, logp, target_value_r, target_value_c, adv_r, adv_c, old_mean, old_std + ), + batch_size=self._cfgs.num_mini_batches, + shuffle=True, ) - # update the value net, cost net and policy net for several times. - for i in range(self.cfgs.actor_iters): - for _, ( - obs_b, - act_b, - target_v_b, - target_c_b, - log_p_b, - adv_b, - cost_adv_b, - old_mean_b, - old_std_b, - ) in enumerate(loader): - # update the value net. - self.update_value_net(obs_b, target_v_b) - # update the cost net, if use cost. - if self.cfgs.use_cost: - self.update_cost_net(obs_b, target_c_b) - # update the policy net. - self.p_dist = torch.distributions.Normal(old_mean_b, old_std_b) - self.update_policy_net(obs_b, act_b, log_p_b, adv_b, cost_adv_b) - # compute the new distribution of policy net. - new_dist = self.actor_critic.actor(obs) - # compute the KL divergence between old and new distribution. - torch_kl = ( - torch.distributions.kl.kl_divergence(old_dist, new_dist) + for i in range(self._cfgs.actor_iters): + for ( + obs, + act, + logp, + target_value_r, + target_value_c, + adv_r, + adv_c, + old_mean, + old_std, + ) in dataloader: + self._update_rewrad_critic(obs, target_value_r) + if self._cfgs.use_cost: + self._update_cost_critic(obs, target_value_c) + + self._p_dist = Normal(old_mean, old_std) + self._update_actor(obs, act, logp, adv_r, adv_c) + + new_distribution = self._actor_critic.actor(original_obs) + + kl = ( + torch.distributions.kl.kl_divergence(old_distribution, new_distribution) .sum(-1, keepdim=True) .mean() .item() ) - # if the KL divergence is larger than the target KL divergence, stop the update. - if self.cfgs.kl_early_stopping and torch_kl > self.cfgs.target_kl: - self.logger.log(f'KL early stop at the {i+1} th step.') + kl = distributed.dist_avg(kl) + + if self._cfgs.kl_early_stopping and kl > self._cfgs.target_kl: + self._logger.log(f'Early stopping at iter {i} due to reaching max kl') break - # log the information. 
- loss_pi, loss_v = self.loss_record.get_mean('loss_pi', 'loss_v') - self.logger.store( + + self._logger.store( **{ - 'Loss/Loss_pi': loss_pi, - 'Loss/Delta_loss_pi': loss_pi - loss_pi_before, 'Train/StopIter': i + 1, - 'Values/Adv': adv.mean().item(), - 'Train/KL': torch_kl, - 'Loss/Delta_loss_reward_critic': loss_v - loss_v_before, - 'Loss/Loss_reward_critic': loss_v, + 'Value/Adv': adv_r.mean().item(), + 'Train/KL': kl, + 'Metrics/LagrangeMultiplier': self._lagrange.lagrangian_multiplier, } ) - if self.cfgs.use_cost: - loss_c = self.loss_record.get_mean('loss_c') - self.logger.store( - **{ - 'Loss/Delta_loss_cost_critic': loss_c - loss_c_before, - 'Loss/Loss_cost_critic': loss_c, - } - ) - return data diff --git a/omnisafe/algorithms/on_policy/naive_lagrange/crpo.py b/omnisafe/algorithms/on_policy/naive_lagrange/crpo.py index 3d84b72a4..acfe874e1 100644 --- a/omnisafe/algorithms/on_policy/naive_lagrange/crpo.py +++ b/omnisafe/algorithms/on_policy/naive_lagrange/crpo.py @@ -14,12 +14,11 @@ # ============================================================================== """Implementation of the on-policy CRPO algorithm.""" -from typing import NamedTuple - import torch from omnisafe.algorithms import registry from omnisafe.algorithms.on_policy.base.ppo import PPO +from omnisafe.utils.config import Config @registry.register @@ -32,58 +31,29 @@ class OnCRPO(PPO): - URL: `CRPO `_. """ - def __init__(self, env_id: str, cfgs: NamedTuple) -> None: - """Initialize CRPO. - - Args: - env_id (str): The environment id. - cfgs (NamedTuple): The configuration of the algorithm. - """ - PPO.__init__( - self, - env_id=env_id, - cfgs=cfgs, - ) - self.rew_update = 0 - self.cost_update = 0 - - def _specific_init_logs(self): - super()._specific_init_logs() - self.logger.register_key('Misc/RewUpdate') - self.logger.register_key('Misc/CostUpdate') + def __init__(self, env_id: str, cfgs: Config) -> None: + super().__init__(env_id, cfgs) + self._rew_update = 0 + self._cost_update = 0 - def algorithm_specific_logs(self) -> None: - """Log the CRPO specific information. + def _init_log(self) -> None: + super()._init_log() + self._logger.register_key('Misc/RewUpdate') + self._logger.register_key('Misc/CostUpdate') - .. list-table:: - - * - Things to log - - Description - * - Metrics/LagrangeMultiplier - - The Lagrange multiplier value in current epoch. - """ - super().algorithm_specific_logs() - self.logger.store( + def _update(self) -> None: + super()._update() + self._logger.store( **{ - 'Misc/RewUpdate': self.rew_update, - 'Misc/CostUpdate': self.cost_update, + 'Misc/RewUpdate': self._rew_update, + 'Misc/CostUpdate': self._cost_update, } ) - def compute_surrogate(self, adv: torch.Tensor, cost_adv: torch.Tensor) -> torch.Tensor: - """Compute the surrogate loss of the policy. - - In CRPO algorithm, we first judge whether the cost is within the limit. - If the cost is within the limit, we use the advantage of the policy. - Otherwise, we use the advantage of the cost. - - Args: - adv (torch.Tensor): The advantage of the policy. - cost_adv (torch.Tensor): The advantage of the cost. 
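The OnCRPO surrogate above is a simple switch: while the measured episode cost stays under ``cost_limit + distance`` the update maximizes the reward advantage, otherwise it minimizes the cost advantage. As a standalone function, with illustrative argument values:

.. code-block:: python

    import torch

    def crpo_surrogate(adv_r, adv_c, ep_cost, cost_limit, distance):
        """Maximize reward while feasible; otherwise descend on the cost advantage."""
        if ep_cost <= cost_limit + distance:
            return adv_r
        return -adv_c

    adv = crpo_surrogate(torch.randn(64), torch.randn(64),
                         ep_cost=30.0, cost_limit=25.0, distance=2.0)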
- """ - Jc = self.logger.get_stats('Metrics/EpCost')[0] - if Jc <= self.cfgs.cost_limit + self.cfgs.distance: - self.rew_update += 1 - return adv - self.cost_update += 1 - return -cost_adv + def _compute_adv_surrogate(self, adv_r: torch.Tensor, adv_c: torch.Tensor) -> torch.Tensor: + Jc = self._logger.get_stats('Metrics/EpCost')[0] + if Jc <= self._cfgs.cost_limit + self._cfgs.distance: + self._rew_update += 1 + return adv_r + self._cost_update += 1 + return -adv_c diff --git a/omnisafe/algorithms/on_policy/naive_lagrange/pdo.py b/omnisafe/algorithms/on_policy/naive_lagrange/pdo.py index d3233871d..f73d120da 100644 --- a/omnisafe/algorithms/on_policy/naive_lagrange/pdo.py +++ b/omnisafe/algorithms/on_policy/naive_lagrange/pdo.py @@ -14,8 +14,6 @@ # ============================================================================== """Implementation of the PDO algorithm.""" -from typing import Dict, NamedTuple, Tuple - import torch from omnisafe.algorithms import registry @@ -24,39 +22,21 @@ @registry.register -class PDO(PolicyGradient, Lagrange): +class PDO(PolicyGradient): """The Lagrange version of the Policy Gradient algorithm. A simple combination of the :class:`Lagrange` method and the :class:`PolicyGradient` algorithm. """ - def __init__(self, env_id: str, cfgs: NamedTuple) -> None: - """Initialize PDO. - - PDO is a combination of :class:`PolicyGradient` and :class:`Lagrange` model. + def _init(self) -> None: + super()._init() + self._lagrange = Lagrange(**self._cfgs.lagrange_cfgs) - Args: - env_id (str): The environment id. - cfgs (NamedTuple): The configuration of the algorithm. - """ - PolicyGradient.__init__( - self, - env_id=env_id, - cfgs=cfgs, - ) - Lagrange.__init__( - self, - cost_limit=cfgs.lagrange_cfgs.cost_limit, - lagrangian_multiplier_init=cfgs.lagrange_cfgs.lagrangian_multiplier_init, - lambda_lr=cfgs.lagrange_cfgs.lambda_lr, - lambda_optimizer=cfgs.lagrange_cfgs.lambda_optimizer, - ) + def _init_log(self) -> None: + super()._init_log() + self._logger.register_key('Metrics/LagrangeMultiplier') - def _specific_init_logs(self): - super()._specific_init_logs() - self.logger.register_key('Metrics/LagrangeMultiplier') - - def update(self) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]]: + def _update(self) -> None: r"""Update actor, critic, running statistics as we used in the :class:`PolicyGradient` algorithm. Additionally, we update the Lagrange multiplier parameter, @@ -73,44 +53,15 @@ def update(self) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]]: where :math:`\lambda` is the Lagrange multiplier parameter. """ - # note that logger already uses MPI statistics across all processes. - Jc = self.logger.get_stats('Metrics/EpCost')[0] + # note that logger already uses MPI statistics across all processes.. + Jc = self._logger.get_stats('Metrics/EpCost')[0] # first update Lagrange multiplier parameter - self.update_lagrange_multiplier(Jc) + self._lagrange.update_lagrange_multiplier(Jc) # then update the policy and value function - PolicyGradient.update(self) - - def compute_surrogate( - self, - adv: torch.Tensor, - cost_adv: torch.Tensor, - ) -> torch.Tensor: - """Compute surrogate loss. + super()._update() - PDO uses the Lagrange method to combine the reward and cost. 
- The surrogate loss is defined as the difference between the reward - advantage and the cost advantage + self._logger.store(**{'Metrics/LagrangeMultiplier': self._lagrange.lagrangian_multiplier}) - Args: - adv (torch.Tensor): reward advantage - cost_adv (torch.Tensor): cost advantage - """ - penalty = self.lambda_range_projection(self.lagrangian_multiplier).item() - return (adv - penalty * cost_adv) / (1 + penalty) - - def algorithm_specific_logs(self) -> None: - """Log the PDO specific information. - - .. list-table:: - - * - Things to log - - Description - * - Metrics/LagrangeMultiplier - - The Lagrange multiplier value in current epoch. - """ - super().algorithm_specific_logs() - self.logger.store( - **{ - 'Metrics/LagrangeMultiplier': self.lagrangian_multiplier.item(), - } - ) + def _compute_adv_surrogate(self, adv_r: torch.Tensor, adv_c: torch.Tensor) -> torch.Tensor: + penalty = self._lagrange.lagrangian_multiplier.item() + return (adv_r - penalty * adv_c) / (1 + penalty) diff --git a/omnisafe/algorithms/on_policy/naive_lagrange/ppo_lag.py b/omnisafe/algorithms/on_policy/naive_lagrange/ppo_lag.py index cc438181e..3c7a31f4c 100644 --- a/omnisafe/algorithms/on_policy/naive_lagrange/ppo_lag.py +++ b/omnisafe/algorithms/on_policy/naive_lagrange/ppo_lag.py @@ -14,8 +14,6 @@ # ============================================================================== """Implementation of the Lagrange version of the PPO algorithm.""" -from typing import Dict, NamedTuple, Tuple - import torch from omnisafe.algorithms import registry @@ -24,46 +22,28 @@ @registry.register -class PPOLag(PPO, Lagrange): +class PPOLag(PPO): """The Lagrange version of the PPO algorithm. A simple combination of the Lagrange method and the Proximal Policy Optimization algorithm. """ - def __init__(self, env_id: str, cfgs: NamedTuple) -> None: - """Initialize PPOLag. - - PPOLag is a combination of :class:`PPO` and :class:`Lagrange` model. + def _init(self) -> None: + super()._init() + self._lagrange = Lagrange(**self._cfgs.lagrange_cfgs) - Args: - env_id (str): The environment id. - cfgs (NamedTuple): The configuration of the algorithm. - """ - PPO.__init__( - self, - env_id=env_id, - cfgs=cfgs, - ) - Lagrange.__init__( - self, - cost_limit=self.cfgs.lagrange_cfgs.cost_limit, - lagrangian_multiplier_init=self.cfgs.lagrange_cfgs.lagrangian_multiplier_init, - lambda_lr=self.cfgs.lagrange_cfgs.lambda_lr, - lambda_optimizer=self.cfgs.lagrange_cfgs.lambda_optimizer, - ) + def _init_log(self) -> None: + super()._init_log() + self._logger.register_key('Metrics/LagrangeMultiplier') - def _specific_init_logs(self): - super()._specific_init_logs() - self.logger.register_key('Metrics/LagrangeMultiplier') - - def update(self) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]]: - r"""Update actor, critic, running statistics as we used in the :class:`PPO` algorithm. + def _update(self) -> None: + r"""Update actor, critic, running statistics as we used in the :class:`PolicyGradient` algorithm. Additionally, we update the Lagrange multiplier parameter, by calling the :meth:`update_lagrange_multiplier` method. .. note:: - The :meth:`compute_loss_pi` is defined in the :class:`PPO` algorithm. + The :meth:`compute_loss_pi` is defined in the :class:`PolicyGradient` algorithm. When a lagrange multiplier is used, the :meth:`compute_loss_pi` method will return the loss of the policy as: @@ -74,42 +54,14 @@ def update(self) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]]: where :math:`\lambda` is the Lagrange multiplier parameter. 
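PDO, PPOLag, RCPO, and TRPOLag now share the same two-step pattern: update the Lagrange multiplier from the measured episode cost, then fold it into the advantage as ``(adv_r - lambda * adv_c) / (1 + lambda)``. A minimal sketch of that pattern; the multiplier update here is the usual gradient step on ``-lambda * (Jc - d)`` and is only an approximation of what omnisafe's ``Lagrange`` class does internally:

.. code-block:: python

    import torch

    lagrangian = torch.nn.Parameter(torch.tensor(0.0))
    lambda_optimizer = torch.optim.Adam([lagrangian], lr=0.035)
    cost_limit = 25.0

    def update_lagrange_multiplier(ep_cost: float) -> None:
        # minimizing -lambda * (Jc - d) grows lambda while the constraint is violated
        lambda_optimizer.zero_grad()
        loss = -lagrangian * (ep_cost - cost_limit)
        loss.backward()
        lambda_optimizer.step()
        lagrangian.data.clamp_(min=0.0)         # keep the multiplier non-negative

    update_lagrange_multiplier(ep_cost=32.0)

    penalty = lagrangian.item()
    adv_r, adv_c = torch.randn(64), torch.randn(64)
    adv = (adv_r - penalty * adv_c) / (1 + penalty)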
""" # note that logger already uses MPI statistics across all processes.. - Jc = self.logger.get_stats('Metrics/EpCost')[0] + Jc = self._logger.get_stats('Metrics/EpCost')[0] # first update Lagrange multiplier parameter - self.update_lagrange_multiplier(Jc) - PPO.update(self) - - def compute_surrogate( - self, - adv: torch.Tensor, - cost_adv: torch.Tensor, - ) -> torch.Tensor: - """Compute surrogate loss. + self._lagrange.update_lagrange_multiplier(Jc) + # then update the policy and value function + super()._update() - PPOLag uses the Lagrange method to combine the reward and cost. - The surrogate loss is defined as the difference between the reward - advantage and the cost advantage + self._logger.store(**{'Metrics/LagrangeMultiplier': self._lagrange.lagrangian_multiplier}) - Args: - adv (torch.Tensor): reward advantage - cost_adv (torch.Tensor): cost advantage - """ - penalty = self.lambda_range_projection(self.lagrangian_multiplier).item() - return (adv - penalty * cost_adv) / (1 + penalty) - - def algorithm_specific_logs(self) -> None: - """Log the PPOLag specific information. - - .. list-table:: - - * - Things to log - - Description - * - Metrics/LagrangeMultiplier - - The Lagrange multiplier value in current epoch. - """ - super().algorithm_specific_logs() - self.logger.store( - **{ - 'Metrics/LagrangeMultiplier': self.lagrangian_multiplier.item(), - } - ) + def _compute_adv_surrogate(self, adv_r: torch.Tensor, adv_c: torch.Tensor) -> torch.Tensor: + penalty = self._lagrange.lagrangian_multiplier.item() + return (adv_r - penalty * adv_c) / (1 + penalty) diff --git a/omnisafe/algorithms/on_policy/naive_lagrange/rcpo.py b/omnisafe/algorithms/on_policy/naive_lagrange/rcpo.py index 169b7af65..9017f4383 100644 --- a/omnisafe/algorithms/on_policy/naive_lagrange/rcpo.py +++ b/omnisafe/algorithms/on_policy/naive_lagrange/rcpo.py @@ -14,8 +14,6 @@ # ============================================================================== """Implementation of the Reward Constrained Policy Optimization algorithm.""" -from typing import Dict, NamedTuple, Tuple - import torch from omnisafe.algorithms import registry @@ -24,7 +22,7 @@ @registry.register -class RCPO(NaturalPG, Lagrange): +class RCPO(NaturalPG): """Reward Constrained Policy Optimization. References: @@ -33,34 +31,16 @@ class RCPO(NaturalPG, Lagrange): - URL: `Reward Constrained Policy Optimization `_ """ - def __init__(self, env_id: str, cfgs: NamedTuple) -> None: - """Initialize RCPO. - - RCPO is a combination of :class:`NaturalPG` and :class:`Lagrange` model. + def _init(self) -> None: + super()._init() + self._lagrange = Lagrange(**self._cfgs.lagrange_cfgs) - Args: - env_id (str): The environment id. - cfgs (NamedTuple): The configuration of the algorithm. - """ - NaturalPG.__init__( - self, - env_id=env_id, - cfgs=cfgs, - ) - Lagrange.__init__( - self, - cost_limit=self.cfgs.lagrange_cfgs.cost_limit, - lagrangian_multiplier_init=self.cfgs.lagrange_cfgs.lagrangian_multiplier_init, - lambda_lr=self.cfgs.lagrange_cfgs.lambda_lr, - lambda_optimizer=self.cfgs.lagrange_cfgs.lambda_optimizer, - ) + def _init_log(self) -> None: + super()._init_log() + self._logger.register_key('Metrics/LagrangeMultiplier') - def _specific_init_logs(self): - super()._specific_init_logs() - self.logger.register_key('Metrics/LagrangeMultiplier') - - def update(self) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]]: - r"""Update actor, critic, running statistics as we used in the :class:`NaturalPG` algorithm. 
+ def _update(self) -> None: + r"""Update actor, critic, running statistics as we used in the :class:`PolicyGradient` algorithm. Additionally, we update the Lagrange multiplier parameter, by calling the :meth:`update_lagrange_multiplier` method. @@ -77,43 +57,14 @@ def update(self) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]]: where :math:`\lambda` is the Lagrange multiplier parameter. """ # note that logger already uses MPI statistics across all processes.. - Jc = self.logger.get_stats('Metrics/EpCost')[0] + Jc = self._logger.get_stats('Metrics/EpCost')[0] # first update Lagrange multiplier parameter - self.update_lagrange_multiplier(Jc) - # then update the policy and value net. - NaturalPG.update(self) - - def compute_surrogate( - self, - adv: torch.Tensor, - cost_adv: torch.Tensor, - ) -> torch.Tensor: - """Compute surrogate loss. + self._lagrange.update_lagrange_multiplier(Jc) + # then update the policy and value function + super()._update() - RCPO uses the Lagrange method to combine the reward and cost. - The surrogate loss is defined as the difference between the reward - advantage and the cost advantage + self._logger.store(**{'Metrics/LagrangeMultiplier': self._lagrange.lagrangian_multiplier}) - Args: - adv (torch.Tensor): reward advantage - cost_adv (torch.Tensor): cost advantage - """ - penalty = self.lambda_range_projection(self.lagrangian_multiplier).item() - return (adv - penalty * cost_adv) / (1 + penalty) - - def algorithm_specific_logs(self) -> None: - """Log the RCPO specific information. - - .. list-table:: - - * - Things to log - - Description - * - Metrics/LagrangeMultiplier - - The Lagrange multiplier value in current epoch. - """ - super().algorithm_specific_logs() - self.logger.store( - **{ - 'Metrics/LagrangeMultiplier': self.lagrangian_multiplier.item(), - } - ) + def _compute_adv_surrogate(self, adv_r: torch.Tensor, adv_c: torch.Tensor) -> torch.Tensor: + penalty = self._lagrange.lagrangian_multiplier.item() + return (adv_r - penalty * adv_c) / (1 + penalty) diff --git a/omnisafe/algorithms/on_policy/naive_lagrange/trpo_lag.py b/omnisafe/algorithms/on_policy/naive_lagrange/trpo_lag.py index 29c39cf76..8a53b38b3 100644 --- a/omnisafe/algorithms/on_policy/naive_lagrange/trpo_lag.py +++ b/omnisafe/algorithms/on_policy/naive_lagrange/trpo_lag.py @@ -14,8 +14,6 @@ # ============================================================================== """Implementation of the Lagrange version of the TRPO algorithm.""" -from typing import Dict, NamedTuple, Tuple - import torch from omnisafe.algorithms import registry @@ -24,46 +22,28 @@ @registry.register -class TRPOLag(TRPO, Lagrange): +class TRPOLag(TRPO): """The Lagrange version of the TRPO algorithm. A simple combination of the Lagrange method and the Trust Region Policy Optimization algorithm. """ - def __init__(self, env_id: str, cfgs: NamedTuple) -> None: - """Initialize TRPOLag. - - TRPOLag is a combination of :class:`TRPO` and :class:`Lagrange` model. + def _init(self) -> None: + super()._init() + self._lagrange = Lagrange(**self._cfgs.lagrange_cfgs) - Args: - env_id (str): The environment id. - cfgs (NamedTuple): The configuration of the algorithm. 
- """ - TRPO.__init__( - self, - env_id=env_id, - cfgs=cfgs, - ) - Lagrange.__init__( - self, - cost_limit=self.cfgs.lagrange_cfgs.cost_limit, - lagrangian_multiplier_init=self.cfgs.lagrange_cfgs.lagrangian_multiplier_init, - lambda_lr=self.cfgs.lagrange_cfgs.lambda_lr, - lambda_optimizer=self.cfgs.lagrange_cfgs.lambda_optimizer, - ) + def _init_log(self) -> None: + super()._init_log() + self._logger.register_key('Metrics/LagrangeMultiplier') - def _specific_init_logs(self): - super()._specific_init_logs() - self.logger.register_key('Metrics/LagrangeMultiplier') - - def update(self) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]]: - r"""Update actor, critic, running statistics as we used in the :class:`TRPO` algorithm. + def _update(self) -> None: + r"""Update actor, critic, running statistics as we used in the :class:`PolicyGradient` algorithm. Additionally, we update the Lagrange multiplier parameter, by calling the :meth:`update_lagrange_multiplier` method. .. note:: - The :meth:`compute_loss_pi` method is defined in the :class:`PolicyGradient` algorithm. + The :meth:`compute_loss_pi` is defined in the :class:`PolicyGradient` algorithm. When a lagrange multiplier is used, the :meth:`compute_loss_pi` method will return the loss of the policy as: @@ -74,43 +54,14 @@ def update(self) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]]: where :math:`\lambda` is the Lagrange multiplier parameter. """ # note that logger already uses MPI statistics across all processes.. - Jc = self.logger.get_stats('Metrics/EpCost')[0] + Jc = self._logger.get_stats('Metrics/EpCost')[0] # first update Lagrange multiplier parameter - self.update_lagrange_multiplier(Jc) + self._lagrange.update_lagrange_multiplier(Jc) # then update the policy and value function - TRPO.update(self) - - def compute_surrogate( - self, - adv: torch.Tensor, - cost_adv: torch.Tensor, - ) -> torch.Tensor: - """Compute surrogate loss. + super()._update() - TRPOLag uses the Lagrange method to combine the reward and cost. - The surrogate loss is defined as the difference between the reward - advantage and the cost advantage + self._logger.store(**{'Metrics/LagrangeMultiplier': self._lagrange.lagrangian_multiplier}) - Args: - adv (torch.Tensor): reward advantage - cost_adv (torch.Tensor): cost advantage - """ - penalty = self.lambda_range_projection(self.lagrangian_multiplier).item() - return (adv - penalty * cost_adv) / (1 + penalty) - - def algorithm_specific_logs(self) -> None: - """Log the TRPOLag specific information. - - .. list-table:: - - * - Things to log - - Description - * - Metrics/LagrangeMultiplier - - The Lagrange multiplier value in current epoch. 
- """ - super().algorithm_specific_logs() - self.logger.store( - **{ - 'Metrics/LagrangeMultiplier': self.lagrangian_multiplier.item(), - } - ) + def _compute_adv_surrogate(self, adv_r: torch.Tensor, adv_c: torch.Tensor) -> torch.Tensor: + penalty = self._lagrange.lagrangian_multiplier.item() + return (adv_r - penalty * adv_c) / (1 + penalty) diff --git a/omnisafe/algorithms/on_policy/penalty_function/ipo.py b/omnisafe/algorithms/on_policy/penalty_function/ipo.py index 96848aa72..222c1493e 100644 --- a/omnisafe/algorithms/on_policy/penalty_function/ipo.py +++ b/omnisafe/algorithms/on_policy/penalty_function/ipo.py @@ -14,8 +14,6 @@ # ============================================================================== """Implementation of IPO algorithm.""" -from typing import NamedTuple - import torch from omnisafe.algorithms import registry @@ -32,31 +30,17 @@ class IPO(PPO): - URL: `IPO `_ """ - def __init__(self, env_id: str, cfgs: NamedTuple) -> None: - """Initialize IPO.""" - PPO.__init__( - self, - env_id=env_id, - cfgs=cfgs, - ) - self.penalty = 0 - - def _specific_init_logs(self): - super()._specific_init_logs() - self.logger.register_key('Penalty') - - def algorithm_specific_logs(self): - super().algorithm_specific_logs() - self.logger.store( - **{ - 'Penalty': self.penalty, - } - ) - - def compute_surrogate(self, adv: torch.Tensor, cost_adv: torch.Tensor) -> torch.Tensor: + def _init_log(self) -> None: + super()._init_log() + self._logger.register_key('Misc/Penalty') + + def _compute_adv_surrogate(self, adv_r: torch.Tensor, adv_c: torch.Tensor) -> torch.Tensor: """Compute surrogate loss.""" - Jc = self.logger.get_stats('Metrics/EpCost')[0] - self.penalty = self.cfgs.kappa / (self.cfgs.cost_limit - Jc + 1e-8) - if self.penalty < 0 or self.penalty > self.cfgs.penalty_max: - self.penalty = self.cfgs.penalty_max - return (adv - self.penalty * cost_adv) / (1 + self.penalty) + Jc = self._logger.get_stats('Metrics/EpCost')[0] + penalty = self._cfgs.kappa / (self._cfgs.cost_limit - Jc + 1e-8) + if penalty < 0 or penalty > self._cfgs.penalty_max: + penalty = self._cfgs.penalty_max + + self._logger.store(**{'Misc/Penalty': penalty}) + + return (adv_r - penalty * adv_c) / (1 + penalty) diff --git a/omnisafe/algorithms/on_policy/penalty_function/p3o.py b/omnisafe/algorithms/on_policy/penalty_function/p3o.py index c92fc4a42..1fc94881f 100644 --- a/omnisafe/algorithms/on_policy/penalty_function/p3o.py +++ b/omnisafe/algorithms/on_policy/penalty_function/p3o.py @@ -19,7 +19,7 @@ from omnisafe.algorithms import registry from omnisafe.algorithms.on_policy.base.ppo import PPO -from omnisafe.utils import distributed_utils +from omnisafe.utils import distributed @registry.register @@ -32,48 +32,32 @@ class P3O(PPO): - URL: `P3O `_ """ - def compute_loss_cost_performance( + def _init_log(self) -> None: + super()._init_log() + self._logger.register_key('Loss/Loss_pi_cost', delta=True) + + def _loss_pi_cost( self, obs: torch.Tensor, act: torch.Tensor, - log_p: torch.Tensor, - cost_adv: torch.Tensor, + logp: torch.Tensor, + adv_c: torch.Tensor, ) -> torch.Tensor: - r"""Compute the loss of the cost performance. - - The loss is defined as: - - .. 
math:: - - \mathcal{L}_{\pi_c} = \kappa \cdot \max - \left(0, \frac{\pi_c(a_t|s_t)}{\pi(a_t|s_t)} \cdot A_{c_t} + J_c - \bar{J}_c\right) - - where :math:`\kappa` is the penalty coefficient, :math:`\pi_c` is the cost performance, - :math:`\pi` is the policy, :math:`A_{c_t}` is the cost advantage, :math:`J_c` is the cost - of the current episode, and :math:`\bar{J}_c` is the cost limit. - - Args: - obs (torch.Tensor): The observation tensor. - act (torch.Tensor): The action tensor. - log_p (torch.Tensor): The log probability of the action. - cost_adv (torch.Tensor): The cost advantage. - """ - _, _log_p = self.actor_critic.actor(obs, act) - ratio = torch.exp(_log_p - log_p) - ratio_clip = torch.clamp(ratio, 1 - self.cfgs.clip, 1 + self.cfgs.clip) - surr_cadv = (ratio_clip * cost_adv).mean() - Jc = self.logger.get_stats('Metrics/EpCost')[0] - loss_pi_c = self.cfgs.kappa * F.relu(surr_cadv + Jc) - return loss_pi_c.mean() - - # pylint: disable-next=too-many-locals,too-many-arguments - def update_policy_net( + self._actor_critic.actor(obs) + logp_ = self._actor_critic.actor.log_prob(act) + ratio = torch.exp(logp_ - logp) + surr_cadv = (ratio * adv_c).mean() + Jc = self._logger.get_stats('Metrics/EpCost')[0] - self._cfgs.cost_limit + loss_cost = self._cfgs.kappa * F.relu(surr_cadv + Jc) + return loss_cost.mean() + + def _update_actor( self, obs: torch.Tensor, act: torch.Tensor, - log_p: torch.Tensor, - adv: torch.Tensor, - cost_adv: torch.Tensor, + logp: torch.Tensor, + adv_r: torch.Tensor, + adv_c: torch.Tensor, ) -> None: r"""Update policy network under a double for loop. @@ -100,31 +84,26 @@ def update_policy_net( adv (torch.Tensor): ``advantage`` stored in buffer. cost_adv (torch.Tensor): ``cost_advantage`` stored in buffer. """ - # process the advantage function. - processed_adv = self.compute_surrogate(adv=adv, cost_adv=cost_adv) - # compute the loss of policy net. - loss_pi, pi_info = self.compute_loss_pi(obs=obs, act=act, log_p=log_p, adv=processed_adv) - # compute the cost performance of policy net. - loss_pi_c = self.compute_loss_cost_performance( - obs=obs, act=act, log_p=log_p, cost_adv=cost_adv - ) - # log the loss of policy net. - self.loss_record.append(loss_pi=(loss_pi - loss_pi_c).mean().item()) - # update the policy net. - self.actor_optimizer.zero_grad() - # backward the loss of policy net. - (loss_pi + loss_pi_c).backward() - # clip the gradient of policy net. - if self.cfgs.use_max_grad_norm: + loss_reward, info = self._loss_pi(obs, act, logp, adv_r) + loss_cost = self._loss_pi_cost(obs, act, logp, adv_c) + + loss = loss_reward - loss_cost + + self._actor_critic.actor_optimizer.zero_grad() + loss.backward() + if self._cfgs.use_max_grad_norm: torch.nn.utils.clip_grad_norm_( - self.actor_critic.actor.parameters(), self.cfgs.max_grad_norm + self._actor_critic.actor.parameters(), self._cfgs.max_grad_norm ) - # average the gradient of policy net. 
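The new `_loss_pi_cost` above is a hinge penalty: it only pushes back on the policy when the surrogate cost advantage plus the current constraint violation is positive. A rough standalone sketch of that computation with made-up numbers (`kappa` and `cost_limit` are illustrative values, not package defaults):

import torch
import torch.nn.functional as F

kappa, cost_limit = 20.0, 25.0              # illustrative config values
logp_old = torch.tensor([-1.2, -0.9])       # log-probs stored in the buffer
logp_new = torch.tensor([-1.0, -1.1])       # log-probs under the current policy
adv_c = torch.tensor([0.3, -0.1])           # cost advantages

ratio = torch.exp(logp_new - logp_old)      # importance-sampling ratio
surr_cadv = (ratio * adv_c).mean()
Jc = 30.0 - cost_limit                      # episodic cost minus the limit (violation of 5)
loss_cost = kappa * F.relu(surr_cadv + Jc)  # zero whenever the constraint term is non-positive
print(loss_cost)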
- distributed_utils.mpi_avg_grads(self.actor_critic.actor) - self.actor_optimizer.step() - self.logger.store( + distributed.avg_grads(self._actor_critic.actor) + self._actor_critic.actor_optimizer.step() + + self._logger.store( **{ - 'Train/Entropy': pi_info['ent'], - 'Train/PolicyRatio': pi_info['ratio'], + 'Train/Entropy': info['entrophy'], + 'Train/PolicyRatio': info['ratio'], + 'Train/PolicyStd': info['std'], + 'Loss/Loss_pi': loss_reward.mean().item(), + 'Loss/Loss_pi_cost': loss_cost.mean().item(), } ) diff --git a/omnisafe/algorithms/on_policy/pid_lagrange/cppo_pid.py b/omnisafe/algorithms/on_policy/pid_lagrange/cppo_pid.py index 4da4a90dc..64ad66dcd 100644 --- a/omnisafe/algorithms/on_policy/pid_lagrange/cppo_pid.py +++ b/omnisafe/algorithms/on_policy/pid_lagrange/cppo_pid.py @@ -14,17 +14,15 @@ # ============================================================================== """Implementation of the PID-Lagrange version of the CPPO algorithm.""" -from typing import Dict, NamedTuple, Tuple - import torch from omnisafe.algorithms import registry -from omnisafe.algorithms.on_policy.base.policy_gradient import PolicyGradient +from omnisafe.algorithms.on_policy.base.ppo import PPO from omnisafe.common.pid_lagrange import PIDLagrangian @registry.register -class CPPOPid(PolicyGradient, PIDLagrangian): +class CPPOPid(PPO): r"""The PID-Lagrange version of the CPPO algorithm. Similar to :class:`PDO`, which is a simple combination of :class:`PolicyGradient` and :class:`Lagrange`, @@ -41,127 +39,50 @@ class CPPOPid(PolicyGradient, PIDLagrangian): - URL: https://arxiv.org/abs/2007.03964 """ - def __init__(self, env_id: str, cfgs: NamedTuple) -> None: - """Initialize CPPOPid. - - CPPOPid is a simple combination of :class:`PolicyGradient` and :class:`PIDLagrangian`. - - Args: - env_id (str): The environment id. - cfgs (NamedTuple): The configuration of the algorithm. - """ - PolicyGradient.__init__( - self, - env_id=env_id, - cfgs=cfgs, - ) - PIDLagrangian.__init__(self, **self.cfgs.PID_cfgs) - - self.clip = self.cfgs.clip - - def _specific_init_logs(self): - super()._specific_init_logs() - self.logger.register_key('Metrics/LagrangeMultiplier') - self.logger.register_key('PID/pid_Kp') - self.logger.register_key('PID/pid_Ki') - self.logger.register_key('PID/pid_Kd') - - def algorithm_specific_logs(self) -> None: - """Log the CPPOPid specific information. - - .. list-table:: - - * - Things to log - - Description - * - Metrics/LagrangeMultiplier - - The Lagrange multiplier value in current epoch. - * - PID/pid_Kp - - The Kp value in current epoch. - * - PID/pid_Ki - - The Ki value in current epoch. - * - PID/pid_Kd - - The Kd value in current epoch. - """ - super().algorithm_specific_logs() - self.logger.store( - **{ - 'Metrics/LagrangeMultiplier': self.cost_penalty, - 'PID/pid_Kp': self.pid_kp, - 'PID/pid_Ki': self.pid_ki, - 'PID/pid_Kd': self.pid_kd, - } - ) - - # pylint: disable-next=too-many-arguments,too-many-locals - def compute_loss_pi( - self, - obs: torch.Tensor, - act: torch.Tensor, - log_p: torch.Tensor, - adv: torch.Tensor, - ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: - r""" - Computing pi/actor loss. - In CPPOPid, the loss is defined as: - - .. 
math:: - L^{CLIP} = \mathbb{E}_{s_t \sim \rho_{\pi}} - \left[ \min(r_t (A^{R}_t - \lambda A^{C}_t), \text{clip}(r_t, 1-\epsilon, 1+\epsilon) (A^{R}_t - - \lambda A^{C}_t)) \right] - - where :math:`r_t = \frac{\pi_\theta(a_t|s_t)}{\pi_\theta^{old}(a_t|s_t)}`, - :math:`\epsilon` is the clip parameter, :math:`A^{R}_t` is the reward advantage, - :math:`A^{C}_t` is the cost advantage, and :math:`\lambda` is the Lagrange multiplier. - - Args: - obs (torch.Tensor): ``observation`` stored in buffer. - act (torch.Tensor): ``action`` stored in buffer. - log_p (torch.Tensor): ``log probability`` of action stored in buffer. - adv (torch.Tensor): ``advantage`` stored in buffer. - cost_adv (torch.Tensor): ``cost advantage`` stored in buffer. - """ - dist, _log_p = self.actor_critic.actor(obs, act) - ratio = torch.exp(_log_p - log_p) - ratio_clip = torch.clamp(ratio, 1 - self.clip, 1 + self.clip) + def _init(self) -> None: + super()._init() + self._pid_lag = PIDLagrangian(**self._cfgs.PID_cfgs) - surr_adv = (torch.min(ratio * adv, ratio_clip * adv)).mean() + def _init_log(self) -> None: + super()._init_log() + self._logger.register_key('Metrics/LagrangeMultiplier') + self._logger.register_key('PID/pid_Kp') + self._logger.register_key('PID/pid_Ki') + self._logger.register_key('PID/pid_Kd') - loss_pi = -surr_adv - loss_pi -= self.cfgs.entropy_coef * dist.entropy().mean() + def _compute_adv_surrogate(self, adv_r: torch.Tensor, adv_c: torch.Tensor) -> torch.Tensor: + penalty = self._pid_lag.cost_penalty + return (adv_r - penalty * adv_c) / (1 + penalty) - # useful extra info - approx_kl = 0.5 * (log_p - _log_p).mean().item() - ent = dist.entropy().mean().item() - pi_info = {'kl': approx_kl, 'ent': ent, 'ratio': ratio.mean().item()} + def _update(self) -> None: + r"""Update actor, critic, running statistics as we used in the :class:`PolicyGradient` algorithm. - return loss_pi, pi_info - - def compute_surrogate( - self, - adv: torch.Tensor, - cost_adv: torch.Tensor, - ) -> torch.Tensor: - """Compute surrogate loss. - - CPPOPid uses the Lagrange method to combine the reward and cost. - The surrogate loss is defined as the difference between the reward - advantage and the cost advantage + Additionally, we update the Lagrange multiplier parameter, + by calling the :meth:`update_lagrange_multiplier` method. - Args: - adv (torch.Tensor): reward advantage - cost_adv (torch.Tensor): cost advantage - """ - return (adv - self.cost_penalty * cost_adv) / (1 + self.cost_penalty) + .. note:: + The :meth:`compute_loss_pi` is defined in the :class:`PolicyGradient` algorithm. + When a lagrange multiplier is used, + the :meth:`compute_loss_pi` method will return the loss of the policy as: - def update(self) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]]: - r"""Update actor, critic, running statistics as we used in the :class:`PPO` algorithm. + .. math:: + L_{\pi} = \mathbb{E}_{s_t \sim \rho_{\pi}} \left[ \frac{\pi_\theta(a_t|s_t)}{\pi_\theta^{old}(a_t|s_t)} + [A^{R}(s_t, a_t) - \lambda A^{C}(s_t, a_t)] \right] - Additionally, we update the Lagrange multiplier parameter, - by calling the :meth:`update_lagrange_multiplier` method. + where :math:`\lambda` is the Lagrange multiplier parameter. """ - # note that logger already uses MPI statistics across all processes. - Jc = self.logger.get_stats('Metrics/EpCost')[0] - # first update Lagrange multiplier parameter. - self.pid_update(Jc) - # then update the policy and value net. 
- PolicyGradient.update(self) + # note that logger already uses MPI statistics across all processes.. + Jc = self._logger.get_stats('Metrics/EpCost')[0] + # first update Lagrange multiplier parameter + self._pid_lag.pid_update(Jc) + # then update the policy and value function + super()._update() + + self._logger.store( + **{ + 'Metrics/LagrangeMultiplier': self._pid_lag.cost_penalty, + 'PID/pid_Kp': self._pid_lag.pid_kp, + 'PID/pid_Ki': self._pid_lag.pid_ki, + 'PID/pid_Kd': self._pid_lag.pid_kd, + } + ) diff --git a/omnisafe/algorithms/on_policy/pid_lagrange/trpo_pid.py b/omnisafe/algorithms/on_policy/pid_lagrange/trpo_pid.py index b11091bfb..35a303e23 100644 --- a/omnisafe/algorithms/on_policy/pid_lagrange/trpo_pid.py +++ b/omnisafe/algorithms/on_policy/pid_lagrange/trpo_pid.py @@ -14,8 +14,6 @@ # ============================================================================== """Implementation of the PID-Lagrange version of the TRPO algorithm.""" -from typing import Dict, NamedTuple, Tuple - import torch from omnisafe.algorithms import registry @@ -24,7 +22,7 @@ @registry.register -class TRPOPid(TRPO, PIDLagrangian): +class TRPOPid(TRPO): """The PID-Lagrange version of the TRPO algorithm. References: @@ -33,122 +31,50 @@ class TRPOPid(TRPO, PIDLagrangian): - URL: https://arxiv.org/abs/2007.03964 """ - def __init__(self, env_id: str, cfgs: NamedTuple) -> None: - """Initialize TRPOPid. + def _init(self) -> None: + super()._init() + self._pid_lag = PIDLagrangian(**self._cfgs.PID_cfgs) - TRPOPid is a simple combination of :class:`TRPO` and :class:`PIDLagrangian`. + def _init_log(self) -> None: + super()._init_log() + self._logger.register_key('Metrics/LagrangeMultiplier') + self._logger.register_key('PID/pid_Kp') + self._logger.register_key('PID/pid_Ki') + self._logger.register_key('PID/pid_Kd') - Args: - env_id (str): The environment id. - cfgs (NamedTuple): The configuration of the algorithm. - """ - TRPO.__init__( - self, - env_id=env_id, - cfgs=cfgs, - ) - PIDLagrangian.__init__(self, **self.cfgs.PID_cfgs) - self.cost_limit = self.cfgs.cost_limit + def _compute_adv_surrogate(self, adv_r: torch.Tensor, adv_c: torch.Tensor) -> torch.Tensor: + penalty = self._pid_lag.cost_penalty + return (adv_r - penalty * adv_c) / (1 + penalty) - def _specific_init_logs(self): - super()._specific_init_logs() - self.logger.register_key('Metrics/LagrangeMultiplier') - self.logger.register_key('PID/pid_Kp') - self.logger.register_key('PID/pid_Ki') - self.logger.register_key('PID/pid_Kd') + def _update(self) -> None: + r"""Update actor, critic, running statistics as we used in the :class:`PolicyGradient` algorithm. - def algorithm_specific_logs(self) -> None: - """Log the TRPOPid specific information. + Additionally, we update the Lagrange multiplier parameter, + by calling the :meth:`update_lagrange_multiplier` method. + + .. note:: + The :meth:`compute_loss_pi` is defined in the :class:`PolicyGradient` algorithm. + When a lagrange multiplier is used, + the :meth:`compute_loss_pi` method will return the loss of the policy as: - .. list-table:: + .. math:: + L_{\pi} = \mathbb{E}_{s_t \sim \rho_{\pi}} \left[ \frac{\pi_\theta(a_t|s_t)}{\pi_\theta^{old}(a_t|s_t)} + [A^{R}(s_t, a_t) - \lambda A^{C}(s_t, a_t)] \right] - * - Things to log - - Description - * - Metrics/LagrangeMultiplier - - The Lagrange multiplier value in current epoch. - * - PID/pid_Kp - - The Kp value in current epoch. - * - PID/pid_Ki - - The Ki value in current epoch. - * - PID/pid_Kd - - The Kd value in current epoch. 
+ where :math:`\lambda` is the Lagrange multiplier parameter. """ - super().algorithm_specific_logs() - self.logger.store( + # note that logger already uses MPI statistics across all processes.. + Jc = self._logger.get_stats('Metrics/EpCost')[0] + # first update Lagrange multiplier parameter + self._pid_lag.pid_update(Jc) + # then update the policy and value function + super()._update() + + self._logger.store( **{ - 'Metrics/LagrangeMultiplier': self.cost_penalty, - 'PID/pid_Kp': self.pid_kp, - 'PID/pid_Ki': self.pid_ki, - 'PID/pid_Kd': self.pid_kd, + 'Metrics/LagrangeMultiplier': self._pid_lag.cost_penalty, + 'PID/pid_Kp': self._pid_lag.pid_kp, + 'PID/pid_Ki': self._pid_lag.pid_ki, + 'PID/pid_Kd': self._pid_lag.pid_kd, } ) - - # pylint: disable-next=too-many-arguments - def compute_loss_pi( - self, - obs: torch.Tensor, - act: torch.Tensor, - log_p: torch.Tensor, - adv: torch.Tensor, - ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: - r""" - Computing pi/actor loss. - In CPPOPid, the loss is defined as: - - .. math:: - L = \mathbb{E}_{s_t \sim \rho_{\pi}} \left[ \frac{\pi_\theta(a_t|s_t)} - {\pi_\theta^{old}(a_t|s_t)} [A^{R}_t(s_t, a_t) - \lambda A^{C}_t(s_t, a_t)] \right] - - where :math:`A^{R}_t` is the advantage from the reward and :math:`A^{C}_t` is the advantage from the cost, - and :math:`\lambda` is the Lagrange multiplier controlled by the PID controller. - - Args: - obs (torch.Tensor): :meth:`observation` stored in buffer. - act (torch.Tensor): :meth:`action` stored in buffer. - log_p (torch.Tensor): ``log probability`` of action stored in buffer. - adv (torch.Tensor): :meth:`advantage` stored in buffer. - cost_adv (torch.Tensor): :meth:`cost advantage` stored in buffer. - """ - dist, _log_p = self.actor_critic.actor(obs, act) - ratio = torch.exp(_log_p - log_p) - - # compute loss via ratio and advantage - loss_pi = -(ratio * adv).mean() - loss_pi -= self.cfgs.entropy_coef * dist.entropy().mean() - - # useful extra info - approx_kl = 0.5 * (log_p - _log_p).mean().item() - ent = dist.entropy().mean().item() - pi_info = {'kl': approx_kl, 'ent': ent, 'ratio': ratio.mean().item()} - - return loss_pi, pi_info - - def compute_surrogate( - self, - adv: torch.Tensor, - cost_adv: torch.Tensor, - ) -> torch.Tensor: - """Compute surrogate loss. - - TRPOPid uses the Lagrange method to combine the reward and cost. - The surrogate loss is defined as the difference between the reward - advantage and the cost advantage - - Args: - adv (torch.Tensor): reward advantage - cost_adv (torch.Tensor): cost advantage - """ - return (adv - self.cost_penalty * cost_adv) / (1 + self.cost_penalty) - - def update(self) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]]: - r"""Update actor, critic, running statistics as we used in the :class:`TRPO` algorithm. - - Additionally, we update the Lagrange multiplier parameter, - by calling the :meth:`update_lagrange_multiplier` method. - """ - # note that logger already uses MPI statistics across all processes. - Jc = self.logger.get_stats('Metrics/EpCost')[0] - # first update Lagrange multiplier parameter - self.pid_update(Jc) - # then update the policy and value net. 
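`PIDLagrangian` itself lives outside this hunk; the algorithms only call `pid_update` once per epoch and read back `cost_penalty` and the three gains for logging. As a toy illustration of the idea (not the actual class), a PID controller on the episodic-cost violation could be sketched as:

class ToyPIDPenalty:
    """Toy PID controller on the cost violation, for illustration only."""

    def __init__(self, pid_kp: float, pid_ki: float, pid_kd: float, cost_limit: float) -> None:
        self.pid_kp, self.pid_ki, self.pid_kd = pid_kp, pid_ki, pid_kd
        self.cost_limit = cost_limit
        self._integral = 0.0
        self._prev_cost = 0.0
        self.cost_penalty = 0.0

    def pid_update(self, ep_cost: float) -> None:
        error = ep_cost - self.cost_limit                  # proportional term
        self._integral = max(0.0, self._integral + error)  # integral term, kept non-negative
        derivative = max(0.0, ep_cost - self._prev_cost)   # derivative term (penalize increases only)
        self._prev_cost = ep_cost
        self.cost_penalty = max(
            0.0,
            self.pid_kp * error + self.pid_ki * self._integral + self.pid_kd * derivative,
        )

`CPPOPid` and `TRPOPid` then feed `cost_penalty` into the same `(adv_r - penalty * adv_c) / (1 + penalty)` combination used by the naive Lagrangian variants.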
- TRPO.update(self) diff --git a/omnisafe/algorithms/on_policy/saute/ppo_lag_saute.py b/omnisafe/algorithms/on_policy/saute/ppo_lag_saute.py index fb79bfeaa..f8b9970ea 100644 --- a/omnisafe/algorithms/on_policy/saute/ppo_lag_saute.py +++ b/omnisafe/algorithms/on_policy/saute/ppo_lag_saute.py @@ -14,10 +14,10 @@ # ============================================================================== """Implementation of the Lagrange version of the Saute algorithm using PPOLag.""" -from typing import NamedTuple - +from omnisafe.adapter import SauteAdapter from omnisafe.algorithms import registry from omnisafe.algorithms.on_policy.naive_lagrange.ppo_lag import PPOLag +from omnisafe.utils import distributed @registry.register @@ -31,18 +31,15 @@ class PPOLagSaute(PPOLag): - URL: `Saute RL`_ """ - def __init__(self, env_id: str, cfgs: NamedTuple) -> None: - """Initialize PPOLagSaute. - - PPOLagSaute is a combination of :class:`PPO` and :class:`Lagrange` model, - using :class:`Saute` as the environment wrapper. - - Args: - env_id (str): The environment id. - cfgs (NamedTuple): The configuration of the algorithm. - """ - super().__init__(env_id=env_id, cfgs=cfgs) - - def _specific_init_logs(self): - super()._specific_init_logs() - self.logger.register_key('Metrics/EpBudget') + def _init_env(self) -> None: + self._env = SauteAdapter(self._env_id, self._cfgs.num_envs, self._seed, self._cfgs) + assert self._cfgs.steps_per_epoch % (distributed.world_size() * self._cfgs.num_envs) == 0, ( + 'The number of steps per epoch is not divisible by the number of ' 'environments.' + ) + self._steps_per_epoch = ( + self._cfgs.steps_per_epoch // distributed.world_size() // self._cfgs.num_envs + ) + + def _init_log(self) -> None: + super()._init_log() + self._logger.register_key('Metrics/EpBudget') diff --git a/omnisafe/algorithms/on_policy/saute/ppo_saute.py b/omnisafe/algorithms/on_policy/saute/ppo_saute.py index c92f5ffc4..7ee288198 100644 --- a/omnisafe/algorithms/on_policy/saute/ppo_saute.py +++ b/omnisafe/algorithms/on_policy/saute/ppo_saute.py @@ -14,10 +14,10 @@ # ============================================================================== """Implementation of the Saute algorithm.""" -from typing import NamedTuple - +from omnisafe.adapter import SauteAdapter from omnisafe.algorithms import registry from omnisafe.algorithms.on_policy.base.ppo import PPO +from omnisafe.utils import distributed @registry.register @@ -31,17 +31,15 @@ class PPOSaute(PPO): - URL: `Saute RL`_ """ - def __init__(self, env_id: str, cfgs: NamedTuple) -> None: - """Initialize PPOSaute. - - PPOSaute is a combination of :class:`PPO` and :class:`Saute`. - - Args: - env_id (str): The environment id. - cfgs (NamedTuple): The configuration of the algorithm. - """ - super().__init__(env_id=env_id, cfgs=cfgs) - - def _specific_init_logs(self): - super()._specific_init_logs() - self.logger.register_key('Metrics/EpBudget') + def _init_env(self) -> None: + self._env = SauteAdapter(self._env_id, self._cfgs.num_envs, self._seed, self._cfgs) + assert self._cfgs.steps_per_epoch % (distributed.world_size() * self._cfgs.num_envs) == 0, ( + 'The number of steps per epoch is not divisible by the number of ' 'environments.' 
+ ) + self._steps_per_epoch = ( + self._cfgs.steps_per_epoch // distributed.world_size() // self._cfgs.num_envs + ) + + def _init_log(self) -> None: + super()._init_log() + self._logger.register_key('Metrics/EpBudget') diff --git a/omnisafe/algorithms/on_policy/second_order/cpo.py b/omnisafe/algorithms/on_policy/second_order/cpo.py index 8118da581..52cfa2ea1 100644 --- a/omnisafe/algorithms/on_policy/second_order/cpo.py +++ b/omnisafe/algorithms/on_policy/second_order/cpo.py @@ -14,16 +14,16 @@ # ============================================================================== """Implementation of the CPO algorithm.""" -from typing import Dict, NamedTuple, Tuple +from typing import Tuple import numpy as np import torch from omnisafe.algorithms import registry from omnisafe.algorithms.on_policy.base.trpo import TRPO -from omnisafe.utils import distributed_utils +from omnisafe.utils import distributed +from omnisafe.utils.math import conjugate_gradients from omnisafe.utils.tools import ( - conjugate_gradients, get_flat_gradients_from, get_flat_params_from, set_param_values_to_model, @@ -42,44 +42,34 @@ class CPO(TRPO): - URL: https://arxiv.org/abs/1705.10528 """ - def __init__(self, env_id: str, cfgs: NamedTuple) -> None: - """Initialize CPO. - - Args: - env_id (str): The environment id. - cfgs (NamedTuple): The configuration of the algorithm. - """ - super().__init__(env_id=env_id, cfgs=cfgs) - self.cost_limit = cfgs.cost_limit - self.loss_pi_cost_before = 0.0 - - def _specific_init_logs(self): - super()._specific_init_logs() - self.logger.register_key('Misc/cost_gradient_norm') - self.logger.register_key('Misc/A') - self.logger.register_key('Misc/B') - self.logger.register_key('Misc/q') - self.logger.register_key('Misc/r') - self.logger.register_key('Misc/s') - self.logger.register_key('Misc/Lambda_star') - self.logger.register_key('Misc/Nu_star') - self.logger.register_key('Misc/OptimCase') + def _init_log(self) -> None: + super()._init_log() + self._logger.register_key('Misc/cost_gradient_norm') + self._logger.register_key('Misc/A') + self._logger.register_key('Misc/B') + self._logger.register_key('Misc/q') + self._logger.register_key('Misc/r') + self._logger.register_key('Misc/s') + self._logger.register_key('Misc/Lambda_star') + self._logger.register_key('Misc/Nu_star') + self._logger.register_key('Misc/OptimCase') # pylint: disable-next=too-many-arguments,too-many-locals - def search_step_size( + def _cpo_search_step( self, - step_dir: torch.Tensor, - g_flat: torch.Tensor, + step_direction: torch.Tensor, + grad: torch.Tensor, p_dist: torch.distributions.Distribution, obs: torch.Tensor, act: torch.Tensor, - log_p: torch.Tensor, - adv: torch.Tensor, - loss_pi_before: float, + logp: torch.Tensor, + adv_r: torch.Tensor, + adv_c: torch.Tensor, + loss_reward_before: float, + loss_cost_before: float, total_steps: int = 15, decay: float = 0.8, - cost_adv: torch.Tensor = None, - c: int = 0, + violation_c: int = 0, optim_case: int = 0, ) -> Tuple[torch.Tensor, int]: r"""Use line-search to find the step size that satisfies the constraint. 
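Before the refactored body of `_cpo_search_step`, it may help to see the backtracking pattern it implements in isolation: start from the full step, shrink it by `decay` until every acceptance check passes, and fall back to a zero step if none does. The sketch below is a simplified stand-in, with the KL, reward-improvement, and cost checks abstracted into a single callable; it is not a drop-in for the method.

from typing import Callable

def backtracking_search(accepts: Callable[[float], bool], total_steps: int = 15, decay: float = 0.8) -> float:
    """Return the largest step fraction that passes all acceptance checks.

    ``accepts(step_frac)`` is assumed to evaluate the candidate step and return
    True only if the KL, reward-improvement, and cost conditions all hold.
    """
    step_frac = 1.0
    for _ in range(total_steps):
        if accepts(step_frac):
            return step_frac      # accept the first (largest) fraction that passes
        step_frac *= decay        # otherwise shrink the step and try again
    return 0.0                    # no acceptable fraction found; keep the old parameters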
@@ -112,104 +102,73 @@ def search_step_size( # get distance each time theta goes towards certain direction step_frac = 1.0 # get and flatten parameters from pi-net - _theta_old = get_flat_params_from(self.actor_critic.actor) + theta_old = get_flat_params_from(self._actor_critic.actor) # reward improvement, g-flat as gradient of reward - expected_rew_improve = g_flat.dot(step_dir) + expected_reward_improve = torch.dot(grad, step_direction) # while not within_trust_region and not finish all steps: - for j in range(total_steps): + for step in range(total_steps): # get new theta - new_theta = _theta_old + step_frac * step_dir + new_theta = theta_old + step_frac * step_direction # set new theta as new actor parameters - set_param_values_to_model(self.actor_critic.actor, new_theta) + set_param_values_to_model(self._actor_critic.actor, new_theta) # the last acceptance steps to next step - acceptance_step = j + 1 + acceptance_step = step + 1 with torch.no_grad(): # loss of policy reward from target/expected reward - loss_pi_rew, _ = self.compute_loss_pi(obs=obs, act=act, log_p=log_p, adv=adv) + loss_reward, _ = self._loss_pi(obs=obs, act=act, logp=logp, adv=adv_r) # loss of cost of policy cost from real/expected reward - loss_pi_cost, _ = self.compute_loss_cost_performance( - obs=obs, act=act, log_p=log_p, cost_adv=cost_adv - ) - self.loss_record.append(loss_pi=(loss_pi_rew.mean() + loss_pi_cost.mean()).item()) + loss_cost = self._loss_pi_cost(obs=obs, act=act, logp=logp, adv_c=adv_c) # compute KL distance between new and old policy - q_dist = self.actor_critic.actor(obs) - torch_kl = torch.distributions.kl.kl_divergence(p_dist, q_dist).mean().item() + q_dist = self._actor_critic.actor(obs) + kl = torch.distributions.kl.kl_divergence(p_dist, q_dist).mean() # compute improvement of reward - loss_rew_improve = loss_pi_before - loss_pi_rew.item() - cost_diff = loss_pi_cost.item() - self.loss_pi_cost_before + loss_reward_improve = loss_reward_before - loss_reward.item() + # compute difference of cost + loss_cost_diff = loss_cost.item() - loss_cost_before # average across MPI processes... - torch_kl = distributed_utils.mpi_avg(torch_kl) + kl = distributed.dist_avg(kl) # pi_average of torch_kl above - loss_rew_improve = distributed_utils.mpi_avg(loss_rew_improve) - cost_diff = distributed_utils.mpi_avg(cost_diff) - menu = (expected_rew_improve, loss_rew_improve) - self.logger.log(f'Expected Improvement: {menu[0]} Actual: {menu[1]}') + loss_reward_improve = distributed.dist_avg(loss_reward_improve) + loss_cost_diff = distributed.dist_avg(loss_cost_diff) + self._logger.log( + f'Expected Improvement: {expected_reward_improve} Actual: {loss_reward_improve}' + ) # check whether there are nan. 
- if not torch.isfinite(loss_pi_rew) and not torch.isfinite(loss_pi_cost): - self.logger.log('WARNING: loss_pi not finite') - elif loss_rew_improve < 0 if optim_case > 1 else False: - self.logger.log('INFO: did not improve improve <0') + if not torch.isfinite(loss_reward) and not torch.isfinite(loss_cost): + self._logger.log('WARNING: loss_pi not finite') + elif loss_reward_improve < 0 if optim_case > 1 else False: + self._logger.log('INFO: did not improve improve <0') # change of cost's range - elif cost_diff > max(-c, 0): - self.logger.log(f'INFO: no improve {cost_diff} > {max(-c, 0)}') + elif loss_cost_diff > max(-violation_c, 0): + self._logger.log(f'INFO: no improve {loss_cost_diff} > {max(-violation_c, 0)}') # check KL-distance to avoid too far gap - elif torch_kl > self.target_kl * 1.5: - self.logger.log(f'INFO: violated KL constraint {torch_kl} at step {j + 1}.') + elif kl > self._cfgs.target_kl * 1.5: + self._logger.log(f'INFO: violated KL constraint {kl} at step {step + 1}.') else: # step only if surrogate is improved and we are # within the trust region - self.logger.log(f'Accept step at i={j + 1}') + self._logger.log(f'Accept step at i={step + 1}') break step_frac *= decay else: # if didn't find a step satisfy those conditions - self.logger.log('INFO: no suitable step found...') - step_dir = torch.zeros_like(step_dir) + self._logger.log('INFO: no suitable step found...') + step_direction = torch.zeros_like(step_direction) acceptance_step = 0 - set_param_values_to_model(self.actor_critic.actor, _theta_old) - return step_frac * step_dir, acceptance_step - - def algorithm_specific_logs(self) -> None: - r"""Log the CPO specific information. - - .. list-table:: - - * - Things to log - - Description - * - Misc/cost_gradient_norm - - The norm of the cost gradient. - * - Misc/q - - The :math:`q` vector, which is the conjugate of Hessian :math:`H`. - * - Misc/r - - The :math:`r` vector, where :math:`r = g^T H^{-1} b`. - * - Misc/s - - The :math:`s` vector, where :math:`s = b^T H^{-1} b` - * - Misc/A - - The A matrix, where :math:`A = q - \frac{r^2}{s}` - * - Misc/B - - The B matrix, where :math:`B = 2 \delta_{KL} - \frac{c^2}{s}` , - where :math:`c` is the cost violation in current epoch, and - :math:`\delta_{KL}` is the target KL divergence. - * - Misc/Lambda_star - - The :math:`\lambda^*` vector. - * - Misc/Nu_star - - The :math:`\nu^*` vector. - * - Misc/OptimCase - - The optimization case. - """ - TRPO.algorithm_specific_logs(self) + set_param_values_to_model(self._actor_critic.actor, theta_old) + return step_frac * step_direction, acceptance_step - def compute_loss_cost_performance( + def _loss_pi_cost( self, obs: torch.Tensor, act: torch.Tensor, - log_p: torch.Tensor, - cost_adv: torch.Tensor, - ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: + logp: torch.Tensor, + adv_c: torch.Tensor, + ) -> torch.Tensor: r"""Compute the performance of cost on this moment. Detailedly, we compute the loss of cost of policy cost from real cost. @@ -224,163 +183,125 @@ def compute_loss_cost_performance( Args: obs (torch.Tensor): Observation. act (torch.Tensor): Action. - log_p (torch.Tensor): Log probability. - cost_adv (torch.Tensor): Cost advantage. + logp (torch.Tensor): Log probability of action. + adv_c (torch.Tensor): Cost advantage. + + Returns: + torch.Tensor: The loss of cost of policy cost from real cost. 
""" - _, _log_p = self.actor_critic.actor(obs, act) - ratio = torch.exp(_log_p - log_p) - cost_loss = (ratio * cost_adv).mean() - info = {} - return cost_loss, info - - # pylint: disable-next=too-many-statements,too-many-locals,too-many-arguments - def update_policy_net( + self._actor_critic.actor(obs) + logp_ = self._actor_critic.actor.log_prob(act) + ratio = torch.exp(logp_ - logp) + cost_loss = (ratio * adv_c).mean() + return cost_loss + + # pylint: disable=invalid-name, too-many-arguments, too-many-locals + def _update_actor( self, obs: torch.Tensor, act: torch.Tensor, - log_p: torch.Tensor, - adv: torch.Tensor, - cost_adv: torch.Tensor, + logp: torch.Tensor, + adv_r: torch.Tensor, + adv_c: torch.Tensor, ) -> None: - """Update policy network. + self._fvp_obs = obs[::4] + theta_old = get_flat_params_from(self._actor_critic.actor) + self._actor_critic.actor.zero_grad() + loss_reward, info = self._loss_pi(obs, act, logp, adv_r) + loss_reward_before = distributed.dist_avg(loss_reward).item() + p_dist = self._actor_critic.actor(obs) + + loss_reward.backward() + distributed.avg_grads(self._actor_critic.actor) + + grad = -get_flat_gradients_from(self._actor_critic.actor) + x = conjugate_gradients(self._fvp, grad, self._cfgs.cg_iters) + assert torch.isfinite(x).all(), 'x is not finite' + xHx = torch.dot(x, self._fvp(x)) + assert xHx.item() >= 0, 'xHx is negative' + alpha = torch.sqrt(2 * self._cfgs.target_kl / (xHx + 1e-8)) + + self._actor_critic.actor_optimizer.zero_grad() + loss_cost = self._loss_pi_cost(obs, act, logp, adv_c) + loss_cost_before = distributed.dist_avg(loss_cost).item() - Constrained Policy Optimization updates policy network using the conjugate gradient algorithm, - following the steps: + loss_cost.backward() + distributed.avg_grads(self._actor_critic.actor) - - Compute the gradient of the policy. - - Compute the step direction. - - Search for a step size that satisfies the constraint. (Both KL divergence and cost limit). - - Update the policy network. + b_grad = get_flat_gradients_from(self._actor_critic.actor) + ep_costs = self._logger.get_stats('Metrics/EpCost')[0] - self._cfgs.cost_limit + cost = ep_costs / (self._logger.get_stats('Metrics/EpLen')[0] + 1e-8) - Args: - obs (torch.Tensor): The observation tensor. - act (torch.Tensor): The action tensor. - log_p (torch.Tensor): The log probability of the action. - adv (torch.Tensor): The advantage tensor. - cost_adv (torch.Tensor): The cost advantage tensor. - """ - # get loss and info values before update - self.fvp_obs = obs[::4] - theta_old = get_flat_params_from(self.actor_critic.actor) - self.actor_optimizer.zero_grad() - # process the advantage function. - processed_adv = self.compute_surrogate(adv=adv, cost_adv=cost_adv) - # compute the loss of policy net. - loss_pi, pi_info = self.compute_loss_pi(obs=obs, act=act, log_p=log_p, adv=processed_adv) - loss_pi_before = distributed_utils.mpi_avg(loss_pi.item()) - # get prob. 
distribution before updates, previous dist of possibilities - p_dist = self.actor_critic.actor(obs) - # train policy with multiple steps of gradient descent - loss_pi.backward() - # average grads across MPI processes - distributed_utils.mpi_avg_grads(self.actor_critic.actor) - g_flat = get_flat_gradients_from(self.actor_critic.actor) - - # flip sign since policy_loss = -(ration * adv) - g_flat *= -1 - # x: g or g_T in original paper, stands for gradient of cost function - x = conjugate_gradients(self.Fvp, g_flat, self.cg_iters) # pylint: disable=invalid-name - assert torch.isfinite(x).all(), 'x is not finite' # pylint: disable=invalid-name - eps = 1.0e-8 - # note that xHx = g^T x, but calculating xHx is faster than g^T x - # equivalent to : g^T x - xHx = torch.dot(x, self.Fvp(x)) # pylint: disable = invalid-name - alpha = torch.sqrt(2 * self.target_kl / (xHx + eps)) - assert xHx.item() >= 0, 'No negative values' # pylint: disable = invalid-name - - # get the policy cost performance gradient b (flat as vector) - self.actor_optimizer.zero_grad() - loss_cost, _ = self.compute_loss_cost_performance( - obs=obs, act=act, log_p=log_p, cost_adv=cost_adv - ) - loss_cost.backward() - # average grads across MPI processes - distributed_utils.mpi_avg_grads(self.actor_critic.actor) - self.loss_pi_cost_before = loss_cost.item() - b_flat = get_flat_gradients_from(self.actor_critic.actor) - # :param ep_costs: do samplings to get approximate costs as ep_costs - ep_costs = self.logger.get_stats('Metrics/EpCost')[0] - # :params c: how much sampled result of cost goes beyond limit - cost = ep_costs - self.cost_limit - # Rescale, and add small float to avoid nan - cost /= self.logger.get_stats('Metrics/EpLen')[0] + eps # rescale - - # set variable names as used in the paper with conjugate_gradient method, - # used to solve equation(compute Hessian Matrix) instead of Natural Gradient - - p = conjugate_gradients(self.Fvp, b_flat, self.cg_iters) # pylint: disable = invalid-name - q = xHx # pylint: disable = invalid-name - r = g_flat.dot(p) # pylint: disable = invalid-name - s = b_flat.dot(p) # pylint: disable = invalid-name - - # optim_case: divided into 5 kinds to compute - if b_flat.dot(b_flat) <= 1e-6 and cost < 0: + p = conjugate_gradients(self._fvp, b_grad, self._cfgs.cg_iters) + q = xHx + r = torch.dot(grad, p) + s = torch.dot(b_grad, p) + + if torch.dot(b_grad, b_grad) <= 1e-6 and cost < 0: # feasible step and cost grad is zero: use plain TRPO update... 
- A = torch.zeros(1) # pylint: disable = invalid-name - B = torch.zeros(1) # pylint: disable = invalid-name + A = torch.zeros(1) + B = torch.zeros(1) optim_case = 4 else: assert torch.isfinite(r).all(), 'r is not finite' assert torch.isfinite(s).all(), 's is not finite' - # A,b: mathematical value, not too much true meaning - A = q - r**2 / s # pylint: disable = invalid-name - B = 2 * self.target_kl - cost**2 / s # pylint: disable = invalid-name + A = q - r**2 / s + B = 2 * self._cfgs.target_kl - cost**2 / s if cost < 0 and B < 0: # point in trust region is feasible and safety boundary doesn't intersect # ==> entire trust region is feasible optim_case = 3 - elif cost < 0 and B >= 0: # pylint: disable=chained-comparison - # x = 0 is feasible and safety boundary intersects - # ==> most of trust region is feasible + elif cost < 0 <= B: + # point in trust region is feasible but safety boundary intersects + # ==> only part of trust region is feasible optim_case = 2 elif cost >= 0 and B >= 0: - # x = 0 is infeasible and safety boundary intersects - # ==> part of trust region is feasible, recovery possible + # point in trust region is infeasible and cost boundary doesn't intersect + # ==> entire trust region is infeasible optim_case = 1 - self.logger.log('Alert! Attempting feasible recovery!', 'yellow') + self._logger.log('Alert! Attempting feasible recovery!', 'yellow') else: # x = 0 infeasible, and safety half space is outside trust region # ==> whole trust region is infeasible, try to fail gracefully optim_case = 0 - self.logger.log('Alert! Attempting infeasible recovery!', 'red') + self._logger.log('Alert! Attempting infeasible recovery!', 'red') - # the following computes required nu_star and lambda_star - if optim_case in [3, 4]: + if optim_case in (3, 4): # under 3 and 4 cases directly use TRPO method - alpha = torch.sqrt( - 2 * self.target_kl / (xHx + 1e-8) - ) # step gap fixed by KKT condition in conjugate algorithm + alpha = torch.sqrt(2 * self._cfgs.target_kl / (xHx + 1e-8)) nu_star = torch.zeros(1) lambda_star = 1 / alpha - step_dir = alpha * x # change step direction to gap * gradient + step_direction = alpha * x - elif optim_case in [1, 2]: - # in 1 and 2, - def project_on_set(data: torch.Tensor, low: float, high: float) -> torch.Tensor: - return torch.Tensor([max(low, min(data, high))]) + elif optim_case in (1, 2): + + def project(data: torch.Tensor, low: float, high: float) -> torch.Tensor: + """Project data to [low, high] interval.""" + return torch.max(torch.min(data, torch.tensor(high)), torch.tensor(low)) # analytical Solution to LQCLP, employ lambda,nu to compute final solution of OLOLQC # λ=argmax(f_a(λ),f_b(λ)) = λa_star or λb_star # computing formula shown in appendix, lambda_a and lambda_b lambda_a = torch.sqrt(A / B) - lambda_b = torch.sqrt(q / (2 * self.target_kl)) + lambda_b = torch.sqrt(q / (2 * self._cfgs.target_kl)) # λa_star = Proj(lambda_a ,0 ~ r/c) λb_star=Proj(lambda_b,r/c~ +inf) # where projection(str,b,c)=max(b,min(str,c)) # may be regarded as a projection from effective region towards safety region + r_num = r.item() if cost < 0: - lambda_a_star = project_on_set(lambda_a, 0.0, r / cost) - lambda_b_star = project_on_set(lambda_b, r / cost, np.inf) + lambda_a_star = project(lambda_a, 0.0, r_num / cost) + lambda_b_star = project(lambda_b, r_num / cost, np.inf) else: - lambda_a_star = project_on_set(lambda_a, r / cost, np.inf) - lambda_b_star = project_on_set(lambda_b, 0.0, r / cost) + lambda_a_star = project(lambda_a, r_num / cost, np.inf) + lambda_b_star = 
project(lambda_b, 0.0, r_num / cost) def f_a(lam): - return -0.5 * (A / (lam + eps) + B * lam) - r * cost / (s + eps) + return -0.5 * (A / (lam + 1e-8) + B * lam) - r * cost / (s + 1e-8) def f_b(lam): - return -0.5 * (q / (lam + eps) + 2 * self.target_kl * lam) + return -0.5 * (q / (lam + 1e-8) + 2 * self._cfgs.target_kl * lam) lambda_star = ( lambda_a_star if f_a(lambda_a_star) >= f_b(lambda_b_star) else lambda_b_star @@ -388,45 +309,54 @@ def f_b(lam): # discard all negative values with torch.clamp(x, min=0) # Nu_star = (lambda_star * - r)/s - nu_star = torch.clamp(lambda_star * cost - r, min=0) / (s + eps) + nu_star = torch.clamp(lambda_star * cost - r, min=0) / (s + 1e-8) # final x_star as final direction played as policy's loss to backward and update - step_dir = 1.0 / (lambda_star + eps) * (x - nu_star * p) + step_direction = 1.0 / (lambda_star + 1e-8) * (x - nu_star * p) else: # case == 0 # purely decrease costs # without further check lambda_star = torch.zeros(1) - nu_star = np.sqrt(2 * self.target_kl / (s + eps)) - step_dir = -nu_star * p - - final_step_dir, accept_step = self.search_step_size( - step_dir, - g_flat, - c=cost, - loss_pi_before=loss_pi_before, - optim_case=optim_case, + nu_star = np.sqrt(2 * self._cfgs.target_kl / (s + 1e-8)) + step_direction = -nu_star * p + + step_direction, accept_step = self._cpo_search_step( + step_direction=step_direction, + grad=grad, p_dist=p_dist, obs=obs, act=act, - log_p=log_p, - adv=adv, - cost_adv=cost_adv, + logp=logp, + adv_r=adv_r, + adv_c=adv_c, + loss_reward_before=loss_reward_before, + loss_cost_before=loss_cost_before, total_steps=20, + violation_c=cost, + optim_case=optim_case, ) - # update actor network parameters - new_theta = theta_old + final_step_dir - set_param_values_to_model(self.actor_critic.actor, new_theta) - self.logger.store( + + theta_new = theta_old + step_direction + set_param_values_to_model(self._actor_critic.actor, theta_new) + + with torch.no_grad(): + loss_reward, info = self._loss_pi(obs, act, logp, adv_r) + loss_cost = self._loss_pi_cost(obs, act, logp, adv_c) + loss = loss_reward + loss_cost + + self._logger.store( **{ - 'Train/Entropy': pi_info['ent'], - 'Train/PolicyRatio': pi_info['ratio'], + 'Loss/Loss_pi': loss.item(), + 'Train/Entropy': info['entrophy'], + 'Train/PolicyRatio': info['ratio'], + 'Train/PolicyStd': info['std'], 'Misc/AcceptanceStep': accept_step, 'Misc/Alpha': alpha.item(), - 'Misc/FinalStepNorm': final_step_dir.norm().mean().item(), + 'Misc/FinalStepNorm': step_direction.norm().mean().item(), 'Misc/xHx': xHx.mean().item(), 'Misc/H_inv_g': x.norm().item(), # H^-1 g - 'Misc/gradient_norm': torch.norm(g_flat).mean().item(), - 'Misc/cost_gradient_norm': torch.norm(b_flat).mean().item(), + 'Misc/gradient_norm': torch.norm(grad).mean().item(), + 'Misc/cost_gradient_norm': torch.norm(b_grad).mean().item(), 'Misc/Lambda_star': lambda_star.item(), 'Misc/Nu_star': nu_star.item(), 'Misc/OptimCase': int(optim_case), diff --git a/omnisafe/algorithms/on_policy/second_order/pcpo.py b/omnisafe/algorithms/on_policy/second_order/pcpo.py index bf548c728..d69ae6cea 100644 --- a/omnisafe/algorithms/on_policy/second_order/pcpo.py +++ b/omnisafe/algorithms/on_policy/second_order/pcpo.py @@ -14,15 +14,13 @@ # ============================================================================== """Implementation of the PCPO algorithm.""" -from typing import Dict, NamedTuple, Tuple - import torch from omnisafe.algorithms import registry -from omnisafe.algorithms.on_policy.base.trpo import TRPO -from omnisafe.utils 
import distributed_utils +from omnisafe.algorithms.on_policy.second_order.cpo import CPO +from omnisafe.utils import distributed +from omnisafe.utils.math import conjugate_gradients from omnisafe.utils.tools import ( - conjugate_gradients, get_flat_gradients_from, get_flat_params_from, set_param_values_to_model, @@ -30,7 +28,7 @@ @registry.register -class PCPO(TRPO): +class PCPO(CPO): """The Projection-Based Constrained Policy Optimization (PCPO) algorithm. References: @@ -39,196 +37,14 @@ class PCPO(TRPO): URL:`PCPO _` """ - def __init__(self, env_id: str, cfgs: NamedTuple) -> None: - """Initialize PCPO. - - PCPO is a derivative of TRPO. - - Args: - env_id (str): The environment id. - cfgs (NamedTuple): The configuration of the algorithm. - """ - super().__init__(env_id=env_id, cfgs=cfgs) - self.cost_limit = self.cfgs.cost_limit - - def _specific_init_logs(self): - super()._specific_init_logs() - self.logger.register_key('Misc/cost_gradient_norm') - self.logger.register_key('Misc/A') - self.logger.register_key('Misc/B') - self.logger.register_key('Misc/q') - self.logger.register_key('Misc/r') - self.logger.register_key('Misc/s') - self.logger.register_key('Misc/Lambda_star') - self.logger.register_key('Misc/Nu_star') - self.logger.register_key('Misc/OptimCase') - # pylint: disable-next=too-many-locals,too-many-arguments - def adjust_cpo_step_direction( + def _update_actor( self, - step_dir: torch.Tensor, - g_flat: torch.Tensor, - cost: torch.Tensor, - optim_case: int, - p_dist: torch.distributions.Distribution, obs: torch.Tensor, act: torch.Tensor, - log_p: torch.Tensor, - adv: torch.Tensor, - cost_adv: torch.Tensor, - loss_pi_before: torch.Tensor, - loss_pi_cost_before: torch.Tensor, - total_steps: int = 25, - decay: float = 0.8, - ) -> Tuple[torch.Tensor, int]: - r"""Use line-search to find the step size that satisfies the constraint. - - PCPO uses line-search to find the step size that satisfies the constraint. - The constraint is defined as: - - .. math:: - J^C(\theta + \alpha \delta) - J^C(\theta) \leq \max \{0, c\}\\ - D_{KL}(\pi_{\theta}(\cdot|s) || \pi_{\theta + \alpha \delta}(\cdot|s)) \leq \delta_{KL} - - where :math:`\delta_{KL}` is the constraint of KL divergence, :math:`\alpha` is the step size, - :math:`c` is the violation of constraint. - - Args: - step_dir (torch.Tensor): The step direction. - g_flat (torch.Tensor): The gradient of the policy. - p_dist (torch.distributions.Distribution): The old policy distribution. - obs (torch.Tensor): The observation. - act (torch.Tensor): The action. - log_p (torch.Tensor): The log probability of the action. - adv (torch.Tensor): The advantage. - cost_adv (torch.Tensor): The cost advantage. - loss_pi_before (torch.Tensor): The loss of the policy before the step. - loss_pi_cost_before (torch.Tensor): The loss of the cost before the step. - total_steps (int, optional): The total steps of line-search. Defaults to 25. - decay (float, optional): The decay of step size. Defaults to 0.8. 
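Both second-order hunks now import `conjugate_gradients` from `omnisafe.utils.math` to solve `H x = g` using only Fisher-vector products. As a reminder of the technique (a generic sketch, not the library function), conjugate gradient for a symmetric positive-definite operator looks like:

from typing import Callable

import torch

def conjugate_gradient(
    Avp: Callable[[torch.Tensor], torch.Tensor],
    b: torch.Tensor,
    num_iters: int = 10,
    residual_tol: float = 1e-10,
) -> torch.Tensor:
    """Approximately solve A x = b for symmetric positive-definite A.

    ``Avp`` returns the product A @ v, so the full matrix is never built,
    which is what makes the trick practical for Fisher-vector products.
    """
    x = torch.zeros_like(b)
    r = b.clone()                 # residual b - A @ x, with x = 0 initially
    p = r.clone()                 # current search direction
    rdotr = torch.dot(r, r)
    for _ in range(num_iters):
        Ap = Avp(p)
        alpha = rdotr / (torch.dot(p, Ap) + 1e-8)
        x += alpha * p
        r -= alpha * Ap
        new_rdotr = torch.dot(r, r)
        if new_rdotr < residual_tol:
            break
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
    return x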
- """ - step_frac = 1.0 - _theta_old = get_flat_params_from(self.actor_critic.actor) - expected_rew_improve = g_flat.dot(step_dir) - - # while not within_trust_region: - for j in range(total_steps): - new_theta = _theta_old + step_frac * step_dir - set_param_values_to_model(self.actor_critic.actor, new_theta) - acceptance_step = j + 1 - - with torch.no_grad(): - # loss of policy reward from target/expected reward - loss_pi_rew, _ = self.compute_loss_pi(obs=obs, act=act, log_p=log_p, adv=adv) - # loss of cost of policy cost from real/expected reward - loss_pi_cost, _ = self.compute_loss_cost_performance( - obs=obs, act=act, log_p=log_p, cost_adv=cost_adv - ) - self.loss_record.append(loss_pi=(loss_pi_rew.mean() + loss_pi_cost.mean()).item()) - # determine KL div between new and old policy - q_dist = self.actor_critic.actor(obs) - torch_kl = torch.distributions.kl.kl_divergence(p_dist, q_dist).mean().item() - loss_rew_improve = loss_pi_before - loss_pi_rew.item() - cost_diff = loss_pi_cost.item() - loss_pi_cost_before - - # average across MPI processes... - torch_kl = distributed_utils.mpi_avg(torch_kl) - loss_rew_improve = distributed_utils.mpi_avg(loss_rew_improve) - cost_diff = distributed_utils.mpi_avg(cost_diff) - menu = (expected_rew_improve, loss_rew_improve) - self.logger.log(f'Expected Improvement: {menu[0]} Actual: {menu[1]}') - - if not torch.isfinite(loss_pi_rew) and not torch.isfinite(loss_pi_cost): - self.logger.log('WARNING: loss_pi not finite') - elif loss_rew_improve < 0 if optim_case > 1 else False: - self.logger.log('INFO: did not improve improve <0') - - elif cost_diff > max(-cost, 0): - self.logger.log(f'INFO: no improve {cost_diff} > {max(-cost, 0)}') - elif torch_kl > self.target_kl * 1.5: - self.logger.log(f'INFO: violated KL constraint {torch_kl} at step {j + 1}.') - else: - # step only if surrogate is improved and we are - # within the trust region - self.logger.log(f'Accept step at i={j + 1}') - break - step_frac *= decay - else: - self.logger.log('INFO: no suitable step found...') - step_dir = torch.zeros_like(step_dir) - acceptance_step = 0 - - set_param_values_to_model(self.actor_critic.actor, _theta_old) - return step_frac * step_dir, acceptance_step - - def algorithm_specific_logs(self) -> None: - r"""Log the PCPO specific information. - - .. list-table:: - - * - Things to log - - Description - * - Misc/cost_gradient_norm - - The norm of the cost gradient. - * - Misc/q - - The :math:`q` vector, which is the conjugate of Hessian :math:`H`. - * - Misc/r - - The :math:`r` vector, where :math:`r = g^T H^{-1} b`. - * - Misc/s - - The :math:`s` vector, where :math:`s = b^T H^{-1} b` - * - Misc/A - - The A matrix, where :math:`A = q - \frac{r^2}{s}` - * - Misc/B - - The B matrix, where :math:`B = 2 \delta_{KL} - \frac{c^2}{s}` , - where :math:`c` is the cost violation in current epoch, and - :math:`\delta_{KL}` is the target KL divergence. - * - Misc/Lambda_star - - The :math:`\lambda^*` vector. - * - Misc/Nu_star - - The :math:`\nu^*` vector. - * - Misc/OptimCase - - The optimization case. - """ - TRPO.algorithm_specific_logs(self) - - def compute_loss_cost_performance( - self, - obs: torch.Tensor, - act: torch.Tensor, - log_p: torch.Tensor, - cost_adv: torch.Tensor, - ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: - r"""Compute the performance of cost on this moment. - - Detailedly, we compute the loss of cost of policy cost from real cost. - - .. 
math:: - L = \mathbb{E}_{\pi} \left[ \frac{\pi(a|s)}{\pi_{old}(a|s)} A^C(s, a) \right] - - where :math:`A^C(s, a)` is the cost advantage, - :math:`\pi_{old}(a|s)` is the old policy, - :math:`\pi(a|s)` is the current policy. - - Args: - obs (torch.Tensor): Observation. - act (torch.Tensor): Action. - log_p (torch.Tensor): Log probability. - cost_adv (torch.Tensor): Cost advantage. - """ - _, _log_p = self.actor_critic.actor(obs, act) - ratio = torch.exp(_log_p - log_p) - cost_loss = (ratio * cost_adv).mean() - info = {} - return cost_loss, info - - # pylint: disable-next=too-many-locals,too-many-arguments - def update_policy_net( - self, - obs: torch.Tensor, - act: torch.Tensor, - log_p: torch.Tensor, - adv: torch.Tensor, - cost_adv: torch.Tensor, + logp: torch.Tensor, + adv_r: torch.Tensor, + adv_c: torch.Tensor, ) -> None: """Update policy network. @@ -247,96 +63,88 @@ def update_policy_net( adv (torch.Tensor): The advantage tensor. cost_adv (torch.Tensor): The cost advantage tensor. """ - self.fvp_obs = obs[::4] - theta_old = get_flat_params_from(self.actor_critic.actor) - self.actor_optimizer.zero_grad() - # process the advantage function. - processed_adv = self.compute_surrogate(adv=adv, cost_adv=cost_adv) - # compute the loss of policy net. - loss_pi, pi_info = self.compute_loss_pi(obs=obs, act=act, log_p=log_p, adv=processed_adv) - loss_pi_before = loss_pi.item() - # get prob. distribution before updates - p_dist = self.actor_critic.actor(obs) - # train policy with multiple steps of gradient descent - loss_pi.backward() - # average grads across MPI processes - distributed_utils.mpi_avg_grads(self.actor_critic.actor) - g_flat = get_flat_gradients_from(self.actor_critic.actor) - - # flip sign since policy_loss = -(ration * adv) - g_flat *= -1 - x = conjugate_gradients(self.Fvp, g_flat, self.cg_iters) # pylint: disable = invalid-name + # pylint: disable=invalid-name + self._fvp_obs = obs[::4] + theta_old = get_flat_params_from(self._actor_critic.actor) + self._actor_critic.actor.zero_grad() + loss_reward, info = self._loss_pi(obs, act, logp, adv_r) + loss_reward_before = distributed.dist_avg(loss_reward).item() + p_dist = self._actor_critic.actor(obs) + + loss_reward.backward() + distributed.avg_grads(self._actor_critic.actor) + + grad = -get_flat_gradients_from(self._actor_critic.actor) + x = conjugate_gradients(self._fvp, grad, self._cfgs.cg_iters) assert torch.isfinite(x).all(), 'x is not finite' - eps = 1.0e-8 - # note that xHx = g^T x, but calculating xHx is faster than g^T x - xHx = torch.dot(x, self.Fvp(x)) # pylint: disable = invalid-name - H_inv_g = self.Fvp(x) # pylint: disable = invalid-name - alpha = torch.sqrt(2 * self.target_kl / (xHx + eps)) - assert xHx.item() >= 0, 'No negative values' + xHx = torch.dot(x, self._fvp(x)) + H_inv_g = self._fvp(x) + assert xHx.item() >= 0, 'xHx is negative' + alpha = torch.sqrt(2 * self._cfgs.target_kl / (xHx + 1e-8)) + + self._actor_critic.actor_optimizer.zero_grad() + loss_cost = self._loss_pi_cost(obs, act, logp, adv_c) + loss_cost_before = distributed.dist_avg(loss_cost).item() - # get the policy cost performance gradient b (flat as vector) - self.actor_optimizer.zero_grad() - loss_cost, _ = self.compute_loss_cost_performance( - obs=obs, act=act, log_p=log_p, cost_adv=cost_adv - ) loss_cost.backward() - # average grads across MPI processes - distributed_utils.mpi_avg_grads(self.actor_critic.actor) - loss_pi_cost_before = loss_cost.item() - b_flat = get_flat_gradients_from(self.actor_critic.actor) + 
distributed.avg_grads(self._actor_critic.actor) + + b_grad = get_flat_gradients_from(self._actor_critic.actor) + ep_costs = self._logger.get_stats('Metrics/EpCost')[0] - self._cfgs.cost_limit + cost = ep_costs / (self._logger.get_stats('Metrics/EpLen')[0] + 1e-8) - ep_costs = self.logger.get_stats('Metrics/EpCost')[0] - cost = ep_costs - self.cost_limit - cost /= self.logger.get_stats('Metrics/EpLen')[0] + eps # rescale - self.logger.log(f'c = {cost}') - self.logger.log(f'b^T b = {b_flat.dot(b_flat).item()}') + self._logger.log(f'c = {cost}') + self._logger.log(f'b^T b = {b_grad.dot(b_grad).item()}') - # set variable names as used in the paper - p = conjugate_gradients(self.Fvp, b_flat, self.cg_iters) # pylint: disable = invalid-name - q = xHx # pylint: disable = invalid-name - # g^T H^{-1} b - r = g_flat.dot(p) # pylint: disable = invalid-name - # b^T H^{-1} b - s = b_flat.dot(p) # pylint: disable = invalid-name - step_dir = ( - torch.sqrt(2 * self.target_kl / (q + 1e-8)) * H_inv_g + p = conjugate_gradients(self._fvp, b_grad, self._cfgs.cg_iters) + q = xHx + r = torch.dot(grad, p) + s = torch.dot(b_grad, p) + + step_direction = ( + torch.sqrt(2 * self._cfgs.target_kl / (q + 1e-8)) * H_inv_g - torch.clamp_min( - (torch.sqrt(2 * self.target_kl / q) * r + cost) / s, - torch.tensor(0.0, device=self.cfgs.device), + (torch.sqrt(2 * self._cfgs.target_kl / q) * r + cost) / s, + torch.tensor(0.0, device=self._device), ) * p ) # pylint: disable = invalid-name - final_step_dir, accept_step = self.adjust_cpo_step_direction( - step_dir, - g_flat, - cost=cost, - optim_case=2, + step_direction, accept_step = self._cpo_search_step( + step_direction=step_direction, + grad=grad, p_dist=p_dist, obs=obs, act=act, - log_p=log_p, - adv=adv, - cost_adv=cost_adv, - loss_pi_before=loss_pi_before, - loss_pi_cost_before=loss_pi_cost_before, + logp=logp, + adv_r=adv_r, + adv_c=adv_c, + loss_reward_before=loss_reward_before, + loss_cost_before=loss_cost_before, total_steps=20, + violation_c=cost, ) - # update actor network parameters - new_theta = theta_old + final_step_dir - set_param_values_to_model(self.actor_critic.actor, new_theta) + theta_new = theta_old + step_direction + set_param_values_to_model(self._actor_critic.actor, theta_new) + + with torch.no_grad(): + loss_reward, info = self._loss_pi(obs, act, logp, adv_r) + loss_cost = self._loss_pi_cost(obs, act, logp, adv_c) + loss = loss_reward + loss_cost - self.logger.store( + self._logger.store( **{ - 'Train/Entropy': pi_info['ent'], - 'Train/PolicyRatio': pi_info['ratio'], + 'Loss/Loss_pi': loss.item(), + 'Train/Entropy': info['entrophy'], + 'Train/PolicyRatio': info['ratio'], + 'Train/PolicyStd': info['std'], 'Misc/AcceptanceStep': accept_step, 'Misc/Alpha': alpha.item(), - 'Misc/FinalStepNorm': final_step_dir.norm().mean().item(), + 'Misc/FinalStepNorm': step_direction.norm().mean().item(), 'Misc/xHx': xHx.mean().item(), 'Misc/H_inv_g': x.norm().item(), # H^-1 g - 'Misc/gradient_norm': torch.norm(g_flat).mean().item(), - 'Misc/cost_gradient_norm': torch.norm(b_flat).mean().item(), + 'Misc/gradient_norm': torch.norm(grad).mean().item(), + 'Misc/cost_gradient_norm': torch.norm(b_grad).mean().item(), 'Misc/Lambda_star': 1.0, 'Misc/Nu_star': 1.0, 'Misc/OptimCase': int(1), diff --git a/omnisafe/algorithms/on_policy/simmer/__init__.py b/omnisafe/algorithms/on_policy/simmer/__init__.py deleted file mode 100644 index 91b2cca64..000000000 --- a/omnisafe/algorithms/on_policy/simmer/__init__.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. 
All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Simmer algorithms.""" - -from omnisafe.algorithms.on_policy.simmer.ppo_lag_simmer_pid import PPOLagSimmerPid -from omnisafe.algorithms.on_policy.simmer.ppo_lag_simmer_q import PPOLagSimmerQ -from omnisafe.algorithms.on_policy.simmer.ppo_simmer_pid import PPOSimmerPid -from omnisafe.algorithms.on_policy.simmer.ppo_simmer_q import PPOSimmerQ - - -__all__ = [ - 'PPOLagSimmerPid', - 'PPOLagSimmerQ', - 'PPOSimmerPid', - 'PPOSimmerQ', -] diff --git a/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_pid.py b/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_pid.py deleted file mode 100644 index 302b65a47..000000000 --- a/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_pid.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Implementation of the PID version of the Simmer algorithm using PPOLag.""" - -from typing import NamedTuple - -from omnisafe.algorithms import registry -from omnisafe.algorithms.on_policy.naive_lagrange.ppo_lag import PPOLag - - -@registry.register -class PPOLagSimmerPid(PPOLag): - """The PID version of the Simmer algorithm implemented with PPOLag. - - References: - - Title: Effects of Safety State Augmentation on Safe Exploration - - Authors: Aivar Sootla, Alexander I. Cowen-Rivers, Jun Wang, Haitham Bou Ammar. - - URL: `Simmer RL `_ - """ - - def __init__(self, env_id: str, cfgs: NamedTuple) -> None: - """Initialize PPOLagSimmerPid. - - PPOLagSimmerPid is a combination of :class:`PPO` and :class:`Lagrange` model, - using :class:`Simmer` as the environment wrapper. - - Args: - env_id (str): The environment id. - cfgs (NamedTuple): The configuration of the algorithm. - """ - super().__init__(env_id=env_id, cfgs=cfgs) - - def _specific_init_logs(self): - super()._specific_init_logs() - self.logger.register_key('Metrics/EpBudget') - self.logger.register_key('Metrics/SafetyBudget') diff --git a/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_q.py b/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_q.py deleted file mode 100644 index 731d954b8..000000000 --- a/omnisafe/algorithms/on_policy/simmer/ppo_lag_simmer_q.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Implementation of the Q Simmer algorithm using PPOLag.""" - -from typing import NamedTuple - -from omnisafe.algorithms import registry -from omnisafe.algorithms.on_policy.naive_lagrange.ppo_lag import PPOLag - - -@registry.register -class PPOLagSimmerQ(PPOLag): - """The Q Simmer algorithm implemented with PPOLag. - - References: - - Title: Effects of Safety State Augmentation on Safe Exploration - - Authors: Aivar Sootla, Alexander I. Cowen-Rivers, Jun Wang, Haitham Bou Ammar. - - URL: `Simmer RL `_ - """ - - def __init__(self, env_id: str, cfgs: NamedTuple) -> None: - """Initialize PPOLagSimmerQ. - - PPOLagSimmerQ is a combination of :class:`PPO` and :class:`Lagrange` model, - using :class:`Simmer` as the environment wrapper. - - Args: - env_id (str): The environment id. - cfgs (NamedTuple): The configuration of the algorithm. - """ - super().__init__(env_id=env_id, cfgs=cfgs) - - def _specific_init_logs(self): - super()._specific_init_logs() - self.logger.register_key('Metrics/EpBudget') - self.logger.register_key('Metrics/SafetyBudget') diff --git a/omnisafe/algorithms/on_policy/simmer/ppo_simmer_pid.py b/omnisafe/algorithms/on_policy/simmer/ppo_simmer_pid.py deleted file mode 100644 index 6c0bc33c8..000000000 --- a/omnisafe/algorithms/on_policy/simmer/ppo_simmer_pid.py +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Implementation of the PID version of the Simmer algorithm using PPO.""" - -from typing import NamedTuple - -from omnisafe.algorithms import registry -from omnisafe.algorithms.on_policy.base.ppo import PPO - - -@registry.register -class PPOSimmerPid(PPO): - """The PID version of the Simmer algorithm implemented with PPO. - - References: - - Title: Effects of Safety State Augmentation on Safe Exploration - - Authors: Aivar Sootla, Alexander I. Cowen-Rivers, Jun Wang, Haitham Bou Ammar. - - URL: `Simmer RL `_ - """ - - def __init__(self, env_id: str, cfgs: NamedTuple) -> None: - """Initialize PPOSimmerPid. - - PPOSimmerPid is a combination of :class:`PPO` and :class:`Simmer` environment wrapper. - - Args: - env_id (str): The environment id. - cfgs (NamedTuple): The configuration of the algorithm. 
- """ - super().__init__(env_id=env_id, cfgs=cfgs) - - def _specific_init_logs(self): - super()._specific_init_logs() - self.logger.register_key('Metrics/EpBudget') - self.logger.register_key('Metrics/SafetyBudget') diff --git a/omnisafe/algorithms/on_policy/simmer/ppo_simmer_q.py b/omnisafe/algorithms/on_policy/simmer/ppo_simmer_q.py deleted file mode 100644 index 2e6589433..000000000 --- a/omnisafe/algorithms/on_policy/simmer/ppo_simmer_q.py +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Implementation of the Q Simmer algorithm using PPO.""" - -from typing import NamedTuple - -from omnisafe.algorithms import registry -from omnisafe.algorithms.on_policy.base.ppo import PPO - - -@registry.register -class PPOSimmerQ(PPO): - """The Q Simmer algorithm implemented with PPO. - - References: - - Title: Effects of Safety State Augmentation on Safe Exploration - - Authors: Aivar Sootla, Alexander I. Cowen-Rivers, Jun Wang, Haitham Bou Ammar. - - URL: `Simmer RL `_ - """ - - def __init__(self, env_id: str, cfgs: NamedTuple) -> None: - """Initialize PPOSimmerQ. - - PPOSimmerQ is a combination of :class:`PPO` and :class:`Simmer` environment wrapper. - - Args: - env_id (str): The environment id. - cfgs (NamedTuple): The configuration of the algorithm. 
- """ - super().__init__(env_id=env_id, cfgs=cfgs) - - def _specific_init_logs(self): - super()._specific_init_logs() - self.logger.register_key('Metrics/EpBudget') - self.logger.register_key('Metrics/SafetyBudget') diff --git a/omnisafe/common/buffer/onpolicy_buffer.py b/omnisafe/common/buffer/onpolicy_buffer.py index 4c159dafb..cd858ad66 100644 --- a/omnisafe/common/buffer/onpolicy_buffer.py +++ b/omnisafe/common/buffer/onpolicy_buffer.py @@ -20,9 +20,8 @@ from omnisafe.common.buffer.base import BaseBuffer from omnisafe.typing import AdvatageEstimator, OmnisafeSpace -from omnisafe.utils import distributed_utils -from omnisafe.utils.core import discount_cumsum_torch -from omnisafe.utils.vtrace import calculate_v_trace +from omnisafe.utils import distributed +from omnisafe.utils.math import discount_cumsum class OnPolicyBuffer(BaseBuffer): # pylint: disable=too-many-instance-attributes @@ -95,14 +94,14 @@ def finish_path( ) -> None: """Finish the current path and calculate the advantages of state-action pairs.""" path_slice = slice(self.path_start_idx, self.ptr) - last_value_r = last_value_r.to(self.device) - last_value_c = last_value_c.to(self.device) + last_value_r = last_value_r.to(self._device) + last_value_c = last_value_c.to(self._device) rewards = torch.cat([self.data['reward'][path_slice], last_value_r]) values_r = torch.cat([self.data['value_r'][path_slice], last_value_r]) costs = torch.cat([self.data['cost'][path_slice], last_value_c]) values_c = torch.cat([self.data['value_c'][path_slice], last_value_c]) - discountred_ret = discount_cumsum_torch(rewards, self._gamma)[:-1] + discountred_ret = discount_cumsum(rewards, self._gamma)[:-1] self.data['discounted_ret'][path_slice] = discountred_ret rewards -= self._penalty_coefficient * costs @@ -122,7 +121,6 @@ def finish_path( def get(self) -> Dict[str, torch.Tensor]: """Get the data in the buffer.""" - assert self.ptr == self.max_size, 'The buffer is not full!' 
self.ptr, self.path_start_idx = 0, 0 data = { @@ -136,11 +134,11 @@ def get(self) -> Dict[str, torch.Tensor]: 'target_value_c': self.data['target_value_c'], } - self.data['adv_r'] = torch.zeros_like(self.data['adv_r']) - self.data['adv_c'] = torch.zeros_like(self.data['adv_c']) + # self.data['adv_r'] = torch.zeros_like(self.data['adv_r']) + # self.data['adv_c'] = torch.zeros_like(self.data['adv_c']) - adv_mean, adv_std, *_ = distributed_utils.mpi_statistics_scalar(data['adv_r']) - cadv_mean, *_ = distributed_utils.mpi_statistics_scalar(data['adv_c']) + adv_mean, adv_std, *_ = distributed.dist_statistics_scalar(data['adv_r']) + cadv_mean, *_ = distributed.dist_statistics_scalar(data['adv_c']) if self._standardized_adv_r: data['adv_r'] = (data['adv_r'] - adv_mean) / (adv_std + 1e-8) if self._standardized_adv_c: @@ -206,15 +204,15 @@ def _calculate_adv_and_value_targets( if self._advantage_estimator == 'gae': # GAE formula: A_t = \sum_{k=0}^{n-1} (lam*gamma)^k delta_{t+k} deltas = rewards[:-1] + self._gamma * values[1:] - values[:-1] - adv = discount_cumsum_torch(deltas, self._gamma * lam) + adv = discount_cumsum(deltas, self._gamma * lam) target_value = adv + values[:-1] elif self._advantage_estimator == 'gae-rtg': # GAE formula: A_t = \sum_{k=0}^{n-1} (lam*gamma)^k delta_{t+k} deltas = rewards[:-1] + self._gamma * values[1:] - values[:-1] - adv = discount_cumsum_torch(deltas, self._gamma * lam) + adv = discount_cumsum(deltas, self._gamma * lam) # compute rewards-to-go, to be targets for the value function update - target_value = discount_cumsum_torch(rewards, self._gamma)[:-1] + target_value = discount_cumsum(rewards, self._gamma)[:-1] elif self._advantage_estimator == 'vtrace': # v_s = V(x_s) + \sum^{T-1}_{t=s} \gamma^{t-s} @@ -222,7 +220,7 @@ def _calculate_adv_and_value_targets( # * \rho_t (r_t + \gamma V(x_{t+1}) - V(x_t)) path_slice = slice(self.path_start_idx, self.ptr) action_probs = self.data['logp'][path_slice].exp() - target_value, adv, _ = calculate_v_trace( + target_value, adv, _ = self._calculate_v_trace( policy_action_probs=action_probs, values=values, rewards=rewards, @@ -235,9 +233,72 @@ def _calculate_adv_and_value_targets( elif self._advantage_estimator == 'plain': # A(x, u) = Q(x, u) - V(x) = r(x, u) + gamma V(x+1) - V(x) adv = rewards[:-1] + self._gamma * values[1:] - values[:-1] - target_value = discount_cumsum_torch(rewards, self._gamma)[:-1] + target_value = discount_cumsum(rewards, self._gamma)[:-1] else: raise NotImplementedError return adv, target_value + + @staticmethod + # pylint: disable-next=too-many-arguments,too-many-locals + def _calculate_v_trace( + policy_action_probs: torch.Tensor, + values: torch.Tensor, # including bootstrap + rewards: torch.Tensor, # including bootstrap + behavior_action_probs: torch.Tensor, + gamma: float = 0.99, + rho_bar: float = 1.0, + c_bar: float = 1.0, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor,]: + r"""This function is used to calculate V-trace targets. + + .. math:: + A_t = \sum_{k=0}^{n-1} (\lambda \gamma)^k \delta_{t+k} + + (\lambda \gamma)^n * \rho_{t+n} * (1 - d_{t+n}) * (V(x_{t+n}) - b_{t+n}) + + Calculate V-trace targets for off-policy actor-critic learning recursively. + For more details, + please refer to the paper: `Espeholt et al. 2018, IMPALA `_. 
+ + Args: + policy_action_probs (torch.Tensor): action probabilities of policy network, shape=(sequence_length,) + values (torch.Tensor): state values, shape=(sequence_length+1,) + rewards (torch.Tensor): rewards, shape=(sequence_length+1,) + behavior_action_probs (torch.Tensor): action probabilities of behavior network, shape=(sequence_length,) + gamma (float): discount factor + rho_bar (float): clip rho + c_bar (float): clip c + + Returns: + tuple: V-trace targets, shape=(batch_size, sequence_length) + """ + assert values.ndim == 1, 'Please provide 1d-arrays' + assert rewards.ndim == 1 + assert policy_action_probs.ndim == 1 + assert behavior_action_probs.ndim == 1 + assert c_bar <= rho_bar + + sequence_length = policy_action_probs.shape[0] + # pylint: disable-next=assignment-from-no-return + rhos = torch.div(policy_action_probs, behavior_action_probs) + clip_rhos = torch.min( + rhos, torch.as_tensor(rho_bar) + ) # pylint: disable=assignment-from-no-return + clip_cs = torch.min( + rhos, torch.as_tensor(c_bar) + ) # pylint: disable=assignment-from-no-return + v_s = values[:-1].clone() # copy all values except bootstrap value + last_v_s = values[-1] # bootstrap from last state + + # calculate v_s + for index in reversed(range(sequence_length)): + delta = clip_rhos[index] * (rewards[index] + gamma * values[index + 1] - values[index]) + v_s[index] += delta + gamma * clip_cs[index] * (last_v_s - values[index + 1]) + last_v_s = v_s[index] # accumulate current v_s for next iteration + + # calculate q_targets + v_s_plus_1 = torch.cat((v_s[1:], values[-1:])) + policy_advantage = clip_rhos * (rewards[:-1] + gamma * v_s_plus_1 - values[:-1]) + + return v_s, policy_advantage, clip_rhos diff --git a/omnisafe/common/buffer/vector_onpolicy_buffer.py b/omnisafe/common/buffer/vector_onpolicy_buffer.py index 1d11d523c..59f634e69 100644 --- a/omnisafe/common/buffer/vector_onpolicy_buffer.py +++ b/omnisafe/common/buffer/vector_onpolicy_buffer.py @@ -20,7 +20,7 @@ from omnisafe.common.buffer.onpolicy_buffer import OnPolicyBuffer from omnisafe.typing import AdvatageEstimator, OmnisafeSpace -from omnisafe.utils import distributed_utils +from omnisafe.utils import distributed class VectorOnPolicyBuffer(OnPolicyBuffer): @@ -88,8 +88,8 @@ def get(self) -> Dict[str, torch.Tensor]: data_pre[k].append(v) data = {k: torch.cat(v, dim=0) for k, v in data_pre.items()} - adv_mean, adv_std, *_ = distributed_utils.mpi_statistics_scalar(data['adv_r']) - cadv_mean, *_ = distributed_utils.mpi_statistics_scalar(data['adv_c']) + adv_mean, adv_std, *_ = distributed.dist_statistics_scalar(data['adv_r']) + cadv_mean, *_ = distributed.dist_statistics_scalar(data['adv_c']) if self._standardized_adv_r: data['adv_r'] = (data['adv_r'] - adv_mean) / (adv_std + 1e-8) if self._standardized_adv_c: diff --git a/omnisafe/common/experiment_grid.py b/omnisafe/common/experiment_grid.py index 8b879cc9f..fb5069a05 100644 --- a/omnisafe/common/experiment_grid.py +++ b/omnisafe/common/experiment_grid.py @@ -21,6 +21,7 @@ from concurrent.futures import ProcessPoolExecutor as Pool from copy import deepcopy from textwrap import dedent +from typing import Any, Dict, List import numpy as np from tqdm import trange @@ -34,10 +35,10 @@ class ExperimentGrid: """Tool for running many experiments given hyper-parameters ranges.""" def __init__(self, exp_name='') -> None: - self.keys = [] - self.vals = [] - self.shs = [] - self.in_names = [] + self.keys: List[str] = [] + self.vals: List[Any] = [] + self.shs: List[str] = [] + self.in_names: List[str] = [] 
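# --- Editorial aside (illustration only, not part of the patch) ---------------
# A toy invocation of the _calculate_v_trace static method added to
# OnPolicyBuffer above, using made-up numbers for a 3-step trajectory plus a
# bootstrap value, to show the expected tensor shapes.
import torch

from omnisafe.common.buffer.onpolicy_buffer import OnPolicyBuffer

policy_probs = torch.tensor([0.7, 0.6, 0.5])      # pi(a_t | s_t), length = sequence_length
behavior_probs = torch.tensor([0.5, 0.5, 0.5])    # behavior-policy probabilities
values = torch.tensor([0.1, 0.2, 0.3, 0.4])       # V(s_0..s_3); the last entry is the bootstrap
rewards = torch.tensor([1.0, 0.0, 1.0, 0.0])      # r_0..r_2 plus a trailing bootstrap slot

v_s, policy_adv, clip_rhos = OnPolicyBuffer._calculate_v_trace(
    policy_action_probs=policy_probs,
    values=values,
    rewards=rewards,
    behavior_action_probs=behavior_probs,
    gamma=0.99,
)
# v_s, policy_adv and clip_rhos all have shape (3,); v_s holds the corrected
# value targets and policy_adv the clipped importance-weighted advantages.
# -------------------------------------------------------------------------------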
self.div_line_width = 80 assert isinstance(exp_name, str), 'Name has to be a string.' self.name = exp_name @@ -206,7 +207,7 @@ def update_dic(self, total_dic, item_dic): def _variants(self, keys, vals): """Recursively builds list of valid variants.""" if len(keys) == 1: - pre_variants = [{}] + pre_variants: List[Dict] = [{}] else: pre_variants = self._variants(keys[1:], vals[1:]) @@ -259,7 +260,7 @@ def variants(self): def unflatten_var(var): """Build the full nested dict version of var, based on key names.""" - new_var = {} + new_var: Dict = {} unflatten_set = set() for key, value in var.items(): diff --git a/omnisafe/common/lagrange.py b/omnisafe/common/lagrange.py index b5e85b82f..d03d2cddc 100644 --- a/omnisafe/common/lagrange.py +++ b/omnisafe/common/lagrange.py @@ -14,12 +14,10 @@ # ============================================================================== """Implementation of Lagrange.""" -import abc - import torch -class Lagrange(abc.ABC): +class Lagrange: r"""Abstract base class for Lagrangian-base Algorithms. This class implements the Lagrange multiplier update and the Lagrange loss. diff --git a/omnisafe/common/logger.py b/omnisafe/common/logger.py index 9398ebf33..61a68f335 100644 --- a/omnisafe/common/logger.py +++ b/omnisafe/common/logger.py @@ -26,7 +26,7 @@ import wandb from omnisafe.utils.config import Config -from omnisafe.utils.distributed_utils import mpi_statistics_scalar, proc_id +from omnisafe.utils.distributed import dist_statistics_scalar, get_rank # As of torch v1.9.0, torch.utils.tensorboard has a bug that is exposed by setuptools 59.6.0. The @@ -113,10 +113,10 @@ def __init__( # pylint: disable=too-many-arguments,too-many-locals self._hms_time = hms_time self._log_dir = os.path.join(output_dir, exp_name, relpath) self._verbose = verbose - self._main_proc = proc_id() == 0 + self._maste_proc = get_rank() == 0 self._output_file: TextIO - if self._main_proc: + if self._maste_proc: os.makedirs(self._log_dir, exist_ok=True) self._output_file = open( # pylint: disable=consider-using-with os.path.join(self._log_dir, output_fname), encoding='utf-8', mode='w' @@ -130,6 +130,7 @@ def __init__( # pylint: disable=too-many-arguments,too-many-locals self._data: Dict[str, Union[Deque[Union[int, float]], List[Union[int, float]]]] = {} self._headers_windwos: Dict[str, Optional[int]] = {} self._headers_minmax: Dict[str, bool] = {} + self._headers_delta: Dict[str, bool] = {} self._current_row: Dict[str, Union[int, float]] = {} if config is not None: @@ -139,10 +140,10 @@ def __init__( # pylint: disable=too-many-arguments,too-many-locals self._use_tensorboard = use_tensorboard self._use_wandb = use_wandb - if self._use_tensorboard and self._main_proc: + if self._use_tensorboard and self._maste_proc: self._tensorboard_writer = SummaryWriter(log_dir=os.path.join(self._log_dir, 'tb')) - if self._use_wandb and self._main_proc: + if self._use_wandb and self._maste_proc: project: str = self._config.get('wandb_project', 'omnisafe') name: str = self._config.get('wandb_name', f'{exp_name}/{relpath}') entity: str = self._config.get('wandb_entity', None) @@ -168,7 +169,7 @@ def log( msg (str): The message to be logged. color (int): The color of the message. """ - if self._verbose and self._main_proc: + if self._verbose and self._maste_proc: print(WordColor.colorize(msg, color, bold, highlight)) def save_config(self, config: Config) -> None: @@ -177,7 +178,7 @@ def save_config(self, config: Config) -> None: Args: config (dict): The configuration to be saved. 
""" - if self._main_proc: + if self._maste_proc: self.log('Save with config in config.json', 'yellow', bold=True) with open(os.path.join(self._log_dir, 'config.json'), encoding='utf-8', mode='w') as f: f.write(config.tojson()) @@ -192,7 +193,7 @@ def setup_torch_saver(self, what_to_save: Dict[str, Any]) -> None: def torch_save(self) -> None: """Save the torch model.""" - if self._main_proc: + if self._maste_proc: assert self._what_to_save is not None, 'Please setup torch saver first' path = os.path.join(self._log_dir, 'torch_save', f'epoch-{self._epoch}.pt') os.makedirs(os.path.dirname(path), exist_ok=True) @@ -204,7 +205,11 @@ def torch_save(self) -> None: torch.save(params, path) def register_key( - self, key: str, window_length: Optional[int] = None, min_and_max: bool = False + self, + key: str, + window_length: Optional[int] = None, + min_and_max: bool = False, + delta: bool = False, ) -> None: """Register a key to the logger. @@ -220,10 +225,17 @@ def register_key( self._current_row[f'{key}/Max'] = 0 self._current_row[f'{key}/Std'] = 0 self._headers_minmax[key] = True + else: self._current_row[key] = 0 self._headers_minmax[key] = False + if delta: + self._current_row[f'{key}/Delta'] = 0 + self._headers_delta[key] = True + else: + self._headers_delta[key] = False + if window_length is not None: self._data[key] = deque(maxlen=window_length) self._headers_windwos[key] = window_length @@ -250,21 +262,8 @@ def store(self, **kwargs: Union[int, float, np.ndarray, torch.Tensor]) -> None: def dump_tabular(self) -> None: """Dump the tabular data to the console and the file.""" - for key in self._data: - if self._headers_minmax[key]: - mean, min_val, max_val, std = self.get_stats(key, True) - self._current_row[f'{key}/Mean'] = mean - self._current_row[f'{key}/Min'] = min_val - self._current_row[f'{key}/Max'] = max_val - self._current_row[f'{key}/Std'] = std - else: - mean = self.get_stats(key, False)[0] - self._current_row[key] = mean - - if self._headers_windwos[key] is None: - self._data[key] = [] - - if self._main_proc: + self._update_current_row() + if self._maste_proc: self._epoch += 1 if self._verbose: key_lens = list(map(len, self._current_row.keys())) @@ -291,6 +290,26 @@ def dump_tabular(self) -> None: if self._use_wandb: wandb.log(self._current_row, step=self._epoch) + def _update_current_row(self) -> None: + for key in self._data: + if self._headers_minmax[key]: + old_data = self._current_row[f'{key}/Mean'] + mean, min_val, max_val, std = self.get_stats(key, True) + self._current_row[f'{key}/Mean'] = mean + self._current_row[f'{key}/Min'] = min_val + self._current_row[f'{key}/Max'] = max_val + self._current_row[f'{key}/Std'] = std + else: + old_data = self._current_row[key] + mean = self.get_stats(key, False)[0] + self._current_row[key] = mean + + if self._headers_delta[key]: + self._current_row[f'{key}/Delta'] = mean - old_data + + if self._headers_windwos[key] is None: + self._data[key] = [] + def get_stats(self, key, min_and_max: bool = False) -> Tuple[Union[int, float], ...]: """Get the statistics of the key.""" assert key in self._current_row, f'Key {key} has not been registered' @@ -299,17 +318,17 @@ def get_stats(self, key, min_and_max: bool = False) -> Tuple[Union[int, float], vals = list(vals) if min_and_max: - mean, std, min_val, max_val = mpi_statistics_scalar( + mean, std, min_val, max_val = dist_statistics_scalar( torch.tensor(vals), with_min_and_max=True ) return mean.item(), min_val.item(), max_val.item(), std.item() - mean, std = mpi_statistics_scalar( # pylint: 
disable=unbalanced-tuple-unpacking + mean, std = dist_statistics_scalar( # pylint: disable=unbalanced-tuple-unpacking torch.tensor(vals) ) return (mean.item(),) def close(self) -> None: """Close the logger.""" - if self._main_proc: + if self._maste_proc: self._output_file.close() diff --git a/omnisafe/common/normalizer.py b/omnisafe/common/normalizer.py index 1327d731f..fde7567cb 100644 --- a/omnisafe/common/normalizer.py +++ b/omnisafe/common/normalizer.py @@ -14,6 +14,8 @@ # ============================================================================== """Implementation of Vector Buffer.""" +from typing import Tuple + import torch import torch.nn as nn @@ -21,56 +23,88 @@ class Normalizer(nn.Module): """Calculate normalized raw_data from running mean and std - See https://www.johndcook.com/blog/standard_deviation/ + See Chan, Tony F.; Golub, Gene H.; LeVeque, Randall J. (1979), "Updating Formulae and + a Pairwise Algorithm for Computing Sample Variances." (PDF), Technical Report STAN-CS-79-773, + Department of Computer Science, Stanford University. """ - def __init__(self, shape, clip=1e6): + def __init__(self, shape: Tuple[int, ...], clip: float = 1e6) -> None: """Initialize the normalize.""" super().__init__() - self.raw_data = nn.Parameter( - torch.zeros(*shape), requires_grad=False - ) # Current value of data stream - self.mean = nn.Parameter(torch.zeros(*shape), requires_grad=False) # Current mean - self.sumsq = nn.Parameter( - torch.zeros(*shape), requires_grad=False - ) # Current sum of squares, used in var/std calculation + if shape == (): + self.register_buffer('_mean', torch.tensor(0.0)) + self.register_buffer('_sumsq', torch.tensor(0.0)) + self.register_buffer('_var', torch.tensor(0.0)) + self.register_buffer('_std', torch.tensor(0.0)) + self.register_buffer('_count', torch.tensor(0)) + self.register_buffer('_clip', clip * torch.tensor(1.0)) + else: + self.register_buffer('_mean', torch.zeros(*shape)) + self.register_buffer('_sumsq', torch.zeros(*shape)) + self.register_buffer('_var', torch.zeros(*shape)) + self.register_buffer('_std', torch.zeros(*shape)) + self.register_buffer('_count', torch.tensor(0)) + self.register_buffer('_clip', clip * torch.ones(*shape)) - self.var = nn.Parameter(torch.zeros(*shape), requires_grad=False) # Current variance - self.std = nn.Parameter(torch.zeros(*shape), requires_grad=False) # Current std + self._mean: torch.Tensor # running mean + self._sumsq: torch.Tensor # running sum of squares + self._var: torch.Tensor # running variance + self._std: torch.Tensor # running standard deviation + self._count: torch.Tensor # number of samples + self._clip: torch.Tensor # clip value - self.count = nn.Parameter(torch.zeros(1), requires_grad=False) # Counter + self._shape = shape + self._first = True - self.clip = nn.Parameter(clip * torch.ones(*shape), requires_grad=False) + @property + def shape(self) -> Tuple[int, ...]: + """Return the shape of the normalize.""" + return self._shape - def push(self, raw_data): - """Push a new value into the stream.""" - self.raw_data.data = raw_data - self.count.data[0] += 1 - if self.count.data[0] == 1: - self.mean.data = raw_data - else: - old_mean = self.mean - self.mean.data += (raw_data - self.mean.data) / self.count.data - self.sumsq.data += (raw_data - old_mean.data) * (raw_data - self.mean.data) - self.var.data = self.sumsq.data / (self.count.data - 1) - self.std.data = torch.sqrt(self.var.data) - self.std.data = torch.max(self.std.data, 1e-2 * torch.ones_like(self.std.data)) + @property + def mean(self) -> 
torch.Tensor: + """Return the mean of the normalize.""" + return self._mean - def forward(self, raw_data=None): - """Normalize the raw_data.""" - return self.normalize(raw_data) + @property + def std(self) -> torch.Tensor: + """Return the std of the normalize.""" + return self._std - def pre_process(self, raw_data): - """Pre-process the raw_data.""" - if len(raw_data.shape) == 1: - raw_data = raw_data.unsqueeze(-1) - return raw_data + def forward(self, data: torch.Tensor) -> torch.Tensor: + """Normalize the data.""" + return self.normalize(data) - def normalize(self, raw_data=None): - """Normalize the raw_data.""" - raw_data = self.pre_process(raw_data) - self.push(raw_data) - if self.count <= 1: - return self.raw_data.data - output = (self.raw_data.data - self.mean.data) / self.std.data - return torch.clamp(output, -self.clip.data, self.clip.data) + def normalize(self, data: torch.Tensor) -> torch.Tensor: + """Normalize the _data.""" + data = data.to(self._mean.device) + self._push(data) + if self._count <= 1: + return data + output = (data - self._mean) / self._std + return torch.clamp(output, -self._clip, self._clip) + + def _push(self, raw_data: torch.Tensor) -> None: + if raw_data.shape == self._shape: + raw_data = raw_data.unsqueeze(0) + assert raw_data.shape[1:] == self._shape, 'data shape must be equal to (batch_size, *shape)' + + if self._first: + self._mean = torch.mean(raw_data, dim=0) + self._sumsq = torch.sum((raw_data - self._mean) ** 2, dim=0) + self._count = torch.tensor( + raw_data.shape[0], dtype=self._count.dtype, device=self._count.device + ) + self._first = False + else: + count_raw = raw_data.shape[0] + count = self._count + count_raw + mean_raw = torch.mean(raw_data, dim=0) + delta = mean_raw - self._mean + self._mean += delta * count_raw / count + sumq_raw = torch.sum((raw_data - mean_raw) ** 2, dim=0) + self._sumsq += sumq_raw + delta**2 * self._count * count_raw / count + self._count = count + self._var = self._sumsq / (self._count - 1) + self._std = torch.sqrt(self._var) + self._std = torch.max(self._std, 1e-2 * torch.ones_like(self._std)) diff --git a/omnisafe/common/pid_lagrange.py b/omnisafe/common/pid_lagrange.py index 9b5c6b377..cab894708 100644 --- a/omnisafe/common/pid_lagrange.py +++ b/omnisafe/common/pid_lagrange.py @@ -16,6 +16,7 @@ import abc from collections import deque +from typing import Deque # pylint: disable-next=too-few-public-methods,too-many-instance-attributes @@ -62,12 +63,12 @@ def __init__( self.sum_norm = sum_norm self.diff_norm = diff_norm self.pid_i = lagrangian_multiplier_init - self.cost_ds = deque(maxlen=self.pid_d_delay) + self.cost_ds: Deque[float] = deque(maxlen=self.pid_d_delay) self.cost_ds.append(0) - self._delta_p = 0 - self._cost_d = 0 - self.cost_limit = cost_limit - self.cost_penalty = 0 + self._delta_p: float = 0 + self._cost_d: float = 0 + self.cost_limit: float = cost_limit + self.cost_penalty: float = 0 def pid_update(self, ep_cost_avg: float) -> None: r"""Update the PID controller. diff --git a/omnisafe/common/record_queue.py b/omnisafe/common/record_queue.py deleted file mode 100644 index 4a9d998b3..000000000 --- a/omnisafe/common/record_queue.py +++ /dev/null @@ -1,62 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Implementation of Record Queue.""" - -from collections import deque - -import numpy as np - -from omnisafe.typing import List - - -class RecordQueue: - """RecordQueue.""" - - def __init__(self, *names, maxlen=100) -> None: - """Initialize the RecordQueue.""" - self.queues = {} - self._create_deques(*names, maxlen=maxlen) - - def _create_deques(self, *names, maxlen=100) -> None: - """Create queues by names.""" - for name in names: - self.queues[name] = deque(maxlen=maxlen) - - def append(self, **kwargs) -> None: - """Add values to the queues.""" - for key, value in kwargs.items(): - assert key in self.queues, f'{key} has not been set in queues {self.queues.keys()}' - self.queues[key].append(value) - - def non_empty_mean(self, name) -> np.ndarray: - """Get the mean of the non-empty values.""" - return np.mean(self.queues[name]) if len(self.queues[name]) else 0.0 - - def get_mean(self, *names) -> List: - """Get the means of needed queue names.""" - assert all( - name in self.queues for name in names - ), f'{names} has not been set in queues {self.queues.keys()}' - if len(names) == 1: - return self.non_empty_mean(names[0]) - return [self.non_empty_mean(name) for name in names] - - def reset(self, *names) -> None: - """Reset the needed queue.""" - assert all( - name in self.queues for name in names - ), f'{names} has not been set in queues {self.queues.keys()}' - for name in names: - self.queues[name].clear() diff --git a/omnisafe/configs/on-policy/CPO.yaml b/omnisafe/configs/on-policy/CPO.yaml index 08b3a1712..bf605bd2d 100644 --- a/omnisafe/configs/on-policy/CPO.yaml +++ b/omnisafe/configs/on-policy/CPO.yaml @@ -63,6 +63,14 @@ defaults: cg_iters: 10 # Subsampled observation fvp_obs: None + # The number of parallel environments + num_envs: 32 + # Whether to use standardized reward + reward_normalize: True + # Whether to use standardized cost + cost_normalize: True + # Whether to use standardized obs + obs_normalize: True # ---------------------------------------Optional Configuration-------------------------------- # ## -----------------------------------Configuration For Cost Critic--------------------------- ## @@ -74,6 +82,8 @@ defaults: linear_lr_decay: False # Whether to use exploration noise anneal exploration_noise_anneal: False + # std + std: [0.5, 0.1] # The coefficient of reward penalty penalty_param: 0.0 # Whether to use KL early stopping @@ -90,38 +100,28 @@ defaults: critic_norm_coeff: 0.001 ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
weight_initialization_mode: "kaiming_uniform" # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # Output activation function - output_activation: identity - # Whether to scale action. - scale_action: False - # Whether to clip action. - clip_action: True - # Whther to learn the standard deviation of Gaussian noise - std_learning: True - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh + actor_type: gaussian_learning + # Whether to use linear decay of learning rate + linear_lr_decay: True + # Configuration of Actor network + actor: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Actor network + lr: 0.0003 + # Configuration of Critic network + critic: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Critic network + lr: 0.0003 ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: # Reward discounted factor diff --git a/omnisafe/configs/on-policy/CPPOPid.yaml b/omnisafe/configs/on-policy/CPPOPid.yaml index bd74d172d..741f12196 100644 --- a/omnisafe/configs/on-policy/CPPOPid.yaml +++ b/omnisafe/configs/on-policy/CPPOPid.yaml @@ -31,7 +31,7 @@ defaults: # Number of epochs epochs: 500 # Number of steps per epoch - steps_per_epoch: 32784 + steps_per_epoch: 32768 # Number of update iteration for Actor network actor_iters: 10 # Number of update iteration for Critic network @@ -59,6 +59,14 @@ defaults: batch_size: 10000 # The clip range for PPO loss clip: 0.2 + # The number of parallel environments + num_envs: 32 + # Whether to use standardized reward + reward_normalize: True + # Whether to use standardized cost + cost_normalize: True + # Whether to use standardized obs + obs_normalize: True # ---------------------------------------Optional Configuration-------------------------------- # ## -----------------------------------Configuration For Cost Critic--------------------------- ## @@ -70,6 +78,8 @@ defaults: linear_lr_decay: True # Whether to use exploration noise anneal exploration_noise_anneal: False + # std + std: [0.5, 0.1] # The coefficient of reward penalty penalty_param: 0.0 # Whether to use KL early stopping @@ -86,38 +96,28 @@ defaults: critic_norm_coeff: 0.001 ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
weight_initialization_mode: "kaiming_uniform" # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # Output activation function - output_activation: identity - # Whether to scale action. - scale_action: False - # Whether to clip action. - clip_action: False - # Whther to learn the standard deviation of Gaussian noise - std_learning: True - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh + actor_type: gaussian_learning + # Whether to use linear decay of learning rate + linear_lr_decay: True + # Configuration of Actor network + actor: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Actor network + lr: 0.0003 + # Configuration of Critic network + critic: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Critic network + lr: 0.0003 ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: # Reward discounted factor @@ -148,7 +148,7 @@ defaults: max_len: 100 # The number of threads used to sample data num_threads: 20 - ## --------------------------------------Configuration For PID--------------------------------- ## +## --------------------------------------Configuration For PID--------------------------------- ## PID_cfgs: # KP for PID pid_kp: 0.01 diff --git a/omnisafe/configs/on-policy/CUP.yaml b/omnisafe/configs/on-policy/CUP.yaml index 514e678d9..a9d93db4a 100644 --- a/omnisafe/configs/on-policy/CUP.yaml +++ b/omnisafe/configs/on-policy/CUP.yaml @@ -52,6 +52,14 @@ defaults: critic_lr: 0.0003 # The Address for saving training process data data_dir: "./runs" + # The number of parallel environments + num_envs: 32 + # Whether to use standardized reward + reward_normalize: True + # Whether to use standardized cost + cost_normalize: True + # Whether to use standardized obs + obs_normalize: True ## ---------------------------Basic configurations for derived class FOCOPS------------------- ## # The thereshold for KL early stopping @@ -77,6 +85,8 @@ defaults: linear_lr_decay: True # Whether to use exploration noise anneal exploration_noise_anneal: False + # std + std: [0.5, 0.1] # The coefficient of reward penalty penalty_param: 0.0 # Whether to use KL early stopping @@ -93,38 +103,28 @@ defaults: critic_norm_coeff: 0.001 ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
weight_initialization_mode: "kaiming_uniform" # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # Output activation function - output_activation: identity - # Whether to scale action. - scale_action: False - # Whether to clip action. - clip_action: False - # Whther to learn the standard deviation of Gaussian noise - std_learning: True - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh + actor_type: gaussian_learning + # Whether to use linear decay of learning rate + linear_lr_decay: True + # Configuration of Actor network + actor: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Actor network + lr: 0.0003 + # Configuration of Critic network + critic: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Critic network + lr: 0.0003 ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: # Reward discounted factor diff --git a/omnisafe/configs/on-policy/FOCOPS.yaml b/omnisafe/configs/on-policy/FOCOPS.yaml index 6e2ac0d89..4094af4e2 100644 --- a/omnisafe/configs/on-policy/FOCOPS.yaml +++ b/omnisafe/configs/on-policy/FOCOPS.yaml @@ -64,6 +64,14 @@ defaults: lam: 1.5 # The size of batch for policy update batch_size: 2000 + # The number of parallel environments + num_envs: 32 + # Whether to use standardized reward + reward_normalize: True + # Whether to use standardized cost + cost_normalize: True + # Whether to use standardized obs + obs_normalize: True # ---------------------------------------Optional Configuration-------------------------------- # ## -----------------------------------Configuration For Cost Critic--------------------------- ## @@ -75,6 +83,8 @@ defaults: linear_lr_decay: True # Whether to use exploration noise anneal exploration_noise_anneal: False + # std + std: [0.5, 0.1] # The coefficient of reward penalty penalty_param: 0.0 # Whether to use KL early stopping @@ -91,38 +101,28 @@ defaults: critic_norm_coeff: 0.001 ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". weight_initialization_mode: "kaiming_uniform" # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # Output activation function - output_activation: identity - # Whether to scale action. - scale_action: False - # Whether to clip action. 
- clip_action: False - # Whther to learn the standard deviation of Gaussian noise - std_learning: True - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh + actor_type: gaussian_learning + # Whether to use linear decay of learning rate + linear_lr_decay: True + # Configuration of Actor network + actor: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Actor network + lr: 0.0003 + # Configuration of Critic network + critic: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Critic network + lr: 0.0003 ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: # Reward discounted factor diff --git a/omnisafe/configs/on-policy/IPO.yaml b/omnisafe/configs/on-policy/IPO.yaml index f7c3c8ba3..2cf52f48a 100644 --- a/omnisafe/configs/on-policy/IPO.yaml +++ b/omnisafe/configs/on-policy/IPO.yaml @@ -63,6 +63,14 @@ defaults: kappa: 0.01 # The max of cost penalty penalty_max: 1.0 + # The number of parallel environments + num_envs: 32 + # Whether to use standardized reward + reward_normalize: True + # Whether to use standardized cost + cost_normalize: True + # Whether to use standardized obs + obs_normalize: True # ---------------------------------------Optional Configuration-------------------------------- # ## -----------------------------------Configuration For Cost Critic--------------------------- ## @@ -76,6 +84,8 @@ defaults: linear_lr_decay: True # Whether to use exploration noise anneal exploration_noise_anneal: False + # std + std: [0.5, 0.1] # The coefficient of reward penalty penalty_param: 0.0 # Whether to use KL early stopping @@ -90,38 +100,28 @@ defaults: critic_norm_coeff: 0.001 ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". weight_initialization_mode: "kaiming_uniform" # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # Output activation function - output_activation: identity - # Whether to scale action. - scale_action: False - # Whether to clip action. 
- clip_action: False - # Whther to learn the standard deviation of Gaussian noise - std_learning: True - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh + actor_type: gaussian_learning + # Whether to use linear decay of learning rate + linear_lr_decay: True + # Configuration of Actor network + actor: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Actor network + lr: 0.0003 + # Configuration of Critic network + critic: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Critic network + lr: 0.0003 ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: # Reward discounted factor diff --git a/omnisafe/configs/on-policy/NaturalPG.yaml b/omnisafe/configs/on-policy/NaturalPG.yaml index 8aef74061..30c044d46 100644 --- a/omnisafe/configs/on-policy/NaturalPG.yaml +++ b/omnisafe/configs/on-policy/NaturalPG.yaml @@ -63,6 +63,14 @@ defaults: cg_iters: 10 # Subsampled observation fvp_obs: None + # The number of parallel environments + num_envs: 32 + # Whether to use standardized reward + reward_normalize: True + # Whether to use standardized cost + cost_normalize: True + # Whether to use standardized obs + obs_normalize: True # ---------------------------------------Optional Configuration-------------------------------- # ## -----------------------------------Configuration For Cost Critic--------------------------- ## @@ -74,6 +82,8 @@ defaults: linear_lr_decay: False # Whether to use exploration noise anneal exploration_noise_anneal: False + # std + std: [0.5, 0.1] # The coefficient of reward penalty penalty_param: 0.0 # Whether to use KL early stopping @@ -90,38 +100,28 @@ defaults: critic_norm_coeff: 0.001 ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". weight_initialization_mode: "kaiming_uniform" # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # Output activation function - output_activation: identity - # Whether to scale action. - scale_action: False - # Whether to clip action. 
- clip_action: False - # Whther to learn the standard deviation of Gaussian noise - std_learning: True - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh + actor_type: gaussian_learning + # Whether to use linear decay of learning rate + linear_lr_decay: True + # Configuration of Actor network + actor: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Actor network + lr: 0.0003 + # Configuration of Critic network + critic: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Critic network + lr: 0.0003 ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: # Reward discounted factor diff --git a/omnisafe/configs/on-policy/OnCRPO.yaml b/omnisafe/configs/on-policy/OnCRPO.yaml index 516567025..5b7f0c999 100644 --- a/omnisafe/configs/on-policy/OnCRPO.yaml +++ b/omnisafe/configs/on-policy/OnCRPO.yaml @@ -59,6 +59,14 @@ defaults: batch_size: 10000 # The clip range for PPO loss clip: 0.2 + # The number of parallel environments + num_envs: 32 + # Whether to use standardized reward + reward_normalize: True + # Whether to use standardized cost + cost_normalize: True + # Whether to use standardized obs + obs_normalize: True # ---------------------------------------Optional Configuration-------------------------------- # ## -----------------------------------Configuration For Cost Critic--------------------------- ## @@ -74,6 +82,8 @@ defaults: linear_lr_decay: True # Whether to use exploration noise anneal exploration_noise_anneal: False + # std + std: [0.5, 0.1] # The coefficient of reward penalty penalty_param: 0.0 # Whether to use KL early stopping @@ -88,38 +98,28 @@ defaults: critic_norm_coeff: 0.001 ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". weight_initialization_mode: "kaiming_uniform" # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # Output activation function - output_activation: identity - # Whether to scale action. - scale_action: False - # Whether to clip action. 
- clip_action: False - # Whther to learn the standard deviation of Gaussian noise - std_learning: True - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh + actor_type: gaussian_learning + # Whether to use linear decay of learning rate + linear_lr_decay: True + # Configuration of Actor network + actor: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Actor network + lr: 0.0003 + # Configuration of Critic network + critic: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Critic network + lr: 0.0003 ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: # Reward discounted factor diff --git a/omnisafe/configs/on-policy/P3O.yaml b/omnisafe/configs/on-policy/P3O.yaml index 4f1b25f03..ccb642043 100644 --- a/omnisafe/configs/on-policy/P3O.yaml +++ b/omnisafe/configs/on-policy/P3O.yaml @@ -61,6 +61,14 @@ defaults: clip: 0.2 # The coefficient of cost penalty kappa: 20.0 + # The number of parallel environments + num_envs: 32 + # Whether to use standardized reward + reward_normalize: True + # Whether to use standardized cost + cost_normalize: True + # Whether to use standardized obs + obs_normalize: True # ---------------------------------------Optional Configuration-------------------------------- # ## -----------------------------------Configuration For Cost Critic--------------------------- ## @@ -74,6 +82,8 @@ defaults: linear_lr_decay: True # Whether to use exploration noise anneal exploration_noise_anneal: False + # std + std: [0.5, 0.1] # The coefficient of reward penalty penalty_param: 0.0 # Whether to use KL early stopping @@ -88,38 +98,28 @@ defaults: critic_norm_coeff: 0.001 ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". weight_initialization_mode: "kaiming_uniform" # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # Output activation function - output_activation: identity - # Whether to scale action. - scale_action: False - # Whether to clip action. 
- clip_action: False - # Whther to learn the standard deviation of Gaussian noise - std_learning: True - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh + actor_type: gaussian_learning + # Whether to use linear decay of learning rate + linear_lr_decay: True + # Configuration of Actor network + actor: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Actor network + lr: 0.0003 + # Configuration of Critic network + critic: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Critic network + lr: 0.0003 ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: # Reward discounted factor diff --git a/omnisafe/configs/on-policy/PCPO.yaml b/omnisafe/configs/on-policy/PCPO.yaml index d363832ea..97485d459 100644 --- a/omnisafe/configs/on-policy/PCPO.yaml +++ b/omnisafe/configs/on-policy/PCPO.yaml @@ -63,6 +63,14 @@ defaults: cg_iters: 10 # Subsampled observation fvp_obs: None + # The number of parallel environments + num_envs: 32 + # Whether to use standardized reward + reward_normalize: True + # Whether to use standardized cost + cost_normalize: True + # Whether to use standardized obs + obs_normalize: True # ---------------------------------------Optional Configuration-------------------------------- # ## -----------------------------------Configuration For Cost Critic--------------------------- ## @@ -74,6 +82,8 @@ defaults: linear_lr_decay: False # Whether to use exploration noise anneal exploration_noise_anneal: False + # std + std: [0.5, 0.1] # The coefficient of reward penalty penalty_param: 0.0 # Whether to use KL early stopping @@ -90,38 +100,28 @@ defaults: critic_norm_coeff: 0.001 ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". weight_initialization_mode: "kaiming_uniform" # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # Output activation function - output_activation: identity - # Whether to scale action. - scale_action: False - # Whether to clip action. 
- clip_action: False - # Whther to learn the standard deviation of Gaussian noise - std_learning: True - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh + actor_type: gaussian_learning + # Whether to use linear decay of learning rate + linear_lr_decay: True + # Configuration of Actor network + actor: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Actor network + lr: 0.0003 + # Configuration of Critic network + critic: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Critic network + lr: 0.0003 ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: # Reward discounted factor diff --git a/omnisafe/configs/on-policy/PDO.yaml b/omnisafe/configs/on-policy/PDO.yaml index d9ccd2da9..7b64a8564 100644 --- a/omnisafe/configs/on-policy/PDO.yaml +++ b/omnisafe/configs/on-policy/PDO.yaml @@ -31,7 +31,7 @@ defaults: # Number of epochs epochs: 500 # Number of steps per epoch - steps_per_epoch: 32768 + steps_per_epoch: 32000 # Number of update iteration for Actor network actor_iters: 10 # Number of update iteration for Critic network @@ -57,17 +57,28 @@ defaults: target_kl: 0.02 # The clip range for PPO loss clip: 0.2 + # The number of parallel environments + num_envs: 32 + # Whether to use standardized reward + reward_normalize: True + # Whether to use standardized cost + cost_normalize: True + # Whether to use standardized obs + obs_normalize: True + # ---------------------------------------Optional Configuration-------------------------------- # ## -----------------------------------Configuration For Cost Critic--------------------------- ## # Whether to use cost critic - use_cost: True + use_cost: False # Cost discounted factor cost_gamma: 1.0 # Whether to use linear decay of learning rate linear_lr_decay: True # Whether to use exploration noise anneal exploration_noise_anneal: False + # std + std: [0.5, 0.1] # The coefficient of reward penalty penalty_param: 0.0 # Whether to use KL early stopping @@ -84,38 +95,28 @@ defaults: critic_norm_coeff: 0.001 ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". weight_initialization_mode: "kaiming_uniform" # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # Output activation function - output_activation: identity - # Whether to scale action. - scale_action: False - # Whether to clip action. 
- clip_action: False - # Whther to learn the standard deviation of Gaussian noise - std_learning: True - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh + actor_type: gaussian_learning + # Whether to use linear decay of learning rate + linear_lr_decay: True + # Configuration of Actor network + actor: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Actor network + lr: 0.0003 + # Configuration of Critic network + critic: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Critic network + lr: 0.0003 ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: # Reward discounted factor diff --git a/omnisafe/configs/on-policy/PPO.yaml b/omnisafe/configs/on-policy/PPO.yaml index e2c79598f..df1a2f18b 100644 --- a/omnisafe/configs/on-policy/PPO.yaml +++ b/omnisafe/configs/on-policy/PPO.yaml @@ -31,7 +31,7 @@ defaults: # Number of epochs epochs: 500 # Number of steps per epoch - steps_per_epoch: 32768 + steps_per_epoch: 32000 # Number of update iteration for Actor network actor_iters: 40 # Number of update iteration for Critic network @@ -59,6 +59,14 @@ defaults: batch_size: 10000 # The clip range for PPO loss clip: 0.2 + # The number of parallel environments + num_envs: 32 + # Whether to use standardized reward + reward_normalize: True + # Whether to use standardized cost + cost_normalize: True + # Whether to use standardized obs + obs_normalize: True # ---------------------------------------Optional Configuration-------------------------------- # ## -----------------------------------Configuration For Cost Critic--------------------------- ## @@ -70,6 +78,8 @@ defaults: linear_lr_decay: True # Whether to use exploration noise anneal exploration_noise_anneal: False + # std + std: [0.5, 0.1] # The coefficient of reward penalty penalty_param: 0.0 # Whether to use KL early stopping @@ -84,38 +94,28 @@ defaults: critic_norm_coeff: 0.001 ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". weight_initialization_mode: "kaiming_uniform" # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # Output activation function - output_activation: identity - # Whether to scale action. - scale_action: False - # Whether to clip action. 
- clip_action: False - # Whther to learn the standard deviation of Gaussian noise - std_learning: True - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh + actor_type: gaussian_learning + # Whether to use linear decay of learning rate + linear_lr_decay: True + # Configuration of Actor network + actor: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Actor network + lr: 0.0003 + # Configuration of Critic network + critic: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Critic network + lr: 0.0003 ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: # Reward discounted factor diff --git a/omnisafe/configs/on-policy/PPOEarlyTerminated.yaml b/omnisafe/configs/on-policy/PPOEarlyTerminated.yaml index 313fbdd1f..71951d7d8 100644 --- a/omnisafe/configs/on-policy/PPOEarlyTerminated.yaml +++ b/omnisafe/configs/on-policy/PPOEarlyTerminated.yaml @@ -59,6 +59,16 @@ defaults: batch_size: 10000 # The clip range for PPO loss clip: 0.2 + # The number of parallel environments + num_envs: 1 + # Whether to use standardized reward + reward_normalize: True + # Whether to use standardized cost + cost_normalize: True + # Whether to use standardized obs + obs_normalize: True + # cost_limit + cost_limit: 25 # ---------------------------------------Optional Configuration-------------------------------- # ## -----------------------------------Configuration For Cost Critic--------------------------- ## @@ -70,6 +80,8 @@ defaults: linear_lr_decay: True # Whether to use exploration noise anneal exploration_noise_anneal: False + # std + std: [0.5, 0.1] # The coefficient of reward penalty penalty_param: 0.0 # Whether to use KL early stopping @@ -86,38 +98,28 @@ defaults: critic_norm_coeff: 0.001 ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". weight_initialization_mode: "kaiming_uniform" # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # Output activation function - output_activation: identity - # Whether to scale action. - scale_action: False - # Whether to clip action. 
- clip_action: False - # Whther to learn the standard deviation of Gaussian noise - std_learning: True - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh + actor_type: gaussian_learning + # Whether to use linear decay of learning rate + linear_lr_decay: True + # Configuration of Actor network + actor: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Actor network + lr: 0.0003 + # Configuration of Critic network + critic: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Critic network + lr: 0.0003 ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: # Reward discounted factor diff --git a/omnisafe/configs/on-policy/PPOLag.yaml b/omnisafe/configs/on-policy/PPOLag.yaml index 6b00a1aa0..cfd1d4ab5 100644 --- a/omnisafe/configs/on-policy/PPOLag.yaml +++ b/omnisafe/configs/on-policy/PPOLag.yaml @@ -59,6 +59,14 @@ defaults: batch_size: 10000 # The clip range for PPO loss clip: 0.2 + # The number of parallel environments + num_envs: 32 + # Whether to use standardized reward + reward_normalize: True + # Whether to use standardized cost + cost_normalize: True + # Whether to use standardized obs + obs_normalize: True # ---------------------------------------Optional Configuration-------------------------------- # ## -----------------------------------Configuration For Cost Critic--------------------------- ## @@ -70,6 +78,8 @@ defaults: linear_lr_decay: True # Whether to use exploration noise anneal exploration_noise_anneal: False + # std + std: [0.5, 0.1] # The coefficient of reward penalty penalty_param: 0.0 # Whether to use KL early stopping @@ -86,38 +96,28 @@ defaults: critic_norm_coeff: 0.001 ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". weight_initialization_mode: "kaiming_uniform" # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # Output activation function - output_activation: identity - # Whether to scale action. - scale_action: False - # Whether to clip action. 
- clip_action: False - # Whther to learn the standard deviation of Gaussian noise - std_learning: True - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh + actor_type: gaussian_learning + # Whether to use linear decay of learning rate + linear_lr_decay: True + # Configuration of Actor network + actor: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Actor network + lr: 0.0003 + # Configuration of Critic network + critic: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Critic network + lr: 0.0003 ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: # Reward discounted factor diff --git a/omnisafe/configs/on-policy/PPOLagEarlyTerminated.yaml b/omnisafe/configs/on-policy/PPOLagEarlyTerminated.yaml index 71eb61fe2..2823db932 100644 --- a/omnisafe/configs/on-policy/PPOLagEarlyTerminated.yaml +++ b/omnisafe/configs/on-policy/PPOLagEarlyTerminated.yaml @@ -59,6 +59,16 @@ defaults: batch_size: 10000 # The clip range for PPO loss clip: 0.2 + # The number of parallel environments + num_envs: 1 + # Whether to use standardized reward + reward_normalize: True + # Whether to use standardized cost + cost_normalize: True + # Whether to use standardized obs + obs_normalize: True + # cost_limit + cost_limit: 25 # ---------------------------------------Optional Configuration-------------------------------- # ## -----------------------------------Configuration For Cost Critic--------------------------- ## @@ -70,6 +80,8 @@ defaults: linear_lr_decay: True # Whether to use exploration noise anneal exploration_noise_anneal: False + # std + std: [0.5, 0.1] # The coefficient of reward penalty penalty_param: 0.0 # Whether to use KL early stopping @@ -86,38 +98,28 @@ defaults: critic_norm_coeff: 0.001 ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". weight_initialization_mode: "kaiming_uniform" # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # Output activation function - output_activation: identity - # Whether to scale action. - scale_action: False - # Whether to clip action. 
- clip_action: False - # Whther to learn the standard deviation of Gaussian noise - std_learning: True - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh + actor_type: gaussian_learning + # Whether to use linear decay of learning rate + linear_lr_decay: True + # Configuration of Actor network + actor: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Actor network + lr: 0.0003 + # Configuration of Critic network + critic: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Critic network + lr: 0.0003 ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: # Reward discounted factor diff --git a/omnisafe/configs/on-policy/PPOLagSaute.yaml b/omnisafe/configs/on-policy/PPOLagSaute.yaml index 1cc769c86..e1b2b7362 100644 --- a/omnisafe/configs/on-policy/PPOLagSaute.yaml +++ b/omnisafe/configs/on-policy/PPOLagSaute.yaml @@ -59,6 +59,14 @@ defaults: batch_size: 10000 # The clip range for PPO loss clip: 0.2 + # The number of parallel environments + num_envs: 32 + # Whether to use standardized reward + reward_normalize: False + # Whether to use standardized cost + cost_normalize: True + # Whether to use standardized obs + obs_normalize: True # ---------------------------------------Optional Configuration-------------------------------- # ## -----------------------------------Configuration For Cost Critic--------------------------- ## @@ -70,6 +78,8 @@ defaults: linear_lr_decay: True # Whether to use exploration noise anneal exploration_noise_anneal: False + # std + std: [0.5, 0.1] # The coefficient of reward penalty penalty_param: 0.0 # Whether to use KL early stopping @@ -86,38 +96,28 @@ defaults: critic_norm_coeff: 0.001 ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". weight_initialization_mode: "kaiming_uniform" # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # Output activation function - output_activation: identity - # Whether to scale action. - scale_action: False - # Whether to clip action. 
- clip_action: False - # Whther to learn the standard deviation of Gaussian noise - std_learning: True - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh + actor_type: gaussian_learning + # Whether to use linear decay of learning rate + linear_lr_decay: True + # Configuration of Actor network + actor: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Actor network + lr: 0.0003 + # Configuration of Critic network + critic: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Critic network + lr: 0.0003 ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: # Reward discounted factor @@ -163,6 +163,6 @@ defaults: # Whether to use standardized obs normalized_obs: True # The maximum length of record queue - max_len: 100 + max_ep_len: 1000 # The number of threads used to sample data num_threads: 20 diff --git a/omnisafe/configs/on-policy/PPOSaute.yaml b/omnisafe/configs/on-policy/PPOSaute.yaml index 73d1d4dec..ebfb57d4c 100644 --- a/omnisafe/configs/on-policy/PPOSaute.yaml +++ b/omnisafe/configs/on-policy/PPOSaute.yaml @@ -59,6 +59,14 @@ defaults: batch_size: 10000 # The clip range for PPO loss clip: 0.2 + # The number of parallel environments + num_envs: 32 + # Whether to use standardized reward + reward_normalize: False + # Whether to use standardized cost + cost_normalize: True + # Whether to use standardized obs + obs_normalize: True # ---------------------------------------Optional Configuration-------------------------------- # ## -----------------------------------Configuration For Cost Critic--------------------------- ## @@ -70,6 +78,8 @@ defaults: linear_lr_decay: True # Whether to use exploration noise anneal exploration_noise_anneal: False + # std + std: [0.5, 0.1] # The coefficient of reward penalty penalty_param: 0.0 # Whether to use KL early stopping @@ -84,38 +94,28 @@ defaults: critic_norm_coeff: 0.001 ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". weight_initialization_mode: "kaiming_uniform" # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # Output activation function - output_activation: identity - # Whether to scale action. - scale_action: False - # Whether to clip action. 
- clip_action: False - # Whther to learn the standard deviation of Gaussian noise - std_learning: True - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh + actor_type: gaussian_learning + # Whether to use linear decay of learning rate + linear_lr_decay: True + # Configuration of Actor network + actor: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Actor network + lr: 0.0003 + # Configuration of Critic network + critic: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Critic network + lr: 0.0003 ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: # Reward discounted factor @@ -151,6 +151,6 @@ defaults: # Whether to use standardized obs normalized_obs: True # The maximum length of record queue - max_len: 100 + max_ep_len: 1000 # The number of threads used to sample data num_threads: 20 diff --git a/omnisafe/configs/on-policy/PolicyGradient.yaml b/omnisafe/configs/on-policy/PolicyGradient.yaml index 8c1c9e6ab..b7b83b878 100644 --- a/omnisafe/configs/on-policy/PolicyGradient.yaml +++ b/omnisafe/configs/on-policy/PolicyGradient.yaml @@ -46,10 +46,6 @@ defaults: max_ep_len: 1000 # The size of mini batch num_mini_batches: 64 - # The learning rate of Actor network - actor_lr: 0.0003 - # The learning rate of Critic network - critic_lr: 0.0003 # The Address for saving training process data data_dir: "./runs" ## ---------------------------Basic configurations for derived class PPO---------------------- ## @@ -57,6 +53,14 @@ defaults: target_kl: 0.02 # The size of batch for policy update batch_size: 10000 + # The number of parallel environments + num_envs: 32 + # Whether to use standardized reward + reward_normalize: True + # Whether to use standardized cost + cost_normalize: True + # Whether to use standardized obs + obs_normalize: True # ---------------------------------------Optional Configuration-------------------------------- # ## -----------------------------------Configuration For Cost Critic--------------------------- ## @@ -64,10 +68,10 @@ defaults: use_cost: False # Cost discounted factor cost_gamma: 1.0 - # Whether to use linear decay of learning rate - linear_lr_decay: True # Whether to use exploration noise anneal exploration_noise_anneal: False + # std + std: [0.5, 0.1] # The coefficient of reward penalty penalty_param: 0.0 # Whether to use KL early stopping @@ -82,38 +86,28 @@ defaults: critic_norm_coeff: 0.001 ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
weight_initialization_mode: "kaiming_uniform" # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # Output activation function - output_activation: identity - # Whether to scale action. - scale_action: False - # Whether to clip action. - clip_action: False - # Whther to learn the standard deviation of Gaussian noise - std_learning: True - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh + actor_type: gaussian_learning + # Whether to use linear decay of learning rate + linear_lr_decay: True + # Configuration of Actor network + actor: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Actor network + lr: 0.0003 + # Configuration of Critic network + critic: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Critic network + lr: 0.0003 ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: # Reward discounted factor diff --git a/omnisafe/configs/on-policy/RCPO.yaml b/omnisafe/configs/on-policy/RCPO.yaml index 7b331f4ff..7fb671f6a 100644 --- a/omnisafe/configs/on-policy/RCPO.yaml +++ b/omnisafe/configs/on-policy/RCPO.yaml @@ -63,6 +63,14 @@ defaults: cg_iters: 10 # Subsampled observation fvp_obs: None + # The number of parallel environments + num_envs: 32 + # Whether to use standardized reward + reward_normalize: True + # Whether to use standardized cost + cost_normalize: True + # Whether to use standardized obs + obs_normalize: True # ---------------------------------------Optional Configuration-------------------------------- # ## -----------------------------------Configuration For Cost Critic--------------------------- ## @@ -74,6 +82,8 @@ defaults: linear_lr_decay: False # Whether to use exploration noise anneal exploration_noise_anneal: False + # std + std: [0.5, 0.1] # The coefficient of reward penalty penalty_param: 0.0 # Whether to use KL early stopping @@ -90,38 +100,28 @@ defaults: critic_norm_coeff: 0.001 ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". weight_initialization_mode: "kaiming_uniform" # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # Output activation function - output_activation: identity - # Whether to scale action. - scale_action: False - # Whether to clip action. 
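The rewritten `model_cfgs` block above nests the per-network settings, so `hidden_sizes`, `activation` and the learning rate now live under `actor:` and `critic:` instead of the old `ac_kwargs.pi`/`ac_kwargs.val` layout, and `actor_lr`/`critic_lr` disappear from the training section. A minimal sketch of how such a nested block could be consumed; the `build_mlp` helper and the observation/action sizes are placeholders for illustration, not OmniSafe's actual actor/critic builder:

# Illustrative only: turning the nested model_cfgs block above into networks
# and per-network optimizers. build_mlp and the 60/12 sizes are placeholders.
import torch
import torch.nn as nn

model_cfgs = {  # mirrors the YAML block above
    'actor_type': 'gaussian_learning',
    'linear_lr_decay': True,
    'actor': {'hidden_sizes': [64, 64], 'activation': 'tanh', 'lr': 0.0003},
    'critic': {'hidden_sizes': [64, 64], 'activation': 'tanh', 'lr': 0.0003},
}

ACTIVATIONS = {'tanh': nn.Tanh, 'relu': nn.ReLU}

def build_mlp(in_dim, out_dim, hidden_sizes, activation):
    """Stack Linear + activation layers following hidden_sizes."""
    layers, last = [], in_dim
    for size in hidden_sizes:
        layers += [nn.Linear(last, size), ACTIVATIONS[activation]()]
        last = size
    return nn.Sequential(*layers, nn.Linear(last, out_dim))

obs_dim, act_dim = 60, 12  # placeholder dimensions
actor = build_mlp(obs_dim, act_dim, model_cfgs['actor']['hidden_sizes'], model_cfgs['actor']['activation'])
critic = build_mlp(obs_dim, 1, model_cfgs['critic']['hidden_sizes'], model_cfgs['critic']['activation'])
actor_optimizer = torch.optim.Adam(actor.parameters(), lr=model_cfgs['actor']['lr'])
critic_optimizer = torch.optim.Adam(critic.parameters(), lr=model_cfgs['critic']['lr'])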
- clip_action: False - # Whther to learn the standard deviation of Gaussian noise - std_learning: True - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh + actor_type: gaussian_learning + # Whether to use linear decay of learning rate + linear_lr_decay: True + # Configuration of Actor network + actor: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Actor network + lr: 0.0003 + # Configuration of Critic network + critic: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Critic network + lr: 0.0003 ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: # Reward discounted factor diff --git a/omnisafe/configs/on-policy/TRPO.yaml b/omnisafe/configs/on-policy/TRPO.yaml index 71dd6a162..1f359b864 100644 --- a/omnisafe/configs/on-policy/TRPO.yaml +++ b/omnisafe/configs/on-policy/TRPO.yaml @@ -63,6 +63,14 @@ defaults: cg_iters: 10 # Subsampled observation fvp_obs: None + # The number of parallel environments + num_envs: 32 + # Whether to use standardized reward + reward_normalize: True + # Whether to use standardized cost + cost_normalize: True + # Whether to use standardized obs + obs_normalize: True # ---------------------------------------Optional Configuration-------------------------------- # ## -----------------------------------Configuration For Cost Critic--------------------------- ## @@ -74,6 +82,8 @@ defaults: linear_lr_decay: False # Whether to use exploration noise anneal exploration_noise_anneal: False + # std + std: [0.5, 0.1] # The coefficient of reward penalty penalty_param: 0.0 # Whether to use KL early stopping @@ -90,38 +100,28 @@ defaults: critic_norm_coeff: 0.001 ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". weight_initialization_mode: "kaiming_uniform" # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # Output activation function - output_activation: identity - # Whether to scale action. - scale_action: False - # Whether to clip action. 
- clip_action: False - # Whther to learn the standard deviation of Gaussian noise - std_learning: True - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh + actor_type: gaussian_learning + # Whether to use linear decay of learning rate + linear_lr_decay: True + # Configuration of Actor network + actor: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Actor network + lr: 0.0003 + # Configuration of Critic network + critic: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Critic network + lr: 0.0003 ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: # Reward discounted factor diff --git a/omnisafe/configs/on-policy/TRPOLag.yaml b/omnisafe/configs/on-policy/TRPOLag.yaml index 7b331f4ff..7fb671f6a 100644 --- a/omnisafe/configs/on-policy/TRPOLag.yaml +++ b/omnisafe/configs/on-policy/TRPOLag.yaml @@ -63,6 +63,14 @@ defaults: cg_iters: 10 # Subsampled observation fvp_obs: None + # The number of parallel environments + num_envs: 32 + # Whether to use standardized reward + reward_normalize: True + # Whether to use standardized cost + cost_normalize: True + # Whether to use standardized obs + obs_normalize: True # ---------------------------------------Optional Configuration-------------------------------- # ## -----------------------------------Configuration For Cost Critic--------------------------- ## @@ -74,6 +82,8 @@ defaults: linear_lr_decay: False # Whether to use exploration noise anneal exploration_noise_anneal: False + # std + std: [0.5, 0.1] # The coefficient of reward penalty penalty_param: 0.0 # Whether to use KL early stopping @@ -90,38 +100,28 @@ defaults: critic_norm_coeff: 0.001 ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". weight_initialization_mode: "kaiming_uniform" # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # Output activation function - output_activation: identity - # Whether to scale action. - scale_action: False - # Whether to clip action. 
- clip_action: False - # Whther to learn the standard deviation of Gaussian noise - std_learning: True - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh + actor_type: gaussian_learning + # Whether to use linear decay of learning rate + linear_lr_decay: True + # Configuration of Actor network + actor: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Actor network + lr: 0.0003 + # Configuration of Critic network + critic: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Critic network + lr: 0.0003 ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: # Reward discounted factor diff --git a/omnisafe/configs/on-policy/TRPOPid.yaml b/omnisafe/configs/on-policy/TRPOPid.yaml index 98778f0f7..ee26aa807 100644 --- a/omnisafe/configs/on-policy/TRPOPid.yaml +++ b/omnisafe/configs/on-policy/TRPOPid.yaml @@ -63,6 +63,14 @@ defaults: cg_iters: 10 # Subsampled observation fvp_obs: None + # The number of parallel environments + num_envs: 32 + # Whether to use standardized reward + reward_normalize: True + # Whether to use standardized cost + cost_normalize: True + # Whether to use standardized obs + obs_normalize: True # ---------------------------------------Optional Configuration-------------------------------- # ## -----------------------------------Configuration For Cost Critic--------------------------- ## @@ -74,6 +82,8 @@ defaults: linear_lr_decay: False # Whether to use exploration noise anneal exploration_noise_anneal: False + # std + std: [0.5, 0.1] # The coefficient of reward penalty penalty_param: 0.0 # Whether to use KL early stopping @@ -90,38 +100,28 @@ defaults: critic_norm_coeff: 0.001 ## ---------------------------------------Configuration For Model----------------------------- ## model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". weight_initialization_mode: "kaiming_uniform" # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # Output activation function - output_activation: identity - # Whether to scale action. - scale_action: False - # Whether to clip action. 
- clip_action: False - # Whther to learn the standard deviation of Gaussian noise - std_learning: True - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh + actor_type: gaussian_learning + # Whether to use linear decay of learning rate + linear_lr_decay: True + # Configuration of Actor network + actor: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Actor network + lr: 0.0003 + # Configuration of Critic network + critic: + # Size of hidden layers + hidden_sizes: [64, 64] + # Activation function + activation: tanh + # The learning rate of Critic network + lr: 0.0003 ## --------------------------------------Configuration For Buffer----------------------------- ## buffer_cfgs: # Reward discounted factor @@ -152,7 +152,7 @@ defaults: max_len: 100 # The number of threads used to sample data num_threads: 20 - ## --------------------------------------Configuration For PID--------------------------------- ## +## --------------------------------------Configuration For PID--------------------------------- ## PID_cfgs: # KP for PID pid_kp: 0.01 diff --git a/omnisafe/algorithms/model_based/models/__init__.py b/omnisafe/envs/__init__.py similarity index 77% rename from omnisafe/algorithms/model_based/models/__init__.py rename to omnisafe/envs/__init__.py index 13e8f052f..eb2348aee 100644 --- a/omnisafe/algorithms/model_based/models/__init__.py +++ b/omnisafe/envs/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""The model-based dynamics model.""" +"""Environment api for omnisafe.""" -from omnisafe.algorithms.model_based.models.dynamic_model import EnsembleDynamicsModel -from omnisafe.algorithms.model_based.models.virtual_env import VirtualEnv +from omnisafe.envs.core import CMDP, env_register, make, support_envs +from omnisafe.envs.safety_gymnasium_env import SafetyGymnasiumEnv diff --git a/omnisafe/envs/core.py b/omnisafe/envs/core.py new file mode 100644 index 000000000..cd92b3205 --- /dev/null +++ b/omnisafe/envs/core.py @@ -0,0 +1,336 @@ +# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""The core module of the environment.""" + + +import inspect +from abc import ABC, abstractmethod +from typing import Any, Dict, List, Optional, Tuple, Type + +import torch + +from omnisafe.typing import OmnisafeSpace + + +class CMDP(ABC): + """The core class of the environment. + + The CMDP class is the core class of the environment. It defines the basic + interface of the environment. 
The environment should inherit from this class + and implement the abstract methods. + + Attributes: + _support_envs (List[str]): the supported environments. + _action_space (OmnisafeSpace): the action space of the environment. + _observation_space (OmnisafeSpace): the observation space of the environment. + _num_envs (int): the parallel environments, for env that not support parallel, num_envs should be 1 + _time_limit (Optional[int]): the time limit of the environment, if None, the environment is infinite. + """ + + _support_envs: List[str] + _action_space: OmnisafeSpace + _observation_space: OmnisafeSpace + + _num_envs: int + _time_limit: Optional[int] = None + need_time_limit_wrapper: bool + need_auto_reset_wrapper: bool + + @classmethod + def support_envs(cls) -> List[str]: + """The supported environments. + + Returns: + List[str]: the supported environments. + """ + return cls._support_envs + + @abstractmethod + def __init__(self, env_id: str, **kwargs) -> None: + """Initialize the environment. + + Args: + env_id (str): the environment id. + """ + assert ( + env_id in self.support_envs() + ), f'env_id {env_id} is not supported by {self.__class__.__name__}' + + @property + def action_space(self) -> OmnisafeSpace: + """The action space of the environment. + + Returns: + OmnisafeSpace: the action space. + """ + return self._action_space + + @property + def observation_space(self) -> OmnisafeSpace: + """The observation space of the environment. + + Returns: + OmnisafeSpace: the observation space. + """ + return self._observation_space + + @property + def num_envs(self) -> int: + """The parallel environments. + + Returns: + int: the parallel environments. + """ + return self._num_envs + + @property + def time_limit(self) -> Optional[int]: + """The time limit of the environment. + + Returns: + Optional[int]: the time limit of the environment. + """ + return self._time_limit + + @abstractmethod + def step( + self, action: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Dict]: + """Run one timestep of the environment's dynamics using the agent actions. + + Args: + action (torch.Tensor): action. + + Returns: + observation (torch.Tensor): agent's observation of the current environment. + reward (torch.Tensor): amount of reward returned after previous action. + cost (torch.Tensor): amount of cost returned after previous action. + terminated (torch.Tensor): whether the episode has ended, in which case further step() + calls will return undefined results. + truncated (torch.Tensor): whether the episode has been truncated due to a time limit. + info (Dict): contains auxiliary diagnostic information (helpful for debugging, and sometimes learning). + """ + + @abstractmethod + def reset(self, seed: Optional[int] = None) -> Tuple[torch.Tensor, Dict]: + """Resets the environment and returns an initial observation. + + Args: + seed (Optional[int]): seed for the environment. + + Returns: + observation (torch.Tensor): the initial observation of the space. + info (Dict): contains auxiliary diagnostic information (helpful for debugging, and sometimes learning). + """ + + @abstractmethod + def single_reset(self, idx: int, seed: Optional[int] = None) -> Tuple[torch.Tensor, Dict]: + """For parallel env, reset one of the env and returns an initial observation, + if env not support parallel, should be same as reset. + + Args: + seed (Optional[int]): seed for the environment. + + Returns: + observation (torch.Tensor): the initial observation of the space. 
+ info (Dict): contains auxiliary diagnostic information (helpful for debugging, and sometimes learning). + """ + + @abstractmethod + def set_seed(self, seed: int) -> None: + """Sets the seed for this env's random number generator(s). + + Args: + seed (int): the seed to use. + """ + + @abstractmethod + def sample_action(self) -> torch.Tensor: + """Sample an action from the action space. + + Returns: + torch.Tensor: the sampled action. + """ + + @abstractmethod + def render(self) -> Any: + """Compute the render frames as specified by :attr:`render_mode` during the initialization of the environment. + + Returns: + Any: the render frames, we recommend to use `np.ndarray` which could construct video by moviepy. + """ + + @abstractmethod + def close(self) -> None: + """Close the environment.""" + + +class Wrapper(CMDP): + """The wrapper class of the environment. + + The Wrapper class is the wrapper class of the environment. It defines the basic + interface of the environment wrapper. The environment wrapper should inherit + from this class and implement the abstract methods. + + Attributes: + _env (CMDP): the environment. + + """ + + def __init__(self, env: CMDP) -> None: + """Initialize the wrapper. + + Args: + env (CMDP): the environment. + """ + self._env = env + + def __getattr__(self, name: str) -> Any: + """Get the attribute of the environment. + + Args: + name (str): the attribute name. + + Returns: + Any: the attribute. + """ + if name.startswith('_'): + raise AttributeError(f'attempted to get missing private attribute {name}') + return getattr(self._env, name) + + def step( + self, action: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Dict]: + return self._env.step(action) + + def reset(self, seed: Optional[int] = None) -> Tuple[torch.Tensor, Dict]: + return self._env.reset(seed) + + def single_reset(self, idx: int, seed: Optional[int] = None) -> Tuple[torch.Tensor, Dict]: + return self._env.single_reset(idx, seed) + + def set_seed(self, seed: int) -> None: + self._env.set_seed(seed) + + def sample_action(self) -> torch.Tensor: + return self._env.sample_action() + + def render(self) -> Any: + return self._env.render() + + def close(self) -> None: + self._env.close() + + +class EnvRegister: + """The environment register. + + The EnvRegister is used to register the environment class. It provides the + method to get the environment class by the environment id. + + """ + + def __init__(self) -> None: + self._class: Dict[str, Type[CMDP]] = {} + self._support_envs: Dict[str, List[str]] = {} + + def _register(self, env_class: Type[CMDP]) -> None: + """Register the environment class. + + Args: + env_class (Type[CMDP]): the environment class. + """ + if not inspect.isclass(env_class): + raise TypeError(f'{env_class} must be a class') + class_name = env_class.__name__ + if not issubclass(env_class, CMDP): + raise TypeError(f'{class_name} must be subclass of CMDP') + if class_name in self._class: + raise ValueError(f'{class_name} has been registered') + env_ids = env_class.support_envs() + self._class[class_name] = env_class + self._support_envs[class_name] = env_ids + + def register(self, env_class: Type[CMDP]) -> Type[CMDP]: + """Register the environment class. + + Args: + env_class (Type[CMDP]): the environment class. + + Returns: + Type[CMDP]: the environment class. + """ + self._register(env_class) + return env_class + + def get_class(self, env_id: str, class_name: Optional[str]) -> Type[CMDP]: + """Get the environment class. 
+ + Args: + env_id (str): the environment id. + class_name (Optional[str]): the environment class name. + + Returns: + Type[CMDP]: the environment class. + """ + if class_name is not None: + assert class_name in self._class, f'{class_name} is not registered' + assert ( + env_id in self._support_envs[class_name] + ), f'{env_id} is not supported by {class_name}' + return self._class[class_name] + + for cls_name, env_ids in self._support_envs.items(): + if env_id in env_ids: + return self._class[cls_name] + raise ValueError(f'{env_id} is not supported by any environment class') + + def support_envs(self) -> List[str]: + """The supported environments. + + Returns: + List[str]: the supported environments. + """ + return list({env_id for env_ids in self._support_envs.values() for env_id in env_ids}) + + +ENV_REGISTRY = EnvRegister() + +env_register = ENV_REGISTRY.register +support_envs = ENV_REGISTRY.support_envs + + +def make(env_id: str, class_name: Optional[str] = None, **kwargs) -> CMDP: + """Create an environment. + + Args: + env_id (str): the environment id. + class_name (Optional[str]): the environment class name. + **kwargs: the keyword arguments for the environment initialization. + + Returns: + CMDP: the environment. + """ + env_class = ENV_REGISTRY.get_class(env_id, class_name) + return env_class(env_id, **kwargs) + + +__all__ = [ + 'CMDP', + 'Wrapper', + 'env_register', + 'support_envs', + 'make', +] diff --git a/omnisafe/envs/safety_gymnasium_env.py b/omnisafe/envs/safety_gymnasium_env.py new file mode 100644 index 000000000..f35e13afd --- /dev/null +++ b/omnisafe/envs/safety_gymnasium_env.py @@ -0,0 +1,117 @@ +# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
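Any environment backend is exposed to the rest of OmniSafe by subclassing `CMDP` and registering it: `env_register` records the class together with its `support_envs()`, and `make` then resolves an environment id to the registered class. A minimal sketch of that flow with a toy environment; `MyGridEnv`, its id and its spaces are invented here purely for illustration:

# Illustrative only: a toy CMDP subclass going through env_register and make.
# 'MyGrid-v0', MyGridEnv and its spaces are invented for this sketch.
from typing import Any, Dict, Optional, Tuple

import torch
from gymnasium import spaces

from omnisafe.envs.core import CMDP, env_register, make


@env_register
class MyGridEnv(CMDP):
    _support_envs = ['MyGrid-v0']
    need_auto_reset_wrapper = True
    need_time_limit_wrapper = True

    def __init__(self, env_id: str, **kwargs) -> None:
        super().__init__(env_id)
        self._observation_space = spaces.Box(low=-1.0, high=1.0, shape=(4,))
        self._action_space = spaces.Box(low=-1.0, high=1.0, shape=(2,))
        self._num_envs = 1

    def step(self, action: torch.Tensor):
        # A trivial transition: zero observation, unit reward, zero cost.
        obs = torch.zeros(4)
        reward, cost = torch.tensor(1.0), torch.tensor(0.0)
        terminated, truncated = torch.tensor(False), torch.tensor(False)
        return obs, reward, cost, terminated, truncated, {}

    def reset(self, seed: Optional[int] = None) -> Tuple[torch.Tensor, Dict]:
        return torch.zeros(4), {}

    def single_reset(self, idx: int, seed: Optional[int] = None) -> Tuple[torch.Tensor, Dict]:
        # Single (non-vectorized) env, so this is the same as reset.
        return self.reset(seed)

    def set_seed(self, seed: int) -> None:
        torch.manual_seed(seed)

    def sample_action(self) -> torch.Tensor:
        return torch.as_tensor(self._action_space.sample(), dtype=torch.float32)

    def render(self) -> Any:
        return None

    def close(self) -> None:
        pass


env = make('MyGrid-v0')  # resolved through ENV_REGISTRY, no class_name needed
obs, info = env.reset(seed=0)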
+# ============================================================================== +"""Environments in the Safety Gymnasium.""" + + +from typing import Any, Dict, Optional, Tuple + +import safety_gymnasium +import torch + +from omnisafe.envs.core import CMDP, env_register + + +@env_register +class SafetyGymnasiumEnv(CMDP): + """Safety Gymnasium environment.""" + + _support_envs = [ + 'SafetyPointGoal0-v0', + 'SafetyPointGoal1-v0', + 'SafetyPointGoal2-v0', + 'SafetyPointButton0-v0', + 'SafetyPointButton1-v0', + 'SafetyPointButton2-v0', + 'SafetyPointPush0-v0', + 'SafetyPointPush1-v0', + 'SafetyPointPush2-v0', + 'SafetyPointCircle0-v0', + 'SafetyPointCircle1-v0', + 'SafetyPointCircle2-v0', + 'SafetyCarGoal0-v0', + 'SafetyCarGoal1-v0', + 'SafetyCarGoal2-v0', + 'SafetyCarButton0-v0', + 'SafetyCarButton1-v0', + 'SafetyCarButton2-v0', + 'SafetyCarPush0-v0', + 'SafetyCarPush1-v0', + 'SafetyCarPush2-v0', + 'SafetyCarCircle0-v0', + 'SafetyCarCircle1-v0', + 'SafetyCarCircle2-v0', + 'SafetyAntGoal0-v0', + 'SafetyAntGoal1-v0', + 'SafetyAntGoal2-v0', + 'SafetyAntButton0-v0', + 'SafetyAntButton1-v0', + 'SafetyAntButton2-v0', + 'SafetyAntPush0-v0', + 'SafetyAntPush1-v0', + 'SafetyAntPush2-v0', + 'SafetyAntCircle0-v0', + 'SafetyAntCircle1-v0', + 'SafetyAntCircle2-v0', + 'SafetyHalfCheetahVelocity-v4', + 'SafetyHopperVelocity-v4', + 'SafetySwimmerVelocity-v4', + 'SafetyWalker2dVelocity-v4', + 'SafetyAntVelocity-v4', + 'SafetyHumanoidVelocity-v4', + ] + need_auto_reset_wrapper = False + need_time_limit_wrapper = False + + def __init__(self, env_id: str, num_envs: int = 1, **kwargs) -> None: + if num_envs > 1: + self._env = safety_gymnasium.vector.make(env_id=env_id, num_envs=num_envs, **kwargs) + self._action_space = self._env.single_action_space + self._observation_space = self._env.single_observation_space + else: + self._env = safety_gymnasium.make(id=env_id, autoreset=True, **kwargs) + self._action_space = self._env.action_space + self._observation_space = self._env.observation_space + + self._num_envs = num_envs + + def step( + self, action: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Dict]: + obs, reward, cost, terminated, truncated, info = self._env.step(action) + obs, reward, cost, terminated, truncated = map( + lambda x: torch.as_tensor(x, dtype=torch.float32), + (obs, reward, cost, terminated, truncated), + ) + return obs, reward, cost, terminated, truncated, info + + def reset(self, seed: Optional[int] = None) -> Tuple[torch.Tensor, Dict]: + obs, info = self._env.reset(seed=seed) + return torch.as_tensor(obs, dtype=torch.float32), info + + def single_reset(self, idx: int, seed: Optional[int] = None) -> Tuple[torch.Tensor, Dict]: + obs, info = self.reset(seed=seed) + return obs[idx], info + + def set_seed(self, seed: int) -> None: + self.reset(seed=seed) + + def sample_action(self) -> torch.Tensor: + return torch.as_tensor(self._env.action_space.sample(), torch.float32) + + def render(self) -> Any: + return self._env.render() + + def close(self) -> None: + self._env.close() diff --git a/omnisafe/envs/wrapper.py b/omnisafe/envs/wrapper.py new file mode 100644 index 000000000..632819020 --- /dev/null +++ b/omnisafe/envs/wrapper.py @@ -0,0 +1,288 @@ +# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
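A registered environment such as the Safety Gymnasium tasks above is then obtained through `make` and driven entirely with tensors; `step` returns the `(obs, reward, cost, terminated, truncated, info)` tuple required by `CMDP`, and the wrappers defined below (`ObsNormalize`, `RewardNormalize`, `CostNormalize`, ...) compose around the same object, mirroring the new `obs_normalize`/`reward_normalize`/`cost_normalize` config flags. A short usage sketch; the task id and step count are arbitrary choices and `safety_gymnasium` must be installed:

# Usage sketch: drive a registered Safety-Gymnasium task through the CMDP API.
from omnisafe.envs.core import make

env = make('SafetyPointGoal1-v0', num_envs=1)
obs, info = env.reset(seed=0)
ep_ret, ep_cost = 0.0, 0.0
for _ in range(8):  # a handful of steps, just to show the interface
    action = env.sample_action()
    obs, reward, cost, terminated, truncated, info = env.step(action)
    ep_ret += float(reward)
    ep_cost += float(cost)
env.close()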
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Wrapper for the environment.""" + + +from typing import Dict, Optional, Tuple, Union + +import numpy as np +import torch +from gymnasium import spaces + +from omnisafe.common import Normalizer +from omnisafe.envs.core import CMDP, Wrapper + + +class TimeLimit(Wrapper): + """Time limit wrapper for the environment. + + Example: + >>> env = TimeLimit(env, time_limit=100) + """ + + def __init__(self, env: CMDP, time_limit: int) -> None: + """Initialize the time limit wrapper. + + Args: + env (CMDP): The environment to wrap. + time_limit (int): The time limit for each episode. + """ + super().__init__(env) + self._time_limit: int = time_limit + self._time: Union[int, np.ndarray] = ( + 0 if self.num_envs == 1 else np.array([0] * self.num_envs) + ) + + def reset(self, seed: Optional[int] = None) -> Tuple[torch.Tensor, Dict]: + self._time = 0 if self.num_envs == 1 else np.array([0] * self.num_envs) + return super().reset(seed) + + def single_reset(self, idx: int, seed: Optional[int] = None) -> Tuple[torch.Tensor, Dict]: + if isinstance(self._time, np.ndarray): + self._time[idx] = 0 + else: + self._time = 0 + return super().single_reset(idx, seed) + + def step( + self, action: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Dict]: + obs, reward, cost, terminated, truncated, info = super().step(action) + + self._time += 1 + truncated = torch.tensor(self._time >= self._time_limit, dtype=torch.bool) + + return obs, reward, cost, terminated, truncated, info + + +class AutoReset(Wrapper): + """Auto reset the environment when the episode is terminated. + + Example: + >>> env = AutoReset(env) + + """ + + def step( + self, action: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Dict]: + obs, reward, cost, terminated, truncated, info = super().step(action) + + if self.num_envs == 1: + if terminated or truncated: + obs, _ = self.reset() + else: + dones = terminated | truncated + for idx, done in enumerate(dones): + if done: + obs[idx], _ = self.single_reset(idx) + + return obs, reward, cost, terminated, truncated, info + + +class ObsNormalize(Wrapper): + """Normalize the observation. 
+
+    Example:
+        >>> env = ObsNormalize(env)
+
+        >>> norm = Normalizer(env.observation_space.shape)  # load saved normalizer
+        >>> env = ObsNormalize(env, norm)
+
+    """
+
+    def __init__(self, env: CMDP, norm: Optional[Normalizer] = None) -> None:
+        super().__init__(env)
+        assert isinstance(self.observation_space, spaces.Box), 'Observation space must be Box'
+
+        if norm is not None:
+            self._obs_normalizer = norm
+        else:
+            self._obs_normalizer = Normalizer(self.observation_space.shape, clip=5)
+
+    def step(
+        self, action: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Dict]:
+        obs, reward, cost, terminated, truncated, info = super().step(action)
+        info['original_obs'] = obs
+        obs = self._obs_normalizer.normalize(obs)
+        return obs, reward, cost, terminated, truncated, info
+
+    def reset(self, seed: Optional[int] = None) -> Tuple[torch.Tensor, Dict]:
+        obs, info = super().reset(seed)
+        info['original_obs'] = obs
+        obs = self._obs_normalizer.normalize(obs)
+        return obs, info
+
+    def single_reset(self, idx: int, seed: Optional[int] = None) -> Tuple[torch.Tensor, Dict]:
+        obs, info = super().single_reset(idx, seed)
+        info['original_obs'] = obs
+        obs = self._obs_normalizer.normalize(obs.unsqueeze(0)).squeeze(0)
+        return obs, info
+
+
+class RewardNormalize(Wrapper):
+    """Normalize the reward.
+
+    Example:
+        >>> env = RewardNormalize(env)
+
+        >>> norm = Normalizer(())  # load saved normalizer
+        >>> env = RewardNormalize(env, norm)
+
+    """
+
+    def __init__(self, env: CMDP, norm: Optional[Normalizer] = None) -> None:
+        """Initialize the reward normalizer.
+
+        Args:
+            env (CMDP): The environment to wrap.
+            norm (Optional[Normalizer], optional): The normalizer to use. Defaults to None.
+
+        """
+        super().__init__(env)
+        if norm is not None:
+            self._reward_normalizer = norm
+        else:
+            self._reward_normalizer = Normalizer((), clip=5)
+
+    def step(
+        self, action: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Dict]:
+        obs, reward, cost, terminated, truncated, info = super().step(action)
+        info['original_reward'] = reward
+        reward = self._reward_normalizer.normalize(reward)
+        return obs, reward, cost, terminated, truncated, info
+
+
+class CostNormalize(Wrapper):
+    """Normalize the cost.
+
+    Example:
+        >>> env = CostNormalize(env)
+
+        >>> norm = Normalizer(())  # load saved normalizer
+        >>> env = CostNormalize(env, norm)
+    """
+
+    def __init__(self, env: CMDP, norm: Optional[Normalizer] = None) -> None:
+        """Initialize the cost normalizer.
+
+        Args:
+            env (CMDP): The environment to wrap.
+            norm (Normalizer, optional): The normalizer to use. Defaults to None.
+        """
+        super().__init__(env)
+        if norm is not None:
+            self._cost_normalizer = norm
+        else:
+            self._cost_normalizer = Normalizer((), clip=5)
+
+    def step(
+        self, action: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Dict]:
+        obs, reward, cost, terminated, truncated, info = super().step(action)
+        info['original_cost'] = cost
+        cost = self._cost_normalizer.normalize(cost)
+        return obs, reward, cost, terminated, truncated, info
+
+
+class ActionScale(Wrapper):
+    """Scale the action space to a given range.
+
+    Example:
+        >>> env = ActionScale(env, low=-1, high=1)
+        >>> env.action_space
+        Box(-1.0, 1.0, (1,), float32)
+    """
+
+    def __init__(
+        self,
+        env: CMDP,
+        low: Union[int, float],
+        high: Union[int, float],
+    ) -> None:
+        """Initialize the wrapper.
+
+        Args:
+            env: The environment to wrap.
+ low: The lower bound of the action space. + high: The upper bound of the action space. + """ + super().__init__(env) + assert isinstance(self.action_space, spaces.Box), 'Action space must be Box' + + self._old_min_action = torch.tensor(self.action_space.low, dtype=torch.float32) + self._old_max_action = torch.tensor(self.action_space.high, dtype=torch.float32) + + min_action = np.zeros(self.action_space.shape, dtype=self.action_space.dtype) + low + max_action = np.zeros(self.action_space.shape, dtype=self.action_space.dtype) + high + self._action_space = spaces.Box( + low=min_action, + high=max_action, + shape=self.action_space.shape, + dtype=self.action_space.dtype, # type: ignore + ) + + self._min_action = torch.tensor(min_action, dtype=torch.float32) + self._max_action = torch.tensor(max_action, dtype=torch.float32) + + def step( + self, action: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Dict]: + action = self._old_min_action + (self._old_max_action - self._old_min_action) * ( + action - self._min_action + ) / (self._max_action - self._min_action) + return super().step(action) + + +class Unsqueeze(Wrapper): + """Unsqueeze the observation, reward, cost, terminated, truncated and info. + + Example: + >>> env = Unsqueeze(env) + """ + + def __init__(self, env: CMDP) -> None: + """Initialize the wrapper. + + Args: + env: The environment to wrap. + """ + super().__init__(env) + assert self.num_envs == 1, 'Unsqueeze only works with single environment' + assert isinstance(self.observation_space, spaces.Box), 'Observation space must be Box' + + def step( + self, action: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Dict]: + obs, reward, cost, terminated, truncated, info = super().step(action) + obs, reward, cost, terminated, truncated = map( + lambda x: x.unsqueeze(0), (obs, reward, cost, terminated, truncated) + ) + for k, v in info.items(): + if isinstance(v, torch.Tensor): + info[k] = v.unsqueeze(0) + + return obs, reward, cost, terminated, truncated, info + + def reset(self, seed: Optional[int] = None) -> Tuple[torch.Tensor, Dict]: + obs, info = super().reset(seed) + obs = obs.unsqueeze(0) + for k, v in info.items(): + if isinstance(v, torch.Tensor): + info[k] = v.unsqueeze(0) + + return obs, info diff --git a/omnisafe/evaluator.py b/omnisafe/evaluator.py deleted file mode 100644 index d3f3fc62e..000000000 --- a/omnisafe/evaluator.py +++ /dev/null @@ -1,318 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
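Taken together, the wrappers added in ``omnisafe/envs/wrapper.py`` are meant to be stacked around a ``CMDP``. A minimal sketch of one plausible composition follows; it is illustrative only, since the adapters added elsewhere in this patch do the real wiring, and ``SafetyGymnasiumEnv`` already opts out of ``TimeLimit``/``AutoReset`` via its ``need_*_wrapper`` flags:

    import torch

    from omnisafe.envs.core import make
    from omnisafe.envs.wrapper import ActionScale, ObsNormalize, Unsqueeze

    env = make('SafetyPointGoal1-v0', num_envs=1)
    # TimeLimit(env, time_limit=...) and AutoReset(env) would be applied here for
    # environment classes that set need_time_limit_wrapper / need_auto_reset_wrapper.
    env = ObsNormalize(env)                     # running mean/std on observations
    env = ActionScale(env, low=-1.0, high=1.0)  # expose a [-1, 1] action space
    env = Unsqueeze(env)                        # add a batch dimension when num_envs == 1

    obs, info = env.reset(seed=0)
    action = torch.as_tensor(env.action_space.sample(), dtype=torch.float32)
    obs, reward, cost, terminated, truncated, info = env.step(action)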
-# ============================================================================== -"""Implementation of Evaluator.""" - -import dataclasses -import json -import os - -import numpy as np -import torch -from gymnasium.spaces import Box, Discrete -from gymnasium.utils.save_video import save_video - -from omnisafe.models.actor import ActorBuilder -from omnisafe.utils.config import Config -from omnisafe.wrappers.cmdp_wrapper import CMDPWrapper as EnvWrapper -from omnisafe.wrappers.saute_wrapper import SauteWrapper -from omnisafe.wrappers.simmer_wrapper import SimmerWrapper - - -class Evaluator: # pylint: disable=too-many-instance-attributes - """This class includes common evaluation methods for safe RL algorithms.""" - - # pylint: disable-next=too-many-arguments - def __init__( - self, - env=None, - actor=None, - obs_normalize=None, - play=True, - save_replay=True, - ): - """Initialize the evaluator. - - Args: - env (gymnasium.Env): the environment. if None, the environment will be created from the config. - pi (omnisafe.algos.models.actor.Actor): the policy. if None, the policy will be created from the config. - obs_normalize (omnisafe.algos.models.obs_normalize): the observation Normalize. - """ - # set the attributes - self.env = env - self.actor = actor - self.obs_normalizer = obs_normalize if obs_normalize is not None else lambda x: x - self.env_wrapper_class = type(env) if env is not None else None - - # used when load model from saved file. - self.cfg = None - self.save_dir = None - self.model_name = None - self.algo_name = None - self.model_params = None - - # set the render mode - self.play = play - self.save_replay = save_replay - if play and save_replay: - self.render_mode = 'rgb_array' - elif play and not save_replay: - self.render_mode = 'human' - elif not play and save_replay: - self.render_mode = 'rgb_array_list' - else: - self.render_mode = None - - # pylint: disable-next=too-many-locals - def load_saved_model(self, save_dir: str, model_name: str): - """Load a saved model. - - Args: - save_dir (str): directory where the model is saved. - model_name (str): name of the model. - """ - # load the config - self.save_dir = save_dir - self.model_name = model_name - cfg_path = os.path.join(save_dir, 'config.json') - try: - with open(cfg_path, encoding='utf-8') as file: - self.cfg = json.load(file) - except FileNotFoundError as error: - raise FileNotFoundError( - 'The config file is not found in the save directory.' 
- ) from error - - # load the saved model - model_path = os.path.join(save_dir, 'torch_save', model_name) - try: - self.model_params = torch.load(model_path) - except FileNotFoundError as error: - raise FileNotFoundError('The model is not found in the save directory.') from error - - self.algo_name = self.cfg['exp_name'].split('/')[1] - - # make the environment - env_id = self.cfg['env_id'] - self.env = self._make_env(env_id, render_mode=self.render_mode) - - # make the actor - observation_space = self.env.observation_space - action_space = self.env.action_space - - act_space_type = 'discrete' if isinstance(action_space, Discrete) else 'continuous' - actor_type = self.cfg['model_cfgs']['actor_type'] - if isinstance(action_space, Box): - act_dim = action_space.shape[0] - elif isinstance(action_space, Discrete): - act_dim = action_space.n - else: - raise ValueError - - obs_dim = observation_space.shape[0] - pi_cfg = self.cfg['model_cfgs']['ac_kwargs']['pi'] - weight_initialization_mode = self.cfg['model_cfgs']['weight_initialization_mode'] - actor_builder = ActorBuilder( - obs_dim=obs_dim, - act_dim=act_dim, - hidden_sizes=pi_cfg['hidden_sizes'], - activation=pi_cfg['activation'], - weight_initialization_mode=weight_initialization_mode, - shared=None, - ) - if act_space_type == 'discrete': - self.actor = actor_builder.build_actor('categorical') - else: - act_max = torch.as_tensor(action_space.high) - act_min = torch.as_tensor(action_space.low) - self.actor = actor_builder.build_actor(actor_type, act_max=act_max, act_min=act_min) - self.actor.load_state_dict(self.model_params['pi']) - - # pylint: disable-next=too-many-locals - def evaluate( - self, - num_episodes: int = 10, - cost_criteria: float = 1.0, - ): - """Evaluate the agent for num_episodes episodes. - - Args: - num_episodes (int): number of episodes to evaluate the agent. - cost_criteria (float): the cost criteria for the evaluation. - - Returns: - episode_rewards (list): list of episode rewards. - episode_costs (list): list of episode costs. - episode_lengths (list): list of episode lengths. - """ - if self.env is None or self.actor is None: - raise ValueError( - 'The environment and the policy must be provided or created before evaluating the agent.' 
- ) - - episode_rewards = [] - episode_costs = [] - episode_lengths = [] - horizon = self.env.rollout_data.max_ep_len - - for _ in range(num_episodes): - obs, _ = self.env.reset() - ep_ret, ep_cost = 0.0, 0.0 - - for step in range(horizon): - with torch.no_grad(): - if self.env.obs_normalizer is not None: - obs = self.env.obs_normalizer.normalize(obs) - _, act = self.actor.predict( - torch.as_tensor(obs, dtype=torch.float32), - deterministic=True, - need_log_prob=False, - ) - [obs, rew, cost], done, truncated, _ = self.env.step(act) - ep_ret += rew - ep_cost += (cost_criteria**step) * cost - - if done or truncated: - episode_rewards.append(ep_ret) - episode_costs.append(ep_cost) - episode_lengths.append(step + 1) - break - - print('Evaluation results:') - print(f'Average episode reward: {np.mean(episode_rewards):.3f}') - print(f'Average episode cost: {np.mean(episode_costs):.3f}') - print(f'Average episode length: {np.mean(episode_lengths):.3f}') - return ( - episode_rewards, - episode_costs, - ) - - def render( # pylint: disable=too-many-locals,too-many-arguments,too-many-branches,too-many-statements - self, - num_episode: int = 0, - play=True, - save_replay_path: str = None, - camera_name: str = None, - camera_id: str = None, - width: int = None, - height: int = None, - ): - """Render the environment for one episode. - - Args: - seed (int): seed for the environment. If None, the environment will be reset with a random seed. - save_replay_path (str): path to save the replay. If None, no replay is saved. - """ - - if save_replay_path is None: - save_replay_path = os.path.join(self.save_dir, 'video', self.model_name.split('.')[0]) - - # remake the environment if the render mode can not support needed play or save_replay - if self.env is None or self.actor is None: - raise ValueError( - 'The environment and the policy must be provided or created before evaluating the agent.' - ) - - width = self.env.width if width is None else width - height = self.env.height if height is None else height - env_kwargs = dataclasses.asdict(self.env.render_data) - if env_kwargs.get('render_mode') is None: - print("Remake the environment with render_mode='rgb_array' to render the environment.") - self.env = self._make_env(**env_kwargs) - self.render_mode = 'rgb_array' - - if env_kwargs.get('render_mode') == 'human' and save_replay_path is not None: - print("Remake the environment with render_mode='rgb_array' to save the replay.") - self.env = self._make_env(**env_kwargs) - self.render_mode = 'rgb_array' - - if env_kwargs.get('render_mode') == 'rgb_array_list' and play: - print("Remake the environment with render_mode='rgb_array' to render the environment.") - self.env = self._make_env(**env_kwargs) - self.render_mode = 'rgb_array' - - if env_kwargs.get('camara_id') != camera_id or env_kwargs.get('camera_name') != camera_name: - print("Remake the environment with render_mode='rgb_array' to change the camera.") - env_kwargs['camera_id'] = camera_id - env_kwargs['camera_name'] = camera_name - self.env = self._make_env(**env_kwargs) - self.render_mode = 'rgb_array' - - if env_kwargs.get('height') != height or env_kwargs.get('width') != width: - print( - "Remake the environment with render_mode='rgb_array' to change the camera width or height." 
- ) - self.env = self._make_env(**env_kwargs) - self.render_mode = 'rgb_array' - - horizon = self.env.rollout_data.max_ep_len - frames = [] - obs, _ = self.env.reset() - self.actor.to(self.env.cfgs.device) - if self.render_mode == 'human': - self.env.render() - elif self.render_mode == 'rgb_array': - frames.append(self.env.render()) - if self.env.obs_normalizer is not None: - self.env.obs_normalizer.load_state_dict(self.model_params['obs_normalizer']) - for episode_idx in range(num_episode): - for _ in range(horizon): - with torch.no_grad(): - if self.env.obs_normalizer is not None: - obs = self.env.obs_normalizer.normalize(obs) - _, act = self.actor.predict(obs, deterministic=True) - [obs, _, _], done, truncated, _ = self.env.step(act.cpu().squeeze()) - if done[0] or truncated[0]: - break - if self.render_mode == 'rgb_array': - frames.append(self.env.render()) - - if self.render_mode == 'rgb_array_list': - frames = self.env.render() - if save_replay_path is not None: - save_video( - frames, - save_replay_path, - fps=self.env.env.metadata['render_fps'], - episode_trigger=lambda x: True, - episode_index=episode_idx, - name_prefix='eval', - ) - self.env.reset() - frames = [] - - def _make_env(self, env_id, **env_kwargs): - """Make wrapped environment.""" - env_cfgs = { - 'num_envs': 1, - 'seed': 0, - 'normalized_obs': False, - 'normalized_rew': False, - 'normalized_cost': False, - 'device': 'cpu', - 'num_threads': 20, - 'max_len': 100, - 'async_env': True, - } - env_cfgs = Config(**env_cfgs) - if self.cfg is not None and 'env_cfgs' in self.cfg: - self.cfg['env_cfgs']['device'] = 'cpu' - self.cfg['env_cfgs']['seed'] = 0 - env_cfgs = Config(**self.cfg['env_cfgs']) - - if self.algo_name in ['PPOSimmerPid', 'PPOSimmerQ', 'PPOLagSimmerQ', 'PPOLagSimmerPid']: - return SimmerWrapper(env_id, env_cfgs, **env_kwargs) - if self.algo_name in ['PPOSaute', 'PPOLagSaute']: - return SauteWrapper(env_id, env_cfgs, **env_kwargs) - return EnvWrapper(env_id, env_cfgs, **env_kwargs) diff --git a/omnisafe/models/__init__.py b/omnisafe/models/__init__.py index 6fec5506f..b1b9a049d 100644 --- a/omnisafe/models/__init__.py +++ b/omnisafe/models/__init__.py @@ -15,15 +15,12 @@ """This module contains the model for all methods.""" from omnisafe.models.actor import ActorBuilder -from omnisafe.models.actor.categorical_actor import CategoricalActor -from omnisafe.models.actor.cholesky_actor import MLPCholeskyActor from omnisafe.models.actor.gaussian_actor import GaussianActor -from omnisafe.models.actor.gaussian_stdnet_actor import GaussianStdNetActor -from omnisafe.models.actor_critic import ActorCritic -from omnisafe.models.actor_q_critic import ActorQCritic +from omnisafe.models.actor.gaussian_learning_actor import GaussianLearningActor +from omnisafe.models.actor.gaussian_sac_actor import GaussianSACActor +from omnisafe.models.actor_critic.actor_critic import ActorCritic +from omnisafe.models.actor_critic.constraint_actor_critic import ConstraintActorCritic from omnisafe.models.base import Actor, Critic -from omnisafe.models.constraint_actor_critic import ConstraintActorCritic -from omnisafe.models.constraint_actor_q_critic import ConstraintActorQCritic from omnisafe.models.critic import CriticBuilder from omnisafe.models.critic.q_critic import QCritic from omnisafe.models.critic.v_critic import VCritic diff --git a/omnisafe/models/actor/__init__.py b/omnisafe/models/actor/__init__.py index dbb8b6301..191befa5a 100644 --- a/omnisafe/models/actor/__init__.py +++ b/omnisafe/models/actor/__init__.py @@ -15,7 +15,6 @@ 
"""The abstract interfaces of Actor networks for the Actor-Critic algorithm.""" from omnisafe.models.actor.actor_builder import ActorBuilder -from omnisafe.models.actor.categorical_actor import CategoricalActor -from omnisafe.models.actor.cholesky_actor import MLPCholeskyActor from omnisafe.models.actor.gaussian_actor import GaussianActor -from omnisafe.models.actor.gaussian_stdnet_actor import GaussianStdNetActor +from omnisafe.models.actor.gaussian_learning_actor import GaussianLearningActor +from omnisafe.models.actor.gaussian_sac_actor import GaussianSACActor diff --git a/omnisafe/models/actor/actor_builder.py b/omnisafe/models/actor/actor_builder.py index 13dff87ec..17ea33efd 100644 --- a/omnisafe/models/actor/actor_builder.py +++ b/omnisafe/models/actor/actor_builder.py @@ -15,135 +15,54 @@ """Implementation of ActorBuilder.""" import difflib -from dataclasses import dataclass -from typing import Optional, Union +from typing import List -import torch.nn as nn - -from omnisafe.models.actor.categorical_actor import CategoricalActor -from omnisafe.models.actor.cholesky_actor import MLPCholeskyActor -from omnisafe.models.actor.gaussian_actor import GaussianActor -from omnisafe.models.actor.gaussian_stdnet_actor import GaussianStdNetActor -from omnisafe.utils.model_utils import Activation, InitFunction - - -@dataclass -class NetworkConfig: - """Class for storing network configurations.""" - - obs_dim: int - act_dim: int - hidden_sizes: list - activation: Activation = 'tanh' - weight_initialization_mode: InitFunction = 'kaiming_uniform' - shared: nn.Module = None - output_activation: Optional[Activation] = None - - -@dataclass -class ActionConfig: - """Class for storing action configurations.""" - - scale_action: bool = False - clip_action: bool = False - std_learning: bool = True - std_init: float = 1.0 +from omnisafe.models.actor.gaussian_learning_actor import GaussianLearningActor +from omnisafe.models.actor.gaussian_sac_actor import GaussianSACActor +from omnisafe.models.base import Actor +from omnisafe.typing import Activation, ActorType, InitFunction, OmnisafeSpace # pylint: disable-next=too-few-public-methods class ActorBuilder: """Class for building actor networks.""" - # pylint: disable-next=too-many-arguments def __init__( self, - obs_dim: int, - act_dim: int, - hidden_sizes: list, - activation: Activation = 'tanh', + obs_space: OmnisafeSpace, + act_space: OmnisafeSpace, + hidden_sizes: List[int], + activation: Activation = 'relu', weight_initialization_mode: InitFunction = 'kaiming_uniform', - shared: nn.Module = None, - scale_action: bool = False, - clip_action: bool = False, - output_activation: Optional[Activation] = 'identity', - std_learning: bool = True, - std_init: float = 1.0, ) -> None: """Initialize ActorBuilder.""" - self.network_config = NetworkConfig( - obs_dim=obs_dim, - act_dim=act_dim, - hidden_sizes=hidden_sizes, - activation=activation, - output_activation=output_activation, - weight_initialization_mode=weight_initialization_mode, - shared=shared, - ) - self.action_config = ActionConfig( - scale_action=scale_action, - clip_action=clip_action, - std_learning=std_learning, - std_init=std_init, - ) + self._obs_space = obs_space + self._act_space = act_space + self._weight_initialization_mode = weight_initialization_mode + self._activation = activation + self._hidden_sizes = hidden_sizes # pylint: disable-next=too-many-return-statements - def build_actor( - self, actor_type: str, **kwargs - ) -> Union[ - CategoricalActor, - GaussianStdNetActor, - MLPCholeskyActor, 
- GaussianActor, - NotImplementedError, - ]: + def build_actor(self, actor_type: ActorType) -> Actor: """Build actor network.""" - if actor_type == 'categorical': - return CategoricalActor( - obs_dim=self.network_config.obs_dim, - act_dim=self.network_config.act_dim, - hidden_sizes=self.network_config.hidden_sizes, - activation=self.network_config.activation, - weight_initialization_mode=self.network_config.weight_initialization_mode, - shared=self.network_config.shared, - **kwargs, - ) - if actor_type == 'gaussian_stdnet': - return GaussianStdNetActor( - obs_dim=self.network_config.obs_dim, - act_dim=self.network_config.act_dim, - hidden_sizes=self.network_config.hidden_sizes, - activation=self.network_config.activation, - weight_initialization_mode=self.network_config.weight_initialization_mode, - shared=self.network_config.shared, - scale_action=self.action_config.scale_action, - **kwargs, - ) - if actor_type == 'cholesky': - return MLPCholeskyActor( - obs_dim=self.network_config.obs_dim, - act_dim=self.network_config.act_dim, - hidden_sizes=self.network_config.hidden_sizes, - activation=self.network_config.activation, - weight_initialization_mode=self.network_config.weight_initialization_mode, - **kwargs, + if actor_type == 'gaussian_learning': + return GaussianLearningActor( + self._obs_space, + self._act_space, + self._hidden_sizes, + activation=self._activation, + weight_initialization_mode=self._weight_initialization_mode, ) - if actor_type == 'gaussian': - return GaussianActor( - obs_dim=self.network_config.obs_dim, - act_dim=self.network_config.act_dim, - hidden_sizes=self.network_config.hidden_sizes, - activation=self.network_config.activation, - weight_initialization_mode=self.network_config.weight_initialization_mode, - scale_action=self.action_config.scale_action, - clip_action=self.action_config.clip_action, - output_activation=self.network_config.output_activation, - std_learning=self.action_config.std_learning, - std_init=self.action_config.std_init, - shared=self.network_config.shared, - **kwargs, + if actor_type == 'gaussian_sac': + return GaussianSACActor( + self._obs_space, + self._act_space, + self._hidden_sizes, + activation=self._activation, + weight_initialization_mode=self._weight_initialization_mode, ) raise NotImplementedError( f'Actor type {actor_type} is not implemented! ' - f'Did you mean {difflib.get_close_matches(actor_type, ["categorical", "gaussian_stdnet", "cholesky", "gaussian"], n=1)[0]}?' # pylint: disable=line-too-long + f'Did you mean {difflib.get_close_matches(actor_type, ["gaussian_learning", "gaussian_sac"])[0]}?' ) diff --git a/omnisafe/models/actor/categorical_actor.py b/omnisafe/models/actor/categorical_actor.py deleted file mode 100644 index 0132a128a..000000000 --- a/omnisafe/models/actor/categorical_actor.py +++ /dev/null @@ -1,134 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
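With the rewritten ``ActorBuilder``, actors are now built from gymnasium spaces rather than raw dimensions. A hedged usage sketch (it assumes the base ``Actor`` derives its input/output dimensions from ``Box`` spaces; the sizes and activation here are arbitrary choices):

    import torch
    from gymnasium import spaces

    from omnisafe.models.actor import ActorBuilder

    builder = ActorBuilder(
        obs_space=spaces.Box(low=-1.0, high=1.0, shape=(8,)),
        act_space=spaces.Box(low=-1.0, high=1.0, shape=(2,)),
        hidden_sizes=[64, 64],
        activation='tanh',
    )
    actor = builder.build_actor('gaussian_learning')  # learned, state-independent std
    sac_actor = builder.build_actor('gaussian_sac')   # tanh-squashed, state-dependent std

    obs = torch.randn(4, 8)
    action = actor.predict(obs, deterministic=False)
    log_prob = actor.log_prob(action)  # only valid immediately after predict() or forward()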
-# ============================================================================== -"""Implementation of categorical actor.""" - -from typing import Tuple, Union - -import torch -import torch.nn as nn -from torch.distributions.categorical import Categorical - -from omnisafe.models.base import Actor -from omnisafe.utils.model_utils import Activation, InitFunction, build_mlp_network - - -class CategoricalActor(Actor): - """Implementation of CategoricalActor. - - A Categorical policy that uses a MLP to map observations to actions distributions. - :class:`CategoricalActor` uses a single headed MLP, - to predict the logits of the Categorical distribution. - This class is an inherit class of :class:`Actor`. - You can design your own Categorical policy by inheriting this class or :class:`Actor`. - """ - - # pylint: disable-next=too-many-arguments - def __init__( - self, - obs_dim: int, - act_dim: int, - hidden_sizes: list, - activation: Activation = 'relu', - weight_initialization_mode: InitFunction = 'xavier_uniform', - shared: nn.Module = None, - ) -> None: - """Initialize CategoricalActor. - - Args: - obs_dim (int): Observation dimension. - act_dim (int): Action dimension. - hidden_sizes (list): Hidden layer sizes. - activation (Activation): Activation function. - weight_initialization_mode (InitFunction): Weight initialization mode. - shared (nn.Module): Shared network. - """ - super().__init__( - obs_dim, act_dim, hidden_sizes, activation, weight_initialization_mode, shared=shared - ) - if shared is not None: - action_head = build_mlp_network( - sizes=[hidden_sizes[-1], act_dim], - activation=activation, - weight_initialization_mode=weight_initialization_mode, - ) - self.net = nn.Sequential(shared, action_head) - else: - self.net = build_mlp_network( - [obs_dim] + list(hidden_sizes) + [act_dim], - activation=activation, - weight_initialization_mode=weight_initialization_mode, - ) - - def _distribution(self, obs: torch.Tensor) -> Categorical: - """Get distribution of the action. - - .. note:: - This function is used to get the distribution of the action. - It is used to sample actions and compute log probabilities. - - Args: - obs (torch.Tensor): Observation. - """ - logits = self.net(obs) - return Categorical(logits=logits) - - def predict( - self, - obs: torch.Tensor, - deterministic: bool = False, - need_log_prob: bool = False, - ) -> Union[Tuple[torch.Tensor, torch.Tensor], torch.Tensor]: - r"""Predict deterministic or stochastic action based on observation. - - - ``deterministic`` = ``True`` or ``False`` - - When training the actor, - one important trick to avoid local minimum is to use stochastic actions, - which can simply be achieved by sampling actions from the distribution - (set ``deterministic`` = ``False``). - - When testing the actor, - we want to know the actual action that the agent will take, - so we should use deterministic actions (set ``deterministic`` = ``True``). - - - ``need_log_prob`` = ``True`` or ``False`` - - In some cases, we need to calculate the log probability of the action, - which is used to calculate the loss of the actor. - For example, in the case of continuous action space, - the loss can be calculated as: - - .. math:: - L = -\mathbb{E}_{s \sim p(s)} [\log p(a | s) A^R (s, a)] - - where :math:`p(s)` is the distribution of observation, - :math:`p(a | s)` is the distribution of action, - and :math:`\log p(a | s)` is the log probability of action under the distribution. - - Args: - obs (torch.Tensor): observation. 
- deterministic (bool, optional): whether to predict deterministic action. Defaults to False. - need_log_prob (bool, optional): whether to return log probability of action. Defaults to False. - """ - dist = self._distribution(obs) - if deterministic: - action = dist.probs.argmax(dim=-1) - else: - action = dist.sample() - action = action.unsqueeze(0) - if need_log_prob: - logp_a = dist.log_prob(action) - return action, action, logp_a - return action, action diff --git a/omnisafe/models/actor/cholesky_actor.py b/omnisafe/models/actor/cholesky_actor.py deleted file mode 100644 index 21df20a1f..000000000 --- a/omnisafe/models/actor/cholesky_actor.py +++ /dev/null @@ -1,160 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Implementation of CholeskyActor.""" - -from typing import Tuple, Union - -import torch -import torch.nn.functional as F -from torch import nn -from torch.distributions import MultivariateNormal - -from omnisafe.utils.model_utils import Activation, InitFunction, build_mlp_network, initialize_layer - - -# pylint: disable-next=too-many-instance-attributes -class MLPCholeskyActor(nn.Module): - r"""Implementation of CholeskyActor. - - A Gaussian policy that uses a MLP to map observations to actions distributions. - :class:`MLPCholeskyActor` uses a double headed MLP , - to predict the mean and Cholesky decomposition of the Gaussian distribution. - - .. note:: - The Cholesky decomposition is a lower triangular matrix L with positive diagonal entries, - such that :math:`L^T L = \Sigma`, where :math:`\Sigma` is the covariance matrix of the Gaussian distribution. - The Cholesky decomposition is a convenient way to represent a covariance matrix, - and it is more numerically stable than the standard representation of the covariance matrix. - - This class is an inherit class of :class:`Actor`. - You can design your own Gaussian policy by inheriting this class or :class:`Actor`. - """ - - # pylint: disable-next=too-many-arguments - def __init__( - self, - obs_dim: int, - act_dim: int, - act_max: torch.Tensor, - act_min: torch.Tensor, - hidden_sizes: list, - cov_min: float, - mu_clamp_min: float, - mu_clamp_max: float, - cov_clamp_min: float, - cov_clamp_max: float, - activation: Activation = 'relu', - weight_initialization_mode: InitFunction = 'xavier_uniform', - ) -> None: - """Initialize MLPCholeskyActor. - - Args: - obs_dim (int): observation dimension. - act_dim (int): action dimension. - act_max (torch.Tensor): maximum value of the action. - act_min (torch.Tensor): minimum value of the action. - hidden_sizes (list): list of hidden layer sizes. - activation (str): activation function. - cov_min (float): minimum value of the covariance matrix. - mu_clamp_min (float): minimum value of the mean. - mu_clamp_max (float): maximum value of the mean. - cov_clamp_min (float): minimum value of the covariance matrix. 
- cov_clamp_max (float): maximum value of the covariance matrix. - weight_initialization_mode (str): weight initialization mode. - """ - super().__init__() - pi_sizes = [obs_dim] + hidden_sizes - self.act_limit = act_max - self.act_low = torch.nn.Parameter( - torch.as_tensor(act_min), requires_grad=False - ) # (1, act_dim) - self.act_high = torch.nn.Parameter( - torch.as_tensor(act_max), requires_grad=False - ) # (1, act_dim) - self.act_dim = act_dim - self.obs_dim = obs_dim - self.cov_min = cov_min - self.mu_clamp_min = mu_clamp_min - self.mu_clamp_max = mu_clamp_max - self.cov_clamp_min = cov_clamp_min - self.cov_clamp_max = cov_clamp_max - - self.net = build_mlp_network(pi_sizes, activation, activation) - self.mu_layer = nn.Linear(hidden_sizes[-1], act_dim) - self.cholesky_layer = nn.Linear(hidden_sizes[-1], (self.act_dim * (self.act_dim + 1)) // 2) - initialize_layer(weight_initialization_mode, self.mu_layer) - # initialize_layer(weight_initialization_mode,self.cholesky_layer) - nn.init.constant_(self.mu_layer.bias, 0.0) - nn.init.constant_(self.cholesky_layer.bias, 0.0) - - def predict( - self, obs: torch.Tensor, deterministic: bool = False, need_log_prob: bool = False - ) -> Union[Tuple[torch.Tensor, torch.Tensor], torch.Tensor]: - r"""Predict action given observation. - - .. note:: - - Compute the mean and Cholesky decomposition of the Gaussian distribution. - - Compute logprob from Gaussian, and then apply correction for Tanh squashing. - For details of the correction formula, - please refer to the original `SAC paper `_. - - Get action from Multi-variate Gaussian distribution. - - Args: - obs (torch.Tensor): Observation. - deterministic (bool): Whether to use deterministic policy. - """ - if len(obs.shape) == 1: - obs = torch.unsqueeze(obs, dim=0) - obs_length = obs.size(0) - - net_out = self.net(obs) - - clamped_mu = torch.clamp(self.mu_layer(net_out), self.mu_clamp_min, self.mu_clamp_max) - mean = torch.sigmoid(clamped_mu) # (B, act_dim) - - mean = self.act_low + (self.act_high - self.act_low) * mean - cholesky_vector = torch.clamp( - self.cholesky_layer(net_out), self.cov_clamp_min, self.cov_clamp_max - ) - cholesky_diag_index = torch.arange(self.act_dim, dtype=torch.long) + 1 - cholesky_diag_index = ( - torch.div(cholesky_diag_index * (cholesky_diag_index + 1), 2, rounding_mode='floor') - 1 - ) - cholesky_vector[:, cholesky_diag_index] = ( - F.softplus(cholesky_vector[:, cholesky_diag_index]) + self.cov_min - ) - tril_indices = torch.tril_indices(row=self.act_dim, col=self.act_dim, offset=0) - cholesky = torch.zeros(size=(obs_length, self.act_dim, self.act_dim), dtype=torch.float32) - cholesky[:, tril_indices[0], tril_indices[1]] = cholesky_vector - pi_distribution = MultivariateNormal(mean.to(torch.float32), scale_tril=cholesky) - - if deterministic: - pi_action = mean - else: - pi_action = pi_distribution.rsample() - - pi_action = torch.tanh(pi_action) - pi_action = self.act_limit * pi_action - - if need_log_prob: - return ( - pi_action.to(torch.float32), - pi_action.to(torch.float32), - cholesky.to(torch.float32), - ) - return pi_action.to(torch.float32), pi_action.to(torch.float32) - - def forward(self, obs, deterministic=False): - """Forward.""" diff --git a/omnisafe/models/actor/gaussian_actor.py b/omnisafe/models/actor/gaussian_actor.py index a969320da..1baff6fc7 100644 --- a/omnisafe/models/actor/gaussian_actor.py +++ b/omnisafe/models/actor/gaussian_actor.py @@ -12,222 +12,29 @@ # See the License for the specific language governing permissions and # limitations under 
the License. # ============================================================================== -"""Implementation of GaussianStdNetActor.""" +"""This module contains some base normal distribution agent for the models.""" -from typing import Optional, Tuple, Union - -import torch -import torch.nn as nn -from torch.distributions.normal import Normal +from abc import ABC, abstractmethod from omnisafe.models.base import Actor -from omnisafe.utils.model_utils import Activation, InitFunction, build_mlp_network - - -# pylint: disable-next=too-many-instance-attributes -class GaussianActor(Actor): - """Implementation of GaussianStdNetActor.""" - - # pylint: disable-next=too-many-arguments, too-many-locals - def __init__( - self, - obs_dim: int, - act_dim: int, - act_max: torch.Tensor, - act_min: torch.Tensor, - hidden_sizes: list, - activation: Activation = 'tanh', - output_activation: Activation = 'identity', - weight_initialization_mode: InitFunction = 'kaiming_uniform', - shared: nn.Module = None, - scale_action: bool = False, - clip_action: bool = False, - std_learning: bool = True, - std_init: float = 1.0, - std_end: float = 1.0, - std_annealing: bool = False, - ) -> None: - """Initialize GaussianStdNetActor. - - Args: - obs_dim (int): Observation dimension. - act_dim (int): Action dimension. - act_max (torch.Tensor): Maximum value of the action. - act_min (torch.Tensor): Minimum value of the action. - hidden_sizes (list): List of hidden layer sizes. - activation (Activation): Activation function. - output_activation (Activation): Activation function for the output layer. - weight_initialization_mode (InitFunction): Weight initialization mode. - shared (nn.Module): Shared module. - scale_action (bool): Whether to scale the action. - clip_action (bool): Whether to clip the action. - std_learning (bool): Whether to learn the standard deviation. - std_init (float): Initial value of the standard deviation. - std_end (float): Final value of the standard deviation. - std_annealing (bool): Whether to anneal the standard deviation. - """ - super().__init__( - obs_dim, act_dim, hidden_sizes, activation, weight_initialization_mode, shared - ) - self.act_min = act_min - self.act_max = act_max - self.scale_action = scale_action - self.clip_action = clip_action - self.std_init = std_init - self._std = std_init - self.std_end = std_end - self.std_annealing = std_annealing - assert ( - self.act_min.size() == self.act_max.size() - ), f'The size of act_min {self.act_min} and act_max {self.act_max} should be the same.' - if std_annealing: - assert ( - std_init > std_end - ), 'If std_annealing is True, std_init should be greater than std_end.' - assert not std_learning, 'If std_annealing is True, std_learning should be False.' - if std_learning: - assert not std_annealing, 'If std_learning is True, std_annealing should be False.' - - if shared is not None: - mean_head = build_mlp_network( - sizes=[hidden_sizes[-1], act_dim], - activation=activation, - weight_initialization_mode=weight_initialization_mode, - ) - self.net = nn.Sequential(shared, mean_head) - else: - self.net = build_mlp_network( - [obs_dim] + list(hidden_sizes) + [act_dim], - activation=activation, - output_activation=output_activation, - weight_initialization_mode=weight_initialization_mode, - ) - self.logstd_layer = nn.Parameter(torch.zeros(1, act_dim), requires_grad=std_learning) - - def _distribution(self, obs: torch.Tensor) -> Normal: - """Get distribution of the action. - - .. 
note:: - The term ``log_std`` is used to control the noise level of the policy, - which is a trainable parameter. - To avoid the policy to be too explorative, - we use ``torch.clamp`` to limit the range of ``log_std``. - - Args: - obs (torch.Tensor): Observation. - """ - mean, std = self.get_mean_std(obs) - return Normal(mean, std) - - def get_mean_std(self, obs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - """Get mean and std of the action. - - Args: - obs (torch.Tensor): Observation. - - """ - mean = self.net(obs) - if len(mean.size()) == 1: - mean = mean.view(1, -1) - log_std = self.logstd_layer.expand_as(mean) - std = torch.exp(log_std) * self._std - - return mean, std - - def get_log_prob(self, obs: torch.Tensor, action: torch.Tensor) -> torch.Tensor: - """Get log probability of the action. - - Args: - obs (torch.Tensor): Observation. - action (torch.Tensor): Action. - """ - dist = self._distribution(obs) - return dist.log_prob(action).sum(axis=-1) - - def predict( - self, - obs: torch.Tensor, - deterministic: bool = False, - need_log_prob: bool = False, - ) -> Union[Tuple[torch.Tensor, torch.Tensor], torch.Tensor]: - r"""Predict action given observation. - - .. note:: - The action is scaled to the action space by: - - .. math:: - a = a_{min} + \frac{a + 1}{2} \times (a_{max} - a_{min}) - - where :math:`a` is the action predicted by the policy, - :math:`a_{min}` and :math:`a_{max}` are the minimum and maximum values of the action space. - After scaling, the action is clipped to the range of :math:`[a_{min}, a_{max}]`. - - Args: - obs (torch.Tensor): Observation. - deterministic (bool): Whether to use deterministic policy. - """ - mean, std = self.get_mean_std(obs) - dist = Normal(mean, std) - if deterministic: - out = mean.to(torch.float64) - else: - out = dist.rsample().to(torch.float64) - - if self.scale_action: - # If the action scale is inf, stop scaling the action - assert ( - not torch.isinf(self.act_min).any() and not torch.isinf(self.act_max).any() - ), 'The action scale is inf, stop scaling the action.' - self.act_min = self.act_min.to(mean.device) - self.act_max = self.act_max.to(mean.device) - action = self.act_min + (out + 1) / 2 * (self.act_max - self.act_min) - else: - action = out - - if self.clip_action: - action = torch.clamp(action, self.act_min, self.act_max) - - if need_log_prob: - log_prob = dist.log_prob(out).sum(axis=-1) - return out.to(torch.float32), action.to(torch.float32), log_prob.to(torch.float32) - return out.to(torch.float32), action.to(torch.float32) - - def forward( - self, - obs: torch.Tensor, - act: Optional[torch.Tensor] = None, - ) -> Union[Tuple[torch.Tensor, torch.Tensor], torch.Tensor]: - """Forward function for actor. - - .. note:: - This forward function has two modes: - - If ``act`` is not None, it will return the distribution and the log probability of action. - - If ``act`` is None, it will return the distribution. - Args: - obs (torch.Tensor): observation. - act (torch.Tensor, optional): action. Defaults to None. - """ - dist = self._distribution(obs) - if act is not None: - log_prob = dist.log_prob(act).sum(axis=-1) - return dist, log_prob - return dist +class GaussianActor(Actor, ABC): + """A abstract class for normal distribution actor. - def get_distribution(self, obs: torch.Tensor) -> Normal: - """Get distribution of the action. - Args: - obs (torch.Tensor): Observation. - """ - return self._distribution(obs) + AN NormalActor inherits from Actor and use Normal distribution to approximate + the policy function. 
- def set_std(self, proportion: float) -> float: - """To support annealing exploration noise. + .. note:: + You can use this class to implement your own actor by inheriting it. + """ - Proportion is annealing from 1. to 0 over course of training. + @property + @abstractmethod + def std(self) -> float: + """Get the standard deviation of the normal distribution.""" - Args: - proportion (float): proportion of annealing. - """ - self._std = self.std_init * proportion + self.std_end * (1 - proportion) + @std.setter + @abstractmethod + def std(self, std: float) -> None: + """Set the standard deviation of the normal distribution.""" diff --git a/omnisafe/models/actor/gaussian_learning_actor.py b/omnisafe/models/actor/gaussian_learning_actor.py new file mode 100644 index 000000000..cec6b8354 --- /dev/null +++ b/omnisafe/models/actor/gaussian_learning_actor.py @@ -0,0 +1,87 @@ +# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementation of GaussianStdNetActor.""" + +from typing import List + +import torch +import torch.nn as nn +from torch.distributions import Distribution, Normal + +from omnisafe.models.actor.gaussian_actor import GaussianActor +from omnisafe.typing import Activation, InitFunction, OmnisafeSpace +from omnisafe.utils.model import build_mlp_network + + +# pylint: disable-next=too-many-instance-attributes +class GaussianLearningActor(GaussianActor): + """Implementation of GaussianLearningActor.""" + + def __init__( + self, + obs_space: OmnisafeSpace, + act_space: OmnisafeSpace, + hidden_sizes: List[int], + activation: Activation = 'relu', + weight_initialization_mode: InitFunction = 'kaiming_uniform', + ) -> None: + """Initialize GaussianLearningActor. + + Args: + obs_space (OmnisafeSpace): Observation space. + act_space (OmnisafeSpace): Action space. + hidden_sizes (list): List of hidden layer sizes. + activation (Activation): Activation function. + weight_initialization_mode (InitFunction): Weight initialization mode. + shared (nn.Module): Shared module. 
+ """ + super().__init__(obs_space, act_space, hidden_sizes, activation, weight_initialization_mode) + self.mean = build_mlp_network( + sizes=[self._obs_dim, *self._hidden_sizes, self._act_dim], + activation=activation, + weight_initialization_mode=weight_initialization_mode, + ) + self.log_std = nn.Parameter(torch.zeros(self._act_dim), requires_grad=True) + + def _distribution(self, obs: torch.Tensor) -> Distribution: + mean = self.mean(obs) + std = torch.exp(self.log_std) + return Normal(mean, std) + + def predict(self, obs: torch.Tensor, deterministic: bool = False) -> torch.Tensor: + self._current_dist = self._distribution(obs) + self._after_inference = True + if deterministic: + return self._current_dist.mean + return self._current_dist.rsample() + + def forward(self, obs: torch.Tensor) -> Distribution: + self._current_dist = self._distribution(obs) + self._after_inference = True + return self._current_dist + + def log_prob(self, act: torch.Tensor) -> torch.Tensor: + assert self._after_inference, 'log_prob() should be called after predict() or forward()' + self._after_inference = False + return self._current_dist.log_prob(act).sum(axis=-1) + + @property + def std(self) -> float: + return torch.exp(self.log_std).mean().item() + + @std.setter + def std(self, std: float) -> None: + device = self.log_std.device + self.log_std.data.fill_(torch.log(torch.tensor(std, device=device))) diff --git a/omnisafe/models/actor/gaussian_sac_actor.py b/omnisafe/models/actor/gaussian_sac_actor.py new file mode 100644 index 000000000..00bcb346f --- /dev/null +++ b/omnisafe/models/actor/gaussian_sac_actor.py @@ -0,0 +1,76 @@ +# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
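``GaussianLearningActor`` caches the distribution produced by ``predict()``/``forward()`` and exposes a writable ``std`` property for exploration control. A small sketch of that contract (illustrative only; it assumes the base ``Actor`` accepts ``Box`` spaces):

    import torch
    from gymnasium import spaces

    from omnisafe.models.actor.gaussian_learning_actor import GaussianLearningActor

    actor = GaussianLearningActor(
        obs_space=spaces.Box(low=-1.0, high=1.0, shape=(8,)),
        act_space=spaces.Box(low=-1.0, high=1.0, shape=(2,)),
        hidden_sizes=[64, 64],
    )

    dist = actor(torch.randn(4, 8))           # forward() caches the current Normal
    log_prob = actor.log_prob(dist.sample())  # must directly follow forward()/predict()

    actor.std = 0.5                           # rewrites log_std in place
    assert abs(actor.std - 0.5) < 1e-6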
+# ==============================================================================
+"""Implementation of GaussianSACActor."""
+
+from typing import List
+
+import torch
+from torch.distributions import Distribution
+
+from omnisafe.models.base import Actor
+from omnisafe.typing import Activation, InitFunction, OmnisafeSpace
+from omnisafe.utils.math import TanhNormal
+from omnisafe.utils.model import build_mlp_network
+
+
+class GaussianSACActor(Actor):
+    """Implementation of GaussianSACActor."""
+
+    def __init__(
+        self,
+        obs_space: OmnisafeSpace,
+        act_space: OmnisafeSpace,
+        hidden_sizes: List[int],
+        activation: Activation = 'relu',
+        weight_initialization_mode: InitFunction = 'kaiming_uniform',
+    ) -> None:
+        super().__init__(obs_space, act_space, hidden_sizes, activation, weight_initialization_mode)
+        self.net = build_mlp_network(
+            sizes=[self._obs_dim, *self._hidden_sizes, self._act_dim * 2],
+            activation=activation,
+            weight_initialization_mode=weight_initialization_mode,
+        )
+
+    def _distribution(self, obs: torch.Tensor) -> Distribution:
+        mean, log_std = self.net(obs).chunk(2, dim=-1)
+        log_std = torch.clamp(log_std, min=-20, max=2)
+        std = log_std.exp()
+        return TanhNormal(mean, std)
+
+    def predict(self, obs: torch.Tensor, deterministic: bool = False) -> torch.Tensor:
+        self._current_dist = self._distribution(obs)
+        self._after_inference = True
+        if deterministic:
+            return self._current_dist.mean
+        return self._current_dist.rsample()
+
+    def forward(self, obs: torch.Tensor) -> Distribution:
+        self._current_dist = self._distribution(obs)
+        self._after_inference = True
+        return self._current_dist
+
+    def log_prob(self, act: torch.Tensor) -> torch.Tensor:
+        assert self._after_inference, 'log_prob() should be called after predict() or forward()'
+        self._after_inference = False
+        return self._current_dist.log_prob(act).sum(axis=-1)
+
+    @property
+    def std(self) -> float:
+        """Get the standard deviation of the normal distribution."""
+        return self._current_dist.stddev.mean().item()
+
+    @std.setter
+    def std(self, std: float) -> None:
+        raise NotImplementedError('GaussianSACActor does not support setting std.')
diff --git a/omnisafe/models/actor/gaussian_stdnet_actor.py b/omnisafe/models/actor/gaussian_stdnet_actor.py
deleted file mode 100644
index 7b7189f6a..000000000
--- a/omnisafe/models/actor/gaussian_stdnet_actor.py
+++ /dev/null
@@ -1,166 +0,0 @@
-# Copyright 2022-2023 OmniSafe Team. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================== -"""Implementation of GaussianStdNetActor.""" - -import torch -import torch.nn as nn -from torch.distributions.normal import Normal - -from omnisafe.models.base import Actor -from omnisafe.utils.model_utils import Activation, build_mlp_network - - -class GaussianStdNetActor(Actor): - """Implementation of GaussianStdNetActor.""" - - # pylint: disable-next=too-many-arguments - def __init__( - self, - obs_dim, - act_dim, - act_max: torch.Tensor, - act_min: torch.Tensor, - hidden_sizes: list, - activation: Activation = 'relu', - output_activation: Activation = 'tanh', - weight_initialization_mode: Activation = 'kaiming_uniform', - shared=None, - scale_action=False, - clip_action: bool = False, - ): - """Initialize GaussianStdNetActor.""" - super().__init__( - obs_dim, act_dim, hidden_sizes, activation, weight_initialization_mode, shared - ) - self.act_min = act_min - self.act_max = act_max - self.scale_action = scale_action - self.clip_action = clip_action - - if shared is not None: - mean_head = build_mlp_network( - sizes=[hidden_sizes[-1], act_dim], - activation=activation, - weight_initialization_mode=weight_initialization_mode, - ) - std_head = build_mlp_network( - sizes=[hidden_sizes[-1], act_dim], - activation=activation, - weight_initialization_mode=weight_initialization_mode, - ) - self.mean = nn.Sequential(shared, mean_head) - self.log_std = nn.Sequential(shared, std_head) - else: - net = build_mlp_network( - [obs_dim] + list(hidden_sizes), - activation=activation, - output_activation=output_activation, - weight_initialization_mode=weight_initialization_mode, - ) - mean_head = build_mlp_network( - sizes=[hidden_sizes[-1], act_dim], - activation=activation, - output_activation=output_activation, - weight_initialization_mode=weight_initialization_mode, - ) - std_head = build_mlp_network( - sizes=[hidden_sizes[-1], act_dim], - activation=activation, - output_activation=output_activation, - weight_initialization_mode=weight_initialization_mode, - ) - self.mean = nn.Sequential(net, mean_head) - self.log_std = nn.Sequential(net, std_head) - self.net = nn.ModuleList([self.mean, self.log_std]) - - def _distribution(self, obs): - """Get distribution of the action. - - .. note:: - The term ``log_std`` is used to control the noise level of the policy, - which is a trainable parameter. - To avoid the policy to be too explorative, - we use ``torch.clamp`` to limit the range of ``log_std``. - - Args: - obs (torch.Tensor): Observation. - """ - mean = self.mean(obs) - log_std = self.log_std(obs) - log_std = torch.clamp(log_std, -20, 2) - std = torch.exp(log_std) - return Normal(mean, std) - - def predict(self, obs, deterministic=False, need_log_prob=False): - r"""Predict action given observation. - - .. note:: - The action is scaled to the action space by: - - .. math:: - a = a_{min} + \frac{a + 1}{2} \times (a_{max} - a_{min}) - - where :math:`a` is the action predicted by the policy, - :math:`a_{min}` and :math:`a_{max}` are the minimum and maximum values of the action space. - After scaling, the action is clipped to the range of :math:`[a_{min}, a_{max}]`. - - Args: - obs (torch.Tensor): Observation. - deterministic (bool): Whether to use deterministic policy. 
- """ - dist = self._distribution(obs) - if deterministic: - out = dist.mean - else: - out = dist.rsample() - - if self.scale_action: - # If the action scale is inf, stop scaling the action - assert ( - not torch.isinf(self.act_min).any() and not torch.isinf(self.act_max).any() - ), 'The action scale is inf, stop scaling the action.' - self.act_min = self.act_min.to(out.device) - self.act_max = self.act_max.to(out.device) - action = self.act_min + (out + 1) / 2 * (self.act_max - self.act_min) - else: - action = out - - if self.clip_action: - action = torch.clamp(action, self.act_min, self.act_max) - - if need_log_prob: - log_prob = dist.log_prob(out).sum(axis=-1) - log_prob -= torch.log(1.00001 - torch.tanh(out) ** 2).sum(axis=-1) - return out.to(torch.float32), action.to(torch.float32), log_prob.to(torch.float32) - return out.to(torch.float32), action.to(torch.float32) - - def forward(self, obs, act=None): - """Forward function for actor. - - .. note:: - This forward function has two modes: - - - If ``act`` is not None, it will return the distribution and the log probability of action. - - If ``act`` is None, it will return the distribution. - - Args: - obs (torch.Tensor): observation. - act (torch.Tensor, optional): action. Defaults to None. - """ - dist = self._distribution(obs) - if act is not None: - log_prob = dist.log_prob(act).sum(axis=-1) - return dist, log_prob - return dist diff --git a/omnisafe/models/actor_critic.py b/omnisafe/models/actor_critic.py deleted file mode 100644 index 60108aead..000000000 --- a/omnisafe/models/actor_critic.py +++ /dev/null @@ -1,164 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Implementation of ActorCritic.""" - -from typing import NamedTuple, Tuple - -import numpy as np -import torch -import torch.nn as nn -from gymnasium.spaces import Box, Discrete - -from omnisafe.models.actor import ActorBuilder -from omnisafe.models.critic import CriticBuilder -from omnisafe.utils.model_utils import build_mlp_network - - -# pylint: disable-next=too-many-instance-attributes -class ActorCritic(nn.Module): - """Class for ActorCritic. - - In ``omnisafe``, we combine the actor and critic into one this class. - - .. list-table:: - - * - Model - - Description - - Function - * - Actor - - The policy network, input is observation, output is action. - Choose the actor from the following options: - :class:`MLPActor`, :class:`CategoricalActor`, :class:`GaussianAnnealingActor`, - :class:`GaussianLearningActor`, :class:`GaussianStdNetActor`, :class:`MLPCholeskyActor`. - - Choose the action based on the observation. - * - Value Critic - - The value network, input is observation, output is reward value. - Choose the critic from the following options: - :class:`QCritic`, :class:`VCritic`. - - Estimate the reward value of the observation. 
- """ - - # pylint: disable-next=too-many-arguments - def __init__( - self, - observation_space: Box, - action_space: Box, - model_cfgs: NamedTuple, - ) -> None: - """Initialize ActorCritic - - .. note:: - Instead of creating the actor or critic directly, we use the builder to create them. - The advantage of this is that, - each type of critic has a uniform way of passing parameters. - This makes it easy for users to use existing critics, - and also facilitates the extension of new critic types. - - Args: - observation_space (Box): Observation space. - action_space (Box): Action space. - standardized_obs (bool): Whether to standardize the observation. - scale_rewards (bool): Whether to scale the rewards. - model_cfgs (NamedTuple): Model configurations. - """ - super().__init__() - - self.obs_shape = observation_space.shape - self.obs_dim = observation_space.shape[-1] - - self.act_space_type = 'discrete' if isinstance(action_space, Discrete) else 'continuous' - self.act_dim = action_space.shape[-1] if isinstance(action_space, Box) else action_space.n - - self.model_cfgs = model_cfgs - self.ac_kwargs = model_cfgs.ac_kwargs - - # use for shared weights - layer_units = [self.obs_dim] + self.ac_kwargs.pi.hidden_sizes - activation = self.ac_kwargs.pi.activation - if model_cfgs.shared_weights: - self.shared = build_mlp_network( - layer_units, - activation=activation, - weight_initialization_mode=model_cfgs.weight_initialization_mode, - output_activation=activation, - ) - else: - self.shared = None - - # build actor - actor_builder = ActorBuilder( - obs_dim=self.obs_dim, - act_dim=self.act_dim, - weight_initialization_mode=model_cfgs.weight_initialization_mode, - shared=self.shared, - **self.ac_kwargs.pi, - ) - if self.act_space_type == 'discrete': - self.actor = actor_builder.build_actor('categorical') - else: - act_max = torch.as_tensor(action_space.high) - act_min = torch.as_tensor(action_space.low) - self.actor = actor_builder.build_actor( - model_cfgs.actor_type, act_max=act_max, act_min=act_min - ) - - # build critic - critic_builder = CriticBuilder( - obs_dim=self.obs_dim, - act_dim=self.act_dim, - hidden_sizes=self.ac_kwargs.val.hidden_sizes, - activation=self.ac_kwargs.val.activation, - weight_initialization_mode=model_cfgs.weight_initialization_mode, - shared=self.shared, - ) - self.reward_critic = critic_builder.build_critic('v') - - def forward(self, obs: torch.Tensor) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: - """Forward pass of the actor-critic model""" - return self.step(obs) - - def step( - self, obs: torch.Tensor, deterministic: bool = False - ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: - """Step function of the actor-critic model - - Input observation, output value (from :class:`Critic`) action, - and its log probability (from :class`Actor`). - - .. note:: - The observation is standardized by the running mean and standard deviation. - - Args: - obs (torch.Tensor): Observation. - deterministic (bool, optional): Whether to use deterministic action. - """ - with torch.no_grad(): - value = self.reward_critic(obs) - raw_action, action, logp_a = self.actor.predict( - obs, deterministic=deterministic, need_log_prob=True - ) - - return raw_action, action, value, logp_a - - def anneal_exploration(self, frac: float) -> None: - """Update internals of actors - - Updates exploration parameters for Gaussian actors update log_std - - Args: - frac: progress of epochs. 1.0 is the end of training. 
- """ - if hasattr(self.actor, 'set_std'): - self.actor.set_std(1 - frac) diff --git a/omnisafe/models/actor_critic/actor_critic.py b/omnisafe/models/actor_critic/actor_critic.py new file mode 100644 index 000000000..f920e27a9 --- /dev/null +++ b/omnisafe/models/actor_critic/actor_critic.py @@ -0,0 +1,154 @@ +# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementation of ActorCritic.""" + +from typing import List, Tuple + +import torch +from torch import nn, optim +from torch.optim.lr_scheduler import ConstantLR, LinearLR, _LRScheduler + +from omnisafe.models.actor.actor_builder import ActorBuilder +from omnisafe.models.actor.gaussian_learning_actor import GaussianLearningActor +from omnisafe.models.critic.critic_builder import CriticBuilder +from omnisafe.typing import OmnisafeSpace +from omnisafe.utils.config import ModelConfig +from omnisafe.utils.schedule import PiecewiseSchedule, Schedule + + +class ActorCritic(nn.Module): + """Class for ActorCritic. + + In ``omnisafe``, we combine the actor and critic into one this class. + + .. list-table:: + + * - Model + - Description + - Function + * - Actor + - The policy network, input is observation, output is action. + Choose the actor from the following options: + :class:`MLPActor`, :class:`CategoricalActor`, :class:`GaussianAnnealingActor`, + :class:`GaussianLearningActor`, :class:`GaussianStdNetActor`, :class:`MLPCholeskyActor`. + - Choose the action based on the observation. + * - Value Critic + - The value network, input is observation, output is reward value. + Choose the critic from the following options: + :class:`QCritic`, :class:`VCritic`. + - Estimate the reward value of the observation. 
+ """ + + # pylint: disable-next=too-many-arguments + def __init__( + self, + obs_space: OmnisafeSpace, + act_space: OmnisafeSpace, + model_cfgs: ModelConfig, + epochs: int, + ) -> None: + """Initialize ActorCritic.""" + super().__init__() + self.actor = ActorBuilder( + obs_space=obs_space, + act_space=act_space, + hidden_sizes=model_cfgs.actor.hidden_sizes, + activation=model_cfgs.actor.activation, + weight_initialization_mode=model_cfgs.weight_initialization_mode, + ).build_actor(actor_type=model_cfgs.actor_type) + self.reward_critic = CriticBuilder( + obs_space=obs_space, + act_space=act_space, + hidden_sizes=model_cfgs.critic.hidden_sizes, + activation=model_cfgs.critic.activation, + weight_initialization_mode=model_cfgs.weight_initialization_mode, + num_critics=1, + use_obs_encoder=False, + ).build_critic(critic_type='v') + self.add_module('actor', self.actor) + self.add_module('reward_critic', self.reward_critic) + + self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=model_cfgs.actor.lr) + self.reward_critic_optimizer = optim.Adam( + self.reward_critic.parameters(), lr=model_cfgs.critic.lr + ) + + self.actor_scheduler: _LRScheduler + if model_cfgs.linear_lr_decay: + self.actor_scheduler = LinearLR( + self.actor_optimizer, + start_factor=1.0, + end_factor=0.0, + total_iters=epochs, + verbose=True, + ) + else: + self.actor_scheduler = ConstantLR( + self.actor_optimizer, factor=1.0, total_iters=epochs, verbose=True + ) + + self.std_schedule: Schedule + + def step(self, obs: torch.Tensor, deterministic: bool = False) -> Tuple[torch.Tensor, ...]: + """Choose the action based on the observation. used in rollout without gradient. + + Args: + obs: The observation. + deterministic: Whether to use deterministic action. default: False. + + Returns: + The action, value_r, and log_prob. + """ + with torch.no_grad(): + value_r = self.reward_critic(obs) + act = self.actor.predict(obs, deterministic=deterministic) + log_prob = self.actor.log_prob(act) + return act, value_r[0], log_prob + + def forward(self, obs: torch.Tensor, deterministic: bool = False) -> Tuple[torch.Tensor, ...]: + """Choose the action based on the observation. used in training with gradient. + + Args: + obs: The observation. + deterministic: Whether to use deterministic action. default: False. + + Returns: + The action, value_r, and log_prob. + """ + return self.step(obs, deterministic=deterministic) + + def set_annealing(self, epochs: List[float], std: List[float]) -> None: + """Set the annealing mode for the actor. + + Args: + annealing: Whether to use annealing mode. + """ + assert isinstance( + self.actor, GaussianLearningActor + ), 'Only GaussianLearningActor support annealing.' + self.std_schedule = PiecewiseSchedule( + endpoints=list(zip(epochs, std)), outside_value=std[-1] + ) + + def annealing(self, epoch: int) -> None: + """Set the annealing mode for the actor. + + Args: + epoch: The current epoch. + """ + assert isinstance( + self.actor, GaussianLearningActor + ), 'Only GaussianLearningActor support annealing.' + self.actor.std = self.std_schedule.value(epoch) diff --git a/omnisafe/models/actor_critic/constraint_actor_critic.py b/omnisafe/models/actor_critic/constraint_actor_critic.py new file mode 100644 index 000000000..f69cd6d5e --- /dev/null +++ b/omnisafe/models/actor_critic/constraint_actor_critic.py @@ -0,0 +1,117 @@ +# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementation of ConstraintActorCritic.""" + +from typing import Tuple + +import torch +from torch import optim + +from omnisafe.models.actor_critic.actor_critic import ActorCritic +from omnisafe.models.critic.critic_builder import CriticBuilder +from omnisafe.typing import OmnisafeSpace +from omnisafe.utils.config import ModelConfig + + +class ConstraintActorCritic(ActorCritic): + """ConstraintActorCritic is a wrapper around ActorCritic that adds a cost critic to the model. + + In ``omnisafe``, we combine the actor and critic into one this class. + + .. list-table:: + + * - Model + - Description + - Function + * - Actor + - The policy network, input is observation, output is action. + Choose the actor from the following options: + :class:`MLPActor`, :class:`CategoricalActor`, :class:`GaussianAnnealingActor`, + :class:`GaussianLearningActor`, :class:`GaussianStdNetActor`, :class:`MLPCholeskyActor`. + - Choose the action based on the observation. + * - Reward Critic + - The value network, input is observation, + output is reward value. + Choose the critic from the following options: + :class:`QCritic`, :class:`VCritic`. + - Estimate the reward value of the observation. + * - Cost Critic + - The value network, input is observation, + output is cost value. + Choose the critic from the following options: + :class:`QCritic`, :class:`VCritic`. + - Estimate the cost value of the observation. + """ + + def __init__( + self, + obs_space: OmnisafeSpace, + act_space: OmnisafeSpace, + model_cfgs: ModelConfig, + epochs: int, + ) -> None: + """Initialize ConstraintActorCritic.""" + super().__init__(obs_space, act_space, model_cfgs, epochs) + self.cost_critic = CriticBuilder( + obs_space=obs_space, + act_space=act_space, + hidden_sizes=model_cfgs.critic.hidden_sizes, + activation=model_cfgs.critic.activation, + weight_initialization_mode=model_cfgs.weight_initialization_mode, + num_critics=1, + use_obs_encoder=False, + ).build_critic('v') + self.add_module('cost_critic', self.cost_critic) + + self.cost_critic_optimizer = optim.Adam( + self.cost_critic.parameters(), lr=model_cfgs.critic.lr + ) + + def step(self, obs: torch.Tensor, deterministic: bool = False) -> Tuple[torch.Tensor, ...]: + """Choose action based on observation. + + Args: + obs (torch.Tensor): Observation. + deterministic (bool): Whether to use deterministic policy. + + Returns: + action (torch.Tensor): Action. + value_r (torch.Tensor): Reward value. + value_c (torch.Tensor): Cost value. + log_prob (torch.Tensor): Log probability of action. 
+ """ + with torch.no_grad(): + value_r = self.reward_critic(obs) + value_c = self.cost_critic(obs) + + action = self.actor.predict(obs, deterministic=deterministic) + log_prob = self.actor.log_prob(action) + + return action, value_r[0], value_c[0], log_prob + + def forward(self, obs: torch.Tensor, deterministic: bool = False) -> Tuple[torch.Tensor, ...]: + """Choose action based on observation. + + Args: + obs (torch.Tensor): Observation. + deterministic (bool): Whether to use deterministic policy. + + Returns: + action (torch.Tensor): Action. + value_r (torch.Tensor): Reward value. + value_c (torch.Tensor): Cost value. + log_prob (torch.Tensor): Log probability of action. + """ + return self.step(obs, deterministic=deterministic) diff --git a/omnisafe/models/actor_q_critic.py b/omnisafe/models/actor_q_critic.py deleted file mode 100644 index 92995a7e0..000000000 --- a/omnisafe/models/actor_q_critic.py +++ /dev/null @@ -1,178 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Implementation of ActorQCritic.""" - -from typing import NamedTuple, Tuple - -import numpy as np -import torch -import torch.nn as nn -from gymnasium.spaces import Box, Discrete - -from omnisafe.models.actor import ActorBuilder -from omnisafe.models.critic.q_critic import QCritic -from omnisafe.utils.model_utils import build_mlp_network - - -# pylint: disable-next=too-many-instance-attributes -class ActorQCritic(nn.Module): - """Class for ActorCritic. - - In ``omnisafe``, we combine the actor and critic into one this class. - - .. list-table:: - - * - Model - - Description - - Function - * - Actor - - The policy network, input is observation, output is action. - Choose the actor from the following options: - :class:`MLPActor`, :class:`CategoricalActor`, :class:`GaussianAnnealingActor`, - :class:`GaussianLearningActor`, :class:`GaussianStdNetActor`, :class:`MLPCholeskyActor`. - - Choose the action based on the observation. - * - Value Q Critic - - The value network, input is observation-action pair, - output is reward value. - Choose the critic from the following options: - :class:`QCritic`, :class:`VCritic`. - - Estimate the reward value of the observation. - """ - - # pylint: disable-next=too-many-arguments - def __init__( - self, - observation_space: Box, - action_space: Box, - model_cfgs: NamedTuple, - ) -> None: - """Initialize ActorQCritic - - .. note:: - Instead of creating the actor or critic directly, we use the builder to create them. - The advantage of this is that, - each type of critic has a uniform way of passing parameters. - This makes it easy for users to use existing critics, - and also facilitates the extension of new critic types. 
- - Args: - observation_space: observation space - action_space: action space - standardized_obs: whether to standardize observation - shared_weights: whether to share weights between actor and critic - model_cfgs: model configurations - weight_initialization_mode: weight initialization mode - device: device, cpu or cuda - """ - super().__init__() - - self.obs_shape = observation_space.shape - self.act_dim = action_space.shape[-1] if isinstance(action_space, Box) else action_space.n - self.ac_kwargs = model_cfgs.ac_kwargs - # build policy and value functions - self.act_space_type = 'discrete' if isinstance(action_space, Discrete) else 'continuous' - self.obs_dim = observation_space.shape[0] - - # Use for shared weights - layer_units = [self.obs_dim] + model_cfgs.ac_kwargs.pi.hidden_sizes - - activation = model_cfgs.ac_kwargs.pi.activation - if model_cfgs.shared_weights: - shared = build_mlp_network( - layer_units, - activation=activation, - weight_initialization_mode=model_cfgs.weight_initialization_mode, - output_activation=activation, - ) - else: - shared = None - actor_builder = ActorBuilder( - obs_dim=self.obs_dim, - act_dim=self.act_dim, - weight_initialization_mode=model_cfgs.weight_initialization_mode, - shared=shared, - **model_cfgs.ac_kwargs.pi, - ) - - if model_cfgs.actor_type == 'cholesky': - self.actor = actor_builder.build_actor( - model_cfgs.actor_type, - act_max=torch.as_tensor(action_space.high), - act_min=torch.as_tensor(action_space.low), - cov_min=model_cfgs.cov_min, - mu_clamp_min=model_cfgs.mu_clamp_min, - mu_clamp_max=model_cfgs.mu_clamp_max, - cov_clamp_min=model_cfgs.cov_clamp_min, - cov_clamp_max=model_cfgs.cov_clamp_max, - ) - elif self.act_space_type == 'discrete': - self.actor = actor_builder.build_actor('categorical') - else: - act_max = torch.as_tensor(action_space.high) - act_min = torch.as_tensor(action_space.low) - self.actor = actor_builder.build_actor( - model_cfgs.actor_type, - act_max=act_max, - act_min=act_min, - ) - - self.critic = QCritic( - self.obs_dim, - self.act_dim, - hidden_sizes=model_cfgs.ac_kwargs.val.hidden_sizes, - activation=model_cfgs.ac_kwargs.val.activation, - weight_initialization_mode=model_cfgs.weight_initialization_mode, - shared=shared, - num_critics=model_cfgs.ac_kwargs.val.num_critics, - action_type='continuous' if isinstance(action_space, Box) else 'discrete', - ) - - def forward(self, obs: torch.Tensor) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: - """Forward pass of the actor-critic model""" - return self.step(obs) - - def step( - self, obs: torch.Tensor, deterministic: bool = False - ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: - """Step function of the actor-critic model - - Input observation, output value (from :class:`Critic`) action, - and its log probability (from :class`Actor`). - - .. note:: - The observation is standardized by the running mean and standard deviation. - - Args: - obs (torch.Tensor): Observation. - deterministic (bool, optional): Whether to use deterministic action. - """ - with torch.no_grad(): - raw_action, action, logp_a = self.actor.predict( - obs, deterministic=deterministic, need_log_prob=True - ) - value = self.critic(obs, action)[0] - - return raw_action, action, value, logp_a - - def anneal_exploration(self, frac: float) -> None: - """Update internals of actors - - Updates exploration parameters for Gaussian actors update log_std - - Args: - frac: progress of epochs. 1.0 is the end of training. 
- """ - if hasattr(self.actor, 'set_std'): - self.actor.set_std(1 - frac) diff --git a/omnisafe/models/base.py b/omnisafe/models/base.py index da5fb4511..784c7723a 100644 --- a/omnisafe/models/base.py +++ b/omnisafe/models/base.py @@ -14,17 +14,18 @@ # ============================================================================== """This module contains some base abstract classes for the models.""" -import abc -from typing import List, Tuple, Union +from abc import ABC, abstractmethod +from typing import List import torch import torch.nn as nn -from torch.distributions.normal import Normal +from gymnasium import spaces +from torch.distributions import Distribution -from omnisafe.utils.model_utils import Activation, InitFunction +from omnisafe.typing import Activation, InitFunction, OmnisafeSpace -class Actor(abc.ABC, nn.Module): +class Actor(ABC, nn.Module): """A abstract class for actor. An actor approximates the policy function that maps observations to actions. @@ -38,34 +39,45 @@ class Actor(abc.ABC, nn.Module): # pylint: disable-next=too-many-arguments def __init__( self, - obs_dim: int, - act_dim: int, - hidden_sizes: list, + obs_space: OmnisafeSpace, + act_space: OmnisafeSpace, + hidden_sizes: List[int], activation: Activation = 'relu', - weight_initialization_mode: InitFunction = 'xavier_uniform', - shared: nn.Module = None, + weight_initialization_mode: InitFunction = 'kaiming_uniform', ) -> None: """Initialize the base actor. Args: - obs_dim (int): observation dimension. - act_dim (int): action dimension. + obs_space (OmnisafeSpace): observation space. + act_space (OmnisafeSpace): action space. hidden_sizes (list): hidden layer sizes. activation (Activation): activation function. weight_initialization_mode (InitFunction, optional): weight initialization mode. - Defaults to ``xavier_uniform``. + Defaults to ``kaiming_uniform``. shared (nn.Module, optional): shared module. Defaults to None. """ nn.Module.__init__(self) - self.obs_dim = obs_dim - self.act_dim = act_dim - self.shared = shared - self.weight_initialization_mode = weight_initialization_mode - self.activation = activation - self.hidden_sizes = hidden_sizes - - @abc.abstractmethod - def _distribution(self, obs) -> Normal: + self._obs_space = obs_space + self._act_space = act_space + self._weight_initialization_mode = weight_initialization_mode + self._activation = activation + self._hidden_sizes = hidden_sizes + + self._current_dist: Distribution + self._after_inference: bool = False + + if isinstance(self._obs_space, spaces.Box) and len(self._obs_space.shape) == 1: + self._obs_dim = self._obs_space.shape[0] + else: + raise NotImplementedError + + if isinstance(self._act_space, spaces.Box) and len(self._act_space.shape) == 1: + self._act_dim = self._act_space.shape[0] + else: + raise NotImplementedError + + @abstractmethod + def _distribution(self, obs: torch.Tensor) -> Distribution: r"""Return the distribution of action. An actor generates a distribution, which is used to sample actions during training. @@ -86,15 +98,28 @@ def _distribution(self, obs) -> Normal: Args: obs (torch.Tensor): observation. + + Returns: + Distribution: the distribution of action. + """ + + @abstractmethod + def forward(self, obs: torch.Tensor) -> Distribution: + r"""Return the distribution of action. + + Args: + obs (torch.Tensor): observation. + + Returns: + Distribution: the distribution of action. 
""" - @abc.abstractmethod + @abstractmethod def predict( self, obs: torch.Tensor, deterministic: bool = False, - need_log_prob: bool = False, - ) -> Union[Tuple[torch.Tensor, torch.Tensor], torch.Tensor]: + ) -> torch.Tensor: r"""Predict deterministic or stochastic action based on observation. - ``deterministic`` = ``True`` or ``False`` @@ -108,13 +133,6 @@ def predict( we want to know the actual action that the agent will take, so we should use deterministic actions (set ``deterministic`` = ``True``). - - ``need_log_prob`` = ``True`` or ``False`` - - In some cases, we need to calculate the log probability of the action, - which is used to calculate the loss of the actor. - For example, in the case of Policy Gradient, - the loss is defined as - .. math:: L = -\mathbb{E}_{s \sim p(s)} [\log p(a | s) A^R (s, a)] @@ -126,11 +144,24 @@ def predict( Args: obs (torch.Tensor): observation. deterministic (bool, optional): whether to predict deterministic action. Defaults to False. - need_log_prob (bool, optional): whether to return log probability of action. Defaults to False. + """ + + @abstractmethod + def log_prob(self, act: torch.Tensor) -> torch.Tensor: + r"""Return the log probability of action under the distribution. + + log_prob only can be called after calling ``predict`` or ``forward``. + + Args: + obs (torch.Tensor): observation. + act (torch.Tensor): action. + + Returns: + torch.Tensor: the log probability of action under the distribution. """ -class Critic(abc.ABC, nn.Module): +class Critic(ABC, nn.Module): """A abstract class for critic. A critic approximates the value function that maps observations to values. @@ -147,46 +178,40 @@ class Critic(abc.ABC, nn.Module): # pylint: disable-next=too-many-arguments def __init__( self, - obs_dim: int, - act_dim: int, - hidden_sizes: list, + obs_space: OmnisafeSpace, + act_space: OmnisafeSpace, + hidden_sizes: List[int], activation: Activation = 'relu', - weight_initialization_mode: InitFunction = 'xavier_uniform', - shared: nn.Module = None, + weight_initialization_mode: InitFunction = 'kaiming_uniform', + num_critics: int = 1, + use_obs_encoder: bool = False, ) -> None: """Initialize the base critic. Args: - obs_dim (int): observation dimension. - act_dim (int): action dimension. + obs_space (OmnisafeSpace): observation space. + act_space (OmnisafeSpace): action space. hidden_sizes (list): hidden layer sizes. activation (Activation, optional): activation function. Defaults to 'relu'. weight_initialization_mode (InitFunction, optional): weight initialization mode. - Defaults to 'xavier_uniform'. + Defaults to 'kaiming_uniform'. shared (nn.Module, optional): shared module. Defaults to None. """ nn.Module.__init__(self) - self.obs_dim = obs_dim - self.act_dim = act_dim - self.shared = shared - self.weight_initialization_mode = weight_initialization_mode - self.activation = activation - self.hidden_sizes = hidden_sizes - - @abc.abstractmethod - def forward( - self, - obs: torch.Tensor, - act: torch.Tensor = None, - ) -> Union[torch.Tensor, List]: - """Forward function for critic. - - .. note:: - This forward function has two modes: - - If ``act`` is not None, it will return the value of the observation-action pair. - - If ``act`` is None, it will return the value of the observation. - - Args: - obs (torch.Tensor): observation. - act (torch.Tensor, optional): action. Defaults to None. 
- """ + self._obs_space = obs_space + self._act_space = act_space + self._weight_initialization_mode = weight_initialization_mode + self._activation = activation + self._hidden_sizes = hidden_sizes + self._num_critics = num_critics + self._use_obs_encoder = use_obs_encoder + + if isinstance(self._obs_space, spaces.Box) and len(self._obs_space.shape) == 1: + self._obs_dim = self._obs_space.shape[0] + else: + raise NotImplementedError + + if isinstance(self._act_space, spaces.Box) and len(self._act_space.shape) == 1: + self._act_dim = self._act_space.shape[0] + else: + raise NotImplementedError diff --git a/omnisafe/models/constraint_actor_critic.py b/omnisafe/models/constraint_actor_critic.py deleted file mode 100644 index 8c12dc0b6..000000000 --- a/omnisafe/models/constraint_actor_critic.py +++ /dev/null @@ -1,113 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Implementation of ConstraintActorCritic.""" - -from typing import NamedTuple, Tuple - -import numpy as np -import torch -from gymnasium.spaces import Box - -from omnisafe.models.actor_critic import ActorCritic -from omnisafe.models.critic import CriticBuilder - - -class ConstraintActorCritic(ActorCritic): - """ConstraintActorCritic is a wrapper around ActorCritic that adds a cost critic to the model. - - In ``omnisafe``, we combine the actor and critic into one this class. - - .. list-table:: - - * - Model - - Description - - Function - * - Actor - - The policy network, input is observation, output is action. - Choose the actor from the following options: - :class:`MLPActor`, :class:`CategoricalActor`, :class:`GaussianAnnealingActor`, - :class:`GaussianLearningActor`, :class:`GaussianStdNetActor`, :class:`MLPCholeskyActor`. - - Choose the action based on the observation. - * - Reward Critic - - The value network, input is observation, - output is reward value. - Choose the critic from the following options: - :class:`QCritic`, :class:`VCritic`. - - Estimate the reward value of the observation. - * - Cost Critic - - The value network, input is observation, - output is cost value. - Choose the critic from the following options: - :class:`QCritic`, :class:`VCritic`. - - Estimate the cost value of the observation. - """ - - # pylint: disable-next=too-many-arguments - def __init__( - self, - observation_space: Box, - action_space: Box, - model_cfgs: NamedTuple, - ) -> None: - """Initialize ConstraintActorCritic - - Args: - observation_space (Box): Observation space. - action_space (Box): Action space. - standardized_obs (bool): Whether to standardize the observation. - scale_rewards (bool): Whether to scale the rewards. - model_cfgs (NamedTuple): Model configurations. 
- """ - ActorCritic.__init__( - self, - observation_space, - action_space, - model_cfgs, - ) - - critic_builder = CriticBuilder( - obs_dim=self.obs_dim, - act_dim=self.act_dim, - hidden_sizes=self.ac_kwargs.val.hidden_sizes, - activation=self.ac_kwargs.val.activation, - weight_initialization_mode=self.model_cfgs.weight_initialization_mode, - shared=self.shared, - ) - self.cost_critic = critic_builder.build_critic('v') - - def step( - self, obs: torch.Tensor, deterministic: bool = False - ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray,]: - """Step function of the actor-critic model - - Input observation, output reward and cost value (from :class:`Critic`) action, - and its log probability (from :class`Actor`). - - .. note:: - The observation is standardized by the running mean and standard deviation. - - Args: - obs (torch.Tensor): Observation. - deterministic (bool, optional): Whether to use deterministic action. - """ - with torch.no_grad(): - value = self.reward_critic(obs) - cost_value = self.cost_critic(obs) - - raw_action, action, logp_a = self.actor.predict( - obs, deterministic=deterministic, need_log_prob=True - ) - - return raw_action, action, value, cost_value, logp_a diff --git a/omnisafe/models/constraint_actor_q_critic.py b/omnisafe/models/constraint_actor_q_critic.py deleted file mode 100644 index 7050f0840..000000000 --- a/omnisafe/models/constraint_actor_q_critic.py +++ /dev/null @@ -1,109 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Implementation of ConstraintActorQCritic.""" - -from typing import NamedTuple, Tuple - -import numpy as np -import torch -from gymnasium.spaces import Box - -from omnisafe.models.actor_q_critic import ActorQCritic -from omnisafe.models.critic.q_critic import QCritic - - -class ConstraintActorQCritic(ActorQCritic): - """ConstraintActorCritic is a wrapper around ActorCritic that adds a cost critic to the model. - - In ``omnisafe``, we combine the actor and critic into one this class. - - .. list-table:: - - * - Model - - Description - - Function - * - Actor - - The policy network, input is observation, output is action. - Choose the actor from the following options: - :class:`MLPActor`, :class:`CategoricalActor`, :class:`GaussianAnnealingActor`, - :class:`GaussianLearningActor`, :class:`GaussianStdNetActor`, :class:`MLPCholeskyActor`. - - Choose the action based on the observation. - * - Reward Q-Critic - - The value network, input is observation-action pair, - output is reward value. - Choose the critic from the following options: - :class:`QCritic`, :class:`VCritic`. - - Estimate the reward value of the observation. - * - Cost Q-Critic - - The value network, input is observation-action pair, - output is cost value. - Choose the critic from the following options: - :class:`QCritic`, :class:`VCritic`. - - Estimate the cost value of the observation. 
- """ - - # pylint: disable-next=too-many-arguments - def __init__( - self, - observation_space: Box, - action_space: Box, - model_cfgs: NamedTuple, - ) -> None: - """Initialize ConstraintActorQCritic. - - Args: - observation_space: The observation space. - action_space: The action space. - standardized_obs: Whether to standardize the observation. - model_cfgs: The model configurations. - """ - - super().__init__( - observation_space=observation_space, - action_space=action_space, - model_cfgs=model_cfgs, - ) - self.cost_critic = QCritic( - obs_dim=self.obs_dim, - act_dim=self.act_dim, - hidden_sizes=self.ac_kwargs.val.hidden_sizes, - activation=self.ac_kwargs.val.activation, - weight_initialization_mode=model_cfgs.weight_initialization_mode, - shared=model_cfgs.shared_weights, - ) - - def step( - self, obs: torch.Tensor, deterministic: bool = False - ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray,]: - """Step function of the actor-critic model - - Input observation-action pair, output reward and cost value (from :class:`QCritic`) action, - and its log probability (from :class`Actor`). - - .. note:: - The observation is standardized by the running mean and standard deviation. - - Args: - obs (torch.Tensor): Observation. - deterministic (bool, optional): Whether to use deterministic action. - """ - with torch.no_grad(): - raw_action, action, logp_a = self.actor.predict( - obs, deterministic=deterministic, need_log_prob=True - ) - value = self.critic(obs, action)[0] - cost_value = self.cost_critic(obs, action)[0] - - return raw_action, action, value, cost_value, logp_a diff --git a/omnisafe/models/critic/critic_builder.py b/omnisafe/models/critic/critic_builder.py index 733e99478..c3aa69f7f 100644 --- a/omnisafe/models/critic/critic_builder.py +++ b/omnisafe/models/critic/critic_builder.py @@ -14,13 +14,13 @@ # ============================================================================== """Implementation of CriticBuilder.""" -from typing import Union - -import torch.nn as nn +import difflib +from typing import List +from omnisafe.models.base import Critic from omnisafe.models.critic.q_critic import QCritic from omnisafe.models.critic.v_critic import VCritic -from omnisafe.utils.model_utils import Activation, InitFunction +from omnisafe.typing import Activation, CriticType, InitFunction, OmnisafeSpace # pylint: disable-next=too-few-public-methods @@ -40,35 +40,37 @@ class CriticBuilder: # pylint: disable-next=too-many-arguments def __init__( self, - obs_dim: int, - act_dim: int, - hidden_sizes: list, + obs_space: OmnisafeSpace, + act_space: OmnisafeSpace, + hidden_sizes: List[int], activation: Activation = 'relu', weight_initialization_mode: InitFunction = 'kaiming_uniform', - shared: nn.Module = None, + num_critics: int = 1, + use_obs_encoder: bool = False, ) -> None: """Initialize CriticBuilder. Args: - obs_dim (int): Observation dimension. - act_dim (int): Action dimension. - hidden_sizes (list): Hidden layer sizes. + obs_space (OmnisafeSpace): Observation space. + act_space (OmnisafeSpace): Action space. + hidden_sizes (List[int]): Hidden sizes of the critic network. activation (Activation): Activation function. weight_initialization_mode (InitFunction): Weight initialization mode. - shared (nn.Module): Shared network. + num_critics (int): Number of critics. + use_obs_encoder (bool): Whether to use observation encoder, only used in q critic. 
""" - self.obs_dim = obs_dim - self.act_dim = act_dim - self.hidden_sizes = hidden_sizes - self.activation = activation - self.weight_initialization_mode = weight_initialization_mode - self.shared = shared + self._obs_space = obs_space + self._act_space = act_space + self._weight_initialization_mode = weight_initialization_mode + self._activation = activation + self._hidden_sizes = hidden_sizes + self._num_critics = num_critics + self._use_obs_encoder = use_obs_encoder def build_critic( self, - critic_type: str, - use_obs_encoder: bool = True, - ) -> Union[QCritic, VCritic, NotImplementedError]: + critic_type: CriticType, + ) -> Critic: """Build critic. Currently, we support two types of critics: ``q`` and ``v``. @@ -79,22 +81,25 @@ def build_critic( """ if critic_type == 'q': return QCritic( - obs_dim=self.obs_dim, - act_dim=self.act_dim, - hidden_sizes=self.hidden_sizes, - activation=self.activation, - weight_initialization_mode=self.weight_initialization_mode, - shared=self.shared, - use_obs_encoder=use_obs_encoder, + obs_space=self._obs_space, + act_space=self._act_space, + hidden_sizes=self._hidden_sizes, + activation=self._activation, + weight_initialization_mode=self._weight_initialization_mode, + num_critics=self._num_critics, + use_obs_encoder=self._use_obs_encoder, ) if critic_type == 'v': return VCritic( - obs_dim=self.obs_dim, - act_dim=self.act_dim, - hidden_sizes=self.hidden_sizes, - activation=self.activation, - weight_initialization_mode=self.weight_initialization_mode, - shared=self.shared, + obs_space=self._obs_space, + act_space=self._act_space, + hidden_sizes=self._hidden_sizes, + activation=self._activation, + weight_initialization_mode=self._weight_initialization_mode, + num_critics=self._num_critics, ) - raise NotImplementedError(f'critic_type "{critic_type}" is not implemented.') + raise NotImplementedError( + f'critic_type "{critic_type}" is not implemented.' + f'Did you mean one of {difflib.get_close_matches(critic_type, ["q", "v"])[0]}?' + ) diff --git a/omnisafe/models/critic/q_critic.py b/omnisafe/models/critic/q_critic.py index 06f95ef48..5c14f5056 100644 --- a/omnisafe/models/critic/q_critic.py +++ b/omnisafe/models/critic/q_critic.py @@ -13,13 +13,15 @@ # limitations under the License. # ============================================================================== """Implementation of QCritic.""" -from typing import List, Optional + +from typing import List import torch import torch.nn as nn from omnisafe.models.base import Critic -from omnisafe.utils.model_utils import Activation, InitFunction, build_mlp_network +from omnisafe.typing import Activation, InitFunction, OmnisafeSpace +from omnisafe.utils.model import build_mlp_network class QCritic(Critic): @@ -33,24 +35,22 @@ class QCritic(Critic): # pylint: disable-next=too-many-arguments def __init__( self, - obs_dim: int, - act_dim: int, - hidden_sizes: list, + obs_space: OmnisafeSpace, + act_space: OmnisafeSpace, + hidden_sizes: List[int], activation: Activation = 'relu', - weight_initialization_mode: InitFunction = 'xavier_uniform', - shared: nn.Module = None, + weight_initialization_mode: InitFunction = 'kaiming_uniform', num_critics: int = 1, - use_obs_encoder: bool = True, - action_type: str = 'continuous', + use_obs_encoder: bool = False, ) -> None: """Initialize the critic network. The Q critic network has two modes: - ``use_obs_encoder`` = ``False`` : - The input of the network is the concatenation of the observation and action. 
+ The input of the network is the concatenation of the observation and action. - ``use_obs_encoder`` = ``True`` : - The input of the network is the concatenation of the output of the observation encoder and action. + The input of the network is the concatenation of the output of the observation encoder and action. For example, in :class:`DDPG`, the action is not directly concatenated with the observation, @@ -63,56 +63,55 @@ def __init__( you need to use the index to get it. Args: - obs_dim (int): Observation dimension. - act_dim (int): Action dimension. - hidden_sizes (list): Hidden layer sizes. - activation (Activation): Activation function. - weight_initialization_mode (InitFunction): Weight initialization mode. - shared (nn.Module): Shared network. - num_critics (int): Number of critics. - use_obs_encoder (bool): Whether to use observation encoder. + obs_space (OmnisafeSpace): observation space. + act_space (OmnisafeSpace): action space. + hidden_sizes (list): list of hidden layer sizes. + activation (Activation): activation function. + weight_initialization_mode (InitFunction): weight initialization mode. + shared (nn.Module): shared network. + num_critics (int): number of critics. + use_obs_encoder (bool): whether to use observation encoder. + """ - self.use_obs_encoder = use_obs_encoder - Critic.__init__( - self, - obs_dim=obs_dim, - act_dim=act_dim, - hidden_sizes=hidden_sizes, - activation=activation, - weight_initialization_mode=weight_initialization_mode, - shared=shared, + super().__init__( + obs_space, + act_space, + hidden_sizes, + activation, + weight_initialization_mode, + num_critics, + use_obs_encoder, ) - self.critic_list = [] - expand_dim = act_dim if action_type == 'continuous' else 1 - for idx in range(num_critics): - if self.use_obs_encoder: + self.net_lst: List[nn.Module] = [] + for idx in range(self._num_critics): + if self._use_obs_encoder: obs_encoder = build_mlp_network( - [obs_dim, hidden_sizes[0]], + [self._obs_dim, hidden_sizes[0]], activation=activation, output_activation=activation, weight_initialization_mode=weight_initialization_mode, ) net = build_mlp_network( - [hidden_sizes[0] + expand_dim] + hidden_sizes[1:] + [1], + [hidden_sizes[0] + self._act_dim] + hidden_sizes[1:] + [1], activation=activation, weight_initialization_mode=weight_initialization_mode, ) critic = nn.Sequential(obs_encoder, net) else: net = build_mlp_network( - [obs_dim + act_dim] + hidden_sizes[:] + [1], + [self._obs_dim + self._act_dim] + hidden_sizes + [1], activation=activation, weight_initialization_mode=weight_initialization_mode, ) critic = nn.Sequential(net) - self.critic_list.append(critic) + self.net_lst.append(critic) self.add_module(f'critic_{idx}', critic) def forward( self, obs: torch.Tensor, - act: Optional[torch.Tensor] = None, - ) -> List: + act: torch.Tensor, + ) -> List[torch.Tensor]: """Forward function. As a multi-critic network, the output of the network is a list of Q-values. @@ -125,10 +124,6 @@ def forward( act (torch.Tensor): Action. 
""" res = [] - for critic in self.critic_list: - if self.use_obs_encoder: - encodered_obs = critic[0](obs) - res.append(torch.squeeze(critic[1](torch.cat([encodered_obs, act], dim=-1)), -1)) - else: - res.append(torch.squeeze(critic[0](torch.cat([obs, act], dim=-1)), -1)) + for critic in self.net_lst: + res.append(torch.squeeze(critic(torch.cat([obs, act], dim=-1)), -1)) return res diff --git a/omnisafe/models/critic/v_critic.py b/omnisafe/models/critic/v_critic.py index 19c6aa022..5e8d859ba 100644 --- a/omnisafe/models/critic/v_critic.py +++ b/omnisafe/models/critic/v_critic.py @@ -14,11 +14,14 @@ # ============================================================================== """Implementation of VCritic.""" +from typing import List + import torch import torch.nn as nn from omnisafe.models.base import Critic -from omnisafe.utils.model_utils import Activation, InitFunction, build_mlp_network +from omnisafe.typing import Activation, InitFunction, OmnisafeSpace +from omnisafe.utils.model import build_mlp_network class VCritic(Critic): @@ -29,15 +32,14 @@ class VCritic(Critic): You can design your own V-function approximator by inheriting this class or :class:`Critic`. """ - # pylint: disable-next=too-many-arguments def __init__( self, - obs_dim: int, - act_dim: int, - hidden_sizes: list, + obs_space: OmnisafeSpace, + act_space: OmnisafeSpace, + hidden_sizes: List[int], activation: Activation = 'relu', - weight_initialization_mode: InitFunction = 'xavier_uniform', - shared: nn.Module = None, + weight_initialization_mode: InitFunction = 'kaiming_uniform', + num_critics: int = 1, ) -> None: """Initialize the critic network. @@ -49,41 +51,37 @@ def __init__( weight_initialization_mode (InitFunction): Weight initialization mode. shared (nn.Module): Shared network. """ - Critic.__init__( - self, - obs_dim=obs_dim, - act_dim=act_dim, - hidden_sizes=hidden_sizes, - activation=activation, - weight_initialization_mode=weight_initialization_mode, - shared=shared, + super().__init__( + obs_space, + act_space, + hidden_sizes, + activation, + weight_initialization_mode, + num_critics, + use_obs_encoder=False, ) - if shared is not None: - value_head = build_mlp_network( - sizes=[hidden_sizes[-1], 1], - activation=activation, - weight_initialization_mode=weight_initialization_mode, - ) - self.net = nn.Sequential(shared, value_head) - else: - self.net = build_mlp_network( - [obs_dim] + list(hidden_sizes) + [1], - activation=activation, - weight_initialization_mode=weight_initialization_mode, + self.net_lst: List[nn.Module] = [] + for idx in range(self._num_critics): + net = build_mlp_network( + sizes=[self._obs_dim, *self._hidden_sizes, 1], + activation=self._activation, + weight_initialization_mode=self._weight_initialization_mode, ) - self.add_module('critic', self.net) + self.net_lst.append(net) + self.add_module(f'critic_{idx}', net) def forward( self, obs: torch.Tensor, - act: torch.Tensor = None, - ) -> torch.Tensor: + ) -> List[torch.Tensor]: """Forward function. Specifically, V function approximator maps observations to V-values. Args: obs (torch.Tensor): Observations. - act (torch.Tensor): Actions. 
""" - return torch.squeeze(self.net(obs), -1) + res = [] + for critic in self.net_lst: + res.append(torch.squeeze(critic(obs), -1)) + return res diff --git a/omnisafe/typing.py b/omnisafe/typing.py index a5369fcbb..48e81c4eb 100644 --- a/omnisafe/typing.py +++ b/omnisafe/typing.py @@ -36,6 +36,8 @@ Activation = Literal['identity', 'relu', 'sigmoid', 'softplus', 'tanh'] AdvatageEstimator = Literal['gae', 'gae-rtg', 'vtrace', 'plain'] InitFunction = Literal['kaiming_uniform', 'xavier_normal', 'glorot', 'xavier_uniform', 'orthogonal'] +CriticType = Literal['v', 'q'] +ActorType = Literal['gaussian_learning', 'gaussian_sac'] __all__ = [ 'Activation', diff --git a/omnisafe/utils/config.py b/omnisafe/utils/config.py index 34ff4acaa..5eed962a6 100644 --- a/omnisafe/utils/config.py +++ b/omnisafe/utils/config.py @@ -20,7 +20,7 @@ import yaml -from omnisafe.typing import Activation, AdvatageEstimator, InitFunction +from omnisafe.typing import Activation, ActorType, AdvatageEstimator, InitFunction class Config(dict): @@ -54,21 +54,7 @@ class Config(dict): max_grad_norm: float use_critic_norm: bool critic_norm_coeff: bool - model_cfgs: 'Config' - shared_weights: bool - weight_initialization_mode: InitFunction - actor_type: str - ac_kwargs: 'Config' - pi: 'Config' - hidden_sizes: List[int] - activation: Activation - output_activation: Activation - scale_action: bool - clip_action: bool - std_learning: bool - std_init: float - val: 'Config' - num_critics: int + model_cfgs: 'ModelConfig' buffer_cfgs: 'Config' gamma: float lam: float @@ -100,6 +86,10 @@ def __getattr__(self, name: str) -> Any: except KeyError: return super().__getattribute__(name) + def __setattr__(self, name: str, value: Any) -> None: + """Set attribute.""" + self[name] = value + def todict(self) -> dict: """Convert Config to dictionary.""" config_dict = {} @@ -145,6 +135,20 @@ def recurisve_update(self, update_args: Dict[str, Any]) -> None: self[key] = value +class ModelConfig(Config): + """Model config.""" + + weight_initialization_mode: InitFunction + actor_type: ActorType + actor: 'ModelConfig' + critic: 'ModelConfig' + hidden_sizes: List[int] + activation: Activation + std: List[float] + use_obs_encoder: bool + lr: float + + def get_default_kwargs_yaml(algo: str, env_id: str, algo_type: str) -> Config: """Get the default kwargs from ``yaml`` file. diff --git a/omnisafe/utils/core.py b/omnisafe/utils/core.py deleted file mode 100644 index 3329abcf4..000000000 --- a/omnisafe/utils/core.py +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Some Core Functions""" - -from typing import Union - -import torch - -from omnisafe.models import ConstraintActorCritic, ConstraintActorQCritic - - -def set_optimizer( - opt: str, module: Union[ConstraintActorCritic, ConstraintActorQCritic], learning_rate: float -) -> torch.optim.Optimizer: - """Returns an initialized optimizer from PyTorch. - - .. note:: - - The optimizer can be chosen from the following list: - - - Adam - - AdamW - - Adadelta - - Adagrad - - Adamax - - ASGD - - LBFGS - - RMSprop - - Rprop - - SGD - - Args: - opt (str): optimizer name. - module (torch.nn.Module): module to be optimized. - learning_rate (float): learning rate. - """ - assert hasattr(torch.optim, opt), f'Optimizer={opt} not found in torch.' - optimizer = getattr(torch.optim, opt) - - return optimizer(module.parameters(), lr=learning_rate, eps=1e-5) - - -def discount_cumsum_torch(x_vector: torch.Tensor, discount: float) -> torch.Tensor: - """Compute the discounted cumulative sum of vectors.""" - length = x_vector.shape[0] - x_vector = x_vector.type(torch.float64) - for idx in reversed(range(length)): - if idx == length - 1: - cumsum = x_vector[idx] - else: - cumsum = x_vector[idx] + discount * cumsum - x_vector[idx] = cumsum - return x_vector diff --git a/omnisafe/utils/distributed_utils.py b/omnisafe/utils/distributed.py similarity index 64% rename from omnisafe/utils/distributed_utils.py rename to omnisafe/utils/distributed.py index cf2e88000..615784566 100644 --- a/omnisafe/utils/distributed_utils.py +++ b/omnisafe/utils/distributed.py @@ -17,7 +17,7 @@ import os import subprocess import sys -from typing import Tuple +from typing import Any, Tuple, Union import numpy as np import torch @@ -25,57 +25,54 @@ from torch.distributed import ReduceOp -def setup_torch_for_mpi(): +def setup_distributed() -> None: """Avoid slowdowns caused by each separate process's PyTorch, using more than its fair share of CPU resources. """ old_num_threads = torch.get_num_threads() # decrease number of torch threads for MPI - if old_num_threads > 1 and num_procs() > 1: - fair_num_threads = max(int(torch.get_num_threads() / num_procs()), 1) + if old_num_threads > 1 and world_size() > 1: + fair_num_threads = max(int(torch.get_num_threads() / world_size()), 1) torch.set_num_threads(fair_num_threads) print( - f'Proc {proc_id()}: Decreased number of Torch threads from ' + f'Proc {get_rank()}: Decreased number of Torch threads from ' f'{old_num_threads} to {torch.get_num_threads()}', flush=True, ) -def mpi_avg_grads(module: torch.nn.Module) -> None: - """Average contents of gradient buffers across MPI processes. +def get_rank() -> int: + """Get rank of calling process.""" + if os.getenv('MASTER_ADDR') is None: + return 0 + return dist.get_rank() - Args: - module (torch.nn.Module): module to be averaged. - """ - if num_procs() > 1: - for parameter in module.parameters(): - p_grad_numpy = parameter.grad - avg_p_grad = mpi_avg(parameter.grad) - p_grad_numpy[:] = avg_p_grad[:] +def is_master() -> bool: + """Test whether the process is the root process.""" + return bool(get_rank() == 0) -def sync_params(module: torch.nn.Module) -> None: - """Sync all parameters of module across all MPI processes. - .. note:: +def world_size() -> int: + """Count active MPI processes.""" + if os.getenv('MASTER_ADDR') is None: + return 1 + return dist.get_world_size() - This function only works when the training is multi-processing. 
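The removed ``set_optimizer`` helper resolved optimizers by name with ``getattr``; the same pattern in standalone form::

    import torch
    from torch import nn

    opt_name, lr = 'Adam', 3e-4
    module = nn.Linear(4, 2)
    assert hasattr(torch.optim, opt_name), f'Optimizer={opt_name} not found in torch.'
    optimizer = getattr(torch.optim, opt_name)(module.parameters(), lr=lr, eps=1e-5)
    print(type(optimizer).__name__)   # Adam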
- Args: - module (torch.nn.Module): module to be synchronized. - """ - if num_procs() > 1: - for parameter in module.parameters(): - p_numpy = parameter.data - broadcast(p_numpy) +reduce = dist.reduce +allreduce = dist.all_reduce +gather = dist.gather +allgather = dist.all_gather +broadcast = dist.broadcast +scatter = dist.scatter -def mpi_fork( +def fork( parallel: int, bind_to_core: bool = False, use_number_of_threads: bool = False, device: str = 'cpu', - test_message: list = None, ) -> bool: """The entrance of multi-processing. @@ -93,10 +90,10 @@ def mpi_fork( bind_to_core (bool, optional): Defaults to False. use_number_of_threads (bool, optional): Defaults to False. """ - is_parent = False - back_end = 'gloo' if device == 'cpu' else 'nccl' + is_parent: bool = False + backend = 'gloo' if device == 'cpu' else 'nccl' if os.getenv('MASTER_ADDR') is not None and os.getenv('IN_DIST') is None: - dist.init_process_group(backend=back_end) + dist.init_process_group(backend=backend) os.environ['IN_DIST'] = '1' # check if MPI is already setup.. if parallel > 1 and os.getenv('MASTER_ADDR') is None: @@ -116,36 +113,15 @@ def mpi_fork( args += ['-bind-to', 'core'] if use_number_of_threads: args += ['--use-hwthread-cpus'] - args += test_message or sys.argv + args += sys.argv + print(sys.argv) # this is the parent process, spawn sub-processes.. subprocess.check_call(args, env=env) is_parent = True return is_parent -def is_root_process() -> bool: - """Test whether the process is the root process.""" - return bool(dist.get_rank() == 0) - - -def proc_id() -> int: - """Get rank of calling process.""" - if os.getenv('MASTER_ADDR') is None: - return 0 - return dist.get_rank() - - -def allreduce(*args, **kwargs) -> torch.Tensor: - """Allreduce operation.""" - return dist.all_reduce(*args, **kwargs) - - -def gather(*args, **kwargs) -> torch.Tensor: - """Gather operation.""" - return dist.gather(*args, **kwargs) - - -def mpi_avg_torch_tensor(value: torch.Tensor) -> None: +def avg_tensor(value: torch.Tensor) -> None: """Average a torch tensor over MPI processes. Since torch and numpy share same memory space, tensors of dim > 0 can be be manipulated through call by reference, @@ -154,40 +130,76 @@ def mpi_avg_torch_tensor(value: torch.Tensor) -> None: value (torch.Tensor): value to be averaged. """ assert isinstance(value, torch.Tensor) - if num_procs() > 1: + if world_size() > 1: assert len(value.shape) > 0 - avg_x = mpi_avg(value) + avg_x = dist_avg(value) value[:] = avg_x[:] -def num_procs() -> int: - """Count active MPI processes.""" - if os.getenv('MASTER_ADDR') is None: - return 1 - return dist.get_world_size() +def avg_grads(module: torch.nn.Module) -> None: + """Average contents of gradient buffers across MPI processes. + + Args: + module (torch.nn.Module): module to be averaged. + """ + if world_size() > 1: + for parameter in module.parameters(): + if parameter.grad is not None: + p_grad = parameter.grad + avg_p_grad = dist_avg(parameter.grad) + p_grad[:] = avg_p_grad[:] + + +def sync_params(module: torch.nn.Module) -> None: + """Sync all parameters of module across all MPI processes. + + .. note:: + This function only works when the training is multi-processing. -def broadcast(value: torch.Tensor, src: int = 0) -> torch.Tensor: - """Broadcast.""" - dist.broadcast(value, src=src) + Args: + module (torch.nn.Module): module to be synchronized. 
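A usage sketch for the renamed distributed helpers, assuming the ``omnisafe.utils.distributed`` module as introduced by this patch; with no ``MASTER_ADDR`` set it runs as a single process and the collective calls are no-ops::

    import torch
    from torch import nn

    from omnisafe.utils import distributed

    model = nn.Linear(4, 2)
    distributed.sync_params(model)      # broadcast initial weights from rank 0

    loss = model(torch.randn(4)).sum()
    loss.backward()
    distributed.avg_grads(model)        # average gradients across processes

    if distributed.is_master():
        print(f'rank {distributed.get_rank()} of {distributed.world_size()}')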
+ """ + if world_size() > 1: + for parameter in module.parameters(): + p_numpy = parameter.data + broadcast(p_numpy, src=0) -def mpi_avg(value: torch.Tensor) -> torch.Tensor: - """Average a scalar or numpy vector over MPI processes.""" - return mpi_sum(value) / num_procs() +def avg_params(module: torch.nn.Module) -> None: + """Average contents of all parameters across MPI processes. + + Args: + module (torch.nn.Module): module to be averaged. + """ + if world_size() > 1: + for parameter in module.parameters(): + param_tensor = parameter.data + avg_param_tensor = dist_avg(param_tensor) + param_tensor[:] = avg_param_tensor[:] + + +def dist_avg(value: Union[np.ndarray, torch.Tensor, int, float]) -> torch.Tensor: + """Average a tensor over distributed processes.""" + return dist_sum(value) / world_size() -def mpi_max(value: torch.Tensor) -> torch.Tensor: - """Determine global maximum of scalar or numpy array over MPI processes.""" - return mpi_op(value, ReduceOp.MAX) +def dist_max(value: Union[np.ndarray, torch.Tensor, int, float]) -> torch.Tensor: + """Determine global maximum of tensor over distributed processes.""" + return dist_op(value, ReduceOp.MAX) -def mpi_min(value: torch.Tensor) -> torch.Tensor: - """Determine global minimum of scalar or numpy array over MPI processes.""" - return mpi_op(value, ReduceOp.MIN) +def dist_min(value: Union[np.ndarray, torch.Tensor, int, float]) -> torch.Tensor: + """Determine global minimum of tensor over distributed processes.""" + return dist_op(value, ReduceOp.MIN) -def mpi_op(value: torch.Tensor, operation: ReduceOp) -> torch.Tensor: +def dist_sum(value: Union[np.ndarray, torch.Tensor, int, float]) -> torch.Tensor: + """Sum a tensor over distributed processes.""" + return dist_op(value, ReduceOp.SUM) + + +def dist_op(value: Union[np.ndarray, torch.Tensor, int, float], operation: Any) -> torch.Tensor: """Multi-processing operation. .. note:: @@ -199,20 +211,15 @@ def mpi_op(value: torch.Tensor, operation: ReduceOp) -> torch.Tensor: value (torch.Tensor): value to be operated. operation (ReduceOp): operation type. """ - if num_procs() == 1: - return value - value, scalar = ([value], True) if np.isscalar(value) else (value, False) - value = torch.as_tensor(value, dtype=torch.float32) + if world_size() == 1: + return torch.as_tensor(value, dtype=torch.float32) + value_, scalar = ([value], True) if np.isscalar(value) else (value, False) + value = torch.as_tensor(value_, dtype=torch.float32) allreduce(value, op=operation) return value[0] if scalar else value -def mpi_sum(value: torch.Tensor) -> torch.Tensor: - """Sum a scalar or numpy vector over MPI processes.""" - return mpi_op(value, ReduceOp.SUM) - - -def mpi_statistics_scalar( +def dist_statistics_scalar( value: torch.Tensor, with_min_and_max: bool = False ) -> Tuple[torch.Tensor, ...]: """Get mean/std and optional min/max of scalar x across MPI processes. @@ -221,14 +228,15 @@ def mpi_statistics_scalar( value (torch.Tensor): value to be operated. with_min_and_max (bool): whether to return min and max. 
""" - global_sum, global_n = mpi_sum([torch.sum(value), len(value)]) + global_sum = dist_sum(torch.sum(value)) + global_n = dist_sum(len(value)) mean = global_sum / global_n - global_sum_sq = mpi_sum(torch.sum((value - mean) ** 2)) + global_sum_sq = dist_sum(torch.sum((value - mean) ** 2)) # compute global std std = torch.sqrt(global_sum_sq / global_n) if with_min_and_max: - global_min = mpi_min(value) - global_max = mpi_max(value) + global_min = dist_min(value) + global_max = dist_max(value) return mean, std, global_min, global_max return mean, std diff --git a/omnisafe/utils/exp_grid_tools.py b/omnisafe/utils/exp_grid_tools.py index 027e1e949..9b7cc922d 100644 --- a/omnisafe/utils/exp_grid_tools.py +++ b/omnisafe/utils/exp_grid_tools.py @@ -15,6 +15,7 @@ """Tools for Experiment Grid.""" import string +from typing import List, Union def all_bools(vals: list) -> bool: @@ -22,7 +23,7 @@ def all_bools(vals: list) -> bool: return all(isinstance(v, bool) for v in vals) -def valid_str(vals: list or str or type) -> str: +def valid_str(vals: Union[List, str]) -> str: r"""Convert a value or values to a string which could go in a path of file. Partly based on `this gist`_. diff --git a/omnisafe/utils/algo_utils.py b/omnisafe/utils/math.py similarity index 50% rename from omnisafe/utils/algo_utils.py rename to omnisafe/utils/math.py index 0af602658..b8e936f5f 100644 --- a/omnisafe/utils/algo_utils.py +++ b/omnisafe/utils/math.py @@ -13,9 +13,10 @@ # limitations under the License. # ============================================================================== """Implementation of the algo utils.""" -from typing import Tuple +from typing import Callable, Tuple import torch +from torch.distributions import Normal, TanhTransform, TransformedDistribution def get_transpose(tensor: torch.Tensor) -> torch.Tensor: @@ -105,3 +106,124 @@ def gaussian_kl( c_mean_q = 0.5 * torch.mean(inner_mean_q) c_sigma_q = 0.5 * torch.mean(inner_sigma_q) return c_mean_q, c_sigma_q, torch.mean(sigma_p_det), torch.mean(sigma_q_det) + + +def discount_cumsum(x_vector: torch.Tensor, discount: float) -> torch.Tensor: + """Compute the discounted cumulative sum of vectors.""" + length = x_vector.shape[0] + x_vector = x_vector.type(torch.float64) + for idx in reversed(range(length)): + if idx == length - 1: + cumsum = x_vector[idx] + else: + cumsum = x_vector[idx] + discount * cumsum + x_vector[idx] = cumsum + return x_vector + + +def conjugate_gradients( + Avp: Callable[[torch.Tensor], torch.Tensor], + b_vector: torch.Tensor, + num_steps: int = 10, + residual_tol: float = 1e-10, + eps: float = 1e-6, +): # pylint: disable=invalid-name,too-many-locals + """Implementation of Conjugate gradient algorithm. + + Conjugate gradient algorithm is used to solve the linear system of equations :math:`Ax = b`. + The algorithm is described in detail in the paper `Conjugate Gradient Method`_. + + .. _Conjugate Gradient Method: https://en.wikipedia.org/wiki/Conjugate_gradient_method + + .. note:: + Increasing ``num_steps`` will lead to a more accurate approximation + to :math:`A^{-1} b`, and possibly slightly-improved performance, + but at the cost of slowing things down. + Also probably don't play with this hyperparameter. + + Args: + num_steps (int): Number of iterations of conjugate gradient to perform. 
+ """ + + x = torch.zeros_like(b_vector) + r = b_vector - Avp(x) + p = r.clone() + rdotr = torch.dot(r, r) + + for _ in range(num_steps): + z = Avp(p) + alpha = rdotr / (torch.dot(p, z) + eps) + x += alpha * p + r -= alpha * z + new_rdotr = torch.dot(r, r) + if torch.sqrt(new_rdotr) < residual_tol: + break + mu = new_rdotr / (rdotr + eps) + p = r + mu * p + rdotr = new_rdotr + return x + + +class SafeTanhTransformer(TanhTransform): + """Safe Tanh Transformer.""" + + def _call(self, x: torch.Tensor) -> torch.Tensor: + return torch.clamp(torch.tanh(x), min=-0.999999, max=0.999999) + + def _inverse(self, y: torch.Tensor) -> torch.Tensor: + if y.dtype.is_floating_point: + eps = torch.finfo(y.dtype).eps + else: + raise ValueError('Expected floating point type') + y = y.clamp(min=-1 + eps, max=1 - eps) + x = super()._inverse(y) + return x + + +class TanhNormal(TransformedDistribution): # pylint: disable=abstract-method + r""" + Creates a tanh-normal distribution. + + X ~ Normal(loc, scale) + Y = tanh(X) ~ TanhNormal(loc, scale) + + Example:: + + >>> m = TanhNormal(torch.tensor([0.0]), torch.tensor([1.0])) + >>> m.sample() # tanh-normal distributed with mean=0 and stddev=1 + tensor([-0.7616]) + + Args: + loc (float or Tensor): mean of the underlying normal distribution + scale (float or Tensor): standard deviation of the underlying normal distribution + """ + + arg_constraints = { + 'loc': Normal.arg_constraints['loc'], + 'scale': Normal.arg_constraints['scale'], + } + support = TransformedDistribution.support + has_rsample = True + + def __init__(self, loc, scale, validate_args=None): + base_dist = Normal(loc, scale, validate_args=validate_args) + super().__init__(base_dist, SafeTanhTransformer(), validate_args=validate_args) + + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(TanhNormal, _instance) + return super().expand(batch_shape, _instance=new) + + @property + def mean(self): + return SafeTanhTransformer()(self.base_dist.mean) + + @property + def stddev(self): + return self.base_dist.stddev + + def entropy(self): + return self.base_dist.entropy() + + @property + def variance(self): + return self.base_dist.variance diff --git a/omnisafe/utils/model_utils.py b/omnisafe/utils/model.py similarity index 67% rename from omnisafe/utils/model_utils.py rename to omnisafe/utils/model.py index 5361c744b..9ec8c75ec 100644 --- a/omnisafe/utils/model_utils.py +++ b/omnisafe/utils/model.py @@ -14,14 +14,13 @@ # ============================================================================== """This module contains the helper functions for the model.""" -from typing import List, Literal, Union +from typing import List, Type, Union import numpy as np +import torch from torch import nn - -Activation = Literal['identity', 'relu', 'sigmoid', 'softplus', 'tanh'] -InitFunction = Literal['kaiming_uniform', 'xavier_normal', 'glorot', 'xavier_uniform', 'orthogonal'] +from omnisafe.typing import Activation, InitFunction def initialize_layer(init_function: InitFunction, layer: nn.Linear) -> None: @@ -49,7 +48,7 @@ def initialize_layer(init_function: InitFunction, layer: nn.Linear) -> None: def get_activation( activation: Activation, -) -> Union[nn.Identity, nn.ReLU, nn.Sigmoid, nn.Softplus, nn.Tanh]: +) -> Union[Type[nn.Identity], Type[nn.ReLU], Type[nn.Sigmoid], Type[nn.Softplus], Type[nn.Tanh]]: """Get the activation function. The ``activation`` can be chosen from: @@ -83,12 +82,47 @@ def build_mlp_network( output_activation (Activation): The output activation function. 
weight_initialization_mode (InitFunction): The initialization function. """ - activation = get_activation(activation) - output_activation = get_activation(output_activation) + activation_fn = get_activation(activation) + output_activation_fn = get_activation(output_activation) layers = [] for j in range(len(sizes) - 1): - act = activation if j < len(sizes) - 2 else output_activation + act_fn = activation_fn if j < len(sizes) - 2 else output_activation_fn affine_layer = nn.Linear(sizes[j], sizes[j + 1]) initialize_layer(weight_initialization_mode, affine_layer) - layers += [affine_layer, act()] + layers += [affine_layer, act_fn()] return nn.Sequential(*layers) + + +def set_optimizer( + opt: str, module: Union[nn.Module, List[nn.Parameter]], learning_rate: float +) -> torch.optim.Optimizer: + """Returns an initialized optimizer from PyTorch. + + .. note:: + + The optimizer can be chosen from the following list: + + - Adam + - AdamW + - Adadelta + - Adagrad + - Adamax + - ASGD + - LBFGS + - RMSprop + - Rprop + - SGD + + Args: + opt (str): optimizer name. + module (Union[nn.Module, List[nn.Parameter]]): module or parameters. + learning_rate (float): learning rate. + """ + assert hasattr(torch.optim, opt), f'Optimizer={opt} not found in torch.' + optimizer = getattr(torch.optim, opt) + + if isinstance(module, list): + return optimizer(module, lr=learning_rate) + if isinstance(module, nn.Module): + return optimizer(module.parameters(), lr=learning_rate) + raise TypeError(f'Invalid module type: {type(module)}') diff --git a/omnisafe/utils/online_mean_std.py b/omnisafe/utils/online_mean_std.py deleted file mode 100644 index 7b3bf7e1b..000000000 --- a/omnisafe/utils/online_mean_std.py +++ /dev/null @@ -1,116 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Implementation of the online mean and standard deviation.""" - -import numpy as np -import torch - -from omnisafe.utils import distributed_utils - - -class OnlineMeanStd(torch.nn.Module): - """ - Track mean and standard deviation of inputs with incremental formula. 
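``build_mlp_network`` and ``set_optimizer`` shown above are meant to be the single entry points for building networks and optimizers from config strings. A usage sketch, assuming this patch is applied so both helpers live in ``omnisafe.utils.model``; the layer sizes and hyperparameters below are made up for illustration::

    import torch
    from torch import nn

    from omnisafe.utils.model import build_mlp_network, set_optimizer

    # 8 -> 64 -> 64 -> 2 MLP with tanh hidden activations and orthogonal init
    policy_net = build_mlp_network(
        sizes=[8, 64, 64, 2],
        activation='tanh',
        output_activation='identity',
        weight_initialization_mode='orthogonal',
    )
    assert isinstance(policy_net, nn.Sequential)

    # resolved via getattr(torch.optim, 'Adam')
    optimizer = set_optimizer('Adam', policy_net, learning_rate=3e-4)
    assert isinstance(optimizer, torch.optim.Adam)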
- """ - - def __init__(self, epsilon=1e-5, shape=()): - super().__init__() - self.mean = torch.nn.Parameter(torch.zeros(*shape), requires_grad=False) - self.std = torch.nn.Parameter(torch.ones(*shape), requires_grad=False) - self.count = torch.nn.Parameter(torch.zeros(1), requires_grad=False) - self.eps = epsilon - self.bound = 10 - self.shape = shape - - @property - def var(self): - """Return variance.""" - return torch.square(self.std) - - @staticmethod - def _convert_to_torch(params, dtype=torch.float32) -> torch.Tensor: - if isinstance(params, np.ndarray): - params = torch.from_numpy(params).float() - if isinstance(params, float): - params = torch.tensor([params], dtype=dtype) # use [] to make tensor torch.Size([1]) - if isinstance(params, np.floating): - params = torch.tensor([params], dtype=dtype) # use [] to make tensor torch.Size([1]) - return params - - def forward(self, data, subtract_mean=True, clip=False): - """Make input average free and scale to standard deviation.""" - # sanity checks - if len(data.shape) >= 2: - assert ( - data.shape[-1] == self.mean.shape[-1] - ), f'got shape={data.shape} but expected: {self.mean.shape}' - - is_numpy = isinstance(data, np.ndarray) - data = self._convert_to_torch(data) - if subtract_mean: - data_new = (data - self.mean) / (self.std + self.eps) - else: - data_new = data / (self.std + self.eps) - if clip: - data_new = torch.clamp(data_new, -self.bound, self.bound) - data_new = data_new.numpy() if is_numpy else data_new - return data_new - - # pylint: disable-next=too-many-locals - def update(self, data) -> None: - """Update internals incrementally. - Note: works for both vector and matrix inputs. - MPI implementation according to Chan et al.[10]; see: - https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm - """ - data = self._convert_to_torch(data) - - # ==== Input checks - msg = f'Expected dim in [1, 2], but got dim={len(data.shape)}.' - assert len(data.shape) == 2 or len(data.shape) == 1, msg - if self.shape[0] > 1: # expect matrix inputs - msg = f'Expected obs_dim={self.shape[0]} but got: {data.shape[1]}' - assert len(data.shape) == 2 and data.shape[1] == self.shape[0], msg - if self.shape[0] == 1: - assert len(data.shape) == 1, f'Expected dim=1 but got: {data.shape}' - # reshape is necessary since mean operator reduces vector dim by one - data = data.view((-1, 1)) - - n_b = data.shape[0] * distributed_utils.num_procs() # get batch size - n_a = self.count.clone() - n_a_b = self.count + n_b - batch_mean = torch.mean(data, dim=0) - - # 1) Calculate mean and average batch mean across processes - distributed_utils.mpi_avg_torch_tensor(batch_mean) - delta = batch_mean - self.mean - mean_new = self.mean + delta * n_b / n_a_b - - # 2) Determine variance and sync across processes - diff = data - mean_new - batch_var = torch.mean(diff**2, dim=0) - distributed_utils.mpi_avg_torch_tensor(batch_var) - - # Update running terms - m2_a = n_a * self.var - m2_b = n_b * batch_var - ratio = n_a * n_b / n_a_b - m2_a_b = m2_a + m2_b + delta**2 * ratio - - # 3) Update parameters - access internal values with data attribute - self.mean.data = mean_new - self.count.data = n_a_b - new_var = m2_a_b / n_a_b - self.std.data = torch.sqrt(new_var) diff --git a/omnisafe/utils/schedule.py b/omnisafe/utils/schedule.py new file mode 100644 index 000000000..679527ee0 --- /dev/null +++ b/omnisafe/utils/schedule.py @@ -0,0 +1,93 @@ +# Copyright 2022 OmniSafe Team. All Rights Reserved. +# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""helper class to generate scheduling params""" + +from abc import ABC, abstractmethod +from typing import List, Optional, Tuple, Union + + +def _linear_interpolation(l, r, alpha): # pylint: disable=invalid-name + return l + alpha * (r - l) + + +class Schedule(ABC): + """Schedule for a value based on the step""" + + @abstractmethod + def value(self, time: Union[int, float]) -> Union[int, float]: + """Value at time t. + + Args: + t (float): Time. + + Returns: + float: Value at time t. + """ + + +# pylint: disable=too-few-public-methods +class PiecewiseSchedule(Schedule): + """Piece-wise schedule for a value based on the step""" + + def __init__( + self, + endpoints: List[Tuple[Union[int, float], Union[int, float]]], + outside_value=Optional[Union[int, float]], + ) -> None: + """From OpenAI baselines""" + idxes = [e[0] for e in endpoints] + assert idxes == sorted(idxes) + self._interpolation = _linear_interpolation + self._outside_value = outside_value + self._endpoints = endpoints + + def value(self, time: Union[int, float]) -> Union[int, float]: + """Value at time t. + + Args: + t (float): Time. + + Returns: + float: Value at time t. + """ + # pylint: disable=invalid-name + for (l_t, l), (r_t, r) in zip(self._endpoints[:-1], self._endpoints[1:]): + if l_t <= time < r_t: + alpha = float(time - l_t) / (r_t - l_t) + return self._interpolation(l, r, alpha) + + # t does not belong to any of the pieces, so doom. + assert self._outside_value is not None + return self._outside_value + + +class ConstantSchedule(Schedule): + """Constant schedule for a value""" + + def __init__(self, value): + """Value remains constant over time. + Parameters + ---------- + value: float + Constant value of the schedule + """ + self._v = value + + def value( + self, time: Union[int, float] + ) -> Union[int, float]: # pylint: disable=unused-argument + """See Schedule.value""" + return self._v diff --git a/omnisafe/utils/tools.py b/omnisafe/utils/tools.py index b7f831370..b49cd7ce5 100644 --- a/omnisafe/utils/tools.py +++ b/omnisafe/utils/tools.py @@ -14,11 +14,12 @@ # ============================================================================== """tool_function_packages""" +import os +import random + import numpy as np import torch -from omnisafe.typing import Any, Callable, Union - def get_flat_params_from(model: torch.nn.Module) -> torch.Tensor: """This function is used to get the flattened parameters from the model. @@ -61,49 +62,6 @@ def get_flat_gradients_from(model: torch.nn.Module) -> torch.Tensor: return torch.cat(grads) -def conjugate_gradients( - Avp: Callable[[torch.Tensor], torch.Tensor], - b_vector: torch.Tensor, - num_steps: int = 10, - residual_tol: float = 1e-10, - eps: float = 1e-6, -): # pylint: disable=invalid-name,too-many-locals - """Implementation of Conjugate gradient algorithm. 
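The new ``PiecewiseSchedule`` linearly interpolates between ``(time, value)`` endpoints and falls back to ``outside_value`` beyond the last one; ``ConstantSchedule`` simply returns a fixed value. A usage sketch for annealing a coefficient, assuming this patch is applied so the classes live in ``omnisafe.utils.schedule`` (the annealed quantity and horizon are made up)::

    from omnisafe.utils.schedule import ConstantSchedule, PiecewiseSchedule

    # anneal a coefficient from 0.2 to 0.05 over the first 1e6 steps, then hold it
    clip_schedule = PiecewiseSchedule(
        endpoints=[(0, 0.2), (1_000_000, 0.05)],
        outside_value=0.05,
    )
    assert abs(clip_schedule.value(500_000) - 0.125) < 1e-9
    assert clip_schedule.value(2_000_000) == 0.05

    lr_schedule = ConstantSchedule(3e-4)
    assert lr_schedule.value(10_000) == 3e-4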
- - Conjugate gradient algorithm is used to solve the linear system of equations :math:`Ax = b`. - The algorithm is described in detail in the paper `Conjugate Gradient Method`_. - - .. _Conjugate Gradient Method: https://en.wikipedia.org/wiki/Conjugate_gradient_method - - .. note:: - Increasing ``num_steps`` will lead to a more accurate approximation - to :math:`A^{-1} b`, and possibly slightly-improved performance, - but at the cost of slowing things down. - Also probably don't play with this hyperparameter. - - Args: - num_steps (int): Number of iterations of conjugate gradient to perform. - """ - - x = torch.zeros_like(b_vector) - r = b_vector - Avp(x) - p = r.clone() - rdotr = torch.dot(r, r) - - for _ in range(num_steps): - z = Avp(p) - alpha = rdotr / (torch.dot(p, z) + eps) - x += alpha * p - r -= alpha * z - new_rdotr = torch.dot(r, r) - if torch.sqrt(new_rdotr) < residual_tol: - break - mu = new_rdotr / (rdotr + eps) - p = r + mu * p - rdotr = new_rdotr - return x - - def set_param_values_to_model(model: torch.nn.Module, vals: torch.Tensor) -> None: """This function is used to set the parameters to the model. @@ -116,70 +74,19 @@ def set_param_values_to_model(model: torch.nn.Module, vals: torch.Tensor) -> Non vals (torch.Tensor): parameters to be set. """ assert isinstance(vals, torch.Tensor) - i = 0 + i: int = 0 for _, param in model.named_parameters(): if param.requires_grad: # param has grad and, hence, must be set orig_size = param.size() size = np.prod(list(param.size())) - new_values = vals[i : i + size] + new_values = vals[i : int(i + size)] # set new param values new_values = new_values.view(orig_size) param.data = new_values - i += size # increment array position + i += int(size) # increment array position assert i == len(vals), f'Lengths do not match: {i} vs. {len(vals)}' -# pylint: disable-next=too-many-branches,too-many-return-statements -def to_ndarray(item: Any, dtype: np.dtype = None) -> Union[np.ndarray, TypeError, None]: - """This function is used to convert the data type to ndarray. - - Change `torch.Tensor`, sequence of scalars to ndarray, and keep other data types unchanged. - - .. note: - Now supports item type: :obj:`torch.Tensor`, :obj:`dict`, :obj:`list`, :obj:`tuple` and :obj:`None` - - Args: - item (Any): item to be converted. - dtype (np.dtype): data type of the output ndarray. Default to None. - """ - - if isinstance(item, dict): - new_data = {} - for key, value in item.items(): - new_data[key] = to_ndarray(value, dtype) - return new_data - - if isinstance(item, (list, tuple)): - if len(item) == 0: - return None - if hasattr(item, '_fields'): # namedtuple - return type(item)(*[to_ndarray(t, dtype) for t in item]) - new_data = [] - for data in item: - new_data.append(to_ndarray(data, dtype)) - return new_data - - if isinstance(item, torch.Tensor): - if item.device != 'cpu': - item = item.detach().cpu() - if dtype is None: - return item.numpy() - return item.numpy().astype(dtype) - - if isinstance(item, np.ndarray): - if dtype is None: - return item - return item.astype(dtype) - - if np.isscalar(item): - return np.array(item) - - if item is None: - return None - - raise TypeError(f'not support item type: {item}') - - def expand_dims(*args): """This function is used to expand the dimensions of the input data. 
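``conjugate_gradients`` has moved from ``omnisafe.utils.tools`` (deleted above) into ``omnisafe.utils.math`` (added earlier in this patch). Its key property is that it only touches :math:`A` through the ``Avp`` matrix-vector closure, which is what lets TRPO/CPO-style updates pass a Fisher-vector product instead of a dense matrix. A small usage sketch on an explicit SPD system::

    import torch

    from omnisafe.utils.math import conjugate_gradients

    # small symmetric positive-definite system A x = b
    A = torch.tensor([[4.0, 1.0], [1.0, 3.0]])
    b = torch.tensor([1.0, 2.0])

    # the solver only sees A through the matrix-vector product closure
    x = conjugate_gradients(lambda v: A @ v, b, num_steps=10)

    assert torch.allclose(A @ x, b, atol=1e-4)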
@@ -195,7 +102,7 @@ def expand_dims(*args): return [np.expand_dims(item, axis=0) for item in args] -def as_tensor(*args, device: torch.device = 'cpu'): +def as_tensor(*args, device: torch.device = torch.device('cpu')): """This function is used to convert the input data to tensor. .. note:: @@ -208,3 +115,20 @@ def as_tensor(*args, device: torch.device = 'cpu'): if len(args) == 1: return torch.as_tensor(args[0], dtype=torch.float32) return [torch.as_tensor(item, dtype=torch.float32, device=device) for item in args] + + +def seed_all(seed: int): + """This function is used to set the random seed for all the packages.""" + + os.environ['PYTHONHASHSEED'] = str(seed) + + random.seed(seed) + np.random.seed(seed) + + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + try: + torch.use_deterministic_algorithms(True) + except AttributeError: + pass diff --git a/omnisafe/utils/vtrace.py b/omnisafe/utils/vtrace.py deleted file mode 100644 index 3d479711e..000000000 --- a/omnisafe/utils/vtrace.py +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""vtrace""" - -from typing import Tuple - -import torch - - -# pylint: disable-next=too-many-arguments,too-many-locals -def calculate_v_trace( - policy_action_probs: torch.Tensor, - values: torch.Tensor, # including bootstrap - rewards: torch.Tensor, # including bootstrap - behavior_action_probs: torch.Tensor, - gamma: float = 0.99, - rho_bar: float = 1.0, - c_bar: float = 1.0, -) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor,]: - r"""This function is used to calculate V-trace targets. - - .. math:: - A_t = \sum_{k=0}^{n-1} (\lambda \gamma)^k \delta_{t+k} + - (\lambda \gamma)^n * \rho_{t+n} * (1 - d_{t+n}) * (V(x_{t+n}) - b_{t+n}) - - Calculate V-trace targets for off-policy actor-critic learning recursively. - For more details, - please refer to the paper: `Espeholt et al. 2018, IMPALA `_. 
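The new ``seed_all`` seeds Python's ``random``, NumPy, and torch (CPU and all CUDA devices) and opportunistically enables ``torch.use_deterministic_algorithms``. A usage sketch showing that re-seeding reproduces the same draws; note that full GPU determinism may additionally require environment settings such as ``CUBLAS_WORKSPACE_CONFIG``, which this helper does not set::

    import numpy as np
    import torch

    from omnisafe.utils.tools import seed_all

    seed_all(42)
    first = (np.random.rand(3), torch.rand(3))

    seed_all(42)
    second = (np.random.rand(3), torch.rand(3))

    assert np.allclose(first[0], second[0])
    assert torch.equal(first[1], second[1])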
- - Args: - policy_action_probs (torch.Tensor): action probabilities of policy network, shape=(sequence_length,) - values (torch.Tensor): state values, shape=(sequence_length+1,) - rewards (torch.Tensor): rewards, shape=(sequence_length+1,) - behavior_action_probs (torch.Tensor): action probabilities of behavior network, shape=(sequence_length,) - gamma (float): discount factor - rho_bar (float): clip rho - c_bar (float): clip c - - Returns: - tuple: V-trace targets, shape=(batch_size, sequence_length) - """ - assert values.ndim == 1, 'Please provide 1d-arrays' - assert rewards.ndim == 1 - assert policy_action_probs.ndim == 1 - assert behavior_action_probs.ndim == 1 - assert c_bar <= rho_bar - - sequence_length = policy_action_probs.shape[0] - # pylint: disable-next=assignment-from-no-return - rhos = torch.div(policy_action_probs, behavior_action_probs) - clip_rhos = torch.min( - rhos, torch.as_tensor(rho_bar) - ) # pylint: disable=assignment-from-no-return - clip_cs = torch.min(rhos, torch.as_tensor(c_bar)) # pylint: disable=assignment-from-no-return - v_s = values[:-1].clone() # copy all values except bootstrap value - last_v_s = values[-1] # bootstrap from last state - - # calculate v_s - for index in reversed(range(sequence_length)): - delta = clip_rhos[index] * (rewards[index] + gamma * values[index + 1] - values[index]) - v_s[index] += delta + gamma * clip_cs[index] * (last_v_s - values[index + 1]) - last_v_s = v_s[index] # accumulate current v_s for next iteration - - # calculate q_targets - v_s_plus_1 = torch.cat((v_s[1:], values[-1:])) - policy_advantage = clip_rhos * (rewards[:-1] + gamma * v_s_plus_1 - values[:-1]) - - return v_s, policy_advantage, clip_rhos diff --git a/omnisafe/wrappers/__init__.py b/omnisafe/wrappers/__init__.py deleted file mode 100644 index 9615479e9..000000000 --- a/omnisafe/wrappers/__init__.py +++ /dev/null @@ -1,44 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Environment wrappers.""" - -import itertools -from types import MappingProxyType - -from omnisafe.wrappers.cmdp_wrapper import CMDPWrapper -from omnisafe.wrappers.early_terminated_wrapper import EarlyTerminatedWrapper -from omnisafe.wrappers.saute_wrapper import SauteWrapper -from omnisafe.wrappers.simmer_wrapper import PidController, QController, SimmerWrapper - - -ENVWRAPPERS = { - 'cmdp-wrapper': CMDPWrapper, - 'saute-wrapper': SauteWrapper, - 'simmer-wrapper': SimmerWrapper, - 'early-terminated-wrapper': EarlyTerminatedWrapper, -} - -ENVWRAPPERS2TYPE = { - env_wrapper: env_wrapper_type for env_wrapper_type, env_wrapper in ENVWRAPPERS.items() -} - -__all__ = ENVWRAPPERS['all'] = tuple(itertools.chain(ENVWRAPPERS.values())) - -assert len(ENVWRAPPERS2TYPE) == len(__all__), 'Duplicate environment wrappers found.' 
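For reference, the backward loop of the deleted ``calculate_v_trace`` (in ``omnisafe/utils/vtrace.py``, removed above) implements the standard V-trace recursion of Espeholt et al. (2018): with clipped importance ratios :math:`\rho_t = \min(\bar{\rho}, \pi(a_t \mid x_t) / \mu(a_t \mid x_t))` and :math:`c_t = \min(\bar{c}, \pi(a_t \mid x_t) / \mu(a_t \mid x_t))`,

.. math::
    \delta_t = \rho_t \bigl(r_t + \gamma V(x_{t+1}) - V(x_t)\bigr), \qquad
    v_t = V(x_t) + \delta_t + \gamma c_t \bigl(v_{t+1} - V(x_{t+1})\bigr),

and the returned policy advantage is :math:`\rho_t \bigl(r_t + \gamma v_{t+1} - V(x_t)\bigr)`.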
- -ENVWRAPPERS = MappingProxyType(ENVWRAPPERS) -ENVWRAPPERS2TYPE = MappingProxyType(ENVWRAPPERS2TYPE) - -del itertools, MappingProxyType diff --git a/omnisafe/wrappers/early_terminated_wrapper.py b/omnisafe/wrappers/early_terminated_wrapper.py deleted file mode 100644 index 5b8a18347..000000000 --- a/omnisafe/wrappers/early_terminated_wrapper.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Early terminated wrapper.""" - -from typing import Dict, Tuple, TypeVar - -import numpy as np - -from omnisafe.utils.tools import as_tensor, expand_dims -from omnisafe.wrappers.cmdp_wrapper import CMDPWrapper -from omnisafe.wrappers.wrapper_registry import WRAPPER_REGISTRY - - -RenderFrame = TypeVar('RenderFrame') - - -@WRAPPER_REGISTRY.register -# pylint: disable-next=too-many-instance-attributes -class EarlyTerminatedWrapper(CMDPWrapper): - """Implementation of the environment wrapper for early-terminated algorithms. - - ``omnisafe`` use different environment wrappers for different kinds of algorithms. - This is the environment wrapper for early-terminated algorithms. - - .. note:: - The only difference between this wrapper and :class:`OnPolicyEnvWrapper` is that, - this wrapper terminates the episode when the cost is unequal to 0. - Any on-policy algorithm can use this wrapper, - to convert itself into an early-terminated algorithm. - ``omnisafe`` provides a implementation of :class:`PPOEarlyTerminated`, - and :class:`PPOLagarlyTerminated`. - """ - - def step(self, action: np.ndarray) -> Tuple[np.ndarray, np.ndarray, bool, bool, Dict]: - """Step the environment. - - The environment will be stepped by the action from the agent. - Corresponding to the Markov Decision Process, - the environment will return the ``next observation``, - ``reward``, ``cost``, ``terminated``, ``truncated`` and ``info``. - - Args: - action (np.ndarray): action. - """ - next_obs, reward, cost, terminated, truncated, info = self.env.step( - action.cpu().numpy().squeeze() - ) - if self.cfgs.num_envs == 1: - next_obs, reward, cost, terminated, truncated, info = expand_dims( - next_obs, reward, cost, terminated, truncated, info - ) - if terminated | truncated: - next_obs, info = self.reset() - for idx, single_cost in enumerate(cost): - if single_cost: - terminated[idx] = True - self.rollout_data.rollout_log.ep_ret += reward - self.rollout_data.rollout_log.ep_costs += cost - self.rollout_data.rollout_log.ep_len += np.ones(self.cfgs.num_envs) - return ( - as_tensor(next_obs, reward, cost, device=self.cfgs.device), - terminated, - truncated, - info, - ) diff --git a/omnisafe/wrappers/model_based_wrapper.py b/omnisafe/wrappers/model_based_wrapper.py deleted file mode 100644 index c5f8810db..000000000 --- a/omnisafe/wrappers/model_based_wrapper.py +++ /dev/null @@ -1,455 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Environment wrapper for model-based algorithms.""" - -import gymnasium -import numpy as np -import safety_gymnasium -import torch - -from omnisafe.wrappers.wrapper_registry import WRAPPER_REGISTRY - - -# ---------------------------------------------------------------------------------------------------------- -ROBOTS = ['Point', 'Car', 'Doggo'] -TASKS = ['Goal', 'Button'] - -XYZ_SENSORS = { - 'Point': ['velocimeter'], - 'Car': ['velocimeter'], - 'Doggo': ['velocimeter', 'accelerometer'], -} - -ANGLE_SENSORS = { - 'Point': ['gyro', 'magnetometer'], - 'Car': ['magnetometer', 'gyro'], - 'Doggo': ['magnetometer', 'gyro'], -} - -CONSTRAINTS_SAFELOOP = { - 'Goal': ['vases', 'hazards'], - 'Button': ['hazards', 'gremlins', 'buttons'], -} -CONSTRAINTS_MBPPO = { - 'Goal': ['hazards'], - 'Button': ['hazards', 'gremlins', 'buttons'], -} - - -@WRAPPER_REGISTRY.register -class ModelBasedEnvWrapper: # pylint: disable=too-many-instance-attributes - """Model-based Environment""" - - def __init__(self, algo, env_id, render_mode=None): - self.algo = algo - self.env_id = env_id # safety gym not use this attribute - self.render_mode = render_mode - self.timestep = 0 - self.num_steps = 1000 - self.goal_distance = 0 - self.modelbased_safetygym = [ - 'SafetyPointGoal3-v0', - 'SafetyCarGoal3-v0', - 'SafetyPointGoal1-v0', - 'SafetyCarGoal1-v0', - ] - self.modelbased_mujoco_velocity = [ - 'Ant-v4', - 'Swimmer-v4', - 'HalfCheetah-v4', - 'Hopper-v4', - 'Humanoid-v4', - 'Walker2d-v4', - 'Ant-v3', - 'Swimmer-v3', - 'HalfCheetah-v3', - 'Hopper-v3', - 'Humanoid-v3', - 'Walker2d-v3', - ] - assert ( - env_id in self.modelbased_safetygym + self.modelbased_mujoco_velocity - ), f'not support {env_id}' - if env_id in self.modelbased_safetygym: - self.robot = 'Point' if 'Point' in env_id else 'Car' - self.task = 'Goal' - self.env_type = 'gym' - self.hazards_size = 0.2 - self.robot = self.robot.capitalize() # mujoco not use this attribute - self.task = self.task.capitalize() # mujoco not use this attribute - assert self.robot in ROBOTS, f'can not recognize the robot type {self.robot}' - assert self.task in TASKS, f'can not recognize the task type {self.task}' - self.env = safety_gymnasium.make(env_id, render_mode=render_mode) - self.init_sensor() - self.observation_space = gymnasium.spaces.Box( - -np.inf, np.inf, (self.ac_state_size,), dtype=np.float32 - ) - self.action_space = gymnasium.spaces.Box( - -1, 1, (self.env.action_space.shape[0],), dtype=np.float32 - ) - self.goal_position = self.env.task.goal_pos[0][:2] - self.robot_position = self.env.task.robot_pos - self.hazards_position = self.env.task.hazards_pos - elif env_id in self.modelbased_mujoco_velocity: - self.env_type = 'mujoco-velocity' - self.env = gymnasium.make(env_id) - self.observation_space = self.env.observation_space - self.action_space = self.env.action_space - self.dynamics_state_size = self.observation_space.shape[0] - 
self.ac_state_size = self.observation_space.shape[0] - - def set_eplen(self, eplen): - """Set episode length""" - self.num_steps = eplen - - def get_observation_cost(self, obs): - """Get batch cost from batch observation""" - if torch.is_tensor(obs): - obs = obs.cpu().detach().numpy() - batch_size = obs.shape[0] - hazards_key = self.key_to_slice['hazards'] - hazard_obs = obs[:, hazards_key].reshape(batch_size, -1, 2) - hazards_dist = np.sqrt(np.sum(np.square(hazard_obs), axis=2)).reshape(batch_size, -1) - cost = ((hazards_dist < self.hazards_size) * (self.hazards_size - hazards_dist)).sum(1) * 10 - - return cost - - def init_sensor(self): - """Initialize sensor observation""" - self.xyz_sensors = XYZ_SENSORS[self.robot] - self.angle_sensors = ANGLE_SENSORS[self.robot] - self.constraints_safeloop = CONSTRAINTS_SAFELOOP[self.task] - self.constraints_mbppo = CONSTRAINTS_MBPPO[self.task] - self.base_state_name = self.xyz_sensors + self.angle_sensors + ['goal'] - self.env.reset() - obs = self.get_obs() - self.obs_flat_size = sum(np.prod(i.shape) for i in list(obs.values())) - if self.algo == 'MBPPOLag': - self.flatten_order = ( - self.base_state_name + self.constraints_mbppo + ['robot_m'] + ['robot'] - ) - elif self.algo in ['SafeLOOP', 'CAP']: - self.flatten_order = self.base_state_name + self.constraints_safeloop - - self.key_to_slice = {} - offset = 0 - for k in self.flatten_order: - k_size = np.prod(obs[k].shape) - self.key_to_slice[k] = slice(offset, offset + k_size) - - offset += k_size - self.base_state_dim = sum(np.prod(obs[k].shape) for k in list(self.base_state_name)) - self.action_dim = self.env.action_space.shape[0] - self.key_to_slice['base_state'] = slice(0, self.base_state_dim) - - self.reset() - obs_flat = self.get_obs_flatten() - if self.algo == 'MBPPOLag': - self.dynamics_state_size = obs_flat.shape[0] # 42 - self.ac_state_size = np.array(self.generate_lidar(obs_flat)).shape[0] # 26 - - elif self.algo in ['SafeLOOP', 'CAP']: - self.dynamics_state_size = obs_flat.shape[0] # 42 - self.ac_state_size = obs_flat.shape[0] # 42 - - def reset(self): - """Reset Environment""" - self.timestep = 0 # Reset internal timer - - if self.env_type == 'mujoco-velocity': - obs, _ = self.env.reset() - return obs - - self.env.reset() - obs = self.get_obs_flatten() - if self.algo == 'MBPPOLag': - self.goal_position = self.env.task.goal_pos[0][:2] - self.robot_position = self.env.task.robot_pos - self.hazards_position = self.env.task.hazards_pos - self.goal_distance = self.dist_xy(self.robot_position, self.goal_position) - - return obs - - def step(self, action, num_repeat): # pylint: disable=too-many-locals - """Simulate Environment""" - reward = 0 - cost = 0 - step_num = 0 - if self.env_type == 'gym': - for _ in range(num_repeat): - control = action - _, reward_k, cost_k, terminated, truncated, info = self.env.step(control) - terminated = False # not used now - step_num += 1 - reward += reward_k - cost += cost_k - self.timestep += 1 # Increment internal timer - if self.timestep >= self.num_steps: - truncated = True - observation = self.get_obs_flatten() - goal_met = 'goal_met' in info.keys() # reach the goal - if terminated or truncated or goal_met: - # the action is not related to next state, so break - break - if self.algo in ['MBPPOLag', 'SafeLOOP', 'CAP']: - info = {'cost': cost, 'goal_met': goal_met, 'step_num': step_num} - elif self.env_type == 'mujoco-velocity': - for _ in range(num_repeat): - control = action - state_k, reward_k, terminated, truncated, info = self.env.step(control) - 
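``get_observation_cost`` above turns the egocentric hazard coordinates in a batch of observations into a scalar cost: each hazard closer than ``hazards_size`` contributes its penetration depth, and the sum is scaled by 10. A hedged numpy sketch with a toy batch (the hazard layout is made up; the real slice comes from ``key_to_slice['hazards']``)::

    import numpy as np

    hazards_size = 0.2

    # toy batch of 2 observations, each with 3 hazards as egocentric (x, y) pairs
    hazard_obs = np.array([
        [[0.10, 0.00], [0.50, 0.50], [0.05, 0.05]],   # two hazards inside the radius
        [[1.00, 1.00], [0.30, 0.40], [0.25, 0.00]],   # none inside the radius
    ])

    hazards_dist = np.sqrt(np.square(hazard_obs).sum(axis=2))            # (batch, n_hazards)
    cost = ((hazards_dist < hazards_size) * (hazards_size - hazards_dist)).sum(axis=1) * 10

    assert cost[0] > 0.0 and cost[1] == 0.0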
step_num += 1 - reward += reward_k - if 'y_velocity' not in info: - cost_k = np.abs(info['x_velocity']) - else: - cost_k = np.sqrt(info['x_velocity'] ** 2 + info['y_velocity'] ** 2) - cost += cost_k - self.timestep += 1 # Increment internal timer - if self.timestep >= self.num_steps: - truncated = True - if terminated or truncated: - # the action is not related to next state, so break - break - info = {'cost': cost, 'goal_met': False, 'step_num': step_num} - observation = state_k - return observation, reward, cost, terminated, truncated, info - - def render(self): - """render environment""" - return self.env.render() - - def close(self): - """close environment""" - self.env.close() - - def recenter(self, pos): - '''Return the egocentric XY vector to a position from the robot''' - return self.env.task.ego_xy(pos) - - def get_obs(self): - ''' - We will ignore the z-axis coordinates in every poses. - The returned obs coordinates are all in the robot coordinates. - ''' - obs = {} - robot_pos = self.env.task.robot_pos - goal_pos = self.env.task.goal_pos[0] - vases_pos_list = self.env.task.vases_pos # list of shape (3,) ndarray - hazards_pos_list = self.env.task.hazards_pos # list of shape (3,) ndarray - ego_goal_pos = self.recenter(np.array(goal_pos[:2])) - ego_vases_pos_list = [ - self.env.task.ego_xy(pos[:2]) for pos in vases_pos_list - ] # list of shape (2,) ndarray - ego_hazards_pos_list = [ - self.env.task.ego_xy(pos[:2]) for pos in hazards_pos_list - ] # list of shape (2,) ndarray - - # append obs to the dict - for sensor in self.xyz_sensors: # Explicitly listed sensors - if sensor == 'accelerometer': - obs[sensor] = self.env.task.world.get_sensor(sensor)[:1] # only x axis matters - elif sensor == 'ballquat_rear': - obs[sensor] = self.env.task.world.get_sensor(sensor) - else: - obs[sensor] = self.env.task.world.get_sensor(sensor)[:2] # only x,y axis matters - - for sensor in self.angle_sensors: - if sensor == 'gyro': - obs[sensor] = self.env.task.world.get_sensor(sensor)[ - 2: - ] # [2:] # only z axis matters - # pass # gyro does not help - else: - obs[sensor] = self.env.task.world.get_sensor(sensor) - if self.algo == 'MBPPOLag': - # --------modification----------------- - obs['robot'] = np.array(robot_pos[:2]) - obs['hazards'] = np.array(ego_hazards_pos_list) # (hazard_num, 2) - robot_matrix = self.env.task.world.robot_mat() - obs['robot_m'] = np.array(robot_matrix[0][:2]) - obs['goal'] = ego_goal_pos # (2,) - elif self.algo in ['CAP', 'SafeLOOP']: - obs['vases'] = np.array(ego_vases_pos_list) # (vase_num, 2) - obs['hazards'] = np.array(ego_hazards_pos_list) # (hazard_num, 2) - obs['goal'] = ego_goal_pos # (2,) - return obs - - def get_obs_flatten(self): - '''get the flattened obs.''' - obs = self.get_obs() - flat_obs = np.zeros(self.obs_flat_size) - for k in self.flatten_order: - idx = self.key_to_slice[k] - flat_obs[idx] = obs[k].flat - return flat_obs - - def get_dist_reward(self): - ''' - @return reward: negative distance from robot to the goal - ''' - return -self.env.task.dist_goal() - - @property - def action_range(self): - """Get action range""" - return float(self.env.action_space.low[0]), float(self.env.action_space.high[0]) - - def sample_random_action(self): - '''Sample an action randomly from a uniform distribution over all valid actions.''' - return self.env.action_space.sample() - - def dist_xy(self, pos1, pos2): - '''Return the distance from the robot to an XY position.''' - pos1 = np.asarray(pos1) - pos2 = np.asarray(pos2) - if pos1.shape == (3,): - pos1 = pos1[:2] - if 
pos2.shape == (3,): - pos2 = pos2[:2] - return np.sqrt(np.sum(np.square(pos1 - pos2))) - - def get_reward_cost(self, state): - '''Assuming we have reward & cost function. available with us in closed form.''' - last_dist_goal = self.goal_distance - robot_pos = state[self.key_to_slice['robot']] - # ----cost---- - cost = 0 - hazards_cost = 1.0 - for h_pos in self.hazards_position: - h_dist = self.dist_xy(h_pos, robot_pos) - if h_dist <= self.hazards_size: - cost += hazards_cost * (self.hazards_size - h_dist) - if cost > 0: - cost = 1 - else: - cost = 0 - # ----reward---- - - reward = 0 - reward_distance = 1.0 - reward_goal = 1.0 - goal_size = 0.3 - - dist_goal = self.dist_xy(robot_pos, self.goal_position) - reward += (last_dist_goal - dist_goal) * reward_distance - last_dist_goal = dist_goal - goal_flag = False - if dist_goal < goal_size: - reward += reward_goal - goal_flag = True - # clip reward - if reward < -10: - reward = -10 - elif reward > 10: - reward = 10 - self.goal_distance = last_dist_goal - return reward, cost, goal_flag - - def get_goal_flag(self, robot_pos, goal_pos): - """Get goal flat""" - dist_goal = self.dist_xy(robot_pos, goal_pos) - goal_size = 0.3 - return dist_goal < goal_size - - def ego_xy(self, robot_matrix, robot_pos, pos): - '''Return the egocentric XY vector to a position from the robot''' - assert pos.shape == (2,), f'Bad pos {pos}' - robot_3vec = robot_pos - robot_mat = robot_matrix - - pos_3vec = np.concatenate([pos, [0]]) # Add a zero z-coordinate - robot_3vec = np.concatenate([robot_3vec, [0]]) - world_3vec = pos_3vec - robot_3vec - return np.matmul(world_3vec, robot_mat)[:2] - - def obs_lidar_pseudo( - self, robot_matrix, robot_pos, positions - ): # pylint: disable=too-many-locals - ''' - Return a robot-centric lidar observation of a list of positions. - - Lidar is a set of bins around the robot (divided evenly in a circle). - The detection directions are exclusive and exhaustive for a full 360 view. - Each bin reads 0 if there are no objects in that direction. - If there are multiple objects, the distance to the closest one is used. - Otherwise the bin reads the fraction of the distance towards the robot. - - E.g. if the object is 90% of lidar_max_dist away, the bin will read 0.1, - and if the object is 10% of lidar_max_dist away, the bin will read 0.9. 
- (The reading can be thought of as "closeness" or inverse distance) - - This encoding has some desirable properties: - - bins read 0 when empty - - bins smoothly increase as objects get close - - maximum reading is 1.0 (where the object overlaps the robot) - - close objects occlude far objects - - constant size observation with variable numbers of objects - ''' - lidar_num_bins = 16 - lidar_max_dist = 3 - obs = np.zeros(lidar_num_bins) - lidar_exp_gain = 1.0 - lidar_alias = True - for pos in positions: - pos = np.asarray(pos) - if pos.shape == (3,): - pos = pos[:2] # Truncate Z coordinate - position_z = np.complex( - *self.ego_xy(robot_matrix, robot_pos, pos) - ) # X, Y as real, imaginary components - dist = np.abs(position_z) - angle = np.angle(position_z) % (np.pi * 2) - bin_size = (np.pi * 2) / lidar_num_bins - sensor_bin = int(angle / bin_size) - bin_angle = bin_size * sensor_bin - if lidar_max_dist is None: - sensor = np.exp(-lidar_exp_gain * dist) - else: - sensor = max(0, lidar_max_dist - dist) / lidar_max_dist - obs[sensor_bin] = max(obs[sensor_bin], sensor) - # Aliasing - if lidar_alias: - alias = (angle - bin_angle) / bin_size - assert ( - 0 <= alias <= 1 - ), f'bad alias {alias}, dist {dist}, angle {angle}, bin {sensor_bin}' - bin_plus = (sensor_bin + 1) % lidar_num_bins - bin_minus = (sensor_bin - 1) % lidar_num_bins - obs[bin_plus] = max(obs[bin_plus], alias * sensor) - obs[bin_minus] = max(obs[bin_minus], (1 - alias) * sensor) - return obs - - def make_observation(self, state, lidar): - """Get observation""" - state = list(state) - lidar = list(lidar) - base_state = state[self.key_to_slice['base_state']] - obs = base_state + lidar + state[self.key_to_slice['robot']] - - return obs - - def generate_lidar(self, obs): - """Get lidar observation""" - robot_matrix_x_y = obs[self.key_to_slice['robot_m']] - robot_matrix_x = robot_matrix_x_y[0] - robot_matrix_y = robot_matrix_x_y[1] - first_row = [robot_matrix_x, robot_matrix_y, 0] - second_row = [-robot_matrix_y, robot_matrix_x, 0] - third_row = [0, 0, 1] - robot_matrix = [first_row, second_row, third_row] - robot_pos = obs[self.key_to_slice['robot']] - lidar_vec = self.obs_lidar_pseudo(robot_matrix, robot_pos, self.hazards_position) - obs_vec = self.make_observation(obs, lidar_vec) - return obs_vec diff --git a/omnisafe/wrappers/saute_wrapper.py b/omnisafe/wrappers/saute_wrapper.py deleted file mode 100644 index faffdcf38..000000000 --- a/omnisafe/wrappers/saute_wrapper.py +++ /dev/null @@ -1,282 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
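The pseudo-lidar encoding described above maps each object to an angular bin and stores a "closeness" reading ``max(0, lidar_max_dist - dist) / lidar_max_dist``, keeping the closest object per bin. A minimal numpy sketch of just that part (the aliasing into neighbouring bins done by ``obs_lidar_pseudo`` is omitted; ``pseudo_lidar`` is an illustrative name)::

    import numpy as np

    lidar_num_bins = 16
    lidar_max_dist = 3.0


    def pseudo_lidar(ego_positions: np.ndarray) -> np.ndarray:
        """One closeness reading per angular bin; no aliasing."""
        obs = np.zeros(lidar_num_bins)
        bin_size = 2 * np.pi / lidar_num_bins
        for x, y in ego_positions:
            dist = np.hypot(x, y)
            angle = np.arctan2(y, x) % (2 * np.pi)
            closeness = max(0.0, lidar_max_dist - dist) / lidar_max_dist
            idx = int(angle / bin_size)
            obs[idx] = max(obs[idx], closeness)        # closest object wins per bin
        return obs


    readings = pseudo_lidar(np.array([[0.3, 0.0], [2.0, 2.0], [-1.0, 0.1]]))
    assert readings.max() <= 1.0
    assert np.isclose(readings[0], (3.0 - 0.3) / 3.0)  # object straight ahead, 0.3 away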
-# ============================================================================== -"""Environment wrapper for saute algorithms.""" - -from dataclasses import dataclass - -import numpy as np -import torch -from gymnasium import spaces - -from omnisafe.common.normalizer import Normalizer -from omnisafe.common.record_queue import RecordQueue -from omnisafe.typing import NamedTuple, Optional -from omnisafe.utils.tools import as_tensor, expand_dims -from omnisafe.wrappers.cmdp_wrapper import CMDPWrapper -from omnisafe.wrappers.wrapper_registry import WRAPPER_REGISTRY - - -@dataclass -class RolloutLog: - """Log for roll out.""" - - ep_ret: np.ndarray - ep_costs: np.ndarray - ep_len: np.ndarray - ep_budget: np.ndarray - - -@dataclass -class SauteData: - """Data for Saute RL.""" - - safety_budget: float - unsafe_reward: float - safety_obs: np.ndarray - - -@dataclass -class RolloutData: - """Data for roll out.""" - - local_steps_per_epoch: int - max_ep_len: int - use_cost: bool - current_obs: torch.Tensor - rollout_log: RolloutLog - saute_data: SauteData - - -@WRAPPER_REGISTRY.register -class SauteWrapper(CMDPWrapper): - r"""SauteEnvWrapper. - - Saute is a safe RL algorithm that uses state augmentation to ensure safety. - The state augmentation is the concatenation of the original state and the safety state. - The safety state is the safety budget minus the cost divided by the safety budget. - - .. note:: - - If the safety state is greater than 0, the reward is the original reward. - - If the safety state is less than 0, the reward is the unsafe reward (always 0 or less than 0). - - ``omnisafe`` provides two implementations of Saute RL: :class:`PPOSaute` and :class:`PPOLagSaute`. - - References: - - - Title: Saute RL: Almost Surely Safe Reinforcement Learning Using State Augmentation - - Authors: Aivar Sootla, Alexander I. Cowen-Rivers, Taher Jafferjee, Ziyan Wang, - David Mguni, Jun Wang, Haitham Bou-Ammar. - - URL: https://arxiv.org/abs/2202.06558 - """ - - def __init__(self, env_id, cfgs: Optional[NamedTuple] = None, **env_kwargs) -> None: - """Initialize environment wrapper. - - Args: - env_id (str): environment id. - cfgs (collections.namedtuple): configs. - env_kwargs (dict): The additional parameters of environments. 
- """ - super().__init__(env_id, cfgs, **env_kwargs) - if hasattr(self.env, '_max_episode_steps'): - max_ep_len = self.env._max_episode_steps - else: - max_ep_len = 1000 - if cfgs.scale_safety_budget: - safety_budget = ( - cfgs.safety_budget - * (1 - self.cfgs.saute_gamma**max_ep_len) - / (1 - self.cfgs.saute_gamma) - / np.float32(max_ep_len) - * np.ones((self.cfgs.num_envs, 1)) - ) - else: - safety_budget = cfgs.safety_budget * np.ones((self.cfgs.num_envs, 1)) - safety_obs = np.ones((self.cfgs.num_envs, 1), dtype=np.float32) - self.rollout_data = RolloutData( - 0.0, - max_ep_len, - False, - None, - RolloutLog( - np.zeros(self.cfgs.num_envs), - np.zeros(self.cfgs.num_envs), - np.zeros(self.cfgs.num_envs), - np.zeros((self.cfgs.num_envs, 1)), - ), - SauteData( - safety_budget=safety_budget, - unsafe_reward=cfgs.unsafe_reward, - safety_obs=safety_obs, - ), - ) - high = np.array(np.hstack([self.observation_space.high, np.inf]), dtype=np.float32) - low = np.array(np.hstack([self.observation_space.low, np.inf]), dtype=np.float32) - self.observation_space = spaces.Box(high=high, low=low) - self.obs_normalizer = ( - Normalizer(shape=(self.cfgs.num_envs, self.observation_space.shape[0]), clip=5).to( - self.cfgs.device - ) - if self.cfgs.normalized_obs - else None - ) - self.record_queue = RecordQueue( - 'ep_ret', 'ep_cost', 'ep_len', 'ep_budget', maxlen=self.cfgs.max_len - ) - self.rollout_data.current_obs = self.reset()[0] - - def augment_obs(self, obs: np.ndarray) -> np.ndarray: - """Augmenting the obs with the safety obs. - - Detailedly, the augmented obs is the concatenation of the original obs and the safety obs. - The safety obs is the safety budget minus the cost divided by the safety budget. - - Args: - obs (np.ndarray): observation. - safety_obs (np.ndarray): safety observation. - """ - augmented_obs = np.hstack([obs, self.rollout_data.saute_data.safety_obs]) - return augmented_obs - - def safety_step(self, cost: np.ndarray, done: bool) -> np.ndarray: - """Update the normalized safety obs. - - Args: - cost (np.ndarray): cost. - """ - if done: - self.rollout_data.saute_data.safety_obs = np.ones( - (self.cfgs.num_envs, 1), dtype=np.float32 - ) - else: - self.rollout_data.saute_data.safety_obs -= ( - cost / self.rollout_data.saute_data.safety_budget - ) - self.rollout_data.saute_data.safety_obs /= self.cfgs.saute_gamma - - def safety_reward(self, reward: np.ndarray) -> np.ndarray: - """Update the reward. - - Args: - reward (np.ndarray): reward. - next_safety_obs (np.ndarray): next safety observation. - """ - for idx, safety_obs in enumerate(self.rollout_data.saute_data.safety_obs): - if safety_obs <= 0: - reward[idx] = self.rollout_data.saute_data.unsafe_reward - return reward - - def reset(self) -> tuple((torch.Tensor, dict)): - """Reset environment. - - .. note:: - The safety obs is initialized to 1.0. - - Args: - seed (int): seed for environment reset. - """ - obs, info = self.env.reset() - if self.cfgs.num_envs == 1: - obs = expand_dims(obs) - info = [info] - self.rollout_data.saute_data.safety_obs = np.ones((self.cfgs.num_envs, 1), dtype=np.float32) - obs = self.augment_obs(obs) - return torch.as_tensor(obs, dtype=torch.float32, device=self.cfgs.device), info - - def step( - self, action: torch.Tensor - ) -> tuple((torch.Tensor, torch.Tensor, torch.Tensor, bool, dict)): - """Step environment. - - .. note:: - The safety obs is updated by the cost. - The reward is updated by the safety obs. 
- Detailedly, the reward is the original reward if the safety obs is greater than 0, - otherwise the reward is the unsafe reward. - - Args: - action (torch.Tensor): action. - """ - next_obs, reward, cost, terminated, truncated, info = self.env.step( - action.cpu().numpy().squeeze() - ) - if self.cfgs.num_envs == 1: - next_obs, reward, cost, terminated, truncated, info = expand_dims( - next_obs, reward, cost, terminated, truncated, info - ) - self.safety_step(cost, done=terminated | truncated) - if terminated | truncated: - augmented_obs, info = self.reset() - else: - augmented_obs = self.augment_obs(next_obs) - else: - augmented_obs = self.augment_obs(next_obs) - self.rollout_data.rollout_log.ep_ret += reward - self.rollout_data.rollout_log.ep_costs += cost - self.rollout_data.rollout_log.ep_len += np.ones(self.cfgs.num_envs) - self.rollout_data.rollout_log.ep_budget += self.rollout_data.saute_data.safety_obs - reward = self.safety_reward(reward) - return ( - as_tensor(augmented_obs, reward, cost, device=self.cfgs.device), - terminated, - truncated, - info, - ) - - def reset_log( - self, - idx, - ) -> None: - ( - self.rollout_data.rollout_log.ep_ret[idx], - self.rollout_data.rollout_log.ep_costs[idx], - self.rollout_data.rollout_log.ep_len[idx], - self.rollout_data.rollout_log.ep_budget[idx], - ) = (0.0, 0.0, 0.0, 0.0) - - def rollout_log( - self, - logger, - idx, - is_train: bool = True, - ) -> None: - """Log the information of the rollout.""" - self.record_queue.append( - ep_ret=self.rollout_data.rollout_log.ep_ret[idx], - ep_cost=self.rollout_data.rollout_log.ep_costs[idx], - ep_len=self.rollout_data.rollout_log.ep_len[idx], - ep_budget=self.rollout_data.rollout_log.ep_budget[idx], - ) - avg_ep_ret, avg_ep_cost, avg_ep_len, avg_ep_budget = self.record_queue.get_mean( - 'ep_ret', 'ep_cost', 'ep_len', 'ep_budget' - ) - if is_train: - logger.store( - **{ - 'Metrics/EpRet': avg_ep_ret, - 'Metrics/EpCost': avg_ep_cost, - 'Metrics/EpLen': avg_ep_len, - 'Metrics/EpBudget': avg_ep_budget, - } - ) - else: - logger.store( - **{ - 'Test/EpRet': avg_ep_ret, - 'Test/EpCost': avg_ep_cost, - 'Test/EpLen': avg_ep_len, - 'Test/EpBudget': avg_ep_budget, - } - ) diff --git a/omnisafe/wrappers/simmer_wrapper.py b/omnisafe/wrappers/simmer_wrapper.py deleted file mode 100644 index 068f45137..000000000 --- a/omnisafe/wrappers/simmer_wrapper.py +++ /dev/null @@ -1,688 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY pid_kiND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Environment wrapper for Simmer algorithm.""" - -import copy -from dataclasses import dataclass -from typing import Dict, Tuple - -import numpy as np -import torch -from gymnasium import spaces - -from omnisafe.common.normalizer import Normalizer -from omnisafe.common.record_queue import RecordQueue -from omnisafe.typing import NamedTuple, Optional -from omnisafe.utils.tools import as_tensor, expand_dims -from omnisafe.wrappers.cmdp_wrapper import CMDPWrapper -from omnisafe.wrappers.wrapper_registry import WRAPPER_REGISTRY - - -@dataclass -class RolloutLog: - """Log for roll out.""" - - ep_ret: np.ndarray = 0.0 - ep_costs: np.ndarray = 0.0 - ep_len: np.ndarray = 0.0 - ep_budget: np.ndarray = 0.0 - - -@dataclass -class SimmerData: - """Data for Simmer RL.""" - - safety_budget: float = 0.0 - upper_budget: float = 0.0 - lower_budget: float = 0.0 - relative_budget: float = 0.0 - unsafe_reward: float = 0.0 - safety_obs: np.ndarray = None - - -@dataclass -class RolloutData: - """Data for roll out.""" - - local_steps_per_epoch: int = 0 - max_ep_len: int = 0 - use_cost: bool = False - current_obs: np.ndarray = 0.0 - rollout_log: RolloutLog = None - simmer_data: SimmerData = None - - -@dataclass -class PidData: - """Data for PID controller.""" - - pid_kp: float - pid_ki: float - pid_kd: float - tau: float - step_size: float - - -@dataclass -class QData: - """Data for Q controller.""" - - state_dim: int - action_dim: int - tau: float - threshold: float - learning_rate: float - epsilon: float - - -@dataclass -class QTable: - """Q table for Q controller.""" - - action_space: np.ndarray - q_function: np.ndarray - state_space: np.ndarray - - -# pylint: disable-next=too-many-instance-attributes -class PidController: - """Using PID controller to control the safety budget in Simmer environment.""" - - def __init__( - self, - cfgs: NamedTuple, - safety_budget: float = 25.0, - lower_budget: float = 1.0, - upper_budget: float = 25.0, - ) -> None: - """Initialize the PID controller. - - Args: - cfgs (CfgNode): Configurations. - safety_budget (float): The initial safety budget. - lower_budget (float): The lower bound of safety budget. - upper_budget (float): The upper bound of safety budget. - """ - self.pid_data = PidData( - pid_kp=cfgs.pid_kp, - pid_ki=cfgs.pid_ki, - pid_kd=cfgs.pid_kd, - tau=cfgs.tau, - step_size=cfgs.step_size, - ) - self.simmer_data = SimmerData( - safety_budget=safety_budget, - upper_budget=upper_budget, - lower_budget=lower_budget, - ) - - # initialize the PID controller. - self.error = 0.0 - self.error_i = 0.0 - self.prev_action = 0 - self.prev_raw_action = 0 - self._init_check() - - def _init_check(self) -> None: - """Check the initial value of PID controller.""" - assert self.pid_data.pid_kp >= 0, 'pid_kp should be non-negative.' - assert self.pid_data.pid_ki >= 0, 'pid_ki should be non-negative.' - assert self.pid_data.pid_kd >= 0, 'pid_kd should be non-negative.' - assert self.pid_data.tau >= 0 and self.pid_data.tau <= 1, 'tau should be in [0, 1].' - assert self.pid_data.step_size > 0, 'step_size should be positive.' - assert ( - self.simmer_data.safety_budget >= self.simmer_data.lower_budget - ), 'safety_budget should be larger than lower_budget.' - assert ( - self.simmer_data.safety_budget <= self.simmer_data.upper_budget - ), 'safety_budget should be smaller than upper_budget.' - - def compute_raw_action(self, obs: float) -> float: - r"""Compute the raw action based on current obs. 
- - Detailedly, the raw action is computed by the PID controller. - - .. math:: - a = K_p e_p + K_i \int e_p dt + K_d \frac{de_p}{dt} - - where :math:`e_p` is the error of the PID controller. - - Args: - obs (float): The current observation. - """ - # low pass filter. - error_p = self.pid_data.tau * self.error + (1 - self.pid_data.tau) * ( - self.simmer_data.safety_budget - obs - ) - self.error_i += self.error - error_d = self.pid_data.pid_kd * (self.prev_action - self.prev_raw_action) - - # compute PID error. - curr_raw_action = ( - self.pid_data.pid_kp * error_p - + self.pid_data.pid_ki * self.error_i - + self.pid_data.pid_kd * error_d - ) - return curr_raw_action - - def act(self, obs: float) -> float: - """Compute the safety budget based on the observation ``Jc``, following the several steps: - - - Compute the raw action based on the observation ``Jc``. - - Clip the raw action. - - Compute the safety budget. - - Args: - obs (float): The current observation. - """ - curr_raw_action = self.compute_raw_action(obs) - - # clip the raw action. - curr_action = np.clip(curr_raw_action, -self.pid_data.step_size, self.pid_data.step_size) - self.prev_action = curr_action - self.prev_raw_action = curr_raw_action - raw_budget = self.simmer_data.safety_budget + curr_action - - # clip the safety budget. - self.simmer_data.safety_budget = np.clip( - raw_budget, self.simmer_data.lower_budget, self.simmer_data.upper_budget - ) - - return self.simmer_data.safety_budget - - -# pylint: disable-next=too-many-instance-attributes -class QController: - """Using Q-learning to control the safety budget in Simmer environment.""" - - def __init__( - self, - cfgs, - safety_budget: float = 25.0, - lower_budget: float = 1.0, - upper_budget: float = 25.0, - ) -> None: - """ " - Initialize the Q-learning controller. - - Args: - cfgs (CfgNode): The config file. - safety_budget (float): The initial safety budget. - lower_budget (float): The lower bound of the safety budget. - upper_budget (float): The upper bound of the safety budget. - """ - self.safety_budget = safety_budget - self.q_data = QData( - state_dim=cfgs.state_dim, - action_dim=cfgs.act_dim, - tau=cfgs.tau, - threshold=cfgs.threshold, - learning_rate=cfgs.q_lr, - epsilon=cfgs.epsilon, - ) - self.q_table = QTable( - action_space=np.linspace(-1, 1, cfgs.act_dim, dtype=int), - q_function=np.zeros((cfgs.state_dim, cfgs.act_dim)), - state_space=np.linspace(lower_budget, upper_budget, cfgs.state_dim), - ) - self.action = 0 - self.step(self.action) - - # initialize the observation (Cost value per epoch) buffer. - self.prev_obs = copy.copy(self.safety_budget) - self.filtered_obs_buffer = [] - self.filtered_obs = 0 - self._init_check() - - def _init_check(self) -> None: - """Check the initial value of Q-learning controller.""" - assert self.q_data.state_dim > 0, 'state_dim should be positive.' - assert self.q_data.action_dim > 0, 'action_dim should be positive.' - assert self.q_data.tau >= 0 and self.q_data.tau <= 1, 'tau should be in [0, 1].' - assert self.q_data.threshold >= 0, 'threshold should be non-negative.' - assert self.q_data.learning_rate > 0, 'learning_rate should be positive.' - assert self.q_data.epsilon >= 0 and self.q_data.epsilon <= 1, 'epsilon should be in [0, 1].' - - def get_state_idx(self, state: float) -> int: - """Get the state index. - - Args: - state (float): The current state. 
- """ - state_idx = np.argwhere(self.q_table.state_space == state)[0][0] - return state_idx - - def get_action_idx(self, action: float) -> int: - """Get the action index. - - Args: - action (float): The current action. - """ - action_idx = np.argwhere(self.q_table.action_space == action) - return action_idx - - def get_random_action(self) -> float: - """Get the random action. - - Returns: - float: The random action. - """ - action_idx = np.random.randint(0, self.q_data.action_dim) - return self.q_table.action_space[action_idx] - - def get_greedy_action(self, state: float) -> float: - """Get the greedy action. - - Args: - state (float): The current state(``cost_limit``). - """ - state_idx = self.get_state_idx(state) - action_idx = np.argmax(self.q_table.q_function[state_idx, :]) - action = self.q_table.action_space[action_idx] - return action - - def update_q_function( - self, state: float, action: float, reward: float, next_state: float - ) -> None: - """Update the Q function using the Bellman equation. - - Detailedly, the Q function is updated as follows: - - .. math:: - Q(s, a) = (1 - \\alpha) Q(s, a) + \\alpha (r + \\tau \\max_{a'} Q(s', a')) - - where :math:`s` is the current state, :math:`a` is the current action, - :math:`r` is the reward, :math:`s'` is the next state, - :math:`\\alpha` is the learning rate, - and :math:`\\tau` is the discount factor. - - Args: - state (float): The current state. - action (float): The current action. - reward (float): The reward. - next_state (float): The next state. - """ - state_idx = self.get_state_idx(state) - action_idx = self.get_action_idx(action) - next_state_idx = self.get_state_idx(next_state) - self.q_table.q_function[state_idx, action_idx] = ( - 1 - self.q_data.learning_rate - ) * self.q_table.q_function[state_idx, action_idx] + self.q_data.learning_rate * ( - reward + self.q_data.tau * np.max(self.q_table.q_function[next_state_idx, :]) - ) - - def step(self, action: float) -> float: - """Step the environment. - - Args: - action (float): The current action. - """ - state_idx = self.get_state_idx(self.safety_budget) - state_idx = np.clip(state_idx + action, 0, self.q_data.state_dim - 1, dtype=int) - self.safety_budget = self.q_table.state_space[state_idx] - return self.safety_budget - - def reward(self, state: float, action: float, obs: float) -> float: - r"""Get the reward function based on whether the observation is within the threshold. - - Detailedly, the reward function is defined as follows: - - .. list-table:: - - * - States - - Increase - - No change - - Decrease - * - Unsafe - - -1 - - -1 - - 2 - * - Safe - - 0.5 - - 1 - - -1 - * - Very Safe - - 0.5 - - 1 - - -1 - - Args: - state (float): The current state. - action (float): The current action. - obs (float): The observation. - """ - action_idx = self.get_action_idx(action) - if int(self.q_data.threshold > obs - state and obs - state > -self.q_data.threshold): - reward = np.array([-1, 1, 0.5])[action_idx] - elif int(obs - state <= -self.q_data.threshold): - reward = np.array([-1, 0, 2])[action_idx] - elif int(obs - state >= self.q_data.threshold): - reward = np.array([2, -1, -1])[action_idx] - return reward[0] - - def act(self, obs: float) -> float: - """Compute the safety budget based on the observation ``Jc``, following the several steps: - - - Filter the observation using a low-pass filter. - - Use epsilon greedy to explore the environment. - - Update the Q function by calling :meth:`update_q_function`. - - Return the safety budget. 
- - Args: - obs (float): The current observation. - - """ - prev_obs = self.filtered_obs - self.filtered_obs = self.q_data.tau * prev_obs + (1 - self.q_data.tau) * obs - self.filtered_obs_buffer.append(self.filtered_obs) - state = self.safety_budget - - # use epsilon greedy to explore the environment - epsilon = np.random.random() - if epsilon > self.q_data.epsilon: - action = self.get_random_action() - else: - action = self.get_greedy_action(state) - reward = self.reward(state, action, self.filtered_obs) - next_state = self.step(action) - safety_budget = next_state - - # update the Q function - self.update_q_function(state, action, reward, next_state) - return safety_budget - - -@WRAPPER_REGISTRY.register -# pylint: disable-next=too-many-instance-attributes -class SimmerWrapper(CMDPWrapper): - r"""SimmerEnvWrapper. - - Simmer is a safe RL algorithm that uses a safety budget to control the exploration of the RL agent. - Similar to :class:`SauteEnvWrapper`, Simmer uses state augmentation to ensure safety. - Additionally, Simmer uses PID controller and Q learning controller to control the safety budget. - - .. note:: - - - If the safety state is greater than 0, the reward is the original reward. - - If the safety state is less than 0, the reward is the unsafe reward (always 0 or less than 0). - - ``omnisafe`` provides two implementations of Simmer RL: :class:`PPOSimmer` and :class:`PPOLagSimmer`. - - References: - - - Title: Saute RL: Almost Surely Safe Reinforcement Learning Using State Augmentation - - Authors: Aivar Sootla, Alexander I. Cowen-Rivers, Taher Jafferjee, Ziyan Wang, - David Mguni, Jun Wang, Haitham Bou-Ammar. - - URL: https://arxiv.org/abs/2202.06558 - - """ - - def __init__(self, env_id, cfgs: Optional[NamedTuple] = None, **env_kwargs) -> None: - """Initialize environment wrapper. - - Args: - env_id (str): environment id. - cfgs (collections.namedtuple): configs. - env_kwargs (dict): The additional parameters of environments. 
- """ - super().__init__(env_id, cfgs, **env_kwargs) - if hasattr(self.env, '_max_episode_steps'): - max_ep_len = self.env._max_episode_steps - else: - max_ep_len = 1000 - if cfgs.scale_safety_budget: - safety_budget = ( - cfgs.lower_budget - * (1 - cfgs.simmer_gamma**max_ep_len) - / (1 - cfgs.simmer_gamma) - / np.float32(max_ep_len) - ) - lower_budget = ( - cfgs.lower_budget - * (1 - cfgs.simmer_gamma**max_ep_len) - / (1 - cfgs.simmer_gamma) - / np.float32(max_ep_len) - ) - upper_budget = ( - cfgs.upper_budget - * (1 - cfgs.simmer_gamma**max_ep_len) - / (1 - cfgs.simmer_gamma) - / np.float32(max_ep_len) - ) - else: - safety_budget = cfgs.lower_budget - lower_budget = cfgs.lower_budget - upper_budget = cfgs.upper_budget - self.rollout_data = RolloutData( - 0.0, - max_ep_len, - False, - None, - RolloutLog( - np.zeros(self.cfgs.num_envs), - np.zeros(self.cfgs.num_envs), - np.zeros(self.cfgs.num_envs), - np.zeros((self.cfgs.num_envs, 1)), - ), - SimmerData( - safety_budget=safety_budget, - upper_budget=upper_budget, - lower_budget=lower_budget, - relative_budget=safety_budget / upper_budget, - unsafe_reward=cfgs.unsafe_reward, - safety_obs=safety_budget / upper_budget, - ), - ) - high = np.array(np.hstack([self.observation_space.high, np.inf]), dtype=np.float32) - low = np.array(np.hstack([self.observation_space.low, np.inf]), dtype=np.float32) - self.observation_space = spaces.Box(high=high, low=low) - self.obs_normalizer = ( - Normalizer(shape=(self.cfgs.num_envs, self.observation_space.shape[0]), clip=5).to( - device=self.cfgs.device - ) - if self.cfgs.normalized_obs - else None - ) - self.record_queue = RecordQueue( - 'ep_ret', 'ep_cost', 'ep_len', 'ep_budget', maxlen=self.cfgs.max_len - ) - if cfgs.simmer_controller == 'PID': - self.controller = PidController( - cfgs.controller_cfgs, - safety_budget=self.rollout_data.simmer_data.safety_budget, - lower_budget=self.rollout_data.simmer_data.lower_budget, - upper_budget=self.rollout_data.simmer_data.upper_budget, - ) - elif cfgs.simmer_controller == 'Q': - self.controller = QController( - cfgs.controller_cfgs, - safety_budget=self.rollout_data.simmer_data.safety_budget, - lower_budget=self.rollout_data.simmer_data.lower_budget, - upper_budget=self.rollout_data.simmer_data.upper_budget, - ) - else: - raise NotImplementedError( - f'Controller type {cfgs.simmer_controller} is not implemented.' - ) - self.rollout_data.current_obs = self.reset()[0] - - def _init_check(self) -> None: - super()._init_check() - assert ( - self.cfgs.simmer_gamma >= 0 and self.cfgs.simmer_gamma <= 1 - ), 'The simmer gamma should be in [0, 1].' - - def augment_obs(self, obs: np.ndarray) -> np.ndarray: - """Augmenting the obs with the safety obs. - - Detailedly, the augmented obs is the concatenation of the original obs and the safety obs. - The safety obs is the safety budget minus the cost divided by the safety budget. - - Args: - obs (np.ndarray): observation. - safety_obs (np.ndarray): safety observation. - """ - augmented_obs = np.hstack([obs, self.rollout_data.simmer_data.safety_obs]) - return augmented_obs - - def safety_step(self, cost: np.ndarray, done: bool) -> np.ndarray: - """Update the normalized safety obs. - - Args: - cost (np.ndarray): cost. 
- """ - if done: - self.rollout_data.simmer_data.safety_obs = np.ones( - (self.cfgs.num_envs, 1), dtype=np.float32 - ) - else: - self.rollout_data.simmer_data.safety_obs -= ( - cost / self.rollout_data.simmer_data.upper_budget - ) - self.rollout_data.simmer_data.safety_obs /= self.cfgs.simmer_gamma - - def safety_reward(self, reward: np.ndarray) -> np.ndarray: - """Update the reward. - - Args: - reward (np.ndarray): reward. - next_safety_obs (np.ndarray): next safety observation. - """ - for idx, safety_obs in enumerate(self.rollout_data.simmer_data.safety_obs): - if safety_obs <= 0: - reward[idx] = self.rollout_data.simmer_data.unsafe_reward - return reward - - def reset(self) -> Tuple[torch.Tensor, Dict]: - r"""Reset environment. - - .. note:: - The safety obs is initialized to ``rel_safety_budget``, - which is the safety budget divided by the upper budget. - The safety budget is controlled by the controller. - - Args: - seed (int): seed for environment reset. - """ - obs, info = self.env.reset() - if self.cfgs.num_envs == 1: - obs = expand_dims(obs) - info = [info] - self.rollout_data.simmer_data.relative_budget = ( - self.rollout_data.simmer_data.safety_budget / self.rollout_data.simmer_data.upper_budget - ) - self.rollout_data.simmer_data.safety_obs = ( - self.rollout_data.simmer_data.relative_budget - * np.ones((self.cfgs.num_envs, 1), dtype=np.float32) - ) - obs = self.augment_obs(obs) - return torch.as_tensor(obs, dtype=torch.float32, device=self.cfgs.device), info - - def step( - self, action: torch.Tensor - ) -> tuple((torch.Tensor, torch.Tensor, torch.Tensor, bool, dict)): - """Step environment. - - .. note:: - The safety obs is updated by the cost. - The reward is updated by the safety obs. - Detailedly, the reward is the original reward if the safety obs is greater than 0, - otherwise the reward is the unsafe reward. - - Args: - action (torch.Tensor): action. - """ - next_obs, reward, cost, terminated, truncated, info = self.env.step( - action.cpu().numpy().squeeze() - ) - if self.cfgs.num_envs == 1: - next_obs, reward, cost, terminated, truncated, info = expand_dims( - next_obs, reward, cost, terminated, truncated, info - ) - self.safety_step(cost, done=terminated | truncated) - if terminated | truncated: - augmented_obs, info = self.reset() - else: - augmented_obs = self.augment_obs(next_obs) - else: - augmented_obs = self.augment_obs(next_obs) - self.rollout_data.rollout_log.ep_ret += reward - self.rollout_data.rollout_log.ep_costs += cost - self.rollout_data.rollout_log.ep_len += np.ones(self.cfgs.num_envs) - self.rollout_data.rollout_log.ep_budget += self.rollout_data.simmer_data.safety_obs - reward = self.safety_reward(reward) - return ( - as_tensor(augmented_obs, reward, cost, device=self.cfgs.device), - terminated, - truncated, - info, - ) - - def set_budget(self, Jc): - """Set the safety budget by the controller. - - Args: - Jc (np.ndarray): The safety budget. 
- """ - self.rollout_data.simmer_data.safety_budget = self.controller.act(Jc) - - def rollout_log( - self, - logger, - idx, - is_train: bool = True, - ) -> None: - """Log the information of the rollout.""" - self.record_queue.append( - ep_ret=self.rollout_data.rollout_log.ep_ret[idx], - ep_cost=self.rollout_data.rollout_log.ep_costs[idx], - ep_len=self.rollout_data.rollout_log.ep_len[idx], - ep_budget=self.rollout_data.rollout_log.ep_budget[idx], - ) - avg_ep_ret, avg_ep_cost, avg_ep_len, avg_ep_budget = self.record_queue.get_mean( - 'ep_ret', 'ep_cost', 'ep_len', 'ep_budget' - ) - if is_train: - logger.store( - **{ - 'Metrics/EpRet': avg_ep_ret, - 'Metrics/EpCost': avg_ep_cost, - 'Metrics/EpLen': avg_ep_len, - 'Metrics/EpBudget': avg_ep_budget, - 'Metrics/SafetyBudget': self.rollout_data.simmer_data.safety_budget, - } - ) - self.set_budget(avg_ep_cost) - else: - logger.store( - **{ - 'Test/EpRet': avg_ep_ret, - 'Test/EpCost': avg_ep_cost, - 'Test/EpLen': avg_ep_len, - 'Test/EpBudget': avg_ep_budget, - 'Test/SafetyBudget': self.rollout_data.simmer_data.safety_budget, - } - ) - - def reset_log( - self, - idx, - ) -> None: - ( - self.rollout_data.rollout_log.ep_ret[idx], - self.rollout_data.rollout_log.ep_costs[idx], - self.rollout_data.rollout_log.ep_len[idx], - self.rollout_data.rollout_log.ep_budget[idx], - ) = (0.0, 0.0, 0.0, 0.0) diff --git a/omnisafe/wrappers/wrapper_registry.py b/omnisafe/wrappers/wrapper_registry.py deleted file mode 100644 index 7ff4e47c1..000000000 --- a/omnisafe/wrappers/wrapper_registry.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Registry for algorithms.""" - -import inspect - - -class WrapperRegistry: - """A registry to map strings to classes. - Args: - name (str): Registry name. - """ - - def __init__(self, name): - self._name = name - self._module_dict = {} - - def __repr__(self): - format_str = ( - self.__class__.__name__ + f'(name={self._name}, items={list(self._module_dict.keys())})' - ) - return format_str - - @property - def name(self): - """Return the name of the registry.""" - return self._name - - @property - def module_dict(self): - """Return a dict mapping names to classes.""" - return self._module_dict - - def get(self, key): - """Get the class that has been registered under the given key.""" - return self._module_dict.get(key, None) - - def _register_module(self, module_class): - """Register a module. - Args: - module (:obj:`nn.Module`): Module to be registered. 
- """ - if not inspect.isclass(module_class): - raise TypeError(f'module must be a class, but got {type(module_class)}') - module_name = module_class.__name__ - if module_name in self._module_dict: - raise KeyError(f'{module_name} is already registered in {self.name}') - self._module_dict[module_name] = module_class - - def register(self, cls): - """Register a module class.""" - self._register_module(cls) - return cls - - -WRAPPER_REGISTRY = WrapperRegistry('OmniSafe-Wrappers') - - -register = WRAPPER_REGISTRY.register -get = WRAPPER_REGISTRY.get diff --git a/pyproject.toml b/pyproject.toml index b2dfc8437..23502074b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,6 +34,7 @@ dependencies = [ "scipy >= 1.7.0", "joblib >= 1.2.0", "pyyaml >= 6.0", + "types-pyyaml >= 6.0", "xmltodict >= 0.13.0", "moviepy >= 1.0.0", "typing-extensions >= 4.0.0", diff --git a/tests/test_model.py b/tests/test_model.py index 98cc2d482..e29946407 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -25,8 +25,8 @@ from omnisafe.models import ActorBuilder, CriticBuilder from omnisafe.models.actor_critic import ActorCritic from omnisafe.models.actor_q_critic import ActorQCritic +from omnisafe.typing import Activation, InitFunction from omnisafe.utils.config import Config -from omnisafe.utils.model_utils import Activation, InitFunction @helpers.parametrize( diff --git a/tests/test_utils.py b/tests/test_utils.py deleted file mode 100644 index 533cd3ddf..000000000 --- a/tests/test_utils.py +++ /dev/null @@ -1,124 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Test Utils""" - -import os -import sys - -import numpy as np -import torch - -import helpers -import omnisafe -from omnisafe.common.experiment_grid import ExperimentGrid -from omnisafe.typing import NamedTuple, Tuple -from omnisafe.utils.core import discount_cumsum_torch -from omnisafe.utils.distributed_utils import mpi_fork, mpi_statistics_scalar -from omnisafe.utils.tools import to_ndarray - - -@helpers.parametrize(item=[1, 1.0, [1, 2, 3], (1, 2, 3), {'a': 1, 'b': 2}, torch.tensor([1, 2, 3])]) -def test_to_ndarray(item): - """Test to_ndarray.""" - if isinstance(item, torch.Tensor): - assert isinstance(to_ndarray(item), np.ndarray) - elif isinstance(item, list): - out_list = to_ndarray(item) - for val in out_list: - assert isinstance(val, np.ndarray) - elif isinstance(item, tuple): - out_tuple = to_ndarray(item) - for val in out_tuple: - assert isinstance(val, np.ndarray) - elif isinstance(item, dict): - out_dict = to_ndarray(item) - for val in out_dict.values(): - assert isinstance(val, np.ndarray) - else: - assert isinstance(to_ndarray(item), np.ndarray) - - -def get_answer(gamma: float) -> torch.Tensor: - """Input gamma and return the answer.""" - if gamma == 0.9: - return torch.tensor([11.4265, 11.5850, 10.6500, 8.5000, 5.0000], dtype=torch.float64) - elif gamma == 0.99: - return torch.tensor([14.6045, 13.7419, 11.8605, 8.9500, 5.0000], dtype=torch.float64) - elif gamma == 0.999: - return torch.tensor([14.9600, 13.9740, 11.9860, 8.9950, 5.0000], dtype=torch.float64) - - -@helpers.parametrize( - discount=[0.9, 0.99, 0.999], -) -def test_discount_cumsum_torch( - discount: float, -): - """Test discount_cumsum_torch.""" - x1 = torch.tensor([1.0, 2.0, 3.0, 4.0, 5.0], dtype=torch.float64) - y1 = get_answer(discount) - assert torch.allclose( - discount_cumsum_torch(x1, discount), y1 - ), 'discount_cumsum_torch is not correct' - - -def test_distributed_tools(): - """Test mpi_fork.""" - mpi_fork(2, test_message=['examples/train_from_custom_dict.py', '--parallel', '2']) - - -def train( - exp_id: str, algo: str, env_id: str, custom_cfgs: NamedTuple, num_threads: int = 6 -) -> Tuple[float, float, float]: - """Train a policy from exp-x config with OmniSafe. - - Args: - exp_id (str): Experiment ID. - algo (str): Algorithm to train. - env_id (str): The name of test environment. - custom_cfgs (NamedTuple): Custom configurations. - num_threads (int, optional): Number of threads. Defaults to 6. - """ - torch.set_num_threads(num_threads) - sys.stdout = sys.__stdout__ - sys.stderr = sys.__stderr__ - print(f'exp-x: {exp_id} is training...') - USE_REDIRECTION = True - if USE_REDIRECTION: - if not os.path.exists(custom_cfgs['data_dir']): - os.makedirs(custom_cfgs['data_dir']) - sys.stdout = open(f'{custom_cfgs["data_dir"]}terminal.log', 'w', encoding='utf-8') - sys.stderr = open(f'{custom_cfgs["data_dir"]}error.log', 'w', encoding='utf-8') - agent = omnisafe.Agent(algo, env_id, custom_cfgs=custom_cfgs) - reward, cost, ep_len = agent.learn() - return reward, cost, ep_len - - -def test_train( - exp_name='Safety_Gymnasium_Goal', - algo='CPO', - env_id='SafetyHalfCheetahVelocity-v4', - epochs=1, - steps_per_epoch=1000, - num_envs=1, -): - """Test train.""" - eg = ExperimentGrid(exp_name=exp_name) - eg.add('algo', [algo]) - eg.add('env_id', [env_id]) - eg.add('epochs', [epochs]) - eg.add('steps_per_epoch', [steps_per_epoch]) - eg.add('env_cfgs', [{'num_envs': num_envs}]) - eg.run(train, num_pool=1, is_test=True)
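The deleted `test_discount_cumsum_torch` above pins down the behaviour of the removed `discount_cumsum_torch` helper: a right-to-left discounted cumulative sum, y[t] = x[t] + discount * y[t + 1]. A minimal standalone sketch consistent with the expected values hard-coded in that test follows; the name `discount_cumsum` and the explicit reverse loop are illustrative assumptions, not the implementation that was removed.

import torch


def discount_cumsum(x: torch.Tensor, discount: float) -> torch.Tensor:
    """Right-to-left discounted cumulative sum: y[t] = x[t] + discount * y[t + 1]."""
    out = torch.zeros_like(x)
    running = 0.0
    for t in reversed(range(x.shape[0])):
        # accumulate from the end of the sequence towards the front
        running = x[t] + discount * running
        out[t] = running
    return out


if __name__ == '__main__':
    x = torch.tensor([1.0, 2.0, 3.0, 4.0, 5.0], dtype=torch.float64)
    # matches the deleted test's expectation for discount=0.9:
    # tensor([11.4265, 11.5850, 10.6500, 8.5000, 5.0000], dtype=torch.float64)
    print(discount_cumsum(x, 0.9))

Running the sketch with discount=0.99 or 0.999 likewise reproduces the other expected vectors from the deleted test, which is how the recurrence above was inferred.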