From ccad2185ccf905c0cdd8e924856a7043d8ffd0d4 Mon Sep 17 00:00:00 2001 From: zmsn-2077 <73586554+zmsn-2077@users.noreply.github.com> Date: Mon, 6 Mar 2023 00:12:29 +0800 Subject: [PATCH] feat: update architecture of config.yaml (#126) --- .github/workflows/ci.yml | 24 +- examples/benchmarks/run_experiment_grid.py | 18 +- examples/train_from_custom_dict.py | 18 +- examples/train_policy.py | 47 +- omnisafe/adapter/online_adapter.py | 6 +- omnisafe/adapter/onpolicy_adapter.py | 2 +- omnisafe/algorithms/__init__.py | 8 +- omnisafe/algorithms/algo_wrapper.py | 72 +- omnisafe/algorithms/base_algo.py | 4 +- omnisafe/algorithms/on_policy/__init__.py | 21 +- .../algorithms/on_policy/base/natural_pg.py | 6 +- .../on_policy/base/policy_gradient.py | 93 +-- omnisafe/algorithms/on_policy/base/ppo.py | 6 +- omnisafe/algorithms/on_policy/base/trpo.py | 6 +- .../on_policy/early_terminated/__init__.py | 26 - .../early_terminated/ppo_early_terminated.py | 36 - .../ppo_lag_early_terminated.py | 37 - .../algorithms/on_policy/first_order/cup.py | 14 +- .../on_policy/first_order/focops.py | 16 +- .../on_policy/naive_lagrange/crpo.py | 2 +- .../on_policy/penalty_function/ipo.py | 6 +- .../on_policy/penalty_function/p3o.py | 10 +- .../on_policy/pid_lagrange/__init__.py | 24 - .../on_policy/pid_lagrange/cppo_pid.py | 88 --- .../on_policy/pid_lagrange/trpo_pid.py | 80 --- .../algorithms/on_policy/saute/__init__.py | 24 - .../on_policy/saute/ppo_lag_saute.py | 45 -- .../algorithms/on_policy/saute/ppo_saute.py | 45 -- .../algorithms/on_policy/second_order/cpo.py | 22 +- .../algorithms/on_policy/second_order/pcpo.py | 14 +- omnisafe/common/logger.py | 8 +- omnisafe/configs/model-based/CAP.yaml | 95 --- omnisafe/configs/model-based/MBPPOLag.yaml | 148 ---- omnisafe/configs/model-based/SafeLOOP.yaml | 129 ---- omnisafe/configs/off-policy/CVPO.yaml | 174 ----- omnisafe/configs/off-policy/DDPG.yaml | 143 ---- omnisafe/configs/off-policy/DDPGLag.yaml | 158 ----- omnisafe/configs/off-policy/DDPGPid.yaml | 172 ----- .../configs/off-policy/DDPGSafetyLayer.yaml | 150 ----- omnisafe/configs/off-policy/OffCRPO.yaml | 144 ---- omnisafe/configs/off-policy/SAC.yaml | 154 ----- omnisafe/configs/off-policy/SACLag.yaml | 164 ----- omnisafe/configs/off-policy/SACPid.yaml | 178 ----- omnisafe/configs/off-policy/SDDPG.yaml | 156 ----- omnisafe/configs/off-policy/TD3.yaml | 275 -------- omnisafe/configs/off-policy/TD3Lag.yaml | 158 ----- omnisafe/configs/off-policy/TD3Pid.yaml | 170 ----- omnisafe/configs/on-policy/CPO.yaml | 228 +++---- omnisafe/configs/on-policy/CPPOPid.yaml | 176 ----- omnisafe/configs/on-policy/CUP.yaml | 223 +++---- omnisafe/configs/on-policy/FOCOPS.yaml | 225 +++---- omnisafe/configs/on-policy/IPO.yaml | 234 +++---- omnisafe/configs/on-policy/NaturalPG.yaml | 226 +++---- omnisafe/configs/on-policy/OnCRPO.yaml | 222 +++--- omnisafe/configs/on-policy/P3O.yaml | 220 +++--- omnisafe/configs/on-policy/PCPO.yaml | 228 +++---- omnisafe/configs/on-policy/PDO.yaml | 214 +++--- omnisafe/configs/on-policy/PPO.yaml | 214 +++--- .../configs/on-policy/PPOEarlyTerminated.yaml | 154 ----- omnisafe/configs/on-policy/PPOLag.yaml | 216 +++--- .../on-policy/PPOLagEarlyTerminated.yaml | 164 ----- omnisafe/configs/on-policy/PPOLagSaute.yaml | 170 ----- .../configs/on-policy/PPOLagSimmerPid.yaml | 186 ------ omnisafe/configs/on-policy/PPOLagSimmerQ.yaml | 188 ------ omnisafe/configs/on-policy/PPOSaute.yaml | 158 ----- omnisafe/configs/on-policy/PPOSimmerPid.yaml | 176 ----- omnisafe/configs/on-policy/PPOSimmerQ.yaml | 178 ----- 
.../configs/on-policy/PolicyGradient.yaml | 202 +++--- omnisafe/configs/on-policy/RCPO.yaml | 228 +++---- omnisafe/configs/on-policy/TRPO.yaml | 230 +++---- omnisafe/configs/on-policy/TRPOLag.yaml | 228 +++---- omnisafe/configs/on-policy/TRPOPid.yaml | 180 ----- omnisafe/models/actor_critic/actor_critic.py | 40 +- .../actor_critic/constraint_actor_critic.py | 7 +- omnisafe/utils/config.py | 129 ++-- omnisafe/utils/tools.py | 59 ++ pyproject.toml | 1 + tests/test_model.py | 630 +++++++++--------- tests/test_policy.py | 353 +++++----- tests/test_safety_gym_envs.py | 67 -- 80 files changed, 2337 insertions(+), 7413 deletions(-) delete mode 100644 omnisafe/algorithms/on_policy/early_terminated/__init__.py delete mode 100644 omnisafe/algorithms/on_policy/early_terminated/ppo_early_terminated.py delete mode 100644 omnisafe/algorithms/on_policy/early_terminated/ppo_lag_early_terminated.py delete mode 100644 omnisafe/algorithms/on_policy/pid_lagrange/__init__.py delete mode 100644 omnisafe/algorithms/on_policy/pid_lagrange/cppo_pid.py delete mode 100644 omnisafe/algorithms/on_policy/pid_lagrange/trpo_pid.py delete mode 100644 omnisafe/algorithms/on_policy/saute/__init__.py delete mode 100644 omnisafe/algorithms/on_policy/saute/ppo_lag_saute.py delete mode 100644 omnisafe/algorithms/on_policy/saute/ppo_saute.py delete mode 100644 omnisafe/configs/model-based/CAP.yaml delete mode 100644 omnisafe/configs/model-based/MBPPOLag.yaml delete mode 100644 omnisafe/configs/model-based/SafeLOOP.yaml delete mode 100644 omnisafe/configs/off-policy/CVPO.yaml delete mode 100644 omnisafe/configs/off-policy/DDPG.yaml delete mode 100644 omnisafe/configs/off-policy/DDPGLag.yaml delete mode 100644 omnisafe/configs/off-policy/DDPGPid.yaml delete mode 100644 omnisafe/configs/off-policy/DDPGSafetyLayer.yaml delete mode 100644 omnisafe/configs/off-policy/OffCRPO.yaml delete mode 100644 omnisafe/configs/off-policy/SAC.yaml delete mode 100644 omnisafe/configs/off-policy/SACLag.yaml delete mode 100644 omnisafe/configs/off-policy/SACPid.yaml delete mode 100644 omnisafe/configs/off-policy/SDDPG.yaml delete mode 100644 omnisafe/configs/off-policy/TD3.yaml delete mode 100644 omnisafe/configs/off-policy/TD3Lag.yaml delete mode 100644 omnisafe/configs/off-policy/TD3Pid.yaml delete mode 100644 omnisafe/configs/on-policy/CPPOPid.yaml delete mode 100644 omnisafe/configs/on-policy/PPOEarlyTerminated.yaml delete mode 100644 omnisafe/configs/on-policy/PPOLagEarlyTerminated.yaml delete mode 100644 omnisafe/configs/on-policy/PPOLagSaute.yaml delete mode 100644 omnisafe/configs/on-policy/PPOLagSimmerPid.yaml delete mode 100644 omnisafe/configs/on-policy/PPOLagSimmerQ.yaml delete mode 100644 omnisafe/configs/on-policy/PPOSaute.yaml delete mode 100644 omnisafe/configs/on-policy/PPOSimmerPid.yaml delete mode 100644 omnisafe/configs/on-policy/PPOSimmerQ.yaml delete mode 100644 omnisafe/configs/on-policy/TRPOPid.yaml delete mode 100644 tests/test_safety_gym_envs.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2933289bc..b1f6ee8d3 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -61,9 +61,9 @@ jobs: run: | make addlicense - - name: mypy - run: | - make mypy + # - name: mypy + # run: | + # make mypy - name: Install dependencies run: | @@ -80,15 +80,15 @@ jobs: # TODO: enable this when ready # - name: Run tests and collect coverage - # run: | - # pytest tests --ignore-glob='*profile.py' --cov=omnisafe --cov-report=xml - # --cov-report=term-missing --durations=0 -v --color=yes + # run: | + # pytest 
tests --ignore-glob='*profile.py' --cov=omnisafe --cov-report=xml + # --cov-report=term-missing --durations=0 -v --color=yes # TODO: enable this when ready # - name: Upload coverage reports to Codecov - # run: | - # # Replace `linux` below with the appropriate OS - # # Options are `alpine`, `linux`, `macos`, `windows` - # curl -Os https://uploader.codecov.io/latest/linux/codecov - # chmod +x codecov - # ./codecov -t ${CODECOV_TOKEN=634594d3-0416-4632-ab6a-3bf34a8c0af3} + # run: | + # # Replace `linux` below with the appropriate OS + # # Options are `alpine`, `linux`, `macos`, `windows` + # curl -Os https://uploader.codecov.io/latest/linux/codecov + # chmod +x codecov + # ./codecov -t ${CODECOV_TOKEN=634594d3-0416-4632-ab6a-3bf34a8c0af3} diff --git a/examples/benchmarks/run_experiment_grid.py b/examples/benchmarks/run_experiment_grid.py index 2f06baa66..56634f042 100644 --- a/examples/benchmarks/run_experiment_grid.py +++ b/examples/benchmarks/run_experiment_grid.py @@ -53,10 +53,16 @@ def train( if __name__ == '__main__': eg = ExperimentGrid(exp_name='Safety_Gymnasium_Goal') - eg.add('algo', ['PPO', 'PPOLag']) + base_policy = ['PolicyGradient', 'NaturalPG', 'TRPO', 'PPO'] + naive_lagrange_policy = ['PPOLag', 'TRPOLag', 'RCPO', 'OnCRPO', 'PDO'] + first_order_policy = ['CUP', 'FOCOPS'] + second_order_policy = ['CPO', 'PCPO'] + eg.add('algo', base_policy + naive_lagrange_policy + first_order_policy + second_order_policy) eg.add('env_id', ['SafetyPointGoal1-v0']) - eg.add('epochs', 1) - eg.add('actor_lr', [0.001, 0.003, 0.004], 'lr', True) - eg.add('actor_iters', [1, 2], 'ac_iters', True) - eg.add('seed', [0, 5, 10]) - eg.run(train, num_pool=10) + eg.add('logger_cfgs:use_wandb', [True]) + eg.add('logger_cfgs:wandb_project', ['omnisafe_jiaming']) + # eg.add('train_cfgs:total_steps', 2000) + # eg.add('algo_cfgs:update_cycle', 1000) + # eg.add('train_cfgs:vector_env_nums', 1) + eg.add('seed', [0]) + eg.run(train, num_pool=13) diff --git a/examples/train_from_custom_dict.py b/examples/train_from_custom_dict.py index 29c4616a5..d1d6f770c 100644 --- a/examples/train_from_custom_dict.py +++ b/examples/train_from_custom_dict.py @@ -28,9 +28,23 @@ metavar='N', help='Number of paralleled progress for calculations.', ) -custom_dict = {'epochs': 1, 'data_dir': './runs'} +custom_cfgs = { + 'train_cfgs': { + 'total_steps': 1000, + }, + 'algo_cfgs': { + 'update_cycle': 1000, + 'update_iters': 1, + }, + 'logger_cfgs': { + 'use_wandb': False, + }, + 'env_cfgs': { + 'vector_env_nums': 1, + }, +} args, _ = parser.parse_known_args() -agent = omnisafe.Agent('PPOLag', env_id, custom_cfgs=custom_dict, parallel=args.parallel) +agent = omnisafe.Agent('PPOLag', env_id, custom_cfgs=custom_cfgs, parallel=args.parallel) agent.learn() # obs = env.reset() diff --git a/examples/train_policy.py b/examples/train_policy.py index 2236aac86..5d3451acd 100644 --- a/examples/train_policy.py +++ b/examples/train_policy.py @@ -17,6 +17,7 @@ import argparse import omnisafe +from omnisafe.utils.tools import custom_cfgs_to_dict, update_dic if __name__ == '__main__': @@ -26,7 +27,7 @@ type=str, metavar='ALGO', default='PPOLag', - help='Algorithm to train', + help='algorithm to train', choices=omnisafe.ALGORITHMS['all'], ) parser.add_argument( @@ -34,24 +35,56 @@ type=str, metavar='ENV', default='SafetyPointGoal1-v0', - help='The name of test environment', + help='the name of test environment', ) parser.add_argument( '--parallel', default=1, type=int, metavar='N', - help='Number of paralleled progress for calculations.', + help='number of 
paralleled progress for calculations.', + ) + parser.add_argument( + '--total-steps', + type=int, + default=1638400, + metavar='STEPS', + help='total number of steps to train for algorithm', + ) + parser.add_argument( + '--device', + type=str, + default='cpu', + metavar='DEVICES', + help='device to use for training', + ) + parser.add_argument( + '--vector-env-nums', + type=int, + default=16, + metavar='VECTOR-ENV', + help='number of vector envs to use for training', + ) + parser.add_argument( + '--torch-threads', + type=int, + default=16, + metavar='THREADS', + help='number of threads to use for torch', ) args, unparsed_args = parser.parse_known_args() keys = [k[2:] for k in unparsed_args[0::2]] values = list(unparsed_args[1::2]) - unparsed_dict = dict(zip(keys, values)) - # env = omnisafe.Env(args.env_id) + unparsed_args = dict(zip(keys, values)) + + custom_cfgs = {} + for k, v in unparsed_args.items(): + update_dic(custom_cfgs, custom_cfgs_to_dict(k, v)) + agent = omnisafe.Agent( args.algo, args.env_id, - parallel=args.parallel, - custom_cfgs=unparsed_dict, + train_terminal_cfgs=vars(args), + custom_cfgs=custom_cfgs, ) agent.learn() diff --git a/omnisafe/adapter/online_adapter.py b/omnisafe/adapter/online_adapter.py index ba9277c4b..f8f483ed3 100644 --- a/omnisafe/adapter/online_adapter.py +++ b/omnisafe/adapter/online_adapter.py @@ -47,9 +47,9 @@ def __init__( # pylint: disable=too-many-arguments self._env_id = env_id self._env = make(env_id, num_envs=num_envs) self._wrapper( - obs_normalize=cfgs.obs_normalize, - reward_normalize=cfgs.reward_normalize, - cost_normalize=cfgs.cost_normalize, + obs_normalize=cfgs.algo_cfgs.obs_normalize, + reward_normalize=cfgs.algo_cfgs.reward_normalize, + cost_normalize=cfgs.algo_cfgs.cost_normalize, ) self._env.set_seed(seed) diff --git a/omnisafe/adapter/onpolicy_adapter.py b/omnisafe/adapter/onpolicy_adapter.py index f816e20d4..a99bd7028 100644 --- a/omnisafe/adapter/onpolicy_adapter.py +++ b/omnisafe/adapter/onpolicy_adapter.py @@ -62,7 +62,7 @@ def roll_out( # pylint: disable=too-many-locals self._log_value(reward=reward, cost=cost, info=info) - if self._cfgs.use_cost: + if self._cfgs.algo_cfgs.use_cost: logger.store(**{'Value/cost': value_c}) logger.store(**{'Value/reward': value_r}) diff --git a/omnisafe/algorithms/__init__.py b/omnisafe/algorithms/__init__.py index 9c74117ac..784f2ee46 100644 --- a/omnisafe/algorithms/__init__.py +++ b/omnisafe/algorithms/__init__.py @@ -21,7 +21,7 @@ from omnisafe.algorithms.base_algo import BaseAlgo # On-Policy Safe -from omnisafe.algorithms.on_policy import ( # PPOLagSimmerPid,; PPOLagSimmerQ,; PPOSimmerPid,; PPOSimmerQ, +from omnisafe.algorithms.on_policy import ( CPO, CUP, FOCOPS, @@ -30,17 +30,11 @@ PPO, RCPO, TRPO, - CPPOPid, NaturalPG, OnCRPO, PolicyGradient, - PPOEarlyTerminated, PPOLag, - PPOLagEarlyTerminated, - PPOLagSaute, - PPOSaute, TRPOLag, - TRPOPid, ) diff --git a/omnisafe/algorithms/algo_wrapper.py b/omnisafe/algorithms/algo_wrapper.py index 19d6c5c74..2d2fef1c4 100644 --- a/omnisafe/algorithms/algo_wrapper.py +++ b/omnisafe/algorithms/algo_wrapper.py @@ -15,7 +15,6 @@ """Implementation of the AlgoWrapper Class.""" import difflib -import os import sys from typing import Any, Dict, Optional @@ -25,7 +24,7 @@ from omnisafe.algorithms import ALGORITHM2TYPE, ALGORITHMS, registry from omnisafe.utils import distributed -from omnisafe.utils.config import get_default_kwargs_yaml +from omnisafe.utils.config import check_all_configs, get_default_kwargs_yaml class AlgoWrapper: @@ -35,39 +34,62 @@ def 
__init__( self, algo: str, env_id: str, - parallel: int = 1, + train_terminal_cfgs: Optional[Dict[str, Any]] = None, custom_cfgs: Optional[Dict[str, Any]] = None, ): self.algo = algo - self.parallel = parallel self.env_id = env_id # algo_type will set in _init_checks() self.algo_type: str + + self.train_terminal_cfgs = train_terminal_cfgs self.custom_cfgs = custom_cfgs self.evaluator = None + self.cfgs = self._init_config() self._init_checks() + def _init_config(self): + """Init config.""" + assert self.algo in ALGORITHMS['all'], ( + f"{self.algo} doesn't exist. " + f"Did you mean {difflib.get_close_matches(self.algo, ALGORITHMS['all'], n=1)[0]}?" + ) + self.algo_type = ALGORITHM2TYPE.get(self.algo, '') + if self.algo_type is None or self.algo_type == '': + raise ValueError(f'{self.algo} is not supported!') + if self.algo_type in ['off-policy', 'model-based']: + assert ( + self.train_terminal_cfgs.parallel == 1 + ), 'off-policy or model-based only support parallel==1!' + cfgs = get_default_kwargs_yaml(self.algo, self.env_id, self.algo_type) + + # update the cfgs from custom configurations + if self.custom_cfgs: + cfgs.recurisve_update(self.custom_cfgs) + # update the cfgs from custom terminal configurations + if self.train_terminal_cfgs: + cfgs.train_cfgs.recurisve_update(self.train_terminal_cfgs) + + # the exp_name format is PPO-- + exp_name = f'{self.algo}-<{self.env_id}>' + cfgs.recurisve_update({'exp_name': exp_name, 'env_id': self.env_id}) + cfgs.train_cfgs.recurisve_update( + {'epochs': cfgs.train_cfgs.total_steps // cfgs.algo_cfgs.update_cycle} + ) + return cfgs + def _init_checks(self): """Init checks.""" assert isinstance(self.algo, str), 'algo must be a string!' - assert isinstance(self.parallel, int), 'parallel must be an integer!' - assert self.parallel > 0, 'parallel must be greater than 0!' + assert isinstance(self.cfgs.train_cfgs.parallel, int), 'parallel must be an integer!' + assert self.cfgs.train_cfgs.parallel > 0, 'parallel must be greater than 0!' assert ( isinstance(self.custom_cfgs, dict) or self.custom_cfgs is None ), 'custom_cfgs must be a dict!' - assert self.algo in ALGORITHMS['all'], ( - f"{self.algo} doesn't exist. " - f"Did you mean {difflib.get_close_matches(self.algo, ALGORITHMS['all'], n=1)[0]}?" - ) assert self.env_id in safe_registry, ( f"{self.env_id} doesn't exist. " f'Did you mean {difflib.get_close_matches(self.env_id, safe_registry, n=1)[0]}?' ) - self.algo_type = ALGORITHM2TYPE.get(self.algo, '') - if self.algo_type is None or self.algo_type == '': - raise ValueError(f'{self.algo} is not supported!') - if self.algo_type in ['off-policy', 'model-based']: - assert self.parallel == 1, 'off-policy or model-based only support parallel==1!' 
def learn(self): """Agent Learning.""" @@ -75,26 +97,20 @@ def learn(self): # If also hardware threading CPUs should be used # enable this by the use_number_of_threads=True physical_cores = psutil.cpu_count(logical=False) - use_number_of_threads = bool(self.parallel > physical_cores) - - cfgs = get_default_kwargs_yaml(self.algo, self.env_id, self.algo_type) - exp_name = os.path.join(self.env_id, self.algo) - cfgs.recurisve_update({'exp_name': exp_name, 'env_id': self.env_id}) - if self.custom_cfgs is not None: - cfgs.recurisve_update(self.custom_cfgs) - - # check_all_configs(cfgs, self.algo_type) - - torch.set_num_threads(cfgs.num_threads) + use_number_of_threads = bool(self.cfgs.train_cfgs.parallel > physical_cores) + check_all_configs(self.cfgs, self.algo_type) + torch.set_num_threads(self.cfgs.train_cfgs.torch_threads) if distributed.fork( - self.parallel, use_number_of_threads=use_number_of_threads, device=cfgs.device + self.cfgs.train_cfgs.parallel, + use_number_of_threads=use_number_of_threads, + device=self.cfgs.train_cfgs.device, ): # Re-launches the current script with workers linked by MPI sys.exit() agent = registry.get(self.algo)( env_id=self.env_id, - cfgs=cfgs, + cfgs=self.cfgs, ) ep_ret, ep_cost, ep_len = agent.learn() return ep_ret, ep_len, ep_cost diff --git a/omnisafe/algorithms/base_algo.py b/omnisafe/algorithms/base_algo.py index a1113de5b..caf19cb5d 100644 --- a/omnisafe/algorithms/base_algo.py +++ b/omnisafe/algorithms/base_algo.py @@ -35,8 +35,8 @@ def __init__(self, env_id: str, cfgs: Config) -> None: self._seed = cfgs.seed + distributed.get_rank() * 1000 seed_all(self._seed) - assert hasattr(cfgs, 'device'), 'Please specify the device in the config file.' - self._device = torch.device(self._cfgs.device) + assert hasattr(cfgs.train_cfgs, 'device'), 'Please specify the device in the config file.' 
+ self._device = torch.device(self._cfgs.train_cfgs.device) distributed.setup_distributed() diff --git a/omnisafe/algorithms/on_policy/__init__.py b/omnisafe/algorithms/on_policy/__init__.py index b155319bd..050006f64 100644 --- a/omnisafe/algorithms/on_policy/__init__.py +++ b/omnisafe/algorithms/on_policy/__init__.py @@ -14,26 +14,27 @@ # ============================================================================== """On-policy algorithms.""" -from omnisafe.algorithms.on_policy import ( # simmer, +from omnisafe.algorithms.on_policy import ( base, - early_terminated, first_order, naive_lagrange, penalty_function, - pid_lagrange, - saute, second_order, ) from omnisafe.algorithms.on_policy.base import PPO, TRPO, NaturalPG, PolicyGradient -from omnisafe.algorithms.on_policy.early_terminated import PPOEarlyTerminated, PPOLagEarlyTerminated + +# from omnisafe.algorithms.on_policy.early_terminated import PPOEarlyTerminated, PPOLagEarlyTerminated from omnisafe.algorithms.on_policy.first_order import CUP, FOCOPS from omnisafe.algorithms.on_policy.naive_lagrange import PDO, RCPO, OnCRPO, PPOLag, TRPOLag from omnisafe.algorithms.on_policy.penalty_function import IPO, P3O -from omnisafe.algorithms.on_policy.pid_lagrange import CPPOPid, TRPOPid -from omnisafe.algorithms.on_policy.saute import PPOLagSaute, PPOSaute + +# from omnisafe.algorithms.on_policy.saute import PPOLagSaute, PPOSaute from omnisafe.algorithms.on_policy.second_order import CPO, PCPO +# from omnisafe.algorithms.on_policy.pid_lagrange import CPPOPid, TRPOPid + + # from omnisafe.algorithms.on_policy.simmer import ( # PPOLagSimmerPid, # PPOLagSimmerQ, @@ -44,12 +45,12 @@ __all__ = [ *base.__all__, - *early_terminated.__all__, + # *early_terminated.__all__, *first_order.__all__, *naive_lagrange.__all__, *penalty_function.__all__, - *pid_lagrange.__all__, - *saute.__all__, + # *pid_lagrange.__all__, + # *saute.__all__, *second_order.__all__, # *simmer.__all__, ] diff --git a/omnisafe/algorithms/on_policy/base/natural_pg.py b/omnisafe/algorithms/on_policy/base/natural_pg.py index be36d8723..cf33dca82 100644 --- a/omnisafe/algorithms/on_policy/base/natural_pg.py +++ b/omnisafe/algorithms/on_policy/base/natural_pg.py @@ -80,7 +80,7 @@ def _fvp(self, params: torch.Tensor) -> torch.Tensor: flat_grad_grad_kl = torch.cat([grad.contiguous().view(-1) for grad in grads]) distributed.avg_tensor(flat_grad_grad_kl) - return flat_grad_grad_kl + params * self._cfgs.cg_damping + return flat_grad_grad_kl + params * self._cfgs.algo_cfgs.cg_damping def _update_actor( # pylint: disable=too-many-arguments, too-many-locals self, @@ -100,11 +100,11 @@ def _update_actor( # pylint: disable=too-many-arguments, too-many-locals distributed.avg_grads(self._actor_critic.actor) grad = -get_flat_gradients_from(self._actor_critic.actor) - x = conjugate_gradients(self._fvp, grad, self._cfgs.cg_iters) + x = conjugate_gradients(self._fvp, grad, self._cfgs.algo_cfgs.cg_iters) assert torch.isfinite(x).all(), 'x is not finite' xHx = torch.dot(x, self._fvp(x)) assert xHx.item() >= 0, 'xHx is negative' - alpha = torch.sqrt(2 * self._cfgs.target_kl / (xHx + 1e-8)) + alpha = torch.sqrt(2 * self._cfgs.algo_cfgs.target_kl / (xHx + 1e-8)) step_direction = x * alpha assert torch.isfinite(step_direction).all(), 'step_direction is not finite' diff --git a/omnisafe/algorithms/on_policy/base/policy_gradient.py b/omnisafe/algorithms/on_policy/base/policy_gradient.py index a63fd5411..ae3ec1d22 100644 --- a/omnisafe/algorithms/on_policy/base/policy_gradient.py +++ 
b/omnisafe/algorithms/on_policy/base/policy_gradient.py @@ -43,12 +43,16 @@ class PolicyGradient(BaseAlgo): """ def _init_env(self) -> None: - self._env = OnPolicyAdapter(self._env_id, self._cfgs.num_envs, self._seed, self._cfgs) - assert self._cfgs.steps_per_epoch % (distributed.world_size() * self._cfgs.num_envs) == 0, ( - 'The number of steps per epoch is not divisible by the number of ' 'environments.' + self._env = OnPolicyAdapter( + self._env_id, self._cfgs.train_cfgs.vector_env_nums, self._seed, self._cfgs ) + assert (self._cfgs.algo_cfgs.update_cycle) % ( + distributed.world_size() * self._cfgs.train_cfgs.vector_env_nums + ) == 0, ('The number of steps per epoch is not divisible by the number of ' 'environments.') self._steps_per_epoch = ( - self._cfgs.steps_per_epoch // distributed.world_size() // self._cfgs.num_envs + self._cfgs.algo_cfgs.update_cycle + // distributed.world_size() + // self._cfgs.train_cfgs.vector_env_nums ) def _init_model(self) -> None: @@ -56,16 +60,16 @@ def _init_model(self) -> None: obs_space=self._env.observation_space, act_space=self._env.action_space, model_cfgs=self._cfgs.model_cfgs, - epochs=self._cfgs.epochs, + epochs=self._cfgs.train_cfgs.epochs, ).to(self._device) if distributed.world_size() > 1: distributed.sync_params(self._actor_critic) - if self._cfgs.exploration_noise_anneal: + if self._cfgs.model_cfgs.exploration_noise_anneal: self._actor_critic.set_annealing( - epochs=[0, self._cfgs.epochs], - std=self._cfgs.std, + epochs=[0, self._cfgs.train_cfgs.epochs], + std=self._cfgs.model_cfgs.std, ) def _init(self) -> None: @@ -73,24 +77,24 @@ def _init(self) -> None: obs_space=self._env.observation_space, act_space=self._env.action_space, size=self._steps_per_epoch, - gamma=self._cfgs.buffer_cfgs.gamma, - lam=self._cfgs.buffer_cfgs.lam, - lam_c=self._cfgs.buffer_cfgs.lam_c, - advantage_estimator=self._cfgs.buffer_cfgs.adv_estimation_method, - standardized_adv_r=self._cfgs.buffer_cfgs.standardized_rew_adv, - standardized_adv_c=self._cfgs.buffer_cfgs.standardized_cost_adv, - penalty_coefficient=self._cfgs.penalty_param, - num_envs=self._cfgs.num_envs, + gamma=self._cfgs.algo_cfgs.gamma, + lam=self._cfgs.algo_cfgs.lam, + lam_c=self._cfgs.algo_cfgs.lam_c, + advantage_estimator=self._cfgs.algo_cfgs.adv_estimation_method, + standardized_adv_r=self._cfgs.algo_cfgs.standardized_rew_adv, + standardized_adv_c=self._cfgs.algo_cfgs.standardized_cost_adv, + penalty_coefficient=self._cfgs.algo_cfgs.penalty_coef, + num_envs=self._cfgs.train_cfgs.vector_env_nums, device=self._device, ) def _init_log(self) -> None: self._logger = Logger( - output_dir=self._cfgs.data_dir, + output_dir=self._cfgs.logger_cfgs.log_dir, exp_name=self._cfgs.exp_name, seed=self._cfgs.seed, - use_tensorboard=self._cfgs.use_tensorboard, - use_wandb=self._cfgs.use_wandb, + use_tensorboard=self._cfgs.logger_cfgs.use_tensorboard, + use_wandb=self._cfgs.logger_cfgs.use_wandb, config=self._cfgs, ) @@ -126,7 +130,7 @@ def _init_log(self) -> None: self._logger.register_key('Loss/Loss_reward_critic', delta=True) self._logger.register_key('Value/reward') - if self._cfgs.use_cost: + if self._cfgs.algo_cfgs.use_cost: # log information about cost critic self._logger.register_key('Loss/Loss_cost_critic', delta=True) self._logger.register_key('Value/cost') @@ -147,12 +151,9 @@ def learn(self) -> Tuple[Union[int, float], ...]: start_time = time.time() self._logger.log('INFO: Start training') - for epoch in range(self._cfgs.epochs): + for epoch in range(self._cfgs.train_cfgs.epochs): epoch_time = time.time() 
- # if self._cfgs.exploration_noise_anneal: - # self._actor_critic.anneal_exploration(frac=epoch / self._cfgs.epochs) - roll_out_time = time.time() self._env.roll_out( steps_per_epoch=self._steps_per_epoch, @@ -166,25 +167,29 @@ def learn(self) -> Tuple[Union[int, float], ...]: self._update() self._logger.store(**{'Time/Update': time.time() - update_time}) - self._actor_critic.actor_scheduler.step() - if self._cfgs.exploration_noise_anneal: + if self._cfgs.model_cfgs.exploration_noise_anneal: self._actor_critic.annealing(epoch) + if self._cfgs.model_cfgs.actor.lr != 'None': + self._actor_critic.actor_scheduler.step() + self._logger.store( **{ - 'TotalEnvSteps': (epoch + 1) * self._cfgs.steps_per_epoch, - 'Time/FPS': self._cfgs.steps_per_epoch / (time.time() - epoch_time), + 'TotalEnvSteps': (epoch + 1) * self._cfgs.algo_cfgs.update_cycle, + 'Time/FPS': self._cfgs.algo_cfgs.update_cycle / (time.time() - epoch_time), 'Time/Total': (time.time() - start_time), 'Time/Epoch': (time.time() - epoch_time), 'Train/Epoch': epoch, - 'Train/LR': self._actor_critic.actor_scheduler.get_last_lr()[0], + 'Train/LR': 0.0 + if self._cfgs.model_cfgs.actor.lr == 'None' + else self._actor_critic.actor_scheduler.get_last_lr()[0], } ) self._logger.dump_tabular() # save model to disk - if (epoch + 1) % self._cfgs.save_freq == 0: + if (epoch + 1) % self._cfgs.logger_cfgs.save_model_freq == 0: self._logger.torch_save() ep_ret = self._logger.get_stats('Metrics/EpRet')[0] @@ -211,11 +216,11 @@ def _update(self) -> None: dataloader = DataLoader( dataset=TensorDataset(obs, act, logp, target_value_r, target_value_c, adv_r, adv_c), - batch_size=self._cfgs.num_mini_batches, + batch_size=self._cfgs.algo_cfgs.batch_size, shuffle=True, ) - for i in range(self._cfgs.actor_iters): + for i in range(self._cfgs.algo_cfgs.update_iters): for ( obs, act, @@ -226,7 +231,7 @@ def _update(self) -> None: adv_c, ) in dataloader: self._update_rewrad_critic(obs, target_value_r) - if self._cfgs.use_cost: + if self._cfgs.algo_cfgs.use_cost: self._update_cost_critic(obs, target_value_c) self._update_actor(obs, act, logp, adv_r, adv_c) @@ -240,7 +245,7 @@ def _update(self) -> None: ) kl = distributed.dist_avg(kl) - if self._cfgs.kl_early_stopping and kl > self._cfgs.target_kl: + if self._cfgs.algo_cfgs.kl_early_stop and kl > self._cfgs.algo_cfgs.target_kl: self._logger.log(f'Early stopping at iter {i} due to reaching max kl') break @@ -256,15 +261,15 @@ def _update_rewrad_critic(self, obs: torch.Tensor, target_value_r: torch.Tensor) self._actor_critic.reward_critic_optimizer.zero_grad() loss = nn.functional.mse_loss(self._actor_critic.reward_critic(obs)[0], target_value_r) - if self._cfgs.use_critic_norm: + if self._cfgs.algo_cfgs.use_critic_norm: for param in self._actor_critic.reward_critic.parameters(): - loss += param.pow(2).sum() * self._cfgs.critic_norm_coeff + loss += param.pow(2).sum() * self._cfgs.algo_cfgs.critic_norm_coef loss.backward() - if self._cfgs.use_max_grad_norm: + if self._cfgs.algo_cfgs.use_max_grad_norm: torch.nn.utils.clip_grad_norm_( - self._actor_critic.reward_critic.parameters(), self._cfgs.max_grad_norm + self._actor_critic.reward_critic.parameters(), self._cfgs.algo_cfgs.max_grad_norm ) distributed.avg_grads(self._actor_critic.reward_critic) self._actor_critic.reward_critic_optimizer.step() @@ -275,15 +280,15 @@ def _update_cost_critic(self, obs: torch.Tensor, target_value_c: torch.Tensor) - self._actor_critic.cost_critic_optimizer.zero_grad() loss = nn.functional.mse_loss(self._actor_critic.cost_critic(obs)[0], 
target_value_c) - if self._cfgs.use_critic_norm: + if self._cfgs.algo_cfgs.use_critic_norm: for param in self._actor_critic.cost_critic.parameters(): - loss += param.pow(2).sum() * self._cfgs.critic_norm_coeff + loss += param.pow(2).sum() * self._cfgs.algo_cfgs.critic_norm_coef loss.backward() - if self._cfgs.use_max_grad_norm: + if self._cfgs.algo_cfgs.use_max_grad_norm: torch.nn.utils.clip_grad_norm_( - self._actor_critic.cost_critic.parameters(), self._cfgs.max_grad_norm + self._actor_critic.cost_critic.parameters(), self._cfgs.algo_cfgs.max_grad_norm ) distributed.avg_grads(self._actor_critic.cost_critic) self._actor_critic.cost_critic_optimizer.step() @@ -302,9 +307,9 @@ def _update_actor( # pylint: disable=too-many-arguments loss, info = self._loss_pi(obs, act, logp, adv) self._actor_critic.actor_optimizer.zero_grad() loss.backward() - if self._cfgs.use_max_grad_norm: + if self._cfgs.algo_cfgs.use_max_grad_norm: torch.nn.utils.clip_grad_norm_( - self._actor_critic.actor.parameters(), self._cfgs.max_grad_norm + self._actor_critic.actor.parameters(), self._cfgs.algo_cfgs.max_grad_norm ) distributed.avg_grads(self._actor_critic.actor) self._actor_critic.actor_optimizer.step() diff --git a/omnisafe/algorithms/on_policy/base/ppo.py b/omnisafe/algorithms/on_policy/base/ppo.py index 0cb3f6e10..4df3f2416 100644 --- a/omnisafe/algorithms/on_policy/base/ppo.py +++ b/omnisafe/algorithms/on_policy/base/ppo.py @@ -57,9 +57,11 @@ def _loss_pi( logp_ = self._actor_critic.actor.log_prob(act) std = self._actor_critic.actor.std ratio = torch.exp(logp_ - logp) - ratio_cliped = torch.clamp(ratio, 1 - self._cfgs.clip, 1 + self._cfgs.clip) + ratio_cliped = torch.clamp( + ratio, 1 - self._cfgs.algo_cfgs.clip, 1 + self._cfgs.algo_cfgs.clip + ) loss = -torch.min(ratio * adv, ratio_cliped * adv).mean() - loss += self._cfgs.entropy_coef * distribution.entropy().mean() + loss += self._cfgs.algo_cfgs.entropy_coef * distribution.entropy().mean() # useful extra info entrophy = distribution.entropy().mean().item() info = {'entrophy': entrophy, 'ratio': ratio.mean().item(), 'std': std} diff --git a/omnisafe/algorithms/on_policy/base/trpo.py b/omnisafe/algorithms/on_policy/base/trpo.py index ebc19d2b2..6285954ce 100644 --- a/omnisafe/algorithms/on_policy/base/trpo.py +++ b/omnisafe/algorithms/on_policy/base/trpo.py @@ -110,7 +110,7 @@ def _search_step_size( self._logger.log('WARNING: loss_pi not finite') elif loss_improve < 0: self._logger.log('INFO: did not improve improve <0') - elif kl > self._cfgs.target_kl * 1.5: + elif kl > self._cfgs.algo_cfgs.target_kl * 1.5: self._logger.log('INFO: violated KL constraint.') else: # step only if surrogate is improved and when within trust reg. 
@@ -165,11 +165,11 @@ def _update_actor( # pylint: disable=too-many-arguments,too-many-locals distributed.avg_grads(self._actor_critic.actor) grad = -get_flat_gradients_from(self._actor_critic.actor) - x = conjugate_gradients(self._fvp, grad, self._cfgs.cg_iters) + x = conjugate_gradients(self._fvp, grad, self._cfgs.algo_cfgs.cg_iters) assert torch.isfinite(x).all(), 'x is not finite' xHx = torch.dot(x, self._fvp(x)) assert xHx.item() >= 0, 'xHx is negative' - alpha = torch.sqrt(2 * self._cfgs.target_kl / (xHx + 1e-8)) + alpha = torch.sqrt(2 * self._cfgs.algo_cfgs.target_kl / (xHx + 1e-8)) step_direction = x * alpha assert torch.isfinite(step_direction).all(), 'step_direction is not finite' diff --git a/omnisafe/algorithms/on_policy/early_terminated/__init__.py b/omnisafe/algorithms/on_policy/early_terminated/__init__.py deleted file mode 100644 index f6493344f..000000000 --- a/omnisafe/algorithms/on_policy/early_terminated/__init__.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Early terminated algorithms.""" - -from omnisafe.algorithms.on_policy.early_terminated.ppo_early_terminated import PPOEarlyTerminated -from omnisafe.algorithms.on_policy.early_terminated.ppo_lag_early_terminated import ( - PPOLagEarlyTerminated, -) - - -__all__ = [ - 'PPOEarlyTerminated', - 'PPOLagEarlyTerminated', -] diff --git a/omnisafe/algorithms/on_policy/early_terminated/ppo_early_terminated.py b/omnisafe/algorithms/on_policy/early_terminated/ppo_early_terminated.py deleted file mode 100644 index 508773acf..000000000 --- a/omnisafe/algorithms/on_policy/early_terminated/ppo_early_terminated.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Implementation of the early terminated algorithm using PPO.""" - -from omnisafe.adapter import EarlyTerminatedAdapter -from omnisafe.algorithms import registry -from omnisafe.algorithms.on_policy.base.ppo import PPO - - -@registry.register -class PPOEarlyTerminated(PPO): - """The early terminated algorithm implemented with PPO. - - References: - Title: Safe Exploration by Solving Early Terminated MDP - Authors: Hao Sun, Ziping Xu, Meng Fang, Zhenghao Peng, Jiadong Guo, Bo Dai, Bolei Zhou. 
- URL: `Safe Exploration by Solving Early Terminated MDP `_ - """ - - def _init_env(self) -> None: - self._env = EarlyTerminatedAdapter( - self._env_id, self._cfgs.num_envs, self._seed, self._cfgs - ) - self._steps_per_epoch = self._cfgs.steps_per_epoch diff --git a/omnisafe/algorithms/on_policy/early_terminated/ppo_lag_early_terminated.py b/omnisafe/algorithms/on_policy/early_terminated/ppo_lag_early_terminated.py deleted file mode 100644 index 1b546b984..000000000 --- a/omnisafe/algorithms/on_policy/early_terminated/ppo_lag_early_terminated.py +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Implementation of the Lagrange version of the early terminated algorithm using PPOLag.""" - - -from omnisafe.adapter import EarlyTerminatedAdapter -from omnisafe.algorithms import registry -from omnisafe.algorithms.on_policy.naive_lagrange.ppo_lag import PPOLag - - -@registry.register -class PPOLagEarlyTerminated(PPOLag): - """The Lagrange version of the early terminated algorithm implemented with PPOLag. - - References: - Title: Safe Exploration by Solving Early Terminated MDP - Authors: Hao Sun, Ziping Xu, Meng Fang, Zhenghao Peng, Jiadong Guo, Bo Dai, Bolei Zhou. 
- URL: `Safe Exploration by Solving Early Terminated MDP `_ - """ - - def _init_env(self) -> None: - self._env = EarlyTerminatedAdapter( - self._env_id, self._cfgs.num_envs, self._seed, self._cfgs - ) - self._steps_per_epoch = self._cfgs.steps_per_epoch diff --git a/omnisafe/algorithms/on_policy/first_order/cup.py b/omnisafe/algorithms/on_policy/first_order/cup.py index 3f0969685..2f3bf2aa4 100644 --- a/omnisafe/algorithms/on_policy/first_order/cup.py +++ b/omnisafe/algorithms/on_policy/first_order/cup.py @@ -91,8 +91,8 @@ def _loss_pi_cost(self, obs, act, logp, adv_c): kl = torch.distributions.kl_divergence(distribution, self._p_dist).sum(-1, keepdim=True) - coef = (1 - self._cfgs.buffer_cfgs.gamma * self._cfgs.buffer_cfgs.lam) / ( - 1 - self._cfgs.buffer_cfgs.gamma + coef = (1 - self._cfgs.algo_cfgs.gamma * self._cfgs.algo_cfgs.lam) / ( + 1 - self._cfgs.algo_cfgs.gamma ) loss = (self._lagrange.lagrangian_multiplier * coef * ratio * adv_c + kl).mean() @@ -138,19 +138,19 @@ def _update(self) -> None: dataloader = DataLoader( dataset=TensorDataset(obs, act, logp, adv_c, old_mean, old_std), - batch_size=self._cfgs.num_mini_batches, + batch_size=self._cfgs.algo_cfgs.batch_size, shuffle=True, ) - for i in range(self._cfgs.actor_iters): + for i in range(self._cfgs.algo_cfgs.update_iters): for obs, act, logp, adv_c, old_mean, old_std in dataloader: self._p_dist = Normal(old_mean, old_std) loss_cost, info = self._loss_pi_cost(obs, act, logp, adv_c) self._actor_critic.actor_optimizer.zero_grad() loss_cost.backward() - if self._cfgs.max_grad_norm is not None: + if self._cfgs.algo_cfgs.max_grad_norm is not None: torch.nn.utils.clip_grad_norm_( - self._actor_critic.actor.parameters(), self._cfgs.max_grad_norm + self._actor_critic.actor.parameters(), self._cfgs.algo_cfgs.max_grad_norm ) distributed.avg_grads(self._actor_critic.actor) self._actor_critic.actor_optimizer.step() @@ -165,7 +165,7 @@ def _update(self) -> None: ) kl = distributed.dist_avg(kl) - if self._cfgs.kl_early_stopping and kl > self._cfgs.target_kl: + if self._cfgs.algo_cfgs.kl_early_stop and kl > self._cfgs.algo_cfgs.target_kl: self._logger.log(f'Early stopping at iter {i} due to reaching max kl') break diff --git a/omnisafe/algorithms/on_policy/first_order/focops.py b/omnisafe/algorithms/on_policy/first_order/focops.py index 0856f04a7..7006a484e 100644 --- a/omnisafe/algorithms/on_policy/first_order/focops.py +++ b/omnisafe/algorithms/on_policy/first_order/focops.py @@ -58,11 +58,11 @@ def _loss_pi( ratio = torch.exp(logp_ - logp) kl = torch.distributions.kl_divergence(distribution, self._p_dist).sum(-1, keepdim=True) - loss = (kl - (1 / self._cfgs.lam) * ratio * adv) * (kl.detach() <= self._cfgs.eta).type( - torch.float32 - ) + loss = (kl - (1 / self._cfgs.algo_cfgs.focops_lam) * ratio * adv) * ( + kl.detach() <= self._cfgs.algo_cfgs.focops_eta + ).type(torch.float32) loss = loss.mean() - loss -= self._cfgs.entropy_coef * distribution.entropy().mean() + loss -= self._cfgs.algo_cfgs.entropy_coef * distribution.entropy().mean() entrophy = distribution.entropy().mean().item() info = {'entrophy': entrophy, 'ratio': ratio.mean().item(), 'std': std} @@ -104,11 +104,11 @@ def _update(self) -> None: dataset=TensorDataset( obs, act, logp, target_value_r, target_value_c, adv_r, adv_c, old_mean, old_std ), - batch_size=self._cfgs.num_mini_batches, + batch_size=self._cfgs.algo_cfgs.batch_size, shuffle=True, ) - for i in range(self._cfgs.actor_iters): + for i in range(self._cfgs.algo_cfgs.update_iters): for ( obs, act, @@ -121,7 +121,7 @@ def 
_update(self) -> None: old_std, ) in dataloader: self._update_rewrad_critic(obs, target_value_r) - if self._cfgs.use_cost: + if self._cfgs.algo_cfgs.use_cost: self._update_cost_critic(obs, target_value_c) self._p_dist = Normal(old_mean, old_std) @@ -137,7 +137,7 @@ def _update(self) -> None: ) kl = distributed.dist_avg(kl) - if self._cfgs.kl_early_stopping and kl > self._cfgs.target_kl: + if self._cfgs.algo_cfgs.kl_early_stop and kl > self._cfgs.algo_cfgs.target_kl: self._logger.log(f'Early stopping at iter {i} due to reaching max kl') break diff --git a/omnisafe/algorithms/on_policy/naive_lagrange/crpo.py b/omnisafe/algorithms/on_policy/naive_lagrange/crpo.py index acfe874e1..2d7b50f43 100644 --- a/omnisafe/algorithms/on_policy/naive_lagrange/crpo.py +++ b/omnisafe/algorithms/on_policy/naive_lagrange/crpo.py @@ -52,7 +52,7 @@ def _update(self) -> None: def _compute_adv_surrogate(self, adv_r: torch.Tensor, adv_c: torch.Tensor) -> torch.Tensor: Jc = self._logger.get_stats('Metrics/EpCost')[0] - if Jc <= self._cfgs.cost_limit + self._cfgs.distance: + if Jc <= self._cfgs.algo_cfgs.cost_limit + self._cfgs.algo_cfgs.distance: self._rew_update += 1 return adv_r self._cost_update += 1 diff --git a/omnisafe/algorithms/on_policy/penalty_function/ipo.py b/omnisafe/algorithms/on_policy/penalty_function/ipo.py index 222c1493e..c65a80df1 100644 --- a/omnisafe/algorithms/on_policy/penalty_function/ipo.py +++ b/omnisafe/algorithms/on_policy/penalty_function/ipo.py @@ -37,9 +37,9 @@ def _init_log(self) -> None: def _compute_adv_surrogate(self, adv_r: torch.Tensor, adv_c: torch.Tensor) -> torch.Tensor: """Compute surrogate loss.""" Jc = self._logger.get_stats('Metrics/EpCost')[0] - penalty = self._cfgs.kappa / (self._cfgs.cost_limit - Jc + 1e-8) - if penalty < 0 or penalty > self._cfgs.penalty_max: - penalty = self._cfgs.penalty_max + penalty = self._cfgs.algo_cfgs.kappa / (self._cfgs.algo_cfgs.cost_limit - Jc + 1e-8) + if penalty < 0 or penalty > self._cfgs.algo_cfgs.penalty_max: + penalty = self._cfgs.algo_cfgs.penalty_max self._logger.store(**{'Misc/Penalty': penalty}) diff --git a/omnisafe/algorithms/on_policy/penalty_function/p3o.py b/omnisafe/algorithms/on_policy/penalty_function/p3o.py index 1fc94881f..debf4c0a6 100644 --- a/omnisafe/algorithms/on_policy/penalty_function/p3o.py +++ b/omnisafe/algorithms/on_policy/penalty_function/p3o.py @@ -47,8 +47,8 @@ def _loss_pi_cost( logp_ = self._actor_critic.actor.log_prob(act) ratio = torch.exp(logp_ - logp) surr_cadv = (ratio * adv_c).mean() - Jc = self._logger.get_stats('Metrics/EpCost')[0] - self._cfgs.cost_limit - loss_cost = self._cfgs.kappa * F.relu(surr_cadv + Jc) + Jc = self._logger.get_stats('Metrics/EpCost')[0] - self._cfgs.algo_cfgs.cost_limit + loss_cost = self._cfgs.algo_cfgs.kappa * F.relu(surr_cadv + Jc) return loss_cost.mean() def _update_actor( @@ -87,13 +87,13 @@ def _update_actor( loss_reward, info = self._loss_pi(obs, act, logp, adv_r) loss_cost = self._loss_pi_cost(obs, act, logp, adv_c) - loss = loss_reward - loss_cost + loss = loss_reward + loss_cost self._actor_critic.actor_optimizer.zero_grad() loss.backward() - if self._cfgs.use_max_grad_norm: + if self._cfgs.algo_cfgs.use_max_grad_norm: torch.nn.utils.clip_grad_norm_( - self._actor_critic.actor.parameters(), self._cfgs.max_grad_norm + self._actor_critic.actor.parameters(), self._cfgs.algo_cfgs.max_grad_norm ) distributed.avg_grads(self._actor_critic.actor) self._actor_critic.actor_optimizer.step() diff --git a/omnisafe/algorithms/on_policy/pid_lagrange/__init__.py 
b/omnisafe/algorithms/on_policy/pid_lagrange/__init__.py deleted file mode 100644 index 2203bfc44..000000000 --- a/omnisafe/algorithms/on_policy/pid_lagrange/__init__.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""PID Lagrange algorithms.""" - -from omnisafe.algorithms.on_policy.pid_lagrange.cppo_pid import CPPOPid -from omnisafe.algorithms.on_policy.pid_lagrange.trpo_pid import TRPOPid - - -__all__ = [ - 'CPPOPid', - 'TRPOPid', -] diff --git a/omnisafe/algorithms/on_policy/pid_lagrange/cppo_pid.py b/omnisafe/algorithms/on_policy/pid_lagrange/cppo_pid.py deleted file mode 100644 index 64ad66dcd..000000000 --- a/omnisafe/algorithms/on_policy/pid_lagrange/cppo_pid.py +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Implementation of the PID-Lagrange version of the CPPO algorithm.""" - -import torch - -from omnisafe.algorithms import registry -from omnisafe.algorithms.on_policy.base.ppo import PPO -from omnisafe.common.pid_lagrange import PIDLagrangian - - -@registry.register -class CPPOPid(PPO): - r"""The PID-Lagrange version of the CPPO algorithm. - - Similar to :class:`PDO`, which is a simple combination of :class:`PolicyGradient` and :class:`Lagrange`, - this class is a simple combination of :class:`PolicyGradient` and :class:`PIDLagrangian`. - - .. note:: - The PID-Lagrange is more general than the Lagrange, and can be used in any policy gradient algorithm. - (``omnisafe`` provide the PID-Lagrange version of the PPO (just this class) and TRPO.) - Furthermore, it is more stable than the naive Lagrange. - - References: - - Title: Responsive Safety in Reinforcement Learning by PID Lagrangian Methods - - Authors: Joshua Achiam, David Held, Aviv Tamar, Pieter Abbeel. 
- - URL: https://arxiv.org/abs/2007.03964 - """ - - def _init(self) -> None: - super()._init() - self._pid_lag = PIDLagrangian(**self._cfgs.PID_cfgs) - - def _init_log(self) -> None: - super()._init_log() - self._logger.register_key('Metrics/LagrangeMultiplier') - self._logger.register_key('PID/pid_Kp') - self._logger.register_key('PID/pid_Ki') - self._logger.register_key('PID/pid_Kd') - - def _compute_adv_surrogate(self, adv_r: torch.Tensor, adv_c: torch.Tensor) -> torch.Tensor: - penalty = self._pid_lag.cost_penalty - return (adv_r - penalty * adv_c) / (1 + penalty) - - def _update(self) -> None: - r"""Update actor, critic, running statistics as we used in the :class:`PolicyGradient` algorithm. - - Additionally, we update the Lagrange multiplier parameter, - by calling the :meth:`update_lagrange_multiplier` method. - - .. note:: - The :meth:`compute_loss_pi` is defined in the :class:`PolicyGradient` algorithm. - When a lagrange multiplier is used, - the :meth:`compute_loss_pi` method will return the loss of the policy as: - - .. math:: - L_{\pi} = \mathbb{E}_{s_t \sim \rho_{\pi}} \left[ \frac{\pi_\theta(a_t|s_t)}{\pi_\theta^{old}(a_t|s_t)} - [A^{R}(s_t, a_t) - \lambda A^{C}(s_t, a_t)] \right] - - where :math:`\lambda` is the Lagrange multiplier parameter. - """ - # note that logger already uses MPI statistics across all processes.. - Jc = self._logger.get_stats('Metrics/EpCost')[0] - # first update Lagrange multiplier parameter - self._pid_lag.pid_update(Jc) - # then update the policy and value function - super()._update() - - self._logger.store( - **{ - 'Metrics/LagrangeMultiplier': self._pid_lag.cost_penalty, - 'PID/pid_Kp': self._pid_lag.pid_kp, - 'PID/pid_Ki': self._pid_lag.pid_ki, - 'PID/pid_Kd': self._pid_lag.pid_kd, - } - ) diff --git a/omnisafe/algorithms/on_policy/pid_lagrange/trpo_pid.py b/omnisafe/algorithms/on_policy/pid_lagrange/trpo_pid.py deleted file mode 100644 index 35a303e23..000000000 --- a/omnisafe/algorithms/on_policy/pid_lagrange/trpo_pid.py +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Implementation of the PID-Lagrange version of the TRPO algorithm.""" - -import torch - -from omnisafe.algorithms import registry -from omnisafe.algorithms.on_policy.base.trpo import TRPO -from omnisafe.common.pid_lagrange import PIDLagrangian - - -@registry.register -class TRPOPid(TRPO): - """The PID-Lagrange version of the TRPO algorithm. - - References: - - Title: Responsive Safety in Reinforcement Learning by PID Lagrangian Methods - - Authors: Joshua Achiam, David Held, Aviv Tamar, Pieter Abbeel. 
- - URL: https://arxiv.org/abs/2007.03964 - """ - - def _init(self) -> None: - super()._init() - self._pid_lag = PIDLagrangian(**self._cfgs.PID_cfgs) - - def _init_log(self) -> None: - super()._init_log() - self._logger.register_key('Metrics/LagrangeMultiplier') - self._logger.register_key('PID/pid_Kp') - self._logger.register_key('PID/pid_Ki') - self._logger.register_key('PID/pid_Kd') - - def _compute_adv_surrogate(self, adv_r: torch.Tensor, adv_c: torch.Tensor) -> torch.Tensor: - penalty = self._pid_lag.cost_penalty - return (adv_r - penalty * adv_c) / (1 + penalty) - - def _update(self) -> None: - r"""Update actor, critic, running statistics as we used in the :class:`PolicyGradient` algorithm. - - Additionally, we update the Lagrange multiplier parameter, - by calling the :meth:`update_lagrange_multiplier` method. - - .. note:: - The :meth:`compute_loss_pi` is defined in the :class:`PolicyGradient` algorithm. - When a lagrange multiplier is used, - the :meth:`compute_loss_pi` method will return the loss of the policy as: - - .. math:: - L_{\pi} = \mathbb{E}_{s_t \sim \rho_{\pi}} \left[ \frac{\pi_\theta(a_t|s_t)}{\pi_\theta^{old}(a_t|s_t)} - [A^{R}(s_t, a_t) - \lambda A^{C}(s_t, a_t)] \right] - - where :math:`\lambda` is the Lagrange multiplier parameter. - """ - # note that logger already uses MPI statistics across all processes.. - Jc = self._logger.get_stats('Metrics/EpCost')[0] - # first update Lagrange multiplier parameter - self._pid_lag.pid_update(Jc) - # then update the policy and value function - super()._update() - - self._logger.store( - **{ - 'Metrics/LagrangeMultiplier': self._pid_lag.cost_penalty, - 'PID/pid_Kp': self._pid_lag.pid_kp, - 'PID/pid_Ki': self._pid_lag.pid_ki, - 'PID/pid_Kd': self._pid_lag.pid_kd, - } - ) diff --git a/omnisafe/algorithms/on_policy/saute/__init__.py b/omnisafe/algorithms/on_policy/saute/__init__.py deleted file mode 100644 index 57902b6f1..000000000 --- a/omnisafe/algorithms/on_policy/saute/__init__.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Saute algorithms.""" - -from omnisafe.algorithms.on_policy.saute.ppo_lag_saute import PPOLagSaute -from omnisafe.algorithms.on_policy.saute.ppo_saute import PPOSaute - - -__all__ = [ - 'PPOLagSaute', - 'PPOSaute', -] diff --git a/omnisafe/algorithms/on_policy/saute/ppo_lag_saute.py b/omnisafe/algorithms/on_policy/saute/ppo_lag_saute.py deleted file mode 100644 index f8b9970ea..000000000 --- a/omnisafe/algorithms/on_policy/saute/ppo_lag_saute.py +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Implementation of the Lagrange version of the Saute algorithm using PPOLag.""" - -from omnisafe.adapter import SauteAdapter -from omnisafe.algorithms import registry -from omnisafe.algorithms.on_policy.naive_lagrange.ppo_lag import PPOLag -from omnisafe.utils import distributed - - -@registry.register -class PPOLagSaute(PPOLag): - """The Saute algorithm implemented with PPOLag. - - References: - - Title: Saute RL: Almost Surely Safe Reinforcement Learning Using State Augmentation - - Authors: Aivar Sootla, Alexander I. Cowen-Rivers, Taher Jafferjee, - Ziyan Wang, David Mguni, Jun Wang, Haitham Bou-Ammar. - - URL: `Saute RL`_ - """ - - def _init_env(self) -> None: - self._env = SauteAdapter(self._env_id, self._cfgs.num_envs, self._seed, self._cfgs) - assert self._cfgs.steps_per_epoch % (distributed.world_size() * self._cfgs.num_envs) == 0, ( - 'The number of steps per epoch is not divisible by the number of ' 'environments.' - ) - self._steps_per_epoch = ( - self._cfgs.steps_per_epoch // distributed.world_size() // self._cfgs.num_envs - ) - - def _init_log(self) -> None: - super()._init_log() - self._logger.register_key('Metrics/EpBudget') diff --git a/omnisafe/algorithms/on_policy/saute/ppo_saute.py b/omnisafe/algorithms/on_policy/saute/ppo_saute.py deleted file mode 100644 index 7ee288198..000000000 --- a/omnisafe/algorithms/on_policy/saute/ppo_saute.py +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Implementation of the Saute algorithm.""" - -from omnisafe.adapter import SauteAdapter -from omnisafe.algorithms import registry -from omnisafe.algorithms.on_policy.base.ppo import PPO -from omnisafe.utils import distributed - - -@registry.register -class PPOSaute(PPO): - """The Saute algorithm implemented with PPO. - - References: - - Title: Saute RL: Almost Surely Safe Reinforcement Learning Using State Augmentation - - Authors: Aivar Sootla, Alexander I. Cowen-Rivers, Taher Jafferjee, - Ziyan Wang, David Mguni, Jun Wang, Haitham Bou-Ammar. - - URL: `Saute RL`_ - """ - - def _init_env(self) -> None: - self._env = SauteAdapter(self._env_id, self._cfgs.num_envs, self._seed, self._cfgs) - assert self._cfgs.steps_per_epoch % (distributed.world_size() * self._cfgs.num_envs) == 0, ( - 'The number of steps per epoch is not divisible by the number of ' 'environments.' 
- ) - self._steps_per_epoch = ( - self._cfgs.steps_per_epoch // distributed.world_size() // self._cfgs.num_envs - ) - - def _init_log(self) -> None: - super()._init_log() - self._logger.register_key('Metrics/EpBudget') diff --git a/omnisafe/algorithms/on_policy/second_order/cpo.py b/omnisafe/algorithms/on_policy/second_order/cpo.py index 52cfa2ea1..d02dcac20 100644 --- a/omnisafe/algorithms/on_policy/second_order/cpo.py +++ b/omnisafe/algorithms/on_policy/second_order/cpo.py @@ -145,7 +145,7 @@ def _cpo_search_step( elif loss_cost_diff > max(-violation_c, 0): self._logger.log(f'INFO: no improve {loss_cost_diff} > {max(-violation_c, 0)}') # check KL-distance to avoid too far gap - elif kl > self._cfgs.target_kl * 1.5: + elif kl > self._cfgs.algo_cfgs.target_kl * 1.5: self._logger.log(f'INFO: violated KL constraint {kl} at step {step + 1}.') else: # step only if surrogate is improved and we are @@ -215,13 +215,13 @@ def _update_actor( distributed.avg_grads(self._actor_critic.actor) grad = -get_flat_gradients_from(self._actor_critic.actor) - x = conjugate_gradients(self._fvp, grad, self._cfgs.cg_iters) + x = conjugate_gradients(self._fvp, grad, self._cfgs.algo_cfgs.cg_iters) assert torch.isfinite(x).all(), 'x is not finite' xHx = torch.dot(x, self._fvp(x)) assert xHx.item() >= 0, 'xHx is negative' - alpha = torch.sqrt(2 * self._cfgs.target_kl / (xHx + 1e-8)) + alpha = torch.sqrt(2 * self._cfgs.algo_cfgs.target_kl / (xHx + 1e-8)) - self._actor_critic.actor_optimizer.zero_grad() + self._actor_critic.zero_grad() loss_cost = self._loss_pi_cost(obs, act, logp, adv_c) loss_cost_before = distributed.dist_avg(loss_cost).item() @@ -229,10 +229,10 @@ def _update_actor( distributed.avg_grads(self._actor_critic.actor) b_grad = get_flat_gradients_from(self._actor_critic.actor) - ep_costs = self._logger.get_stats('Metrics/EpCost')[0] - self._cfgs.cost_limit + ep_costs = self._logger.get_stats('Metrics/EpCost')[0] - self._cfgs.algo_cfgs.cost_limit cost = ep_costs / (self._logger.get_stats('Metrics/EpLen')[0] + 1e-8) - p = conjugate_gradients(self._fvp, b_grad, self._cfgs.cg_iters) + p = conjugate_gradients(self._fvp, b_grad, self._cfgs.algo_cfgs.cg_iters) q = xHx r = torch.dot(grad, p) s = torch.dot(b_grad, p) @@ -247,7 +247,7 @@ def _update_actor( assert torch.isfinite(s).all(), 's is not finite' A = q - r**2 / s - B = 2 * self._cfgs.target_kl - cost**2 / s + B = 2 * self._cfgs.algo_cfgs.target_kl - cost**2 / s if cost < 0 and B < 0: # point in trust region is feasible and safety boundary doesn't intersect @@ -270,7 +270,7 @@ def _update_actor( if optim_case in (3, 4): # under 3 and 4 cases directly use TRPO method - alpha = torch.sqrt(2 * self._cfgs.target_kl / (xHx + 1e-8)) + alpha = torch.sqrt(2 * self._cfgs.algo_cfgs.target_kl / (xHx + 1e-8)) nu_star = torch.zeros(1) lambda_star = 1 / alpha step_direction = alpha * x @@ -285,7 +285,7 @@ def project(data: torch.Tensor, low: float, high: float) -> torch.Tensor: # λ=argmax(f_a(λ),f_b(λ)) = λa_star or λb_star # computing formula shown in appendix, lambda_a and lambda_b lambda_a = torch.sqrt(A / B) - lambda_b = torch.sqrt(q / (2 * self._cfgs.target_kl)) + lambda_b = torch.sqrt(q / (2 * self._cfgs.algo_cfgs.target_kl)) # λa_star = Proj(lambda_a ,0 ~ r/c) λb_star=Proj(lambda_b,r/c~ +inf) # where projection(str,b,c)=max(b,min(str,c)) # may be regarded as a projection from effective region towards safety region @@ -301,7 +301,7 @@ def f_a(lam): return -0.5 * (A / (lam + 1e-8) + B * lam) - r * cost / (s + 1e-8) def f_b(lam): - return -0.5 * (q / (lam + 1e-8) + 2 
* self._cfgs.target_kl * lam) + return -0.5 * (q / (lam + 1e-8) + 2 * self._cfgs.algo_cfgs.target_kl * lam) lambda_star = ( lambda_a_star if f_a(lambda_a_star) >= f_b(lambda_b_star) else lambda_b_star @@ -317,7 +317,7 @@ def f_b(lam): # purely decrease costs # without further check lambda_star = torch.zeros(1) - nu_star = np.sqrt(2 * self._cfgs.target_kl / (s + 1e-8)) + nu_star = np.sqrt(2 * self._cfgs.algo_cfgs.target_kl / (s + 1e-8)) step_direction = -nu_star * p step_direction, accept_step = self._cpo_search_step( diff --git a/omnisafe/algorithms/on_policy/second_order/pcpo.py b/omnisafe/algorithms/on_policy/second_order/pcpo.py index d69ae6cea..8642e1d4f 100644 --- a/omnisafe/algorithms/on_policy/second_order/pcpo.py +++ b/omnisafe/algorithms/on_policy/second_order/pcpo.py @@ -75,14 +75,14 @@ def _update_actor( distributed.avg_grads(self._actor_critic.actor) grad = -get_flat_gradients_from(self._actor_critic.actor) - x = conjugate_gradients(self._fvp, grad, self._cfgs.cg_iters) + x = conjugate_gradients(self._fvp, grad, self._cfgs.algo_cfgs.cg_iters) assert torch.isfinite(x).all(), 'x is not finite' xHx = torch.dot(x, self._fvp(x)) H_inv_g = self._fvp(x) assert xHx.item() >= 0, 'xHx is negative' - alpha = torch.sqrt(2 * self._cfgs.target_kl / (xHx + 1e-8)) + alpha = torch.sqrt(2 * self._cfgs.algo_cfgs.target_kl / (xHx + 1e-8)) - self._actor_critic.actor_optimizer.zero_grad() + self._actor_critic.zero_grad() loss_cost = self._loss_pi_cost(obs, act, logp, adv_c) loss_cost_before = distributed.dist_avg(loss_cost).item() @@ -90,21 +90,21 @@ def _update_actor( distributed.avg_grads(self._actor_critic.actor) b_grad = get_flat_gradients_from(self._actor_critic.actor) - ep_costs = self._logger.get_stats('Metrics/EpCost')[0] - self._cfgs.cost_limit + ep_costs = self._logger.get_stats('Metrics/EpCost')[0] - self._cfgs.algo_cfgs.cost_limit cost = ep_costs / (self._logger.get_stats('Metrics/EpLen')[0] + 1e-8) self._logger.log(f'c = {cost}') self._logger.log(f'b^T b = {b_grad.dot(b_grad).item()}') - p = conjugate_gradients(self._fvp, b_grad, self._cfgs.cg_iters) + p = conjugate_gradients(self._fvp, b_grad, self._cfgs.algo_cfgs.cg_iters) q = xHx r = torch.dot(grad, p) s = torch.dot(b_grad, p) step_direction = ( - torch.sqrt(2 * self._cfgs.target_kl / (q + 1e-8)) * H_inv_g + torch.sqrt(2 * self._cfgs.algo_cfgs.target_kl / (q + 1e-8)) * H_inv_g - torch.clamp_min( - (torch.sqrt(2 * self._cfgs.target_kl / q) * r + cost) / s, + (torch.sqrt(2 * self._cfgs.algo_cfgs.target_kl / q) * r + cost) / s, torch.tensor(0.0, device=self._device), ) * p diff --git a/omnisafe/common/logger.py b/omnisafe/common/logger.py index 61a68f335..73a6b41a1 100644 --- a/omnisafe/common/logger.py +++ b/omnisafe/common/logger.py @@ -144,10 +144,10 @@ def __init__( # pylint: disable=too-many-arguments,too-many-locals self._tensorboard_writer = SummaryWriter(log_dir=os.path.join(self._log_dir, 'tb')) if self._use_wandb and self._maste_proc: - project: str = self._config.get('wandb_project', 'omnisafe') - name: str = self._config.get('wandb_name', f'{exp_name}/{relpath}') - entity: str = self._config.get('wandb_entity', None) - wandb.init(project=project, name=name, entity=entity, dir=self._log_dir, config=config) + project: str = self._config.logger_cfgs.get('wandb_project', 'omnisafe') + name: str = f'{exp_name}-{relpath}' + print('project', project, 'name', name) + wandb.init(project=project, name=name, dir=self._log_dir, config=config) if config is not None: wandb.config.update(config) if models is not None: diff --git 
a/omnisafe/configs/model-based/CAP.yaml b/omnisafe/configs/model-based/CAP.yaml deleted file mode 100644 index b37cf923e..000000000 --- a/omnisafe/configs/model-based/CAP.yaml +++ /dev/null @@ -1,95 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -defaults: - # --------------------------------------Basic Configurations----------------------------------- # - ## -----------------------------Basic configurations for base class PG------------------------ ## - # The random seed - seed: 0 - # If use tensorboard - use_tensorboard: True - # if use wandb - use_wandb: True - # The environment wrapper type - wrapper_type: ModelBasedEnvWrapper - # Number of training time step - max_real_time_steps: 1000000 - # Number of timestep in an episode - max_ep_len: 1000 - # CUDA or CPU device - device: "cuda:0" - # Number of repeated action - action_repeat: 1 - # The Address for saving training process data - data_dir: "./runs" - # Reward discounted factor - gamma: 0.99 - # Cost discounted factor - cost_gamma: 0.99 - # Noise add to action for exploration - exploration_noise: 0.0 - # Size of Off-policy Buffer - replay_size: 1000000 - # Batch size of Off-policy Buffer - batch_size: 256 - # log information every `log_freq` timesteps - log_freq: 1000 - # update dynamics every `update_dynamics_freq` timesteps - update_dynamics_freq: 1000 - - ## ----------------------------Basic configurations for dynamics model-------------------- ## - dynamics_cfgs: - # Number of network for ensemble model - network_size: 5 - # output size for ensemble model - elite_size: 5 - # Size of hidden layers - hidden_size: 200 - # Whether use decay loss - use_decay: True - - ## ----------------------------Basic configurations for MPC controller-------------------- ## - mpc_config: - # Planning horizon - horizon: 30 - # Sample population - popsize: 500 - # Repeat sample population 'particles' times - particles: 20 - # Number of planning iteration - max_iters: 5 - # Update coefficicent for new mean and var - alpha: 0.1 - # Mixed actor sample to gaussian sample - mixture_coefficient: 0.0 - # Number of elite action trajectories - minimal_elites: 50 - # Var threshold to stop planning iteration - epsilon: 0.001 - # Clip observation to [-obs_clip, obs_clip] - obs_clip: 1000 - - ## ----------------------------------Configuration For Lagrangian multiplier---------------------- ## - lagrange_cfgs: - # Tolerance of constraint violation - cost_limit: 152 # discount cost limit in HalfCheetah-v3 - # Initial value of lagrangian multiplier - lagrangian_multiplier_init: 1.0 - # Learning rate of lagrangian multiplier - lambda_lr: 0.1 - # Type of lagrangian optimizer - lambda_optimizer: "Adam" - # scaling factor of cost limit - beta: 1 diff --git a/omnisafe/configs/model-based/MBPPOLag.yaml b/omnisafe/configs/model-based/MBPPOLag.yaml deleted file mode 100644 index ee129665a..000000000 --- 
a/omnisafe/configs/model-based/MBPPOLag.yaml +++ /dev/null @@ -1,148 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -defaults: - # --------------------------------------Basic Configurations----------------------------------- # - ## -----------------------------Basic configurations for base class PG------------------------ ## - # The random seed - seed: 0 - # If use tensorboard - use_tensorboard: True - # if use wandb - use_wandb: True - # The environment wrapper type - wrapper_type: ModelBasedEnvWrapper - # Number of training time step - max_real_time_steps: 1000000 - # Number of timestep in an episode - max_ep_len: 1000 - # CUDA or CPU device - device: "cpu" - # Number of repeated action - action_repeat: 1 - # clip obseravation to [-obs_clip, obs_clip] - obs_clip: 1000 - # The Address for saving training process data - data_dir: "./runs" - # Number of update iteration for Actor network - pi_iters: 80 - # Number of update iteration for Critic network - critic_iters: 80 - # The learning rate of Actor network - actor_lr: 0.0003 - # The learning rate of Critic network - critic_lr: 0.001 - # Size of Off-policy Buffer - replay_size: 1000000 - # Batch size of Off-policy Buffer - batch_size: 0 - # log information every `log_freq` timestep - log_freq: 20000 - # update actor and critic every `update_policy_freq` timestep - update_policy_freq: 10000 - # update dynamics every `update_dynamics_freq` timestep - update_dynamics_freq: 10000 - - ## ---------------------------Basic configurations for derived class PPO---------------------- ## - # The thereshold for KL early stopping - target_kl: 0.012 - # The clip range for PPO loss - clip: 0.2 - - # ---------------------------------------Optional Configuration-------------------------------- # - ## -----------------------------------Configuration For Cost Critic--------------------------- ## - # Cost discounted factor - cost_gamma: 1.0 - kl_early_stopping: True - # Whther to use reward penalty - reward_penalty: False - # Whether to use reward scaling - scale_rewards: False - # Whether to use standardized observation - standardized_obs: False - - ## ---------------------------------------Configuration For Model----------------------------- ## - model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False - # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
- weight_initialization_mode: "kaiming_uniform" - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian_annealing - # Size of hidden layers - hidden_sizes: [64, 64] - # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" - activation: tanh - val: - # Size of hidden layers - hidden_sizes: [64, 64] - # Type of activation functon, choosing from "tanh", "relu", "sigmoid", "identity", "softplus" - activation: tanh - - ## ----------------------------Basic configurations for derived class MBPPOLag-------------------- ## - # Virtual roll out horizon - horizon: 80 - # Imaging steps every policy update - imaging_steps_per_policy_update: 30000 - # Number of mixed real data in training data - mixed_real_time_steps: 1500 - # Number of dynamics network for computing performance ratio - validation_num: 6 - # number of candidates for computing performance ratio - validation_threshold_num: 4 - # Validation horizon for computing performance ratio - validation_horizon: 75 - - ## ----------------------------Basic configurations for dynamics model-------------------- ## - dynamics_cfgs: - # Number of network for ensemble model - network_size: 8 - # output size for ensemble model - elite_size: 6 - # Size of hidden layers - hidden_size: 200 - # Whether use decay loss - use_decay: True - - ## --------------------------------------Configuration For Buffer----------------------------- ## - buffer_cfgs: - # Reward discounted factor - gamma: 0.99 - # Parameters used to estimate future rewards in GAE - lam: 0.97 - # Parameters used to estimate future costs in GAE - lam_c: 0.97 - # Method to estimate the advantage reward/cost, choosing from "gae", "gae-rtg", "plain", "vtrace" - adv_estimation_method: "gae-rtg" - # Whether to use standardized reward - standardized_reward: True - # Whether to use standardized cost - standardized_cost: True - ## ----------------------------------Configuration For Lagrangian multiplier---------------------- ## - lagrange_cfgs: - # Tolerance of constraint violation - cost_limit: 18.0 - # Initial value of lagrangian multiplier - lagrangian_multiplier_init: 0.5 - # Learning rate of lagrangian multiplier - lambda_lr: 0.05 - # Type of lagrangian optimizer - lambda_optimizer: "Adam" - # scaling factor of cost limit - beta: 0.02 diff --git a/omnisafe/configs/model-based/SafeLOOP.yaml b/omnisafe/configs/model-based/SafeLOOP.yaml deleted file mode 100644 index d0f5b25bd..000000000 --- a/omnisafe/configs/model-based/SafeLOOP.yaml +++ /dev/null @@ -1,129 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -defaults: - # --------------------------------------Basic Configurations----------------------------------- # - ## -----------------------------Basic configurations for base class PG------------------------ ## - # The random seed - seed: 0 - # If use tensorboard - use_tensorboard: True - # if use wandb - use_wandb: True - # The environment wrapper type - wrapper_type: ModelBasedEnvWrapper - # Number of training time step - max_real_time_steps: 1000000 - # Number of timestep in an episode - max_ep_len: 1000 - # CUDA or CPU device - device: "cpu" - # Number of repeated action - action_repeat: 5 - # The Address for saving training process data - data_dir: "./runs" - # Time of strating update policy - update_policy_start_timesteps: 10000 - # Times of update actor-critic - update_policy_iters: 50 - # The learning rate of Actor network - actor_lr: 0.001 - # The learning rate of Critic network - critic_lr: 0.001 - # Reward discounted factor - gamma: 0.99 - # Cost discounted factor - cost_gamma: 1.0 - # Size of Off-policy Buffer - replay_size: 1000000 - # Batch size of Off-policy Buffer - batch_size: 256 - # log information every `log_freq` timestep - log_freq: 20000 - # update actor and critic every `update_policy_freq` timestep - update_policy_freq: 250 - # update dynamics every `update_dynamics_freq` timestep - update_dynamics_freq: 1250 - # Noise add to action for exploration - exploration_noise: 0.0 - # Whether to use cost critic - use_cost: False - # Whether to use standardized observation - standardized_obs: False - ## ---------------------------Basic configurations for derived class SAC---------------------- ## - # The entropy coefficient - alpha: 0.2 - # The learning rate of Alpha - alpha_gamma: 0.99 - # The soft update coefficient - polyak: 0.995 - ## ---------------------------------------Configuration For Model----------------------------- ## - model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False - # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
- weight_initialization_mode: "kaiming_uniform" - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: "gaussian_stdnet" - # The standard deviation of Gaussian noise - act_noise: 0.1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: relu - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 2 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: relu - ## ----------------------------Basic configurations for dynamics model-------------------- ## - dynamics_cfgs: - # Number of network for ensemble model - network_size: 7 - # output size for ensemble model - elite_size: 5 - # Size of hidden layers - hidden_size: 200 - # Whether use decay loss - use_decay: True - ## ----------------------------Basic configurations for MPC controller-------------------- ## - mpc_config: - # Planning horizon - horizon: 8 - # Sample population - popsize: 100 - # Repeat sample population 'particles' times - particles: 4 - # Number of planning iteration - max_iters: 8 - # Update coefficicent for new mean and var - alpha: 0.1 - # Mixed actor sample to gaussian sample - mixture_coefficient: 0.05 - # Coefficicent for rescaling action score - kappa: 1 - # Safety threshold - safety_threshold: 0.2 - # Number of elite action trajectories - minimal_elites: 10 - # Clip observation to [-obs_clip, obs_clip] - obs_clip: 1000 diff --git a/omnisafe/configs/off-policy/CVPO.yaml b/omnisafe/configs/off-policy/CVPO.yaml deleted file mode 100644 index bec644a88..000000000 --- a/omnisafe/configs/off-policy/CVPO.yaml +++ /dev/null @@ -1,174 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -defaults: - # --------------------------------------Basic Configurations----------------------------------- # - ## -----------------------------Basic configurations for base class DDPG---------------------- ## - # The random seed - seed: 0 - # If use tensorboard - use_tensorboard: True - # if use wandb - use_wandb: True - # The torch device - device: cpu - # The torch device id - device_id: 0 - # The environment wrapper type - wrapper_type: CMDPWrapper - # Number of epochs - epochs: 500 - # Number of steps per epoch - steps_per_epoch: 6000 - # Update after `update_after` steps - update_after: 1000 - # Update every `update_every` steps - update_every: 50 - # Check if all models own the same parameter values every `check_freq` epochs - check_freq: 25 - # Save model to disk every `check_freq` epochs - save_freq: 10 - # The max length of per epoch - max_ep_len: 1000 - # The number of test episodes - num_test_episodes: 10 - # The learning rate of Actor network - actor_lr: 0.0003 - # The learning rate of Critic network - critic_lr: 0.001 - # The soft update coefficient - polyak: 0.999 - # The discount factor of GAE - gamma: 0.99 - # Actor perdorm random action before `start_steps` steps - start_steps: 10000 - # The Address for saving training process data - data_dir: "./runs" - ## ----------------------------Basic configurations for derived class CVPO-------------------- ## - # Hard constraint of the mean in the M-step. - kl_mean_constraint: 0.01 - # Hard constraint of the covariance in the M-step. - kl_var_constraint: 0.0001 - # Hard constraint in the M-step. - kl_constraint: 0.01 - # Scaling factor of the mean of lagrangian multiplier in the M-step. - alpha_mean_scale: 1.0 - # Scaling factor of the variance of lagrangian multiplier in the M-step. - alpha_var_scale: 100.0 - # Scaling factor of the lagrangian multiplier in the M-step. - alpha_scale: 10.0 - # Maximum number of the mean of alpha - alpha_mean_max: 0.1 - # Maximum number of the variance of alpha - alpha_var_max: 10.0 - # Maximum of alpha - alpha_max: 1.0 - # The number of sampled actions. - sample_action_num: 64 - # The maximum number of steps of M. - mstep_iteration_num: 5 - # The maximum number of steps of E. - dual_constraint: 0.1 - # The tolerance of cost violation. 
- cost_limit: 25.0 - - # ---------------------------------------Optional Configuration-------------------------------- # - ## -----------------------------------Configuration For Cost Critic--------------------------- ## - # Whether to use cost critic - use_cost: True - # Whether to use cost limit decay - cost_limit_decay: True - # The initial value of cost limit - init_cost_limit: 100.0 - # The target value of cost limit - target_cost_limit: 25.0 - # The end of cost limit decay epoch - end_epoch: 100 - # Cost discounted factor - cost_gamma: 1.0 - # Whether to use linear decay of learning rate - linear_lr_decay: False - # Whether to use exploration noise anneal - exploration_noise_anneal: False - # Whether to use reward penalty - reward_penalty: False - # Whether to use KL early stopping - kl_early_stopping: False - # Whether to use max gradient norm - use_max_grad_norm: False - # The thereshold of max gradient norm - max_grad_norm: 40 - # Whether to use standardized observation - standardized_obs: True - # Whether to use critic network norm - use_critic_norm: False - # The norm coefficient of critic network - critic_norm_coeff: 0.001 - ## ---------------------------------------Configuration For Model----------------------------- ## - model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False - # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". - weight_initialization_mode: "kaiming_uniform" - # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: "cholesky" - # Minimum value of covariance - cov_min: 0.0001 - # Minimum value of mean of clamp - mu_clamp_min: -5 - # Maximum value of mean of clamp - mu_clamp_max: 5 - # Minimum value of covariance of clamp - cov_clamp_min: -5 - # Maximum value of covariance of clamp - cov_clamp_max: 20 - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: relu - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: relu - ## --------------------------------------Configuration For Buffer----------------------------- ## - replay_buffer_cfgs: - # The size of replay buffer - size: 50000 - # The size of batch - batch_size: 256 - ## --------------------------------------Configuration For Environment------------------------ ## - env_cfgs: - # The number of parallel environments - num_envs: 1 - # Whether to use async environment - async_env: True - # Whether to use standardized reward - normalized_rew: False - # Whether to use standardized cost - normalized_cost: False - # Whether to use standardized obs - normalized_obs: True - # The maximum length of record queue - max_len: 100 - # The number of threads used to sample data - num_threads: 20 diff --git a/omnisafe/configs/off-policy/DDPG.yaml b/omnisafe/configs/off-policy/DDPG.yaml deleted file mode 100644 index 1e7df53c5..000000000 --- a/omnisafe/configs/off-policy/DDPG.yaml +++ /dev/null @@ -1,143 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
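The deleted CVPO.yaml above enables cost_limit_decay together with init_cost_limit, target_cost_limit and end_epoch. Below is a small sketch of the schedule those fields suggest; the linear anneal and the helper name decayed_cost_limit are assumptions, not necessarily the rule the removed code used.

def decayed_cost_limit(epoch: int, init_cost_limit: float = 100.0,
                       target_cost_limit: float = 25.0, end_epoch: int = 100) -> float:
    """Anneal the cost limit from its initial to its target value, then hold it."""
    frac = min(epoch / end_epoch, 1.0)
    return init_cost_limit + frac * (target_cost_limit - init_cost_limit)


assert decayed_cost_limit(0) == 100.0 and decayed_cost_limit(100) == 25.0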
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -defaults: - # --------------------------------------Basic Configurations----------------------------------- # - ## -----------------------------Basic configurations for base class DDPG---------------------- ## - # The random seed - seed: 5 - # If use tensorboard - use_tensorboard: True - # if use wandb - use_wandb: True - # The torch device - device: cpu - # The torch device id - device_id: 0 - # The environment wrapper type - wrapper_type: CMDPWrapper - # Number of epochs - epochs: 500 - # Number of steps per epoch - steps_per_epoch: 6000 - # Update after `update_after` steps - update_after: 1000 - # Update every `update_every` steps - update_every: 50 - # Check if all models own the same parameter values every `check_freq` epochs - check_freq: 25 - # Save model to disk every `check_freq` epochs - save_freq: 100 - # The max length of per epoch - max_ep_len: 1000 - # The learning rate of Actor network - actor_lr: 0.0003 - # The learning rate of Critic network - critic_lr: 0.001 - # The soft update coefficient - polyak: 0.999 - # The discount factor of GAE - gamma: 0.99 - # Actor perdorm random action before `start_steps` steps - start_steps: 10000 - # The Address for saving training process data - data_dir: "./runs" - # The number of episode to test - num_test_episodes: 10 - - # ---------------------------------------Optional Configuration-------------------------------- # - ## -----------------------------------Configuration For Cost Critic--------------------------- ## - # Whether to use cost critic - use_cost: False - # Whether to use cost limit decay - cost_limit_decay: False - # The initial value of cost limit - init_cost_limit: 25.0 - # The target value of cost limit - target_cost_limit: 25.0 - # The end of cost limit decay epoch - end_epoch: 100 - # Cost discounted factor - cost_gamma: 1.0 - # Whether to use linear decay of learning rate - linear_lr_decay: False - # Whether to use exploration noise anneal - exploration_noise_anneal: False - # Whether to use reward penalty - reward_penalty: False - # Whether to use max gradient norm - use_max_grad_norm: False - # The thereshold of max gradient norm - max_grad_norm: 40 - # Whether to use critic network norm - use_critic_norm: False - # The norm coefficient of critic network - critic_norm_coeff: 0.001 - ## ---------------------------------------Configuration For Model----------------------------- ## - model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False - # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
- weight_initialization_mode: "kaiming_uniform" - # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: relu - # Output activation function - output_activation: tanh - # Whether to scale action. - scale_action: True - # Whether to clip action. - clip_action: True - # Whether to learn the standard deviation of Gaussian noise - std_learning: False - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: relu - ## --------------------------------------Configuration For Buffer----------------------------- ## - replay_buffer_cfgs: - # The size of replay buffer - size: 50000 - # The size of batch - batch_size: 256 - ## --------------------------------------Configuration For Environment------------------------ ## - env_cfgs: - # The number of parallel environments - num_envs: 1 - # Whether to use async environment - async_env: True - # Whether to use standardized reward - normalized_rew: False - # Whether to use standardized cost - normalized_cost: False - # Whether to use standardized obs - normalized_obs: True - # The maximum length of record queue - max_len: 100 - # The number of threads used to sample data - num_threads: 20 diff --git a/omnisafe/configs/off-policy/DDPGLag.yaml b/omnisafe/configs/off-policy/DDPGLag.yaml deleted file mode 100644 index 06728e16e..000000000 --- a/omnisafe/configs/off-policy/DDPGLag.yaml +++ /dev/null @@ -1,158 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -defaults: - # --------------------------------------Basic Configurations----------------------------------- # - ## ----------------------------Basic configurations for base class DDPG----------------------- ## - # The random seed - seed: 0 - # If use tensorboard - use_tensorboard: True - # if use wandb - use_wandb: True - # The torch device - device: cpu - # The torch device id - device_id: 0 - # The environment wrapper type - wrapper_type: CMDPWrapper - # Number of epochs - epochs: 500 - # Number of steps per epoch - steps_per_epoch: 6000 - # Update after `update_after` steps - update_after: 1000 - # Update every `update_every` steps - update_every: 50 - # Check if all models own the same parameter values every `check_freq` epochs - check_freq: 25 - # Save model to disk every `check_freq` epochs - save_freq: 100 - # The max length of per epoch - max_ep_len: 1000 - # The number of test episodes - num_test_episodes: 10 - # The learning rate of Actor network - actor_lr: 0.0003 - # The learning rate of Critic network - critic_lr: 0.0003 - # The soft update coefficient - polyak: 0.995 - # The discount factor of GAE - gamma: 0.99 - # Actor perdorm random action before `start_steps` steps - start_steps: 10000 - # The Address for saving training process data - data_dir: "./runs" - - - # ---------------------------------------Optional Configuration-------------------------------- # - ## -----------------------------------Configuration For Cost Critic--------------------------- ## - # Whether to use cost critic - use_cost: True - # Whether to use cost limit decay - cost_limit_decay: True - # The initial value of cost limit - init_cost_limit: 100.0 - # The target value of cost limit - target_cost_limit: 25.0 - # The end of cost limit decay epoch - end_epoch: 100 - # Cost discounted factor - cost_gamma: 1.0 - # Whether to use linear decay of learning rate - linear_lr_decay: False - # Whether to use exploration noise anneal - exploration_noise_anneal: False - # Whether to use reward penalty - reward_penalty: False - # Whether to use KL early stopping - kl_early_stopping: False - # Whether to use max gradient norm - use_max_grad_norm: False - # The thereshold of max gradient norm - max_grad_norm: 40 - # Whether to use standardized observation - standardized_obs: True - # Whether to use critic network norm - use_critic_norm: False - # The norm coefficient of critic network - critic_norm_coeff: 0.0001 - ## ---------------------------------------Configuration For Model----------------------------- ## - model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False - # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". - weight_initialization_mode: "kaiming_uniform" - # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: relu - # Output activation function - output_activation: tanh - # Whether to scale action. - scale_action: True - # Whether to clip action. 
- clip_action: True - # Whether to learn the standard deviation of Gaussian noise - std_learning: False - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: relu - ## --------------------------------------Configuration For Buffer------------------------------- ## - replay_buffer_cfgs: - # The size of replay buffer - size: 50000 - # The size of batch - batch_size: 256 -## ----------------------------------Configuration For Lagrangian multiplier---------------------- ## - lagrange_cfgs: - # Tolerance of constraint violation - cost_limit: 25.0 - # Initial value of lagrangian multiplier - lagrangian_multiplier_init: 0.0 - # Learning rate of lagrangian multiplier - lambda_lr: 0.01 - # Type of lagrangian optimizer - lambda_optimizer: "Adam" - ## --------------------------------------Configuration For Environment------------------------ ## - env_cfgs: - # The number of parallel environments - num_envs: 1 - # Whether to use async environment - async_env: True - # Whether to use standardized reward - normalized_rew: False - # Whether to use standardized cost - normalized_cost: False - # Whether to use standardized obs - normalized_obs: True - # The maximum length of record queue - max_len: 100 - # The number of threads used to sample data - num_threads: 20 diff --git a/omnisafe/configs/off-policy/DDPGPid.yaml b/omnisafe/configs/off-policy/DDPGPid.yaml deleted file mode 100644 index 8ad68ecfb..000000000 --- a/omnisafe/configs/off-policy/DDPGPid.yaml +++ /dev/null @@ -1,172 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
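The lagrange_cfgs block in the deleted DDPGLag.yaml above (cost_limit, lagrangian_multiplier_init, lambda_lr, lambda_optimizer: "Adam") describes a learned multiplier. Below is a minimal sketch of that update as projected gradient ascent on the multiplier; SimpleLagrange is illustrative, and omnisafe's Lagrange class may differ in details such as the projection.

import torch


class SimpleLagrange:
    def __init__(self, cost_limit: float, init: float, lambda_lr: float) -> None:
        self.cost_limit = cost_limit
        self.multiplier = torch.nn.Parameter(torch.tensor(init))
        self.optimizer = torch.optim.Adam([self.multiplier], lr=lambda_lr)

    def update(self, mean_ep_cost: float) -> None:
        # Gradient ascent on lambda * (Jc - d): minimize the negative.
        self.optimizer.zero_grad()
        loss = -self.multiplier * (mean_ep_cost - self.cost_limit)
        loss.backward()
        self.optimizer.step()
        with torch.no_grad():
            self.multiplier.clamp_(min=0.0)  # keep the multiplier non-negative


lag = SimpleLagrange(cost_limit=25.0, init=0.0, lambda_lr=0.01)
lag.update(mean_ep_cost=40.0)  # cost above the limit, so the multiplier increases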
-# ============================================================================== - -defaults: - # --------------------------------------Basic Configurations----------------------------------- # - ## ----------------------------Basic configurations for base class DDPG----------------------- ## - # The random seed - seed: 0 - # If use tensorboard - use_tensorboard: True - # if use wandb - use_wandb: True - # The torch device - device: cpu - # The torch device id - device_id: 0 - # The environment wrapper type - wrapper_type: CMDPWrapper - # Number of epochs - epochs: 500 - # Number of steps per epoch - steps_per_epoch: 2000 - # Update after `update_after` steps - update_after: 1000 - # Update every `update_every` steps - update_every: 64 - # Check if all models own the same parameter values every `check_freq` epochs - check_freq: 25 - # Save model to disk every `check_freq` epochs - save_freq: 10 - # The max length of per epoch - max_ep_len: 1000 - # The number of test episodes - num_test_episodes: 10 - # The learning rate of Actor network - actor_lr: 0.0003 - # The learning rate of Critic network - critic_lr: 0.0003 - # The soft update coefficient - polyak: 0.999 - # The discount factor of GAE - gamma: 0.99 - # Actor perdorm random action before `start_steps` steps - start_steps: 10000 - # The Address for saving training process data - data_dir: "./runs" - - - # ---------------------------------------Optional Configuration-------------------------------- # - ## -----------------------------------Configuration For Cost Critic--------------------------- ## - # Whether to use cost critic - use_cost: True - # Whether to use cost limit decay - cost_limit_decay: False - # The initial value of cost limit - init_cost_limit: 25.0 - # The target value of cost limit - target_cost_limit: 25.0 - # The end of cost limit decay epoch - end_epoch: 100 - # Cost discounted factor - cost_gamma: 1.0 - # Whether to use linear decay of learning rate - linear_lr_decay: False - # Whether to use exploration noise anneal - exploration_noise_anneal: False - # Whether to use reward penalty - reward_penalty: False - # Whether to use KL early stopping - kl_early_stopping: False - # Whether to use max gradient norm - use_max_grad_norm: False - # The thereshold of max gradient norm - max_grad_norm: 40 - # Whether to use standardized observation - standardized_obs: True - # Whether to use critic network norm - use_critic_norm: False - # The norm coefficient of critic network - critic_norm_coeff: 0.001 - ## ---------------------------------------Configuration For Model----------------------------- ## - model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False - # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". - weight_initialization_mode: "kaiming_uniform" - # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: relu - # Output activation function - output_activation: tanh - # Whether to scale action. - scale_action: True - # Whether to clip action. 
- clip_action: True - # Whether to learn the standard deviation of Gaussian noise - std_learning: False - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: relu - ## --------------------------------------Configuration For Buffer------------------------------- ## - replay_buffer_cfgs: - # The size of replay buffer - size: 50000 - # The size of batch - batch_size: 256 - ## --------------------------------------Configuration For PID--------------------------------- ## - PID_cfgs: - # KP for PID - pid_kp: 0.1 - # KI for PID - pid_ki: 0.003 - # KD for PID - pid_kd: 0.001 - # The init value of lagrangian multiplier - lagrangian_multiplier_init: 0.001 - # The delay rate of KD - pid_d_delay: 10 - # 0 for hard update, 1 for no update - pid_delta_p_ema_alpha: 0.95 - # The same as above - pid_delta_d_ema_alpha: 0.95 - # L = (J_r - lam * J_c) / (1 + lam); lam <= 0 - sum_norm: True - # L = (1 - lam) * J_r - lam * J_c; 0 <= lam <= 1 - diff_norm: False - # Only used if sum_norm=diff_norm=False - penalty_max: 100 - # Tolerance of violation - cost_limit: 50 - ## --------------------------------------Configuration For Environment------------------------ ## - env_cfgs: - # The number of parallel environments - num_envs: 1 - # Whether to use async environment - async_env: True - # Whether to use standardized reward - normalized_rew: False - # Whether to use standardized cost - normalized_cost: False - # Whether to use standardized obs - normalized_obs: True - # The maximum length of record queue - max_len: 100 - # The number of threads used to sample data - num_threads: 20 diff --git a/omnisafe/configs/off-policy/DDPGSafetyLayer.yaml b/omnisafe/configs/off-policy/DDPGSafetyLayer.yaml deleted file mode 100644 index 3258dc8cb..000000000 --- a/omnisafe/configs/off-policy/DDPGSafetyLayer.yaml +++ /dev/null @@ -1,150 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -defaults: - # --------------------------------------Basic Configurations----------------------------------- # - ## -----------------------------Basic configurations for base class DDPG---------------------- ## - # The random seed - seed: 0 - # If use tensorboard - use_tensorboard: True - # if use wandb - use_wandb: True - # The torch device - device: cpu - # The torch device id - device_id: 0 - # The environment wrapper type - wrapper_type: SafetyLayerWrapper - # Number of epochs - epochs: 500 - # Number of steps per epoch - steps_per_epoch: 6000 - # Update after `update_after` steps - update_after: 1000 - # Update every `update_every` steps - update_every: 50 - # Check if all models own the same parameter values every `check_freq` epochs - check_freq: 25 - # Save model to disk every `check_freq` epochs - save_freq: 10 - # The max length of per epoch - max_ep_len: 1000 - # The number of test episodes - num_test_episodes: 10 - # The learning rate of Actor network - actor_lr: 0.0003 - # The learning rate of Critic network - critic_lr: 0.001 - # The learning rate of Cost network - model_lr: 0.001 - # The soft update coefficient - polyak: 0.999 - # The discount factor of GAE - gamma: 0.99 - # Actor perdorm random action before `start_steps` steps - start_steps: 10000 - # The Address for saving training process data - data_dir: "./runs" - - - # ---------------------------------------Optional Configuration-------------------------------- # - ## -----------------------------------Configuration For Cost Critic--------------------------- ## - # Whether to use cost critic - use_cost: True - # Whether to use cost limit decay - cost_limit_decay: False - # The initial value of cost limit - init_cost_limit: 25.0 - # The target value of cost limit - target_cost_limit: 25.0 - # The end of cost limit decay epoch - end_epoch: 100 - # The cost limit - cost_limit: 25.0 - # Cost discounted factor - cost_gamma: 1.0 - # Whether to use linear decay of learning rate - linear_lr_decay: False - # Whether to use exploration noise anneal - exploration_noise_anneal: False - # Whether to use reward penalty - reward_penalty: False - # Whether to use KL early stopping - kl_early_stopping: False - # Whether to use max gradient norm - use_max_grad_norm: False - # The thereshold of max gradient norm - max_grad_norm: 0.5 - # Whether to use reward scaling - scale_rewards: False - # Whether to use standardized observation - standardized_obs: True - ## ---------------------------------------Configuration For Model----------------------------- ## - model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False - # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". - weight_initialization_mode: "kaiming_uniform" - # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: relu - # Output activation function - output_activation: tanh - # Whether to scale action. - scale_action: True - # Whether to clip action. 
- clip_action: True - # Whether to learn the standard deviation of Gaussian noise - std_learning: False - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: relu - ## --------------------------------------Configuration For Buffer----------------------------- ## - replay_buffer_cfgs: - # The size of replay buffer - size: 50000 - # The size of batch - batch_size: 256 - ## -----------------------------------Configuration For Safety Layer--------------------------- ## - env_cfgs: - # Configuration of LinearCostModel in SafetyLayerWrapper - safety_layer_cfgs: - # Size of hidden layers - hidden_sizes: [400, 300] - # Activation function - activation: relu - # The learning rate of cost model - model_lr: 0.001 - # The directory to save cost model - data_dir: "./runs" - # The size of replay buffer - buffer_size: 50000 - # The size of batch - batch_size: 256 diff --git a/omnisafe/configs/off-policy/OffCRPO.yaml b/omnisafe/configs/off-policy/OffCRPO.yaml deleted file mode 100644 index 08b606695..000000000 --- a/omnisafe/configs/off-policy/OffCRPO.yaml +++ /dev/null @@ -1,144 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
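The safety_layer_cfgs block above configures a learned cost model whose role is to correct unsafe actions. Below is a sketch of the closed-form, single-constraint projection that safety-layer approaches typically apply on top of such a model; the linearized cost c(s, a) ~ c0 + g^T a and the helper safety_layer_correct are assumptions, and the learned cost network itself is omitted.

import numpy as np


def safety_layer_correct(action: np.ndarray, g: np.ndarray, c0: float,
                         cost_limit: float = 0.0) -> np.ndarray:
    """Project `action` onto {a : c0 + g^T a <= cost_limit}, the closest safe action."""
    violation = c0 + g @ action - cost_limit
    lam = max(0.0, violation / (g @ g + 1e-8))
    return action - lam * g


a_safe = safety_layer_correct(np.array([1.0, 0.5]), g=np.array([2.0, 0.0]), c0=0.3)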
-# ============================================================================== - -defaults: - # --------------------------------------Basic Configurations----------------------------------- # - ## -----------------------------Basic configurations for base class DDPG---------------------- ## - # The random seed - seed: 5 - # If use tensorboard - use_tensorboard: True - # if use wandb - use_wandb: True - # The torch device - device: cpu - # The torch device id - device_id: 0 - # The environment wrapper type - wrapper_type: CMDPWrapper - # Number of epochs - epochs: 500 - # Number of steps per epoch - steps_per_epoch: 6000 - # Update after `update_after` steps - update_after: 1000 - # Update every `update_every` steps - update_every: 50 - # Check if all models own the same parameter values every `check_freq` epochs - check_freq: 25 - # Save model to disk every `check_freq` epochs - save_freq: 10 - # The max length of per epoch - max_ep_len: 1000 - # The learning rate of Actor network - actor_lr: 0.0003 - # The learning rate of Critic network - critic_lr: 0.001 - # The soft update coefficient - polyak: 0.999 - # The discount factor of GAE - gamma: 0.99 - # Actor perdorm random action before `start_steps` steps - start_steps: 10000 - # The Address for saving training process data - data_dir: "./runs" - # The number of test episodes - num_test_episodes: 10 - - - # ---------------------------------------Optional Configuration-------------------------------- # - ## -----------------------------------Configuration For Cost Critic--------------------------- ## - # Whether to use cost critic - use_cost: True - # Whether to use cost limit decay - cost_limit_decay: False - # The initial value of cost limit - init_cost_limit: 25.0 - # The target value of cost limit - target_cost_limit: 25.0 - # The end of cost limit decay epoch - end_epoch: 100 - # Cost discounted factor - cost_gamma: 1.0 - # Whether to use linear decay of learning rate - linear_lr_decay: False - # Whether to use exploration noise anneal - exploration_noise_anneal: False - # Whether to use reward penalty - reward_penalty: False - # Whether to use max gradient norm - use_max_grad_norm: False - # The thereshold of max gradient norm - max_grad_norm: 40 - # Whether to use critic network norm - use_critic_norm: False - # The norm coefficient of critic network - critic_norm_coeff: 0.001 - ## ---------------------------------------Configuration For Model----------------------------- ## - model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False - # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". - weight_initialization_mode: "kaiming_uniform" - # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: relu - # Output activation function - output_activation: tanh - # Whether to scale action. - scale_action: True - # Whether to clip action. 
- clip_action: True - # Whether to learn the standard deviation of Gaussian noise - std_learning: False - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: relu - ## --------------------------------------Configuration For Buffer----------------------------- ## - replay_buffer_cfgs: - # The size of replay buffer - size: 50000 - # The size of batch - batch_size: 256 - ## --------------------------------------Configuration For Environment------------------------ ## - env_cfgs: - # The number of parallel environments - num_envs: 1 - # Whether to use async environment - async_env: True - # Whether to use standardized reward - normalized_rew: False - # Whether to use standardized cost - normalized_cost: False - # Whether to use standardized obs - normalized_obs: True - # The maximum length of record queue - max_len: 100 - # The number of threads used to sample data - num_threads: 20 diff --git a/omnisafe/configs/off-policy/SAC.yaml b/omnisafe/configs/off-policy/SAC.yaml deleted file mode 100644 index 5b3be6918..000000000 --- a/omnisafe/configs/off-policy/SAC.yaml +++ /dev/null @@ -1,154 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -defaults: - # --------------------------------------Basic Configurations----------------------------------- # - ## ----------------------------Basic configurations for base class DDPG----------------------- ## - # The random seed - seed: 5 - # If use tensorboard - use_tensorboard: True - # if use wandb - use_wandb: True - # The torch device - device: cpu - # The torch device id - device_id: 0 - # The environment wrapper type - wrapper_type: CMDPWrapper - # Number of epochs - epochs: 500 - # Number of steps per epoch - steps_per_epoch: 6000 - # Update after `update_after` steps - update_after: 1000 - # Update every `update_every` steps - update_every: 50 - # Check if all models own the same parameter values every `check_freq` epochs - check_freq: 25 - # Save model to disk every `check_freq` epochs - save_freq: 10 - # The max length of per epoch - max_ep_len: 1000 - # The number of test episodes - num_test_episodes: 10 - # The learning rate of Actor network - actor_lr: 0.0001 - # The learning rate of Critic network - critic_lr: 0.0001 - # The soft update coefficient - polyak: 0.999 - # The discount factor of GAE - gamma: 0.99 - # Actor perdorm random action before `start_steps` steps - start_steps: 10000 - # The Address for saving training process data - data_dir: "./runs" - ## ---------------------------Basic configurations for derived class SAC---------------------- ## - # The entropy coefficient - alpha: 0.2 - # The learning rate of Alpha - alpha_gamma: 1.0 - # Auto Alpha - auto_alpha: True - # The learning rate of Auto Alpha - alpha_lr: 0.0003 - - # ---------------------------------------Optional Configuration-------------------------------- # - ## -----------------------------------Configuration For Cost Critic--------------------------- ## - # Whether to use cost critic - use_cost: False - # Whether to use cost limit decay - cost_limit_decay: False - # The initial value of cost limit - init_cost_limit: 25.0 - # The target value of cost limit - target_cost_limit: 25.0 - # The end of cost limit decay epoch - end_epoch: 100 - # Cost discounted factor - cost_gamma: 1.0 - # Whether to use linear decay of learning rate - linear_lr_decay: True - # Whether to use exploration noise anneal - exploration_noise_anneal: False - # Whether to use reward penalty - reward_penalty: False - # Whether to use KL early stopping - kl_early_stopping: False - # Whether to use max gradient norm - use_max_grad_norm: False - # The thereshold of max gradient norm - max_grad_norm: 40 - # Whether to use critic network norm - use_critic_norm: False - # The norm coefficient of critic network - critic_norm_coeff: 0.001 - ## ---------------------------------------Configuration For Model----------------------------- ## - model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False - # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". - weight_initialization_mode: "kaiming_uniform" - # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian_stdnet - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: relu - # The output activation function - output_activation: tanh - # Whether to scale action. 
- scale_action: True - # Whether to clip action. - clip_action: True - # Whether to learn the standard deviation of Gaussian noise - std_learning: True - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 2 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: relu - ## --------------------------------------Configuration For Buffer----------------------------- ## - replay_buffer_cfgs: - # The size of replay buffer - size: 100000 - # The size of batch - batch_size: 1024 - ## --------------------------------------Configuration For Environment------------------------ ## - env_cfgs: - # The number of parallel environments - num_envs: 1 - # Whether to use async environment - async_env: True - # Whether to use standardized reward - normalized_rew: False - # Whether to use standardized cost - normalized_cost: False - # Whether to use standardized obs - normalized_obs: True - # The maximum length of record queue - max_len: 100 - # The number of threads used to sample data - num_threads: 20 diff --git a/omnisafe/configs/off-policy/SACLag.yaml b/omnisafe/configs/off-policy/SACLag.yaml deleted file mode 100644 index 6520bdd9c..000000000 --- a/omnisafe/configs/off-policy/SACLag.yaml +++ /dev/null @@ -1,164 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
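SAC.yaml above enables automatic entropy-temperature tuning through `auto_alpha: True` with learning rate `alpha_lr`. As a reminder of what those fields mean, below is a hedged sketch of the standard SAC temperature update; the `alpha_gamma` decay and omnisafe's exact implementation are not reproduced, and the variable names (including `act_dim`) are illustrative.

    import torch

    act_dim = 6                                   # example action dimension
    target_entropy = -float(act_dim)              # common heuristic: -|A|
    log_alpha = torch.zeros(1, requires_grad=True)
    alpha_optimizer = torch.optim.Adam([log_alpha], lr=0.0003)  # alpha_lr from the config

    def update_alpha(log_prob: torch.Tensor) -> torch.Tensor:
        # log_prob: log pi(a|s) for actions freshly sampled from the current policy.
        alpha_loss = -(log_alpha * (log_prob + target_entropy).detach()).mean()
        alpha_optimizer.zero_grad()
        alpha_loss.backward()
        alpha_optimizer.step()
        return log_alpha.exp()                    # temperature used in the actor/critic losses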
-# ============================================================================== - -defaults: - # --------------------------------------Basic Configurations----------------------------------- # - ## ----------------------------Basic configurations for base class DDPG----------------------- ## - # The random seed - seed: 0 - # If use tensorboard - use_tensorboard: True - # if use wandb - use_wandb: True - # The torch device - device: cpu - # The torch device id - device_id: 0 - # The environment wrapper type - wrapper_type: CMDPWrapper - # Number of epochs - epochs: 500 - # Number of steps per epoch - steps_per_epoch: 6000 - # Update after `update_after` steps - update_after: 1000 - # Update every `update_every` steps - update_every: 100 - # Check if all models own the same parameter values every `check_freq` epochs - check_freq: 25 - # Save model to disk every `check_freq` epochs - save_freq: 50 - # The max length of per epoch - max_ep_len: 1000 - # The number of test episodes - num_test_episodes: 10 - # The learning rate of Actor network - actor_lr: 0.0003 - # The learning rate of Critic network - critic_lr: 0.001 - # The soft update coefficient - polyak: 0.999 - # The discount factor of GAE - gamma: 0.99 - # Actor perdorm random action before `start_steps` steps - start_steps: 10000 - # The Address for saving training process data - data_dir: "./runs" - ## ---------------------------Basic configurations for derived class SAC---------------------- ## - # The entropy coefficient - alpha: 0.2 - # The learning rate of Alpha - alpha_gamma: 0.99 - # Auto Alpha - auto_alpha: True - # The learning rate of Auto Alpha - alpha_lr: 0.0003 - - # ---------------------------------------Optional Configuration-------------------------------- # - ## -----------------------------------Configuration For Cost Critic--------------------------- ## - # Whether to use cost critic - use_cost: True - # Whether to use cost limit decay - cost_limit_decay: False - # The initial value of cost limit - init_cost_limit: 25.0 - # The target value of cost limit - target_cost_limit: 25.0 - # The end of cost limit decay epoch - end_epoch: 100 - # Cost discounted factor - cost_gamma: 1.0 - # Whether to use linear decay of learning rate - linear_lr_decay: False - # Whether to use exploration noise anneal - exploration_noise_anneal: False - # Whether to use reward penalty - reward_penalty: False - # Whether to use KL early stopping - kl_early_stopping: False - # Whether to use max gradient norm - use_max_grad_norm: False - # The thereshold of max gradient norm - max_grad_norm: 40 - # Whether to use critic network norm - use_critic_norm: False - # The norm coefficient of critic network - critic_norm_coeff: 0.001 - ## ---------------------------------------Configuration For Model----------------------------- ## - model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False - # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". - weight_initialization_mode: "kaiming_uniform" - # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian_stdnet - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: relu - # Output activation function - output_activation: tanh - # Whether to scale action. 
- scale_action: True - # Whether to clip action. - clip_action: True - # Whether to learn the standard deviation of Gaussian noise - std_learning: True - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 2 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: relu - ## --------------------------------------Configuration For Buffer----------------------------- ## - replay_buffer_cfgs: - # The size of replay buffer - size: 50000 - # The size of batch - batch_size: 1024 -## ----------------------------------Configuration For Lagrangian multiplier---------------------- ## - lagrange_cfgs: - # Tolerance of constraint violation - cost_limit: 25.0 - # Initial value of lagrangian multiplier - lagrangian_multiplier_init: 0.0 - # Learning rate of lagrangian multiplier - lambda_lr: 0.01 - # Type of lagrangian optimizer - lambda_optimizer: "Adam" - ## --------------------------------------Configuration For Environment------------------------ ## - env_cfgs: - # The number of parallel environments - num_envs: 1 - # Whether to use async environment - async_env: True - # Whether to use standardized reward - normalized_rew: False - # Whether to use standardized cost - normalized_cost: False - # Whether to use standardized obs - normalized_obs: True - # The maximum length of record queue - max_len: 100 - # The number of threads used to sample data - num_threads: 20 diff --git a/omnisafe/configs/off-policy/SACPid.yaml b/omnisafe/configs/off-policy/SACPid.yaml deleted file mode 100644 index bcaf20bff..000000000 --- a/omnisafe/configs/off-policy/SACPid.yaml +++ /dev/null @@ -1,178 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
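SACLag.yaml above adds a `lagrange_cfgs` block (`cost_limit`, `lagrangian_multiplier_init`, `lambda_lr`, `lambda_optimizer`). Those four fields describe a gradient-ascent Lagrange multiplier roughly like the sketch below; this illustrates the mechanism only and is not omnisafe's own Lagrange class.

    import torch

    class LagrangeSketch:
        """Gradient-ascent multiplier parameterized by the lagrange_cfgs fields above."""

        def __init__(self, cost_limit=25.0, multiplier_init=0.0, lambda_lr=0.01):
            self.cost_limit = cost_limit
            self.lagrangian_multiplier = torch.nn.Parameter(torch.tensor(multiplier_init))
            # `lambda_optimizer: "Adam"` in the config selects this optimizer class.
            self.optimizer = torch.optim.Adam([self.lagrangian_multiplier], lr=lambda_lr)

        def update(self, mean_ep_cost: float) -> float:
            # Ascend on the constraint violation: lambda grows while J_c > cost_limit.
            loss = -self.lagrangian_multiplier * (mean_ep_cost - self.cost_limit)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            # Keep the multiplier non-negative before it enters the policy loss.
            return max(self.lagrangian_multiplier.item(), 0.0)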
-# ============================================================================== - -defaults: - # --------------------------------------Basic Configurations----------------------------------- # - ## ----------------------------Basic configurations for base class DDPG----------------------- ## - # The random seed - seed: 0 - # If use tensorboard - use_tensorboard: True - # if use wandb - use_wandb: True - # The torch device - device: cpu - # The torch device id - device_id: 0 - # The environment wrapper type - wrapper_type: CMDPWrapper - # Number of epochs - epochs: 500 - # Number of steps per epoch - steps_per_epoch: 6000 - # Update after `update_after` steps - update_after: 1000 - # Update every `update_every` steps - update_every: 100 - # Check if all models own the same parameter values every `check_freq` epochs - check_freq: 25 - # Save model to disk every `check_freq` epochs - save_freq: 10 - # The max length of per epoch - max_ep_len: 400 - # The number of test episodes - num_test_episodes: 10 - # The learning rate of Actor network - actor_lr: 0.0003 - # The learning rate of Critic network - critic_lr: 0.001 - # The soft update coefficient - polyak: 0.999 - # The discount factor of GAE - gamma: 0.99 - # Actor perdorm random action before `start_steps` steps - start_steps: 10000 - # The Address for saving training process data - data_dir: "./runs" - ## ---------------------------Basic configurations for derived class SAC---------------------- ## - # The entropy coefficient - alpha: 0.2 - # The learning rate of Alpha - alpha_gamma: 0.99 - # Auto Alpha - auto_alpha: True - # The learning rate of Auto Alpha - alpha_lr: 0.0003 - - # ---------------------------------------Optional Configuration-------------------------------- # - ## -----------------------------------Configuration For Cost Critic--------------------------- ## - # Whether to use cost critic - use_cost: True - # Whether to use cost limit decay - cost_limit_decay: False - # The initial value of cost limit - init_cost_limit: 25.0 - # The target value of cost limit - target_cost_limit: 25.0 - # The end of cost limit decay epoch - end_epoch: 100 - # Cost discounted factor - cost_gamma: 1.0 - # Whether to use linear decay of learning rate - linear_lr_decay: False - # Whether to use exploration noise anneal - exploration_noise_anneal: False - # Whether to use reward penalty - reward_penalty: False - # Whether to use KL early stopping - kl_early_stopping: False - # Whether to use max gradient norm - use_max_grad_norm: False - # The thereshold of max gradient norm - max_grad_norm: 40 - # Whether to use critic network norm - use_critic_norm: False - # The norm coefficient of critic network - critic_norm_coeff: 0.001 - ## ---------------------------------------Configuration For Model----------------------------- ## - model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False - # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". - weight_initialization_mode: "kaiming_uniform" - # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian_stdnet - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: relu - # Output activation function - output_activation: tanh - # Whether to scale action. 
- scale_action: True - # Whether to clip action. - clip_action: True - # Whether to learn the standard deviation of Gaussian noise - std_learning: True - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 2 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: relu - ## --------------------------------------Configuration For Buffer----------------------------- ## - replay_buffer_cfgs: - # The size of replay buffer - size: 50000 - # The size of batch - batch_size: 2048 - ## --------------------------------------Configuration For PID--------------------------------- ## - PID_cfgs: - # KP for PID - pid_kp: 0.1 - # KI for PID - pid_ki: 0.003 - # KD for PID - pid_kd: 0.001 - # The init value of lagrangian multiplier - lagrangian_multiplier_init: 0.001 - # The delay rate of KD - pid_d_delay: 10 - # 0 for hard update, 1 for no update - pid_delta_p_ema_alpha: 0.95 - # The same as above - pid_delta_d_ema_alpha: 0.95 - # L = (J_r - lam * J_c) / (1 + lam); lam <= 0 - sum_norm: True - # L = (1 - lam) * J_r - lam * J_c; 0 <= lam <= 1 - diff_norm: False - # Only used if sum_norm=diff_norm=False - penalty_max: 100 - # Tolerance of violation - cost_limit: 100 - ## --------------------------------------Configuration For Environment------------------------ ## - env_cfgs: - # The number of parallel environments - num_envs: 1 - # Whether to use async environment - async_env: True - # Whether to use standardized reward - normalized_rew: False - # Whether to use standardized cost - normalized_cost: False - # Whether to use standardized obs - normalized_obs: True - # The maximum length of record queue - max_len: 100 - # The number of threads used to sample data - num_threads: 20 diff --git a/omnisafe/configs/off-policy/SDDPG.yaml b/omnisafe/configs/off-policy/SDDPG.yaml deleted file mode 100644 index 2295304aa..000000000 --- a/omnisafe/configs/off-policy/SDDPG.yaml +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
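SACPid.yaml above replaces the plain Lagrange update with a PID controller (`pid_kp`, `pid_ki`, `pid_kd`, the two EMA alphas, and `pid_d_delay`). The sketch below shows one way such a controller can be wired up, loosely following the PID-Lagrangian recipe; it simplifies the real algorithm and all names are illustrative. The `sum_norm` / `diff_norm` flags then decide how the resulting multiplier is folded into the combined loss, as the inline comments in the config spell out.

    from collections import deque

    class PIDLagrangianSketch:
        def __init__(self, kp=0.1, ki=0.003, kd=0.001, cost_limit=100.0, init=0.001,
                     d_delay=10, ema_p=0.95, ema_d=0.95, penalty_max=100.0):
            self.kp, self.ki, self.kd = kp, ki, kd
            self.cost_limit, self.penalty_max = cost_limit, penalty_max
            self.integral = init                      # I-term, starts at lagrangian_multiplier_init
            self.error_ema = 0.0                      # smoothed proportional error
            self.cost_ema = 0.0                       # smoothed cost for the derivative term
            self.ema_p, self.ema_d = ema_p, ema_d
            self.cost_history = deque([0.0] * d_delay, maxlen=d_delay)
            self.multiplier = init

        def update(self, ep_cost: float) -> float:
            error = ep_cost - self.cost_limit
            self.error_ema = self.ema_p * self.error_ema + (1 - self.ema_p) * error
            self.cost_ema = self.ema_d * self.cost_ema + (1 - self.ema_d) * ep_cost
            self.integral = max(self.integral + self.ki * error, 0.0)
            derivative = max(self.cost_ema - self.cost_history[0], 0.0)   # delayed difference
            self.cost_history.append(self.cost_ema)
            self.multiplier = max(
                self.kp * self.error_ema + self.integral + self.kd * derivative, 0.0)
            return min(self.multiplier, self.penalty_max)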
-# ============================================================================== - -defaults: - # --------------------------------------Basic Configurations----------------------------------- # - ## -----------------------------Basic configurations for base class PG------------------------ ## - # The random seed - seed: 10 - # If use tensorboard - use_tensorboard: True - # if use wandb - use_wandb: True - # The torch device - device: cpu - # The torch device id - device_id: 0 - # The environment wrapper type - wrapper_type: CMDPWrapper - # Number of epochs - epochs: 500 - # Number of steps per epoch - steps_per_epoch: 6000 - # Update after `update_after` steps - update_after: 1000 - # Update every `update_every` steps - update_every: 200 - # Check if all models own the same parameter values every `check_freq` epochs - check_freq: 25 - # Save model to disk every `check_freq` epochs - save_freq: 10 - # The max length of per epoch - max_ep_len: 1000 - # The number of test episodes - num_test_episodes: 10 - # The learning rate of Actor network - actor_lr: 0.0003 - # The learning rate of Critic network - critic_lr: 0.001 - # The soft update coefficient - polyak: 0.999 - # The discount factor of GAE - gamma: 0.99 - # Actor perdorm random action before `start_steps` steps - start_steps: 10000 - # The Address for saving training process data - data_dir: "./runs" - - ## ---------------------------Basic configurations for derived class SDDPG-------------------- ## - # The normalize coefficient - beta: 1.5 - # The discontinuous coefficient for conjugate gradient - cg_damping: 0.1 - # The max iteration for conjugate gradient - cg_iters: 10 - # The constraint for KL divergence - target_kl: 0.01 - # Hypperparameter for SDDPG - d_init: 5 - # ---------------------------------------Optional Configuration-------------------------------- # - ## -----------------------------------Configuration For Cost Critic--------------------------- ## - # Whether to use cost critic - use_cost: True - # Whether to use cost limit decay - cost_limit_decay: False - # The initial value of cost limit - init_cost_limit: 25.0 - # The target value of cost limit - target_cost_limit: 25.0 - # The end of cost limit decay epoch - end_epoch: 100 - # Cost discounted factor - cost_gamma: 1.0 - # Whether to use linear decay of learning rate - linear_lr_decay: False - # Whether to use exploration noise anneal - exploration_noise_anneal: False - # Whether to use reward penalty - reward_penalty: False - # Whether to use KL early stopping - kl_early_stopping: False - # Whether to use max gradient norm - use_max_grad_norm: True - # The thereshold of max gradient norm - max_grad_norm: 40 - # Whether to use critic network norm - use_critic_norm: True - # The norm coefficient of critic network - critic_norm_coeff: 0.001 - ## ---------------------------------------Configuration For Model----------------------------- ## - model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False - # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
- weight_initialization_mode: "kaiming_uniform" - # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: relu - # Output activation function - output_activation: tanh - # Whether to scale action. - scale_action: True - # Whether to clip action. - clip_action: True - # Whether to learn the standard deviation of Gaussian noise - std_learning: False - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: relu - ## --------------------------------------Configuration For Buffer----------------------------- ## - replay_buffer_cfgs: - # The size of replay buffer - size: 50000 - # The size of batch - batch_size: 256 - ## --------------------------------------Configuration For Environment------------------------ ## - env_cfgs: - # The number of parallel environments - num_envs: 1 - # Whether to use async environment - async_env: True - # Whether to use standardized reward - normalized_rew: False - # Whether to use standardized cost - normalized_cost: False - # Whether to use standardized obs - normalized_obs: True - # The maximum length of record queue - max_len: 100 - # The number of threads used to sample data - num_threads: 20 diff --git a/omnisafe/configs/off-policy/TD3.yaml b/omnisafe/configs/off-policy/TD3.yaml deleted file mode 100644 index 0b0aed4a1..000000000 --- a/omnisafe/configs/off-policy/TD3.yaml +++ /dev/null @@ -1,275 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
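Every off-policy config in this patch carries a `polyak` soft-update coefficient (0.995-0.999 above). For reference, the target networks it refers to are typically updated as in this small sketch; it is illustrative rather than a copy of omnisafe's code. With `polyak: 0.999` the target network moves about 0.1% of the way toward the online network per update.

    import torch

    @torch.no_grad()
    def polyak_update(net: torch.nn.Module, target_net: torch.nn.Module,
                      polyak: float = 0.999) -> None:
        # theta_target <- polyak * theta_target + (1 - polyak) * theta
        for param, target_param in zip(net.parameters(), target_net.parameters()):
            target_param.mul_(polyak).add_((1.0 - polyak) * param)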
-# ============================================================================== - -defaults: - # --------------------------------------Basic Configurations----------------------------------- # - ## ----------------------------Basic configurations for base class DDPG----------------------- ## - # The random seed - seed: 5 - # If use tensorboard - use_tensorboard: True - # if use wandb - use_wandb: True - # The torch device - device: cpu - # The torch device id - device_id: 0 - # The environment wrapper type - wrapper_type: CMDPWrapper - # Number of epochs - epochs: 500 - # Number of steps per epoch - steps_per_epoch: 6000 - # Update after `update_after` steps - update_after: 1000 - # Update every `update_every` steps - update_every: 50 - # Check if all models own the same parameter values every `check_freq` epochs - check_freq: 25 - # Save model to disk every `check_freq` epochs - save_freq: 10 - # The max length of per epoch - max_ep_len: 1000 - # The number of test episodes - num_test_episodes: 10 - # The learning rate of Actor network - actor_lr: 0.0001 - # The learning rate of Critic network - critic_lr: 0.0001 - # The soft update coefficient - polyak: 0.999 - # The discount factor of GAE - gamma: 0.99 - # Actor perdorm random action before `start_steps` steps - start_steps: 10000 - # The Address for saving training process data - data_dir: "./runs" - # ---------------------------------------Optional Configuration-------------------------------- # - ## -----------------------------------Configuration For Cost Critic--------------------------- ## - # Whether to use cost critic - use_cost: False - # Whether to use cost limit decay - cost_limit_decay: False - # The initial value of cost limit - init_cost_limit: 25.0 - # The target value of cost limit - target_cost_limit: 25.0 - # The end of cost limit decay epoch - end_epoch: 100 - # Cost discounted factor - cost_gamma: 1.0 - # Whether to use linear decay of learning rate - linear_lr_decay: False - # Whether to use exploration noise anneal - exploration_noise_anneal: False - # Whether to use max gradient norm - use_max_grad_norm: False - # The thereshold of max gradient norm - max_grad_norm: 40 - # Whether to use standardized observation - standardized_obs: True - # Whether to use critic network norm - use_critic_norm: False - # The norm coefficient of critic network - critic_norm_coeff: 0.001 - ## ---------------------------------------Configuration For Model----------------------------- ## - model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False - # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". - weight_initialization_mode: "kaiming_uniform" - # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: relu - # Output activation function - output_activation: tanh - # Whether to scale action. - scale_action: True - # Whether to clip action. 
- clip_action: True - # Whether to learn the standard deviation of Gaussian noise - std_learning: False - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 2 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: relu - ## --------------------------------------Configuration For Buffer----------------------------- ## - replay_buffer_cfgs: - # The size of replay buffer - size: 50000 - # The size of batch - batch_size: 256 - ## --------------------------------------Configuration For Environment------------------------ ## - env_cfgs: - # The seed of environment - env_seed: 0 - # The number of parallel environments - num_envs: 1 - # Whether to use async environment - async_env: True - # Whether to use standardized reward - normalized_rew: True - # Whether to use standardized cost - normalized_cost: True - # Whether to use standardized obs - normalized_obs: True - # The maximum length of record queue - max_len: 100 - # The number of threads used to sample data - num_threads: 20 - - -Pusher-v4: - # --------------------------------------Basic Configurations----------------------------------- # - ## ----------------------------Basic configurations for base class DDPG----------------------- ## - # The random seed - seed: 5 - # The torch device - device: cpu - # The torch device id - device_id: 0 - # The environment wrapper type - wrapper_type: GymWrapper - # Number of epochs - epochs: 500 - # Number of steps per epoch - steps_per_epoch: 4000 - # Update after `update_after` steps - update_after: 1000 - # Update every `update_every` steps - update_every: 50 - # Check if all models own the same parameter values every `check_freq` epochs - check_freq: 25 - # Save model to disk every `check_freq` epochs - save_freq: 10 - # The max length of per epoch - max_ep_len: 1000 - # The number of test episodes - num_test_episodes: 10 - # The learning rate of Actor network - actor_lr: 0.0001 - # The learning rate of Critic network - critic_lr: 0.0001 - # The soft update coefficient - polyak: 0.999 - # The discount factor of GAE - gamma: 0.99 - # Actor perdorm random action before `start_steps` steps - start_steps: 10000 - # The Address for saving training process data - data_dir: "./runs" - ## ---------------------------Basic configurations for derived class SAC---------------------- ## - # The entropy coefficient - alpha: 0.2 - # The learning rate of Alpha - alpha_gamma: 1.0 - # Auto Alpha - auto_alpha: True - # The learning rate of Auto Alpha - alpha_lr: 0.0003 - - # ---------------------------------------Optional Configuration-------------------------------- # - ## -----------------------------------Configuration For Cost Critic--------------------------- ## - # Whether to use cost critic - use_cost: False - # Whether to use cost limit decay - cost_limit_decay: False - # The initial value of cost limit - init_cost_limit: 25.0 - # The target value of cost limit - target_cost_limit: 25.0 - # The end of cost limit decay epoch - end_epoch: 100 - # Cost discounted factor - cost_gamma: 1.0 - # Whether to use linear decay of learning rate - linear_lr_decay: True - # Whether to use exploration noise anneal - exploration_noise_anneal: False - # Whether to use reward penalty - reward_penalty: False - # Whether to use KL early stopping - kl_early_stopping: False - # Whether to use max gradient norm - use_max_grad_norm: False - # The thereshold of max gradient norm - 
max_grad_norm: 40 - # Whether to use critic network norm - use_critic_norm: False - # The norm coefficient of critic network - critic_norm_coeff: 0.001 - ## ---------------------------------------Configuration For Model----------------------------- ## - model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False - # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". - weight_initialization_mode: "kaiming_uniform" - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: dire - # The standard deviation of Gaussian noise - act_noise: 0.1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: relu - # Whether to scale action - scale_action: True - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 2 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: relu - ## --------------------------------------Configuration For Buffer----------------------------- ## - replay_buffer_cfgs: - # The size of replay buffer - size: 50000 - # The size of batch - batch_size: 1024 - ## --------------------------------------Configuration For Environment------------------------ ## - env_cfgs: - # The number of parallel environments - num_envs: 1 - # Whether to use async environment - async_env: True - # Whether to use standardized reward - normalized_rew: False - # Whether to use standardized cost - normalized_cost: False - # Whether to use standardized obs - normalized_obs: True - # The maximum length of record queue - max_len: 100 - # The number of threads used to sample data - num_threads: 20 diff --git a/omnisafe/configs/off-policy/TD3Lag.yaml b/omnisafe/configs/off-policy/TD3Lag.yaml deleted file mode 100644 index 4f9fd0c62..000000000 --- a/omnisafe/configs/off-policy/TD3Lag.yaml +++ /dev/null @@ -1,158 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -defaults: - # --------------------------------------Basic Configurations----------------------------------- # - ## ----------------------------Basic configurations for base class DDPG----------------------- ## - # The random seed - seed: 0 - # If use tensorboard - use_tensorboard: True - # if use wandb - use_wandb: True - # The torch device - device: cpu - # The torch device id - device_id: 0 - # The environment wrapper type - wrapper_type: CMDPWrapper - # Number of epochs - epochs: 500 - # Number of steps per epoch - steps_per_epoch: 6000 - # Update after `update_after` steps - update_after: 1000 - # Update every `update_every` steps - update_every: 50 - # Check if all models own the same parameter values every `check_freq` epochs - check_freq: 25 - # Save model to disk every `check_freq` epochs - save_freq: 100 - # The max length of per epoch - max_ep_len: 1000 - # The number of test episodes - num_test_episodes: 10 - # The learning rate of Actor network - actor_lr: 0.0003 - # The learning rate of Critic network - critic_lr: 0.0003 - # The soft update coefficient - polyak: 0.995 - # The discount factor of GAE - gamma: 0.99 - # Actor perdorm random action before `start_steps` steps - start_steps: 10000 - # The Address for saving training process data - data_dir: "./runs" - - - # ---------------------------------------Optional Configuration-------------------------------- # - ## -----------------------------------Configuration For Cost Critic--------------------------- ## - # Whether to use cost critic - use_cost: True - # Whether to use cost limit decay - cost_limit_decay: True - # The initial value of cost limit - init_cost_limit: 100.0 - # The target value of cost limit - target_cost_limit: 25.0 - # The end od cost limit decay epoch - end_epoch: 100 - # Cost discounted factor - cost_gamma: 1.0 - # Whether to use linear decay of learning rate - linear_lr_decay: False - # Whether to use exploration noise anneal - exploration_noise_anneal: False - # Whether to use reward penalty - reward_penalty: False - # Whether to use KL early stopping - kl_early_stopping: False - # Whether to use max gradient norm - use_max_grad_norm: False - # The thereshold of max gradient norm - max_grad_norm: 40 - # Whether to use standardized observation - standardized_obs: True - # Whether to use critic network norm - use_critic_norm: False - # The norm coefficient of critic network - critic_norm_coeff: 0.0001 - ## ---------------------------------------Configuration For Model----------------------------- ## - model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False - # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". - weight_initialization_mode: "kaiming_uniform" - # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: relu - # Output activation function - output_activation: tanh - # Whether to scale action. - scale_action: True - # Whether to clip action. 
- clip_action: True - # Whether to learn the standard deviation of Gaussian noise - std_learning: False - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 2 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: relu - ## --------------------------------------Configuration For Buffer------------------------------- ## - replay_buffer_cfgs: - # The size of replay buffer - size: 50000 - # The size of batch - batch_size: 256 -## ----------------------------------Configuration For Lagrangian multiplier---------------------- ## - lagrange_cfgs: - # Tolerance of constraint violation - cost_limit: 25.0 - # Initial value of lagrangian multiplier - lagrangian_multiplier_init: 0.0 - # Learning rate of lagrangian multiplier - lambda_lr: 0.01 - # Type of lagrangian optimizer - lambda_optimizer: "Adam" - ## --------------------------------------Configuration For Environment------------------------ ## - env_cfgs: - # The number of parallel environments - num_envs: 1 - # Whether to use async environment - async_env: True - # Whether to use standardized reward - normalized_rew: False - # Whether to use standardized cost - normalized_cost: False - # Whether to use standardized obs - normalized_obs: True - # The maximum length of record queue - max_len: 100 - # The number of threads used to sample data - num_threads: 20 diff --git a/omnisafe/configs/off-policy/TD3Pid.yaml b/omnisafe/configs/off-policy/TD3Pid.yaml deleted file mode 100644 index 794027ef9..000000000 --- a/omnisafe/configs/off-policy/TD3Pid.yaml +++ /dev/null @@ -1,170 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
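TD3Lag.yaml above is the one config here that turns on `cost_limit_decay: True`, tightening from `init_cost_limit: 100.0` to `target_cost_limit: 25.0` by `end_epoch: 100`. The patch does not show the interpolation itself; a linear schedule is one plausible reading, sketched below with illustrative names.

    def decayed_cost_limit(epoch: int, init_limit: float = 100.0,
                           target_limit: float = 25.0, end_epoch: int = 100) -> float:
        """Linear anneal implied by cost_limit_decay / init_cost_limit / target_cost_limit."""
        if epoch >= end_epoch:
            return target_limit
        frac = epoch / end_epoch
        return init_limit + frac * (target_limit - init_limit)

    # e.g. epoch 0 -> 100.0, epoch 50 -> 62.5, epoch 100 and beyond -> 25.0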
-# ============================================================================== - -defaults: - # --------------------------------------Basic Configurations----------------------------------- # - ## ----------------------------Basic configurations for base class DDPG----------------------- ## - # The random seed - seed: 5 - # If use tensorboard - use_tensorboard: True - # if use wandb - use_wandb: True - # The torch device - device: cpu - # The torch device id - device_id: 0 - # The environment wrapper type - wrapper_type: CMDPWrapper - # Number of epochs - epochs: 500 - # Number of steps per epoch - steps_per_epoch: 6000 - # Update after `update_after` steps - update_after: 1000 - # Update every `update_every` steps - update_every: 50 - # Check if all models own the same parameter values every `check_freq` epochs - check_freq: 25 - # Save model to disk every `check_freq` epochs - save_freq: 10 - # The max length of per epoch - max_ep_len: 1000 - # The number of test episodes - num_test_episodes: 10 - # The learning rate of Actor network - actor_lr: 0.0003 - # The learning rate of Critic network - critic_lr: 0.001 - # The soft update coefficient - polyak: 0.999 - # The discount factor of GAE - gamma: 0.99 - # Actor perdorm random action before `start_steps` steps - start_steps: 10000 - # The Address for saving training process data - data_dir: "./runs" - # ---------------------------------------Optional Configuration-------------------------------- # - ## -----------------------------------Configuration For Cost Critic--------------------------- ## - # Whether to use cost critic - use_cost: True - # Whether to use cost limit decay - cost_limit_decay: False - # The initial value of cost limit - init_cost_limit: 25.0 - # The target value of cost limit - target_cost_limit: 25.0 - # The end od cost limit decay epoch - end_epoch: 100 - # Cost discounted factor - cost_gamma: 1.0 - # Whether to use linear decay of learning rate - linear_lr_decay: False - # Whether to use exploration noise anneal - exploration_noise_anneal: False - # Whether to use reward penalty - reward_penalty: False - # Whether to use KL early stopping - kl_early_stopping: False - # Whether to use max gradient norm - use_max_grad_norm: False - # The thereshold of max gradient norm - max_grad_norm: 40 - # Whether to use standardized observation - standardized_obs: True - # Whether to use critic network norm - use_critic_norm: False - # The norm coefficient of critic network - critic_norm_coeff: 0.001 - ## ---------------------------------------Configuration For Model----------------------------- ## - model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False - # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". - weight_initialization_mode: "kaiming_uniform" - # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: relu - # Output activation function - output_activation: tanh - # Whether to scale action. - scale_action: True - # Whether to clip action. 
- clip_action: True - # Whether to learn the standard deviation of Gaussian noise - std_learning: False - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 2 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: relu - ## --------------------------------------Configuration For Buffer----------------------------- ## - replay_buffer_cfgs: - # The size of replay buffer - size: 50000 - # The size of batch - batch_size: 256 - ## --------------------------------------Configuration For PID--------------------------------- ## - PID_cfgs: - # KP for PID - pid_kp: 0.1 - # KI for PID - pid_ki: 0.003 - # KD for PID - pid_kd: 0.001 - # The init value of lagrangian multiplier - lagrangian_multiplier_init: 0.001 - # The delay rate of KD - pid_d_delay: 10 - # 0 for hard update, 1 for no update - pid_delta_p_ema_alpha: 0.95 - # The same as above - pid_delta_d_ema_alpha: 0.95 - # L = (J_r - lam * J_c) / (1 + lam); lam <= 0 - sum_norm: True - # L = (1 - lam) * J_r - lam * J_c; 0 <= lam <= 1 - diff_norm: False - # Only used if sum_norm=diff_norm=False - penalty_max: 100 - # Tolerance of violation - cost_limit: 25 - ## --------------------------------------Configuration For Environment------------------------ ## - env_cfgs: - # The number of parallel environments - num_envs: 1 - # Whether to use async environment - async_env: True - # Whether to use standardized reward - normalized_rew: False - # Whether to use standardized cost - normalized_cost: False - # Whether to use standardized obs - normalized_obs: True - # The maximum length of record queue - max_len: 100 - # The number of threads used to sample data - num_threads: 20 diff --git a/omnisafe/configs/on-policy/CPO.yaml b/omnisafe/configs/on-policy/CPO.yaml index 55385d558..36054c030 100644 --- a/omnisafe/configs/on-policy/CPO.yaml +++ b/omnisafe/configs/on-policy/CPO.yaml @@ -14,143 +14,113 @@ # ============================================================================== defaults: - # --------------------------------------Basic Configurations----------------------------------- # - ## -----------------------------Basic configurations for base class PG------------------------ ## - # The random seed + # seed for random number generator seed: 0 - # The number of threads used per experiment - num_threads: 1 - # If use tensorboard - use_tensorboard: True - # if use wandb - use_wandb: True - # The torch device - device: cpu - # The torch device id - device_id: 0 - # The environment wrapper type - wrapper_type: CMDPWrapper - # Number of epochs - epochs: 500 - # Number of steps per epoch - steps_per_epoch: 32768 - # Number of update iteration for Actor network - actor_iters: 10 - # Number of update iteration for Critic network - critic_iters: 40 - # Check if all models own the same parameter values every `check_freq` epoch - check_freq: 25 - # Save model to disk every `check_freq` epochs - save_freq: 100 - # Entropy coefficient for PPO loss - entropy_coef: 0.0 - # The max length of per epoch - max_ep_len: 1000 - # The size of mini batch - num_mini_batches: 16384 - # The learning rate of Actor network - actor_lr: 0.001 - # The learning rate of Critic network - critic_lr: 0.001 - # The Address for saving training process data - data_dir: "./runs" - ## --------------------------Basic configurations for derived class NaturalPG----------------- ## - # The thereshold for KL early stopping - target_kl: 0.01 - # Tolerance 
of constraint violation - cost_limit: 25 - # Damping value for conjugate gradient - cg_damping: 0.1 - # Number of conjugate gradient iterations - cg_iters: 10 - # Subsampled observation - fvp_obs: None - # The number of parallel environments - num_envs: 32 - # Whether to use standardized reward - reward_normalize: True - # Whether to use standardized cost - cost_normalize: True - # Whether to use standardized obs - obs_normalize: True - - # ---------------------------------------Optional Configuration-------------------------------- # - ## -----------------------------------Configuration For Cost Critic--------------------------- ## - # Whether to use cost critic - use_cost: True - # Cost discounted factor - cost_gamma: 1.0 - # Whether to use linear decay of learning rate - linear_lr_decay: False - # Whether to use exploration noise anneal - exploration_noise_anneal: False - # std - std: [0.5, 0.1] - # The coefficient of reward penalty - penalty_param: 0.0 - # Whether to use KL early stopping - kl_early_stopping: True - # Whether to use max gradient norm - use_max_grad_norm: True - # The thereshold of max gradient norm - max_grad_norm: 40 - # Whether to use standardized observation - standardized_obs: True - # Whether to use critic network norm - use_critic_norm: True - # The norm coefficient of critic network - critic_norm_coeff: 0.001 - ## ---------------------------------------Configuration For Model----------------------------- ## + # training configurations + train_cfgs: + # device to use for training, options: cpu, cuda, cuda:0, cuda:0,1, etc. + device: cpu + # number of threads for torch + torch_threads: 16 + # number of vectorized environments + vector_env_nums: 16 + # number of parallel agent, similar to a3c + parallel: 1 + # total number of steps to train + total_steps: 16384000 + # algorithm configurations + algo_cfgs: + # number of steps to update the policy + update_cycle: 32768 + # number of iterations to update the policy + update_iters: 10 + # batch size for each iteration + batch_size: 16384 + # target kl divergence + target_kl: 0.01 + # entropy coefficient + entropy_coef: 0.0 + # normalize reward + reward_normalize: True + # normalize cost + cost_normalize: True + # normalize observation + obs_normalize: True + # use max gradient norm + use_max_grad_norm: True + # max gradient norm + max_grad_norm: 40 + # use critic norm + use_critic_norm: True + # critic norm coefficient + critic_norm_coef: 0.001 + # reward discount factor + gamma: 0.99 + # cost discount factor + cost_gamma: 0.99 + # lambda for gae + lam: 0.95 + # lambda for cost gae + lam_c: 0.95 + # Tolerance of constraint violation + cost_limit: 25 + # damping value for conjugate gradient + cg_damping: 0.1 + # number of conjugate gradient iterations + cg_iters: 10 + # subsampled obs + fvp_obs: None + # advantage estimation method, options: gae, retrace + adv_estimation_method: gae + # standardize reward advantage + standardized_rew_adv: True + # standardize cost advantage + standardized_cost_adv: True + # penalty coefficient + penalty_coef: 0.0 + # use cost + use_cost: False + # use kl early stop + kl_early_stop: True + # logger configurations + logger_cfgs: + # use wandb for logging + use_wandb: False + # wandb project name + wandb_project: omnisafe + # use tensorboard for logging + use_tensorboard: True + # save model frequency + save_model_freq: 100 + # save logger path + log_dir: "./runs" + # save model path + window_lens: 100 + # model configurations model_cfgs: - # The mode to initiate the weight of network, 
choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". + # weight initialization mode weight_initialization_mode: "kaiming_uniform" - # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" + # actor type, options: gaussian, gaussian_learning actor_type: gaussian_learning - # Whether to use linear decay of learning rate + # linear learning rate decay linear_lr_decay: True - # Configuration of Actor network + # exploration noise anneal + exploration_noise_anneal: False + # std upper bound, and lower bound + std_range: [0.5, 0.1] + # actor network configurations actor: - # Size of hidden layers + # hidden layer sizes hidden_sizes: [64, 64] - # Activation function + # activation function activation: tanh - # The learning rate of Actor network - lr: 0.0003 - # Configuration of Critic network + # out_activation: tanh + # learning rate + lr: None critic: - # Size of hidden layers + # hidden layer sizes hidden_sizes: [64, 64] - # Activation function + # activation function activation: tanh - # The learning rate of Critic network - lr: 0.0003 - ## --------------------------------------Configuration For Buffer----------------------------- ## - buffer_cfgs: - # Reward discounted factor - gamma: 0.99 - # Parameters used to estimate future rewards in GAE - lam: 0.95 - # Parameters used to estimate future costs in GAE - lam_c: 0.95 - # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" - adv_estimation_method: gae - # Whether to use reward standardized advantage estimation - standardized_rew_adv: True - # Whether to use cost standardized advantage estimation - standardized_cost_adv: True - ## --------------------------------------Configuration For Environment------------------------ ## - env_cfgs: - # The number of parallel environments - num_envs: 8 - # Whether to use async environment - async_env: True - # Whether to use standardized reward - normalized_rew: False - # Whether to use standardized cost - normalized_cost: False - # Whether to use standardized obs - normalized_obs: True - # The maximum length of record queue - max_len: 100 - # The number of threads used to sample data - num_threads: 20 + # learning rate + lr: 0.001 diff --git a/omnisafe/configs/on-policy/CPPOPid.yaml b/omnisafe/configs/on-policy/CPPOPid.yaml deleted file mode 100644 index e97b2e738..000000000 --- a/omnisafe/configs/on-policy/CPPOPid.yaml +++ /dev/null @@ -1,176 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
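The rewritten CPO.yaml above shows the layout this PR introduces: everything hangs off `defaults` in a few groups (`train_cfgs`, `algo_cfgs`, `logger_cfgs`, `model_cfgs`) instead of one flat key space. The patch also touches `omnisafe/utils/config.py`, which is not reproduced here; the sketch below only illustrates, with assumed helper names, how such a nested YAML can be loaded, overridden, and accessed. Grouping related keys under one sub-tree means an override only needs to name the group it changes.

    from types import SimpleNamespace
    import yaml

    def to_namespace(d):
        # Recursively turn nested dicts into attribute-style access.
        if isinstance(d, dict):
            return SimpleNamespace(**{k: to_namespace(v) for k, v in d.items()})
        return d

    def deep_update(base: dict, overrides: dict) -> dict:
        # Merge overrides into base without flattening the untouched sub-trees.
        for key, value in overrides.items():
            if isinstance(value, dict) and isinstance(base.get(key), dict):
                deep_update(base[key], value)
            else:
                base[key] = value
        return base

    with open('omnisafe/configs/on-policy/CPO.yaml', encoding='utf-8') as f:
        raw = yaml.safe_load(f)['defaults']

    # e.g. shrink the run for a quick smoke test
    deep_update(raw, {'train_cfgs': {'total_steps': 2048, 'vector_env_nums': 1}})
    cfgs = to_namespace(raw)
    print(cfgs.algo_cfgs.update_cycle, cfgs.model_cfgs.actor.hidden_sizes)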
-# ============================================================================== - -defaults: - # --------------------------------------Basic Configurations----------------------------------- # - ## -----------------------------Basic configurations for base class PG------------------------ ## - # The random seed - seed: 0 - # The number of threads used per experiment - num_threads: 1 - # If use tensorboard - use_tensorboard: True - # if use wandb - use_wandb: True - # The torch device - device: cpu - # The torch device id - device_id: 0 - # The environment wrapper type - wrapper_type: CMDPWrapper - # Number of epochs - epochs: 500 - # Number of steps per epoch - steps_per_epoch: 32768 - # Number of update iteration for Actor network - actor_iters: 10 - # Number of update iteration for Critic network - critic_iters: 40 - # Check if all models own the same parameter values every `check_freq` epoch - check_freq: 25 - # Save model to disk every `check_freq` epochs - save_freq: 100 - # Entropy coefficient for PPO loss - entropy_coef: 0.0 - # The max length of per epoch - max_ep_len: 1000 - # The size of mini batch - num_mini_batches: 512 - # The learning rate of Actor network - actor_lr: 0.0003 - # The learning rate of Critic network - critic_lr: 0.0003 - # The Address for saving training process data - data_dir: "./runs" - ## ---------------------------Basic configurations for derived class PPO---------------------- ## - # The thereshold for KL early stopping - target_kl: 0.02 - # The size of batch for policy update - batch_size: 10000 - # The clip range for PPO loss - clip: 0.2 - # The number of parallel environments - num_envs: 32 - # Whether to use standardized reward - reward_normalize: True - # Whether to use standardized cost - cost_normalize: True - # Whether to use standardized obs - obs_normalize: True - - # ---------------------------------------Optional Configuration-------------------------------- # - ## -----------------------------------Configuration For Cost Critic--------------------------- ## - # Whether to use cost critic - use_cost: True - # Cost discounted factor - cost_gamma: 1.0 - # Whether to use linear decay of learning rate - linear_lr_decay: True - # Whether to use exploration noise anneal - exploration_noise_anneal: False - # std - std: [0.5, 0.1] - # The coefficient of reward penalty - penalty_param: 0.0 - # Whether to use KL early stopping - kl_early_stopping: True - # Whether to use max gradient norm - use_max_grad_norm: True - # The thereshold of max gradient norm - max_grad_norm: 40 - # Whether to use standardized observation - standardized_obs: True - # Whether to use critic network norm - use_critic_norm: True - # The norm coefficient of critic network - critic_norm_coeff: 0.001 - ## ---------------------------------------Configuration For Model----------------------------- ## - model_cfgs: - # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
- weight_initialization_mode: "kaiming_uniform" - # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian_learning - # Whether to use linear decay of learning rate - linear_lr_decay: True - # Configuration of Actor network - actor: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # The learning rate of Actor network - lr: 0.0003 - # Configuration of Critic network - critic: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # The learning rate of Critic network - lr: 0.0003 - ## --------------------------------------Configuration For Buffer----------------------------- ## - buffer_cfgs: - # Reward discounted factor - gamma: 0.99 - # Parameters used to estimate future rewards in GAE - lam: 0.95 - # Parameters used to estimate future costs in GAE - lam_c: 0.95 - # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" - adv_estimation_method: gae - # Whether to use reward standardized advantage estimation - standardized_rew_adv: True - # Whether to use cost standardized advantage estimation - standardized_cost_adv: True - ## --------------------------------------Configuration For Environment------------------------ ## - env_cfgs: - # The number of parallel environments - num_envs: 8 - # Whether to use async environment - async_env: True - # Whether to use standardized reward - normalized_rew: False - # Whether to use standardized cost - normalized_cost: False - # Whether to use standardized obs - normalized_obs: True - # The maximum length of record queue - max_len: 100 - # The number of threads used to sample data - num_threads: 20 -## --------------------------------------Configuration For PID--------------------------------- ## - PID_cfgs: - # KP for PID - pid_kp: 0.01 - # KI for PID - pid_ki: 0.01 - # KD for PID - pid_kd: 0.01 - # The init value of lagrangian multiplier - lagrangian_multiplier_init: 0.001 - # The delay rate of KD - pid_d_delay: 10 - # 0 for hard update, 1 for no update - pid_delta_p_ema_alpha: 0.95 - # The same as above - pid_delta_d_ema_alpha: 0.95 - # L = (J_r - lam * J_c) / (1 + lam); lam <= 0 - sum_norm: True - # L = (1 - lam) * J_r - lam * J_c; 0 <= lam <= 1 - diff_norm: False - # Only used if sum_norm=diff_norm=False - penalty_max: 100 - # Tolerance of violation - cost_limit: 25.0 diff --git a/omnisafe/configs/on-policy/CUP.yaml b/omnisafe/configs/on-policy/CUP.yaml index 30865cc92..49102c532 100644 --- a/omnisafe/configs/on-policy/CUP.yaml +++ b/omnisafe/configs/on-policy/CUP.yaml @@ -14,150 +14,111 @@ # ============================================================================== defaults: - # --------------------------------------Basic Configurations----------------------------------- # - ## -----------------------------Basic configurations for base class PG------------------------ ## - # The random seed + # seed for random number generator seed: 0 - # The number of threads used per experiment - num_threads: 1 - # If use tensorboard - use_tensorboard: True - # if use wandb - use_wandb: True - # The torch device - device: cpu - # The torch device id - device_id: 0 - # The environment wrapper type - wrapper_type: CMDPWrapper - # Number of epochs - epochs: 500 - # Number of steps per epoch - steps_per_epoch: 32768 - # Number of update iteration for Actor network - actor_iters: 10 - # Number of update iteration for Critic network - critic_iters: 40 - # 
Check if all models own the same parameter values every `check_freq` epoch - check_freq: 25 - # Save model to disk every `check_freq` epochs - save_freq: 100 - # Entropy coefficient for PPO loss - entropy_coef: 0.0 - # The max length of per epoch - max_ep_len: 1000 - # The size of mini batch - num_mini_batches: 64 - # The learning rate of Actor network - actor_lr: 0.0003 - # The learning rate of Critic network - critic_lr: 0.0003 - # The Address for saving training process data - data_dir: "./runs" - # The number of parallel environments - num_envs: 32 - # Whether to use standardized reward - reward_normalize: True - # Whether to use standardized cost - cost_normalize: True - # Whether to use standardized obs - obs_normalize: True - - ## ---------------------------Basic configurations for derived class FOCOPS------------------- ## - # The thereshold for KL early stopping - target_kl: 0.01 - # Tolerance of constraint violation - cost_limit: 25.0 - # The thereshold for KL divergence in each policy update - eta: 0.02 - # The hyperparameters related to the greediness of the algorithm - lam: 1.5 - # The size of batch for policy update - batch_size: 2000 - # The value to clip surrogate function - clip: 0.2 - - # ---------------------------------------Optional Configuration-------------------------------- # - ## -----------------------------------Configuration For Cost Critic--------------------------- ## - # Whether to use cost critic - use_cost: True - # Cost discounted factor - cost_gamma: 1.0 - # Whether to use linear decay of learning rate - linear_lr_decay: True - # Whether to use exploration noise anneal - exploration_noise_anneal: False - # std - std: [0.5, 0.1] - # The coefficient of reward penalty - penalty_param: 0.0 - # Whether to use KL early stopping - kl_early_stopping: True - # Whether to use max gradient norm - use_max_grad_norm: True - # The thereshold of max gradient norm - max_grad_norm: 40 - # Whether to use standardized observation - standardized_obs: True - # Whether to use critic network norm - use_critic_norm: True - # The norm coefficient of critic network - critic_norm_coeff: 0.001 - ## ---------------------------------------Configuration For Model----------------------------- ## + # training configurations + train_cfgs: + # device to use for training, options: cpu, cuda, cuda:0, cuda:0,1, etc. 
+ device: cpu + # number of threads for torch + torch_threads: 16 + # number of vectorized environments + vector_env_nums: 16 + # number of parallel agent, similar to a3c + parallel: 1 + # total number of steps to train + total_steps: 16384000 + # algorithm configurations + algo_cfgs: + # number of steps to update the policy + update_cycle: 32768 + # number of iterations to update the policy + update_iters: 40 + # batch size for each iteration + batch_size: 64 + # target kl divergence + target_kl: 0.01 + # entropy coefficient + entropy_coef: 0.0 + # normalize reward + reward_normalize: True + # normalize cost + cost_normalize: True + # normalize observation + obs_normalize: True + # early stop when kl divergence is bigger than target kl + kl_early_stop: True + # use max gradient norm + use_max_grad_norm: True + # max gradient norm + max_grad_norm: 40 + # use critic norm + use_critic_norm: True + # critic norm coefficient + critic_norm_coef: 0.001 + # reward discount factor + gamma: 0.99 + # cost discount factor + cost_gamma: 0.99 + # lambda for gae + lam: 0.95 + # lambda for cost gae + lam_c: 0.95 + # clip ratio + clip: 0.2 + # advantage estimation method, options: gae, retrace + adv_estimation_method: gae + # standardize reward advantage + standardized_rew_adv: True + # standardize cost advantage + standardized_cost_adv: True + # penalty coefficient + penalty_coef: 0.0 + # use cost + use_cost: True + # logger configurations + logger_cfgs: + # use wandb for logging + use_wandb: False + # wandb project name + wandb_project: omnisafe + # use tensorboard for logging + use_tensorboard: True + # save model frequency + save_model_freq: 100 + # save logger path + log_dir: "./runs" + # save model path + window_lens: 100 + # model configurations model_cfgs: - # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
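
The restructured CUP defaults above fold the old flat keys into nested `train_cfgs`, `algo_cfgs`, and `logger_cfgs` blocks. A minimal sketch of how that nesting can be consumed, assuming plain PyYAML rather than the repository's own config loader; deriving the number of policy updates from `total_steps` and `update_cycle` is an inference from the key comments in this patch, not something the patch states:

import yaml

# Load the new-style config (path taken from this patch; loader is illustrative).
with open('omnisafe/configs/on-policy/CUP.yaml', 'r', encoding='utf-8') as f:
    cfg = yaml.safe_load(f)['defaults']

train_cfgs = cfg['train_cfgs']
algo_cfgs = cfg['algo_cfgs']

# The old flat `epochs` key is gone; the update count now follows from
# total_steps (all environment steps) and update_cycle (steps per policy update).
num_updates = train_cfgs['total_steps'] // algo_cfgs['update_cycle']   # 16384000 // 32768 = 500
steps_per_env = algo_cfgs['update_cycle'] // train_cfgs['vector_env_nums']  # 32768 // 16 = 2048
print(num_updates, steps_per_env)
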
+ # weight initialization mode weight_initialization_mode: "kaiming_uniform" - # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" + # actor type, options: gaussian, gaussian_learning actor_type: gaussian_learning - # Whether to use linear decay of learning rate + # linear learning rate decay linear_lr_decay: True - # Configuration of Actor network + # exploration noise anneal + exploration_noise_anneal: False + # std upper bound, and lower bound + std_range: [0.5, 0.1] + # actor network configurations actor: - # Size of hidden layers + # hidden layer sizes hidden_sizes: [64, 64] - # Activation function + # activation function activation: tanh - # The learning rate of Actor network + # out_activation: tanh + # learning rate lr: 0.0003 - # Configuration of Critic network critic: - # Size of hidden layers + # hidden layer sizes hidden_sizes: [64, 64] - # Activation function + # activation function activation: tanh - # The learning rate of Critic network + # learning rate lr: 0.0003 - ## --------------------------------------Configuration For Buffer----------------------------- ## - buffer_cfgs: - # Reward discounted factor - gamma: 0.99 - # Parameters used to estimate future rewards in GAE - lam: 0.95 - # Parameters used to estimate future costs in GAE - lam_c: 0.95 - # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" - adv_estimation_method: gae - # Whether to use reward standardized advantage estimation - standardized_rew_adv: True - # Whether to use cost standardized advantage estimation - standardized_cost_adv: True - ## --------------------------------------Configuration For Environment------------------------ ## - env_cfgs: - # The number of parallel environments - num_envs: 8 - # Whether to use async environment - async_env: True - # Whether to use standardized reward - normalized_rew: False - # Whether to use standardized cost - normalized_cost: False - # Whether to use standardized obs - normalized_obs: True - # The maximum length of record queue - max_len: 100 - # The number of threads used to sample data - num_threads: 20 -## ----------------------------------Configuration For Lagrangian multiplier---------------------- ## + # lagrangian configurations lagrange_cfgs: # Tolerance of constraint violation cost_limit: 25.0 diff --git a/omnisafe/configs/on-policy/FOCOPS.yaml b/omnisafe/configs/on-policy/FOCOPS.yaml index 781942315..5903e0867 100644 --- a/omnisafe/configs/on-policy/FOCOPS.yaml +++ b/omnisafe/configs/on-policy/FOCOPS.yaml @@ -14,148 +14,115 @@ # ============================================================================== defaults: - # --------------------------------------Basic Configurations----------------------------------- # - ## -----------------------------Basic configurations for base class PG------------------------ ## - # The random seed + # seed for random number generator seed: 0 - # The number of threads used per experiment - num_threads: 1 - # If use tensorboard - use_tensorboard: True - # if use wandb - use_wandb: True - # The torch device - device: cpu - # The torch device id - device_id: 0 - # The environment wrapper type - wrapper_type: CMDPWrapper - # Number of epochs - epochs: 500 - # Number of steps per epoch - steps_per_epoch: 32768 - # Number of update iteration for Actor network - actor_iters: 10 - # Number of update iteration for Critic network - critic_iters: 40 - # Check if all models own the same parameter values every `check_freq` 
epoch - check_freq: 25 - # Save model to disk every `check_freq` epochs - save_freq: 100 - # Entropy coefficient for PPO loss - entropy_coef: 0.0 - # The max length of per epoch - max_ep_len: 1000 - # The size of mini batch - num_mini_batches: 64 - # The learning rate of Actor network - actor_lr: 0.0003 - # The learning rate of Critic network - critic_lr: 0.0003 - # The Address for saving training process data - data_dir: "./runs" - - ## ---------------------------Basic configurations for derived class FOCOPS------------------- ## - # The thereshold for KL early stopping - target_kl: 0.01 - # Tolerance of constraint violation - cost_limit: 25.0 - # The thereshold for KL divergence in each policy update - eta: 0.02 - # The hyperparameters related to the greediness of the algorithm - lam: 1.5 - # The size of batch for policy update - batch_size: 2000 - # The number of parallel environments - num_envs: 32 - # Whether to use standardized reward - reward_normalize: True - # Whether to use standardized cost - cost_normalize: True - # Whether to use standardized obs - obs_normalize: True - - # ---------------------------------------Optional Configuration-------------------------------- # - ## -----------------------------------Configuration For Cost Critic--------------------------- ## - # Whether to use cost critic - use_cost: True - # Cost discounted factor - cost_gamma: 1.0 - # Whether to use linear decay of learning rate - linear_lr_decay: True - # Whether to use exploration noise anneal - exploration_noise_anneal: False - # std - std: [0.5, 0.1] - # The coefficient of reward penalty - penalty_param: 0.0 - # Whether to use KL early stopping - kl_early_stopping: True - # Whether to use max gradient norm - use_max_grad_norm: True - # The thereshold of max gradient norm - max_grad_norm: 40 - # Whether to use standardized observation - standardized_obs: True - # Whether to use critic network norm - use_critic_norm: True - # The norm coefficient of critic network - critic_norm_coeff: 0.001 - ## ---------------------------------------Configuration For Model----------------------------- ## + # training configurations + train_cfgs: + # device to use for training, options: cpu, cuda, cuda:0, cuda:0,1, etc. 
+ device: cpu + # number of threads for torch + torch_threads: 16 + # number of vectorized environments + vector_env_nums: 16 + # number of parallel agent, similar to a3c + parallel: 1 + # total number of steps to train + total_steps: 16384000 + # algorithm configurations + algo_cfgs: + # number of steps to update the policy + update_cycle: 32768 + # number of iterations to update the policy + update_iters: 40 + # batch size for each iteration + batch_size: 64 + # target kl divergence + target_kl: 0.02 + # entropy coefficient + entropy_coef: 0.0 + # normalize reward + reward_normalize: True + # normalize cost + cost_normalize: True + # normalize observation + obs_normalize: True + # early stop when kl divergence is bigger than target kl + kl_early_stop: True + # The thereshold for KL divergence in each policy update + focops_eta: 0.02 + # The hyperparameters related to the greediness of the algorithm + focops_lam: 1.5 + # use max gradient norm + use_max_grad_norm: True + # max gradient norm + max_grad_norm: 40 + # use critic norm + use_critic_norm: True + # critic norm coefficient + critic_norm_coef: 0.001 + # reward discount factor + gamma: 0.99 + # cost discount factor + cost_gamma: 0.99 + # lambda for gae + lam: 0.95 + # lambda for cost gae + lam_c: 0.95 + # clip ratio + clip: 0.2 + # advantage estimation method, options: gae, retrace + adv_estimation_method: gae + # standardize reward advantage + standardized_rew_adv: True + # standardize cost advantage + standardized_cost_adv: True + # penalty coefficient + penalty_coef: 0.0 + # use cost + use_cost: True + # logger configurations + logger_cfgs: + # use wandb for logging + use_wandb: False + # wandb project name + wandb_project: omnisafe + # use tensorboard for logging + use_tensorboard: True + # save model frequency + save_model_freq: 100 + # save logger path + log_dir: "./runs" + # save model path + window_lens: 100 + # model configurations model_cfgs: - # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
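
The FOCOPS-specific hyperparameters now live in `algo_cfgs` as `focops_eta` and `focops_lam`. As a rough sketch of where these two knobs enter, following the masked policy objective from the FOCOPS paper rather than this repository's exact loss code (the multiplier `nu` and the batch arrays are hypothetical inputs):

import numpy as np

def focops_policy_loss(kl, ratio, adv_r, adv_c, nu, focops_eta=0.02, focops_lam=1.5):
    """Per-state FOCOPS-style loss: states whose KL to the old policy already
    exceeds focops_eta are masked out, and focops_lam trades off staying close
    to the old policy against following the cost-penalized advantage."""
    adv = adv_r - nu * adv_c                       # nu: current Lagrange multiplier
    per_state = kl - (1.0 / focops_lam) * ratio * adv
    mask = (kl <= focops_eta).astype(per_state.dtype)
    return (per_state * mask).mean()

# Toy usage with a random batch.
rng = np.random.default_rng(0)
loss = focops_policy_loss(rng.uniform(0, 0.05, 64), rng.lognormal(0, 0.1, 64),
                          rng.normal(size=64), rng.normal(size=64), nu=0.1)
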
+ # weight initialization mode weight_initialization_mode: "kaiming_uniform" - # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" + # actor type, options: gaussian, gaussian_learning actor_type: gaussian_learning - # Whether to use linear decay of learning rate + # linear learning rate decay linear_lr_decay: True - # Configuration of Actor network + # exploration noise anneal + exploration_noise_anneal: False + # std upper bound, and lower bound + std_range: [0.5, 0.1] + # actor network configurations actor: - # Size of hidden layers + # hidden layer sizes hidden_sizes: [64, 64] - # Activation function + # activation function activation: tanh - # The learning rate of Actor network + # out_activation: tanh + # learning rate lr: 0.0003 - # Configuration of Critic network critic: - # Size of hidden layers + # hidden layer sizes hidden_sizes: [64, 64] - # Activation function + # activation function activation: tanh - # The learning rate of Critic network + # learning rate lr: 0.0003 - ## --------------------------------------Configuration For Buffer----------------------------- ## - buffer_cfgs: - # Reward discounted factor - gamma: 0.99 - # Parameters used to estimate future rewards in GAE - lam: 0.95 - # Parameters used to estimate future costs in GAE - lam_c: 0.95 - # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" - adv_estimation_method: gae - # Whether to use reward standardized advantage estimation - standardized_rew_adv: True - # Whether to use cost standardized advantage estimation - standardized_cost_adv: True - ## --------------------------------------Configuration For Environment------------------------ ## - env_cfgs: - # The number of parallel environments - num_envs: 8 - # Whether to use async environment - async_env: True - # Whether to use standardized reward - normalized_rew: False - # Whether to use standardized cost - normalized_cost: False - # Whether to use standardized obs - normalized_obs: True - # The maximum length of record queue - max_len: 100 - # The number of threads used to sample data - num_threads: 20 -## ----------------------------------Configuration For Lagrangian multiplier---------------------- ## + # lagrangian configurations lagrange_cfgs: # Tolerance of constraint violation cost_limit: 25.0 diff --git a/omnisafe/configs/on-policy/IPO.yaml b/omnisafe/configs/on-policy/IPO.yaml index 4075f529b..aaa1cef81 100644 --- a/omnisafe/configs/on-policy/IPO.yaml +++ b/omnisafe/configs/on-policy/IPO.yaml @@ -14,143 +14,123 @@ # ============================================================================== defaults: - # --------------------------------------Basic Configurations----------------------------------- # - ## -----------------------------Basic configurations for base class PG------------------------ ## - # The random seed + # seed for random number generator seed: 0 - # The number of threads used per experiment - num_threads: 1 - # If use tensorboard - use_tensorboard: True - # if use wandb - use_wandb: True - # The torch device - device: cpu - # The torch device id - device_id: 0 - # The environment wrapper type - wrapper_type: CMDPWrapper - # Number of epochs - epochs: 500 - # Number of steps per epoch - steps_per_epoch: 32768 - # Number of update iteration for Actor network - actor_iters: 10 - # Number of update iteration for Critic network - critic_iters: 40 - # Check if all models own the same parameter values every `check_freq` epoch - 
check_freq: 25 - # Save model to disk every `check_freq` epochs - save_freq: 50 - # Entropy coefficient for PPO loss - entropy_coef: 0.0 - # The max length of per epoch - max_ep_len: 1000 - # The size of mini batch - num_mini_batches: 64 - # The learning rate of Actor network - actor_lr: 0.0003 - # The learning rate of Critic network - critic_lr: 0.0003 - # The Address for saving training process data - data_dir: "./runs" - ## ---------------------------Basic configurations for derived class PPO---------------------- ## - # The thereshold for KL early stopping - target_kl: 0.02 - # The size of batch for policy update - batch_size: 10000 - # The clip range for PPO loss - clip: 0.2 - # The coefficient of cost penalty - kappa: 0.01 - # The max of cost penalty - penalty_max: 1.0 - # The number of parallel environments - num_envs: 32 - # Whether to use standardized reward - reward_normalize: True - # Whether to use standardized cost - cost_normalize: True - # Whether to use standardized obs - obs_normalize: True - - # ---------------------------------------Optional Configuration-------------------------------- # - ## -----------------------------------Configuration For Cost Critic--------------------------- ## - # Whether to use cost critic - use_cost: True - # The cost limit - cost_limit: 25.0 - # Cost discounted factor - cost_gamma: 1.0 - # Whether to use linear decay of learning rate - linear_lr_decay: True - # Whether to use exploration noise anneal - exploration_noise_anneal: False - # std - std: [0.5, 0.1] - # The coefficient of reward penalty - penalty_param: 0.0 - # Whether to use KL early stopping - kl_early_stopping: True - # Whether to use max gradient norm - use_max_grad_norm: True - # The thereshold of max gradient norm - max_grad_norm: 40 - # Whether to use critic network norm - use_critic_norm: True - # The norm coefficient of critic network - critic_norm_coeff: 0.001 - ## ---------------------------------------Configuration For Model----------------------------- ## + # training configurations + train_cfgs: + # device to use for training, options: cpu, cuda, cuda:0, cuda:0,1, etc. 
+ device: cpu + # number of threads for torch + torch_threads: 16 + # number of vectorized environments + vector_env_nums: 16 + # number of parallel agent, similar to a3c + parallel: 1 + # total number of steps to train + total_steps: 16384000 + # algorithm configurations + algo_cfgs: + # number of steps to update the policy + update_cycle: 32768 + # number of iterations to update the policy + update_iters: 10 + # batch size for each iteration + batch_size: 64 + # target kl divergence + target_kl: 0.02 + # entropy coefficient + entropy_coef: 0.0 + # normalize reward + reward_normalize: True + # normalize cost + cost_normalize: True + # normalize observation + obs_normalize: True + # early stop when kl divergence is bigger than target kl + kl_early_stop: True + # use max gradient norm + use_max_grad_norm: True + # max gradient norm + max_grad_norm: 40 + # use critic norm + use_critic_norm: True + # critic norm coefficient + critic_norm_coef: 0.001 + # reward discount factor + gamma: 0.99 + # cost discount factor + cost_gamma: 0.99 + # lambda for gae + lam: 0.95 + # lambda for cost gae + lam_c: 0.95 + # clip ratio + clip: 0.2 + # advantage estimation method, options: gae, retrace + adv_estimation_method: gae + # standardize reward advantage + standardized_rew_adv: True + # standardize cost advantage + standardized_cost_adv: True + # penalty coefficient + penalty_coef: 0.0 + # use cost + use_cost: True + # the coefficient of cost penalty + kappa: 0.01 + # the max of cost penalty + penalty_max: 1.0 + # the cost limit + cost_limit: 25.0 + # logger configurations + logger_cfgs: + # use wandb for logging + use_wandb: False + # wandb project name + wandb_project: omnisafe + # use tensorboard for logging + use_tensorboard: True + # save model frequency + save_model_freq: 100 + # save logger path + log_dir: "./runs" + # save model path + window_lens: 100 + # model configurations model_cfgs: - # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
+ # weight initialization mode weight_initialization_mode: "kaiming_uniform" - # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" + # actor type, options: gaussian, gaussian_learning actor_type: gaussian_learning - # Whether to use linear decay of learning rate + # linear learning rate decay linear_lr_decay: True - # Configuration of Actor network + # exploration noise anneal + exploration_noise_anneal: False + # std upper bound, and lower bound + std_range: [0.5, 0.1] + # actor network configurations actor: - # Size of hidden layers + # hidden layer sizes hidden_sizes: [64, 64] - # Activation function + # activation function activation: tanh - # The learning rate of Actor network + # out_activation: tanh + # learning rate lr: 0.0003 - # Configuration of Critic network critic: - # Size of hidden layers + # hidden layer sizes hidden_sizes: [64, 64] - # Activation function + # activation function activation: tanh - # The learning rate of Critic network + # learning rate lr: 0.0003 - ## --------------------------------------Configuration For Buffer----------------------------- ## - buffer_cfgs: - # Reward discounted factor - gamma: 0.99 - # Parameters used to estimate future rewards in GAE - lam: 0.95 - # Parameters used to estimate future costs in GAE - lam_c: 0.95 - # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" - adv_estimation_method: gae - # Whether to use reward standardized advantage estimation - standardized_rew_adv: True - # Whether to use cost standardized advantage estimation - standardized_cost_adv: True - ## --------------------------------------Configuration For Environment------------------------ ## - env_cfgs: - # The number of parallel environments - num_envs: 8 - # Whether to use async environment - async_env: True - # Whether to use standardized reward - normalized_rew: False - # Whether to use standardized cost - normalized_cost: False - # Whether to use standardized obs - normalized_obs: True - # The maximum length of record queue - max_len: 100 - # The number of threads used to sample data - num_threads: 20 + # lagrangian configurations + lagrange_cfgs: + # Tolerance of constraint violation + cost_limit: 25.0 + # Initial value of lagrangian multiplier + lagrangian_multiplier_init: 0.001 + # Learning rate of lagrangian multiplier + lambda_lr: 0.035 + # Type of lagrangian optimizer + lambda_optimizer: "Adam" diff --git a/omnisafe/configs/on-policy/NaturalPG.yaml b/omnisafe/configs/on-policy/NaturalPG.yaml index 7adddf898..a9c109de3 100644 --- a/omnisafe/configs/on-policy/NaturalPG.yaml +++ b/omnisafe/configs/on-policy/NaturalPG.yaml @@ -14,143 +14,111 @@ # ============================================================================== defaults: - # --------------------------------------Basic Configurations----------------------------------- # - ## -----------------------------Basic configurations for base class PG------------------------ ## - # The random seed + # seed for random number generator seed: 0 - # The number of threads used per experiment - num_threads: 1 - # If use tensorboard - use_tensorboard: True - # if use wandb - use_wandb: True - # The torch device - device: cpu - # The torch device id - device_id: 0 - # The environment wrapper type - wrapper_type: CMDPWrapper - # Number of epochs - epochs: 500 - # Number of steps per epoch - steps_per_epoch: 32768 - # Number of update iteration for Actor network - actor_iters: 1 - # Number of update 
iteration for Critic network - critic_iters: 40 - # Check if all models own the same parameter values every `check_freq` epoch - check_freq: 25 - # Save model to disk every `check_freq` epochs - save_freq: 100 - # Entropy coefficient for PPO loss - entropy_coef: 0.0 - # The max length of per epoch - max_ep_len: 1000 - # The size of mini batch - num_mini_batches: 16384 - # The learning rate of Actor network - actor_lr: 0.0003 - # The learning rate of Critic network - critic_lr: 0.0003 - # The Address for saving training process data - data_dir: "./runs" - ## --------------------------Basic configurations for derived class NaturalPG----------------- ## - # The thereshold for KL early stopping - target_kl: 0.01 - # Tolerance of constraint violation - cost_limit: 25 - # Damping value for conjugate gradient - cg_damping: 0.1 - # Number of conjugate gradient iterations - cg_iters: 10 - # Subsampled observation - fvp_obs: None - # The number of parallel environments - num_envs: 32 - # Whether to use standardized reward - reward_normalize: True - # Whether to use standardized cost - cost_normalize: True - # Whether to use standardized obs - obs_normalize: True - - # ---------------------------------------Optional Configuration-------------------------------- # - ## -----------------------------------Configuration For Cost Critic--------------------------- ## - # Whether to use cost critic - use_cost: False - # Cost discounted factor - cost_gamma: 1.0 - # Whether to use linear decay of learning rate - linear_lr_decay: False - # Whether to use exploration noise anneal - exploration_noise_anneal: False - # std - std: [0.5, 0.1] - # The coefficient of reward penalty - penalty_param: 0.0 - # Whether to use KL early stopping - kl_early_stopping: True - # Whether to use max gradient norm - use_max_grad_norm: True - # The thereshold of max gradient norm - max_grad_norm: 40 - # Whether to use standardized observation - standardized_obs: True - # Whether to use critic network norm - use_critic_norm: True - # The norm coefficient of critic network - critic_norm_coeff: 0.001 - ## ---------------------------------------Configuration For Model----------------------------- ## + # training configurations + train_cfgs: + # device to use for training, options: cpu, cuda, cuda:0, cuda:0,1, etc. 
+ device: cpu + # number of threads for torch + torch_threads: 16 + # number of vectorized environments + vector_env_nums: 16 + # number of parallel agent, similar to a3c + parallel: 1 + # total number of steps to train + total_steps: 16384000 + # algorithm configurations + algo_cfgs: + # number of steps to update the policy + update_cycle: 32768 + # number of iterations to update the policy + update_iters: 10 + # batch size for each iteration + batch_size: 16384 + # target kl divergence + target_kl: 0.01 + # entropy coefficient + entropy_coef: 0.0 + # normalize reward + reward_normalize: True + # normalize cost + cost_normalize: True + # normalize observation + obs_normalize: True + # early stop when kl divergence is bigger than target kl + kl_early_stop: True + # use max gradient norm + use_max_grad_norm: True + # max gradient norm + max_grad_norm: 40 + # use critic norm + use_critic_norm: True + # critic norm coefficient + critic_norm_coef: 0.001 + # reward discount factor + gamma: 0.99 + # cost discount factor + cost_gamma: 0.99 + # lambda for gae + lam: 0.95 + # lambda for cost gae + lam_c: 0.95 + # advantage estimation method, options: gae, retrace + adv_estimation_method: gae + # standardize reward advantage + standardized_rew_adv: True + # standardize cost advantage + standardized_cost_adv: True + # penalty coefficient + penalty_coef: 0.0 + # use cost + use_cost: False + # Damping value for conjugate gradient + cg_damping: 0.1 + # Number of conjugate gradient iterations + cg_iters: 10 + # Subsampled observation + fvp_obs: None + # logger configurations + logger_cfgs: + # use wandb for logging + use_wandb: False + # wandb project name + wandb_project: omnisafe + # use tensorboard for logging + use_tensorboard: True + # save model frequency + save_model_freq: 100 + # save logger path + log_dir: "./runs" + # save model path + window_lens: 100 + # model configurations model_cfgs: - # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
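
In the NaturalPG config, `cg_damping`, `cg_iters`, and the `fvp_obs` subsampling switch configure the conjugate-gradient solve shared by the natural-gradient family (NaturalPG, TRPO, PCPO). A generic sketch of what the first two knobs control, damped conjugate gradient against a Fisher-vector product, offered as an illustration rather than the repository's implementation:

import numpy as np

def conjugate_gradient(fvp, g, cg_iters=10, cg_damping=0.1, eps=1e-8):
    """Approximately solve (F + cg_damping * I) x = g, where fvp(v) returns F @ v."""
    x = np.zeros_like(g)
    r = g.copy()                 # residual equals g because x starts at zero
    p = g.copy()
    r_dot = r @ r
    for _ in range(cg_iters):
        Ap = fvp(p) + cg_damping * p
        alpha = r_dot / (p @ Ap + eps)
        x += alpha * p
        r -= alpha * Ap
        new_r_dot = r @ r
        p = r + (new_r_dot / (r_dot + eps)) * p
        r_dot = new_r_dot
    return x

# Toy usage: a small SPD matrix stands in for the Fisher information.
F = np.array([[2.0, 0.3], [0.3, 1.0]])
g = np.array([1.0, -1.0])
step_direction = conjugate_gradient(lambda v: F @ v, g)
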
+ # weight initialization mode weight_initialization_mode: "kaiming_uniform" - # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" + # actor type, options: gaussian, gaussian_learning actor_type: gaussian_learning - # Whether to use linear decay of learning rate - linear_lr_decay: True - # Configuration of Actor network + # linear learning rate decay + linear_lr_decay: False + # exploration noise anneal + exploration_noise_anneal: False + # std upper bound, and lower bound + std_range: [0.5, 0.1] + # actor network configurations actor: - # Size of hidden layers + # hidden layer sizes hidden_sizes: [64, 64] - # Activation function + # activation function activation: tanh - # The learning rate of Actor network - lr: 0.0003 - # Configuration of Critic network + # out_activation: tanh + # learning rate + lr: None critic: - # Size of hidden layers + # hidden layer sizes hidden_sizes: [64, 64] - # Activation function + # activation function activation: tanh - # The learning rate of Critic network + # learning rate lr: 0.0003 - ## --------------------------------------Configuration For Buffer----------------------------- ## - buffer_cfgs: - # Reward discounted factor - gamma: 0.99 - # Parameters used to estimate future rewards in GAE - lam: 0.95 - # Parameters used to estimate future costs in GAE - lam_c: 0.95 - # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" - adv_estimation_method: gae - # Whether to use reward standardized advantage estimation - standardized_rew_adv: True - # Whether to use cost standardized advantage estimation - standardized_cost_adv: True - ## --------------------------------------Configuration For Environment------------------------ ## - env_cfgs: - # The number of parallel environments - num_envs: 8 - # Whether to use async environment - async_env: True - # Whether to use standardized reward - normalized_rew: False - # Whether to use standardized cost - normalized_cost: False - # Whether to use standardized obs - normalized_obs: True - # The maximum length of record queue - max_len: 100 - # The number of threads used to sample data - num_threads: 20 diff --git a/omnisafe/configs/on-policy/OnCRPO.yaml b/omnisafe/configs/on-policy/OnCRPO.yaml index e72419f35..b1e47ecb5 100644 --- a/omnisafe/configs/on-policy/OnCRPO.yaml +++ b/omnisafe/configs/on-policy/OnCRPO.yaml @@ -14,143 +14,111 @@ # ============================================================================== defaults: - # --------------------------------------Basic Configurations----------------------------------- # - ## -----------------------------Basic configurations for base class PG------------------------ ## - # The random seed + # seed for random number generator seed: 0 - # The number of threads used per experiment - num_threads: 1 - # If use tensorboard - use_tensorboard: True - # if use wandb - use_wandb: True - # The torch device - device: cpu - # The torch device id - device_id: 0 - # The environment wrapper type - wrapper_type: CMDPWrapper - # Number of epochs - epochs: 500 - # Number of steps per epoch - steps_per_epoch: 32768 - # Number of update iteration for Actor network - actor_iters: 10 - # Number of update iteration for Critic network - critic_iters: 40 - # Check if all models own the same parameter values every `check_freq` epoch - check_freq: 25 - # Save model to disk every `check_freq` epochs - save_freq: 50 - # Entropy coefficient for PPO loss - entropy_coef: 0.0 - # The max length 
of per epoch - max_ep_len: 1000 - # The size of mini batch - num_mini_batches: 16384 - # The learning rate of Actor network - actor_lr: 0.0003 - # The learning rate of Critic network - critic_lr: 0.0003 - # The Address for saving training process data - data_dir: "./runs" - ## ---------------------------Basic configurations for derived class PPO---------------------- ## - # The thereshold for KL early stopping - target_kl: 0.02 - # The size of batch for policy update - batch_size: 10000 - # The clip range for PPO loss - clip: 0.2 - # The number of parallel environments - num_envs: 32 - # Whether to use standardized reward - reward_normalize: True - # Whether to use standardized cost - cost_normalize: True - # Whether to use standardized obs - obs_normalize: True - - # ---------------------------------------Optional Configuration-------------------------------- # - ## -----------------------------------Configuration For Cost Critic--------------------------- ## - # Whether to use cost critic - use_cost: False - # The cost limit - cost_limit: 25.0 - # The tolerance of cost limit - distance: 2.0 - # Cost discounted factor - cost_gamma: 1.0 - # Whether to use linear decay of learning rate - linear_lr_decay: True - # Whether to use exploration noise anneal - exploration_noise_anneal: False - # std - std: [0.5, 0.1] - # The coefficient of reward penalty - penalty_param: 0.0 - # Whether to use KL early stopping - kl_early_stopping: True - # Whether to use max gradient norm - use_max_grad_norm: True - # The thereshold of max gradient norm - max_grad_norm: 40 - # Whether to use critic network norm - use_critic_norm: True - # The norm coefficient of critic network - critic_norm_coeff: 0.001 - ## ---------------------------------------Configuration For Model----------------------------- ## + # training configurations + train_cfgs: + # device to use for training, options: cpu, cuda, cuda:0, cuda:0,1, etc. 
+ device: cpu + # number of threads for torch + torch_threads: 16 + # number of vectorized environments + vector_env_nums: 16 + # number of parallel agent, similar to a3c + parallel: 1 + # total number of steps to train + total_steps: 16384000 + # algorithm configurations + algo_cfgs: + # number of steps to update the policy + update_cycle: 32768 + # number of iterations to update the policy + update_iters: 40 + # batch size for each iteration + batch_size: 64 + # target kl divergence + target_kl: 0.02 + # entropy coefficient + entropy_coef: 0.0 + # normalize reward + reward_normalize: True + # normalize cost + cost_normalize: True + # normalize observation + obs_normalize: True + # early stop when kl divergence is bigger than target kl + kl_early_stop: True + # use max gradient norm + use_max_grad_norm: True + # max gradient norm + max_grad_norm: 40 + # use critic norm + use_critic_norm: True + # critic norm coefficient + critic_norm_coef: 0.001 + # reward discount factor + gamma: 0.99 + # cost discount factor + cost_gamma: 0.99 + # lambda for gae + lam: 0.95 + # lambda for cost gae + lam_c: 0.95 + # clip ratio + clip: 0.2 + # advantage estimation method, options: gae, retrace + adv_estimation_method: gae + # standardize reward advantage + standardized_rew_adv: True + # standardize cost advantage + standardized_cost_adv: True + # penalty coefficient + penalty_coef: 0.0 + # use cost + use_cost: False + # the cost limit + cost_limit: 25.0 + # the tolerance of cost limit + distance: 2.0 + # logger configurations + logger_cfgs: + # use wandb for logging + use_wandb: False + # wandb project name + wandb_project: omnisafe + # use tensorboard for logging + use_tensorboard: True + # save model frequency + save_model_freq: 100 + # save logger path + log_dir: "./runs" + # save model path + window_lens: 100 + # model configurations model_cfgs: - # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
+ # weight initialization mode weight_initialization_mode: "kaiming_uniform" - # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" + # actor type, options: gaussian, gaussian_learning actor_type: gaussian_learning - # Whether to use linear decay of learning rate + # linear learning rate decay linear_lr_decay: True - # Configuration of Actor network + # exploration noise anneal + exploration_noise_anneal: False + # std upper bound, and lower bound + std_range: [0.5, 0.1] + # actor network configurations actor: - # Size of hidden layers + # hidden layer sizes hidden_sizes: [64, 64] - # Activation function + # activation function activation: tanh - # The learning rate of Actor network + # out_activation: tanh + # learning rate lr: 0.0003 - # Configuration of Critic network critic: - # Size of hidden layers + # hidden layer sizes hidden_sizes: [64, 64] - # Activation function + # activation function activation: tanh - # The learning rate of Critic network + # learning rate lr: 0.0003 - ## --------------------------------------Configuration For Buffer----------------------------- ## - buffer_cfgs: - # Reward discounted factor - gamma: 0.99 - # Parameters used to estimate future rewards in GAE - lam: 0.95 - # Parameters used to estimate future costs in GAE - lam_c: 0.95 - # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" - adv_estimation_method: gae - # Whether to use reward standardized advantage estimation - standardized_rew_adv: True - # Whether to use cost standardized advantage estimation - standardized_cost_adv: True - ## --------------------------------------Configuration For Environment------------------------ ## - env_cfgs: - # The cost limit - cost_limit: 25.0 - # The number of parallel environments - num_envs: 8 - # Whether to use async environment - async_env: True - # Whether to use standardized reward - normalized_rew: False - # Whether to use standardized cost - normalized_cost: False - # Whether to use standardized obs - normalized_obs: True - # The maximum length of record queue - max_len: 100 - # The number of threads used to sample data - num_threads: 20 diff --git a/omnisafe/configs/on-policy/P3O.yaml b/omnisafe/configs/on-policy/P3O.yaml index bc723f072..5c294b450 100644 --- a/omnisafe/configs/on-policy/P3O.yaml +++ b/omnisafe/configs/on-policy/P3O.yaml @@ -14,141 +14,111 @@ # ============================================================================== defaults: - # --------------------------------------Basic Configurations----------------------------------- # - ## -----------------------------Basic configurations for base class PG------------------------ ## - # The random seed + # seed for random number generator seed: 0 - # The number of threads used per experiment - num_threads: 1 - # If use tensorboard - use_tensorboard: True - # if use wandb - use_wandb: True - # The torch device - device: cpu - # The torch device id - device_id: 0 - # The environment wrapper type - wrapper_type: CMDPWrapper - # Number of epochs - epochs: 500 - # Number of steps per epoch - steps_per_epoch: 32768 - # Number of update iteration for Actor network - actor_iters: 10 - # Number of update iteration for Critic network - critic_iters: 40 - # Check if all models own the same parameter values every `check_freq` epoch - check_freq: 25 - # Save model to disk every `check_freq` epochs - save_freq: 50 - # Entropy coefficient for PPO loss - entropy_coef: 0.0 - # The max length of per epoch 
- max_ep_len: 1000 - # The size of mini batch - num_mini_batches: 64 - # The learning rate of Actor network - actor_lr: 0.0003 - # The learning rate of Critic network - critic_lr: 0.0003 - # The Address for saving training process data - data_dir: "./runs" - ## ---------------------------Basic configurations for derived class PPO---------------------- ## - # The thereshold for KL early stopping - target_kl: 0.02 - # The size of batch for policy update - batch_size: 10000 - # The clip range for PPO loss - clip: 0.2 - # The coefficient of cost penalty - kappa: 20.0 - # The number of parallel environments - num_envs: 32 - # Whether to use standardized reward - reward_normalize: True - # Whether to use standardized cost - cost_normalize: True - # Whether to use standardized obs - obs_normalize: True - - # ---------------------------------------Optional Configuration-------------------------------- # - ## -----------------------------------Configuration For Cost Critic--------------------------- ## - # Whether to use cost critic - use_cost: True - # The cost limit - cost_limit: 25.0 - # Cost discounted factor - cost_gamma: 1.0 - # Whether to use linear decay of learning rate - linear_lr_decay: True - # Whether to use exploration noise anneal - exploration_noise_anneal: False - # std - std: [0.5, 0.1] - # The coefficient of reward penalty - penalty_param: 0.0 - # Whether to use KL early stopping - kl_early_stopping: True - # Whether to use max gradient norm - use_max_grad_norm: True - # The thereshold of max gradient norm - max_grad_norm: 40 - # Whether to use critic network norm - use_critic_norm: True - # The norm coefficient of critic network - critic_norm_coeff: 0.001 - ## ---------------------------------------Configuration For Model----------------------------- ## + # training configurations + train_cfgs: + # device to use for training, options: cpu, cuda, cuda:0, cuda:0,1, etc. 
+ device: cpu + # number of threads for torch + torch_threads: 16 + # number of vectorized environments + vector_env_nums: 16 + # number of parallel agent, similar to a3c + parallel: 1 + # total number of steps to train + total_steps: 16384000 + # algorithm configurations + algo_cfgs: + # number of steps to update the policy + update_cycle: 32768 + # number of iterations to update the policy + update_iters: 10 + # batch size for each iteration + batch_size: 64 + # target kl divergence + target_kl: 0.02 + # the coefficient of cost penalty + kappa: 20.0 + # entropy coefficient + entropy_coef: 0.0 + # normalize reward + reward_normalize: False + # normalize cost + cost_normalize: False + # normalize observation + obs_normalize: True + # early stop when kl divergence is bigger than target kl + kl_early_stop: True + # use max gradient norm + use_max_grad_norm: True + # max gradient norm + max_grad_norm: 40 + # use critic norm + use_critic_norm: True + # critic norm coefficient + critic_norm_coef: 0.001 + # reward discount factor + gamma: 0.99 + # cost discount factor + cost_gamma: 0.99 + # lambda for gae + lam: 0.95 + # lambda for cost gae + lam_c: 0.95 + # clip ratio + clip: 0.2 + # cost limit + cost_limit: 25.0 + # advantage estimation method, options: gae, retrace + adv_estimation_method: gae + # standardize reward advantage + standardized_rew_adv: True + # standardize cost advantage + standardized_cost_adv: True + # penalty coefficient + penalty_coef: 0.0 + # use cost + use_cost: True + # logger configurations + logger_cfgs: + # use wandb for logging + use_wandb: False + # wandb project name + wandb_project: omnisafe + # use tensorboard for logging + use_tensorboard: True + # save model frequency + save_model_freq: 100 + # save logger path + log_dir: "./runs" + # save model path + window_lens: 100 + # model configurations model_cfgs: - # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
+ # weight initialization mode weight_initialization_mode: "kaiming_uniform" - # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" + # actor type, options: gaussian, gaussian_learning actor_type: gaussian_learning - # Whether to use linear decay of learning rate + # linear learning rate decay linear_lr_decay: True - # Configuration of Actor network + # exploration noise anneal + exploration_noise_anneal: False + # std upper bound, and lower bound + std_range: [0.5, 0.1] + # actor network configurations actor: - # Size of hidden layers + # hidden layer sizes hidden_sizes: [64, 64] - # Activation function + # activation function activation: tanh - # The learning rate of Actor network + # out_activation: tanh + # learning rate lr: 0.0003 - # Configuration of Critic network critic: - # Size of hidden layers + # hidden layer sizes hidden_sizes: [64, 64] - # Activation function + # activation function activation: tanh - # The learning rate of Critic network + # learning rate lr: 0.0003 - ## --------------------------------------Configuration For Buffer----------------------------- ## - buffer_cfgs: - # Reward discounted factor - gamma: 0.99 - # Parameters used to estimate future rewards in GAE - lam: 0.95 - # Parameters used to estimate future costs in GAE - lam_c: 0.95 - # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" - adv_estimation_method: gae - # Whether to use reward standardized advantage estimation - standardized_rew_adv: True - # Whether to use cost standardized advantage estimation - standardized_cost_adv: True - ## --------------------------------------Configuration For Environment------------------------ ## - env_cfgs: - # The number of parallel environments - num_envs: 8 - # Whether to use async environment - async_env: True - # Whether to use standardized reward - normalized_rew: False - # Whether to use standardized cost - normalized_cost: False - # Whether to use standardized obs - normalized_obs: True - # The maximum length of record queue - max_len: 100 - # The number of threads used to sample data - num_threads: 20 diff --git a/omnisafe/configs/on-policy/PCPO.yaml b/omnisafe/configs/on-policy/PCPO.yaml index 8654374d7..a97f64084 100644 --- a/omnisafe/configs/on-policy/PCPO.yaml +++ b/omnisafe/configs/on-policy/PCPO.yaml @@ -14,143 +14,113 @@ # ============================================================================== defaults: - # --------------------------------------Basic Configurations----------------------------------- # - ## -----------------------------Basic configurations for base class PG------------------------ ## - # The random seed + # seed for random number generator seed: 0 - # The number of threads used per experiment - num_threads: 1 - # If use tensorboard - use_tensorboard: True - # if use wandb - use_wandb: True - # The torch device - device: cpu - # The torch device id - device_id: 0 - # The environment wrapper type - wrapper_type: CMDPWrapper - # Number of epochs - epochs: 500 - # Number of steps per epoch - steps_per_epoch: 32768 - # Number of update iteration for Actor network - actor_iters: 10 - # Number of update iteration for Critic network - critic_iters: 40 - # Check if all models own the same parameter values every `check_freq` epoch - check_freq: 25 - # Save model to disk every `check_freq` epochs - save_freq: 100 - # Entropy coefficient for PPO loss - entropy_coef: 0.0 - # The max length of per epoch - max_ep_len: 1000 - # The size 
of mini batch - num_mini_batches: 16384 - # The learning rate of Actor network - actor_lr: 0.001 - # The learning rate of Critic network - critic_lr: 0.001 - # The Address for saving training process data - data_dir: "./runs" - ## --------------------------Basic configurations for derived class NaturalPG----------------- ## - # The thereshold for KL early stopping - target_kl: 0.01 - # Tolerance of constraint violation - cost_limit: 25 - # Damping value for conjugate gradient - cg_damping: 0.1 - # Number of conjugate gradient iterations - cg_iters: 10 - # Subsampled observation - fvp_obs: None - # The number of parallel environments - num_envs: 32 - # Whether to use standardized reward - reward_normalize: True - # Whether to use standardized cost - cost_normalize: True - # Whether to use standardized obs - obs_normalize: True - - # ---------------------------------------Optional Configuration-------------------------------- # - ## -----------------------------------Configuration For Cost Critic--------------------------- ## - # Whether to use cost critic - use_cost: True - # Cost discounted factor - cost_gamma: 1.0 - # Whether to use linear decay of learning rate - linear_lr_decay: False - # Whether to use exploration noise anneal - exploration_noise_anneal: False - # std - std: [0.5, 0.1] - # The coefficient of reward penalty - penalty_param: 0.0 - # Whether to use KL early stopping - kl_early_stopping: True - # Whether to use max gradient norm - use_max_grad_norm: True - # The thereshold of max gradient norm - max_grad_norm: 40 - # Whether to use standardized observation - standardized_obs: True - # Whether to use critic network norm - use_critic_norm: True - # The norm coefficient of critic network - critic_norm_coeff: 0.001 - ## ---------------------------------------Configuration For Model----------------------------- ## + # training configurations + train_cfgs: + # device to use for training, options: cpu, cuda, cuda:0, cuda:0,1, etc. 
+ device: cpu + # number of threads for torch + torch_threads: 16 + # number of vectorized environments + vector_env_nums: 16 + # number of parallel agent, similar to a3c + parallel: 1 + # total number of steps to train + total_steps: 16384000 + # algorithm configurations + algo_cfgs: + # number of steps to update the policy + update_cycle: 32768 + # number of iterations to update the policy + update_iters: 10 + # batch size for each iteration + batch_size: 16384 + # target kl divergence + target_kl: 0.01 + # entropy coefficient + entropy_coef: 0.0 + # normalize reward + reward_normalize: True + # normalize cost + cost_normalize: True + # normalize observation + obs_normalize: True + # use max gradient norm + use_max_grad_norm: True + # max gradient norm + max_grad_norm: 40 + # use critic norm + use_critic_norm: True + # critic norm coefficient + critic_norm_coef: 0.001 + # reward discount factor + gamma: 0.99 + # cost discount factor + cost_gamma: 0.99 + # lambda for gae + lam: 0.95 + # lambda for cost gae + lam_c: 0.95 + # Tolerance of constraint violation + cost_limit: 25 + # damping value for conjugate gradient + cg_damping: 0.1 + # number of conjugate gradient iterations + cg_iters: 10 + # subsampled obs + fvp_obs: None + # advantage estimation method, options: gae, retrace + adv_estimation_method: gae + # standardize reward advantage + standardized_rew_adv: True + # standardize cost advantage + standardized_cost_adv: True + # penalty coefficient + penalty_coef: 0.0 + # use cost + use_cost: True + # use kl early stop + kl_early_stop: True + # logger configurations + logger_cfgs: + # use wandb for logging + use_wandb: False + # wandb project name + wandb_project: omnisafe + # use tensorboard for logging + use_tensorboard: True + # save model frequency + save_model_freq: 100 + # save logger path + log_dir: "./runs" + # save model path + window_lens: 100 + # model configurations model_cfgs: - # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
+ # weight initialization mode weight_initialization_mode: "kaiming_uniform" - # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" + # actor type, options: gaussian, gaussian_learning actor_type: gaussian_learning - # Whether to use linear decay of learning rate + # linear learning rate decay linear_lr_decay: True - # Configuration of Actor network + # exploration noise anneal + exploration_noise_anneal: False + # std upper bound, and lower bound + std_range: [0.5, 0.1] + # actor network configurations actor: - # Size of hidden layers + # hidden layer sizes hidden_sizes: [64, 64] - # Activation function + # activation function activation: tanh - # The learning rate of Actor network - lr: 0.0003 - # Configuration of Critic network + # out_activation: tanh + # learning rate + lr: None critic: - # Size of hidden layers + # hidden layer sizes hidden_sizes: [64, 64] - # Activation function + # activation function activation: tanh - # The learning rate of Critic network - lr: 0.0003 - ## --------------------------------------Configuration For Buffer----------------------------- ## - buffer_cfgs: - # Reward discounted factor - gamma: 0.99 - # Parameters used to estimate future rewards in GAE - lam: 0.95 - # Parameters used to estimate future costs in GAE - lam_c: 0.95 - # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" - adv_estimation_method: gae - # Whether to use reward standardized advantage estimation - standardized_rew_adv: True - # Whether to use cost standardized advantage estimation - standardized_cost_adv: True - ## --------------------------------------Configuration For Environment------------------------ ## - env_cfgs: - # The number of parallel environments - num_envs: 1 - # Whether to use async environment - async_env: True - # Whether to use standardized reward - normalized_rew: False - # Whether to use standardized cost - normalized_cost: False - # Whether to use standardized obs - normalized_obs: True - # The maximum length of record queue - max_len: 100 - # The number of threads used to sample data - num_threads: 20 + # learning rate + lr: 0.001 diff --git a/omnisafe/configs/on-policy/PDO.yaml b/omnisafe/configs/on-policy/PDO.yaml index 96c25641a..438e1fac3 100644 --- a/omnisafe/configs/on-policy/PDO.yaml +++ b/omnisafe/configs/on-policy/PDO.yaml @@ -14,142 +14,110 @@ # ============================================================================== defaults: - # --------------------------------------Basic Configurations----------------------------------- # - ## -----------------------------Basic configurations for base class PG------------------------ ## - # The random seed + # seed for random number generator seed: 0 - # The number of threads used per experiment - num_threads: 1 - # If use tensorboard - use_tensorboard: True - # if use wandb - use_wandb: True - # The torch device - device: cpu - # The torch device id - device_id: 0 - # The environment wrapper type - wrapper_type: CMDPWrapper - # Number of epochs - epochs: 500 - # Number of steps per epoch - steps_per_epoch: 32768 - # Number of update iteration for Actor network - actor_iters: 10 - # Number of update iteration for Critic network - critic_iters: 40 - # Check if all models own the same parameter values every `check_freq` epoch - check_freq: 25 - # Save model to disk every `check_freq` epochs - save_freq: 100 - # Entropy coefficient for PPO loss - entropy_coef: 0.0 - # The max length of per epoch - 
max_ep_len: 1000 - # The size of mini batch - num_mini_batches: 64 - # The learning rate of Actor network - actor_lr: 0.0003 - # The learning rate of Critic network - critic_lr: 0.0003 - # The Address for saving training process data - data_dir: "./runs" - ## ---------------------------Basic configurations for derived class PPO---------------------- ## - # The thereshold for KL early stopping - target_kl: 0.02 - # The clip range for PPO loss - clip: 0.2 - # The number of parallel environments - num_envs: 32 - # Whether to use standardized reward - reward_normalize: True - # Whether to use standardized cost - cost_normalize: True - # Whether to use standardized obs - obs_normalize: True - - - # ---------------------------------------Optional Configuration-------------------------------- # - ## -----------------------------------Configuration For Cost Critic--------------------------- ## - # Whether to use cost critic - use_cost: False - # Cost discounted factor - cost_gamma: 1.0 - # Whether to use linear decay of learning rate - linear_lr_decay: True - # Whether to use exploration noise anneal - exploration_noise_anneal: False - # std - std: [0.5, 0.1] - # The coefficient of reward penalty - penalty_param: 0.0 - # Whether to use KL early stopping - kl_early_stopping: True - # Whether to use max gradient norm - use_max_grad_norm: True - # The thereshold of max gradient norm - max_grad_norm: 40 - # Whether to use standardized observation - standardized_obs: True - # Whether to use critic network norm - use_critic_norm: True - # The norm coefficient of critic network - critic_norm_coeff: 0.001 - ## ---------------------------------------Configuration For Model----------------------------- ## + # training configurations + train_cfgs: + # device to use for training, options: cpu, cuda, cuda:0, cuda:0,1, etc. 
+ device: cpu + # number of threads for torch + torch_threads: 16 + # number of vectorized environments + vector_env_nums: 16 + # number of parallel agent, similar to a3c + parallel: 1 + # total number of steps to train + total_steps: 16384000 + # algorithm configurations + algo_cfgs: + # number of steps to update the policy + update_cycle: 32768 + # number of iterations to update the policy + update_iters: 40 + # batch size for each iteration + batch_size: 64 + # target kl divergence + target_kl: 0.02 + # entropy coefficient + entropy_coef: 0.0 + # normalize reward + reward_normalize: True + # normalize cost + cost_normalize: True + # normalize observation + obs_normalize: True + # early stop when kl divergence is bigger than target kl + kl_early_stop: True + # use max gradient norm + use_max_grad_norm: True + # max gradient norm + max_grad_norm: 40 + # use critic norm + use_critic_norm: True + # critic norm coefficient + critic_norm_coef: 0.001 + # reward discount factor + gamma: 0.99 + # cost discount factor + cost_gamma: 0.99 + # lambda for gae + lam: 0.95 + # lambda for cost gae + lam_c: 0.95 + # clip ratio + clip: 0.2 + # advantage estimation method, options: gae, retrace + adv_estimation_method: gae + # standardize reward advantage + standardized_rew_adv: True + # standardize cost advantage + standardized_cost_adv: True + # penalty coefficient + penalty_coef: 0.0 + # use cost + use_cost: True + # logger configurations + logger_cfgs: + # use wandb for logging + use_wandb: False + # wandb project name + wandb_project: omnisafe + # use tensorboard for logging + use_tensorboard: True + # save model frequency + save_model_freq: 100 + # save logger path + log_dir: "./runs" + # save model path + window_lens: 100 + # model configurations model_cfgs: - # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
+ # weight initialization mode weight_initialization_mode: "kaiming_uniform" - # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" + # actor type, options: gaussian, gaussian_learning actor_type: gaussian_learning - # Whether to use linear decay of learning rate + # linear learning rate decay linear_lr_decay: True - # Configuration of Actor network + # exploration noise anneal + exploration_noise_anneal: False + # std upper bound, and lower bound + std_range: [0.5, 0.1] + # actor network configurations actor: - # Size of hidden layers + # hidden layer sizes hidden_sizes: [64, 64] - # Activation function + # activation function activation: tanh - # The learning rate of Actor network + # out_activation: tanh + # learning rate lr: 0.0003 - # Configuration of Critic network critic: - # Size of hidden layers + # hidden layer sizes hidden_sizes: [64, 64] - # Activation function + # activation function activation: tanh - # The learning rate of Critic network + # learning rate lr: 0.0003 - ## --------------------------------------Configuration For Buffer----------------------------- ## - buffer_cfgs: - # Reward discounted factor - gamma: 0.99 - # Parameters used to estimate future rewards in GAE - lam: 0.95 - # Parameters used to estimate future costs in GAE - lam_c: 0.95 - # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" - adv_estimation_method: gae - # Whether to use reward standardized advantage estimation - standardized_rew_adv: True - # Whether to use cost standardized advantage estimation - standardized_cost_adv: True - ## --------------------------------------Configuration For Environment------------------------ ## - env_cfgs: - # The number of parallel environments - num_envs: 8 - # Whether to use async environment - async_env: True - # Whether to use standardized reward - normalized_rew: False - # Whether to use standardized cost - normalized_cost: False - # Whether to use standardized obs - normalized_obs: True - # The maximum length of record queue - max_len: 100 - # The number of threads used to sample data - num_threads: 20 - ## --------------------------------------Configuration For Lagrange--------------------------- ## lagrange_cfgs: # Tolerance of constraint violation cost_limit: 25.0 diff --git a/omnisafe/configs/on-policy/PPO.yaml b/omnisafe/configs/on-policy/PPO.yaml index cc8357ba2..53916d531 100644 --- a/omnisafe/configs/on-policy/PPO.yaml +++ b/omnisafe/configs/on-policy/PPO.yaml @@ -14,139 +14,107 @@ # ============================================================================== defaults: - # --------------------------------------Basic Configurations----------------------------------- # - ## -----------------------------Basic configurations for base class PG------------------------ ## - # The random seed + # seed for random number generator seed: 0 - # The number of threads used per experiment - num_threads: 1 - # If use tensorboard - use_tensorboard: True - # if use wandb - use_wandb: True - # The torch device - device: cpu - # The torch device id - device_id: 0 - # The environment wrapper type - wrapper_type: CMDPWrapper - # Number of epochs - epochs: 500 - # Number of steps per epoch - steps_per_epoch: 32768 - # Number of update iteration for Actor network - actor_iters: 40 - # Number of update iteration for Critic network - critic_iters: 40 - # Check if all models own the same parameter values every `check_freq` epoch - check_freq: 25 - # Save model to disk 
every `check_freq` epochs - save_freq: 50 - # Entropy coefficient for PPO loss - entropy_coef: 0.0 - # The max length of per epoch - max_ep_len: 1000 - # The size of mini batch - num_mini_batches: 64 - # The learning rate of Actor network - actor_lr: 0.0003 - # The learning rate of Critic network - critic_lr: 0.0003 - # The Address for saving training process data - data_dir: "./runs" - ## ---------------------------Basic configurations for derived class PPO---------------------- ## - # The thereshold for KL early stopping - target_kl: 0.02 - # The size of batch for policy update - batch_size: 10000 - # The clip range for PPO loss - clip: 0.2 - # The number of parallel environments - num_envs: 32 - # Whether to use standardized reward - reward_normalize: True - # Whether to use standardized cost - cost_normalize: True - # Whether to use standardized obs - obs_normalize: True - - # ---------------------------------------Optional Configuration-------------------------------- # - ## -----------------------------------Configuration For Cost Critic--------------------------- ## - # Whether to use cost critic - use_cost: False - # Cost discounted factor - cost_gamma: 1.0 - # Whether to use linear decay of learning rate - linear_lr_decay: True - # Whether to use exploration noise anneal - exploration_noise_anneal: False - # std - std: [0.5, 0.1] - # The coefficient of reward penalty - penalty_param: 0.0 - # Whether to use KL early stopping - kl_early_stopping: True - # Whether to use max gradient norm - use_max_grad_norm: True - # The thereshold of max gradient norm - max_grad_norm: 40 - # Whether to use critic network norm - use_critic_norm: True - # The norm coefficient of critic network - critic_norm_coeff: 0.001 - ## ---------------------------------------Configuration For Model----------------------------- ## + # training configurations + train_cfgs: + # device to use for training, options: cpu, cuda, cuda:0, cuda:0,1, etc. 
+ device: cpu + # number of threads for torch + torch_threads: 16 + # number of vectorized environments + vector_env_nums: 16 + # number of parallel agents, similar to A3C + parallel: 1 + # total number of steps to train + total_steps: 16384000 + # algorithm configurations + algo_cfgs: + # number of steps to update the policy + update_cycle: 32768 + # number of iterations to update the policy + update_iters: 40 + # batch size for each iteration + batch_size: 64 + # target kl divergence + target_kl: 0.02 + # entropy coefficient + entropy_coef: 0.0 + # normalize reward + reward_normalize: True + # normalize cost + cost_normalize: True + # normalize observation + obs_normalize: True + # early stop when the kl divergence exceeds the target kl + kl_early_stop: True + # use max gradient norm + use_max_grad_norm: True + # max gradient norm + max_grad_norm: 40.0 + # use critic norm + use_critic_norm: True + # critic norm coefficient + critic_norm_coef: 0.001 + # reward discount factor + gamma: 0.99 + # cost discount factor + cost_gamma: 0.99 + # lambda for gae + lam: 0.95 + # lambda for cost gae + lam_c: 0.95 + # clip ratio + clip: 0.2 + # advantage estimation method, options: gae, retrace + adv_estimation_method: gae + # standardize reward advantage + standardized_rew_adv: True + # standardize cost advantage + standardized_cost_adv: True + # penalty coefficient + penalty_coef: 0.0 + # use cost + use_cost: False + # logger configurations + logger_cfgs: + # use wandb for logging + use_wandb: False + # wandb project name + wandb_project: omnisafe + # use tensorboard for logging + use_tensorboard: True + # save model frequency + save_model_freq: 100 + # save logger path + log_dir: "./runs" + # window length for averaging logged episode statistics + window_lens: 100 + # model configurations model_cfgs: - # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal".
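One thing worth calling out in this hunk is that the old epochs/steps_per_epoch pair is gone: training length is now expressed through train_cfgs.total_steps together with algo_cfgs.update_cycle. A rough, unofficial mapping of the removed flat keys to their nested successors, inferred only from the lines above:

```python
# Hedged reading of this hunk: where the removed flat PPO options roughly ended up.
# The mapping is inferred from the diff, not taken from project documentation.
old_to_new = {
    'steps_per_epoch': 'algo_cfgs.update_cycle',        # 32768 in both layouts
    'actor_iters / critic_iters': 'algo_cfgs.update_iters',
    'num_mini_batches': 'algo_cfgs.batch_size',
    'num_envs': 'train_cfgs.vector_env_nums',
    'num_threads': 'train_cfgs.torch_threads',
    'data_dir': 'logger_cfgs.log_dir',
    'use_wandb / use_tensorboard': 'logger_cfgs.use_wandb / logger_cfgs.use_tensorboard',
}

# The old `epochs: 500` is now implied rather than stated:
total_steps = 16_384_000   # train_cfgs.total_steps
update_cycle = 32_768      # algo_cfgs.update_cycle
print(total_steps // update_cycle)  # 500 policy updates, matching the old epoch count
```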
+ # weight initialization mode weight_initialization_mode: "kaiming_uniform" - # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" + # actor type, options: gaussian, gaussian_learning actor_type: gaussian_learning - # Whether to use linear decay of learning rate + # linear learning rate decay linear_lr_decay: True - # Configuration of Actor network + # exploration noise anneal + exploration_noise_anneal: False + # std upper bound, and lower bound + std_range: [0.5, 0.1] + # actor network configurations actor: - # Size of hidden layers + # hidden layer sizes hidden_sizes: [64, 64] - # Activation function + # activation function activation: tanh - # The learning rate of Actor network + # out_activation: tanh + # learning rate lr: 0.0003 - # Configuration of Critic network critic: - # Size of hidden layers + # hidden layer sizes hidden_sizes: [64, 64] - # Activation function + # activation function activation: tanh - # The learning rate of Critic network + # learning rate lr: 0.0003 - ## --------------------------------------Configuration For Buffer----------------------------- ## - buffer_cfgs: - # Reward discounted factor - gamma: 0.99 - # Parameters used to estimate future rewards in GAE - lam: 0.95 - # Parameters used to estimate future costs in GAE - lam_c: 0.95 - # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" - adv_estimation_method: gae - # Whether to use reward standardized advantage estimation - standardized_rew_adv: True - # Whether to use cost standardized advantage estimation - standardized_cost_adv: True - ## --------------------------------------Configuration For Environment------------------------ ## - env_cfgs: - # The cost limit - cost_limit: 25.0 - # The number of parallel environments - num_envs: 8 - # Whether to use async environment - async_env: True - # Whether to use standardized reward - normalized_rew: True - # Whether to use standardized cost - normalized_cost: True - # Whether to use standardized obs - normalized_obs: True - # The maximum length of record queue - max_len: 100 - # The number of threads used to sample data - num_threads: 20 diff --git a/omnisafe/configs/on-policy/PPOEarlyTerminated.yaml b/omnisafe/configs/on-policy/PPOEarlyTerminated.yaml deleted file mode 100644 index dd755b259..000000000 --- a/omnisafe/configs/on-policy/PPOEarlyTerminated.yaml +++ /dev/null @@ -1,154 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -defaults: - # --------------------------------------Basic Configurations----------------------------------- # - ## -----------------------------Basic configurations for base class PG------------------------ ## - # The random seed - seed: 0 - # The number of threads used per experiment - num_threads: 1 - # If use tensorboard - use_tensorboard: True - # if use wandb - use_wandb: True - # The torch device - device: cpu - # The torch device id - device_id: 0 - # The environment wrapper type - wrapper_type: EarlyTerminatedWrapper - # Number of epochs - epochs: 500 - # Number of steps per epoch - steps_per_epoch: 32768 - # Number of update iteration for Actor network - actor_iters: 10 - # Number of update iteration for Critic network - critic_iters: 40 - # Check if all models own the same parameter values every `check_freq` epoch - check_freq: 25 - # Save model to disk every `check_freq` epochs - save_freq: 100 - # Entropy coefficient for PPO loss - entropy_coef: 0.0 - # The max length of per epoch - max_ep_len: 1000 - # The size of mini batch - num_mini_batches: 64 - # The learning rate of Actor network - actor_lr: 0.0003 - # The learning rate of Critic network - critic_lr: 0.0003 - # The Address for saving training process data - data_dir: "./runs" - ## ---------------------------Basic configurations for derived class PPO---------------------- ## - # The thereshold for KL early stopping - target_kl: 0.02 - # The size of batch for policy update - batch_size: 10000 - # The clip range for PPO loss - clip: 0.2 - # The number of parallel environments - num_envs: 1 - # Whether to use standardized reward - reward_normalize: True - # Whether to use standardized cost - cost_normalize: True - # Whether to use standardized obs - obs_normalize: True - # cost_limit - cost_limit: 25 - - # ---------------------------------------Optional Configuration-------------------------------- # - ## -----------------------------------Configuration For Cost Critic--------------------------- ## - # Whether to use cost critic - use_cost: False - # Cost discounted factor - cost_gamma: 1.0 - # Whether to use linear decay of learning rate - linear_lr_decay: True - # Whether to use exploration noise anneal - exploration_noise_anneal: False - # std - std: [0.5, 0.1] - # The coefficient of reward penalty - penalty_param: 0.0 - # Whether to use KL early stopping - kl_early_stopping: True - # Whether to use max gradient norm - use_max_grad_norm: True - # The thereshold of max gradient norm - max_grad_norm: 40 - # Whether to use standardized observation - standardized_obs: True - # Whether to use critic network norm - use_critic_norm: True - # The norm coefficient of critic network - critic_norm_coeff: 0.001 - ## ---------------------------------------Configuration For Model----------------------------- ## - model_cfgs: - # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
- weight_initialization_mode: "kaiming_uniform" - # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian_learning - # Whether to use linear decay of learning rate - linear_lr_decay: True - # Configuration of Actor network - actor: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # The learning rate of Actor network - lr: 0.0003 - # Configuration of Critic network - critic: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # The learning rate of Critic network - lr: 0.0003 - ## --------------------------------------Configuration For Buffer----------------------------- ## - buffer_cfgs: - # Reward discounted factor - gamma: 0.99 - # Parameters used to estimate future rewards in GAE - lam: 0.95 - # Parameters used to estimate future costs in GAE - lam_c: 0.95 - # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" - adv_estimation_method: gae - # Whether to use reward standardized advantage estimation - standardized_rew_adv: True - # Whether to use cost standardized advantage estimation - standardized_cost_adv: True - ## --------------------------------------Configuration For Environment------------------------ ## - env_cfgs: - # The number of parallel environments - num_envs: 8 - # Whether to use async environment - async_env: True - # Whether to use standardized reward - normalized_rew: True - # Whether to use standardized cost - normalized_cost: True - # Whether to use standardized obs - normalized_obs: True - # The maximum length of record queue - max_len: 100 - # The number of threads used to sample data - num_threads: 20 diff --git a/omnisafe/configs/on-policy/PPOLag.yaml b/omnisafe/configs/on-policy/PPOLag.yaml index 0eac1c639..bcda2641d 100644 --- a/omnisafe/configs/on-policy/PPOLag.yaml +++ b/omnisafe/configs/on-policy/PPOLag.yaml @@ -14,143 +14,111 @@ # ============================================================================== defaults: - # --------------------------------------Basic Configurations----------------------------------- # - ## -----------------------------Basic configurations for base class PG------------------------ ## - # The random seed + # seed for random number generator seed: 0 - # The number of threads used per experiment - num_threads: 1 - # If use tensorboard - use_tensorboard: True - # if use wandb - use_wandb: True - # The torch device - device: cpu - # The torch device id - device_id: 0 - # The environment wrapper type - wrapper_type: CMDPWrapper - # Number of epochs - epochs: 500 - # Number of steps per epoch - steps_per_epoch: 32768 - # Number of update iteration for Actor network - actor_iters: 10 - # Number of update iteration for Critic network - critic_iters: 40 - # Check if all models own the same parameter values every `check_freq` epoch - check_freq: 25 - # Save model to disk every `check_freq` epochs - save_freq: 100 - # Entropy coefficient for PPO loss - entropy_coef: 0.0 - # The max length of per epoch - max_ep_len: 1000 - # The size of mini batch - num_mini_batches: 64 - # The learning rate of Actor network - actor_lr: 0.0003 - # The learning rate of Critic network - critic_lr: 0.0003 - # The Address for saving training process data - data_dir: "./runs" - ## ---------------------------Basic configurations for derived class PPO---------------------- ## - # The thereshold for KL early stopping - target_kl: 0.02 - # The size of 
batch for policy update - batch_size: 10000 - # The clip range for PPO loss - clip: 0.2 - # The number of parallel environments - num_envs: 32 - # Whether to use standardized reward - reward_normalize: True - # Whether to use standardized cost - cost_normalize: True - # Whether to use standardized obs - obs_normalize: True - - # ---------------------------------------Optional Configuration-------------------------------- # - ## -----------------------------------Configuration For Cost Critic--------------------------- ## - # Whether to use cost critic - use_cost: True - # Cost discounted factor - cost_gamma: 1.0 - # Whether to use linear decay of learning rate - linear_lr_decay: True - # Whether to use exploration noise anneal - exploration_noise_anneal: False - # std - std: [0.5, 0.1] - # The coefficient of reward penalty - penalty_param: 0.0 - # Whether to use KL early stopping - kl_early_stopping: True - # Whether to use max gradient norm - use_max_grad_norm: True - # The thereshold of max gradient norm - max_grad_norm: 40 - # Whether to use standardized observation - standardized_obs: True - # Whether to use critic network norm - use_critic_norm: True - # The norm coefficient of critic network - critic_norm_coeff: 0.001 - ## ---------------------------------------Configuration For Model----------------------------- ## + # training configurations + train_cfgs: + # device to use for training, options: cpu, cuda, cuda:0, cuda:0,1, etc. + device: cpu + # number of threads for torch + torch_threads: 16 + # number of vectorized environments + vector_env_nums: 16 + # number of parallel agents, similar to A3C + parallel: 1 + # total number of steps to train + total_steps: 16384000 + # algorithm configurations + algo_cfgs: + # number of steps to update the policy + update_cycle: 32768 + # number of iterations to update the policy + update_iters: 40 + # batch size for each iteration + batch_size: 64 + # target kl divergence + target_kl: 0.02 + # entropy coefficient + entropy_coef: 0.0 + # normalize reward + reward_normalize: True + # normalize cost + cost_normalize: True + # normalize observation + obs_normalize: True + # early stop when the kl divergence exceeds the target kl + kl_early_stop: True + # use max gradient norm + use_max_grad_norm: True + # max gradient norm + max_grad_norm: 40 + # use critic norm + use_critic_norm: True + # critic norm coefficient + critic_norm_coef: 0.001 + # reward discount factor + gamma: 0.99 + # cost discount factor + cost_gamma: 0.99 + # lambda for gae + lam: 0.95 + # lambda for cost gae + lam_c: 0.95 + # clip ratio + clip: 0.2 + # advantage estimation method, options: gae, retrace + adv_estimation_method: gae + # standardize reward advantage + standardized_rew_adv: True + # standardize cost advantage + standardized_cost_adv: True + # penalty coefficient + penalty_coef: 0.0 + # use cost + use_cost: True + # logger configurations + logger_cfgs: + # use wandb for logging + use_wandb: False + # wandb project name + wandb_project: omnisafe + # use tensorboard for logging + use_tensorboard: True + # save model frequency + save_model_freq: 100 + # save logger path + log_dir: "./runs" + # window length for averaging logged episode statistics + window_lens: 100 + # model configurations model_cfgs: - # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal".
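Since PPOLag's defaults now live in these nested sections, user overrides have to follow the same nesting. The snippet below is a sketch under the assumption that the omnisafe.Agent entry point used in the example scripts accepts a nested custom_cfgs dict and a Safety-Gymnasium task id; treat the exact keyword and environment name as placeholders rather than a guaranteed API.

```python
# Sketch: overriding a few of the nested PPOLag defaults from user code.
# Assumes omnisafe.Agent(algo, env_id, custom_cfgs=...) as in the example scripts;
# the environment id below is a placeholder for whatever task is actually run.
import omnisafe

custom_cfgs = {
    'train_cfgs': {'total_steps': 1_024_000, 'vector_env_nums': 4},
    'algo_cfgs': {'update_cycle': 2_048, 'update_iters': 10},
    'logger_cfgs': {'use_wandb': False, 'log_dir': './runs'},
}

agent = omnisafe.Agent('PPOLag', 'SafetyPointGoal1-v0', custom_cfgs=custom_cfgs)
agent.learn()
```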
+ # weight initialization mode weight_initialization_mode: "kaiming_uniform" - # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" + # actor type, options: gaussian, gaussian_learning actor_type: gaussian_learning - # Whether to use linear decay of learning rate + # linear learning rate decay linear_lr_decay: True - # Configuration of Actor network + # exploration noise anneal + exploration_noise_anneal: False + # std upper bound, and lower bound + std_range: [0.5, 0.1] + # actor network configurations actor: - # Size of hidden layers + # hidden layer sizes hidden_sizes: [64, 64] - # Activation function + # activation function activation: tanh - # The learning rate of Actor network + # out_activation: tanh + # learning rate lr: 0.0003 - # Configuration of Critic network critic: - # Size of hidden layers + # hidden layer sizes hidden_sizes: [64, 64] - # Activation function + # activation function activation: tanh - # The learning rate of Critic network + # learning rate lr: 0.0003 - ## --------------------------------------Configuration For Buffer----------------------------- ## - buffer_cfgs: - # Reward discounted factor - gamma: 0.99 - # Parameters used to estimate future rewards in GAE - lam: 0.95 - # Parameters used to estimate future costs in GAE - lam_c: 0.95 - # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" - adv_estimation_method: gae - # Whether to use reward standardized advantage estimation - standardized_rew_adv: True - # Whether to use cost standardized advantage estimation - standardized_cost_adv: True - ## --------------------------------------Configuration For Environment------------------------ ## - env_cfgs: - # The number of parallel environments - num_envs: 8 - # Whether to use async environment - async_env: True - # Whether to use standardized reward - normalized_rew: False - # Whether to use standardized cost - normalized_cost: False - # Whether to use standardized obs - normalized_obs: True - # The maximum length of record queue - max_len: 100 - # The number of threads used to sample data - num_threads: 20 - ## --------------------------------------Configuration For Lagrange--------------------------- ## + # lagrangian configurations lagrange_cfgs: # Tolerance of constraint violation cost_limit: 25.0 diff --git a/omnisafe/configs/on-policy/PPOLagEarlyTerminated.yaml b/omnisafe/configs/on-policy/PPOLagEarlyTerminated.yaml deleted file mode 100644 index 3869b84fb..000000000 --- a/omnisafe/configs/on-policy/PPOLagEarlyTerminated.yaml +++ /dev/null @@ -1,164 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
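The lagrange_cfgs block that PPOLag keeps (cost_limit, plus the initial multiplier value, multiplier learning rate, and optimizer seen in the neighbouring Lagrangian configs) drives the usual Lagrangian-relaxation update: the multiplier grows while the observed episode cost exceeds cost_limit and is clamped at zero otherwise. A generic sketch of that update, not necessarily omnisafe's exact implementation:

```python
# Generic Lagrangian-multiplier update sketch driven by lagrange_cfgs-style values.
# Values mirror the configs in this patch; the code is illustrative, not omnisafe's own.
import torch

cost_limit = 25.0                                       # lagrange_cfgs.cost_limit
multiplier = torch.nn.Parameter(torch.tensor(0.001))    # lagrangian_multiplier_init
optimizer = torch.optim.Adam([multiplier], lr=0.035)    # lambda_lr with lambda_optimizer "Adam"


def update_multiplier(mean_ep_cost: float) -> None:
    """One ascent step on the multiplier: it grows when cost exceeds the limit."""
    optimizer.zero_grad()
    loss = -multiplier * (mean_ep_cost - cost_limit)
    loss.backward()
    optimizer.step()
    multiplier.data.clamp_(min=0.0)                     # keep the multiplier non-negative


update_multiplier(mean_ep_cost=30.0)                    # cost above the limit -> multiplier increases
```

In PPO-Lagrangian-style methods the resulting multiplier typically weights the cost advantage inside the clipped policy loss.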
-# ============================================================================== - -defaults: - # --------------------------------------Basic Configurations----------------------------------- # - ## -----------------------------Basic configurations for base class PG------------------------ ## - # The random seed - seed: 0 - # The number of threads used per experiment - num_threads: 1 - # If use tensorboard - use_tensorboard: True - # if use wandb - use_wandb: True - # The torch device - device: cpu - # The torch device id - device_id: 0 - # The environment wrapper type - wrapper_type: EarlyTerminatedWrapper - # Number of epochs - epochs: 500 - # Number of steps per epoch - steps_per_epoch: 32768 - # Number of update iteration for Actor network - actor_iters: 10 - # Number of update iteration for Critic network - critic_iters: 40 - # Check if all models own the same parameter values every `check_freq` epoch - check_freq: 25 - # Save model to disk every `check_freq` epochs - save_freq: 100 - # Entropy coefficient for PPO loss - entropy_coef: 0.0 - # The max length of per epoch - max_ep_len: 1000 - # The size of mini batch - num_mini_batches: 64 - # The learning rate of Actor network - actor_lr: 0.0003 - # The learning rate of Critic network - critic_lr: 0.0003 - # The Address for saving training process data - data_dir: "./runs" - ## ---------------------------Basic configurations for derived class PPO---------------------- ## - # The thereshold for KL early stopping - target_kl: 0.02 - # The size of batch for policy update - batch_size: 10000 - # The clip range for PPO loss - clip: 0.2 - # The number of parallel environments - num_envs: 1 - # Whether to use standardized reward - reward_normalize: True - # Whether to use standardized cost - cost_normalize: True - # Whether to use standardized obs - obs_normalize: True - # cost_limit - cost_limit: 25 - - # ---------------------------------------Optional Configuration-------------------------------- # - ## -----------------------------------Configuration For Cost Critic--------------------------- ## - # Whether to use cost critic - use_cost: False - # Cost discounted factor - cost_gamma: 1.0 - # Whether to use linear decay of learning rate - linear_lr_decay: True - # Whether to use exploration noise anneal - exploration_noise_anneal: False - # std - std: [0.5, 0.1] - # The coefficient of reward penalty - penalty_param: 0.0 - # Whether to use KL early stopping - kl_early_stopping: True - # Whether to use max gradient norm - use_max_grad_norm: True - # The thereshold of max gradient norm - max_grad_norm: 40 - # Whether to use standardized observation - standardized_obs: True - # Whether to use critic network norm - use_critic_norm: True - # The norm coefficient of critic network - critic_norm_coeff: 0.001 - ## ---------------------------------------Configuration For Model----------------------------- ## - model_cfgs: - # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
- weight_initialization_mode: "kaiming_uniform" - # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian_learning - # Whether to use linear decay of learning rate - linear_lr_decay: True - # Configuration of Actor network - actor: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # The learning rate of Actor network - lr: 0.0003 - # Configuration of Critic network - critic: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # The learning rate of Critic network - lr: 0.0003 - ## --------------------------------------Configuration For Buffer----------------------------- ## - buffer_cfgs: - # Reward discounted factor - gamma: 0.99 - # Parameters used to estimate future rewards in GAE - lam: 0.95 - # Parameters used to estimate future costs in GAE - lam_c: 0.95 - # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" - adv_estimation_method: gae - # Whether to use reward standardized advantage estimation - standardized_rew_adv: True - # Whether to use cost standardized advantage estimation - standardized_cost_adv: True - ## --------------------------------------Configuration For Environment------------------------ ## - env_cfgs: - # The number of parallel environments - num_envs: 8 - # Whether to use async environment - async_env: True - # Whether to use standardized reward - normalized_rew: False - # Whether to use standardized cost - normalized_cost: False - # Whether to use standardized obs - normalized_obs: True - # The maximum length of record queue - max_len: 100 - # The number of threads used to sample data - num_threads: 20 - ## --------------------------------------Configuration For Lagrange--------------------------- ## - lagrange_cfgs: - # Tolerance of constraint violation - cost_limit: 25.0 - # Initial value of lagrangian multiplier - lagrangian_multiplier_init: 0.001 - # Learning rate of lagrangian multiplier - lambda_lr: 0.035 - # Type of lagrangian optimizer - lambda_optimizer: "Adam" diff --git a/omnisafe/configs/on-policy/PPOLagSaute.yaml b/omnisafe/configs/on-policy/PPOLagSaute.yaml deleted file mode 100644 index 0fb8d846a..000000000 --- a/omnisafe/configs/on-policy/PPOLagSaute.yaml +++ /dev/null @@ -1,170 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -defaults: - # --------------------------------------Basic Configurations----------------------------------- # - ## -----------------------------Basic configurations for base class PG------------------------ ## - # The random seed - seed: 0 - # The number of threads used per experiment - num_threads: 1 - # If use tensorboard - use_tensorboard: True - # if use wandb - use_wandb: True - # The torch device - device: cpu - # The torch device id - device_id: 0 - # The environment wrapper type - wrapper_type: SauteWrapper - # Number of epochs - epochs: 500 - # Number of steps per epoch - steps_per_epoch: 32768 - # Number of update iteration for Actor network - actor_iters: 10 - # Number of update iteration for Critic network - critic_iters: 40 - # Check if all models own the same parameter values every `check_freq` epoch - check_freq: 25 - # Save model to disk every `check_freq` epochs - save_freq: 100 - # Entropy coefficient for PPO loss - entropy_coef: 0.0 - # The max length of per epoch - max_ep_len: 1000 - # The size of mini batch - num_mini_batches: 64 - # The learning rate of Actor network - actor_lr: 0.0003 - # The learning rate of Critic network - critic_lr: 0.0003 - # The Address for saving training process data - data_dir: "./runs" - ## ---------------------------Basic configurations for derived class PPO---------------------- ## - # The thereshold for KL early stopping - target_kl: 0.02 - # The size of batch for policy update - batch_size: 10000 - # The clip range for PPO loss - clip: 0.2 - # The number of parallel environments - num_envs: 32 - # Whether to use standardized reward - reward_normalize: False - # Whether to use standardized cost - cost_normalize: True - # Whether to use standardized obs - obs_normalize: True - - # ---------------------------------------Optional Configuration-------------------------------- # - ## -----------------------------------Configuration For Cost Critic--------------------------- ## - # Whether to use cost critic - use_cost: False - # Cost discounted factor - cost_gamma: 1.0 - # Whether to use linear decay of learning rate - linear_lr_decay: True - # Whether to use exploration noise anneal - exploration_noise_anneal: False - # std - std: [0.5, 0.1] - # The coefficient of reward penalty - penalty_param: 0.0 - # Whether to use KL early stopping - kl_early_stopping: True - # Whether to use max gradient norm - use_max_grad_norm: True - # The thereshold of max gradient norm - max_grad_norm: 40 - # Whether to use standardized observation - standardized_obs: True - # Whether to use critic network norm - use_critic_norm: True - # The norm coefficient of critic network - critic_norm_coeff: 0.001 - ## ---------------------------------------Configuration For Model----------------------------- ## - model_cfgs: - # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
- weight_initialization_mode: "kaiming_uniform" - # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian_learning - # Whether to use linear decay of learning rate - linear_lr_decay: True - # Configuration of Actor network - actor: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # The learning rate of Actor network - lr: 0.0003 - # Configuration of Critic network - critic: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # The learning rate of Critic network - lr: 0.0003 - ## --------------------------------------Configuration For Buffer----------------------------- ## - buffer_cfgs: - # Reward discounted factor - gamma: 0.99 - # Parameters used to estimate future rewards in GAE - lam: 0.95 - # Parameters used to estimate future costs in GAE - lam_c: 0.95 - # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" - adv_estimation_method: gae - # Whether to use reward standardized advantage estimation - standardized_rew_adv: True - # Whether to use cost standardized advantage estimation - standardized_cost_adv: True - ## ----------------------------------Configuration For Lagrangian multiplier---------------------- ## - lagrange_cfgs: - # Tolerance of constraint violation - cost_limit: 25.0 - # Initial value of lagrangian multiplier - lagrangian_multiplier_init: 0.001 - # Learning rate of lagrangian multiplier - lambda_lr: 0.035 - # Type of lagrangian optimizer - lambda_optimizer: "Adam" - ## Configuration For Env_Wrapper - env_cfgs: - # The reward when the state is unsafe - unsafe_reward: -0.1 - # safety_budget in saute is actually the same as ``cost_limmit``. - safety_budget: 25 - # The discount factor of cost in saute - saute_gamma: 0.9997 - # Whether to scale safety budget - scale_safety_budget: True - # The number of parallel environments - num_envs: 8 - # Whether to use async environment - async_env: True - # Whether to use standardized reward - normalized_rew: False - # Whether to use standardized cost - normalized_cost: False - # Whether to use standardized obs - normalized_obs: True - # The maximum length of record queue - max_ep_len: 1000 - # The number of threads used to sample data - num_threads: 20 diff --git a/omnisafe/configs/on-policy/PPOLagSimmerPid.yaml b/omnisafe/configs/on-policy/PPOLagSimmerPid.yaml deleted file mode 100644 index 094a06049..000000000 --- a/omnisafe/configs/on-policy/PPOLagSimmerPid.yaml +++ /dev/null @@ -1,186 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -defaults: - # --------------------------------------Basic Configurations----------------------------------- # - ## -----------------------------Basic configurations for base class PG------------------------ ## - # The random seed - seed: 0 - # The number of threads used per experiment - num_threads: 1 - # If use tensorboard - use_tensorboard: True - # if use wandb - use_wandb: True - # The torch device - device: cpu - # The torch device id - device_id: 0 - # The environment wrapper type - wrapper_type: SimmerWrapper - # Number of epochs - epochs: 500 - # Number of steps per epoch - steps_per_epoch: 32768 - # Number of update iteration for Actor network - actor_iters: 10 - # Number of update iteration for Critic network - critic_iters: 40 - # Check if all models own the same parameter values every `check_freq` epoch - check_freq: 25 - # Save model to disk every `check_freq` epochs - save_freq: 100 - # Entropy coefficient for PPO loss - entropy_coef: 0.0 - # The max length of per epoch - max_ep_len: 1000 - # The size of mini batch - num_mini_batches: 64 - # The learning rate of Actor network - actor_lr: 0.0003 - # The learning rate of Critic network - critic_lr: 0.0003 - # The Address for saving training process data - data_dir: "./runs" - ## ---------------------------Basic configurations for derived class PPO---------------------- ## - # The thereshold for KL early stopping - target_kl: 0.02 - # The size of batch for policy update - batch_size: 10000 - # The clip range for PPO loss - clip: 0.2 - - # ---------------------------------------Optional Configuration-------------------------------- # - ## -----------------------------------Configuration For Cost Critic--------------------------- ## - # Whether to use cost critic - use_cost: False - # Cost discounted factor - cost_gamma: 1.0 - # Whether to use linear decay of learning rate - linear_lr_decay: True - # Whether to use exploration noise anneal - exploration_noise_anneal: False - # The coefficient of reward penalty - penalty_param: 0.0 - # Whether to use KL early stopping - kl_early_stopping: True - # Whether to use max gradient norm - use_max_grad_norm: True - # The thereshold of max gradient norm - max_grad_norm: 40 - # Whether to use standardized observation - standardized_obs: True - # Whether to use critic network norm - use_critic_norm: True - # The norm coefficient of critic network - critic_norm_coeff: 0.001 - ## ---------------------------------------Configuration For Model----------------------------- ## - model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False - # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". - weight_initialization_mode: "kaiming_uniform" - # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # Output activation function - output_activation: identity - # Whether to scale action. - scale_action: False - # Whether to clip action. 
- clip_action: False - # Whther to learn the standard deviation of Gaussian noise - std_learning: True - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - ## --------------------------------------Configuration For Buffer----------------------------- ## - buffer_cfgs: - # Reward discounted factor - gamma: 0.99 - # Parameters used to estimate future rewards in GAE - lam: 0.95 - # Parameters used to estimate future costs in GAE - lam_c: 0.95 - # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" - adv_estimation_method: gae - # Whether to use reward standardized advantage estimation - standardized_rew_adv: True - # Whether to use cost standardized advantage estimation - standardized_cost_adv: True - ## ----------------------------------Configuration For Lagrangian multiplier---------------------- ## - lagrange_cfgs: - # Tolerance of constraint violation - cost_limit: 25.0 - # Initial value of lagrangian multiplier - lagrangian_multiplier_init: 0.001 - # Learning rate of lagrangian multiplier - lambda_lr: 0.035 - # Type of lagrangian optimizer - lambda_optimizer: "Adam" - ## Configuration For Env_Wrapper - env_cfgs: - # The number of parallel environments - num_envs: 8 - # Whether to use async environment - async_env: True - # Whether to use standardized reward - normalized_rew: False - # Whether to use standardized cost - normalized_cost: False - # Whether to use standardized obs - normalized_obs: True - # The maximum length of record queue - max_len: 100 - # The number of threads used to sample data - num_threads: 20 - # The reward when the state is unsafe - unsafe_reward: -0.1 - # The lower bound of safety budget - lower_budget: 15 - # The upper bound of safety budget - upper_budget: 25 - # The dicounted factor - simmer_gamma: 0.999 - # Whether to scale the safety budget - scale_safety_budget: True - # Type of Simmer Controller - simmer_controller: 'PID' - # Configuration of Simmer Controller - controller_cfgs: - # Kp for PID - pid_kp: 0.1 - # Ki for PID - pid_ki: 0.01 - # Kd for PID - pid_kd: 0.01 - # The step size for PID - step_size: 2 - # Lowpass filter coefficient - tau: 0.95 diff --git a/omnisafe/configs/on-policy/PPOLagSimmerQ.yaml b/omnisafe/configs/on-policy/PPOLagSimmerQ.yaml deleted file mode 100644 index bb511e3bd..000000000 --- a/omnisafe/configs/on-policy/PPOLagSimmerQ.yaml +++ /dev/null @@ -1,188 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -defaults: - # --------------------------------------Basic Configurations----------------------------------- # - ## -----------------------------Basic configurations for base class PG------------------------ ## - # The random seed - seed: 0 - # The number of threads used per experiment - num_threads: 1 - # If use tensorboard - use_tensorboard: True - # if use wandb - use_wandb: True - # The torch device - device: cpu - # The torch device id - device_id: 0 - # The environment wrapper type - wrapper_type: SimmerWrapper - # Number of epochs - epochs: 500 - # Number of steps per epoch - steps_per_epoch: 32768 - # Number of update iteration for Actor network - actor_iters: 10 - # Number of update iteration for Critic network - critic_iters: 40 - # Check if all models own the same parameter values every `check_freq` epoch - check_freq: 25 - # Save model to disk every `check_freq` epochs - save_freq: 100 - # Entropy coefficient for PPO loss - entropy_coef: 0.0 - # The max length of per epoch - max_ep_len: 1000 - # The size of mini batch - num_mini_batches: 64 - # The learning rate of Actor network - actor_lr: 0.0003 - # The learning rate of Critic network - critic_lr: 0.0003 - # The Address for saving training process data - data_dir: "./runs" - ## ---------------------------Basic configurations for derived class PPO---------------------- ## - # The thereshold for KL early stopping - target_kl: 0.02 - # The size of batch for policy update - batch_size: 10000 - # The clip range for PPO loss - clip: 0.2 - - # ---------------------------------------Optional Configuration-------------------------------- # - ## -----------------------------------Configuration For Cost Critic--------------------------- ## - # Whether to use cost critic - use_cost: True - # Cost discounted factor - cost_gamma: 1.0 - # Whether to use linear decay of learning rate - linear_lr_decay: True - # Whether to use exploration noise anneal - exploration_noise_anneal: False - # The coefficient of reward penalty - penalty_param: 0.0 - # Whether to use KL early stopping - kl_early_stopping: True - # Whether to use max gradient norm - use_max_grad_norm: True - # The thereshold of max gradient norm - max_grad_norm: 40 - # Whether to use standardized observation - standardized_obs: True - # Whether to use critic network norm - use_critic_norm: True - # The norm coefficient of critic network - critic_norm_coeff: 0.001 - ## ---------------------------------------Configuration For Model----------------------------- ## - model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False - # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". - weight_initialization_mode: "kaiming_uniform" - # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # Output activation function - output_activation: identity - # Whether to scale action. - scale_action: False - # Whether to clip action. 
- clip_action: False - # Whther to learn the standard deviation of Gaussian noise - std_learning: True - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - ## --------------------------------------Configuration For Buffer----------------------------- ## - buffer_cfgs: - # Reward discounted factor - gamma: 0.99 - # Parameters used to estimate future rewards in GAE - lam: 0.95 - # Parameters used to estimate future costs in GAE - lam_c: 0.95 - # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" - adv_estimation_method: gae - # Whether to use reward standardized advantage estimation - standardized_rew_adv: True - # Whether to use cost standardized advantage estimation - standardized_cost_adv: True - ## ----------------------------------Configuration For Lagrangian multiplier---------------------- ## - lagrange_cfgs: - # Tolerance of constraint violation - cost_limit: 25.0 - # Initial value of lagrangian multiplier - lagrangian_multiplier_init: 0.001 - # Learning rate of lagrangian multiplier - lambda_lr: 0.035 - # Type of lagrangian optimizer - lambda_optimizer: "Adam" - ## Configuration For Env_Wrapper - env_cfgs: - # The number of parallel environments - num_envs: 8 - # Whether to use async environment - async_env: True - # Whether to use standardized reward - normalized_rew: False - # Whether to use standardized cost - normalized_cost: False - # Whether to use standardized obs - normalized_obs: True - # The maximum length of record queue - max_len: 100 - # The number of threads used to sample data - num_threads: 20 - # The reward when the state is unsafe - unsafe_reward: -0.1 - # The lower bound of safety budget - lower_budget: 15 - # The upper bound of safety budget - upper_budget: 25 - # The dicounted factor - simmer_gamma: 0.999 - # Whether to scale the safety budget - scale_safety_budget: False - # Type of Simmer Controller - simmer_controller: 'Q' - # Configurations for controller - controller_cfgs: - # The dim of state space - state_dim: 5 - # The dim of action space - act_dim: 3 - # The theshold of safety budget - threshold: 2 - # The learning rate of Q network - q_lr: 0.1 - # The hyperparameter of episilon greedy - epsilon: 0.8 - # Lowpass filter coefficient - tau: 0.95 diff --git a/omnisafe/configs/on-policy/PPOSaute.yaml b/omnisafe/configs/on-policy/PPOSaute.yaml deleted file mode 100644 index c2a29791b..000000000 --- a/omnisafe/configs/on-policy/PPOSaute.yaml +++ /dev/null @@ -1,158 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -defaults: - # --------------------------------------Basic Configurations----------------------------------- # - ## -----------------------------Basic configurations for base class PG------------------------ ## - # The random seed - seed: 0 - # The number of threads used per experiment - num_threads: 1 - # If use tensorboard - use_tensorboard: True - # if use wandb - use_wandb: True - # The torch device - device: cpu - # The torch device id - device_id: 0 - # The environment wrapper type - wrapper_type: SauteWrapper - # Number of epochs - epochs: 500 - # Number of steps per epoch - steps_per_epoch: 32768 - # Number of update iteration for Actor network - actor_iters: 10 - # Number of update iteration for Critic network - critic_iters: 40 - # Check if all models own the same parameter values every `check_freq` epoch - check_freq: 25 - # Save model to disk every `check_freq` epochs - save_freq: 50 - # Entropy coefficient for PPO loss - entropy_coef: 0.0 - # The max length of per epoch - max_ep_len: 1000 - # The size of mini batch - num_mini_batches: 64 - # The learning rate of Actor network - actor_lr: 0.0003 - # The learning rate of Critic network - critic_lr: 0.0003 - # The Address for saving training process data - data_dir: "./runs" - ## ---------------------------Basic configurations for derived class PPO---------------------- ## - # The thereshold for KL early stopping - target_kl: 0.02 - # The size of batch for policy update - batch_size: 10000 - # The clip range for PPO loss - clip: 0.2 - # The number of parallel environments - num_envs: 32 - # Whether to use standardized reward - reward_normalize: False - # Whether to use standardized cost - cost_normalize: True - # Whether to use standardized obs - obs_normalize: True - - # ---------------------------------------Optional Configuration-------------------------------- # - ## -----------------------------------Configuration For Cost Critic--------------------------- ## - # Whether to use cost critic - use_cost: False - # Cost discounted factor - cost_gamma: 1.0 - # Whether to use linear decay of learning rate - linear_lr_decay: True - # Whether to use exploration noise anneal - exploration_noise_anneal: False - # std - std: [0.5, 0.1] - # The coefficient of reward penalty - penalty_param: 0.0 - # Whether to use KL early stopping - kl_early_stopping: True - # Whether to use max gradient norm - use_max_grad_norm: True - # The thereshold of max gradient norm - max_grad_norm: 40 - # Whether to use critic network norm - use_critic_norm: True - # The norm coefficient of critic network - critic_norm_coeff: 0.001 - ## ---------------------------------------Configuration For Model----------------------------- ## - model_cfgs: - # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
- weight_initialization_mode: "kaiming_uniform" - # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian_learning - # Whether to use linear decay of learning rate - linear_lr_decay: True - # Configuration of Actor network - actor: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # The learning rate of Actor network - lr: 0.0003 - # Configuration of Critic network - critic: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # The learning rate of Critic network - lr: 0.0003 - ## --------------------------------------Configuration For Buffer----------------------------- ## - buffer_cfgs: - # Reward discounted factor - gamma: 0.99 - # Parameters used to estimate future rewards in GAE - lam: 0.95 - # Parameters used to estimate future costs in GAE - lam_c: 0.95 - # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" - adv_estimation_method: gae - # Whether to use reward standardized advantage estimation - standardized_rew_adv: True - # Whether to use cost standardized advantage estimation - standardized_cost_adv: True - ## Configuration For Env_Wrapper - env_cfgs: - # The reward when the state is unsafe - unsafe_reward: -0.1 - # safety_budget in saute is actually the same as ``cost_limmit``. - safety_budget: 25 - # The discount factor of cost in saute - saute_gamma: 0.9997 - # Whether to scale safety budget - scale_safety_budget: True - # The number of parallel environments - num_envs: 8 - # Whether to use async environment - async_env: True - # Whether to use standardized reward - normalized_rew: True - # Whether to use standardized cost - normalized_cost: True - # Whether to use standardized obs - normalized_obs: True - # The maximum length of record queue - max_ep_len: 1000 - # The number of threads used to sample data - num_threads: 20 diff --git a/omnisafe/configs/on-policy/PPOSimmerPid.yaml b/omnisafe/configs/on-policy/PPOSimmerPid.yaml deleted file mode 100644 index 8df2fb19e..000000000 --- a/omnisafe/configs/on-policy/PPOSimmerPid.yaml +++ /dev/null @@ -1,176 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -defaults: - # --------------------------------------Basic Configurations----------------------------------- # - ## -----------------------------Basic configurations for base class PG------------------------ ## - # The random seed - seed: 0 - # The number of threads used per experiment - num_threads: 1 - # If use tensorboard - use_tensorboard: True - # if use wandb - use_wandb: True - # The torch device - device: cpu - # The torch device id - device_id: 0 - # The environment wrapper type - wrapper_type: SimmerWrapper - # Number of epochs - epochs: 500 - # Number of steps per epoch - steps_per_epoch: 32768 - # Number of update iteration for Actor network - actor_iters: 10 - # Number of update iteration for Critic network - critic_iters: 40 - # Check if all models own the same parameter values every `check_freq` epoch - check_freq: 25 - # Save model to disk every `check_freq` epochs - save_freq: 100 - # Entropy coefficient for PPO loss - entropy_coef: 0.0 - # The max length of per epoch - max_ep_len: 1000 - # The size of mini batch - num_mini_batches: 64 - # The learning rate of Actor network - actor_lr: 0.0003 - # The learning rate of Critic network - critic_lr: 0.0003 - # The Address for saving training process data - data_dir: "./runs" - ## ---------------------------Basic configurations for derived class PPO---------------------- ## - # The thereshold for KL early stopping - target_kl: 0.02 - # The size of batch for policy update - batch_size: 10000 - # The clip range for PPO loss - clip: 0.2 - - # ---------------------------------------Optional Configuration-------------------------------- # - ## -----------------------------------Configuration For Cost Critic--------------------------- ## - # Whether to use cost critic - use_cost: False - # Cost discounted factor - cost_gamma: 1.0 - # Whether to use linear decay of learning rate - linear_lr_decay: True - # Whether to use exploration noise anneal - exploration_noise_anneal: False - # The coefficient of reward penalty - penalty_param: 0.0 - # Whether to use KL early stopping - kl_early_stopping: True - # Whether to use max gradient norm - use_max_grad_norm: True - # The thereshold of max gradient norm - max_grad_norm: 40 - # Whether to use standardized observation - standardized_obs: True - # Whether to use critic network norm - use_critic_norm: True - # The norm coefficient of critic network - critic_norm_coeff: 0.001 - ## ---------------------------------------Configuration For Model----------------------------- ## - model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False - # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". - weight_initialization_mode: "kaiming_uniform" - # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # Output activation function - output_activation: identity - # Whether to scale action. - scale_action: False - # Whether to clip action. 
- clip_action: False - # Whther to learn the standard deviation of Gaussian noise - std_learning: True - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - ## --------------------------------------Configuration For Buffer----------------------------- ## - buffer_cfgs: - # Reward discounted factor - gamma: 0.99 - # Parameters used to estimate future rewards in GAE - lam: 0.95 - # Parameters used to estimate future costs in GAE - lam_c: 0.95 - # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" - adv_estimation_method: gae - # Whether to use reward standardized advantage estimation - standardized_rew_adv: True - # Whether to use cost standardized advantage estimation - standardized_cost_adv: True - ## Configuration For Env_Wrapper - env_cfgs: - # The number of parallel environments - num_envs: 8 - # Whether to use async environment - async_env: True - # Whether to use standardized reward - normalized_rew: True - # Whether to use standardized cost - normalized_cost: True - # Whether to use standardized obs - normalized_obs: True - # The maximum length of record queue - max_len: 100 - # The number of threads used to sample data - num_threads: 20 - # The reward when the state is unsafe - unsafe_reward: -0.1 - # The lower bound of safety budget - lower_budget: 15 - # The upper bound of safety budget - upper_budget: 25 - # The dicounted factor - simmer_gamma: 0.9997 - # Whether to scale the safety budget - scale_safety_budget: True - # Type of Simmer Controller - simmer_controller: 'PID' - # Configuration of Simmer Controller - controller_cfgs: - # Kp for PID - pid_kp: 0.1 - # Ki for PID - pid_ki: 0.01 - # Kd for PID - pid_kd: 0.01 - # The step size for PID - step_size: 3 - # Lowpass filter coefficient - tau: 0.05 diff --git a/omnisafe/configs/on-policy/PPOSimmerQ.yaml b/omnisafe/configs/on-policy/PPOSimmerQ.yaml deleted file mode 100644 index 3e81d063a..000000000 --- a/omnisafe/configs/on-policy/PPOSimmerQ.yaml +++ /dev/null @@ -1,178 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -defaults: - # --------------------------------------Basic Configurations----------------------------------- # - ## -----------------------------Basic configurations for base class PG------------------------ ## - # The random seed - seed: 0 - # The number of threads used per experiment - num_threads: 1 - # If use tensorboard - use_tensorboard: True - # if use wandb - use_wandb: True - # The torch device - device: cpu - # The torch device id - device_id: 0 - # The environment wrapper type - wrapper_type: SimmerWrapper - # Number of epochs - epochs: 500 - # Number of steps per epoch - steps_per_epoch: 32768 - # Number of update iteration for Actor network - actor_iters: 10 - # Number of update iteration for Critic network - critic_iters: 40 - # Check if all models own the same parameter values every `check_freq` epoch - check_freq: 25 - # Save model to disk every `check_freq` epochs - save_freq: 100 - # Entropy coefficient for PPO loss - entropy_coef: 0.0 - # The max length of per epoch - max_ep_len: 1000 - # The size of mini batch - num_mini_batches: 64 - # The learning rate of Actor network - actor_lr: 0.0003 - # The learning rate of Critic network - critic_lr: 0.0003 - # The Address for saving training process data - data_dir: "./runs" - ## ---------------------------Basic configurations for derived class PPO---------------------- ## - # The thereshold for KL early stopping - target_kl: 0.02 - # The size of batch for policy update - batch_size: 10000 - # The clip range for PPO loss - clip: 0.2 - - # ---------------------------------------Optional Configuration-------------------------------- # - ## -----------------------------------Configuration For Cost Critic--------------------------- ## - # Whether to use cost critic - use_cost: False - # Cost discounted factor - cost_gamma: 1.0 - # Whether to use linear decay of learning rate - linear_lr_decay: True - # Whether to use exploration noise anneal - exploration_noise_anneal: False - # The coefficient of reward penalty - penalty_param: 0.0 - # Whether to use KL early stopping - kl_early_stopping: True - # Whether to use max gradient norm - use_max_grad_norm: True - # The thereshold of max gradient norm - max_grad_norm: 40 - # Whether to use standardized observation - standardized_obs: True - # Whether to use critic network norm - use_critic_norm: True - # The norm coefficient of critic network - critic_norm_coeff: 0.001 - ## ---------------------------------------Configuration For Model----------------------------- ## - model_cfgs: - # Whether to share the weight of Actor network with Critic network - shared_weights: False - # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". - weight_initialization_mode: "kaiming_uniform" - # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian - # Configuration of Actor and Critic network - ac_kwargs: - # Configuration of Actor network - pi: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # Output activation function - output_activation: identity - # Whether to scale action. - scale_action: False - # Whether to clip action. 
- clip_action: False - # Whther to learn the standard deviation of Gaussian noise - std_learning: True - # The initial value of standard deviation of Gaussian noise - std_init: 1.0 - # Configuration of Critic network - val: - # Number of critic networks - num_critics: 1 - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - ## --------------------------------------Configuration For Buffer----------------------------- ## - buffer_cfgs: - # Reward discounted factor - gamma: 0.99 - # Parameters used to estimate future rewards in GAE - lam: 0.95 - # Parameters used to estimate future costs in GAE - lam_c: 0.95 - # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" - adv_estimation_method: gae - # Whether to use reward standardized advantage estimation - standardized_rew_adv: True - # Whether to use cost standardized advantage estimation - standardized_cost_adv: True - ## Configuration For Env_Wrapper - env_cfgs: - # The number of parallel environments - num_envs: 8 - # Whether to use async environment - async_env: True - # Whether to use standardized reward - normalized_rew: True - # Whether to use standardized cost - normalized_cost: True - # Whether to use standardized obs - normalized_obs: True - # The maximum length of record queue - max_len: 100 - # The number of threads used to sample data - num_threads: 20 - # The reward when the state is unsafe - unsafe_reward: -0.1 - # The lower bound of safety budget - lower_budget: 15 - # The upper bound of safety budget - upper_budget: 25 - # The dicounted factor - simmer_gamma: 0.9997 - # Whether to scale the safety budget - scale_safety_budget: False - # Type of Simmer Controller - simmer_controller: 'Q' - # Configurations for controller - controller_cfgs: - # The dim of state space - state_dim: 5 - # The dim of action space - act_dim: 3 - # The theshold of safety budget - threshold: 2 - # The learning rate of Q network - q_lr: 0.1 - # The hyperparameter of episilon greedy - epsilon: 0.8 - # Lowpass filter coefficient - tau: 0.95 diff --git a/omnisafe/configs/on-policy/PolicyGradient.yaml b/omnisafe/configs/on-policy/PolicyGradient.yaml index 2395e6c7f..e75f50162 100644 --- a/omnisafe/configs/on-policy/PolicyGradient.yaml +++ b/omnisafe/configs/on-policy/PolicyGradient.yaml @@ -14,129 +14,105 @@ # ============================================================================== defaults: - # --------------------------------------Basic Configurations----------------------------------- # - ## -----------------------------Basic configurations for base class PG------------------------ ## - # The random seed + # seed for random number generator seed: 0 - # The number of threads used per experiment - num_threads: 1 - # If use tensorboard - use_tensorboard: True - # if use wandb - use_wandb: True - # The torch device - device: cpu - # The torch device id - device_id: 0 - # The environment wrapper type - wrapper_type: CMDPWrapper - # Number of epochs - epochs: 500 - # Number of steps per epoch - steps_per_epoch: 32768 - # Number of update iteration for Actor network - actor_iters: 10 - # Number of update iteration for Critic network - critic_iters: 40 - # Check if all models own the same parameter values every `check_freq` epoch - check_freq: 25 - # Save model to disk every `check_freq` epochs - save_freq: 50 - # Entropy coefficient for PPO loss - entropy_coef: 0.0 - # The max length of per epoch - max_ep_len: 1000 - # The size of mini batch - num_mini_batches: 64 - # The Address for 
saving training process data - data_dir: "./runs" - ## ---------------------------Basic configurations for derived class PPO---------------------- ## - # The thereshold for KL early stopping - target_kl: 0.02 - # The size of batch for policy update - batch_size: 10000 - # The number of parallel environments - num_envs: 32 - # Whether to use standardized reward - reward_normalize: True - # Whether to use standardized cost - cost_normalize: True - # Whether to use standardized obs - obs_normalize: True - - # ---------------------------------------Optional Configuration-------------------------------- # - ## -----------------------------------Configuration For Cost Critic--------------------------- ## - # Whether to use cost critic - use_cost: False - # Cost discounted factor - cost_gamma: 1.0 - # Whether to use exploration noise anneal - exploration_noise_anneal: False - # std - std: [0.5, 0.1] - # The coefficient of reward penalty - penalty_param: 0.0 - # Whether to use KL early stopping - kl_early_stopping: True - # Whether to use max gradient norm - use_max_grad_norm: True - # The thereshold of max gradient norm - max_grad_norm: 40 - # Whether to use critic network norm - use_critic_norm: True - # The norm coefficient of critic network - critic_norm_coeff: 0.001 - ## ---------------------------------------Configuration For Model----------------------------- ## + # training configurations + train_cfgs: + # device to use for training, options: cpu, cuda, cuda:0, cuda:0,1, etc. + device: cpu + # number of threads for torch + torch_threads: 16 + # number of vectorized environments + vector_env_nums: 16 + # number of parallel agent, similar to a3c + parallel: 1 + # total number of steps to train + total_steps: 16384000 + # algorithm configurations + algo_cfgs: + # number of steps to update the policy + update_cycle: 32768 + # number of iterations to update the policy + update_iters: 10 + # batch size for each iteration + batch_size: 64 + # target kl divergence + target_kl: 0.02 + # entropy coefficient + entropy_coef: 0.0 + # normalize reward + reward_normalize: True + # normalize cost + cost_normalize: True + # normalize observation + obs_normalize: True + # early stop when kl divergence is bigger than target kl + kl_early_stop: True + # use max gradient norm + use_max_grad_norm: True + # max gradient norm + max_grad_norm: 40 + # use critic norm + use_critic_norm: True + # critic norm coefficient + critic_norm_coef: 0.001 + # reward discount factor + gamma: 0.99 + # cost discount factor + cost_gamma: 0.99 + # lambda for gae + lam: 0.95 + # lambda for cost gae + lam_c: 0.95 + # advantage estimation method, options: gae, retrace + adv_estimation_method: gae + # standardize reward advantage + standardized_rew_adv: True + # standardize cost advantage + standardized_cost_adv: True + # penalty coefficient + penalty_coef: 0.0 + # use cost + use_cost: False + # logger configurations + logger_cfgs: + # use wandb for logging + use_wandb: False + # wandb project name + wandb_project: omnisafe + # use tensorboard for logging + use_tensorboard: True + # save model frequency + save_model_freq: 100 + # save logger path + log_dir: "./runs" + # save model path + window_lens: 100 + # model configurations model_cfgs: - # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
+ # weight initialization mode weight_initialization_mode: "kaiming_uniform" - # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" + # actor type, options: gaussian, gaussian_learning actor_type: gaussian_learning - # Whether to use linear decay of learning rate + # linear learning rate decay linear_lr_decay: True - # Configuration of Actor network + # exploration noise anneal + exploration_noise_anneal: False + # std upper bound, and lower bound + std_range: [0.5, 0.1] + # actor network configurations actor: - # Size of hidden layers + # hidden layer sizes hidden_sizes: [64, 64] - # Activation function + # activation function activation: tanh - # The learning rate of Actor network + # out_activation: tanh + # learning rate lr: 0.0003 - # Configuration of Critic network critic: - # Size of hidden layers + # hidden layer sizes hidden_sizes: [64, 64] - # Activation function + # activation function activation: tanh - # The learning rate of Critic network + # learning rate lr: 0.0003 - ## --------------------------------------Configuration For Buffer----------------------------- ## - buffer_cfgs: - # Reward discounted factor - gamma: 0.99 - # Parameters used to estimate future rewards in GAE - lam: 0.95 - # Parameters used to estimate future costs in GAE - lam_c: 0.95 - # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" - adv_estimation_method: gae - # Whether to use reward standardized advantage estimation - standardized_rew_adv: True - # Whether to use cost standardized advantage estimation - standardized_cost_adv: True - ## --------------------------------------Configuration For Environment------------------------ ## - env_cfgs: - # The number of parallel environments - num_envs: 8 - # Whether to use async environment - async_env: True - # Whether to use standardized reward - normalized_rew: True - # Whether to use standardized cost - normalized_cost: True - # Whether to use standardized obs - normalized_obs: True - # The maximum length of record queue - max_len: 100 - # The number of threads used to sample data - num_threads: 20 diff --git a/omnisafe/configs/on-policy/RCPO.yaml b/omnisafe/configs/on-policy/RCPO.yaml index 13afd06ab..0312d7a85 100644 --- a/omnisafe/configs/on-policy/RCPO.yaml +++ b/omnisafe/configs/on-policy/RCPO.yaml @@ -14,147 +14,117 @@ # ============================================================================== defaults: - # --------------------------------------Basic Configurations----------------------------------- # - ## -----------------------------Basic configurations for base class PG------------------------ ## - # The random seed + # seed for random number generator seed: 0 - # The number of threads used per experiment - num_threads: 1 - # If use tensorboard - use_tensorboard: True - # if use wandb - use_wandb: True - # The torch device - device: cpu - # The torch device id - device_id: 0 - # The environment wrapper type - wrapper_type: CMDPWrapper - # Number of epochs - epochs: 500 - # Number of steps per epoch - steps_per_epoch: 32768 - # Number of update iteration for Actor network - actor_iters: 10 - # Number of update iteration for Critic network - critic_iters: 40 - # Check if all models own the same parameter values every `check_freq` epoch - check_freq: 25 - # Save model to disk every `check_freq` epochs - save_freq: 100 - # Entropy coefficient for PPO loss - entropy_coef: 0.0 - # The max length of per epoch - max_ep_len: 1000 - # The size of 
mini batch - num_mini_batches: 16384 - # The learning rate of Actor network - actor_lr: 0.001 - # The learning rate of Critic network - critic_lr: 0.001 - # The Address for saving training process data - data_dir: "./runs" - ## --------------------------Basic configurations for derived class NaturalPG----------------- ## - # The thereshold for KL early stopping - target_kl: 0.01 - # Tolerance of constraint violation - cost_limit: 25 - # Damping value for conjugate gradient - cg_damping: 0.1 - # Number of conjugate gradient iterations - cg_iters: 10 - # Subsampled observation - fvp_obs: None - # The number of parallel environments - num_envs: 32 - # Whether to use standardized reward - reward_normalize: True - # Whether to use standardized cost - cost_normalize: True - # Whether to use standardized obs - obs_normalize: True - - # ---------------------------------------Optional Configuration-------------------------------- # - ## -----------------------------------Configuration For Cost Critic--------------------------- ## - # Whether to use cost critic - use_cost: True - # Cost discounted factor - cost_gamma: 1.0 - # Whether to use linear decay of learning rate - linear_lr_decay: False - # Whether to use exploration noise anneal - exploration_noise_anneal: False - # std - std: [0.5, 0.1] - # The coefficient of reward penalty - penalty_param: 0.0 - # Whether to use KL early stopping - kl_early_stopping: True - # Whether to use max gradient norm - use_max_grad_norm: True - # The thereshold of max gradient norm - max_grad_norm: 40 - # Whether to use standardized observation - standardized_obs: True - # Whether to use critic network norm - use_critic_norm: True - # The norm coefficient of critic network - critic_norm_coeff: 0.001 - ## ---------------------------------------Configuration For Model----------------------------- ## + # training configurations + train_cfgs: + # device to use for training, options: cpu, cuda, cuda:0, cuda:0,1, etc. 
+ device: cpu + # number of threads for torch + torch_threads: 16 + # number of vectorized environments + vector_env_nums: 16 + # number of parallel agent, similar to a3c + parallel: 1 + # total number of steps to train + total_steps: 16384000 + # algorithm configurations + algo_cfgs: + # number of steps to update the policy + update_cycle: 32768 + # number of iterations to update the policy + update_iters: 40 + # batch size for each iteration + batch_size: 16384 + # target kl divergence + target_kl: 0.02 + # entropy coefficient + entropy_coef: 0.0 + # normalize reward + reward_normalize: True + # normalize cost + cost_normalize: True + # normalize observation + obs_normalize: True + # early stop when kl divergence is bigger than target kl + kl_early_stop: True + # use max gradient norm + use_max_grad_norm: True + # max gradient norm + max_grad_norm: 40 + # use critic norm + use_critic_norm: True + # critic norm coefficient + critic_norm_coef: 0.001 + # reward discount factor + gamma: 0.99 + # cost discount factor + cost_gamma: 0.99 + # lambda for gae + lam: 0.95 + # lambda for cost gae + lam_c: 0.95 + # clip ratio + clip: 0.2 + # advantage estimation method, options: gae, retrace + adv_estimation_method: gae + # standardize reward advantage + standardized_rew_adv: True + # standardize cost advantage + standardized_cost_adv: True + # penalty coefficient + penalty_coef: 0.0 + # use cost + use_cost: True + # damping value for conjugate gradient + cg_damping: 0.1 + # number of conjugate gradient iterations + cg_iters: 10 + # subsampled observation + fvp_obs: None + # logger configurations + logger_cfgs: + # use wandb for logging + use_wandb: False + # wandb project name + wandb_project: omnisafe + # use tensorboard for logging + use_tensorboard: True + # save model frequency + save_model_freq: 100 + # save logger path + log_dir: "./runs" + # save model path + window_lens: 100 + # model configurations model_cfgs: - # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
+ # weight initialization mode weight_initialization_mode: "kaiming_uniform" - # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" + # actor type, options: gaussian, gaussian_learning actor_type: gaussian_learning - # Whether to use linear decay of learning rate + # linear learning rate decay linear_lr_decay: True - # Configuration of Actor network + # exploration noise anneal + exploration_noise_anneal: False + # std upper bound, and lower bound + std_range: [0.5, 0.1] + # actor network configurations actor: - # Size of hidden layers + # hidden layer sizes hidden_sizes: [64, 64] - # Activation function + # activation function activation: tanh - # The learning rate of Actor network - lr: 0.0003 - # Configuration of Critic network + # out_activation: tanh + # learning rate + lr: None critic: - # Size of hidden layers + # hidden layer sizes hidden_sizes: [64, 64] - # Activation function + # activation function activation: tanh - # The learning rate of Critic network + # learning rate lr: 0.0003 - ## --------------------------------------Configuration For Buffer----------------------------- ## - buffer_cfgs: - # Reward discounted factor - gamma: 0.99 - # Parameters used to estimate future rewards in GAE - lam: 0.95 - # Parameters used to estimate future costs in GAE - lam_c: 0.95 - # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" - adv_estimation_method: gae - # Whether to use reward standardized advantage estimation - standardized_rew_adv: True - # Whether to use cost standardized advantage estimation - standardized_cost_adv: True - ## --------------------------------------Configuration For Environment------------------------ ## - env_cfgs: - # The number of parallel environments - num_envs: 8 - # Whether to use async environment - async_env: True - # Whether to use standardized reward - normalized_rew: False - # Whether to use standardized cost - normalized_cost: False - # Whether to use standardized obs - normalized_obs: True - # The maximum length of record queue - max_len: 100 - # The number of threads used to sample data - num_threads: 20 - ## --------------------------------------Configuration For Lagrange--------------------------- ## + # lagrangian configurations lagrange_cfgs: # Tolerance of constraint violation cost_limit: 25.0 diff --git a/omnisafe/configs/on-policy/TRPO.yaml b/omnisafe/configs/on-policy/TRPO.yaml index 5e613b820..9db8fb013 100644 --- a/omnisafe/configs/on-policy/TRPO.yaml +++ b/omnisafe/configs/on-policy/TRPO.yaml @@ -14,143 +14,113 @@ # ============================================================================== defaults: - # --------------------------------------Basic Configurations----------------------------------- # - ## -----------------------------Basic configurations for base class PG------------------------ ## - # The random seed + # seed for random number generator seed: 0 - # The number of threads used per experiment - num_threads: 1 - # If use tensorboard - use_tensorboard: True - # if use wandb - use_wandb: True - # The torch device - device: cpu - # The torch device id - device_id: 0 - # The environment wrapper type - wrapper_type: CMDPWrapper - # Number of epochs - epochs: 500 - # Number of steps per epoch - steps_per_epoch: 32768 - # Number of update iteration for Actor network - actor_iters: 10 - # Number of update iteration for Critic network - critic_iters: 40 - # Check if all models own the same parameter values every `check_freq` 
epoch - check_freq: 25 - # Save model to disk every `check_freq` epochs - save_freq: 100 - # Entropy coefficient for PPO loss - entropy_coef: 0.0 - # The max length of per epoch - max_ep_len: 1000 - # The size of mini batch - num_mini_batches: 16384 - # The learning rate of Actor network - actor_lr: 0.001 - # The learning rate of Critic network - critic_lr: 0.001 - # The Address for saving training process data - data_dir: "./runs" - ## --------------------------Basic configurations for derived class NaturalPG----------------- ## - # The thereshold for KL early stopping - target_kl: 0.01 - # Tolerance of constraint violation - cost_limit: 25 - # Damping value for conjugate gradient - cg_damping: 0.1 - # Number of conjugate gradient iterations - cg_iters: 10 - # Subsampled observation - fvp_obs: None - # The number of parallel environments - num_envs: 32 - # Whether to use standardized reward - reward_normalize: True - # Whether to use standardized cost - cost_normalize: True - # Whether to use standardized obs - obs_normalize: True - - # ---------------------------------------Optional Configuration-------------------------------- # - ## -----------------------------------Configuration For Cost Critic--------------------------- ## - # Whether to use cost critic - use_cost: False - # Cost discounted factor - cost_gamma: 1.0 - # Whether to use linear decay of learning rate - linear_lr_decay: False - # Whether to use exploration noise anneal - exploration_noise_anneal: False - # std - std: [0.5, 0.1] - # The coefficient of reward penalty - penalty_param: 0.0 - # Whether to use KL early stopping - kl_early_stopping: True - # Whether to use max gradient norm - use_max_grad_norm: True - # The thereshold of max gradient norm - max_grad_norm: 40 - # Whether to use standardized observation - standardized_obs: True - # Whether to use critic network norm - use_critic_norm: True - # The norm coefficient of critic network - critic_norm_coeff: 0.001 - ## ---------------------------------------Configuration For Model----------------------------- ## + # training configurations + train_cfgs: + # device to use for training, options: cpu, cuda, cuda:0, cuda:0,1, etc. 
+ device: cpu + # number of threads for torch + torch_threads: 16 + # number of vectorized environments + vector_env_nums: 16 + # number of parallel agent, similar to a3c + parallel: 1 + # total number of steps to train + total_steps: 16384000 + # algorithm configurations + algo_cfgs: + # number of steps to update the policy + update_cycle: 32768 + # number of iterations to update the policy + update_iters: 10 + # batch size for each iteration + batch_size: 16384 + # target kl divergence + target_kl: 0.02 + # entropy coefficient + entropy_coef: 0.0 + # normalize reward + reward_normalize: True + # normalize cost + cost_normalize: True + # normalize observation + obs_normalize: True + # early stop when kl divergence is bigger than target kl + kl_early_stop: True + # use max gradient norm + use_max_grad_norm: True + # max gradient norm + max_grad_norm: 40 + # use critic norm + use_critic_norm: True + # critic norm coefficient + critic_norm_coef: 0.001 + # reward discount factor + gamma: 0.99 + # cost discount factor + cost_gamma: 0.99 + # lambda for gae + lam: 0.95 + # lambda for cost gae + lam_c: 0.95 + # clip ratio + clip: 0.2 + # advantage estimation method, options: gae, retrace + adv_estimation_method: gae + # standardize reward advantage + standardized_rew_adv: True + # standardize cost advantage + standardized_cost_adv: True + # penalty coefficient + penalty_coef: 0.0 + # use cost + use_cost: False + # Damping value for conjugate gradient + cg_damping: 0.1 + # Number of conjugate gradient iterations + cg_iters: 10 + # Subsampled observation + fvp_obs: None + # logger configurations + logger_cfgs: + # use wandb for logging + use_wandb: False + # wandb project name + wandb_project: omnisafe + # use tensorboard for logging + use_tensorboard: True + # save model frequency + save_model_freq: 100 + # save logger path + log_dir: "./runs" + # save model path + window_lens: 100 + # model configurations model_cfgs: - # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
+ # weight initialization mode weight_initialization_mode: "kaiming_uniform" - # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" + # actor type, options: gaussian, gaussian_learning actor_type: gaussian_learning - # Whether to use linear decay of learning rate - linear_lr_decay: True - # Configuration of Actor network + # linear learning rate decay + linear_lr_decay: False + # exploration noise anneal + exploration_noise_anneal: False + # std upper bound, and lower bound + std_range: [0.5, 0.1] + # actor network configurations actor: - # Size of hidden layers + # hidden layer sizes hidden_sizes: [64, 64] - # Activation function + # activation function activation: tanh - # The learning rate of Actor network - lr: 0.0003 - # Configuration of Critic network + # out_activation: tanh + # learning rate + lr: None critic: - # Size of hidden layers + # hidden layer sizes hidden_sizes: [64, 64] - # Activation function + # activation function activation: tanh - # The learning rate of Critic network - lr: 0.0003 - ## --------------------------------------Configuration For Buffer----------------------------- ## - buffer_cfgs: - # Reward discounted factor - gamma: 0.99 - # Parameters used to estimate future rewards in GAE - lam: 0.95 - # Parameters used to estimate future costs in GAE - lam_c: 0.95 - # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" - adv_estimation_method: gae - # Whether to use reward standardized advantage estimation - standardized_rew_adv: True - # Whether to use cost standardized advantage estimation - standardized_cost_adv: True - ## --------------------------------------Configuration For Environment------------------------ ## - env_cfgs: - # The number of parallel environments - num_envs: 8 - # Whether to use async environment - async_env: True - # Whether to use standardized reward - normalized_rew: True - # Whether to use standardized cost - normalized_cost: True - # Whether to use standardized obs - normalized_obs: True - # The maximum length of record queue - max_len: 100 - # The number of threads used to sample data - num_threads: 20 + # learning rate + lr: 0.001 diff --git a/omnisafe/configs/on-policy/TRPOLag.yaml b/omnisafe/configs/on-policy/TRPOLag.yaml index 13afd06ab..8a087de8d 100644 --- a/omnisafe/configs/on-policy/TRPOLag.yaml +++ b/omnisafe/configs/on-policy/TRPOLag.yaml @@ -14,147 +14,117 @@ # ============================================================================== defaults: - # --------------------------------------Basic Configurations----------------------------------- # - ## -----------------------------Basic configurations for base class PG------------------------ ## - # The random seed + # seed for random number generator seed: 0 - # The number of threads used per experiment - num_threads: 1 - # If use tensorboard - use_tensorboard: True - # if use wandb - use_wandb: True - # The torch device - device: cpu - # The torch device id - device_id: 0 - # The environment wrapper type - wrapper_type: CMDPWrapper - # Number of epochs - epochs: 500 - # Number of steps per epoch - steps_per_epoch: 32768 - # Number of update iteration for Actor network - actor_iters: 10 - # Number of update iteration for Critic network - critic_iters: 40 - # Check if all models own the same parameter values every `check_freq` epoch - check_freq: 25 - # Save model to disk every `check_freq` epochs - save_freq: 100 - # Entropy coefficient for PPO loss - entropy_coef: 0.0 - 
# The max length of per epoch - max_ep_len: 1000 - # The size of mini batch - num_mini_batches: 16384 - # The learning rate of Actor network - actor_lr: 0.001 - # The learning rate of Critic network - critic_lr: 0.001 - # The Address for saving training process data - data_dir: "./runs" - ## --------------------------Basic configurations for derived class NaturalPG----------------- ## - # The thereshold for KL early stopping - target_kl: 0.01 - # Tolerance of constraint violation - cost_limit: 25 - # Damping value for conjugate gradient - cg_damping: 0.1 - # Number of conjugate gradient iterations - cg_iters: 10 - # Subsampled observation - fvp_obs: None - # The number of parallel environments - num_envs: 32 - # Whether to use standardized reward - reward_normalize: True - # Whether to use standardized cost - cost_normalize: True - # Whether to use standardized obs - obs_normalize: True - - # ---------------------------------------Optional Configuration-------------------------------- # - ## -----------------------------------Configuration For Cost Critic--------------------------- ## - # Whether to use cost critic - use_cost: True - # Cost discounted factor - cost_gamma: 1.0 - # Whether to use linear decay of learning rate - linear_lr_decay: False - # Whether to use exploration noise anneal - exploration_noise_anneal: False - # std - std: [0.5, 0.1] - # The coefficient of reward penalty - penalty_param: 0.0 - # Whether to use KL early stopping - kl_early_stopping: True - # Whether to use max gradient norm - use_max_grad_norm: True - # The thereshold of max gradient norm - max_grad_norm: 40 - # Whether to use standardized observation - standardized_obs: True - # Whether to use critic network norm - use_critic_norm: True - # The norm coefficient of critic network - critic_norm_coeff: 0.001 - ## ---------------------------------------Configuration For Model----------------------------- ## + # training configurations + train_cfgs: + # device to use for training, options: cpu, cuda, cuda:0, cuda:0,1, etc. 
+ device: cpu + # number of threads for torch + torch_threads: 16 + # number of vectorized environments + vector_env_nums: 16 + # number of parallel agent, similar to a3c + parallel: 1 + # total number of steps to train + total_steps: 16384000 + # algorithm configurations + algo_cfgs: + # number of steps to update the policy + update_cycle: 32768 + # number of iterations to update the policy + update_iters: 40 + # batch size for each iteration + batch_size: 16384 + # target kl divergence + target_kl: 0.01 + # entropy coefficient + entropy_coef: 0.0 + # normalize reward + reward_normalize: True + # normalize cost + cost_normalize: True + # normalize observation + obs_normalize: True + # early stop when kl divergence is bigger than target kl + kl_early_stop: True + # use max gradient norm + use_max_grad_norm: True + # max gradient norm + max_grad_norm: 40 + # use critic norm + use_critic_norm: True + # critic norm coefficient + critic_norm_coef: 0.001 + # reward discount factor + gamma: 0.99 + # cost discount factor + cost_gamma: 0.99 + # lambda for gae + lam: 0.95 + # lambda for cost gae + lam_c: 0.95 + # clip ratio + clip: 0.2 + # advantage estimation method, options: gae, retrace + adv_estimation_method: gae + # standardize reward advantage + standardized_rew_adv: True + # standardize cost advantage + standardized_cost_adv: True + # penalty coefficient + penalty_coef: 0.0 + # use cost + use_cost: True + # damping value for conjugate gradient + cg_damping: 0.1 + # number of conjugate gradient iterations + cg_iters: 10 + # subsampled observation + fvp_obs: None + # logger configurations + logger_cfgs: + # use wandb for logging + use_wandb: False + # wandb project name + wandb_project: omnisafe + # use tensorboard for logging + use_tensorboard: True + # save model frequency + save_model_freq: 100 + # save logger path + log_dir: "./runs" + # save model path + window_lens: 100 + # model configurations model_cfgs: - # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
+ # weight initialization mode weight_initialization_mode: "kaiming_uniform" - # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" + # actor type, options: gaussian, gaussian_learning actor_type: gaussian_learning - # Whether to use linear decay of learning rate + # linear learning rate decay linear_lr_decay: True - # Configuration of Actor network + # exploration noise anneal + exploration_noise_anneal: False + # std upper bound, and lower bound + std_range: [0.5, 0.1] + # actor network configurations actor: - # Size of hidden layers + # hidden layer sizes hidden_sizes: [64, 64] - # Activation function + # activation function activation: tanh - # The learning rate of Actor network - lr: 0.0003 - # Configuration of Critic network + # out_activation: tanh + # learning rate + lr: None critic: - # Size of hidden layers + # hidden layer sizes hidden_sizes: [64, 64] - # Activation function + # activation function activation: tanh - # The learning rate of Critic network + # learning rate lr: 0.0003 - ## --------------------------------------Configuration For Buffer----------------------------- ## - buffer_cfgs: - # Reward discounted factor - gamma: 0.99 - # Parameters used to estimate future rewards in GAE - lam: 0.95 - # Parameters used to estimate future costs in GAE - lam_c: 0.95 - # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" - adv_estimation_method: gae - # Whether to use reward standardized advantage estimation - standardized_rew_adv: True - # Whether to use cost standardized advantage estimation - standardized_cost_adv: True - ## --------------------------------------Configuration For Environment------------------------ ## - env_cfgs: - # The number of parallel environments - num_envs: 8 - # Whether to use async environment - async_env: True - # Whether to use standardized reward - normalized_rew: False - # Whether to use standardized cost - normalized_cost: False - # Whether to use standardized obs - normalized_obs: True - # The maximum length of record queue - max_len: 100 - # The number of threads used to sample data - num_threads: 20 - ## --------------------------------------Configuration For Lagrange--------------------------- ## + # lagrangian configurations lagrange_cfgs: # Tolerance of constraint violation cost_limit: 25.0 diff --git a/omnisafe/configs/on-policy/TRPOPid.yaml b/omnisafe/configs/on-policy/TRPOPid.yaml deleted file mode 100644 index b34931071..000000000 --- a/omnisafe/configs/on-policy/TRPOPid.yaml +++ /dev/null @@ -1,180 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
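Both RCPO and TRPOLag keep a `lagrange_cfgs` section whose `cost_limit` sets the constraint threshold. As a generic sketch of the Lagrangian-relaxation idea behind these algorithms, with the class name, initial value, and learning rate assumed rather than taken from OmniSafe's own Lagrange implementation (which is not part of this diff):

```python
import torch

# Illustrative only: a minimal Lagrange-multiplier update keyed on
# lagrange_cfgs.cost_limit. Names, the init value, and the learning rate are
# assumptions for the sketch, not OmniSafe's Lagrange class.
class LagrangeMultiplier:
    def __init__(self, cost_limit: float = 25.0, lambda_init: float = 0.001, lambda_lr: float = 0.035):
        self.cost_limit = cost_limit
        self.lagrangian_multiplier = torch.nn.Parameter(torch.tensor(lambda_init))
        self.optimizer = torch.optim.Adam([self.lagrangian_multiplier], lr=lambda_lr)

    def update(self, mean_episode_cost: float) -> float:
        """One ascent step on lambda * (Jc - cost_limit); lambda stays non-negative."""
        self.optimizer.zero_grad()
        loss = -self.lagrangian_multiplier * (mean_episode_cost - self.cost_limit)
        loss.backward()
        self.optimizer.step()
        with torch.no_grad():
            self.lagrangian_multiplier.clamp_(min=0.0)
        return float(self.lagrangian_multiplier.item())
```

In a TRPOLag-style update the resulting multiplier scales the cost advantage before the trust-region step, so exceeding `cost_limit` pushes lambda up while staying under it lets lambda decay back toward zero.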
-# ============================================================================== - -defaults: - # --------------------------------------Basic Configurations----------------------------------- # - ## -----------------------------Basic configurations for base class PG------------------------ ## - # The random seed - seed: 0 - # The number of threads used per experiment - num_threads: 1 - # If use tensorboard - use_tensorboard: True - # if use wandb - use_wandb: True - # The torch device - device: cpu - # The torch device id - device_id: 0 - # The environment wrapper type - wrapper_type: CMDPWrapper - # Number of epochs - epochs: 500 - # Number of steps per epoch - steps_per_epoch: 32768 - # Number of update iteration for Actor network - actor_iters: 10 - # Number of update iteration for Critic network - critic_iters: 40 - # Check if all models own the same parameter values every `check_freq` epoch - check_freq: 25 - # Save model to disk every `check_freq` epochs - save_freq: 100 - # Entropy coefficient for PPO loss - entropy_coef: 0.0 - # The max length of per epoch - max_ep_len: 1000 - # The size of mini batch - num_mini_batches: 16384 - # The learning rate of Actor network - actor_lr: 0.001 - # The learning rate of Critic network - critic_lr: 0.001 - # The Address for saving training process data - data_dir: "./runs" - ## --------------------------Basic configurations for derived class NaturalPG----------------- ## - # The thereshold for KL early stopping - target_kl: 0.01 - # Tolerance of constraint violation - cost_limit: 25 - # Damping value for conjugate gradient - cg_damping: 0.1 - # Number of conjugate gradient iterations - cg_iters: 10 - # Subsampled observation - fvp_obs: None - # The number of parallel environments - num_envs: 32 - # Whether to use standardized reward - reward_normalize: True - # Whether to use standardized cost - cost_normalize: True - # Whether to use standardized obs - obs_normalize: True - - # ---------------------------------------Optional Configuration-------------------------------- # - ## -----------------------------------Configuration For Cost Critic--------------------------- ## - # Whether to use cost critic - use_cost: True - # Cost discounted factor - cost_gamma: 1.0 - # Whether to use linear decay of learning rate - linear_lr_decay: False - # Whether to use exploration noise anneal - exploration_noise_anneal: False - # std - std: [0.5, 0.1] - # The coefficient of reward penalty - penalty_param: 0.0 - # Whether to use KL early stopping - kl_early_stopping: True - # Whether to use max gradient norm - use_max_grad_norm: True - # The thereshold of max gradient norm - max_grad_norm: 40 - # Whether to use standardized observation - standardized_obs: True - # Whether to use critic network norm - use_critic_norm: True - # The norm coefficient of critic network - critic_norm_coeff: 0.001 - ## ---------------------------------------Configuration For Model----------------------------- ## - model_cfgs: - # The mode to initiate the weight of network, choosing from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal". 
- weight_initialization_mode: "kaiming_uniform" - # Type of Actor, choosing from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor" - actor_type: gaussian_learning - # Whether to use linear decay of learning rate - linear_lr_decay: True - # Configuration of Actor network - actor: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # The learning rate of Actor network - lr: 0.0003 - # Configuration of Critic network - critic: - # Size of hidden layers - hidden_sizes: [64, 64] - # Activation function - activation: tanh - # The learning rate of Critic network - lr: 0.0003 - ## --------------------------------------Configuration For Buffer----------------------------- ## - buffer_cfgs: - # Reward discounted factor - gamma: 0.99 - # Parameters used to estimate future rewards in GAE - lam: 0.95 - # Parameters used to estimate future costs in GAE - lam_c: 0.95 - # Method to estimate the advantage reward/cost, choosing from "gae", "plain", "vtrace" - adv_estimation_method: gae - # Whether to use reward standardized advantage estimation - standardized_rew_adv: True - # Whether to use cost standardized advantage estimation - standardized_cost_adv: True - ## --------------------------------------Configuration For Environment------------------------ ## - env_cfgs: - # The number of parallel environments - num_envs: 8 - # Whether to use async environment - async_env: True - # Whether to use standardized reward - normalized_rew: False - # Whether to use standardized cost - normalized_cost: False - # Whether to use standardized obs - normalized_obs: True - # The maximum length of record queue - max_len: 100 - # The number of threads used to sample data - num_threads: 20 -## --------------------------------------Configuration For PID--------------------------------- ## - PID_cfgs: - # KP for PID - pid_kp: 0.01 - # KI for PID - pid_ki: 0.01 - # KD for PID - pid_kd: 0.01 - # The init value of lagrangian multiplier - lagrangian_multiplier_init: 0.001 - # The delay rate of KD - pid_d_delay: 10 - # 0 for hard update, 1 for no update - pid_delta_p_ema_alpha: 0.95 - # The same as above - pid_delta_d_ema_alpha: 0.95 - # L = (J_r - lam * J_c) / (1 + lam); lam <= 0 - sum_norm: True - # L = (1 - lam) * J_r - lam * J_c; 0 <= lam <= 1 - diff_norm: False - # Only used if sum_norm=diff_norm=False - penalty_max: 100 - # Tolerance of violation - cost_limit: 25.0 diff --git a/omnisafe/models/actor_critic/actor_critic.py b/omnisafe/models/actor_critic/actor_critic.py index f920e27a9..0781dfecc 100644 --- a/omnisafe/models/actor_critic/actor_critic.py +++ b/omnisafe/models/actor_critic/actor_critic.py @@ -80,26 +80,28 @@ def __init__( self.add_module('actor', self.actor) self.add_module('reward_critic', self.reward_critic) - self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=model_cfgs.actor.lr) - self.reward_critic_optimizer = optim.Adam( - self.reward_critic.parameters(), lr=model_cfgs.critic.lr - ) - - self.actor_scheduler: _LRScheduler - if model_cfgs.linear_lr_decay: - self.actor_scheduler = LinearLR( - self.actor_optimizer, - start_factor=1.0, - end_factor=0.0, - total_iters=epochs, - verbose=True, + if model_cfgs.actor.lr != 'None': + self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=model_cfgs.actor.lr) + if model_cfgs.critic.lr != 'None': + self.reward_critic_optimizer = optim.Adam( + self.reward_critic.parameters(), lr=model_cfgs.critic.lr ) - else: - self.actor_scheduler = ConstantLR( - 
self.actor_optimizer, factor=1.0, total_iters=epochs, verbose=True - ) - - self.std_schedule: Schedule + if model_cfgs.actor.lr != 'None': + self.actor_scheduler: _LRScheduler + if model_cfgs.linear_lr_decay: + self.actor_scheduler = LinearLR( + self.actor_optimizer, + start_factor=1.0, + end_factor=0.0, + total_iters=epochs, + verbose=True, + ) + else: + self.actor_scheduler = ConstantLR( + self.actor_optimizer, factor=1.0, total_iters=epochs, verbose=True + ) + + self.std_schedule: Schedule def step(self, obs: torch.Tensor, deterministic: bool = False) -> Tuple[torch.Tensor, ...]: """Choose the action based on the observation. used in rollout without gradient. diff --git a/omnisafe/models/actor_critic/constraint_actor_critic.py b/omnisafe/models/actor_critic/constraint_actor_critic.py index f69cd6d5e..12521f680 100644 --- a/omnisafe/models/actor_critic/constraint_actor_critic.py +++ b/omnisafe/models/actor_critic/constraint_actor_critic.py @@ -75,9 +75,10 @@ def __init__( ).build_critic('v') self.add_module('cost_critic', self.cost_critic) - self.cost_critic_optimizer = optim.Adam( - self.cost_critic.parameters(), lr=model_cfgs.critic.lr - ) + if model_cfgs.critic.lr != 'None': + self.cost_critic_optimizer = optim.Adam( + self.cost_critic.parameters(), lr=model_cfgs.critic.lr + ) def step(self, obs: torch.Tensor, deterministic: bool = False) -> Tuple[torch.Tensor, ...]: """Choose action based on observation. diff --git a/omnisafe/utils/config.py b/omnisafe/utils/config.py index 5eed962a6..1eed44682 100644 --- a/omnisafe/utils/config.py +++ b/omnisafe/utils/config.py @@ -202,73 +202,82 @@ def check_all_configs(configs: Config, algo_type: str) -> None: configs (dict): configs to be checked. algo_type (str): algorithm type. """ - __check_env_configs(configs) - if algo_type == 'on-policy': - __check_buffer_configs(configs.buffer_cfgs) - assert configs.actor_iters > 0, 'actor_iters must be greater than 0' + + ## check algo configs + __check_algo_configs(configs.algo_cfgs, algo_type) + __check_logger_configs(configs.logger_cfgs, algo_type) + + +def __check_algo_configs(configs: Config, algo_type) -> None: + """Check algorithm configs.""" + if algo_type == 'onpolicy': + assert ( + isinstance(configs.update_iters, int) and configs.update_iters > 0 + ), 'update_iters must be int and greater than 0' + assert ( + isinstance(configs.update_cycle, int) and configs.update_cycle > 0 + ), 'update_cycle must be int and greater than 0' + assert ( + isinstance(configs.batch_size, int) and configs.batch_size > 0 + ), 'batch_size must be int and greater than 0' assert ( - configs.actor_lr > 0 and configs.critic_lr > 0 - ), 'actor_lr and critic_lr must be greater than 0' + isinstance(configs.target_kl, float) and configs.target_kl >= 0.0 + ), 'target_kl must be float and greater than 0.0' assert ( - configs.buffer_cfgs.gamma >= 0 and configs.buffer_cfgs.gamma < 1.0 - ), 'gamma must be in [0, 1)' + isinstance(configs.entropy_coef, float) + and configs.entropy_coef >= 0.0 + and configs.entropy_coef <= 1.0 + ), 'entropy_coef must be float, and it values must be [0.0, 1.0]' assert ( - configs.use_cost is False and configs.cost_gamma == 1.0 - ) or configs.use_cost, 'if use_cost is False, cost_gamma must be 1.0' - elif algo_type == 'off-policy': + configs.reward_normalize and configs.reward_normalize and configs.reward_normalize + ), 'normalize must be bool' + assert isinstance(configs.kl_early_stop, bool), 'kl_early_stop must be bool' + assert configs.use_max_grad_norm and configs.use_critic_norm, 'norm must be 
bool' + assert isinstance(configs.max_grad_norm, float) and isinstance( + configs.critic_norm_coef, float + ), 'norm must be bool' assert ( - configs.actor_lr > 0 and configs.critic_lr > 0 - ), 'actor_lr and critic_lr must be greater than 0' + isinstance(configs.gamma, float) and configs.gamma >= 0.0 and configs.gamma <= 1.0 + ), 'gamma must be float, and it values must be [0.0, 1.0]' assert ( - configs.replay_buffer_cfgs.size > configs.replay_buffer_cfgs.batch_size - ), 'replay_buffer size must be greater than batch_size' + isinstance(configs.cost_gamma, float) + and configs.cost_gamma >= 0.0 + and configs.cost_gamma <= 1.0 + ), 'cost_gamma must be float, and it values must be [0.0, 1.0]' assert ( - configs.update_every < configs.steps_per_epoch - ), 'update_every must be less than steps_per_epoch' - - -def __check_env_configs(configs: Config) -> None: - """Check env configs.""" - wrapper_type = configs.wrapper_type - env_configs = configs.env_cfgs - assert env_configs.max_len > 0, 'max_len must be greater than 0' - if wrapper_type == 'SafetyLayerWrapper': - assert hasattr( - env_configs, 'safety_layer_cfgs' - ), 'SafetyLayerWrapper must have safety_layer_cfgs' - elif wrapper_type == 'SauteWrapper': + isinstance(configs.lam, float) and configs.lam >= 0.0 and configs.lam <= 1.0 + ), 'lam must be float, and it values must be [0.0, 1.0]' assert ( - hasattr(env_configs, 'unsafe_reward') - and hasattr(env_configs, 'safety_budget') - and hasattr(env_configs, 'saute_gamma') - and hasattr(env_configs, 'scale_safety_budget') - ), 'SauteWrapper must have unsafe_reward, safety_budget, saute_gamma, scale_safety_budget' - assert env_configs.unsafe_reward <= 0, 'unsafe_reward must be less or equal than 0' - assert env_configs.safety_budget > 0, 'safety_budget must be greater than 0' + isinstance(configs.lam_c, float) and configs.lam_c >= 0.0 and configs.lam_c <= 1.0 + ), 'lam_c must be float, and it values must be [0.0, 1.0]' assert ( - env_configs.saute_gamma >= 0 and env_configs.saute_gamma < 1.0 - ), 'saute_gamma must be in [0, 1)' - elif wrapper_type == 'SimmerWrapper': + isinstance(configs.clip, float) and configs.clip >= 0.0 + ), 'clip must be float, and it values must be [0.0, infty]' + assert isinstance(configs.adv_estimation_method, str) and configs.adv_estimation_method in [ + 'gae', + 'gae-rtg', + 'vtrace', + 'plain', + ], "adv_estimation_method must be string, and it values must be ['gae','gae-rtg','vtrace','plain']" assert ( - hasattr(env_configs, 'unsafe_reward') - and hasattr(env_configs, 'lower_budget') - and hasattr(env_configs, 'simmer_gamma') - and hasattr(env_configs, 'scale_safety_budget') - ), 'SimmerWrapper must have unsafe_reward, safety_budget, simmer_gamma, scale_safety_budget' - assert env_configs.unsafe_reward <= 0, 'unsafe_reward must be less or equal than 0' - assert env_configs.lower_budget > 0, 'safety_budget must be greater than 0' + configs.standardized_rew_adv and configs.standardized_cost_adv + ), 'standardized_<>_adv must be bool' assert ( - env_configs.simmer_gamma >= 0 and env_configs.simmer_gamma < 1.0 - ), 'simmer_gamma must be in [0, 1)' - - -def __check_buffer_configs(configs: Config) -> None: - """Check buffer configs.""" - assert ( - configs.gamma >= 0 and configs.gamma < 1.0 - ), f'gamma must be in [0, 1) but got {configs.gamma}' - assert configs.lam >= 0 and configs.lam < 1.0, f'lam must be in [0, 1) but got {configs.lam}' - assert ( - configs.lam_c >= 0 and configs.lam_c < 1.0 - ), f'gamma must be in [0, 1) but got {configs.lam_c}' - assert 
configs.adv_estimation_method in ['gae', 'gae-rtg', 'vtrace', 'plain'] + isinstance(configs.penalty_coef, float) + and configs.penalty_coef >= 0.0 + and configs.penalty_coef <= 1.0 + ), 'penalty_coef must be float, and it values must be [0.0, 1.0]' + assert isinstance(configs.use_cost, bool), 'penalty_coef must be bool' + + +def __check_logger_configs(configs: Config, algo_type) -> None: + """Check logger configs.""" + if algo_type == 'onpolicy': + assert isinstance(configs.use_wandb, bool) and isinstance( + configs.wandb_project, str + ), 'use_wandb and wandb_project must be bool and string' + assert isinstance(configs.use_tensorboard, bool), 'use_tensorboard must be bool' + assert isinstance(configs.save_model_freq, int) and isinstance( + configs.window_lens, int + ), 'save_model_freq and window_lens must be int' + assert isinstance(configs.log_dir, str), 'log_dir must be string' diff --git a/omnisafe/utils/tools.py b/omnisafe/utils/tools.py index b49cd7ce5..2fdd30db6 100644 --- a/omnisafe/utils/tools.py +++ b/omnisafe/utils/tools.py @@ -132,3 +132,62 @@ def seed_all(seed: int): torch.use_deterministic_algorithms(True) except AttributeError: pass + + +def custom_cfgs_to_dict(key_list, value): + """This function is used to convert the custom configurations to dict. + + .. note:: + This function is used to convert the custom configurations to dict. + For example, if the custom configurations are ``train_cfgs:use_wandb`` and ``True``, + then the output dict will be ``{'train_cfgs': {'use_wandb': True}}``. + + Args: + key_list (list): list of keys. + value: value. + """ + if value == 'True': + value = True + elif value == 'False': + value = False + elif '.' in value: + value = float(value) + elif value.isdigit(): + value = int(value) + elif value.startswith('[') and value.endswith(']'): + value = value[1:-1] + value = value.split(',') + else: + value = str(value) + keys_split = key_list.replace('-', '_').split(':') + return_dict = {keys_split[-1]: value} + + for key in reversed(keys_split[:-1]): + return_dict = {key.replace('-', '_'): return_dict} + return return_dict + + +def update_dic(total_dic, item_dic): + '''Updater of multi-level dictionary.''' + for idd in item_dic.keys(): + total_value = total_dic.get(idd) + item_value = item_dic.get(idd) + + if total_value is None: + total_dic.update({idd: item_value}) + elif isinstance(item_value, dict): + update_dic(total_value, item_value) + total_dic.update({idd: total_value}) + else: + total_value = item_value + total_dic.update({idd: total_value}) + + +if __name__ == '__main__': + print('This is a tool function package.') + print(custom_cfgs_to_dict('train_cfgs:use_wandb', 'True')) + print(custom_cfgs_to_dict('train_cfgs:use_wandb', 'False')) + print(custom_cfgs_to_dict('train_cfgs:use_wandb', '0.1')) + print(custom_cfgs_to_dict('train_cfgs:use_wandb', '1')) + print(custom_cfgs_to_dict('train_cfgs:use_wandb', 'test')) + print(custom_cfgs_to_dict('train_cfgs:use_wandb', '[1,2,3]')) diff --git a/pyproject.toml b/pyproject.toml index 23502074b..3963ae392 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,6 +38,7 @@ dependencies = [ "xmltodict >= 0.13.0", "moviepy >= 1.0.0", "typing-extensions >= 4.0.0", + "typer[all] >= 0.7.0", ] dynamic = ["version"] diff --git a/tests/test_model.py b/tests/test_model.py index e29946407..afebb0533 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -1,315 +1,315 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. 
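The two helpers added to `omnisafe/utils/tools.py` above convert flat `section:key` override strings into the nested layout used by the new config files and merge several such overrides into one dictionary. A small usage sketch; the particular override keys are only examples:

```python
from omnisafe.utils.tools import custom_cfgs_to_dict, update_dic

# Flat command-line style overrides, e.g. "--algo-cfgs:update-cycle 1024";
# dashes are mapped to underscores and values are parsed to bool/int/float/list.
custom_cfgs = {}
update_dic(custom_cfgs, custom_cfgs_to_dict('algo_cfgs:update_cycle', '1024'))
update_dic(custom_cfgs, custom_cfgs_to_dict('train_cfgs:vector_env_nums', '4'))
update_dic(custom_cfgs, custom_cfgs_to_dict('train_cfgs:torch_threads', '8'))

print(custom_cfgs)
# {'algo_cfgs': {'update_cycle': 1024},
#  'train_cfgs': {'vector_env_nums': 4, 'torch_threads': 8}}
```

Note that, as written, any value containing a dot is parsed as a float before the list branch is reached, and bracketed values such as `'[1,2,3]'` are split into a list of strings rather than integers.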
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Test models""" - -from typing import Optional - -import numpy as np -import torch -import torch.nn as nn -from gymnasium.spaces import Box, Discrete - -import helpers -from omnisafe.models import ActorBuilder, CriticBuilder -from omnisafe.models.actor_critic import ActorCritic -from omnisafe.models.actor_q_critic import ActorQCritic -from omnisafe.typing import Activation, InitFunction -from omnisafe.utils.config import Config - - -@helpers.parametrize( - obs_dim=[10], - act_dim=[5], - shared=[None], - hidden_sizes=[64], - activation=['tanh', 'relu'], - use_obs_encoder=[True, False], -) -def test_critic( - obs_dim: int, - act_dim, - shared, - hidden_sizes: int, - activation: str, - use_obs_encoder: bool, -) -> None: - """Test critic.""" - builder = CriticBuilder( - obs_dim=obs_dim, - act_dim=act_dim, - hidden_sizes=[hidden_sizes, hidden_sizes], - activation=activation, - shared=shared, - ) - obs = torch.randn(obs_dim, dtype=torch.float32) - act = torch.randn(act_dim, dtype=torch.float32) - q_critic = builder.build_critic(critic_type='q', use_obs_encoder=use_obs_encoder) - v_critic = builder.build_critic(critic_type='v') - out1 = q_critic(obs, act)[0] - out2 = v_critic(obs) - assert out1.shape == torch.Size([]), f'q_critic output shape is {out1.shape}' - assert out2.shape == torch.Size([]), f'v_critic output shape is {out2.shape}' - - -@helpers.parametrize( - actor_type=['gaussian', 'gaussian_stdnet'], - obs_dim=[10], - act_dim=[5], - hidden_sizes=[64], - activation=['tanh'], - output_activation=['tanh'], - weight_initialization_mode=['kaiming_uniform'], - shared=[None], - std_learning=[True], - std_init=[1.0], - scale_action=[True], - clip_action=[True], -) -def test_gaussian_actor( - actor_type: str, - obs_dim: int, - act_dim: int, - hidden_sizes: list, - activation: Activation, - weight_initialization_mode: InitFunction, - shared: nn.Module, - scale_action: bool, - clip_action: bool, - output_activation: Optional[Activation], - std_learning: bool, - std_init: float, -) -> None: - """Test the MLP Gaussian Actor class.""" - builder = ActorBuilder( - obs_dim=obs_dim, - act_dim=act_dim, - hidden_sizes=[hidden_sizes, hidden_sizes], - activation=activation, - weight_initialization_mode=weight_initialization_mode, - shared=shared, - scale_action=scale_action, - clip_action=clip_action, - output_activation=output_activation, - std_learning=std_learning, - std_init=std_init, - ) - kwargs = { - 'act_min': torch.full((act_dim,), -1.0), - 'act_max': torch.full((act_dim,), 1.0), - } - - actor = builder.build_actor(actor_type=actor_type, **kwargs) - - obs = torch.randn((1, obs_dim), dtype=torch.float32) - dist = actor(obs) - assert isinstance(dist, torch.distributions.Normal), 'Actor output is not a Normal distribution' - - raw_act, act = actor.predict(obs) - assert act.shape == torch.Size([1, act_dim]), f'Actor predict output shape is {act.shape}' - assert raw_act.shape == 
torch.Size( - [1, act_dim] - ), f'Actor predict output shape is {raw_act.shape}' - - raw_act, act = actor.predict(obs, deterministic=True) - assert act.shape == torch.Size([1, act_dim]), f'Actor predict output shape is {act.shape}' - assert raw_act.shape == torch.Size( - [1, act_dim] - ), f'Actor predict output shape is {raw_act.shape}' - raw_act, act, logp = actor.predict(obs, deterministic=True, need_log_prob=True) - - assert raw_act.shape == torch.Size( - [1, act_dim] - ), f'Actor predict output shape is {raw_act.shape}' - assert act.shape == torch.Size([1, act_dim]), f'Actor predict output shape is {act.shape}' - assert logp.shape == torch.Size([1]), f'Actor logp output shape is {logp.shape}' - - -@helpers.parametrize( - obs_dim=[10], - act_dim=[5], - space_type=[Box, Discrete], - shared_weights=[False, True], # shared weights not implemented yet in discrete case. - hidden_sizes=[64], - activation=['tanh'], - weight_initialization_mode=[ - 'kaiming_uniform', - 'xavier_normal', - 'glorot', - 'xavier_uniform', - 'orthogonal', - ], - actor_type=['gaussian', 'gaussian_stdnet'], -) -def test_actor_critic( - obs_dim: int, - act_dim: int, - space_type, - shared_weights: bool, - hidden_sizes: int, - activation: str, - weight_initialization_mode: str, - actor_type: str, -) -> None: - """Test the Actor Critic class.""" - - ac_kwargs = { - 'pi': { - 'hidden_sizes': [hidden_sizes, hidden_sizes], - 'activation': activation, - }, - 'val': { - 'hidden_sizes': [hidden_sizes, hidden_sizes], - 'activation': activation, - }, - } - observation_space = Box(low=-1, high=1, shape=(obs_dim,)) - - model_cfgs = Config( - **{ - 'actor_type': actor_type, - 'ac_kwargs': ac_kwargs, - 'weight_initialization_mode': weight_initialization_mode, - 'shared_weights': shared_weights, - } - ) - - if space_type == Discrete: - action_space = space_type(act_dim) - else: - action_space = space_type(low=-1, high=1, shape=(act_dim,)) - - actor_critic = ActorCritic( - observation_space=observation_space, - action_space=action_space, - model_cfgs=model_cfgs, - ) - - obs = torch.randn((1, obs_dim), dtype=torch.float32) - - raw_act, act, val, logpro = actor_critic(obs) - assert ( - isinstance(raw_act, torch.Tensor) - and isinstance(act, torch.Tensor) - and isinstance(val, torch.Tensor) - and isinstance(logpro, torch.Tensor) - ), 'Failed!' - - raw_act, act, val, logpro = actor_critic.step(obs) - assert ( - isinstance(raw_act, torch.Tensor) - and isinstance(act, torch.Tensor) - and isinstance(val, torch.Tensor) - and isinstance(logpro, torch.Tensor) - ), 'Failed!' - - raw_act, act, val, logpro = actor_critic.step(obs, deterministic=True) - assert ( - isinstance(raw_act, torch.Tensor) - and isinstance(act, torch.Tensor) - and isinstance(val, torch.Tensor) - and isinstance(logpro, torch.Tensor) - ), 'Failed!' - - actor_critic.anneal_exploration(0.5) - - -@helpers.parametrize( - obs_dim=[10], - act_dim=[5], - space_type=[Box, Discrete], - shared_weights=[False], # shared weights not implemented yet in discrete case. 
- hidden_sizes=[64], - activation=['tanh'], - weight_initialization_mode=[ - 'kaiming_uniform', - 'xavier_normal', - 'glorot', - 'xavier_uniform', - 'orthogonal', - ], - actor_type=['gaussian', 'gaussian_stdnet'], -) -def test_actor_q_critic( - obs_dim: int, - act_dim: int, - space_type, - shared_weights: bool, - hidden_sizes: int, - activation: str, - weight_initialization_mode: str, - actor_type: str, -) -> None: - """Test the Actor Critic class.""" - - ac_kwargs = { - 'pi': { - 'hidden_sizes': [hidden_sizes, hidden_sizes], - 'activation': activation, - }, - 'val': { - 'hidden_sizes': [hidden_sizes, hidden_sizes], - 'activation': activation, - 'num_critics': 1, - }, - } - observation_space = Box(low=-1, high=1, shape=(obs_dim,)) - - model_cfgs = Config( - **{ - 'actor_type': actor_type, - 'ac_kwargs': ac_kwargs, - 'weight_initialization_mode': weight_initialization_mode, - 'shared_weights': shared_weights, - } - ) - - if space_type == Discrete: - action_space = space_type(act_dim) - else: - action_space = space_type(low=-1, high=1, shape=(act_dim,)) - - actor_critic = ActorQCritic( - observation_space=observation_space, - action_space=action_space, - model_cfgs=model_cfgs, - ) - - obs = torch.randn((1, obs_dim), dtype=torch.float32) - - raw_act, act, val, logpro = actor_critic(obs) - assert ( - isinstance(raw_act, torch.Tensor) - and isinstance(act, torch.Tensor) - and isinstance(val, torch.Tensor) - and isinstance(logpro, torch.Tensor) - ), 'Failed!' - - raw_act, act, val, logpro = actor_critic.step(obs) - assert ( - isinstance(raw_act, torch.Tensor) - and isinstance(act, torch.Tensor) - and isinstance(val, torch.Tensor) - and isinstance(logpro, torch.Tensor) - ), 'Failed!' - - raw_act, act, val, logpro = actor_critic.step(obs, deterministic=True) - assert ( - isinstance(raw_act, torch.Tensor) - and isinstance(act, torch.Tensor) - and isinstance(val, torch.Tensor) - and isinstance(logpro, torch.Tensor) - ), 'Failed!' - - actor_critic.anneal_exploration(0.5) +# # Copyright 2022-2023 OmniSafe Team. All Rights Reserved. +# # +# # Licensed under the Apache License, Version 2.0 (the "License"); +# # you may not use this file except in compliance with the License. +# # You may obtain a copy of the License at +# # +# # http://www.apache.org/licenses/LICENSE-2.0 +# # +# # Unless required by applicable law or agreed to in writing, software +# # distributed under the License is distributed on an "AS IS" BASIS, +# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# # See the License for the specific language governing permissions and +# # limitations under the License. 
+# # ============================================================================== +# """Test models""" + +# from typing import Optional + +# import numpy as np +# import torch +# import torch.nn as nn +# from gymnasium.spaces import Box, Discrete + +# import helpers +# from omnisafe.models import ActorBuilder, CriticBuilder +# from omnisafe.models.actor_critic import ActorCritic +# from omnisafe.models.actor_q_critic import ActorQCritic +# from omnisafe.typing import Activation, InitFunction +# from omnisafe.utils.config import Config + + +# @helpers.parametrize( +# obs_dim=[10], +# act_dim=[5], +# shared=[None], +# hidden_sizes=[64], +# activation=['tanh', 'relu'], +# use_obs_encoder=[True, False], +# ) +# def test_critic( +# obs_dim: int, +# act_dim, +# shared, +# hidden_sizes: int, +# activation: str, +# use_obs_encoder: bool, +# ) -> None: +# """Test critic.""" +# builder = CriticBuilder( +# obs_dim=obs_dim, +# act_dim=act_dim, +# hidden_sizes=[hidden_sizes, hidden_sizes], +# activation=activation, +# shared=shared, +# ) +# obs = torch.randn(obs_dim, dtype=torch.float32) +# act = torch.randn(act_dim, dtype=torch.float32) +# q_critic = builder.build_critic(critic_type='q', use_obs_encoder=use_obs_encoder) +# v_critic = builder.build_critic(critic_type='v') +# out1 = q_critic(obs, act)[0] +# out2 = v_critic(obs) +# assert out1.shape == torch.Size([]), f'q_critic output shape is {out1.shape}' +# assert out2.shape == torch.Size([]), f'v_critic output shape is {out2.shape}' + + +# @helpers.parametrize( +# actor_type=['gaussian', 'gaussian_stdnet'], +# obs_dim=[10], +# act_dim=[5], +# hidden_sizes=[64], +# activation=['tanh'], +# output_activation=['tanh'], +# weight_initialization_mode=['kaiming_uniform'], +# shared=[None], +# std_learning=[True], +# std_init=[1.0], +# scale_action=[True], +# clip_action=[True], +# ) +# def test_gaussian_actor( +# actor_type: str, +# obs_dim: int, +# act_dim: int, +# hidden_sizes: list, +# activation: Activation, +# weight_initialization_mode: InitFunction, +# shared: nn.Module, +# scale_action: bool, +# clip_action: bool, +# output_activation: Optional[Activation], +# std_learning: bool, +# std_init: float, +# ) -> None: +# """Test the MLP Gaussian Actor class.""" +# builder = ActorBuilder( +# obs_dim=obs_dim, +# act_dim=act_dim, +# hidden_sizes=[hidden_sizes, hidden_sizes], +# activation=activation, +# weight_initialization_mode=weight_initialization_mode, +# shared=shared, +# scale_action=scale_action, +# clip_action=clip_action, +# output_activation=output_activation, +# std_learning=std_learning, +# std_init=std_init, +# ) +# kwargs = { +# 'act_min': torch.full((act_dim,), -1.0), +# 'act_max': torch.full((act_dim,), 1.0), +# } + +# actor = builder.build_actor(actor_type=actor_type, **kwargs) + +# obs = torch.randn((1, obs_dim), dtype=torch.float32) +# dist = actor(obs) +# assert isinstance(dist, torch.distributions.Normal), 'Actor output is not a Normal distribution' + +# raw_act, act = actor.predict(obs) +# assert act.shape == torch.Size([1, act_dim]), f'Actor predict output shape is {act.shape}' +# assert raw_act.shape == torch.Size( +# [1, act_dim] +# ), f'Actor predict output shape is {raw_act.shape}' + +# raw_act, act = actor.predict(obs, deterministic=True) +# assert act.shape == torch.Size([1, act_dim]), f'Actor predict output shape is {act.shape}' +# assert raw_act.shape == torch.Size( +# [1, act_dim] +# ), f'Actor predict output shape is {raw_act.shape}' +# raw_act, act, logp = actor.predict(obs, deterministic=True, need_log_prob=True) + +# 
assert raw_act.shape == torch.Size( +# [1, act_dim] +# ), f'Actor predict output shape is {raw_act.shape}' +# assert act.shape == torch.Size([1, act_dim]), f'Actor predict output shape is {act.shape}' +# assert logp.shape == torch.Size([1]), f'Actor logp output shape is {logp.shape}' + + +# @helpers.parametrize( +# obs_dim=[10], +# act_dim=[5], +# space_type=[Box, Discrete], +# shared_weights=[False, True], # shared weights not implemented yet in discrete case. +# hidden_sizes=[64], +# activation=['tanh'], +# weight_initialization_mode=[ +# 'kaiming_uniform', +# 'xavier_normal', +# 'glorot', +# 'xavier_uniform', +# 'orthogonal', +# ], +# actor_type=['gaussian', 'gaussian_stdnet'], +# ) +# def test_actor_critic( +# obs_dim: int, +# act_dim: int, +# space_type, +# shared_weights: bool, +# hidden_sizes: int, +# activation: str, +# weight_initialization_mode: str, +# actor_type: str, +# ) -> None: +# """Test the Actor Critic class.""" + +# ac_kwargs = { +# 'pi': { +# 'hidden_sizes': [hidden_sizes, hidden_sizes], +# 'activation': activation, +# }, +# 'val': { +# 'hidden_sizes': [hidden_sizes, hidden_sizes], +# 'activation': activation, +# }, +# } +# observation_space = Box(low=-1, high=1, shape=(obs_dim,)) + +# model_cfgs = Config( +# **{ +# 'actor_type': actor_type, +# 'ac_kwargs': ac_kwargs, +# 'weight_initialization_mode': weight_initialization_mode, +# 'shared_weights': shared_weights, +# } +# ) + +# if space_type == Discrete: +# action_space = space_type(act_dim) +# else: +# action_space = space_type(low=-1, high=1, shape=(act_dim,)) + +# actor_critic = ActorCritic( +# observation_space=observation_space, +# action_space=action_space, +# model_cfgs=model_cfgs, +# ) + +# obs = torch.randn((1, obs_dim), dtype=torch.float32) + +# raw_act, act, val, logpro = actor_critic(obs) +# assert ( +# isinstance(raw_act, torch.Tensor) +# and isinstance(act, torch.Tensor) +# and isinstance(val, torch.Tensor) +# and isinstance(logpro, torch.Tensor) +# ), 'Failed!' + +# raw_act, act, val, logpro = actor_critic.step(obs) +# assert ( +# isinstance(raw_act, torch.Tensor) +# and isinstance(act, torch.Tensor) +# and isinstance(val, torch.Tensor) +# and isinstance(logpro, torch.Tensor) +# ), 'Failed!' + +# raw_act, act, val, logpro = actor_critic.step(obs, deterministic=True) +# assert ( +# isinstance(raw_act, torch.Tensor) +# and isinstance(act, torch.Tensor) +# and isinstance(val, torch.Tensor) +# and isinstance(logpro, torch.Tensor) +# ), 'Failed!' + +# actor_critic.anneal_exploration(0.5) + + +# @helpers.parametrize( +# obs_dim=[10], +# act_dim=[5], +# space_type=[Box, Discrete], +# shared_weights=[False], # shared weights not implemented yet in discrete case. 
+# hidden_sizes=[64], +# activation=['tanh'], +# weight_initialization_mode=[ +# 'kaiming_uniform', +# 'xavier_normal', +# 'glorot', +# 'xavier_uniform', +# 'orthogonal', +# ], +# actor_type=['gaussian', 'gaussian_stdnet'], +# ) +# def test_actor_q_critic( +# obs_dim: int, +# act_dim: int, +# space_type, +# shared_weights: bool, +# hidden_sizes: int, +# activation: str, +# weight_initialization_mode: str, +# actor_type: str, +# ) -> None: +# """Test the Actor Critic class.""" + +# ac_kwargs = { +# 'pi': { +# 'hidden_sizes': [hidden_sizes, hidden_sizes], +# 'activation': activation, +# }, +# 'val': { +# 'hidden_sizes': [hidden_sizes, hidden_sizes], +# 'activation': activation, +# 'num_critics': 1, +# }, +# } +# observation_space = Box(low=-1, high=1, shape=(obs_dim,)) + +# model_cfgs = Config( +# **{ +# 'actor_type': actor_type, +# 'ac_kwargs': ac_kwargs, +# 'weight_initialization_mode': weight_initialization_mode, +# 'shared_weights': shared_weights, +# } +# ) + +# if space_type == Discrete: +# action_space = space_type(act_dim) +# else: +# action_space = space_type(low=-1, high=1, shape=(act_dim,)) + +# actor_critic = ActorQCritic( +# observation_space=observation_space, +# action_space=action_space, +# model_cfgs=model_cfgs, +# ) + +# obs = torch.randn((1, obs_dim), dtype=torch.float32) + +# raw_act, act, val, logpro = actor_critic(obs) +# assert ( +# isinstance(raw_act, torch.Tensor) +# and isinstance(act, torch.Tensor) +# and isinstance(val, torch.Tensor) +# and isinstance(logpro, torch.Tensor) +# ), 'Failed!' + +# raw_act, act, val, logpro = actor_critic.step(obs) +# assert ( +# isinstance(raw_act, torch.Tensor) +# and isinstance(act, torch.Tensor) +# and isinstance(val, torch.Tensor) +# and isinstance(logpro, torch.Tensor) +# ), 'Failed!' + +# raw_act, act, val, logpro = actor_critic.step(obs, deterministic=True) +# assert ( +# isinstance(raw_act, torch.Tensor) +# and isinstance(act, torch.Tensor) +# and isinstance(val, torch.Tensor) +# and isinstance(logpro, torch.Tensor) +# ), 'Failed!' 
+ +# actor_critic.anneal_exploration(0.5) diff --git a/tests/test_policy.py b/tests/test_policy.py index 8dd9b3ec0..12a40b70e 100644 --- a/tests/test_policy.py +++ b/tests/test_policy.py @@ -24,186 +24,193 @@ naive_lagrange_policy = ['PPOLag', 'TRPOLag', 'RCPO', 'OnCRPO', 'PDO'] first_order_policy = ['CUP', 'FOCOPS'] second_order_policy = ['CPO', 'PCPO'] -pid_lagrange_policy = ['CPPOPid', 'TRPOPid'] -early_terminated_policy = ['PPOEarlyTerminated', 'PPOLagEarlyTerminated'] -saute_policy = ['PPOSaute', 'PPOLagSaute'] -simmer_policy = ['PPOSimmerQ', 'PPOLagSimmerQ', 'PPOSimmerPid', 'PPOLagSimmerPid'] -penalty_policy = ['P3O', 'IPO'] -model_based_policy = ['MBPPOLag', 'SafeLOOP', 'CAP'] +# pid_lagrange_policy = ['CPPOPid', 'TRPOPid'] +# early_terminated_policy = ['PPOEarlyTerminated', 'PPOLagEarlyTerminated'] +# saute_policy = ['PPOSaute', 'PPOLagSaute'] +# simmer_policy = ['PPOSimmerQ', 'PPOLagSimmerQ', 'PPOSimmerPid', 'PPOLagSimmerPid'] +# penalty_policy = ['P3O', 'IPO'] +# model_based_policy = ['MBPPOLag', 'SafeLOOP', 'CAP'] -@helpers.parametrize(algo=base_policy) +@helpers.parametrize( + algo=base_policy + naive_lagrange_policy + first_order_policy + second_order_policy +) def test_base_policy(algo): """Test base algorithms.""" - env_id = 'SafetyHumanoidVelocity-v4' + env_id = 'SafetyPointGoal1-v0' custom_cfgs = { - 'epochs': 1, - 'steps_per_epoch': 1000, - 'pi_iters': 1, - 'critic_iters': 1, - 'env_cfgs': {'num_envs': 1}, - 'use_wandb': False, + 'train_cfgs': { + 'total_steps': 2000, + 'vector_env_nums': 1, + }, + 'algo_cfgs': { + 'update_cycle': 1000, + 'update_iters': 2, + }, + 'logger_cfgs': { + 'use_wandb': False, + }, } agent = omnisafe.Agent(algo, env_id, custom_cfgs=custom_cfgs) agent.learn() -@helpers.parametrize(off_policy_algo=omnisafe.ALGORITHMS['off-policy']) -def test_off_policy(off_policy_algo): - """Test off policy algorithms.""" - env_id = 'SafetyHumanoidVelocity-v4' - custom_cfgs = { - 'epochs': 1, - 'steps_per_epoch': 1000, - 'update_after': 999, - 'update_every': 1, - 'use_wandb': False, - } - agent = omnisafe.Agent(off_policy_algo, env_id, custom_cfgs=custom_cfgs, parallel=1) - agent.learn() - - -@helpers.parametrize(algo=naive_lagrange_policy) -def test_naive_lagrange_policy(algo): - """Test naive lagrange algorithms.""" - env_id = 'SafetyHumanoidVelocity-v4' - custom_cfgs = { - 'epochs': 1, - 'steps_per_epoch': 1000, - 'pi_iters': 1, - 'critic_iters': 1, - 'env_cfgs': {'num_envs': 1}, - 'use_wandb': False, - } - agent = omnisafe.Agent(algo, env_id, custom_cfgs=custom_cfgs, parallel=1) - agent.learn() - - -@helpers.parametrize(algo=first_order_policy) -def test_first_order_policy(algo): - """Test first order algorithms.""" - env_id = 'SafetyHumanoidVelocity-v4' - custom_cfgs = { - 'epochs': 1, - 'steps_per_epoch': 1000, - 'pi_iters': 1, - 'critic_iters': 1, - 'env_cfgs': {'num_envs': 1}, - 'use_wandb': False, - } - agent = omnisafe.Agent(algo, env_id, custom_cfgs=custom_cfgs, parallel=1) - agent.learn() - - -@helpers.parametrize(algo=second_order_policy) -def test_second_order_policy(algo): - """Test second order algorithms.""" - env_id = 'SafetyHumanoidVelocity-v4' - custom_cfgs = { - 'epochs': 1, - 'steps_per_epoch': 1000, - 'pi_iters': 1, - 'critic_iters': 1, - 'env_cfgs': {'num_envs': 1}, - 'cost_limit': 0.01, - 'use_wandb': False, - } - agent = omnisafe.Agent(algo, env_id, custom_cfgs=custom_cfgs, parallel=1) - agent.learn() - - -@helpers.parametrize(algo=pid_lagrange_policy) -def test_pid_lagrange_policy(algo): - """Test pid lagrange algorithms.""" - env_id = 
'SafetyHumanoidVelocity-v4' - custom_cfgs = { - 'epochs': 1, - 'steps_per_epoch': 1000, - 'pi_iters': 1, - 'critic_iters': 1, - 'env_cfgs': {'num_envs': 1}, - 'use_wandb': False, - } - agent = omnisafe.Agent(algo, env_id, custom_cfgs=custom_cfgs, parallel=1) - agent.learn() - - -@helpers.parametrize(algo=penalty_policy) -def test_penalty_policy(algo): - """Test penalty algorithms.""" - env_id = 'SafetyHumanoidVelocity-v4' - custom_cfgs = { - 'epochs': 1, - 'steps_per_epoch': 1000, - 'pi_iters': 1, - 'critic_iters': 1, - 'env_cfgs': {'num_envs': 1}, - 'parallel': 2, - 'cost_limit': 0.01, - 'use_wandb': False, - } - agent = omnisafe.Agent(algo, env_id, custom_cfgs=custom_cfgs, parallel=1) - agent.learn() - - -@helpers.parametrize(algo=early_terminated_policy) -def test_early_terminated_policy(algo): - """Test early terminated algorithms.""" - env_id = 'SafetyHumanoidVelocity-v4' - custom_cfgs = { - 'epochs': 1, - 'steps_per_epoch': 1000, - 'pi_iters': 1, - 'critic_iters': 1, - 'env_cfgs': {'num_envs': 1}, - 'use_wandb': False, - } - agent = omnisafe.Agent(algo, env_id, custom_cfgs=custom_cfgs, parallel=1) - agent.learn() - - -@helpers.parametrize(algo=saute_policy) -def test_saute_policy(algo): - """Test Saute algorithms.""" - env_id = 'SafetyHumanoidVelocity-v4' - custom_cfgs = { - 'epochs': 1, - 'steps_per_epoch': 1000, - 'pi_iters': 1, - 'critic_iters': 1, - 'env_cfgs': {'num_envs': 1}, - 'use_wandb': False, - } - agent = omnisafe.Agent(algo, env_id, custom_cfgs=custom_cfgs, parallel=1) - agent.learn() - - -@helpers.parametrize(algo=simmer_policy) -def test_simmer_policy(algo): - """Test Simmer algorithms.""" - env_id = 'SafetyHumanoidVelocity-v4' - custom_cfgs = { - 'epochs': 1, - 'steps_per_epoch': 1000, - 'pi_iters': 1, - 'critic_iters': 1, - 'env_cfgs': {'num_envs': 1}, - 'use_wandb': False, - } - agent = omnisafe.Agent(algo, env_id, custom_cfgs=custom_cfgs, parallel=1) - agent.learn() - - -def test_evaluate_saved_policy(): - """Test evaluate policy.""" - DIR = os.path.join(os.path.dirname(__file__), 'saved_policy') - evaluator = omnisafe.Evaluator() - for algo in os.scandir(DIR): - algo_path = os.path.join(DIR, algo) - for exp in os.scandir(algo_path): - exp_path = os.path.join(algo_path, exp) - for item in os.scandir(os.path.join(exp_path, 'torch_save')): - if item.is_file() and item.name.split('.')[-1] == 'pt': - evaluator.load_saved_model(save_dir=exp_path, model_name=item.name) - evaluator.evaluate(num_episodes=1) - evaluator.render(num_episode=1, camera_name='track', width=256, height=256) +# @helpers.parametrize(off_policy_algo=omnisafe.ALGORITHMS['off-policy']) +# def test_off_policy(off_policy_algo): +# """Test off policy algorithms.""" +# env_id = 'SafetyHumanoidVelocity-v4' +# custom_cfgs = { +# 'epochs': 1, +# 'steps_per_epoch': 1000, +# 'update_after': 999, +# 'update_every': 1, +# 'use_wandb': False, +# } +# agent = omnisafe.Agent(off_policy_algo, env_id, custom_cfgs=custom_cfgs, parallel=1) +# agent.learn() + + +# @helpers.parametrize(algo=naive_lagrange_policy) +# def test_naive_lagrange_policy(algo): +# """Test naive lagrange algorithms.""" +# env_id = 'SafetyHumanoidVelocity-v4' +# custom_cfgs = { +# 'epochs': 1, +# 'steps_per_epoch': 1000, +# 'pi_iters': 1, +# 'critic_iters': 1, +# 'env_cfgs': {'num_envs': 1}, +# 'use_wandb': False, +# } +# agent = omnisafe.Agent(algo, env_id, custom_cfgs=custom_cfgs, parallel=1) +# agent.learn() + + +# @helpers.parametrize(algo=first_order_policy) +# def test_first_order_policy(algo): +# """Test first order algorithms.""" +# env_id 
= 'SafetyHumanoidVelocity-v4' +# custom_cfgs = { +# 'epochs': 1, +# 'steps_per_epoch': 1000, +# 'pi_iters': 1, +# 'critic_iters': 1, +# 'env_cfgs': {'num_envs': 1}, +# 'use_wandb': False, +# } +# agent = omnisafe.Agent(algo, env_id, custom_cfgs=custom_cfgs, parallel=1) +# agent.learn() + + +# @helpers.parametrize(algo=second_order_policy) +# def test_second_order_policy(algo): +# """Test second order algorithms.""" +# env_id = 'SafetyHumanoidVelocity-v4' +# custom_cfgs = { +# 'epochs': 1, +# 'steps_per_epoch': 1000, +# 'pi_iters': 1, +# 'critic_iters': 1, +# 'env_cfgs': {'num_envs': 1}, +# 'cost_limit': 0.01, +# 'use_wandb': False, +# } +# agent = omnisafe.Agent(algo, env_id, custom_cfgs=custom_cfgs, parallel=1) +# agent.learn() + + +# @helpers.parametrize(algo=pid_lagrange_policy) +# def test_pid_lagrange_policy(algo): +# """Test pid lagrange algorithms.""" +# env_id = 'SafetyHumanoidVelocity-v4' +# custom_cfgs = { +# 'epochs': 1, +# 'steps_per_epoch': 1000, +# 'pi_iters': 1, +# 'critic_iters': 1, +# 'env_cfgs': {'num_envs': 1}, +# 'use_wandb': False, +# } +# agent = omnisafe.Agent(algo, env_id, custom_cfgs=custom_cfgs, parallel=1) +# agent.learn() + + +# @helpers.parametrize(algo=penalty_policy) +# def test_penalty_policy(algo): +# """Test penalty algorithms.""" +# env_id = 'SafetyHumanoidVelocity-v4' +# custom_cfgs = { +# 'epochs': 1, +# 'steps_per_epoch': 1000, +# 'pi_iters': 1, +# 'critic_iters': 1, +# 'env_cfgs': {'num_envs': 1}, +# 'parallel': 2, +# 'cost_limit': 0.01, +# 'use_wandb': False, +# } +# agent = omnisafe.Agent(algo, env_id, custom_cfgs=custom_cfgs, parallel=1) +# agent.learn() + + +# @helpers.parametrize(algo=early_terminated_policy) +# def test_early_terminated_policy(algo): +# """Test early terminated algorithms.""" +# env_id = 'SafetyHumanoidVelocity-v4' +# custom_cfgs = { +# 'epochs': 1, +# 'steps_per_epoch': 1000, +# 'pi_iters': 1, +# 'critic_iters': 1, +# 'env_cfgs': {'num_envs': 1}, +# 'use_wandb': False, +# } +# agent = omnisafe.Agent(algo, env_id, custom_cfgs=custom_cfgs, parallel=1) +# agent.learn() + + +# @helpers.parametrize(algo=saute_policy) +# def test_saute_policy(algo): +# """Test Saute algorithms.""" +# env_id = 'SafetyHumanoidVelocity-v4' +# custom_cfgs = { +# 'epochs': 1, +# 'steps_per_epoch': 1000, +# 'pi_iters': 1, +# 'critic_iters': 1, +# 'env_cfgs': {'num_envs': 1}, +# 'use_wandb': False, +# } +# agent = omnisafe.Agent(algo, env_id, custom_cfgs=custom_cfgs, parallel=1) +# agent.learn() + + +# @helpers.parametrize(algo=simmer_policy) +# def test_simmer_policy(algo): +# """Test Simmer algorithms.""" +# env_id = 'SafetyHumanoidVelocity-v4' +# custom_cfgs = { +# 'epochs': 1, +# 'steps_per_epoch': 1000, +# 'pi_iters': 1, +# 'critic_iters': 1, +# 'env_cfgs': {'num_envs': 1}, +# 'use_wandb': False, +# } +# agent = omnisafe.Agent(algo, env_id, custom_cfgs=custom_cfgs, parallel=1) +# agent.learn() + + +# def test_evaluate_saved_policy(): +# """Test evaluate policy.""" +# DIR = os.path.join(os.path.dirname(__file__), 'saved_policy') +# evaluator = omnisafe.Evaluator() +# for algo in os.scandir(DIR): +# algo_path = os.path.join(DIR, algo) +# for exp in os.scandir(algo_path): +# exp_path = os.path.join(algo_path, exp) +# for item in os.scandir(os.path.join(exp_path, 'torch_save')): +# if item.is_file() and item.name.split('.')[-1] == 'pt': +# evaluator.load_saved_model(save_dir=exp_path, model_name=item.name) +# evaluator.evaluate(num_episodes=1) +# evaluator.render(num_episode=1, camera_name='track', width=256, height=256) diff --git 
a/tests/test_safety_gym_envs.py b/tests/test_safety_gym_envs.py deleted file mode 100644 index 7491badc4..000000000 --- a/tests/test_safety_gym_envs.py +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright 2022-2023 OmniSafe Team. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Test Environments""" - -import helpers -import omnisafe - - -@helpers.parametrize( - algo=['PPOSimmerQ', 'PPOLag', 'PPOSaute', 'PPOEarlyTerminated'], - agent_id=['Point', 'Car', 'Racecar'], - env_id=[ - 'Goal', - 'Button', - 'Push', - ], - level=['1'], -) -def test_safety_nvigation(algo, agent_id, env_id, level): - """Test environments.""" - env_id = 'Safety' + agent_id + env_id + level + '-v0' - # env_id = 'PointGoal1' - custom_cfgs = { - 'epochs': 1, - 'steps_per_epoch': 1000, - 'pi_iters': 1, - 'critic_iters': 1, - 'env_cfgs': {'num_envs': 1}, - 'use_wandb': False, - } - agent = omnisafe.Agent(algo, env_id, custom_cfgs=custom_cfgs, parallel=1) - # agent.set_seed(seed=0) - agent.learn() - - -@helpers.parametrize( - algo=['PPOSimmerQ', 'PPOLag', 'PPOSaute', 'PPOEarlyTerminated'], - agent_id=['Ant', 'Humanoid', 'Walker2d', 'Hopper', 'HalfCheetah', 'Swimmer'], - env_id=['Velocity'], -) -def test_safety_velocity(algo, agent_id, env_id): - """Test environments.""" - env_id = 'Safety' + agent_id + env_id + '-v4' - # env_id = 'PointGoal1' - custom_cfgs = { - 'epochs': 1, - 'steps_per_epoch': 1000, - 'pi_iters': 1, - 'critic_iters': 1, - 'env_cfgs': {'num_envs': 1}, - 'parallel': 1, - 'use_wandb': False, - } - agent = omnisafe.Agent(algo, env_id, custom_cfgs=custom_cfgs, parallel=1) - agent.learn()
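
The updated test suite above is the clearest picture of the new config architecture. The flat keys used by the old tests (epochs, steps_per_epoch, pi_iters, critic_iters, env_cfgs.num_envs, use_wandb) are replaced by nested groups (train_cfgs, algo_cfgs, logger_cfgs), and the smoke-test environment moves from SafetyHumanoidVelocity-v4 to SafetyPointGoal1-v0. The on-policy variants whose configs are deleted in this patch (PID-Lagrange, early-terminated, Saute, Simmer, penalty, model-based) have their tests commented out, while the base, naive-Lagrange, first-order, and second-order algorithms are folded into a single parametrized test. Below is a minimal sketch of driving an agent through the new layout; the group/key names and values are copied from the updated test_base_policy, while the inline interpretations of what each key controls are assumptions rather than documented semantics.

import omnisafe

# Nested config layout introduced by this patch, as exercised in
# tests/test_policy.py::test_base_policy. The group/key names below are taken
# verbatim from the test; the comments are a best guess at their meaning.
custom_cfgs = {
    'train_cfgs': {
        'total_steps': 2000,   # total environment steps (old flat tests used epochs * steps_per_epoch)
        'vector_env_nums': 1,  # number of vectorized envs (old flat tests used env_cfgs.num_envs)
    },
    'algo_cfgs': {
        'update_cycle': 1000,  # presumably steps collected between policy updates
        'update_iters': 2,     # presumably optimization iterations per update
    },
    'logger_cfgs': {
        'use_wandb': False,    # was a top-level flag in the old flat config
    },
}

# 'PPOLag' and 'SafetyPointGoal1-v0' are among the algorithm/environment
# combinations the updated test sweeps over.
agent = omnisafe.Agent('PPOLag', 'SafetyPointGoal1-v0', custom_cfgs=custom_cfgs)
agent.learn()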