refactor(algo_wrapper, configs): rename update cycle and refactor structure #213

Merged (14 commits) on Apr 17, 2023
32 changes: 32 additions & 0 deletions CHANGELOG.md
@@ -11,13 +11,45 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html)

### Features

- Feat(pid-lagrange, test): add algo and update test by [@Jiayi Zhou](https://github.com/Gaiejj) in PR [#210](https://github.com/OmniSafeAI/omnisafe/pull/210).

- Feat(saute, simmer): support saute rl and clean the code by [@Jiayi Zhou](https://github.com/Gaiejj) in PR [#209](https://github.com/OmniSafeAI/omnisafe/pull/209).

- Feat(off-policy): support off-policy lag by [@Jiayi Zhou](https://github.com/Gaiejj) in PR [#204](https://github.com/OmniSafeAI/omnisafe/pull/204).

- Chore: upload tutorial by [@Borong Zhang](https://github.com/muchvo) in PR [#201](https://github.com/OmniSafeAI/omnisafe/pull/201).

- Chore(pre-commit): [pre-commit.ci] autoupdate by [@pre-commit.ci](https://github.com/apps/pre-commit-ci) in PR [#200](https://github.com/OmniSafeAI/omnisafe/pull/200).

- Feat: update CLI for gpu and statistics tools by [@Borong Zhang](https://github.com/muchvo) in PR [#192](https://github.com/OmniSafeAI/omnisafe/pull/192).

- Feat: add `ruff` and `codespell` integration by [@XuehaiPan](https://github.com/XuehaiPan) in PR [#186](https://github.com/OmniSafeAI/omnisafe/pull/186).

### Fixes

- Fix: enable smooth param in Costs when plotting by [@Borong Zhang](https://github.com/muchvo) in PR [#208](https://github.com/OmniSafeAI/omnisafe/pull/208).

- Fix(off-policy): fix log when not update by [@Jiayi Zhou](https://github.com/Gaiejj) in PR [#206](https://github.com/OmniSafeAI/omnisafe/pull/206).

- Fix: check duplicated parameters and values which are specified in experiment grid by [@Borong Zhang](https://github.com/muchvo) in PR [#203](https://github.com/OmniSafeAI/omnisafe/pull/203).

- Fix(experiment grid): fix file path problem when using gpu in experiment grid by [@Borong Zhang](https://github.com/muchvo) in PR [#194](https://github.com/OmniSafeAI/omnisafe/pull/194).

### Documentation

- Docs: fix small typo in README.md by [@mickelliu](https://github.com/mickelliu) in PR [#211](https://github.com/OmniSafeAI/omnisafe/pull/211).

- Docs: change link to OmniSafeAI by [@Jiaming Ji](https://github.com/zmsn-2077) in PR [#205](https://github.com/OmniSafeAI/omnisafe/pull/205).

- Docs: update api documents by [@Jiayi Zhou](https://github.com/Gaiejj) in PR [#191](https://github.com/OmniSafeAI/omnisafe/pull/191).

### Refactor

- Refactor(algo_wrapper, configs): rename update cycle and refactor structure by [@Jiayi Zhou](https://github.com/Gaiejj) in PR [#213](https://github.com/OmniSafeAI/omnisafe/pull/213).

- Refactor: update hyper-parameters for first-order algorithms by [@Borong Zhang](https://github.com/muchvo) in PR [#199](https://github.com/OmniSafeAI/omnisafe/pull/199).

- Refactor: condense top-level benchmarks by [@Jiaming Ji](https://github.com/zmsn-2077) in PR [#198](https://github.com/OmniSafeAI/omnisafe/pull/198).

## v0.2.2

2 changes: 1 addition & 1 deletion README.md
@@ -243,7 +243,7 @@ omnisafe eval ./saved_source/PPO-{SafetyPointGoal1-v0} --num-episode 1

# Quickly train an algorithm to validate your ideas
# Note: with `key1:key2` you can select nested hyperparameter keys, and with `--custom-cfgs` you can pass custom configs via the CLI
omnisafe train --algo PPO --total-steps 2048 --vector-env-nums 1 --custom-cfgs algo_cfgs:update_cycle --custom-cfgs 1024
omnisafe train --algo PPO --total-steps 2048 --vector-env-nums 1 --custom-cfgs algo_cfgs:steps_per_epoch --custom-cfgs 1024

# Quickly train an algorithm from a saved config file; the format is the same as the default format
omnisafe train-config ./saved_source/train_config.yaml
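For readers migrating scripts along with this rename, here is a minimal sketch of the programmatic counterpart of the CLI command above. It assumes the `omnisafe.Agent` entry point and simply mirrors the renamed `algo_cfgs:steps_per_epoch` key; treat the argument names as illustrative rather than authoritative.

```python
# Sketch only: programmatic counterpart of the CLI call above, assuming the
# omnisafe.Agent entry point. The nested dict mirrors the renamed key
# `algo_cfgs:steps_per_epoch`; names and values are illustrative.
import omnisafe

custom_cfgs = {
    'train_cfgs': {'total_steps': 2048, 'vector_env_nums': 1},
    'algo_cfgs': {'steps_per_epoch': 1024},
}
agent = omnisafe.Agent('PPO', 'SafetyPointGoal1-v0', custom_cfgs=custom_cfgs)
agent.learn()
```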
4 changes: 2 additions & 2 deletions docs/source/baserl/ppo.rst
@@ -340,7 +340,7 @@ Quick start
'parallel': 1,
},
'algo_cfgs': {
'update_cycle': 2048,
'steps_per_epoch': 2048,
'update_iters': 1,
},
'logger_cfgs': {
@@ -472,7 +472,7 @@ Configs

- clip (float): Clipping parameter for PPO.

- update_cycle (int): Number of steps to update the policy network.
- steps_per_epoch (int): Number of steps to update the policy network.
- update_iters (int): Number of iterations to update the policy network.
- batch_size (int): Batch size for each iteration.
- target_kl (float): Target KL divergence.
4 changes: 2 additions & 2 deletions docs/source/baserl/trpo.rst
@@ -494,7 +494,7 @@ Quick start
'parallel': 1,
},
'algo_cfgs': {
'update_cycle': 2048,
'steps_per_epoch': 2048,
'update_iters': 1,
},
'logger_cfgs': {
@@ -757,7 +757,7 @@ Configs
- cg_iters (int): Number of iterations for conjugate gradient.
- fvp_sample_freq (int): Frequency of sampling for Fisher vector product.

- update_cycle (int): Number of steps to update the policy network.
- steps_per_epoch (int): Number of steps to update the policy network.
- update_iters (int): Number of iterations to update the policy network.
- batch_size (int): Batch size for each iteration.
- target_kl (float): Target KL divergence.
2 changes: 1 addition & 1 deletion docs/source/index.rst
@@ -264,7 +264,7 @@ We give an example below:
'parallel': 1,
},
'algo_cfgs': {
'update_cycle': 2048,
'steps_per_epoch': 2048,
'update_iters': 1,
},
'logger_cfgs': {
4 changes: 2 additions & 2 deletions docs/source/saferl/cpo.rst
@@ -460,7 +460,7 @@ Quick start
'parallel': 1,
},
'algo_cfgs': {
'update_cycle': 2048,
'steps_per_epoch': 2048,
'update_iters': 1,
},
'logger_cfgs': {
@@ -715,7 +715,7 @@ Configs
- cg_iters (int): Number of iterations for conjugate gradient.
- fvp_sample_freq (int): Frequency of sampling for Fisher vector product.

- update_cycle (int): Number of steps to update the policy network.
- steps_per_epoch (int): Number of steps to update the policy network.
- update_iters (int): Number of iterations to update the policy network.
- batch_size (int): Batch size for each iteration.
- target_kl (float): Target KL divergence.
4 changes: 2 additions & 2 deletions docs/source/saferl/focops.rst
@@ -448,7 +448,7 @@ Quick start
'parallel': 1,
},
'algo_cfgs': {
'update_cycle': 2048,
'steps_per_epoch': 2048,
'update_iters': 1,
},
'logger_cfgs': {
@@ -590,7 +590,7 @@ Configs

- clip (float): Clipping parameter for FOCOPS.

- update_cycle (int): Number of steps to update the policy network.
- steps_per_epoch (int): Number of steps to update the policy network.
- update_iters (int): Number of iterations to update the policy network.
- batch_size (int): Batch size for each iteration.
- target_kl (float): Target KL divergence.
4 changes: 2 additions & 2 deletions docs/source/saferl/lag.rst
@@ -311,7 +311,7 @@ Quick start
'parallel': 1,
},
'algo_cfgs': {
'update_cycle': 2048,
'steps_per_epoch': 2048,
'update_iters': 1,
},
'logger_cfgs': {
@@ -450,7 +450,7 @@ Configs

- clip (float): Clipping parameter for PPOLag.

- update_cycle (int): Number of steps to update the policy network.
- steps_per_epoch (int): Number of steps to update the policy network.
- update_iters (int): Number of iterations to update the policy network.
- batch_size (int): Batch size for each iteration.
- target_kl (float): Target KL divergence.
4 changes: 2 additions & 2 deletions docs/source/saferl/pcpo.rst
@@ -438,7 +438,7 @@ Quick start
'parallel': 1,
},
'algo_cfgs': {
'update_cycle': 2048,
'steps_per_epoch': 2048,
'update_iters': 1,
},
'logger_cfgs': {
@@ -674,7 +674,7 @@ Configs
- cg_iters (int): Number of iterations for conjugate gradient.
- fvp_sample_freq (int): Frequency of sampling for Fisher vector product.

- update_cycle (int): Number of steps to update the policy network.
- steps_per_epoch (int): Number of steps to update the policy network.
- update_iters (int): Number of iterations to update the policy network.
- batch_size (int): Batch size for each iteration.
- target_kl (float): Target KL divergence.
4 changes: 2 additions & 2 deletions docs/source/start/usage.rst
@@ -33,7 +33,7 @@ Train policy
--algo PPO
--total-steps 1024
--vector-env-nums 1
--custom-cfgs algo_cfgs:update_cycle
--custom-cfgs algo_cfgs:steps_per_epoch
--custom-cfgs 512

Here we provide a video example:
@@ -44,7 +44,7 @@ Train policy


.. hint::
The above command will train a policy with PPO algorithm, and the total training steps is 1024. The vector environment number is 1, which means that the training process will use 1 CPU core. The ``algo_cfgs:update_cycle`` is the update cycle of the PPO algorithm, which means that the policy will be updated every 512 steps.
    The above command trains a policy with the PPO algorithm for a total of 1024 training steps. The vector environment number is 1, which means the training process uses 1 CPU core. ``algo_cfgs:steps_per_epoch`` sets the number of environment steps per epoch for the PPO algorithm, so the policy is updated every 512 steps.

Customize Configuration
-----------------------
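To make the hint above concrete, here is a small sketch (not part of the docs themselves) of how the number of policy updates follows from these two settings; it mirrors the `total_steps // steps_per_epoch` computation visible in `algo_wrapper.py` further down in this diff.

```python
# Sketch: how total_steps and the renamed steps_per_epoch determine the
# number of policy updates (mirrors total_steps // steps_per_epoch).
total_steps = 1024      # --total-steps 1024
steps_per_epoch = 512   # --custom-cfgs algo_cfgs:steps_per_epoch ... 512

epochs = total_steps // steps_per_epoch
print(epochs)  # 2 -> the policy is updated twice, once every 512 steps
```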
2 changes: 1 addition & 1 deletion examples/benchmarks/example_cli_benchmark_config.yaml
@@ -25,7 +25,7 @@ train_cfgs:torch_threads:
[1]
train_cfgs:total_steps:
1024
algo_cfgs:update_cycle:
algo_cfgs:steps_per_epoch:
512
seed:
[0]
4 changes: 2 additions & 2 deletions examples/benchmarks/run_experiment_grid.py
@@ -101,8 +101,8 @@ def train(
eg.add('logger_cfgs:use_wandb', [False])
eg.add('train_cfgs:vector_env_nums', [4])
eg.add('train_cfgs:torch_threads', [1])
eg.add('algo_cfgs:update_cycle', [2048])
eg.add('train_cfgs:total_steps', [1024000])
eg.add('algo_cfgs:steps_per_epoch', [20000])
eg.add('train_cfgs:total_steps', [10000000])
eg.add('seed', [0])
    # the total number of experiments must be divisible by num_pool
    # users should set this value according to their machine's resources
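As a migration aid, a minimal sketch of an experiment grid that uses the renamed key. It assumes `ExperimentGrid` is importable as in this example script; the experiment name and value lists are illustrative, not recommended settings.

```python
# Sketch: minimal grid with the renamed key. Assumes ExperimentGrid is
# importable as in this example script; names and values are illustrative.
from omnisafe.common.experiment_grid import ExperimentGrid

eg = ExperimentGrid(exp_name='steps-per-epoch-demo')  # hypothetical experiment name
eg.add('algo', ['PPOLag'])
eg.add('algo_cfgs:steps_per_epoch', [20000])          # was algo_cfgs:update_cycle
eg.add('train_cfgs:total_steps', [10000000])
eg.add('seed', [0])
# then eg.run(train, num_pool=...) as in the full example script
```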
2 changes: 1 addition & 1 deletion examples/train_from_custom_dict.py
@@ -25,7 +25,7 @@
'parallel': 1,
},
'algo_cfgs': {
'update_cycle': 2048,
'steps_per_epoch': 2048,
'update_iters': 1,
},
'logger_cfgs': {
2 changes: 1 addition & 1 deletion images/CLI_example.svg
10 changes: 7 additions & 3 deletions omnisafe/algorithms/algo_wrapper.py
@@ -54,6 +54,7 @@ def __init__(
self._plotter: Plotter = None
self.cfgs = self._init_config()
self._init_checks()
self._init_algo()

def _init_config(self):
"""Init config."""
@@ -94,7 +95,7 @@ def _init_config(self):
exp_name = f'{self.algo}-{{{self.env_id}}}'
cfgs.recurisve_update({'exp_name': exp_name, 'env_id': self.env_id, 'algo': self.algo})
cfgs.train_cfgs.recurisve_update(
{'epochs': cfgs.train_cfgs.total_steps // cfgs.algo_cfgs.update_cycle},
{'epochs': cfgs.train_cfgs.total_steps // cfgs.algo_cfgs.steps_per_epoch},
)
return cfgs

@@ -107,8 +108,8 @@ def _init_checks(self):
self.env_id in support_envs()
), f"{self.env_id} doesn't exist. Please choose from {support_envs()}."

def learn(self):
"""Agent Learning."""
def _init_algo(self):
"""Init algo."""
# Use number of physical cores as default.
# If also hardware threading CPUs should be used
# enable this by the use_number_of_threads=True
@@ -129,6 +130,9 @@ def learn(self):
env_id=self.env_id,
cfgs=self.cfgs,
)

def learn(self):
"""Agent Learning."""
ep_ret, ep_cost, ep_len = self.agent.learn()

self._init_statistical_tools()
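A short sketch of the wrapper lifecycle after this refactor, as it reads from the diff above: the algorithm instance is now built eagerly in `__init__` via `_init_algo`, and `learn()` only runs training. It assumes `omnisafe.Agent` wraps `AlgoWrapper`; the usage is illustrative, not an official API reference.

```python
# Sketch of the refactored lifecycle (method names taken from the diff above):
#   __init__ -> _init_config -> _init_checks -> _init_algo
#   learn()  -> training only; the underlying agent already exists
import omnisafe

agent = omnisafe.Agent('PPO', 'SafetyPointGoal1-v0')  # algorithm constructed here
agent.learn()                                         # no lazy construction inside learn()
```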
27 changes: 13 additions & 14 deletions omnisafe/algorithms/off_policy/ddpg.py
@@ -52,23 +52,25 @@ def _init_env(self) -> None:
self._seed,
self._cfgs,
)
assert (self._cfgs.algo_cfgs.update_cycle) % (
assert (self._cfgs.algo_cfgs.steps_per_epoch) % (
distributed.world_size() * self._cfgs.train_cfgs.vector_env_nums
) == 0, 'The number of steps per epoch is not divisible by the number of environments.'

assert (
int(self._cfgs.train_cfgs.total_steps) % self._cfgs.algo_cfgs.update_cycle == 0
int(self._cfgs.train_cfgs.total_steps) % self._cfgs.algo_cfgs.steps_per_epoch == 0
), 'The total number of steps is not divisible by the number of steps per epoch.'
self._epochs = int(self._cfgs.train_cfgs.total_steps // self._cfgs.algo_cfgs.update_cycle)
self._epochs = int(
self._cfgs.train_cfgs.total_steps // self._cfgs.algo_cfgs.steps_per_epoch,
)
self._epoch = 0
self._update_cycle = self._cfgs.algo_cfgs.update_cycle // (
self._steps_per_epoch = self._cfgs.algo_cfgs.steps_per_epoch // (
distributed.world_size() * self._cfgs.train_cfgs.vector_env_nums
)
self._steps_per_sample = self._cfgs.algo_cfgs.steps_per_sample
self._update_cycle = self._cfgs.algo_cfgs.update_cycle
assert (
self._update_cycle % self._steps_per_sample == 0
self._steps_per_epoch % self._update_cycle == 0
), 'The number of steps per epoch is not divisible by the number of steps per sample.'
self._samples_per_epoch = self._update_cycle // self._steps_per_sample
self._samples_per_epoch = self._steps_per_epoch // self._update_cycle
self._update_count = 0

def _init_model(self) -> None:
@@ -80,9 +82,6 @@ def _init_model(self) -> None:
epochs=self._epochs,
).to(self._device)

if distributed.world_size() > 1:
distributed.sync_params(self._actor_critic)

def _init(self) -> None:
self._buf = VectorOffPolicyBuffer(
obs_space=self._env.observation_space,
@@ -161,7 +160,7 @@ def learn(self) -> tuple[int | float, ...]:
epoch * self._samples_per_epoch,
(epoch + 1) * self._samples_per_epoch,
):
step = sample_step * self._steps_per_sample * self._cfgs.train_cfgs.vector_env_nums
step = sample_step * self._update_cycle * self._cfgs.train_cfgs.vector_env_nums

roll_out_start = time.time()
# set noise for exploration
@@ -170,7 +169,7 @@

# collect data from environment
self._env.roll_out(
roll_out_step=self._steps_per_sample,
roll_out_step=self._update_cycle,
agent=self._actor_critic,
buffer=self._buf,
logger=self._logger,
@@ -204,8 +203,8 @@

self._logger.store(
**{
'TotalEnvSteps': step,
'Time/FPS': self._cfgs.algo_cfgs.update_cycle / (time.time() - epoch_time),
'TotalEnvSteps': step + 1,
'Time/FPS': self._cfgs.algo_cfgs.steps_per_epoch / (time.time() - epoch_time),
'Time/Total': (time.time() - start_time),
'Time/Epoch': (time.time() - epoch_time),
'Train/Epoch': epoch,
Expand Down
3 changes: 0 additions & 3 deletions omnisafe/algorithms/off_policy/sac.py
@@ -52,9 +52,6 @@ def _init_model(self) -> None:
epochs=self._epochs,
).to(self._device)

if distributed.world_size() > 1:
distributed.sync_params(self._actor_critic)

def _init(self) -> None:
super()._init()
if self._cfgs.algo_cfgs.auto_alpha:
3 changes: 0 additions & 3 deletions omnisafe/algorithms/off_policy/td3.py
@@ -44,9 +44,6 @@ def _init_model(self) -> None:
epochs=self._epochs,
).to(self._device)

if distributed.world_size() > 1:
distributed.sync_params(self._actor_critic)

def _update_reward_critic(
self,
obs: torch.Tensor,
12 changes: 6 additions & 6 deletions omnisafe/algorithms/on_policy/base/policy_gradient.py
@@ -62,11 +62,11 @@ def _init_env(self) -> None:
self._seed,
self._cfgs,
)
assert (self._cfgs.algo_cfgs.update_cycle) % (
assert (self._cfgs.algo_cfgs.steps_per_epoch) % (
distributed.world_size() * self._cfgs.train_cfgs.vector_env_nums
) == 0, 'The number of steps per epoch is not divisible by the number of environments.'
self._steps_per_epoch = (
self._cfgs.algo_cfgs.update_cycle
self._cfgs.algo_cfgs.steps_per_epoch
// distributed.world_size()
// self._cfgs.train_cfgs.vector_env_nums
)
@@ -199,15 +199,15 @@ def _init_log(self) -> None:
self._logger.setup_torch_saver(what_to_save)
self._logger.torch_save()

self._logger.register_key('Metrics/EpRet', window_length=50, min_and_max=True)
self._logger.register_key('Metrics/EpRet', window_length=50)
self._logger.register_key('Metrics/EpCost', window_length=50)
self._logger.register_key('Metrics/EpLen', window_length=50)

self._logger.register_key('Train/Epoch')
self._logger.register_key('Train/Entropy')
self._logger.register_key('Train/KL')
self._logger.register_key('Train/StopIter')
self._logger.register_key('Train/PolicyRatio')
self._logger.register_key('Train/PolicyRatio', min_and_max=True)
self._logger.register_key('Train/LR')
if self._cfgs.model_cfgs.actor_type == 'gaussian_learning':
self._logger.register_key('Train/PolicyStd')
@@ -270,8 +270,8 @@ def learn(self) -> tuple[int | float, ...]:

self._logger.store(
**{
'TotalEnvSteps': (epoch + 1) * self._cfgs.algo_cfgs.update_cycle,
'Time/FPS': self._cfgs.algo_cfgs.update_cycle / (time.time() - epoch_time),
'TotalEnvSteps': (epoch + 1) * self._cfgs.algo_cfgs.steps_per_epoch,
'Time/FPS': self._cfgs.algo_cfgs.steps_per_epoch / (time.time() - epoch_time),
'Time/Total': (time.time() - start_time),
'Time/Epoch': (time.time() - epoch_time),
'Train/Epoch': epoch,
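A quick sketch of what the per-epoch logging arithmetic above reports under the renamed key; the numbers are illustrative.

```python
# Sketch: per-epoch logging arithmetic with the renamed key; values are
# illustrative.
steps_per_epoch = 2048
epoch = 3                 # zero-based epoch index
epoch_duration_s = 10.0   # wall-clock time of this epoch

total_env_steps = (epoch + 1) * steps_per_epoch    # 8192 env steps logged so far
fps = steps_per_epoch / epoch_duration_s           # 204.8 env steps per second
```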