feat: support isaac gym interface #325

Merged · 15 commits · May 2, 2024
5 changes: 3 additions & 2 deletions .pre-commit-config.yaml
@@ -90,12 +90,13 @@ repos:
^examples/|
^tests/|
^setup.py$|
^docs/source/conf.py$|
^omnisafe/envs/classic_control/envs_from_crabs.py$|
^omnisafe/common/control_barrier_function/crabs/models.py$|
^omnisafe/common/control_barrier_function/crabs/optimizers.py$|
^omnisafe/common/control_barrier_function/crabs/utils.py$|
^omnisafe/algorithms/off_policy/crabs.py$
^omnisafe/algorithms/off_policy/crabs.py$|
^omnisafe/utils/isaac_gym_utils.py$|
^docs/source/conf.py$
)
- repo: https://github.com/pycqa/pydocstyle
rev: 6.3.0
2 changes: 1 addition & 1 deletion .pylintrc
@@ -48,7 +48,7 @@ ignore=CVS,.vscode,.history
# ignore-list. The regex matches against paths and can be in Posix or Windows
# format. Because '\' represents the directory delimiter on Windows systems, it
# can't be used as an escape character.
ignore-paths=^examples/$,^tests/$
ignore-paths=^examples/$,^tests/$,^omnisafe/utils/isaac_gym_utils.py$,

# Files or directories matching the regular expression patterns are skipped.
# The regex matches against base names, not paths. The default value ignores
15 changes: 15 additions & 0 deletions README.md
@@ -287,6 +287,21 @@ Here is a list of environments that [Safety-Gymnasium](https://www.safety-gymnas
<td>HalfCheetah, Hopper, Swimmer, Walker2d, Ant, Humanoid</td>
<td>SafetyHumanoidVelocity-v1</td>
</tr>
<tr>
<td rowspan="4">Safe Isaac Gym</td>
<td>OverSafeFinger</td>
<td rowspan="4">ShadowHand</td>
<td rowspan="4">ShadowHandOverSafeFinger</td>
</tr>
<tr>
<td>OverSafeJoint</td>
</tr>
<tr>
<td>CatchOver2UnderarmSafeFinger</td>
</tr>
<tr>
<td>CatchOver2UnderarmSafeJoint</td>
</tr>
</tbody>
</table>

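For context, a minimal training sketch for one of the new Safe Isaac Gym tasks, using OmniSafe's existing `Agent` API (not part of this diff; a working `isaacgym` installation is required):

```python
import omnisafe

# Minimal sketch: train PPO on one of the Safe Isaac Gym tasks listed above.
agent = omnisafe.Agent('PPO', 'ShadowHandOverSafeFinger')
agent.learn()
```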
6 changes: 6 additions & 0 deletions omnisafe/__init__.py
@@ -14,6 +14,12 @@
# ==============================================================================
"""OmniSafe: A comprehensive and reliable benchmark for safe reinforcement learning."""

from contextlib import suppress


with suppress(ImportError):
from isaacgym import gymutil

from omnisafe import algorithms
from omnisafe.algorithms import ALGORITHMS
from omnisafe.algorithms.algo_wrapper import AlgoWrapper as Agent
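The guarded import above exists because Isaac Gym needs to be imported before torch; wrapping it in `suppress(ImportError)` keeps OmniSafe importable when `isaacgym` is not installed. A small sketch (hypothetical helper, not part of this PR) of the same pattern used to expose an availability flag:

```python
from contextlib import suppress

# Hypothetical helper: mirror the guarded import above and record whether
# Isaac Gym support is available at runtime.
ISAAC_GYM_AVAILABLE = False
with suppress(ImportError):
    from isaacgym import gymutil  # noqa: F401  # must precede any torch import
    ISAAC_GYM_AVAILABLE = True

if not ISAAC_GYM_AVAILABLE:
    print('isaacgym is not installed; Safe Isaac Gym tasks will be unavailable.')
```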
4 changes: 2 additions & 2 deletions omnisafe/adapter/crabs_adapter.py
@@ -74,13 +74,13 @@ def eval_policy( # pylint: disable=too-many-locals
"""
for _ in range(episode):
ep_ret, ep_cost, ep_len = 0.0, 0.0, 0
obs, _ = self._eval_env.reset()
obs, _ = self._eval_env.reset() # type: ignore
obs = obs.to(self._device)

done = False
while not done:
act = agent.step(obs, deterministic=False)
obs, reward, cost, terminated, truncated, info = self._eval_env.step(act)
obs, reward, cost, terminated, truncated, info = self._eval_env.step(act) # type: ignore
obs, reward, cost, terminated, truncated = (
torch.as_tensor(x, dtype=torch.float32, device=self._device)
for x in (obs, reward, cost, terminated, truncated)
1 change: 1 addition & 0 deletions omnisafe/adapter/offpolicy_adapter.py
@@ -76,6 +76,7 @@ def eval_policy( # pylint: disable=too-many-locals
agent (ConstraintActorCritic): Agent.
logger (Logger): Logger, to log ``EpRet``, ``EpCost``, ``EpLen``.
"""
assert self._eval_env, 'Environment for evaluation has not been set!'
for _ in range(episode):
ep_ret, ep_cost, ep_len = 0.0, 0.0, 0
obs, _ = self._eval_env.reset()
46 changes: 35 additions & 11 deletions omnisafe/adapter/online_adapter.py
@@ -69,14 +69,17 @@ def __init__( # pylint: disable=too-many-arguments
env_cfgs = self._cfgs.env_cfgs.todict()

self._env: CMDP = make(env_id, num_envs=num_envs, device=self._device, **env_cfgs)
self._eval_env: CMDP = make(env_id, num_envs=1, device=self._device, **env_cfgs)

self._wrapper(
obs_normalize=cfgs.algo_cfgs.obs_normalize,
reward_normalize=cfgs.algo_cfgs.reward_normalize,
cost_normalize=cfgs.algo_cfgs.cost_normalize,
)

self._eval_env: CMDP | None = None
if self._env.need_evaluation:
self._eval_env = make(env_id, num_envs=1, device=self._device, **env_cfgs)
self._wrapper_eval(obs_normalize=cfgs.algo_cfgs.obs_normalize)

self._env.set_seed(seed)

def _wrapper(
@@ -116,32 +119,53 @@ def _wrapper(
"""
if self._env.need_time_limit_wrapper:
assert (
self._env.max_episode_steps and self._eval_env.max_episode_steps
self._env.max_episode_steps
), 'You must define max_episode_steps as an integer\
or cancel the use of the time_limit wrapper.'
\nor cancel the use of the time_limit wrapper.'
self._env = TimeLimit(
self._env,
time_limit=self._env.max_episode_steps,
device=self._device,
)
self._eval_env = TimeLimit(
self._eval_env,
time_limit=self._eval_env.max_episode_steps,
device=self._device,
)
if self._env.need_auto_reset_wrapper:
self._env = AutoReset(self._env, device=self._device)
if obs_normalize:
self._env = ObsNormalize(self._env, device=self._device)
self._eval_env = ObsNormalize(self._eval_env, device=self._device)
if reward_normalize:
self._env = RewardNormalize(self._env, device=self._device)
if cost_normalize:
self._env = CostNormalize(self._env, device=self._device)
self._env = ActionScale(self._env, low=-1.0, high=1.0, device=self._device)
self._eval_env = ActionScale(self._eval_env, low=-1.0, high=1.0, device=self._device)
if self._env.num_envs == 1:
self._env = Unsqueeze(self._env, device=self._device)

def _wrapper_eval(
self,
obs_normalize: bool = True,
) -> None:
"""Wrapper the environment for evaluation.

Args:
obs_normalize (bool, optional): Whether to normalize the observation. Defaults to True.
"""
assert self._eval_env, 'Your environment for evaluation does not exist!'
if self._env.need_time_limit_wrapper:
assert (
self._eval_env.max_episode_steps
), 'You must define max_episode_steps as an\
\ninteger or cancel the use of the time_limit wrapper.'
self._eval_env = TimeLimit(
self._eval_env,
time_limit=self._eval_env.max_episode_steps,
device=self._device,
)
if self._env.need_auto_reset_wrapper:
self._eval_env = AutoReset(self._eval_env, device=self._device)
if obs_normalize:
self._eval_env = ObsNormalize(self._eval_env, device=self._device)
self._eval_env = ActionScale(self._eval_env, low=-1.0, high=1.0, device=self._device)
self._eval_env = Unsqueeze(self._eval_env, device=self._device)

@property
11 changes: 8 additions & 3 deletions omnisafe/adapter/onpolicy_adapter.py
@@ -103,15 +103,20 @@ def rollout( # pylint: disable=too-many-locals

obs = next_obs
epoch_end = step >= steps_per_epoch - 1
if epoch_end:
num_dones = int(terminated.contiguous().sum())
if self._env.num_envs - num_dones:
logger.log(
f'\nWarning: trajectory cut off when rollout by epoch\
in {self._env.num_envs - num_dones} of {self._env.num_envs} environments.',
)

for idx, (done, time_out) in enumerate(zip(terminated, truncated)):
if epoch_end or done or time_out:
last_value_r = torch.zeros(1)
last_value_c = torch.zeros(1)
if not done:
if epoch_end:
logger.log(
f'Warning: trajectory cut off when rollout by epoch at {self._ep_len[idx]} steps.',
)
_, last_value_r, last_value_c, _ = agent.step(obs[idx])
if time_out:
_, last_value_r, last_value_c, _ = agent.step(
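The new epoch-end check aggregates the cut-off warning over all vectorized environments instead of logging once per environment. A standalone illustration of the counting logic (example flag values are assumed, not from the PR):

```python
import torch

# Hypothetical termination flags for 4 vectorized environments at epoch end.
terminated = torch.tensor([True, False, False, True])
num_envs = terminated.numel()
num_dones = int(terminated.contiguous().sum())
if num_envs - num_dones:
    print(
        f'Warning: trajectory cut off when rollout by epoch '
        f'in {num_envs - num_dones} of {num_envs} environments.',
    )
```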
156 changes: 156 additions & 0 deletions omnisafe/configs/on-policy/PPO.yaml
@@ -118,3 +118,159 @@ defaults:
activation: tanh
# learning rate
lr: 0.0003

ShadowHandCatchOver2UnderarmSafeFinger:
# training configurations
train_cfgs:
# number of vectorized environments
vector_env_nums: 256
# total number of steps to train
total_steps: 100000000
# algorithm configurations
algo_cfgs:
# number of steps to update the policy
steps_per_epoch: 38400
# number of iterations to update the policy
update_iters: 8
# batch size for each iteration
batch_size: 8192
# target kl divergence
target_kl: 0.016
# max gradient norm
max_grad_norm: 1.0
# use critic norm
use_critic_norm: False
# reward discount factor
gamma: 0.96
# normalize reward
reward_normalize: False
# normalize cost
cost_normalize: False
# normalize observation
obs_normalize: False
# model configurations
model_cfgs:
# actor network configurations
actor:
# hidden layer sizes
hidden_sizes: [1024, 1024, 512]
critic:
# hidden layer sizes
hidden_sizes: [1024, 1024, 512]
# learning rate
lr: 0.0006

ShadowHandOverSafeFinger:
# training configurations
train_cfgs:
# number of vectorized environments
vector_env_nums: 256
# total number of steps to train
total_steps: 100000000
# algorithm configurations
algo_cfgs:
# number of steps to update the policy
steps_per_epoch: 38400
# number of iterations to update the policy
update_iters: 8
# batch size for each iteration
batch_size: 8192
# target kl divergence
target_kl: 0.016
# max gradient norm
max_grad_norm: 1.0
# use critic norm
use_critic_norm: False
# reward discount factor
gamma: 0.96
# normalize observation
obs_normalize: False
# model configurations
model_cfgs:
# actor network configurations
actor:
# hidden layer sizes
hidden_sizes: [1024, 1024, 512]
critic:
# hidden layer sizes
hidden_sizes: [1024, 1024, 512]
# learning rate
lr: 0.0006

ShadowHandCatchOver2UnderarmSafeJoint:
# training configurations
train_cfgs:
# number of vectorized environments
vector_env_nums: 256
# total number of steps to train
total_steps: 100000000
# algorithm configurations
algo_cfgs:
# number of steps to update the policy
steps_per_epoch: 38400
# number of iterations to update the policy
update_iters: 8
# batch size for each iteration
batch_size: 8192
# target kl divergence
target_kl: 0.016
# max gradient norm
max_grad_norm: 1.0
# use critic norm
use_critic_norm: False
# reward discount factor
gamma: 0.96
# normalize reward
reward_normalize: False
# normalize cost
cost_normalize: False
# normalize observation
obs_normalize: False
# model configurations
model_cfgs:
# actor network configurations
actor:
# hidden layer sizes
hidden_sizes: [1024, 1024, 512]
critic:
# hidden layer sizes
hidden_sizes: [1024, 1024, 512]
# learning rate
lr: 0.0006

ShadowHandOverSafeJoint:
# training configurations
train_cfgs:
# number of vectorized environments
vector_env_nums: 256
# total number of steps to train
total_steps: 100000000
# algorithm configurations
algo_cfgs:
# number of steps to update the policy
steps_per_epoch: 38400
# number of iterations to update the policy
update_iters: 8
# batch size for each iteration
batch_size: 8192
# target kl divergence
target_kl: 0.016
# max gradient norm
max_grad_norm: 1.0
# use critic norm
use_critic_norm: False
# reward discount factor
gamma: 0.96
# normalize observation
obs_normalize: False
# model configurations
model_cfgs:
# actor network configurations
actor:
# hidden layer sizes
hidden_sizes: [1024, 1024, 512]
critic:
# hidden layer sizes
hidden_sizes: [1024, 1024, 512]
# learning rate
lr: 0.0006
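These YAML blocks supply the default PPO settings for the four Safe Isaac Gym tasks. They can be overridden at launch through `custom_cfgs`, whose nesting mirrors the YAML sections; a sketch only, with values copied from the defaults above:

```python
import omnisafe

# Sketch: override a few ShadowHandOverSafeFinger defaults shown above.
# The dict keys mirror the YAML sections (train_cfgs, algo_cfgs, ...).
custom_cfgs = {
    'train_cfgs': {'vector_env_nums': 256, 'total_steps': 100_000_000},
    'algo_cfgs': {'steps_per_epoch': 38400, 'batch_size': 8192},
}
agent = omnisafe.Agent('PPO', 'ShadowHandOverSafeFinger', custom_cfgs=custom_cfgs)
agent.learn()
```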