chore: update yaml #93

Merged 1 commit on Feb 1, 2023
133 changes: 0 additions & 133 deletions omnisafe/configs/on-policy/PPO.yaml
@@ -286,136 +286,3 @@ SafetyRacecarGoal1-v0:
max_len: 100
# The number of threads used to sample data
num_threads: 20


SafetyHumanoidVelocity-v4:
# --------------------------------------Basic Configurations----------------------------------- #
## -----------------------------Basic configurations for base class PG------------------------ ##
# The random seed
seed: 0
# The torch device
device: cpu
# The torch device id
device_id: 0
# The environment wrapper type
wrapper_type: CMDPWrapper
# Number of epochs
epochs: 500
# Number of steps per epoch
steps_per_epoch: 32768
# Number of update iterations for the Actor network
actor_iters: 40
# Number of update iterations for the Critic network
critic_iters: 40
# Check that all models have the same parameter values every `check_freq` epochs
check_freq: 25
# Save the model to disk every `save_freq` epochs
save_freq: 50
# Entropy coefficient for PPO loss
entropy_coef: 0.0
# The maximum length of an episode
max_ep_len: 1000
# The number of mini-batches
num_mini_batches: 64
# The learning rate of Actor network
actor_lr: 0.0003
# The learning rate of Critic network
critic_lr: 0.0003
# The directory for saving training process data
data_dir: "./runs"
## ---------------------------Basic configurations for derived class PPO---------------------- ##
# The threshold for KL early stopping
target_kl: 0.02
# The size of batch for policy update
batch_size: 10000
# The clip range for PPO loss
clip: 0.2

# ---------------------------------------Optional Configuration-------------------------------- #
## -----------------------------------Configuration For Cost Critic--------------------------- ##
# Whether to use cost critic
use_cost: False
# The cost discount factor
cost_gamma: 1.0
# Whether to use linear decay of learning rate
linear_lr_decay: True
# Whether to anneal exploration noise
exploration_noise_anneal: False
# The coefficient of reward penalty
penalty_param: 0.0
# Whether to use KL early stopping
kl_early_stopping: True
# Whether to use max gradient norm
use_max_grad_norm: True
# The threshold for the max gradient norm
max_grad_norm: 40
# Whether to use critic network norm
use_critic_norm: True
# The norm coefficient of critic network
critic_norm_coeff: 0.001
## ---------------------------------------Configuration For Model----------------------------- ##
model_cfgs:
# Whether to share the weight of Actor network with Critic network
shared_weights: False
# The mode used to initialize network weights, chosen from "kaiming_uniform", "xavier_normal", "glorot" and "orthogonal".
weight_initialization_mode: "kaiming_uniform"
# The type of Actor, chosen from "gaussian_annealing", "gaussian_std_net_actor", "gaussian_learning_actor", "categorical_actor"
actor_type: gaussian
# Configuration of Actor and Critic network
ac_kwargs:
# Configuration of Actor network
pi:
# Size of hidden layers
hidden_sizes: [64, 64]
# Activation function
activation: tanh
# Output activation function
output_activation: identity
# Whether to scale action.
scale_action: False
# Whether to clip action.
clip_action: False
# Whether to learn the standard deviation of the Gaussian noise
std_learning: True
# The initial value of standard deviation of Gaussian noise
std_init: 1.0
# Configuration of Critic network
val:
# Number of critic networks
num_critics: 1
# Size of hidden layers
hidden_sizes: [64, 64]
# Activation function
activation: tanh
## --------------------------------------Configuration For Buffer----------------------------- ##
buffer_cfgs:
# The reward discount factor
gamma: 0.99
# The GAE lambda parameter used to estimate future rewards
lam: 0.95
# The GAE lambda parameter used to estimate future costs
lam_c: 0.95
# Method used to estimate the reward/cost advantage, chosen from "gae", "plain", "vtrace"
adv_estimation_method: gae
# Whether to standardize the reward advantage estimate
standardized_rew_adv: True
# Whether to standardize the cost advantage estimate
standardized_cost_adv: True
## --------------------------------------Configuration For Environment------------------------ ##
env_cfgs:
# The seed of environment
env_seed: 0
# The number of parallel environments
num_envs: 8
# Whether to use asynchronous environments
async_env: True
# Whether to normalize rewards
normalized_rew: True
# Whether to normalize costs
normalized_cost: False
# Whether to normalize observations
normalized_obs: False
# The maximum length of record queue
max_len: 100
# The number of threads used to sample data
num_threads: 20
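
For reference, the deleted block is plain YAML, so the before/after state of the file can be checked programmatically. Below is a minimal sketch, assuming PyYAML is installed and using the repository-relative path from the diff header (omnisafe/configs/on-policy/PPO.yaml); the key names come straight from the removed section above.

import yaml  # PyYAML

# Load the per-environment PPO defaults shipped with omnisafe.
with open("omnisafe/configs/on-policy/PPO.yaml", "r", encoding="utf-8") as f:
    configs = yaml.safe_load(f)

# Before this PR the file contained a SafetyHumanoidVelocity-v4 section; after
# the merge that top-level key is gone, so the lookup below returns None.
humanoid_cfg = configs.get("SafetyHumanoidVelocity-v4")
if humanoid_cfg is None:
    print("No SafetyHumanoidVelocity-v4 override in PPO.yaml")
else:
    print(humanoid_cfg["epochs"], humanoid_cfg["buffer_cfgs"]["gamma"])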