diff --git a/README.md b/README.md index 74930ed..84f57bb 100644 --- a/README.md +++ b/README.md @@ -49,7 +49,7 @@ Please check our [**Online Documentation**](https://rlzoo.readthedocs.io) for de - [Contents](#contents) - [Algorithms](#algorithms) - [Environments](#environments) - - [Configurations](#configuration) + - [Configurations](#configurations) - [Properties](#properties) - [Troubleshooting](#troubleshooting) - [Credits](#credits) @@ -66,8 +66,14 @@ the coming months after initial release. We will keep improving the potential pr
Version History [click to expand]
+ +* 1.0.4 (Current version) -* 1.0.3 (Current version) + Changes: + + * Add distributed training for the DPPO algorithm, using KungFu + +* 1.0.3 Changes: @@ -279,6 +285,148 @@ python algorithms/ac/run_ac.py We also provide an interactive learning configuration with Jupyter Notebook and *ipywidgets*, where you can select the algorithm, environment, and general learning settings with simple clicking on dropdown lists and sliders! A video demonstrating the usage is as following. The interactive mode can be used with [`rlzoo/interactive/main.ipynb`](https://github.com/tensorlayer/RLzoo/blob/master/rlzoo/interactive/main.ipynb) by running `$ jupyter notebook` to open it. ![Interactive Video](https://github.com/tensorlayer/RLzoo/blob/master/gif/interactive.gif) + + +### Distributed Training +RLzoo supports distributed training across multiple computational nodes, each with multiple CPUs/GPUs, using the [KungFu](https://github.com/lsds/KungFu) package. Installing KungFu requires *CMake* and *Golang* to be installed first; see the [KungFu website](https://github.com/lsds/KungFu) for details. +An example of distributed training is provided in the folder `rlzoo/distributed`; running the following command launches the distributed training process: +```bash +rlzoo/distributed/run_dis_train.sh +``` +
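Before launching, the KungFu Python package and the `kungfu-run` launcher must be available on every node. The sketch below outlines a typical source install; the repository URL is real, but the exact commands are an assumption, so consult the KungFu README for the authoritative steps.

```bash
# Assumed install flow -- check https://github.com/lsds/KungFu for the exact steps.
# CMake and Golang must already be installed.
git clone https://github.com/lsds/KungFu.git
cd KungFu
pip3 install -U .   # builds and installs the KungFu Python package
# kungfu-run (the launcher invoked by run_dis_train.sh) is built with Go;
# follow the KungFu README to build it and add it to your PATH.
```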
Code in Bash script [click to expand] +
+
+```bash
+#!/bin/sh
+set -e
+
+cd $(dirname $0)
+
+kungfu_flags() {
+    echo -q
+    echo -logdir logs
+
+    local ip1=127.0.0.1
+    local np1=$np
+
+    local ip2=127.0.0.10
+    local np2=$np
+    local H=$ip1:$np1,$ip2:$np2
+    local m=cpu,gpu
+
+    echo -H $ip1:$np1
+}
+
+prun() {
+    local np=$1
+    shift
+    kungfu-run $(kungfu_flags) -np $np $@
+}
+
+n_learner=2
+n_actor=2
+n_server=1
+
+flags() {
+    echo -l $n_learner
+    echo -a $n_actor
+    echo -s $n_server
+}
+
+rl_run() {
+    local n=$((n_learner + n_actor + n_server))
+    prun $n python3 training_components.py $(flags)
+}
+
+main() {
+    rl_run
+}
+
+main
+```
+The script specifies the IP addresses of the different computational nodes, as well as the number of policy learners (which update the models), actors (which sample by interacting with environments) and inference servers (which run policy forward inference during sampling), set via `n_learner`, `n_actor` and `n_server` respectively.
+
+
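For reference, with the default values above (`n_learner=2`, `n_actor=2`, `n_server=1`, all processes on `127.0.0.1`), the script boils down to a single `kungfu-run` invocation roughly equivalent to the sketch below, derived from the flags echoed in the script; adapt the host list and process counts to your own cluster.

```bash
# 5 processes in total: 2 learners + 2 actors + 1 inference server
kungfu-run -q -logdir logs -H 127.0.0.1:5 -np 5 \
    python3 training_components.py -l 2 -a 2 -s 1
```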
+ +Other training details are specified in a separate Python script, `training_components.py`, located **in the same directory** as `run_dis_train.sh`, as shown below. +
Code in Python script [click to expand] +
+
+```python
+from rlzoo.common.env_wrappers import build_env
+from rlzoo.common.policy_networks import *
+from rlzoo.common.value_networks import *
+from rlzoo.algorithms.dppo_clip_distributed.dppo_clip import DPPO_CLIP
+from functools import partial
+
+# Specify the training configurations
+training_conf = {
+    'total_step': int(1e7),  # overall training timesteps
+    'traj_len': 200,  # length of the rollout trajectory
+    'train_n_traj': 2,  # update the models after every certain number of trajectories for each learner
+    'save_interval': 10,  # saving the models after every certain number of updates
+}
+
+# Specify the environment and launch it
+env_name, env_type = 'CartPole-v0', 'classic_control'
+env_maker = partial(build_env, env_name, env_type)
+temp_env = env_maker()
+obs_shape, act_shape = temp_env.observation_space.shape, temp_env.action_space.shape
+
+env_conf = {
+    'env_name': env_name,
+    'env_type': env_type,
+    'env_maker': env_maker,
+    'obs_shape': obs_shape,
+    'act_shape': act_shape,
+}
+
+
+def build_network(observation_space, action_space, name='DPPO_CLIP'):
+    """ build networks for the algorithm """
+    hidden_dim = 256
+    num_hidden_layer = 2
+    critic = ValueNetwork(observation_space, [hidden_dim] * num_hidden_layer, name=name + '_value')
+
+    actor = StochasticPolicyNetwork(observation_space, action_space,
+                                    [hidden_dim] * num_hidden_layer,
+                                    trainable=True,
+                                    name=name + '_policy')
+    return critic, actor
+
+
+def build_opt(actor_lr=1e-4, critic_lr=2e-4):
+    """ choose the optimizer for learning """
+    import tensorflow as tf
+    return [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)]
+
+
+net_builder = partial(build_network, temp_env.observation_space, temp_env.action_space)
+opt_builder = partial(build_opt, )
+
+agent_conf = {
+    'net_builder': net_builder,
+    'opt_builder': opt_builder,
+    'agent_generator': partial(DPPO_CLIP, net_builder, opt_builder),
+}
+del temp_env
+
+from rlzoo.distributed.start_dis_role import main
+
+print('Start Training.')
+main(training_conf, env_conf, agent_conf)
+print('Training Finished.')
+
+```
+Users can specify the environment, network architectures, optimizers and other training details in this script.
+
+
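For example, switching the task, the optimizer settings, or the network size only requires editing a few lines of `training_components.py`; a minimal sketch of such edits is shown below. It assumes the chosen `(env_name, env_type)` pair, here `'Pendulum-v0'` / `'classic_control'`, is supported by RLzoo's `build_env`, and it reuses the `build_opt` and `build_network` helpers defined in the script above.

```python
from functools import partial

# Pick another supported task, e.g. a different classic-control environment.
env_name, env_type = 'Pendulum-v0', 'classic_control'

# Use custom learning rates for the two Adam optimizers created in build_opt.
opt_builder = partial(build_opt, actor_lr=3e-4, critic_lr=6e-4)

# To change the network size, edit the constants inside build_network, e.g.
#     hidden_dim = 512
#     num_hidden_layer = 3
```

The rest of the script, i.e. the `*_conf` dictionaries and the final call to `main`, stays unchanged.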
+ +Note: if RLzoo is installed, you can create the two scripts `run_dis_train.sh` and `training_components.py` in whatever directory to launch distributed training, as long as the two scripts are in the same directory. + ## Contents @@ -399,8 +547,12 @@ Our core contributors include: [Tianyang Yu](https://github.com/Tokarev-TT-33), [Yanhua Huang](https://github.com/Officium), [Hongming Zhang](https://github.com/initial-h), +[Guo Li](https://github.com/lgarithm), +Quancheng Guo, +[Luo Mai](https://github.com/luomai), [Hao Dong](https://github.com/zsdonghao) + ## Citing ``` diff --git a/rlzoo/.gitignore b/rlzoo/.gitignore old mode 100644 new mode 100755 index 1120de8..e66da9b --- a/rlzoo/.gitignore +++ b/rlzoo/.gitignore @@ -1,4 +1,4 @@ -*.pyc -/img -/log -/model +*.pyc +/img +/log +/model diff --git a/rlzoo/__init__.py b/rlzoo/__init__.py old mode 100644 new mode 100755 diff --git a/rlzoo/algorithms/__init__.py b/rlzoo/algorithms/__init__.py old mode 100644 new mode 100755 index b3b8ef9..d8f38d3 --- a/rlzoo/algorithms/__init__.py +++ b/rlzoo/algorithms/__init__.py @@ -1,14 +1,14 @@ -from .ac.ac import AC -from .pg.pg import PG -from .dqn.dqn import DQN -from .a3c.a3c import A3C -from .ddpg.ddpg import DDPG -from .td3.td3 import TD3 -from .sac.sac import SAC -from .ppo.ppo import PPO -from .ppo_penalty.ppo_penalty import PPO_PENALTY -from .ppo_clip.ppo_clip import PPO_CLIP -from .dppo.dppo import DPPO -from .dppo_penalty.dppo_penalty import DPPO_PENALTY -from .dppo_clip.dppo_clip import DPPO_CLIP -from .trpo.trpo import TRPO +from .ac.ac import AC +from .pg.pg import PG +from .dqn.dqn import DQN +from .a3c.a3c import A3C +from .ddpg.ddpg import DDPG +from .td3.td3 import TD3 +from .sac.sac import SAC +from .ppo.ppo import PPO +from .ppo_penalty.ppo_penalty import PPO_PENALTY +from .ppo_clip.ppo_clip import PPO_CLIP +from .dppo.dppo import DPPO +from .dppo_penalty.dppo_penalty import DPPO_PENALTY +from .dppo_clip.dppo_clip import DPPO_CLIP +from .trpo.trpo import TRPO diff --git a/rlzoo/algorithms/a3c/__init__.py b/rlzoo/algorithms/a3c/__init__.py old mode 100644 new mode 100755 diff --git a/rlzoo/algorithms/a3c/a3c.py b/rlzoo/algorithms/a3c/a3c.py old mode 100644 new mode 100755 index 8ed6348..867fa21 --- a/rlzoo/algorithms/a3c/a3c.py +++ b/rlzoo/algorithms/a3c/a3c.py @@ -1,275 +1,275 @@ -""" -Asynchronous Advantage Actor Critic (A3C) with Continuous Action Space. - -Actor Critic History ----------------------- -A3C > DDPG (for continuous action space) > AC - -Advantage ----------- -Train faster and more stable than AC. - -Disadvantage -------------- -Have bias. - -Reference ----------- -Original Paper: https://arxiv.org/pdf/1602.01783.pdf -MorvanZhou's tutorial: https://morvanzhou.github.io/tutorials/ -MorvanZhou's code: https://github.com/MorvanZhou/Reinforcement-learning-with-tensorflow/ -Environment ------------ -BipedalWalker-v2 : https://gym.openai.com/envs/BipedalWalker-v2 - -Reward is given for moving forward, total 300+ points up to the far end. -If the robot falls, it gets -100. Applying motor torque costs a small amount of -points, more optimal agent will get better score. State consists of hull angle -speed, angular velocity, horizontal speed, vertical speed, position of joints -and joints angular speed, legs contact with ground, and 10 lidar rangefinder -measurements. There's no coordinates in the state vector. 
- -Prerequisites --------------- -tensorflow 2.0.0a0 -tensorflow-probability 0.6.0 -tensorlayer 2.0.0 -&& -pip install box2d box2d-kengz --user - -""" - -import multiprocessing -import threading -import time - -from rlzoo.common.utils import * -from rlzoo.common.buffer import * - - -# tl.logging.set_verbosity(tl.logging.DEBUG) -################### Asynchronous Advantage Actor Critic (A3C) #################################### -class ACNet(object): - - def __init__(self, net_list, scope, entropy_beta): - self.ENTROPY_BETA = entropy_beta - self.actor, self.critic = net_list - - # @tf.function # shouldn't use here! - def update_global( - self, buffer_s, buffer_a, buffer_v_target, globalAC - ): # refer to the global Actor-Crtic network for updating it with samples - """ update the global critic """ - with tf.GradientTape() as tape: - self.v = self.critic(buffer_s) - self.v_target = buffer_v_target - td = tf.subtract(self.v_target, self.v, name='TD_error') - self.c_loss = tf.reduce_mean(tf.square(td)) - self.c_grads = tape.gradient(self.c_loss, self.critic.trainable_weights) - OPT_C.apply_gradients(zip(self.c_grads, globalAC.critic.trainable_weights)) # local grads applies to global net - del tape # Drop the reference to the tape - """ update the global actor """ - with tf.GradientTape() as tape: - self.actor(buffer_s) - self.a_his = buffer_a # float32 - log_prob = self.actor.policy_dist.logp(self.a_his) - exp_v = log_prob * td # td is from the critic part, no gradients for it - entropy = self.actor.policy_dist.entropy() # encourage exploration - self.exp_v = self.ENTROPY_BETA * entropy + exp_v - self.a_loss = tf.reduce_mean(-self.exp_v) - self.a_grads = tape.gradient(self.a_loss, self.actor.trainable_weights) - OPT_A.apply_gradients(zip(self.a_grads, globalAC.actor.trainable_weights)) # local grads applies to global net - del tape # Drop the reference to the tape - - # @tf.function - def pull_global(self, globalAC): # run by a local, pull weights from the global nets - for l_p, g_p in zip(self.actor.trainable_weights, globalAC.actor.trainable_weights): - l_p.assign(g_p) - for l_p, g_p in zip(self.critic.trainable_weights, globalAC.critic.trainable_weights): - l_p.assign(g_p) - - def get_action(self, s): # run by a local - return self.actor(np.array([s])).numpy()[0] - - def get_action_greedy(self, s): - return self.actor(np.array([s]), greedy=True)[0].numpy() - - def save_ckpt(self, env_name): # save trained weights - save_model(self.actor, 'model_actor', 'A3C', env_name) - save_model(self.critic, 'model_critic', 'A3C', env_name) - - def load_ckpt(self, env_name): # load trained weights - load_model(self.actor, 'model_actor', 'A3C', env_name) - load_model(self.critic, 'model_critic', 'A3C', env_name) - - -class Worker(object): - def __init__(self, env, net_list, name, train_episodes, max_steps, gamma, update_itr, entropy_beta, - render, plot_func): - self.name = name - self.AC = ACNet(net_list, name, entropy_beta) - self.MAX_GLOBAL_EP = train_episodes - self.UPDATE_GLOBAL_ITER = update_itr - self.GAMMA = gamma - self.env = env - self.max_steps = max_steps - self.render = render - self.plot_func = plot_func - - def work(self, globalAC): - global COORD, GLOBAL_RUNNING_R, GLOBAL_EP, OPT_A, OPT_C, t0, SAVE_INTERVAL - total_step = 1 - save_cnt = 1 - buffer_s, buffer_a, buffer_r = [], [], [] - while not COORD.should_stop() and GLOBAL_EP < self.MAX_GLOBAL_EP: - s = self.env.reset() - ep_r = 0 - for epi_step in range(self.max_steps): - # visualize Worker_0 during training - if self.name == 'Worker_0' 
and total_step % 30 == 0 and self.render: - self.env.render() - s = s.astype('float32') # double to float - a = self.AC.get_action(s) - s_, r, done, _info = self.env.step(a) - - s_ = s_.astype('float32') # double to float - - ep_r += r - buffer_s.append(s) - buffer_a.append(a) - buffer_r.append(r) - - if total_step % self.UPDATE_GLOBAL_ITER == 0 or done: # update global and assign to local net - - if done: - v_s_ = 0 # terminal - else: - v_s_ = self.AC.critic(s_[np.newaxis, :])[0, 0] # reduce dim from 2 to 0 - - buffer_v_target = [] - - for r in buffer_r[::-1]: # reverse buffer r - v_s_ = r + self.GAMMA * v_s_ - buffer_v_target.append(v_s_) - - buffer_v_target.reverse() - buffer_s = buffer_s if len(buffer_s[0].shape) > 1 else np.vstack( - buffer_s) # no vstack for raw-pixel input - buffer_a, buffer_v_target = ( - np.vstack(buffer_a), np.vstack(buffer_v_target) - ) - - # update gradients on global network - self.AC.update_global(buffer_s, buffer_a, buffer_v_target.astype('float32'), globalAC) - buffer_s, buffer_a, buffer_r = [], [], [] - - # update local network from global network - self.AC.pull_global(globalAC) - - s = s_ - total_step += 1 - if self.name == 'Worker_0' and GLOBAL_EP >= save_cnt * SAVE_INTERVAL: - plot_save_log(GLOBAL_RUNNING_R, algorithm_name=self.name, env_name=self.env.spec.id) - globalAC.save_ckpt(env_name=self.env.spec.id) - save_cnt += 1 - if done: - break - - GLOBAL_RUNNING_R.append(ep_r) - if self.name == 'Worker_0' and self.plot_func is not None: - self.plot_func(GLOBAL_RUNNING_R) - print('{}, Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}' \ - .format(self.name, GLOBAL_EP, self.MAX_GLOBAL_EP, ep_r, time.time() - t0)) - GLOBAL_EP += 1 - - -class A3C(): - def __init__(self, net_list, optimizers_list, entropy_beta=0.005): - """ - :param entropy_beta: factor for entropy boosted exploration - """ - self.net_list = net_list - self.optimizers_list = optimizers_list - self.GLOBAL_AC = ACNet(self.net_list[0], 'global', entropy_beta) # we only need its params - self.entropy_beta = entropy_beta - self.name = 'A3C' - - def learn(self, env, train_episodes=1000, test_episodes=10, max_steps=150, render=False, n_workers=1, update_itr=10, - gamma=0.99, save_interval=500, mode='train', plot_func=None): - - """ - :param env: a list of same learning environments - :param train_episodes: total number of episodes for training - :param test_episodes: total number of episodes for testing - :param max_steps: maximum number of steps for one episode - :param render: render or not - :param n_workers: manually set number of workers - :param update_itr: update global policy after several episodes - :param gamma: reward discount factor - :param save_interval: timesteps for saving the weights and plotting the results - :param mode: train or test - :param plot_func: additional function for interactive module - """ - global COORD, GLOBAL_RUNNING_R, GLOBAL_EP, OPT_A, OPT_C, t0, SAVE_INTERVAL - SAVE_INTERVAL = save_interval - COORD = tf.train.Coordinator() - GLOBAL_RUNNING_R = [] - GLOBAL_EP = 0 # will increase during training, stop training when it >= MAX_GLOBAL_EP - N_WORKERS = n_workers if n_workers > 0 else multiprocessing.cpu_count() - - self.plot_func = plot_func - if mode == 'train': - # ============================= TRAINING =============================== - print('Training... 
| Algorithm: {} | Environment: {}'.format(self.name, env[0].spec.id)) - t0 = time.time() - with tf.device("/cpu:0"): - [OPT_A, OPT_C] = self.optimizers_list - - workers = [] - # Create worker - for i in range(N_WORKERS): - i_name = 'Worker_%i' % i # worker name - workers.append( - Worker(env[i], self.net_list[i + 1], i_name, train_episodes, max_steps, gamma, - update_itr, self.entropy_beta, render, plot_func)) - - # start TF threading - worker_threads = [] - for worker in workers: - # t = threading.Thread(target=worker.work) - job = lambda: worker.work(self.GLOBAL_AC) - t = threading.Thread(target=job) - t.start() - worker_threads.append(t) - - COORD.join(worker_threads) - - plot_save_log(GLOBAL_RUNNING_R, algorithm_name=self.name, env_name=env[0].spec.id) - self.GLOBAL_AC.save_ckpt(env_name=env[0].spec.id) - - elif mode == 'test': - # ============================= EVALUATION ============================= - env = env[0] # only need one env for test - self.GLOBAL_AC.load_ckpt(env_name=env.spec.id) - print('Testing... | Algorithm: {} | Environment: {}'.format(self.name, env.spec.id)) - frame_idx = 0 - for eps in range(test_episodes): - s = env.reset() - rall = 0 - for step in range(max_steps): - env.render() - frame_idx += 1 - s = s.astype('float32') # double to float - a = self.GLOBAL_AC.get_action_greedy(s) - s, r, d, _ = env.step(a) - if render: - env.render() - rall += r - if d: - break - - print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format( - eps, test_episodes, rall, time.time() - t0)) - - elif mode is not 'test': - print('unknow mode type') +""" +Asynchronous Advantage Actor Critic (A3C) with Continuous Action Space. + +Actor Critic History +---------------------- +A3C > DDPG (for continuous action space) > AC + +Advantage +---------- +Train faster and more stable than AC. + +Disadvantage +------------- +Have bias. + +Reference +---------- +Original Paper: https://arxiv.org/pdf/1602.01783.pdf +MorvanZhou's tutorial: https://morvanzhou.github.io/tutorials/ +MorvanZhou's code: https://github.com/MorvanZhou/Reinforcement-learning-with-tensorflow/ +Environment +----------- +BipedalWalker-v2 : https://gym.openai.com/envs/BipedalWalker-v2 + +Reward is given for moving forward, total 300+ points up to the far end. +If the robot falls, it gets -100. Applying motor torque costs a small amount of +points, more optimal agent will get better score. State consists of hull angle +speed, angular velocity, horizontal speed, vertical speed, position of joints +and joints angular speed, legs contact with ground, and 10 lidar rangefinder +measurements. There's no coordinates in the state vector. + +Prerequisites +-------------- +tensorflow 2.0.0a0 +tensorflow-probability 0.6.0 +tensorlayer 2.0.0 +&& +pip install box2d box2d-kengz --user + +""" + +import multiprocessing +import threading +import time + +from rlzoo.common.utils import * +from rlzoo.common.buffer import * + + +# tl.logging.set_verbosity(tl.logging.DEBUG) +################### Asynchronous Advantage Actor Critic (A3C) #################################### +class ACNet(object): + + def __init__(self, net_list, scope, entropy_beta): + self.ENTROPY_BETA = entropy_beta + self.actor, self.critic = net_list + + # @tf.function # shouldn't use here! 
+ def update_global( + self, buffer_s, buffer_a, buffer_v_target, globalAC + ): # refer to the global Actor-Crtic network for updating it with samples + """ update the global critic """ + with tf.GradientTape() as tape: + self.v = self.critic(buffer_s) + self.v_target = buffer_v_target + td = tf.subtract(self.v_target, self.v, name='TD_error') + self.c_loss = tf.reduce_mean(tf.square(td)) + self.c_grads = tape.gradient(self.c_loss, self.critic.trainable_weights) + OPT_C.apply_gradients(zip(self.c_grads, globalAC.critic.trainable_weights)) # local grads applies to global net + del tape # Drop the reference to the tape + """ update the global actor """ + with tf.GradientTape() as tape: + self.actor(buffer_s) + self.a_his = buffer_a # float32 + log_prob = self.actor.policy_dist.logp(self.a_his) + exp_v = log_prob * td # td is from the critic part, no gradients for it + entropy = self.actor.policy_dist.entropy() # encourage exploration + self.exp_v = self.ENTROPY_BETA * entropy + exp_v + self.a_loss = tf.reduce_mean(-self.exp_v) + self.a_grads = tape.gradient(self.a_loss, self.actor.trainable_weights) + OPT_A.apply_gradients(zip(self.a_grads, globalAC.actor.trainable_weights)) # local grads applies to global net + del tape # Drop the reference to the tape + + # @tf.function + def pull_global(self, globalAC): # run by a local, pull weights from the global nets + for l_p, g_p in zip(self.actor.trainable_weights, globalAC.actor.trainable_weights): + l_p.assign(g_p) + for l_p, g_p in zip(self.critic.trainable_weights, globalAC.critic.trainable_weights): + l_p.assign(g_p) + + def get_action(self, s): # run by a local + return self.actor(np.array([s])).numpy()[0] + + def get_action_greedy(self, s): + return self.actor(np.array([s]), greedy=True)[0].numpy() + + def save_ckpt(self, env_name): # save trained weights + save_model(self.actor, 'model_actor', 'A3C', env_name) + save_model(self.critic, 'model_critic', 'A3C', env_name) + + def load_ckpt(self, env_name): # load trained weights + load_model(self.actor, 'model_actor', 'A3C', env_name) + load_model(self.critic, 'model_critic', 'A3C', env_name) + + +class Worker(object): + def __init__(self, env, net_list, name, train_episodes, max_steps, gamma, update_itr, entropy_beta, + render, plot_func): + self.name = name + self.AC = ACNet(net_list, name, entropy_beta) + self.MAX_GLOBAL_EP = train_episodes + self.UPDATE_GLOBAL_ITER = update_itr + self.GAMMA = gamma + self.env = env + self.max_steps = max_steps + self.render = render + self.plot_func = plot_func + + def work(self, globalAC): + global COORD, GLOBAL_RUNNING_R, GLOBAL_EP, OPT_A, OPT_C, t0, SAVE_INTERVAL + total_step = 1 + save_cnt = 1 + buffer_s, buffer_a, buffer_r = [], [], [] + while not COORD.should_stop() and GLOBAL_EP < self.MAX_GLOBAL_EP: + s = self.env.reset() + ep_r = 0 + for epi_step in range(self.max_steps): + # visualize Worker_0 during training + if self.name == 'Worker_0' and total_step % 30 == 0 and self.render: + self.env.render() + s = s.astype('float32') # double to float + a = self.AC.get_action(s) + s_, r, done, _info = self.env.step(a) + + s_ = s_.astype('float32') # double to float + + ep_r += r + buffer_s.append(s) + buffer_a.append(a) + buffer_r.append(r) + + if total_step % self.UPDATE_GLOBAL_ITER == 0 or done: # update global and assign to local net + + if done: + v_s_ = 0 # terminal + else: + v_s_ = self.AC.critic(s_[np.newaxis, :])[0, 0] # reduce dim from 2 to 0 + + buffer_v_target = [] + + for r in buffer_r[::-1]: # reverse buffer r + v_s_ = r + self.GAMMA * v_s_ + 
buffer_v_target.append(v_s_) + + buffer_v_target.reverse() + buffer_s = buffer_s if len(buffer_s[0].shape) > 1 else np.vstack( + buffer_s) # no vstack for raw-pixel input + buffer_a, buffer_v_target = ( + np.vstack(buffer_a), np.vstack(buffer_v_target) + ) + + # update gradients on global network + self.AC.update_global(buffer_s, buffer_a, buffer_v_target.astype('float32'), globalAC) + buffer_s, buffer_a, buffer_r = [], [], [] + + # update local network from global network + self.AC.pull_global(globalAC) + + s = s_ + total_step += 1 + if self.name == 'Worker_0' and GLOBAL_EP >= save_cnt * SAVE_INTERVAL: + plot_save_log(GLOBAL_RUNNING_R, algorithm_name=self.name, env_name=self.env.spec.id) + globalAC.save_ckpt(env_name=self.env.spec.id) + save_cnt += 1 + if done: + break + + GLOBAL_RUNNING_R.append(ep_r) + if self.name == 'Worker_0' and self.plot_func is not None: + self.plot_func(GLOBAL_RUNNING_R) + print('{}, Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}' \ + .format(self.name, GLOBAL_EP, self.MAX_GLOBAL_EP, ep_r, time.time() - t0)) + GLOBAL_EP += 1 + + +class A3C(): + def __init__(self, net_list, optimizers_list, entropy_beta=0.005): + """ + :param entropy_beta: factor for entropy boosted exploration + """ + self.net_list = net_list + self.optimizers_list = optimizers_list + self.GLOBAL_AC = ACNet(self.net_list[0], 'global', entropy_beta) # we only need its params + self.entropy_beta = entropy_beta + self.name = 'A3C' + + def learn(self, env, train_episodes=1000, test_episodes=10, max_steps=150, render=False, n_workers=1, update_itr=10, + gamma=0.99, save_interval=500, mode='train', plot_func=None): + + """ + :param env: a list of same learning environments + :param train_episodes: total number of episodes for training + :param test_episodes: total number of episodes for testing + :param max_steps: maximum number of steps for one episode + :param render: render or not + :param n_workers: manually set number of workers + :param update_itr: update global policy after several episodes + :param gamma: reward discount factor + :param save_interval: timesteps for saving the weights and plotting the results + :param mode: train or test + :param plot_func: additional function for interactive module + """ + global COORD, GLOBAL_RUNNING_R, GLOBAL_EP, OPT_A, OPT_C, t0, SAVE_INTERVAL + SAVE_INTERVAL = save_interval + COORD = tf.train.Coordinator() + GLOBAL_RUNNING_R = [] + GLOBAL_EP = 0 # will increase during training, stop training when it >= MAX_GLOBAL_EP + N_WORKERS = n_workers if n_workers > 0 else multiprocessing.cpu_count() + + self.plot_func = plot_func + if mode == 'train': + # ============================= TRAINING =============================== + print('Training... 
| Algorithm: {} | Environment: {}'.format(self.name, env[0].spec.id)) + t0 = time.time() + with tf.device("/cpu:0"): + [OPT_A, OPT_C] = self.optimizers_list + + workers = [] + # Create worker + for i in range(N_WORKERS): + i_name = 'Worker_%i' % i # worker name + workers.append( + Worker(env[i], self.net_list[i + 1], i_name, train_episodes, max_steps, gamma, + update_itr, self.entropy_beta, render, plot_func)) + + # start TF threading + worker_threads = [] + for worker in workers: + # t = threading.Thread(target=worker.work) + job = lambda: worker.work(self.GLOBAL_AC) + t = threading.Thread(target=job) + t.start() + worker_threads.append(t) + + COORD.join(worker_threads) + + plot_save_log(GLOBAL_RUNNING_R, algorithm_name=self.name, env_name=env[0].spec.id) + self.GLOBAL_AC.save_ckpt(env_name=env[0].spec.id) + + elif mode == 'test': + # ============================= EVALUATION ============================= + env = env[0] # only need one env for test + self.GLOBAL_AC.load_ckpt(env_name=env.spec.id) + print('Testing... | Algorithm: {} | Environment: {}'.format(self.name, env.spec.id)) + frame_idx = 0 + for eps in range(test_episodes): + s = env.reset() + rall = 0 + for step in range(max_steps): + env.render() + frame_idx += 1 + s = s.astype('float32') # double to float + a = self.GLOBAL_AC.get_action_greedy(s) + s, r, d, _ = env.step(a) + if render: + env.render() + rall += r + if d: + break + + print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format( + eps, test_episodes, rall, time.time() - t0)) + + elif mode is not 'test': + print('unknow mode type') diff --git a/rlzoo/algorithms/a3c/default.py b/rlzoo/algorithms/a3c/default.py old mode 100644 new mode 100755 index 3cb373c..ec987c9 --- a/rlzoo/algorithms/a3c/default.py +++ b/rlzoo/algorithms/a3c/default.py @@ -1,377 +1,377 @@ -from rlzoo.common.policy_networks import * -from rlzoo.common.value_networks import * -from rlzoo.common.utils import set_seed - -""" -full list of algorithm parameters (alg_params) ------------------------------------------------ -net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization -optimizers_list: a list of optimizers for all networks and differentiable variables -entropy_beta: factor for entropy boosted exploration ------------------------------------------------ - -full list of learning parameters (learn_params) ------------------------------------------------ -train_episodes: total number of episodes for training -test_episodes: total number of episodes for testing -max_steps: maximum number of steps for one episode -n_workers: manually set number of workers -update_itr: update global policy after several episodes -gamma: reward discount factor -save_interval: timesteps for saving the weights and plotting the results -mode: train or test ------------------------------------------------- -""" - - -def atari(env, default_seed=True): - if default_seed: - assert isinstance(env, list) - seed = np.arange(len(env)).tolist() # a list of seeds for each env - set_seed(seed, env) # reproducible - - # for multi-threading - if isinstance(env, list): # judge if multiple envs are passed in for parallel computing - num_env = len(env) # number of envs passed in - env = env[0] # take one of the env as they are all the same - else: - num_env = 1 - - alg_params = dict( - entropy_beta=0.005 - ) - if alg_params.get('net_list') is None: - num_hidden_layer = 4 # number of hidden layers for the networks - hidden_dim = 64 # dimension of hidden layers for the 
networks - net_list2 = [] # networks list of networks list, each item for single thread/process - for _ in range(num_env + 1): # additional one for global - with tf.name_scope('AC'): - with tf.name_scope('Critic'): - critic = ValueNetwork(env.observation_space, hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Actor'): - actor = StochasticPolicyNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - net_list = [actor, critic] - net_list2.append(net_list) - alg_params['net_list'] = net_list2 - if alg_params.get('optimizers_list') is None: - a_lr, c_lr = 1e-3, 1e-3 # a_lr: learning rate of the actor; c_lr: learning rate of the critic - a_optimizer = tf.optimizers.RMSprop(a_lr, name='RMS_optimizer_actor') - c_optimizer = tf.optimizers.RMSprop(c_lr, name='RMS_optimizer_critic') - optimizers_list = [a_optimizer, c_optimizer] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict( - max_steps=100, - gamma=0.9, - train_episodes=1000, - test_episodes=10, - save_interval=100, - update_itr=10, - n_workers=num_env - ) - - return alg_params, learn_params - - -def classic_control(env, default_seed=True): - if default_seed: - assert isinstance(env, list) - seed = np.arange(len(env)).tolist() # a list of seeds for each env - set_seed(seed, env) # reproducible - - # for multi-threading - if isinstance(env, list): # judge if multiple envs are passed in for parallel computing - num_env = len(env) # number of envs passed in - env = env[0] # take one of the env as they are all the same - else: - num_env = 1 - - alg_params = dict( - entropy_beta=0.005 - ) - if alg_params.get('net_list') is None: - num_hidden_layer = 4 # number of hidden layers for the networks - hidden_dim = 64 # dimension of hidden layers for the networks - net_list2 = [] # networks list of networks list, each item for single thread/process - for _ in range(num_env + 1): # additional one for global - with tf.name_scope('AC'): - with tf.name_scope('Critic'): - critic = ValueNetwork(env.observation_space, hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Actor'): - actor = StochasticPolicyNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - net_list = [actor, critic] - net_list2.append(net_list) - alg_params['net_list'] = net_list2 - if alg_params.get('optimizers_list') is None: - a_lr, c_lr = 1e-3, 1e-3 # a_lr: learning rate of the actor; c_lr: learning rate of the critic - a_optimizer = tf.optimizers.RMSprop(a_lr, name='RMS_optimizer_actor') - c_optimizer = tf.optimizers.RMSprop(c_lr, name='RMS_optimizer_critic') - optimizers_list = [a_optimizer, c_optimizer] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict( - max_steps=100, - gamma=0.9, - train_episodes=1000, - test_episodes=10, - save_interval=100, - update_itr=10, - n_workers=num_env - ) - - return alg_params, learn_params - - -def box2d(env, default_seed=True): - if default_seed: - assert isinstance(env, list) - seed = np.arange(len(env)).tolist() # a list of seeds for each env - set_seed(seed, env) # reproducible - - # for multi-threading - if isinstance(env, list): # judge if multiple envs are passed in for parallel computing - num_env = len(env) # number of envs passed in - env = env[0] # take one of the env as they are all the same - else: - num_env = 1 - - alg_params = dict( - entropy_beta=0.005 - ) - if alg_params.get('net_list') is None: - num_hidden_layer = 4 # number of hidden layers for the networks - 
hidden_dim = 64 # dimension of hidden layers for the networks - net_list2 = [] # networks list of networks list, each item for single thread/process - for _ in range(num_env + 1): # additional one for global - with tf.name_scope('AC'): - with tf.name_scope('Critic'): - critic = ValueNetwork(env.observation_space, hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Actor'): - actor = StochasticPolicyNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - net_list = [actor, critic] - net_list2.append(net_list) - alg_params['net_list'] = net_list2 - if alg_params.get('optimizers_list') is None: - a_lr, c_lr = 1e-3, 1e-3 # a_lr: learning rate of the actor; c_lr: learning rate of the critic - a_optimizer = tf.optimizers.RMSprop(a_lr, name='RMS_optimizer_actor') - c_optimizer = tf.optimizers.RMSprop(c_lr, name='RMS_optimizer_critic') - optimizers_list = [a_optimizer, c_optimizer] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict( - max_steps=20000, - gamma=0.9, - train_episodes=20000, - test_episodes=10, - save_interval=500, - update_itr=10, - n_workers=num_env - ) - - return alg_params, learn_params - - -def mujoco(env, default_seed=True): - if default_seed: - assert isinstance(env, list) - seed = np.arange(len(env)).tolist() # a list of seeds for each env - set_seed(seed, env) # reproducible - - # for multi-threading - if isinstance(env, list): # judge if multiple envs are passed in for parallel computing - num_env = len(env) # number of envs passed in - env = env[0] # take one of the env as they are all the same - else: - num_env = 1 - - alg_params = dict( - entropy_beta=0.005 - ) - if alg_params.get('net_list') is None: - num_hidden_layer = 4 # number of hidden layers for the networks - hidden_dim = 64 # dimension of hidden layers for the networks - net_list2 = [] # networks list of networks list, each item for single thread/process - for _ in range(num_env + 1): # additional one for global - with tf.name_scope('AC'): - with tf.name_scope('Critic'): - critic = ValueNetwork(env.observation_space, hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Actor'): - actor = StochasticPolicyNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - net_list = [actor, critic] - net_list2.append(net_list) - alg_params['net_list'] = net_list2 - if alg_params.get('optimizers_list') is None: - a_lr, c_lr = 1e-3, 1e-3 # a_lr: learning rate of the actor; c_lr: learning rate of the critic - a_optimizer = tf.optimizers.RMSprop(a_lr, name='RMS_optimizer_actor') - c_optimizer = tf.optimizers.RMSprop(c_lr, name='RMS_optimizer_critic') - optimizers_list = [a_optimizer, c_optimizer] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict( - max_steps=100, - gamma=0.9, - train_episodes=1000, - test_episodes=10, - save_interval=100, - update_itr=10, - n_workers=num_env - ) - - return alg_params, learn_params - - -def robotics(env, default_seed=True): - if default_seed: - assert isinstance(env, list) - seed = np.arange(len(env)).tolist() # a list of seeds for each env - set_seed(seed, env) # reproducible - - # for multi-threading - if isinstance(env, list): # judge if multiple envs are passed in for parallel computing - num_env = len(env) # number of envs passed in - env = env[0] # take one of the env as they are all the same - else: - num_env = 1 - - alg_params = dict( - entropy_beta=0.005 - ) - if alg_params.get('net_list') is None: - 
num_hidden_layer = 4 # number of hidden layers for the networks - hidden_dim = 64 # dimension of hidden layers for the networks - net_list2 = [] # networks list of networks list, each item for single thread/process - for _ in range(num_env + 1): # additional one for global - with tf.name_scope('AC'): - with tf.name_scope('Critic'): - critic = ValueNetwork(env.observation_space, hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Actor'): - actor = StochasticPolicyNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - net_list = [actor, critic] - net_list2.append(net_list) - alg_params['net_list'] = net_list2 - if alg_params.get('optimizers_list') is None: - a_lr, c_lr = 1e-3, 1e-3 # a_lr: learning rate of the actor; c_lr: learning rate of the critic - a_optimizer = tf.optimizers.RMSprop(a_lr, name='RMS_optimizer_actor') - c_optimizer = tf.optimizers.RMSprop(c_lr, name='RMS_optimizer_critic') - optimizers_list = [a_optimizer, c_optimizer] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict( - max_steps=100, - gamma=0.9, - train_episodes=1000, - test_episodes=10, - save_interval=100, - update_itr=10, - n_workers=num_env - - ) - - return alg_params, learn_params - - -def dm_control(env, default_seed=True): - if default_seed: - assert isinstance(env, list) - seed = np.arange(len(env)).tolist() # a list of seeds for each env - set_seed(seed, env) # reproducible - - # for multi-threading - if isinstance(env, list): # judge if multiple envs are passed in for parallel computing - num_env = len(env) # number of envs passed in - env = env[0] # take one of the env as they are all the same - else: - num_env = 1 - - alg_params = dict( - entropy_beta=0.005 - ) - if alg_params.get('net_list') is None: - num_hidden_layer = 4 # number of hidden layers for the networks - hidden_dim = 64 # dimension of hidden layers for the networks - net_list2 = [] # networks list of networks list, each item for single thread/process - for _ in range(num_env + 1): # additional one for global - with tf.name_scope('AC'): - with tf.name_scope('Critic'): - critic = ValueNetwork(env.observation_space, hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Actor'): - actor = StochasticPolicyNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - net_list = [actor, critic] - net_list2.append(net_list) - alg_params['net_list'] = net_list2 - if alg_params.get('optimizers_list') is None: - a_lr, c_lr = 1e-3, 1e-3 # a_lr: learning rate of the actor; c_lr: learning rate of the critic - a_optimizer = tf.optimizers.RMSprop(a_lr, name='RMS_optimizer_actor') - c_optimizer = tf.optimizers.RMSprop(c_lr, name='RMS_optimizer_critic') - optimizers_list = [a_optimizer, c_optimizer] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict( - max_steps=100, - gamma=0.9, - train_episodes=1000, - test_episodes=10, - save_interval=100, - update_itr=10, - n_workers=num_env - - ) - - return alg_params, learn_params - - -def rlbench(env, default_seed=True): - if default_seed: - assert isinstance(env, list) - seed = np.arange(len(env)).tolist() # a list of seeds for each env - set_seed(seed, env) # reproducible - - # for multi-threading - if isinstance(env, list): # judge if multiple envs are passed in for parallel computing - num_env = len(env) # number of envs passed in - env = env[0] # take one of the env as they are all the same - else: - num_env = 1 - - alg_params = dict( - 
entropy_beta=0.005 - ) - if alg_params.get('net_list') is None: - num_hidden_layer = 4 # number of hidden layers for the networks - hidden_dim = 64 # dimension of hidden layers for the networks - net_list2 = [] # networks list of networks list, each item for single thread/process - for _ in range(num_env + 1): # additional one for global - with tf.name_scope('AC'): - with tf.name_scope('Critic'): - critic = ValueNetwork(env.observation_space, hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Actor'): - actor = StochasticPolicyNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - net_list = [actor, critic] - net_list2.append(net_list) - alg_params['net_list'] = net_list2 - if alg_params.get('optimizers_list') is None: - a_lr, c_lr = 1e-3, 1e-3 # a_lr: learning rate of the actor; c_lr: learning rate of the critic - a_optimizer = tf.optimizers.RMSprop(a_lr, name='RMS_optimizer_actor') - c_optimizer = tf.optimizers.RMSprop(c_lr, name='RMS_optimizer_critic') - optimizers_list = [a_optimizer, c_optimizer] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict( - max_steps=100, - gamma=0.9, - train_episodes=1000, - test_episodes=10, - save_interval=100, - update_itr=10, - n_workers=num_env - - ) - +from rlzoo.common.policy_networks import * +from rlzoo.common.value_networks import * +from rlzoo.common.utils import set_seed + +""" +full list of algorithm parameters (alg_params) +----------------------------------------------- +net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization +optimizers_list: a list of optimizers for all networks and differentiable variables +entropy_beta: factor for entropy boosted exploration +----------------------------------------------- + +full list of learning parameters (learn_params) +----------------------------------------------- +train_episodes: total number of episodes for training +test_episodes: total number of episodes for testing +max_steps: maximum number of steps for one episode +n_workers: manually set number of workers +update_itr: update global policy after several episodes +gamma: reward discount factor +save_interval: timesteps for saving the weights and plotting the results +mode: train or test +------------------------------------------------ +""" + + +def atari(env, default_seed=True): + if default_seed: + assert isinstance(env, list) + seed = np.arange(len(env)).tolist() # a list of seeds for each env + set_seed(seed, env) # reproducible + + # for multi-threading + if isinstance(env, list): # judge if multiple envs are passed in for parallel computing + num_env = len(env) # number of envs passed in + env = env[0] # take one of the env as they are all the same + else: + num_env = 1 + + alg_params = dict( + entropy_beta=0.005 + ) + if alg_params.get('net_list') is None: + num_hidden_layer = 4 # number of hidden layers for the networks + hidden_dim = 64 # dimension of hidden layers for the networks + net_list2 = [] # networks list of networks list, each item for single thread/process + for _ in range(num_env + 1): # additional one for global + with tf.name_scope('AC'): + with tf.name_scope('Critic'): + critic = ValueNetwork(env.observation_space, hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Actor'): + actor = StochasticPolicyNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + net_list = [actor, critic] + net_list2.append(net_list) + 
alg_params['net_list'] = net_list2 + if alg_params.get('optimizers_list') is None: + a_lr, c_lr = 1e-3, 1e-3 # a_lr: learning rate of the actor; c_lr: learning rate of the critic + a_optimizer = tf.optimizers.RMSprop(a_lr, name='RMS_optimizer_actor') + c_optimizer = tf.optimizers.RMSprop(c_lr, name='RMS_optimizer_critic') + optimizers_list = [a_optimizer, c_optimizer] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict( + max_steps=100, + gamma=0.9, + train_episodes=1000, + test_episodes=10, + save_interval=100, + update_itr=10, + n_workers=num_env + ) + + return alg_params, learn_params + + +def classic_control(env, default_seed=True): + if default_seed: + assert isinstance(env, list) + seed = np.arange(len(env)).tolist() # a list of seeds for each env + set_seed(seed, env) # reproducible + + # for multi-threading + if isinstance(env, list): # judge if multiple envs are passed in for parallel computing + num_env = len(env) # number of envs passed in + env = env[0] # take one of the env as they are all the same + else: + num_env = 1 + + alg_params = dict( + entropy_beta=0.005 + ) + if alg_params.get('net_list') is None: + num_hidden_layer = 4 # number of hidden layers for the networks + hidden_dim = 64 # dimension of hidden layers for the networks + net_list2 = [] # networks list of networks list, each item for single thread/process + for _ in range(num_env + 1): # additional one for global + with tf.name_scope('AC'): + with tf.name_scope('Critic'): + critic = ValueNetwork(env.observation_space, hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Actor'): + actor = StochasticPolicyNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + net_list = [actor, critic] + net_list2.append(net_list) + alg_params['net_list'] = net_list2 + if alg_params.get('optimizers_list') is None: + a_lr, c_lr = 1e-3, 1e-3 # a_lr: learning rate of the actor; c_lr: learning rate of the critic + a_optimizer = tf.optimizers.RMSprop(a_lr, name='RMS_optimizer_actor') + c_optimizer = tf.optimizers.RMSprop(c_lr, name='RMS_optimizer_critic') + optimizers_list = [a_optimizer, c_optimizer] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict( + max_steps=100, + gamma=0.9, + train_episodes=1000, + test_episodes=10, + save_interval=100, + update_itr=10, + n_workers=num_env + ) + + return alg_params, learn_params + + +def box2d(env, default_seed=True): + if default_seed: + assert isinstance(env, list) + seed = np.arange(len(env)).tolist() # a list of seeds for each env + set_seed(seed, env) # reproducible + + # for multi-threading + if isinstance(env, list): # judge if multiple envs are passed in for parallel computing + num_env = len(env) # number of envs passed in + env = env[0] # take one of the env as they are all the same + else: + num_env = 1 + + alg_params = dict( + entropy_beta=0.005 + ) + if alg_params.get('net_list') is None: + num_hidden_layer = 4 # number of hidden layers for the networks + hidden_dim = 64 # dimension of hidden layers for the networks + net_list2 = [] # networks list of networks list, each item for single thread/process + for _ in range(num_env + 1): # additional one for global + with tf.name_scope('AC'): + with tf.name_scope('Critic'): + critic = ValueNetwork(env.observation_space, hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Actor'): + actor = StochasticPolicyNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + 
net_list = [actor, critic] + net_list2.append(net_list) + alg_params['net_list'] = net_list2 + if alg_params.get('optimizers_list') is None: + a_lr, c_lr = 1e-3, 1e-3 # a_lr: learning rate of the actor; c_lr: learning rate of the critic + a_optimizer = tf.optimizers.RMSprop(a_lr, name='RMS_optimizer_actor') + c_optimizer = tf.optimizers.RMSprop(c_lr, name='RMS_optimizer_critic') + optimizers_list = [a_optimizer, c_optimizer] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict( + max_steps=20000, + gamma=0.9, + train_episodes=20000, + test_episodes=10, + save_interval=500, + update_itr=10, + n_workers=num_env + ) + + return alg_params, learn_params + + +def mujoco(env, default_seed=True): + if default_seed: + assert isinstance(env, list) + seed = np.arange(len(env)).tolist() # a list of seeds for each env + set_seed(seed, env) # reproducible + + # for multi-threading + if isinstance(env, list): # judge if multiple envs are passed in for parallel computing + num_env = len(env) # number of envs passed in + env = env[0] # take one of the env as they are all the same + else: + num_env = 1 + + alg_params = dict( + entropy_beta=0.005 + ) + if alg_params.get('net_list') is None: + num_hidden_layer = 4 # number of hidden layers for the networks + hidden_dim = 64 # dimension of hidden layers for the networks + net_list2 = [] # networks list of networks list, each item for single thread/process + for _ in range(num_env + 1): # additional one for global + with tf.name_scope('AC'): + with tf.name_scope('Critic'): + critic = ValueNetwork(env.observation_space, hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Actor'): + actor = StochasticPolicyNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + net_list = [actor, critic] + net_list2.append(net_list) + alg_params['net_list'] = net_list2 + if alg_params.get('optimizers_list') is None: + a_lr, c_lr = 1e-3, 1e-3 # a_lr: learning rate of the actor; c_lr: learning rate of the critic + a_optimizer = tf.optimizers.RMSprop(a_lr, name='RMS_optimizer_actor') + c_optimizer = tf.optimizers.RMSprop(c_lr, name='RMS_optimizer_critic') + optimizers_list = [a_optimizer, c_optimizer] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict( + max_steps=100, + gamma=0.9, + train_episodes=1000, + test_episodes=10, + save_interval=100, + update_itr=10, + n_workers=num_env + ) + + return alg_params, learn_params + + +def robotics(env, default_seed=True): + if default_seed: + assert isinstance(env, list) + seed = np.arange(len(env)).tolist() # a list of seeds for each env + set_seed(seed, env) # reproducible + + # for multi-threading + if isinstance(env, list): # judge if multiple envs are passed in for parallel computing + num_env = len(env) # number of envs passed in + env = env[0] # take one of the env as they are all the same + else: + num_env = 1 + + alg_params = dict( + entropy_beta=0.005 + ) + if alg_params.get('net_list') is None: + num_hidden_layer = 4 # number of hidden layers for the networks + hidden_dim = 64 # dimension of hidden layers for the networks + net_list2 = [] # networks list of networks list, each item for single thread/process + for _ in range(num_env + 1): # additional one for global + with tf.name_scope('AC'): + with tf.name_scope('Critic'): + critic = ValueNetwork(env.observation_space, hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Actor'): + actor = StochasticPolicyNetwork(env.observation_space, env.action_space, + 
hidden_dim_list=num_hidden_layer * [hidden_dim]) + net_list = [actor, critic] + net_list2.append(net_list) + alg_params['net_list'] = net_list2 + if alg_params.get('optimizers_list') is None: + a_lr, c_lr = 1e-3, 1e-3 # a_lr: learning rate of the actor; c_lr: learning rate of the critic + a_optimizer = tf.optimizers.RMSprop(a_lr, name='RMS_optimizer_actor') + c_optimizer = tf.optimizers.RMSprop(c_lr, name='RMS_optimizer_critic') + optimizers_list = [a_optimizer, c_optimizer] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict( + max_steps=100, + gamma=0.9, + train_episodes=1000, + test_episodes=10, + save_interval=100, + update_itr=10, + n_workers=num_env + + ) + + return alg_params, learn_params + + +def dm_control(env, default_seed=True): + if default_seed: + assert isinstance(env, list) + seed = np.arange(len(env)).tolist() # a list of seeds for each env + set_seed(seed, env) # reproducible + + # for multi-threading + if isinstance(env, list): # judge if multiple envs are passed in for parallel computing + num_env = len(env) # number of envs passed in + env = env[0] # take one of the env as they are all the same + else: + num_env = 1 + + alg_params = dict( + entropy_beta=0.005 + ) + if alg_params.get('net_list') is None: + num_hidden_layer = 4 # number of hidden layers for the networks + hidden_dim = 64 # dimension of hidden layers for the networks + net_list2 = [] # networks list of networks list, each item for single thread/process + for _ in range(num_env + 1): # additional one for global + with tf.name_scope('AC'): + with tf.name_scope('Critic'): + critic = ValueNetwork(env.observation_space, hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Actor'): + actor = StochasticPolicyNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + net_list = [actor, critic] + net_list2.append(net_list) + alg_params['net_list'] = net_list2 + if alg_params.get('optimizers_list') is None: + a_lr, c_lr = 1e-3, 1e-3 # a_lr: learning rate of the actor; c_lr: learning rate of the critic + a_optimizer = tf.optimizers.RMSprop(a_lr, name='RMS_optimizer_actor') + c_optimizer = tf.optimizers.RMSprop(c_lr, name='RMS_optimizer_critic') + optimizers_list = [a_optimizer, c_optimizer] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict( + max_steps=100, + gamma=0.9, + train_episodes=1000, + test_episodes=10, + save_interval=100, + update_itr=10, + n_workers=num_env + + ) + + return alg_params, learn_params + + +def rlbench(env, default_seed=True): + if default_seed: + assert isinstance(env, list) + seed = np.arange(len(env)).tolist() # a list of seeds for each env + set_seed(seed, env) # reproducible + + # for multi-threading + if isinstance(env, list): # judge if multiple envs are passed in for parallel computing + num_env = len(env) # number of envs passed in + env = env[0] # take one of the env as they are all the same + else: + num_env = 1 + + alg_params = dict( + entropy_beta=0.005 + ) + if alg_params.get('net_list') is None: + num_hidden_layer = 4 # number of hidden layers for the networks + hidden_dim = 64 # dimension of hidden layers for the networks + net_list2 = [] # networks list of networks list, each item for single thread/process + for _ in range(num_env + 1): # additional one for global + with tf.name_scope('AC'): + with tf.name_scope('Critic'): + critic = ValueNetwork(env.observation_space, hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Actor'): + actor = 
StochasticPolicyNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + net_list = [actor, critic] + net_list2.append(net_list) + alg_params['net_list'] = net_list2 + if alg_params.get('optimizers_list') is None: + a_lr, c_lr = 1e-3, 1e-3 # a_lr: learning rate of the actor; c_lr: learning rate of the critic + a_optimizer = tf.optimizers.RMSprop(a_lr, name='RMS_optimizer_actor') + c_optimizer = tf.optimizers.RMSprop(c_lr, name='RMS_optimizer_critic') + optimizers_list = [a_optimizer, c_optimizer] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict( + max_steps=100, + gamma=0.9, + train_episodes=1000, + test_episodes=10, + save_interval=100, + update_itr=10, + n_workers=num_env + + ) + return alg_params, learn_params \ No newline at end of file diff --git a/rlzoo/algorithms/a3c/run_a3c.py b/rlzoo/algorithms/a3c/run_a3c.py old mode 100644 new mode 100755 index 4684a26..fcb75ba --- a/rlzoo/algorithms/a3c/run_a3c.py +++ b/rlzoo/algorithms/a3c/run_a3c.py @@ -1,67 +1,67 @@ -from rlzoo.algorithms.a3c.a3c import A3C -from rlzoo.common.policy_networks import * -from rlzoo.common.value_networks import * -import gym - -""" load environment """ -env_id = 'BipedalWalker-v2' -env = gym.make(env_id).unwrapped -# env = DummyVecEnv([lambda: env]) # The algorithms require a vectorized/wrapped environment to run -action_shape = env.action_space.shape -state_shape = env.observation_space.shape -# reproducible -seed = 2 -np.random.seed(seed) -tf.random.set_seed(seed) -env.seed(seed) - -""" build networks for the algorithm """ -num_hidden_layer = 4 # number of hidden layers for the networks -hidden_dim = 64 # dimension of hidden layers for the networks -num_workers = 2 -net_list2 = [] -for i in range(num_workers + 1): - with tf.name_scope('A3C'): - with tf.name_scope('Actor'): - actor = StochasticPolicyNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Critic'): - critic = ValueNetwork(env.observation_space, hidden_dim_list=num_hidden_layer * [hidden_dim]) - net_list = [actor, critic] - net_list2.append(net_list) - -""" choose optimizers """ -actor_lr, critic_lr = 5e-5, 1e-4 # learning rate -a_optimizer = tf.optimizers.RMSprop(actor_lr) -c_optimizer = tf.optimizers.RMSprop(critic_lr) -optimizers_list = [a_optimizer, c_optimizer] - -model = A3C(net_list2, optimizers_list, entropy_beta=0.005) -""" -full list of arguments for the algorithm ----------------------------------------- -net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization -optimizers_list: a list of optimizers for all networks and differentiable variables -entropy_beta: factor for entropy boosted exploration -""" - -env_list = [] -for i in range(num_workers): - env_list.append(gym.make(env_id).unwrapped) -model.learn(env_list, train_episodes=20000, test_episodes=100, max_steps=20000, n_workers=num_workers, update_itr=10, - gamma=0.99, save_interval=500, mode='train') -""" -full list of parameters for training ---------------------------------------- -env_list: a list of same learning environments -train_episodes: total number of episodes for training -test_episodes: total number of episodes for testing -max_steps: maximum number of steps for one episode -n_workers: manually set number of workers -update_itr: update global policy after several episodes -gamma: reward discount factor -save_interval: timesteps for saving the weights and plotting the results -mode: 
train or test -""" -# test -model.learn(env_list, test_episodes=100, max_steps=20000, mode='test', render=True) +from rlzoo.algorithms.a3c.a3c import A3C +from rlzoo.common.policy_networks import * +from rlzoo.common.value_networks import * +import gym + +""" load environment """ +env_id = 'BipedalWalker-v2' +env = gym.make(env_id).unwrapped +# env = DummyVecEnv([lambda: env]) # The algorithms require a vectorized/wrapped environment to run +action_shape = env.action_space.shape +state_shape = env.observation_space.shape +# reproducible +seed = 2 +np.random.seed(seed) +tf.random.set_seed(seed) +env.seed(seed) + +""" build networks for the algorithm """ +num_hidden_layer = 4 # number of hidden layers for the networks +hidden_dim = 64 # dimension of hidden layers for the networks +num_workers = 2 +net_list2 = [] +for i in range(num_workers + 1): + with tf.name_scope('A3C'): + with tf.name_scope('Actor'): + actor = StochasticPolicyNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Critic'): + critic = ValueNetwork(env.observation_space, hidden_dim_list=num_hidden_layer * [hidden_dim]) + net_list = [actor, critic] + net_list2.append(net_list) + +""" choose optimizers """ +actor_lr, critic_lr = 5e-5, 1e-4 # learning rate +a_optimizer = tf.optimizers.RMSprop(actor_lr) +c_optimizer = tf.optimizers.RMSprop(critic_lr) +optimizers_list = [a_optimizer, c_optimizer] + +model = A3C(net_list2, optimizers_list, entropy_beta=0.005) +""" +full list of arguments for the algorithm +---------------------------------------- +net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization +optimizers_list: a list of optimizers for all networks and differentiable variables +entropy_beta: factor for entropy boosted exploration +""" + +env_list = [] +for i in range(num_workers): + env_list.append(gym.make(env_id).unwrapped) +model.learn(env_list, train_episodes=20000, test_episodes=100, max_steps=20000, n_workers=num_workers, update_itr=10, + gamma=0.99, save_interval=500, mode='train') +""" +full list of parameters for training +--------------------------------------- +env_list: a list of same learning environments +train_episodes: total number of episodes for training +test_episodes: total number of episodes for testing +max_steps: maximum number of steps for one episode +n_workers: manually set number of workers +update_itr: update global policy after several episodes +gamma: reward discount factor +save_interval: timesteps for saving the weights and plotting the results +mode: train or test +""" +# test +model.learn(env_list, test_episodes=100, max_steps=20000, mode='test', render=True) diff --git a/rlzoo/algorithms/ac/__init__.py b/rlzoo/algorithms/ac/__init__.py old mode 100644 new mode 100755 diff --git a/rlzoo/algorithms/ac/ac.py b/rlzoo/algorithms/ac/ac.py old mode 100644 new mode 100755 index 7fc1027..bf56db8 --- a/rlzoo/algorithms/ac/ac.py +++ b/rlzoo/algorithms/ac/ac.py @@ -1,187 +1,187 @@ -""" -Actor-Critic -------------- -It uses TD-error as the Advantage. - -Actor Critic History ----------------------- -A3C > DDPG > AC - -Advantage ----------- -AC converge faster than Policy Gradient. - -Disadvantage (IMPORTANT) ------------------------- -The Policy is oscillated (difficult to converge), DDPG can solve -this problem using advantage of DQN. 
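For clarity on the loop in `run_a3c.py` above: `net_list2` holds `num_workers + 1` actor-critic pairs, one pair per sampling worker plus one extra pair that presumably serves as the shared global network updated by `A3C`. A minimal sketch of that structure, reusing the RLzoo constructors shown in the script (the `tf.name_scope` wrappers are omitted here for brevity):

```python
import gym
from rlzoo.common.policy_networks import StochasticPolicyNetwork
from rlzoo.common.value_networks import ValueNetwork

env = gym.make('BipedalWalker-v2').unwrapped
num_workers, num_hidden_layer, hidden_dim = 2, 4, 64

net_list2 = []  # [[actor, critic], ...] with num_workers + 1 entries
for _ in range(num_workers + 1):
    actor = StochasticPolicyNetwork(env.observation_space, env.action_space,
                                    hidden_dim_list=num_hidden_layer * [hidden_dim])
    critic = ValueNetwork(env.observation_space,
                          hidden_dim_list=num_hidden_layer * [hidden_dim])
    net_list2.append([actor, critic])

assert len(net_list2) == num_workers + 1
```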
- -Reference ----------- -paper: https://papers.nips.cc/paper/1786-actor-critic-algorithms.pdf -View more on MorvanZhou's tutorial page: https://morvanzhou.github.io/tutorials/ -MorvanZhou's code: https://github.com/MorvanZhou/Reinforcement-learning-with-tensorflow/ - -Environment ------------- -CartPole-v0: https://gym.openai.com/envs/CartPole-v0 - -A pole is attached by an un-actuated joint to a cart, which moves along a -frictionless track. The system is controlled by applying a force of +1 or -1 -to the cart. The pendulum starts upright, and the goal is to prevent it from -falling over. - -A reward of +1 is provided for every timestep that the pole remains upright. -The episode ends when the pole is more than 15 degrees from vertical, or the -cart moves more than 2.4 units from the center. - - -Prerequisites --------------- -tensorflow >=2.0.0a0 -tensorlayer >=2.0.0 - -""" -import time - -import tensorlayer as tl - -from rlzoo.common.utils import * -from rlzoo.common.value_networks import * -from rlzoo.common.policy_networks import * - -tl.logging.set_verbosity(tl.logging.DEBUG) - - -############################### Actor-Critic #################################### -class AC: - def __init__(self, net_list, optimizers_list, gamma=0.9): - assert len(net_list) == 2 - assert len(optimizers_list) == 2 - self.name = 'AC' - self.actor, self.critic = net_list - assert isinstance(self.critic, ValueNetwork) - assert isinstance(self.actor, StochasticPolicyNetwork) - self.a_optimizer, self.c_optimizer = optimizers_list - self.GAMMA = gamma - - def update(self, s, a, r, s_): - # critic update - v_ = self.critic(np.array([s_])) - with tf.GradientTape() as tape: - v = self.critic(np.array([s])) - td_error = r + self.GAMMA * v_ - v # TD_error = r + lambd * V(newS) - V(S) - loss = tf.square(td_error) - grad = tape.gradient(loss, self.critic.trainable_weights) - self.c_optimizer.apply_gradients(zip(grad, self.critic.trainable_weights)) - - # actor update - with tf.GradientTape() as tape: - # _logits = self.actor(np.array([s])) - ## cross-entropy loss weighted by td-error (advantage), - # the cross-entropy mearsures the difference of two probability distributions: the predicted logits and sampled action distribution, - # then weighted by the td-error: small difference of real and predict actions for large td-error (advantage); and vice versa. 
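The various `default.py` modules touched by this patch (A3C above, AC and DDPG below) all follow the same contract: each environment-type function returns an `(alg_params, learn_params)` pair, where `alg_params` feeds the algorithm constructor and `learn_params` feeds its `learn()` method. A hedged sketch of how such defaults are typically consumed through RLzoo's high-level API; the exact import paths of `build_env` and `call_default_params` are assumptions here:

```python
from rlzoo.common.env_wrappers import build_env
from rlzoo.common.utils import call_default_params  # assumed helper location
from rlzoo.algorithms import AC                      # assumed re-export of the AC class

EnvName, EnvType = 'Pendulum-v0', 'classic_control'
env = build_env(EnvName, EnvType)

# default.py returns the two dicts; their keys map onto AC.__init__ and AC.learn respectively
alg_params, learn_params = call_default_params(env, EnvType, 'AC')
model = AC(**alg_params)                               # net_list, optimizers_list, gamma
model.learn(env=env, mode='train', **learn_params)     # max_steps, train_episodes, ...
model.learn(env=env, mode='test', render=True, **learn_params)
```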
- - _ = self.actor(np.array([s])) - neg_log_prob = self.actor.policy_dist.neglogp([a]) - _exp_v = tf.reduce_mean(neg_log_prob * td_error) - grad = tape.gradient(_exp_v, self.actor.trainable_weights) - self.a_optimizer.apply_gradients(zip(grad, self.actor.trainable_weights)) - return _exp_v - - def get_action(self, s): - return self.actor(np.array([s]))[0].numpy() - - def get_action_greedy(self, s): - return self.actor(np.array([s]), greedy=True)[0].numpy() - - def save_ckpt(self, env_name): # save trained weights - save_model(self.actor, 'model_actor', self.name, env_name) - save_model(self.critic, 'model_critic', self.name, env_name) - - def load_ckpt(self, env_name): # load trained weights - load_model(self.actor, 'model_actor', self.name, env_name) - load_model(self.critic, 'model_critic', self.name, env_name) - - def learn(self, env, train_episodes=1000, test_episodes=500, max_steps=200, - save_interval=100, mode='train', render=False, plot_func=None): - """ - :param env: learning environment - :param train_episodes: total number of episodes for training - :param test_episodes: total number of episodes for testing - :param max_steps: maximum number of steps for one episode - :param save_interval: time steps for saving the weights and plotting the results - :param mode: 'train' or 'test' - :param render: if true, visualize the environment - :param plot_func: additional function for interactive module - """ - - t0 = time.time() - if mode == 'train': - print('Training... | Algorithm: {} | Environment: {}'.format(self.name, env.spec.id)) - reward_buffer = [] - for i_episode in range(train_episodes): - s = env.reset() - ep_rs_sum = 0 # rewards of all steps - - for step in range(max_steps): - - if render: - env.render() - - a = self.get_action(s) - s_new, r, done, info = env.step(a) - ep_rs_sum += r - - try: - self.update(s, a, r, s_new) # learn Policy : true_gradient = grad[logPi(s, a) * td_error] - except KeyboardInterrupt: # if Ctrl+C at running actor.learn(), then save model, or exit if not at actor.learn() - self.save_ckpt(env_name=env.spec.id) - plot_save_log(reward_buffer, algorithm_name=self.name, env_name=env.spec.id) - - s = s_new - - if done: - break - - reward_buffer.append(ep_rs_sum) - if plot_func is not None: - plot_func(reward_buffer) - print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}' \ - .format(i_episode, train_episodes, ep_rs_sum, time.time() - t0)) - - if i_episode % save_interval == 0: - self.save_ckpt(env_name=env.spec.id) - plot_save_log(reward_buffer, algorithm_name=self.name, env_name=env.spec.id) - - self.save_ckpt(env_name=env.spec.id) - plot_save_log(reward_buffer, algorithm_name=self.name, env_name=env.spec.id) - - elif mode == 'test': - self.load_ckpt(env_name=env.spec.id) - print('Testing... | Algorithm: {} | Environment: {}'.format(self.name, env.spec.id)) - - reward_buffer = [] - for i_episode in range(test_episodes): - s = env.reset() - ep_rs_sum = 0 # rewards of all steps - for step in range(max_steps): - if render: env.render() - a = self.get_action_greedy(s) - s_new, r, done, info = env.step(a) - s_new = s_new - - ep_rs_sum += r - s = s_new - - if done: - break - - reward_buffer.append(ep_rs_sum) - if plot_func: - plot_func(reward_buffer) - print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format( - i_episode, test_episodes, ep_rs_sum, time.time() - t0)) - - elif mode is not 'test': - print('unknow mode type') +""" +Actor-Critic +------------- +It uses TD-error as the Advantage. 
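To make the "TD-error as Advantage" idea above concrete: the update below uses delta = r + gamma * V(s') - V(s); a positive delta means the transition turned out better than the critic predicted, a negative delta means worse. A toy calculation with illustrative numbers (plain Python, not part of the library):

```python
# Toy illustration of the TD-error advantage used in AC.update (numbers are made up).
gamma = 0.9                  # discount factor (the AC default)
r = 1.0                      # reward for taking action a in state s
v_s, v_s_next = 0.5, 0.6     # critic estimates V(s) and V(s')

td_error = r + gamma * v_s_next - v_s   # delta = r + gamma * V(s') - V(s) = 1.04
critic_loss = td_error ** 2             # the critic minimises the squared TD error
actor_weight = td_error                 # the actor scales -log pi(a|s) by delta
print(td_error, critic_loss)
```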
+ +Actor Critic History +---------------------- +A3C > DDPG > AC + +Advantage +---------- +AC converge faster than Policy Gradient. + +Disadvantage (IMPORTANT) +------------------------ +The Policy is oscillated (difficult to converge), DDPG can solve +this problem using advantage of DQN. + +Reference +---------- +paper: https://papers.nips.cc/paper/1786-actor-critic-algorithms.pdf +View more on MorvanZhou's tutorial page: https://morvanzhou.github.io/tutorials/ +MorvanZhou's code: https://github.com/MorvanZhou/Reinforcement-learning-with-tensorflow/ + +Environment +------------ +CartPole-v0: https://gym.openai.com/envs/CartPole-v0 + +A pole is attached by an un-actuated joint to a cart, which moves along a +frictionless track. The system is controlled by applying a force of +1 or -1 +to the cart. The pendulum starts upright, and the goal is to prevent it from +falling over. + +A reward of +1 is provided for every timestep that the pole remains upright. +The episode ends when the pole is more than 15 degrees from vertical, or the +cart moves more than 2.4 units from the center. + + +Prerequisites +-------------- +tensorflow >=2.0.0a0 +tensorlayer >=2.0.0 + +""" +import time + +import tensorlayer as tl + +from rlzoo.common.utils import * +from rlzoo.common.value_networks import * +from rlzoo.common.policy_networks import * + +tl.logging.set_verbosity(tl.logging.DEBUG) + + +############################### Actor-Critic #################################### +class AC: + def __init__(self, net_list, optimizers_list, gamma=0.9): + assert len(net_list) == 2 + assert len(optimizers_list) == 2 + self.name = 'AC' + self.actor, self.critic = net_list + assert isinstance(self.critic, ValueNetwork) + assert isinstance(self.actor, StochasticPolicyNetwork) + self.a_optimizer, self.c_optimizer = optimizers_list + self.GAMMA = gamma + + def update(self, s, a, r, s_): + # critic update + v_ = self.critic(np.array([s_])) + with tf.GradientTape() as tape: + v = self.critic(np.array([s])) + td_error = r + self.GAMMA * v_ - v # TD_error = r + lambd * V(newS) - V(S) + loss = tf.square(td_error) + grad = tape.gradient(loss, self.critic.trainable_weights) + self.c_optimizer.apply_gradients(zip(grad, self.critic.trainable_weights)) + + # actor update + with tf.GradientTape() as tape: + # _logits = self.actor(np.array([s])) + ## cross-entropy loss weighted by td-error (advantage), + # the cross-entropy mearsures the difference of two probability distributions: the predicted logits and sampled action distribution, + # then weighted by the td-error: small difference of real and predict actions for large td-error (advantage); and vice versa. 
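As a self-contained illustration of that comment: the actor minimises the negative log-probability of the sampled action weighted by the TD error, so actions with a large positive advantage are reinforced and actions with a negative advantage are suppressed. The sketch below uses a hypothetical toy policy head in plain TensorFlow, not RLzoo's `StochasticPolicyNetwork`:

```python
import tensorflow as tf

policy = tf.keras.layers.Dense(2)        # toy 2-action policy head producing logits
policy.build(input_shape=(None, 4))      # 4-dimensional toy observation
opt = tf.optimizers.Adam(1e-3)

s = tf.constant([[0.1, 0.2, 0.3, 0.4]])  # one observation
a = tf.constant([1])                     # index of the sampled action
td_error = tf.constant(1.04)             # advantage from the critic step, treated as a constant

with tf.GradientTape() as tape:
    logits = policy(s)
    # negative log-probability of the sampled action under the current policy
    neg_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=a, logits=logits)
    actor_loss = tf.reduce_mean(neg_log_prob * td_error)
grads = tape.gradient(actor_loss, policy.trainable_weights)
opt.apply_gradients(zip(grads, policy.trainable_weights))
```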
+ + _ = self.actor(np.array([s])) + neg_log_prob = self.actor.policy_dist.neglogp([a]) + _exp_v = tf.reduce_mean(neg_log_prob * td_error) + grad = tape.gradient(_exp_v, self.actor.trainable_weights) + self.a_optimizer.apply_gradients(zip(grad, self.actor.trainable_weights)) + return _exp_v + + def get_action(self, s): + return self.actor(np.array([s]))[0].numpy() + + def get_action_greedy(self, s): + return self.actor(np.array([s]), greedy=True)[0].numpy() + + def save_ckpt(self, env_name): # save trained weights + save_model(self.actor, 'model_actor', self.name, env_name) + save_model(self.critic, 'model_critic', self.name, env_name) + + def load_ckpt(self, env_name): # load trained weights + load_model(self.actor, 'model_actor', self.name, env_name) + load_model(self.critic, 'model_critic', self.name, env_name) + + def learn(self, env, train_episodes=1000, test_episodes=500, max_steps=200, + save_interval=100, mode='train', render=False, plot_func=None): + """ + :param env: learning environment + :param train_episodes: total number of episodes for training + :param test_episodes: total number of episodes for testing + :param max_steps: maximum number of steps for one episode + :param save_interval: time steps for saving the weights and plotting the results + :param mode: 'train' or 'test' + :param render: if true, visualize the environment + :param plot_func: additional function for interactive module + """ + + t0 = time.time() + if mode == 'train': + print('Training... | Algorithm: {} | Environment: {}'.format(self.name, env.spec.id)) + reward_buffer = [] + for i_episode in range(train_episodes): + s = env.reset() + ep_rs_sum = 0 # rewards of all steps + + for step in range(max_steps): + + if render: + env.render() + + a = self.get_action(s) + s_new, r, done, info = env.step(a) + ep_rs_sum += r + + try: + self.update(s, a, r, s_new) # learn Policy : true_gradient = grad[logPi(s, a) * td_error] + except KeyboardInterrupt: # if Ctrl+C at running actor.learn(), then save model, or exit if not at actor.learn() + self.save_ckpt(env_name=env.spec.id) + plot_save_log(reward_buffer, algorithm_name=self.name, env_name=env.spec.id) + + s = s_new + + if done: + break + + reward_buffer.append(ep_rs_sum) + if plot_func is not None: + plot_func(reward_buffer) + print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}' \ + .format(i_episode, train_episodes, ep_rs_sum, time.time() - t0)) + + if i_episode % save_interval == 0: + self.save_ckpt(env_name=env.spec.id) + plot_save_log(reward_buffer, algorithm_name=self.name, env_name=env.spec.id) + + self.save_ckpt(env_name=env.spec.id) + plot_save_log(reward_buffer, algorithm_name=self.name, env_name=env.spec.id) + + elif mode == 'test': + self.load_ckpt(env_name=env.spec.id) + print('Testing... 
| Algorithm: {} | Environment: {}'.format(self.name, env.spec.id)) + + reward_buffer = [] + for i_episode in range(test_episodes): + s = env.reset() + ep_rs_sum = 0 # rewards of all steps + for step in range(max_steps): + if render: env.render() + a = self.get_action_greedy(s) + s_new, r, done, info = env.step(a) + s_new = s_new + + ep_rs_sum += r + s = s_new + + if done: + break + + reward_buffer.append(ep_rs_sum) + if plot_func: + plot_func(reward_buffer) + print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format( + i_episode, test_episodes, ep_rs_sum, time.time() - t0)) + + elif mode is not 'test': + print('unknow mode type') diff --git a/rlzoo/algorithms/ac/default.py b/rlzoo/algorithms/ac/default.py old mode 100644 new mode 100755 index 7b56efb..3ea0ce2 --- a/rlzoo/algorithms/ac/default.py +++ b/rlzoo/algorithms/ac/default.py @@ -1,288 +1,288 @@ -import tensorflow as tf -import tensorlayer as tl - -from rlzoo.common import math_utils -from rlzoo.common.value_networks import * -from rlzoo.common.policy_networks import * -from gym import spaces -from rlzoo.common.utils import set_seed - -""" -full list of algorithm parameters (alg_params) ------------------------------------------------ -net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization -optimizers_list: a list of optimizers for all networks and differentiable variables -gamma: discounted factor of reward -action_range: scale of action values ------------------------------------------------ - -full list of learning parameters (learn_params) ------------------------------------------------ -env: learning environment -train_episodes: total number of episodes for training -test_episodes: total number of episodes for testing -max_steps: maximum number of steps for one episode -save_interval: time steps for saving the weights and plotting the results -mode: 'train' or 'test' -render: if true, visualize the environment ------------------------------------------------- -""" - - -def atari(env, default_seed=True): - if default_seed: - seed = 1 - set_seed(seed, env) # reproducible - - alg_params = dict( - gamma=0.9, - ) - if alg_params.get('net_list') is None: - num_hidden_layer = 2 # number of hidden layers for the networks - hidden_dim = 64 # dimension of hidden layers for the networks - with tf.name_scope('AC'): - with tf.name_scope('Critic'): - critic = ValueNetwork(env.observation_space, hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Actor'): - actor = StochasticPolicyNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim], - output_activation=tf.nn.tanh) - net_list = [actor, critic] - alg_params['net_list'] = net_list - if alg_params.get('optimizers_list') is None: - a_lr, c_lr = 1e-4, 2e-4 # a_lr: learning rate of the actor; c_lr: learning rate of the critic - a_optimizer = tf.optimizers.Adam(a_lr) - c_optimizer = tf.optimizers.Adam(c_lr) - optimizers_list = [a_optimizer, c_optimizer] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict( - max_steps=200, - train_episodes=500, - test_episodes=100, - save_interval=50, - ) - - return alg_params, learn_params - - -def classic_control(env, default_seed=True): - if default_seed: - seed = 1 - set_seed(seed, env) # reproducible - - alg_params = dict( - gamma=0.9, - ) - if alg_params.get('net_list') is None: - num_hidden_layer = 2 # number of hidden layers for the networks - hidden_dim = 64 # dimension of hidden layers for the 
networks - with tf.name_scope('AC'): - with tf.name_scope('Critic'): - critic = ValueNetwork(env.observation_space, hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Actor'): - actor = StochasticPolicyNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim], - output_activation=tf.nn.tanh) - net_list = [actor, critic] - alg_params['net_list'] = net_list - if alg_params.get('optimizers_list') is None: - a_lr, c_lr = 1e-4, 2e-4 # a_lr: learning rate of the actor; c_lr: learning rate of the critic - a_optimizer = tf.optimizers.Adam(a_lr) - c_optimizer = tf.optimizers.Adam(c_lr) - optimizers_list = [a_optimizer, c_optimizer] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict( - max_steps=200, - train_episodes=500, - test_episodes=100, - save_interval=50, - ) - - return alg_params, learn_params - - -def box2d(env, default_seed=True): - if default_seed: - seed = 1 - set_seed(seed, env) # reproducible - - alg_params = dict( - gamma=0.9, - ) - if alg_params.get('net_list') is None: - num_hidden_layer = 2 # number of hidden layers for the networks - hidden_dim = 64 # dimension of hidden layers for the networks - with tf.name_scope('AC'): - with tf.name_scope('Critic'): - critic = ValueNetwork(env.observation_space, hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Actor'): - actor = StochasticPolicyNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim], - output_activation=tf.nn.tanh) - net_list = [actor, critic] - alg_params['net_list'] = net_list - if alg_params.get('optimizers_list') is None: - a_lr, c_lr = 1e-4, 2e-4 # a_lr: learning rate of the actor; c_lr: learning rate of the critic - a_optimizer = tf.optimizers.Adam(a_lr) - c_optimizer = tf.optimizers.Adam(c_lr) - optimizers_list = [a_optimizer, c_optimizer] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict( - max_steps=200, - train_episodes=500, - test_episodes=100, - save_interval=50, - ) - - return alg_params, learn_params - - -def mujoco(env, default_seed=True): - if default_seed: - seed = 1 - set_seed(seed, env) # reproducible - - alg_params = dict( - gamma=0.9, - ) - if alg_params.get('net_list') is None: - num_hidden_layer = 2 # number of hidden layers for the networks - hidden_dim = 64 # dimension of hidden layers for the networks - with tf.name_scope('AC'): - with tf.name_scope('Critic'): - critic = ValueNetwork(env.observation_space, hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Actor'): - actor = StochasticPolicyNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim], - output_activation=tf.nn.tanh) - net_list = [actor, critic] - alg_params['net_list'] = net_list - if alg_params.get('optimizers_list') is None: - a_lr, c_lr = 1e-4, 2e-4 # a_lr: learning rate of the actor; c_lr: learning rate of the critic - a_optimizer = tf.optimizers.Adam(a_lr) - c_optimizer = tf.optimizers.Adam(c_lr) - optimizers_list = [a_optimizer, c_optimizer] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict( - max_steps=200, - train_episodes=500, - test_episodes=100, - save_interval=50, - ) - - return alg_params, learn_params - - -def robotics(env, default_seed=True): - if default_seed: - seed = 1 - set_seed(seed, env) # reproducible - - alg_params = dict( - gamma=0.9, - ) - if alg_params.get('net_list') is None: - num_hidden_layer = 2 # number of hidden layers for the networks - hidden_dim = 64 
# dimension of hidden layers for the networks - with tf.name_scope('AC'): - with tf.name_scope('Critic'): - critic = ValueNetwork(env.observation_space, hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Actor'): - actor = StochasticPolicyNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim], - output_activation=tf.nn.tanh) - net_list = [actor, critic] - alg_params['net_list'] = net_list - if alg_params.get('optimizers_list') is None: - a_lr, c_lr = 1e-4, 2e-4 # a_lr: learning rate of the actor; c_lr: learning rate of the critic - a_optimizer = tf.optimizers.Adam(a_lr) - c_optimizer = tf.optimizers.Adam(c_lr) - optimizers_list = [a_optimizer, c_optimizer] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict( - max_steps=200, - train_episodes=500, - test_episodes=100, - save_interval=50, - ) - - return alg_params, learn_params - - -def dm_control(env, default_seed=True): - if default_seed: - seed = 1 - set_seed(seed, env) # reproducible - - alg_params = dict( - gamma=0.9, - ) - if alg_params.get('net_list') is None: - num_hidden_layer = 2 # number of hidden layers for the networks - hidden_dim = 64 # dimension of hidden layers for the networks - with tf.name_scope('AC'): - with tf.name_scope('Critic'): - critic = ValueNetwork(env.observation_space, hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Actor'): - actor = StochasticPolicyNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim], - output_activation=tf.nn.tanh) - net_list = [actor, critic] - alg_params['net_list'] = net_list - if alg_params.get('optimizers_list') is None: - a_lr, c_lr = 1e-4, 2e-4 # a_lr: learning rate of the actor; c_lr: learning rate of the critic - a_optimizer = tf.optimizers.Adam(a_lr) - c_optimizer = tf.optimizers.Adam(c_lr) - optimizers_list = [a_optimizer, c_optimizer] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict( - max_steps=200, - train_episodes=500, - test_episodes=100, - save_interval=50, - ) - - return alg_params, learn_params - - -def rlbench(env, default_seed=True): - if default_seed: - seed = 1 - set_seed(seed, env) # reproducible - - alg_params = dict( - gamma=0.9, - ) - if alg_params.get('net_list') is None: - num_hidden_layer = 2 # number of hidden layers for the networks - hidden_dim = 64 # dimension of hidden layers for the networks - with tf.name_scope('AC'): - with tf.name_scope('Critic'): - critic = ValueNetwork(env.observation_space, hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Actor'): - actor = StochasticPolicyNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim], - output_activation=tf.nn.tanh) - net_list = [actor, critic] - alg_params['net_list'] = net_list - if alg_params.get('optimizers_list') is None: - a_lr, c_lr = 1e-4, 2e-4 # a_lr: learning rate of the actor; c_lr: learning rate of the critic - a_optimizer = tf.optimizers.Adam(a_lr) - c_optimizer = tf.optimizers.Adam(c_lr) - optimizers_list = [a_optimizer, c_optimizer] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict( - max_steps=200, - train_episodes=500, - test_episodes=100, - save_interval=50, - ) - - return alg_params, learn_params +import tensorflow as tf +import tensorlayer as tl + +from rlzoo.common import math_utils +from rlzoo.common.value_networks import * +from rlzoo.common.policy_networks import * +from gym import spaces +from rlzoo.common.utils 
import set_seed + +""" +full list of algorithm parameters (alg_params) +----------------------------------------------- +net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization +optimizers_list: a list of optimizers for all networks and differentiable variables +gamma: discounted factor of reward +action_range: scale of action values +----------------------------------------------- + +full list of learning parameters (learn_params) +----------------------------------------------- +env: learning environment +train_episodes: total number of episodes for training +test_episodes: total number of episodes for testing +max_steps: maximum number of steps for one episode +save_interval: time steps for saving the weights and plotting the results +mode: 'train' or 'test' +render: if true, visualize the environment +------------------------------------------------ +""" + + +def atari(env, default_seed=True): + if default_seed: + seed = 1 + set_seed(seed, env) # reproducible + + alg_params = dict( + gamma=0.9, + ) + if alg_params.get('net_list') is None: + num_hidden_layer = 2 # number of hidden layers for the networks + hidden_dim = 64 # dimension of hidden layers for the networks + with tf.name_scope('AC'): + with tf.name_scope('Critic'): + critic = ValueNetwork(env.observation_space, hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Actor'): + actor = StochasticPolicyNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim], + output_activation=tf.nn.tanh) + net_list = [actor, critic] + alg_params['net_list'] = net_list + if alg_params.get('optimizers_list') is None: + a_lr, c_lr = 1e-4, 2e-4 # a_lr: learning rate of the actor; c_lr: learning rate of the critic + a_optimizer = tf.optimizers.Adam(a_lr) + c_optimizer = tf.optimizers.Adam(c_lr) + optimizers_list = [a_optimizer, c_optimizer] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict( + max_steps=200, + train_episodes=500, + test_episodes=100, + save_interval=50, + ) + + return alg_params, learn_params + + +def classic_control(env, default_seed=True): + if default_seed: + seed = 1 + set_seed(seed, env) # reproducible + + alg_params = dict( + gamma=0.9, + ) + if alg_params.get('net_list') is None: + num_hidden_layer = 2 # number of hidden layers for the networks + hidden_dim = 64 # dimension of hidden layers for the networks + with tf.name_scope('AC'): + with tf.name_scope('Critic'): + critic = ValueNetwork(env.observation_space, hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Actor'): + actor = StochasticPolicyNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim], + output_activation=tf.nn.tanh) + net_list = [actor, critic] + alg_params['net_list'] = net_list + if alg_params.get('optimizers_list') is None: + a_lr, c_lr = 1e-4, 2e-4 # a_lr: learning rate of the actor; c_lr: learning rate of the critic + a_optimizer = tf.optimizers.Adam(a_lr) + c_optimizer = tf.optimizers.Adam(c_lr) + optimizers_list = [a_optimizer, c_optimizer] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict( + max_steps=200, + train_episodes=500, + test_episodes=100, + save_interval=50, + ) + + return alg_params, learn_params + + +def box2d(env, default_seed=True): + if default_seed: + seed = 1 + set_seed(seed, env) # reproducible + + alg_params = dict( + gamma=0.9, + ) + if alg_params.get('net_list') is None: + num_hidden_layer = 2 # number of 
hidden layers for the networks + hidden_dim = 64 # dimension of hidden layers for the networks + with tf.name_scope('AC'): + with tf.name_scope('Critic'): + critic = ValueNetwork(env.observation_space, hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Actor'): + actor = StochasticPolicyNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim], + output_activation=tf.nn.tanh) + net_list = [actor, critic] + alg_params['net_list'] = net_list + if alg_params.get('optimizers_list') is None: + a_lr, c_lr = 1e-4, 2e-4 # a_lr: learning rate of the actor; c_lr: learning rate of the critic + a_optimizer = tf.optimizers.Adam(a_lr) + c_optimizer = tf.optimizers.Adam(c_lr) + optimizers_list = [a_optimizer, c_optimizer] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict( + max_steps=200, + train_episodes=500, + test_episodes=100, + save_interval=50, + ) + + return alg_params, learn_params + + +def mujoco(env, default_seed=True): + if default_seed: + seed = 1 + set_seed(seed, env) # reproducible + + alg_params = dict( + gamma=0.9, + ) + if alg_params.get('net_list') is None: + num_hidden_layer = 2 # number of hidden layers for the networks + hidden_dim = 64 # dimension of hidden layers for the networks + with tf.name_scope('AC'): + with tf.name_scope('Critic'): + critic = ValueNetwork(env.observation_space, hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Actor'): + actor = StochasticPolicyNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim], + output_activation=tf.nn.tanh) + net_list = [actor, critic] + alg_params['net_list'] = net_list + if alg_params.get('optimizers_list') is None: + a_lr, c_lr = 1e-4, 2e-4 # a_lr: learning rate of the actor; c_lr: learning rate of the critic + a_optimizer = tf.optimizers.Adam(a_lr) + c_optimizer = tf.optimizers.Adam(c_lr) + optimizers_list = [a_optimizer, c_optimizer] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict( + max_steps=200, + train_episodes=500, + test_episodes=100, + save_interval=50, + ) + + return alg_params, learn_params + + +def robotics(env, default_seed=True): + if default_seed: + seed = 1 + set_seed(seed, env) # reproducible + + alg_params = dict( + gamma=0.9, + ) + if alg_params.get('net_list') is None: + num_hidden_layer = 2 # number of hidden layers for the networks + hidden_dim = 64 # dimension of hidden layers for the networks + with tf.name_scope('AC'): + with tf.name_scope('Critic'): + critic = ValueNetwork(env.observation_space, hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Actor'): + actor = StochasticPolicyNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim], + output_activation=tf.nn.tanh) + net_list = [actor, critic] + alg_params['net_list'] = net_list + if alg_params.get('optimizers_list') is None: + a_lr, c_lr = 1e-4, 2e-4 # a_lr: learning rate of the actor; c_lr: learning rate of the critic + a_optimizer = tf.optimizers.Adam(a_lr) + c_optimizer = tf.optimizers.Adam(c_lr) + optimizers_list = [a_optimizer, c_optimizer] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict( + max_steps=200, + train_episodes=500, + test_episodes=100, + save_interval=50, + ) + + return alg_params, learn_params + + +def dm_control(env, default_seed=True): + if default_seed: + seed = 1 + set_seed(seed, env) # reproducible + + alg_params = dict( + gamma=0.9, + ) + if alg_params.get('net_list') 
is None: + num_hidden_layer = 2 # number of hidden layers for the networks + hidden_dim = 64 # dimension of hidden layers for the networks + with tf.name_scope('AC'): + with tf.name_scope('Critic'): + critic = ValueNetwork(env.observation_space, hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Actor'): + actor = StochasticPolicyNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim], + output_activation=tf.nn.tanh) + net_list = [actor, critic] + alg_params['net_list'] = net_list + if alg_params.get('optimizers_list') is None: + a_lr, c_lr = 1e-4, 2e-4 # a_lr: learning rate of the actor; c_lr: learning rate of the critic + a_optimizer = tf.optimizers.Adam(a_lr) + c_optimizer = tf.optimizers.Adam(c_lr) + optimizers_list = [a_optimizer, c_optimizer] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict( + max_steps=200, + train_episodes=500, + test_episodes=100, + save_interval=50, + ) + + return alg_params, learn_params + + +def rlbench(env, default_seed=True): + if default_seed: + seed = 1 + set_seed(seed, env) # reproducible + + alg_params = dict( + gamma=0.9, + ) + if alg_params.get('net_list') is None: + num_hidden_layer = 2 # number of hidden layers for the networks + hidden_dim = 64 # dimension of hidden layers for the networks + with tf.name_scope('AC'): + with tf.name_scope('Critic'): + critic = ValueNetwork(env.observation_space, hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Actor'): + actor = StochasticPolicyNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim], + output_activation=tf.nn.tanh) + net_list = [actor, critic] + alg_params['net_list'] = net_list + if alg_params.get('optimizers_list') is None: + a_lr, c_lr = 1e-4, 2e-4 # a_lr: learning rate of the actor; c_lr: learning rate of the critic + a_optimizer = tf.optimizers.Adam(a_lr) + c_optimizer = tf.optimizers.Adam(c_lr) + optimizers_list = [a_optimizer, c_optimizer] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict( + max_steps=200, + train_episodes=500, + test_episodes=100, + save_interval=50, + ) + + return alg_params, learn_params diff --git a/rlzoo/algorithms/ac/run_ac.py b/rlzoo/algorithms/ac/run_ac.py old mode 100644 new mode 100755 index 2162fc5..bace465 --- a/rlzoo/algorithms/ac/run_ac.py +++ b/rlzoo/algorithms/ac/run_ac.py @@ -1,59 +1,59 @@ -from rlzoo.common.utils import set_seed -from rlzoo.algorithms.ac.ac import AC -from rlzoo.common.value_networks import * -from rlzoo.common.policy_networks import * -import gym - -""" load environment """ -# env = gym.make('CartPole-v0').unwrapped -env = gym.make('Pendulum-v0').unwrapped -obs_space = env.observation_space -act_space = env.action_space -# reproducible -seed = 1 -set_seed(seed, env) - -# env = DummyVecEnv([lambda: env]) # The algorithms require a vectorized/wrapped environment to run - - -""" build networks for the algorithm """ -num_hidden_layer = 2 # number of hidden layers for the networks -hidden_dim = 64 # dimension of hidden layers for the networks -with tf.name_scope('AC'): - with tf.name_scope('Critic'): - critic = ValueNetwork(obs_space, hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Actor'): - actor = StochasticPolicyNetwork(obs_space, act_space, hidden_dim_list=num_hidden_layer * [hidden_dim], - output_activation=tf.nn.tanh) -net_list = [actor, critic] - -""" choose optimizers """ -a_lr, c_lr = 1e-4, 2e-4 # a_lr: learning rate of the actor; c_lr: 
learning rate of the critic -a_optimizer = tf.optimizers.Adam(a_lr) -c_optimizer = tf.optimizers.Adam(c_lr) -optimizers_list = [a_optimizer, c_optimizer] - -model = AC(net_list, optimizers_list) -""" -full list of arguments for the algorithm ----------------------------------------- -net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization -optimizers_list: a list of optimizers for all networks and differentiable variables -gamma: discounted factor of reward -action_range: scale of action values -""" - -model.learn(env, train_episodes=500, max_steps=200, - save_interval=50, mode='train', render=False) -""" -full list of parameters for training ---------------------------------------- -env: learning environment -train_episodes: total number of episodes for training -test_episodes: total number of episodes for testing -max_steps: maximum number of steps for one episode -save_interval: time steps for saving the weights and plotting the results -mode: 'train' or 'test' -render: if true, visualize the environment -""" -model.learn(env, test_episodes=100, max_steps=200, mode='test', render=True) +from rlzoo.common.utils import set_seed +from rlzoo.algorithms.ac.ac import AC +from rlzoo.common.value_networks import * +from rlzoo.common.policy_networks import * +import gym + +""" load environment """ +# env = gym.make('CartPole-v0').unwrapped +env = gym.make('Pendulum-v0').unwrapped +obs_space = env.observation_space +act_space = env.action_space +# reproducible +seed = 1 +set_seed(seed, env) + +# env = DummyVecEnv([lambda: env]) # The algorithms require a vectorized/wrapped environment to run + + +""" build networks for the algorithm """ +num_hidden_layer = 2 # number of hidden layers for the networks +hidden_dim = 64 # dimension of hidden layers for the networks +with tf.name_scope('AC'): + with tf.name_scope('Critic'): + critic = ValueNetwork(obs_space, hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Actor'): + actor = StochasticPolicyNetwork(obs_space, act_space, hidden_dim_list=num_hidden_layer * [hidden_dim], + output_activation=tf.nn.tanh) +net_list = [actor, critic] + +""" choose optimizers """ +a_lr, c_lr = 1e-4, 2e-4 # a_lr: learning rate of the actor; c_lr: learning rate of the critic +a_optimizer = tf.optimizers.Adam(a_lr) +c_optimizer = tf.optimizers.Adam(c_lr) +optimizers_list = [a_optimizer, c_optimizer] + +model = AC(net_list, optimizers_list) +""" +full list of arguments for the algorithm +---------------------------------------- +net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization +optimizers_list: a list of optimizers for all networks and differentiable variables +gamma: discounted factor of reward +action_range: scale of action values +""" + +model.learn(env, train_episodes=500, max_steps=200, + save_interval=50, mode='train', render=False) +""" +full list of parameters for training +--------------------------------------- +env: learning environment +train_episodes: total number of episodes for training +test_episodes: total number of episodes for testing +max_steps: maximum number of steps for one episode +save_interval: time steps for saving the weights and plotting the results +mode: 'train' or 'test' +render: if true, visualize the environment +""" +model.learn(env, test_episodes=100, max_steps=200, mode='test', render=True) diff --git a/rlzoo/algorithms/ddpg/__init__.py b/rlzoo/algorithms/ddpg/__init__.py old mode 100644 new mode 100755 diff 
--git a/rlzoo/algorithms/ddpg/ddpg.py b/rlzoo/algorithms/ddpg/ddpg.py old mode 100644 new mode 100755 index 569e98a..d5feb1c --- a/rlzoo/algorithms/ddpg/ddpg.py +++ b/rlzoo/algorithms/ddpg/ddpg.py @@ -1,275 +1,275 @@ -""" -Deep Deterministic Policy Gradient (DDPG) ------------------------------------------ -An algorithm concurrently learns a Q-function and a policy. -It uses off-policy data and the Bellman equation to learn the Q-function, -and uses the Q-function to learn the policy. -Reference ---------- -Deterministic Policy Gradient Algorithms, Silver et al. 2014 -Continuous Control With Deep Reinforcement Learning, Lillicrap et al. 2016 -MorvanZhou's tutorial page: https://morvanzhou.github.io/tutorials/ -MorvanZhou's code: https://github.com/MorvanZhou/Reinforcement-learning-with-tensorflow/ - -Prerequisites -------------- -tensorflow >=2.0.0a0 -tensorflow-probability 0.6.0 -tensorlayer >=2.0.0 -""" - -import time - -from rlzoo.common.utils import * -from rlzoo.common.buffer import * -from rlzoo.common.policy_networks import * -from rlzoo.common.value_networks import * - - -############################### DDPG #################################### - - -class DDPG(object): - """ - DDPG class - """ - - def __init__(self, net_list, optimizers_list, replay_buffer_size, action_range=1., tau=0.01): - """ - :param net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization - :param optimizers_list: a list of optimizers for all networks and differentiable variables - :param replay_buffer_size: the size of buffer for storing explored samples - :param tau: soft update factor - """ - assert len(net_list) == 4 - assert len(optimizers_list) == 2 - self.name = 'DDPG' - - self.critic, self.critic_target, self.actor, self.actor_target = net_list - - assert isinstance(self.critic, QNetwork) - assert isinstance(self.critic_target, QNetwork) - assert isinstance(self.actor, DeterministicPolicyNetwork) - assert isinstance(self.actor_target, DeterministicPolicyNetwork) - assert isinstance(self.actor.action_space, gym.spaces.Box) - - def copy_para(from_model, to_model): - for i, j in zip(from_model.trainable_weights, to_model.trainable_weights): - j.assign(i) - - copy_para(self.actor, self.actor_target) - copy_para(self.critic, self.critic_target) - - self.replay_buffer_size = replay_buffer_size - self.buffer = ReplayBuffer(replay_buffer_size) - - self.ema = tf.train.ExponentialMovingAverage(decay=1 - tau) # soft replacement - self.action_range = action_range - - self.critic_opt, self.actor_opt = optimizers_list - - def ema_update(self): - """ - Soft updating by exponential smoothing - - :return: None - """ - paras = self.actor.trainable_weights + self.critic.trainable_weights - self.ema.apply(paras) - for i, j in zip(self.actor_target.trainable_weights + self.critic_target.trainable_weights, paras): - i.assign(self.ema.average(j)) - - def sample_action(self): - """ generate random actions for exploration """ - a = tf.random.uniform(self.actor.action_space.shape, self.actor.action_space.low, self.actor.action_space.high) - return a - - def get_action(self, s, noise_scale): - """ - Choose action with exploration - - :param s: state - - :return: action - """ - a = self.actor([s])[0].numpy()*self.action_range - - # add randomness to action selection for exploration - noise = np.random.normal(0, 1, a.shape) * noise_scale - a += noise - a = np.clip(a, self.actor.action_space.low, self.actor.action_space.high) - - return a - - def get_action_greedy(self, s): - """ 
- Choose action - - :param s: state - - :return: action - """ - return self.actor([s])[0].numpy()*self.action_range - - def update(self, batch_size, gamma): - """ - Update parameters - - :param batch_size: update batch size - :param gamma: reward decay factor - - :return: - """ - bs, ba, br, bs_, bd = self.buffer.sample(batch_size) - - ba_ = self.actor_target(bs_)*self.action_range - - q_ = self.critic_target([bs_, ba_]) - y = br + (1 - bd) * gamma * q_ - with tf.GradientTape() as tape: - q = self.critic([bs, ba]) - td_error = tf.losses.mean_squared_error(y, q) - c_grads = tape.gradient(td_error, self.critic.trainable_weights) - self.critic_opt.apply_gradients(zip(c_grads, self.critic.trainable_weights)) - - with tf.GradientTape() as tape: - a = self.actor(bs)*self.action_range - q = self.critic([bs, a]) - a_loss = - tf.reduce_mean(q) # maximize the q - a_grads = tape.gradient(a_loss, self.actor.trainable_weights) - self.actor_opt.apply_gradients(zip(a_grads, self.actor.trainable_weights)) - self.ema_update() - - def store_transition(self, s, a, r, s_, d): - """ - Store data in data buffer - - :param s: state - :param a: act - :param r: reward - :param s_: next state - - :return: None - """ - d = 1 if d else 0 - - self.buffer.push(s, a, [r], s_, d) - - def save_ckpt(self, env_name): - """ - save trained weights - - :return: None - """ - save_model(self.actor, 'model_policy_net', self.name, env_name) - save_model(self.actor_target, 'model_target_policy_net', self.name, env_name) - save_model(self.critic, 'model_q_net', self.name, env_name) - save_model(self.critic_target, 'model_target_q_net', self.name, env_name) - - def load_ckpt(self, env_name): - """ - load trained weights - - :return: None - """ - load_model(self.actor, 'model_policy_net', self.name, env_name) - load_model(self.actor_target, 'model_target_policy_net', self.name, env_name) - load_model(self.critic, 'model_q_net', self.name, env_name) - load_model(self.critic_target, 'model_target_q_net', self.name, env_name) - - def learn(self, env, train_episodes=200, test_episodes=100, max_steps=200, save_interval=10, explore_steps=500, - mode='train', render=False, batch_size=32, gamma=0.9, noise_scale=1., noise_scale_decay=0.995, - plot_func=None): - """ - learn function - - :param env: learning environment - :param train_episodes: total number of episodes for training - :param test_episodes: total number of episodes for testing - :param max_steps: maximum number of steps for one episode - :param save_interval: time steps for saving - :param explore_steps: for random action sampling in the beginning of training - :param mode: train or test mode - :param render: render each step - :param batch_size: update batch size - :param gamma: reward decay factor - :param noise_scale: range of action noise for exploration - :param noise_scale_decay: noise scale decay factor - :param plot_func: additional function for interactive module - :return: None - """ - - t0 = time.time() - - if mode == 'train': # train - print('Training... 
| Algorithm: {} | Environment: {}'.format(self.name, env.spec.id)) - reward_buffer = [] - frame_idx = 0 - for i in range(1, train_episodes + 1): - s = env.reset() - ep_reward = 0 - - for j in range(max_steps): - if render: - env.render() - # Add exploration noise - if frame_idx > explore_steps: - a = self.get_action(s, noise_scale) - else: - a = self.sample_action() - frame_idx += 1 - - s_, r, done, info = env.step(a) - - self.store_transition(s, a, r, s_, done) - if len(self.buffer) >= self.replay_buffer_size: - self.update(batch_size, gamma) - noise_scale *= noise_scale_decay - s = s_ - ep_reward += r - - if done: - break - - print( - 'Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format( - i, train_episodes, ep_reward, - time.time() - t0 - ) - ) - - reward_buffer.append(ep_reward) - if plot_func is not None: - plot_func(reward_buffer) - if i and not i % save_interval: - self.save_ckpt(env_name=env.spec.id) - plot_save_log(reward_buffer, algorithm_name=self.name, env_name=env.spec.id) - - self.save_ckpt(env_name=env.spec.id) - plot_save_log(reward_buffer, algorithm_name=self.name, env_name=env.spec.id) - - # test - elif mode == 'test': - self.load_ckpt(env_name=env.spec.id) - print('Testing... | Algorithm: {} | Environment: {}'.format(self.name, env.spec.id)) - reward_buffer = [] - for eps in range(1, test_episodes+1): - ep_rs_sum = 0 - s = env.reset() - for step in range(max_steps): - if render: - env.render() - action = self.get_action_greedy(s) - s, reward, done, info = env.step(action) - ep_rs_sum += reward - if done: - break - - print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format( - eps, test_episodes, ep_rs_sum, time.time() - t0) - ) - reward_buffer.append(ep_rs_sum) - if plot_func: - plot_func(reward_buffer) - else: +""" +Deep Deterministic Policy Gradient (DDPG) +----------------------------------------- +An algorithm concurrently learns a Q-function and a policy. +It uses off-policy data and the Bellman equation to learn the Q-function, +and uses the Q-function to learn the policy. +Reference +--------- +Deterministic Policy Gradient Algorithms, Silver et al. 2014 +Continuous Control With Deep Reinforcement Learning, Lillicrap et al. 
2016 +MorvanZhou's tutorial page: https://morvanzhou.github.io/tutorials/ +MorvanZhou's code: https://github.com/MorvanZhou/Reinforcement-learning-with-tensorflow/ + +Prerequisites +------------- +tensorflow >=2.0.0a0 +tensorflow-probability 0.6.0 +tensorlayer >=2.0.0 +""" + +import time + +from rlzoo.common.utils import * +from rlzoo.common.buffer import * +from rlzoo.common.policy_networks import * +from rlzoo.common.value_networks import * + + +############################### DDPG #################################### + + +class DDPG(object): + """ + DDPG class + """ + + def __init__(self, net_list, optimizers_list, replay_buffer_size, action_range=1., tau=0.01): + """ + :param net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization + :param optimizers_list: a list of optimizers for all networks and differentiable variables + :param replay_buffer_size: the size of buffer for storing explored samples + :param tau: soft update factor + """ + assert len(net_list) == 4 + assert len(optimizers_list) == 2 + self.name = 'DDPG' + + self.critic, self.critic_target, self.actor, self.actor_target = net_list + + assert isinstance(self.critic, QNetwork) + assert isinstance(self.critic_target, QNetwork) + assert isinstance(self.actor, DeterministicPolicyNetwork) + assert isinstance(self.actor_target, DeterministicPolicyNetwork) + assert isinstance(self.actor.action_space, gym.spaces.Box) + + def copy_para(from_model, to_model): + for i, j in zip(from_model.trainable_weights, to_model.trainable_weights): + j.assign(i) + + copy_para(self.actor, self.actor_target) + copy_para(self.critic, self.critic_target) + + self.replay_buffer_size = replay_buffer_size + self.buffer = ReplayBuffer(replay_buffer_size) + + self.ema = tf.train.ExponentialMovingAverage(decay=1 - tau) # soft replacement + self.action_range = action_range + + self.critic_opt, self.actor_opt = optimizers_list + + def ema_update(self): + """ + Soft updating by exponential smoothing + + :return: None + """ + paras = self.actor.trainable_weights + self.critic.trainable_weights + self.ema.apply(paras) + for i, j in zip(self.actor_target.trainable_weights + self.critic_target.trainable_weights, paras): + i.assign(self.ema.average(j)) + + def sample_action(self): + """ generate random actions for exploration """ + a = tf.random.uniform(self.actor.action_space.shape, self.actor.action_space.low, self.actor.action_space.high) + return a + + def get_action(self, s, noise_scale): + """ + Choose action with exploration + + :param s: state + + :return: action + """ + a = self.actor([s])[0].numpy()*self.action_range + + # add randomness to action selection for exploration + noise = np.random.normal(0, 1, a.shape) * noise_scale + a += noise + a = np.clip(a, self.actor.action_space.low, self.actor.action_space.high) + + return a + + def get_action_greedy(self, s): + """ + Choose action + + :param s: state + + :return: action + """ + return self.actor([s])[0].numpy()*self.action_range + + def update(self, batch_size, gamma): + """ + Update parameters + + :param batch_size: update batch size + :param gamma: reward decay factor + + :return: + """ + bs, ba, br, bs_, bd = self.buffer.sample(batch_size) + + ba_ = self.actor_target(bs_)*self.action_range + + q_ = self.critic_target([bs_, ba_]) + y = br + (1 - bd) * gamma * q_ + with tf.GradientTape() as tape: + q = self.critic([bs, ba]) + td_error = tf.losses.mean_squared_error(y, q) + c_grads = tape.gradient(td_error, self.critic.trainable_weights) + 
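+        # c_grads: gradient of the mean-squared error between Q(s, a) and the Bellman target
+        #   y = r + (1 - done) * gamma * Q_target(s', a'), where a' comes from the target actor
+        #   (scaled by action_range).
+        # Below, the critic gradients are applied, the actor is updated to maximise Q(s, actor(s))
+        # by minimising -mean(Q), and ema_update() soft-updates both target nets with decay 1 - tau.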
self.critic_opt.apply_gradients(zip(c_grads, self.critic.trainable_weights)) + + with tf.GradientTape() as tape: + a = self.actor(bs)*self.action_range + q = self.critic([bs, a]) + a_loss = - tf.reduce_mean(q) # maximize the q + a_grads = tape.gradient(a_loss, self.actor.trainable_weights) + self.actor_opt.apply_gradients(zip(a_grads, self.actor.trainable_weights)) + self.ema_update() + + def store_transition(self, s, a, r, s_, d): + """ + Store data in data buffer + + :param s: state + :param a: act + :param r: reward + :param s_: next state + + :return: None + """ + d = 1 if d else 0 + + self.buffer.push(s, a, [r], s_, d) + + def save_ckpt(self, env_name): + """ + save trained weights + + :return: None + """ + save_model(self.actor, 'model_policy_net', self.name, env_name) + save_model(self.actor_target, 'model_target_policy_net', self.name, env_name) + save_model(self.critic, 'model_q_net', self.name, env_name) + save_model(self.critic_target, 'model_target_q_net', self.name, env_name) + + def load_ckpt(self, env_name): + """ + load trained weights + + :return: None + """ + load_model(self.actor, 'model_policy_net', self.name, env_name) + load_model(self.actor_target, 'model_target_policy_net', self.name, env_name) + load_model(self.critic, 'model_q_net', self.name, env_name) + load_model(self.critic_target, 'model_target_q_net', self.name, env_name) + + def learn(self, env, train_episodes=200, test_episodes=100, max_steps=200, save_interval=10, explore_steps=500, + mode='train', render=False, batch_size=32, gamma=0.9, noise_scale=1., noise_scale_decay=0.995, + plot_func=None): + """ + learn function + + :param env: learning environment + :param train_episodes: total number of episodes for training + :param test_episodes: total number of episodes for testing + :param max_steps: maximum number of steps for one episode + :param save_interval: time steps for saving + :param explore_steps: for random action sampling in the beginning of training + :param mode: train or test mode + :param render: render each step + :param batch_size: update batch size + :param gamma: reward decay factor + :param noise_scale: range of action noise for exploration + :param noise_scale_decay: noise scale decay factor + :param plot_func: additional function for interactive module + :return: None + """ + + t0 = time.time() + + if mode == 'train': # train + print('Training... 
| Algorithm: {} | Environment: {}'.format(self.name, env.spec.id)) + reward_buffer = [] + frame_idx = 0 + for i in range(1, train_episodes + 1): + s = env.reset() + ep_reward = 0 + + for j in range(max_steps): + if render: + env.render() + # Add exploration noise + if frame_idx > explore_steps: + a = self.get_action(s, noise_scale) + else: + a = self.sample_action() + frame_idx += 1 + + s_, r, done, info = env.step(a) + + self.store_transition(s, a, r, s_, done) + if len(self.buffer) >= self.replay_buffer_size: + self.update(batch_size, gamma) + noise_scale *= noise_scale_decay + s = s_ + ep_reward += r + + if done: + break + + print( + 'Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format( + i, train_episodes, ep_reward, + time.time() - t0 + ) + ) + + reward_buffer.append(ep_reward) + if plot_func is not None: + plot_func(reward_buffer) + if i and not i % save_interval: + self.save_ckpt(env_name=env.spec.id) + plot_save_log(reward_buffer, algorithm_name=self.name, env_name=env.spec.id) + + self.save_ckpt(env_name=env.spec.id) + plot_save_log(reward_buffer, algorithm_name=self.name, env_name=env.spec.id) + + # test + elif mode == 'test': + self.load_ckpt(env_name=env.spec.id) + print('Testing... | Algorithm: {} | Environment: {}'.format(self.name, env.spec.id)) + reward_buffer = [] + for eps in range(1, test_episodes+1): + ep_rs_sum = 0 + s = env.reset() + for step in range(max_steps): + if render: + env.render() + action = self.get_action_greedy(s) + s, reward, done, info = env.step(action) + ep_rs_sum += reward + if done: + break + + print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format( + eps, test_episodes, ep_rs_sum, time.time() - t0) + ) + reward_buffer.append(ep_rs_sum) + if plot_func: + plot_func(reward_buffer) + else: print('unknown mode type') \ No newline at end of file diff --git a/rlzoo/algorithms/ddpg/default.py b/rlzoo/algorithms/ddpg/default.py old mode 100644 new mode 100755 index 7e79b1c..0a5cc0a --- a/rlzoo/algorithms/ddpg/default.py +++ b/rlzoo/algorithms/ddpg/default.py @@ -1,327 +1,327 @@ -from rlzoo.common.policy_networks import * -from rlzoo.common.value_networks import * -from rlzoo.common.utils import set_seed - -""" -full list of algorithm parameters (alg_params) ------------------------------------------------ -net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization -optimizers_list: a list of optimizers for all networks and differentiable variables -replay_buffer_size: the size of buffer for storing explored samples -tau: soft update factor ------------------------------------------------ - -full list of learning parameters (learn_params) ------------------------------------------------ -train_episodes: total number of episodes for training -test_episodes: total number of episodes for testing -max_steps: maximum number of steps for one episode -save_interval: time steps for saving -explore_steps: for random action sampling in the beginning of training -mode: train or test mode -render: render each step -batch_size: update batch size -gamma: reward decay factor -noise_scale: range of action noise for exploration -noise_scale_decay: noise scale decay factor ------------------------------------------------ -""" - -def classic_control(env, default_seed=True): - if default_seed: - # reproducible - seed = 2 - set_seed(seed, env) - - alg_params = dict( - replay_buffer_size=10000, - tau=0.01, - ) - - if alg_params.get('net_list') is None: - num_hidden_layer = 2 # number of 
hidden layers for the networks - hidden_dim = 64 # dimension of hidden layers for the networks - with tf.name_scope('DDPG'): - with tf.name_scope('Q_Net'): - q_net = QNetwork(env.observation_space, env.action_space, num_hidden_layer * [hidden_dim]) - with tf.name_scope('Target_Q_Net'): - target_q_net = QNetwork(env.observation_space, env.action_space, num_hidden_layer * [hidden_dim]) - with tf.name_scope('Policy'): - policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, - num_hidden_layer * [hidden_dim]) - with tf.name_scope('Target_Policy'): - target_policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, - num_hidden_layer * [hidden_dim]) - - net_list = [q_net, target_q_net, policy_net, target_policy_net] - alg_params['net_list'] = net_list - - if alg_params.get('optimizers_list') is None: - actor_lr = 1e-3 - critic_lr = 2e-3 - optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict( - train_episodes=100, - test_episodes=10, - max_steps=200, - save_interval=10, - explore_steps=500, - batch_size=32, - gamma=0.9, - noise_scale=1., - noise_scale_decay=0.995 - ) - - return alg_params, learn_params - - -def box2d(env, default_seed=True): - if default_seed: - # reproducible - seed = 2 - set_seed(seed, env) - - alg_params = dict( - replay_buffer_size=10000, - tau=0.01, - ) - - if alg_params.get('net_list') is None: - num_hidden_layer = 2 # number of hidden layers for the networks - hidden_dim = 64 # dimension of hidden layers for the networks - with tf.name_scope('DDPG'): - with tf.name_scope('Q_Net'): - q_net = QNetwork(env.observation_space, env.action_space, num_hidden_layer * [hidden_dim]) - with tf.name_scope('Target_Q_Net'): - target_q_net = QNetwork(env.observation_space, env.action_space, num_hidden_layer * [hidden_dim]) - with tf.name_scope('Policy'): - policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, - num_hidden_layer * [hidden_dim]) - with tf.name_scope('Target_Policy'): - target_policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, - num_hidden_layer * [hidden_dim]) - - net_list = [q_net, target_q_net, policy_net, target_policy_net] - alg_params['net_list'] = net_list - - if alg_params.get('optimizers_list') is None: - actor_lr = 1e-3 - critic_lr = 2e-3 - optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict( - train_episodes=100, - test_episodes=10, - max_steps=200, - save_interval=10, - explore_steps=500, - batch_size=32, - gamma=0.9, - noise_scale=1., - noise_scale_decay=0.995 - ) - - return alg_params, learn_params - - -def mujoco(env, default_seed=True): - if default_seed: - # reproducible - seed = 2 - set_seed(seed, env) - - alg_params = dict( - replay_buffer_size=10000, - tau=0.01, - ) - - if alg_params.get('net_list') is None: - num_hidden_layer = 2 # number of hidden layers for the networks - hidden_dim = 64 # dimension of hidden layers for the networks - with tf.name_scope('DDPG'): - with tf.name_scope('Q_Net'): - q_net = QNetwork(env.observation_space, env.action_space, num_hidden_layer * [hidden_dim]) - with tf.name_scope('Target_Q_Net'): - target_q_net = QNetwork(env.observation_space, env.action_space, num_hidden_layer * [hidden_dim]) - with tf.name_scope('Policy'): - policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, - 
num_hidden_layer * [hidden_dim]) - with tf.name_scope('Target_Policy'): - target_policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, - num_hidden_layer * [hidden_dim]) - - net_list = [q_net, target_q_net, policy_net, target_policy_net] - alg_params['net_list'] = net_list - - if alg_params.get('optimizers_list') is None: - actor_lr = 1e-3 - critic_lr = 2e-3 - optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict( - train_episodes=100, - test_episodes=10, - max_steps=200, - save_interval=10, - explore_steps=500, - batch_size=32, - gamma=0.9, - noise_scale=1., - noise_scale_decay=0.995 - ) - - return alg_params, learn_params - - -def robotics(env, default_seed=True): - if default_seed: - # reproducible - seed = 2 - set_seed(seed, env) - - alg_params = dict( - replay_buffer_size=10000, - tau=0.01, - ) - - if alg_params.get('net_list') is None: - num_hidden_layer = 2 # number of hidden layers for the networks - hidden_dim = 64 # dimension of hidden layers for the networks - with tf.name_scope('DDPG'): - with tf.name_scope('Q_Net'): - q_net = QNetwork(env.observation_space, env.action_space, num_hidden_layer * [hidden_dim]) - with tf.name_scope('Target_Q_Net'): - target_q_net = QNetwork(env.observation_space, env.action_space, num_hidden_layer * [hidden_dim]) - with tf.name_scope('Policy'): - policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, - num_hidden_layer * [hidden_dim]) - with tf.name_scope('Target_Policy'): - target_policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, - num_hidden_layer * [hidden_dim]) - - net_list = [q_net, target_q_net, policy_net, target_policy_net] - alg_params['net_list'] = net_list - - if alg_params.get('optimizers_list') is None: - actor_lr = 1e-3 - critic_lr = 2e-3 - optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict( - train_episodes=100, - test_episodes=10, - max_steps=200, - save_interval=10, - explore_steps=500, - batch_size=32, - gamma=0.9, - noise_scale=1., - noise_scale_decay=0.995 - ) - - return alg_params, learn_params - - -def dm_control(env, default_seed=True): - if default_seed: - # reproducible - seed = 2 - set_seed(seed, env) - - alg_params = dict( - replay_buffer_size=10000, - tau=0.01, - ) - - if alg_params.get('net_list') is None: - num_hidden_layer = 2 # number of hidden layers for the networks - hidden_dim = 64 # dimension of hidden layers for the networks - with tf.name_scope('DDPG'): - with tf.name_scope('Q_Net'): - q_net = QNetwork(env.observation_space, env.action_space, num_hidden_layer * [hidden_dim]) - with tf.name_scope('Target_Q_Net'): - target_q_net = QNetwork(env.observation_space, env.action_space, num_hidden_layer * [hidden_dim]) - with tf.name_scope('Policy'): - policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, - num_hidden_layer * [hidden_dim]) - with tf.name_scope('Target_Policy'): - target_policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, - num_hidden_layer * [hidden_dim]) - - net_list = [q_net, target_q_net, policy_net, target_policy_net] - alg_params['net_list'] = net_list - - if alg_params.get('optimizers_list') is None: - actor_lr = 1e-3 - critic_lr = 2e-3 - optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)] - alg_params['optimizers_list'] = 
optimizers_list - - learn_params = dict( - train_episodes=100, - test_episodes=10, - max_steps=200, - save_interval=10, - explore_steps=500, - batch_size=32, - gamma=0.9, - noise_scale=1., - noise_scale_decay=0.995 - ) - - return alg_params, learn_params - - -def rlbench(env, default_seed=True): - if default_seed: - # reproducible - seed = 2 - set_seed(seed, env) - - alg_params = dict( - replay_buffer_size=1000, - tau=0.01, - ) - - if alg_params.get('net_list') is None: - num_hidden_layer = 2 # number of hidden layers for the networks - hidden_dim = 64 # dimension of hidden layers for the networks - with tf.name_scope('DDPG'): - with tf.name_scope('Q_Net'): - q_net = QNetwork(env.observation_space, env.action_space, num_hidden_layer * [hidden_dim]) - with tf.name_scope('Target_Q_Net'): - target_q_net = QNetwork(env.observation_space, env.action_space, num_hidden_layer * [hidden_dim]) - with tf.name_scope('Policy'): - policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, - num_hidden_layer * [hidden_dim]) - with tf.name_scope('Target_Policy'): - target_policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, - num_hidden_layer * [hidden_dim]) - - net_list = [q_net, target_q_net, policy_net, target_policy_net] - alg_params['net_list'] = net_list - - if alg_params.get('optimizers_list') is None: - actor_lr = 1e-3 - critic_lr = 2e-3 - optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict( - train_episodes=100, - test_episodes=10, - max_steps=200, - save_interval=10, - explore_steps=500, - batch_size=32, - gamma=0.9, - noise_scale=1., - noise_scale_decay=0.995 - ) - +from rlzoo.common.policy_networks import * +from rlzoo.common.value_networks import * +from rlzoo.common.utils import set_seed + +""" +full list of algorithm parameters (alg_params) +----------------------------------------------- +net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization +optimizers_list: a list of optimizers for all networks and differentiable variables +replay_buffer_size: the size of buffer for storing explored samples +tau: soft update factor +----------------------------------------------- + +full list of learning parameters (learn_params) +----------------------------------------------- +train_episodes: total number of episodes for training +test_episodes: total number of episodes for testing +max_steps: maximum number of steps for one episode +save_interval: time steps for saving +explore_steps: for random action sampling in the beginning of training +mode: train or test mode +render: render each step +batch_size: update batch size +gamma: reward decay factor +noise_scale: range of action noise for exploration +noise_scale_decay: noise scale decay factor +----------------------------------------------- +""" + +def classic_control(env, default_seed=True): + if default_seed: + # reproducible + seed = 2 + set_seed(seed, env) + + alg_params = dict( + replay_buffer_size=10000, + tau=0.01, + ) + + if alg_params.get('net_list') is None: + num_hidden_layer = 2 # number of hidden layers for the networks + hidden_dim = 64 # dimension of hidden layers for the networks + with tf.name_scope('DDPG'): + with tf.name_scope('Q_Net'): + q_net = QNetwork(env.observation_space, env.action_space, num_hidden_layer * [hidden_dim]) + with tf.name_scope('Target_Q_Net'): + target_q_net = QNetwork(env.observation_space, 
env.action_space, num_hidden_layer * [hidden_dim]) + with tf.name_scope('Policy'): + policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, + num_hidden_layer * [hidden_dim]) + with tf.name_scope('Target_Policy'): + target_policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, + num_hidden_layer * [hidden_dim]) + + net_list = [q_net, target_q_net, policy_net, target_policy_net] + alg_params['net_list'] = net_list + + if alg_params.get('optimizers_list') is None: + actor_lr = 1e-3 + critic_lr = 2e-3 + optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict( + train_episodes=100, + test_episodes=10, + max_steps=200, + save_interval=10, + explore_steps=500, + batch_size=32, + gamma=0.9, + noise_scale=1., + noise_scale_decay=0.995 + ) + + return alg_params, learn_params + + +def box2d(env, default_seed=True): + if default_seed: + # reproducible + seed = 2 + set_seed(seed, env) + + alg_params = dict( + replay_buffer_size=10000, + tau=0.01, + ) + + if alg_params.get('net_list') is None: + num_hidden_layer = 2 # number of hidden layers for the networks + hidden_dim = 64 # dimension of hidden layers for the networks + with tf.name_scope('DDPG'): + with tf.name_scope('Q_Net'): + q_net = QNetwork(env.observation_space, env.action_space, num_hidden_layer * [hidden_dim]) + with tf.name_scope('Target_Q_Net'): + target_q_net = QNetwork(env.observation_space, env.action_space, num_hidden_layer * [hidden_dim]) + with tf.name_scope('Policy'): + policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, + num_hidden_layer * [hidden_dim]) + with tf.name_scope('Target_Policy'): + target_policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, + num_hidden_layer * [hidden_dim]) + + net_list = [q_net, target_q_net, policy_net, target_policy_net] + alg_params['net_list'] = net_list + + if alg_params.get('optimizers_list') is None: + actor_lr = 1e-3 + critic_lr = 2e-3 + optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict( + train_episodes=100, + test_episodes=10, + max_steps=200, + save_interval=10, + explore_steps=500, + batch_size=32, + gamma=0.9, + noise_scale=1., + noise_scale_decay=0.995 + ) + + return alg_params, learn_params + + +def mujoco(env, default_seed=True): + if default_seed: + # reproducible + seed = 2 + set_seed(seed, env) + + alg_params = dict( + replay_buffer_size=10000, + tau=0.01, + ) + + if alg_params.get('net_list') is None: + num_hidden_layer = 2 # number of hidden layers for the networks + hidden_dim = 64 # dimension of hidden layers for the networks + with tf.name_scope('DDPG'): + with tf.name_scope('Q_Net'): + q_net = QNetwork(env.observation_space, env.action_space, num_hidden_layer * [hidden_dim]) + with tf.name_scope('Target_Q_Net'): + target_q_net = QNetwork(env.observation_space, env.action_space, num_hidden_layer * [hidden_dim]) + with tf.name_scope('Policy'): + policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, + num_hidden_layer * [hidden_dim]) + with tf.name_scope('Target_Policy'): + target_policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, + num_hidden_layer * [hidden_dim]) + + net_list = [q_net, target_q_net, policy_net, target_policy_net] + alg_params['net_list'] = net_list + + if alg_params.get('optimizers_list') is None: 
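The per-environment defaults in `rlzoo/algorithms/ddpg/default.py` all follow the same recipe: build a Q-network, a deterministic policy network and their target copies, attach Adam optimizers, and return `(alg_params, learn_params)`. A minimal sketch of how such a default might be consumed, assuming `build_env` from `rlzoo.common.env_wrappers` and that the `DDPG` constructor and `learn()` accept these dictionaries as keyword arguments:

```python
from rlzoo.common.env_wrappers import build_env
from rlzoo.algorithms.ddpg.ddpg import DDPG
from rlzoo.algorithms.ddpg.default import classic_control

env = build_env('Pendulum-v0', 'classic_control')      # a continuous-action task
alg_params, learn_params = classic_control(env)         # networks and optimizers are filled in by the default
model = DDPG(**alg_params)                               # net_list, optimizers_list, replay_buffer_size, tau
model.learn(env, mode='train', render=False, **learn_params)
model.learn(env, mode='test', render=True, **learn_params)
```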
+ actor_lr = 1e-3 + critic_lr = 2e-3 + optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict( + train_episodes=100, + test_episodes=10, + max_steps=200, + save_interval=10, + explore_steps=500, + batch_size=32, + gamma=0.9, + noise_scale=1., + noise_scale_decay=0.995 + ) + + return alg_params, learn_params + + +def robotics(env, default_seed=True): + if default_seed: + # reproducible + seed = 2 + set_seed(seed, env) + + alg_params = dict( + replay_buffer_size=10000, + tau=0.01, + ) + + if alg_params.get('net_list') is None: + num_hidden_layer = 2 # number of hidden layers for the networks + hidden_dim = 64 # dimension of hidden layers for the networks + with tf.name_scope('DDPG'): + with tf.name_scope('Q_Net'): + q_net = QNetwork(env.observation_space, env.action_space, num_hidden_layer * [hidden_dim]) + with tf.name_scope('Target_Q_Net'): + target_q_net = QNetwork(env.observation_space, env.action_space, num_hidden_layer * [hidden_dim]) + with tf.name_scope('Policy'): + policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, + num_hidden_layer * [hidden_dim]) + with tf.name_scope('Target_Policy'): + target_policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, + num_hidden_layer * [hidden_dim]) + + net_list = [q_net, target_q_net, policy_net, target_policy_net] + alg_params['net_list'] = net_list + + if alg_params.get('optimizers_list') is None: + actor_lr = 1e-3 + critic_lr = 2e-3 + optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict( + train_episodes=100, + test_episodes=10, + max_steps=200, + save_interval=10, + explore_steps=500, + batch_size=32, + gamma=0.9, + noise_scale=1., + noise_scale_decay=0.995 + ) + + return alg_params, learn_params + + +def dm_control(env, default_seed=True): + if default_seed: + # reproducible + seed = 2 + set_seed(seed, env) + + alg_params = dict( + replay_buffer_size=10000, + tau=0.01, + ) + + if alg_params.get('net_list') is None: + num_hidden_layer = 2 # number of hidden layers for the networks + hidden_dim = 64 # dimension of hidden layers for the networks + with tf.name_scope('DDPG'): + with tf.name_scope('Q_Net'): + q_net = QNetwork(env.observation_space, env.action_space, num_hidden_layer * [hidden_dim]) + with tf.name_scope('Target_Q_Net'): + target_q_net = QNetwork(env.observation_space, env.action_space, num_hidden_layer * [hidden_dim]) + with tf.name_scope('Policy'): + policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, + num_hidden_layer * [hidden_dim]) + with tf.name_scope('Target_Policy'): + target_policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, + num_hidden_layer * [hidden_dim]) + + net_list = [q_net, target_q_net, policy_net, target_policy_net] + alg_params['net_list'] = net_list + + if alg_params.get('optimizers_list') is None: + actor_lr = 1e-3 + critic_lr = 2e-3 + optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict( + train_episodes=100, + test_episodes=10, + max_steps=200, + save_interval=10, + explore_steps=500, + batch_size=32, + gamma=0.9, + noise_scale=1., + noise_scale_decay=0.995 + ) + + return alg_params, learn_params + + +def rlbench(env, default_seed=True): + if default_seed: + # reproducible + seed = 2 + 
set_seed(seed, env) + + alg_params = dict( + replay_buffer_size=1000, + tau=0.01, + ) + + if alg_params.get('net_list') is None: + num_hidden_layer = 2 # number of hidden layers for the networks + hidden_dim = 64 # dimension of hidden layers for the networks + with tf.name_scope('DDPG'): + with tf.name_scope('Q_Net'): + q_net = QNetwork(env.observation_space, env.action_space, num_hidden_layer * [hidden_dim]) + with tf.name_scope('Target_Q_Net'): + target_q_net = QNetwork(env.observation_space, env.action_space, num_hidden_layer * [hidden_dim]) + with tf.name_scope('Policy'): + policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, + num_hidden_layer * [hidden_dim]) + with tf.name_scope('Target_Policy'): + target_policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, + num_hidden_layer * [hidden_dim]) + + net_list = [q_net, target_q_net, policy_net, target_policy_net] + alg_params['net_list'] = net_list + + if alg_params.get('optimizers_list') is None: + actor_lr = 1e-3 + critic_lr = 2e-3 + optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict( + train_episodes=100, + test_episodes=10, + max_steps=200, + save_interval=10, + explore_steps=500, + batch_size=32, + gamma=0.9, + noise_scale=1., + noise_scale_decay=0.995 + ) + return alg_params, learn_params \ No newline at end of file diff --git a/rlzoo/algorithms/ddpg/run_ddpg.py b/rlzoo/algorithms/ddpg/run_ddpg.py old mode 100644 new mode 100755 index bc292dc..8723466 --- a/rlzoo/algorithms/ddpg/run_ddpg.py +++ b/rlzoo/algorithms/ddpg/run_ddpg.py @@ -1,66 +1,66 @@ -from rlzoo.common.utils import make_env, set_seed -from rlzoo.algorithms.ddpg.ddpg import DDPG -from rlzoo.common.policy_networks import * -from rlzoo.common.value_networks import * -import gym - -""" load environment """ -env = gym.make('Pendulum-v0').unwrapped - -obs_space = env.observation_space -act_space = env.action_space - -# reproducible -seed = 2 -set_seed(seed, env) - -""" build networks for the algorithm """ -name = 'DDPG' -num_hidden_layer = 2 # number of hidden layers for the networks -hidden_dim = 64 # dimension of hidden layers for the networks - -actor = DeterministicPolicyNetwork(obs_space, act_space, [hidden_dim] * num_hidden_layer) -critic = QNetwork(obs_space, act_space, [hidden_dim] * num_hidden_layer) - -actor_target = DeterministicPolicyNetwork(obs_space, act_space, [hidden_dim] * num_hidden_layer, trainable=False) - -critic_target = QNetwork(obs_space, act_space, [hidden_dim] * num_hidden_layer, trainable=False) - -net_list = [critic, critic_target, actor, actor_target] - -""" create model """ -actor_lr = 1e-3 -critic_lr = 2e-3 -optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)] -replay_buffer_size = 10000 -model = DDPG(net_list, optimizers_list, replay_buffer_size) -""" -full list of arguments for the algorithm ----------------------------------------- -net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization -optimizers_list: a list of optimizers for all networks and differentiable variables -replay_buffer_size: the size of buffer for storing explored samples -tau: soft update factor -""" - -model.learn(env, train_episodes=100, max_steps=200, save_interval=10, - mode='train', render=False, batch_size=32, gamma=0.9, noise_scale=1., noise_scale_decay=0.995) -""" -full list of parameters for training 
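`tau` in these defaults is the soft-update factor for the target networks: DDPG's `ema_update` blends the online weights into the target weights after each update. Roughly, this is the standard Polyak averaging rule (a sketch, not the exact `ema_update` implementation):

```python
import tensorflow as tf

def soft_update(target_vars, source_vars, tau=0.01):
    """Polyak/EMA target update: target <- (1 - tau) * target + tau * source."""
    for t, s in zip(target_vars, source_vars):
        t.assign((1.0 - tau) * t + tau * s)

# e.g. soft_update(critic_target.weights, critic.weights, tau=0.01) after each actor/critic step
```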
---------------------------------------- -env: learning environment -train_episodes: total number of episodes for training -test_episodes: total number of episodes for testing -max_steps: maximum number of steps for one episode -save_interval: time steps for saving -explore_steps: for random action sampling in the beginning of training -mode: train or test mode -render: render each step -batch_size: update batch size -gamma: reward decay factor -noise_scale: range of action noise for exploration -noise_scale_decay: noise scale decay factor -""" - -model.learn(env, test_episodes=10, max_steps=200, mode='test', render=True) - +from rlzoo.common.utils import make_env, set_seed +from rlzoo.algorithms.ddpg.ddpg import DDPG +from rlzoo.common.policy_networks import * +from rlzoo.common.value_networks import * +import gym + +""" load environment """ +env = gym.make('Pendulum-v0').unwrapped + +obs_space = env.observation_space +act_space = env.action_space + +# reproducible +seed = 2 +set_seed(seed, env) + +""" build networks for the algorithm """ +name = 'DDPG' +num_hidden_layer = 2 # number of hidden layers for the networks +hidden_dim = 64 # dimension of hidden layers for the networks + +actor = DeterministicPolicyNetwork(obs_space, act_space, [hidden_dim] * num_hidden_layer) +critic = QNetwork(obs_space, act_space, [hidden_dim] * num_hidden_layer) + +actor_target = DeterministicPolicyNetwork(obs_space, act_space, [hidden_dim] * num_hidden_layer, trainable=False) + +critic_target = QNetwork(obs_space, act_space, [hidden_dim] * num_hidden_layer, trainable=False) + +net_list = [critic, critic_target, actor, actor_target] + +""" create model """ +actor_lr = 1e-3 +critic_lr = 2e-3 +optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)] +replay_buffer_size = 10000 +model = DDPG(net_list, optimizers_list, replay_buffer_size) +""" +full list of arguments for the algorithm +---------------------------------------- +net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization +optimizers_list: a list of optimizers for all networks and differentiable variables +replay_buffer_size: the size of buffer for storing explored samples +tau: soft update factor +""" + +model.learn(env, train_episodes=100, max_steps=200, save_interval=10, + mode='train', render=False, batch_size=32, gamma=0.9, noise_scale=1., noise_scale_decay=0.995) +""" +full list of parameters for training +--------------------------------------- +env: learning environment +train_episodes: total number of episodes for training +test_episodes: total number of episodes for testing +max_steps: maximum number of steps for one episode +save_interval: time steps for saving +explore_steps: for random action sampling in the beginning of training +mode: train or test mode +render: render each step +batch_size: update batch size +gamma: reward decay factor +noise_scale: range of action noise for exploration +noise_scale_decay: noise scale decay factor +""" + +model.learn(env, test_episodes=10, max_steps=200, mode='test', render=True) + diff --git a/rlzoo/algorithms/dppo/__init__.py b/rlzoo/algorithms/dppo/__init__.py old mode 100644 new mode 100755 diff --git a/rlzoo/algorithms/dppo/default.py b/rlzoo/algorithms/dppo/default.py old mode 100644 new mode 100755 index 01a79d8..7db1693 --- a/rlzoo/algorithms/dppo/default.py +++ b/rlzoo/algorithms/dppo/default.py @@ -1,334 +1,334 @@ -from rlzoo.common.policy_networks import * -from rlzoo.common.value_networks import * -from 
rlzoo.common.utils import set_seed - -""" -full list of algorithm parameters (alg_params) ------------------------------------------------ -net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization -optimizers_list: a list of optimizers for all networks and differentiable variables -epsilon: clip parameter (for method 'clip') -kl_target: controls bounds of policy update and adaptive lambda (for method 'penalty') -lam: KL-regularization coefficient (for method 'penalty') ------------------------------------------------ - -full list of learning parameters (learn_params) ------------------------------------------------ -train_episodes: total number of episodes for training -test_episodes: total number of episodes for testing -max_steps: maximum number of steps for one episode -save_interval: time steps for saving -gamma: reward discount factor -mode: train or test -batch_size: update batch size -a_update_steps: actor update iteration steps -c_update_steps: critic update iteration steps -n_worker: number of workers ------------------------------------------------ -""" - - -def atari(env, default_seed=True): - if default_seed: - assert isinstance(env, list) - seed = np.arange(len(env)).tolist() # a list of seeds for each env - set_seed(seed, env) # reproducible - - # for multi-threading - if isinstance(env, list): # judge if multiple envs are passed in for parallel computing - num_env = len(env) # number of envs passed in - env = env[0] # take one of the env as they are all the same - else: - num_env = 1 - - alg_params = dict(method='clip', # method can be clip or penalty - epsilon=0.2, # for method 'clip' - kl_target=0.01, # for method 'penalty' - lam=0.5 # for method 'penalty' - ) - - if alg_params.get('net_list') is None: - num_hidden_layer = 2 # number of hidden layers for the networks - hidden_dim = 64 # dimension of hidden layers for the networks - with tf.name_scope('DPPO'): - with tf.name_scope('V_Net'): - v_net = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer) - with tf.name_scope('Policy'): - policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, - [hidden_dim] * num_hidden_layer) - - net_list = v_net, policy_net - alg_params['net_list'] = net_list - - if alg_params.get('optimizers_list') is None: - actor_lr = 1e-4 - critic_lr = 2e-4 - optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict(train_episodes=1000, - test_episodes=100, - max_steps=200, - save_interval=50, - gamma=0.9, - a_update_steps=10, - c_update_steps=10, - n_workers=num_env, - batch_size=32) - - return alg_params, learn_params - - -def classic_control(env, default_seed=True): - if default_seed: - assert isinstance(env, list) - seed = np.arange(len(env)).tolist() # a list of seeds for each env - set_seed(seed, env) # reproducible - - # for multi-threading - if isinstance(env, list): # judge if multiple envs are passed in for parallel computing - num_env = len(env) # number of envs passed in - env = env[0] # take one of the env as they are all the same - else: - num_env = 1 - - alg_params = dict(method='clip', # method can be clip or penalty - epsilon=0.2, # for method 'clip' - kl_target=0.01, # for method 'penalty' - lam=0.5 # for method 'penalty' - ) - - if alg_params.get('net_list') is None: - num_hidden_layer = 2 # number of hidden layers for the networks - hidden_dim = 64 # dimension of hidden layers for the networks - with 
tf.name_scope('DPPO'): - with tf.name_scope('V_Net'): - v_net = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer) - with tf.name_scope('Policy'): - policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, - [hidden_dim] * num_hidden_layer) - - net_list = v_net, policy_net - alg_params['net_list'] = net_list - - if alg_params.get('optimizers_list') is None: - actor_lr = 1e-4 - critic_lr = 2e-4 - optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict(train_episodes=1000, - test_episodes=100, - max_steps=200, - save_interval=50, - gamma=0.9, - a_update_steps=10, - c_update_steps=10, - n_workers=num_env, - batch_size=32) - - return alg_params, learn_params - - -def box2d(env, default_seed=True): - if default_seed: - assert isinstance(env, list) - seed = np.arange(len(env)).tolist() # a list of seeds for each env - set_seed(seed, env) # reproducible - - # for multi-threading - if isinstance(env, list): # judge if multiple envs are passed in for parallel computing - num_env = len(env) # number of envs passed in - env = env[0] # take one of the env as they are all the same - else: - num_env = 1 - - alg_params = dict(method='clip', # method can be clip or penalty - epsilon=0.2, # for method 'clip' - kl_target=0.01, # for method 'penalty' - lam=0.5 # for method 'penalty' - ) - - if alg_params.get('net_list') is None: - num_hidden_layer = 2 # number of hidden layers for the networks - hidden_dim = 64 # dimension of hidden layers for the networks - with tf.name_scope('DPPO'): - with tf.name_scope('V_Net'): - v_net = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer) - with tf.name_scope('Policy'): - policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, - [hidden_dim] * num_hidden_layer) - - net_list = v_net, policy_net - alg_params['net_list'] = net_list - - if alg_params.get('optimizers_list') is None: - actor_lr = 1e-4 - critic_lr = 2e-4 - optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict(train_episodes=1000, - test_episodes=100, - max_steps=200, - save_interval=50, - gamma=0.9, - a_update_steps=10, - c_update_steps=10, - n_workers=num_env, - batch_size=32) - - return alg_params, learn_params - - -def mujoco(env, default_seed=True): - if default_seed: - assert isinstance(env, list) - seed = np.arange(len(env)).tolist() # a list of seeds for each env - set_seed(seed, env) # reproducible - - # for multi-threading - if isinstance(env, list): # judge if multiple envs are passed in for parallel computing - num_env = len(env) # number of envs passed in - env = env[0] # take one of the env as they are all the same - else: - num_env = 1 - - alg_params = dict(method='clip', # method can be clip or penalty - epsilon=0.2, # for method 'clip' - kl_target=0.01, # for method 'penalty' - lam=0.5 # for method 'penalty' - ) - - if alg_params.get('net_list') is None: - num_hidden_layer = 2 # number of hidden layers for the networks - hidden_dim = 64 # dimension of hidden layers for the networks - with tf.name_scope('DPPO'): - with tf.name_scope('V_Net'): - v_net = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer) - with tf.name_scope('Policy'): - policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, - [hidden_dim] * num_hidden_layer) - - net_list = v_net, policy_net - alg_params['net_list'] = 
net_list - - if alg_params.get('optimizers_list') is None: - actor_lr = 1e-4 - critic_lr = 2e-4 - optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict(train_episodes=1000, - test_episodes=100, - max_steps=200, - save_interval=50, - gamma=0.9, - a_update_steps=10, - c_update_steps=10, - n_workers=num_env, - batch_size=32) - - return alg_params, learn_params - - -def robotics(env, default_seed=True): - if default_seed: - assert isinstance(env, list) - seed = np.arange(len(env)).tolist() # a list of seeds for each env - set_seed(seed, env) # reproducible - - # for multi-threading - if isinstance(env, list): # judge if multiple envs are passed in for parallel computing - num_env = len(env) # number of envs passed in - env = env[0] # take one of the env as they are all the same - else: - num_env = 1 - - alg_params = dict(method='clip', # method can be clip or penalty - epsilon=0.2, # for method 'clip' - kl_target=0.01, # for method 'penalty' - lam=0.5 # for method 'penalty' - ) - - if alg_params.get('net_list') is None: - num_hidden_layer = 2 # number of hidden layers for the networks - hidden_dim = 64 # dimension of hidden layers for the networks - with tf.name_scope('DPPO'): - with tf.name_scope('V_Net'): - v_net = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer) - with tf.name_scope('Policy'): - policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, - [hidden_dim] * num_hidden_layer) - - net_list = v_net, policy_net - alg_params['net_list'] = net_list - - if alg_params.get('optimizers_list') is None: - actor_lr = 1e-4 - critic_lr = 2e-4 - optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict(train_episodes=1000, - test_episodes=100, - max_steps=200, - save_interval=50, - gamma=0.9, - a_update_steps=10, - c_update_steps=10, - n_workers=num_env, - batch_size=32) - - return alg_params, learn_params - - -def dm_control(env, default_seed=True): - if default_seed: - assert isinstance(env, list) - seed = np.arange(len(env)).tolist() # a list of seeds for each env - set_seed(seed, env) # reproducible - - # for multi-threading - if isinstance(env, list): # judge if multiple envs are passed in for parallel computing - num_env = len(env) # number of envs passed in - env = env[0] # take one of the env as they are all the same - else: - num_env = 1 - - alg_params = dict(method='clip', # method can be clip or penalty - epsilon=0.2, # for method 'clip' - kl_target=0.01, # for method 'penalty' - lam=0.5 # for method 'penalty' - ) - - if alg_params.get('net_list') is None: - num_hidden_layer = 2 # number of hidden layers for the networks - hidden_dim = 64 # dimension of hidden layers for the networks - with tf.name_scope('DPPO'): - with tf.name_scope('V_Net'): - v_net = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer) - with tf.name_scope('Policy'): - policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, - [hidden_dim] * num_hidden_layer) - - net_list = v_net, policy_net - alg_params['net_list'] = net_list - - if alg_params.get('optimizers_list') is None: - actor_lr = 1e-4 - critic_lr = 2e-4 - optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict(train_episodes=1000, - test_episodes=100, - max_steps=200, - save_interval=50, - 
gamma=0.9, - a_update_steps=10, - c_update_steps=10, - n_workers=num_env, - batch_size=32) - - return alg_params, learn_params +from rlzoo.common.policy_networks import * +from rlzoo.common.value_networks import * +from rlzoo.common.utils import set_seed + +""" +full list of algorithm parameters (alg_params) +----------------------------------------------- +net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization +optimizers_list: a list of optimizers for all networks and differentiable variables +epsilon: clip parameter (for method 'clip') +kl_target: controls bounds of policy update and adaptive lambda (for method 'penalty') +lam: KL-regularization coefficient (for method 'penalty') +----------------------------------------------- + +full list of learning parameters (learn_params) +----------------------------------------------- +train_episodes: total number of episodes for training +test_episodes: total number of episodes for testing +max_steps: maximum number of steps for one episode +save_interval: time steps for saving +gamma: reward discount factor +mode: train or test +batch_size: update batch size +a_update_steps: actor update iteration steps +c_update_steps: critic update iteration steps +n_worker: number of workers +----------------------------------------------- +""" + + +def atari(env, default_seed=True): + if default_seed: + assert isinstance(env, list) + seed = np.arange(len(env)).tolist() # a list of seeds for each env + set_seed(seed, env) # reproducible + + # for multi-threading + if isinstance(env, list): # judge if multiple envs are passed in for parallel computing + num_env = len(env) # number of envs passed in + env = env[0] # take one of the env as they are all the same + else: + num_env = 1 + + alg_params = dict(method='clip', # method can be clip or penalty + epsilon=0.2, # for method 'clip' + kl_target=0.01, # for method 'penalty' + lam=0.5 # for method 'penalty' + ) + + if alg_params.get('net_list') is None: + num_hidden_layer = 2 # number of hidden layers for the networks + hidden_dim = 64 # dimension of hidden layers for the networks + with tf.name_scope('DPPO'): + with tf.name_scope('V_Net'): + v_net = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer) + with tf.name_scope('Policy'): + policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, + [hidden_dim] * num_hidden_layer) + + net_list = v_net, policy_net + alg_params['net_list'] = net_list + + if alg_params.get('optimizers_list') is None: + actor_lr = 1e-4 + critic_lr = 2e-4 + optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict(train_episodes=1000, + test_episodes=100, + max_steps=200, + save_interval=50, + gamma=0.9, + a_update_steps=10, + c_update_steps=10, + n_workers=num_env, + batch_size=32) + + return alg_params, learn_params + + +def classic_control(env, default_seed=True): + if default_seed: + assert isinstance(env, list) + seed = np.arange(len(env)).tolist() # a list of seeds for each env + set_seed(seed, env) # reproducible + + # for multi-threading + if isinstance(env, list): # judge if multiple envs are passed in for parallel computing + num_env = len(env) # number of envs passed in + env = env[0] # take one of the env as they are all the same + else: + num_env = 1 + + alg_params = dict(method='clip', # method can be clip or penalty + epsilon=0.2, # for method 'clip' + kl_target=0.01, # for method 
'penalty' + lam=0.5 # for method 'penalty' + ) + + if alg_params.get('net_list') is None: + num_hidden_layer = 2 # number of hidden layers for the networks + hidden_dim = 64 # dimension of hidden layers for the networks + with tf.name_scope('DPPO'): + with tf.name_scope('V_Net'): + v_net = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer) + with tf.name_scope('Policy'): + policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, + [hidden_dim] * num_hidden_layer) + + net_list = v_net, policy_net + alg_params['net_list'] = net_list + + if alg_params.get('optimizers_list') is None: + actor_lr = 1e-4 + critic_lr = 2e-4 + optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict(train_episodes=1000, + test_episodes=100, + max_steps=200, + save_interval=50, + gamma=0.9, + a_update_steps=10, + c_update_steps=10, + n_workers=num_env, + batch_size=32) + + return alg_params, learn_params + + +def box2d(env, default_seed=True): + if default_seed: + assert isinstance(env, list) + seed = np.arange(len(env)).tolist() # a list of seeds for each env + set_seed(seed, env) # reproducible + + # for multi-threading + if isinstance(env, list): # judge if multiple envs are passed in for parallel computing + num_env = len(env) # number of envs passed in + env = env[0] # take one of the env as they are all the same + else: + num_env = 1 + + alg_params = dict(method='clip', # method can be clip or penalty + epsilon=0.2, # for method 'clip' + kl_target=0.01, # for method 'penalty' + lam=0.5 # for method 'penalty' + ) + + if alg_params.get('net_list') is None: + num_hidden_layer = 2 # number of hidden layers for the networks + hidden_dim = 64 # dimension of hidden layers for the networks + with tf.name_scope('DPPO'): + with tf.name_scope('V_Net'): + v_net = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer) + with tf.name_scope('Policy'): + policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, + [hidden_dim] * num_hidden_layer) + + net_list = v_net, policy_net + alg_params['net_list'] = net_list + + if alg_params.get('optimizers_list') is None: + actor_lr = 1e-4 + critic_lr = 2e-4 + optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict(train_episodes=1000, + test_episodes=100, + max_steps=200, + save_interval=50, + gamma=0.9, + a_update_steps=10, + c_update_steps=10, + n_workers=num_env, + batch_size=32) + + return alg_params, learn_params + + +def mujoco(env, default_seed=True): + if default_seed: + assert isinstance(env, list) + seed = np.arange(len(env)).tolist() # a list of seeds for each env + set_seed(seed, env) # reproducible + + # for multi-threading + if isinstance(env, list): # judge if multiple envs are passed in for parallel computing + num_env = len(env) # number of envs passed in + env = env[0] # take one of the env as they are all the same + else: + num_env = 1 + + alg_params = dict(method='clip', # method can be clip or penalty + epsilon=0.2, # for method 'clip' + kl_target=0.01, # for method 'penalty' + lam=0.5 # for method 'penalty' + ) + + if alg_params.get('net_list') is None: + num_hidden_layer = 2 # number of hidden layers for the networks + hidden_dim = 64 # dimension of hidden layers for the networks + with tf.name_scope('DPPO'): + with tf.name_scope('V_Net'): + v_net = ValueNetwork(env.observation_space, [hidden_dim] * 
num_hidden_layer) + with tf.name_scope('Policy'): + policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, + [hidden_dim] * num_hidden_layer) + + net_list = v_net, policy_net + alg_params['net_list'] = net_list + + if alg_params.get('optimizers_list') is None: + actor_lr = 1e-4 + critic_lr = 2e-4 + optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict(train_episodes=1000, + test_episodes=100, + max_steps=200, + save_interval=50, + gamma=0.9, + a_update_steps=10, + c_update_steps=10, + n_workers=num_env, + batch_size=32) + + return alg_params, learn_params + + +def robotics(env, default_seed=True): + if default_seed: + assert isinstance(env, list) + seed = np.arange(len(env)).tolist() # a list of seeds for each env + set_seed(seed, env) # reproducible + + # for multi-threading + if isinstance(env, list): # judge if multiple envs are passed in for parallel computing + num_env = len(env) # number of envs passed in + env = env[0] # take one of the env as they are all the same + else: + num_env = 1 + + alg_params = dict(method='clip', # method can be clip or penalty + epsilon=0.2, # for method 'clip' + kl_target=0.01, # for method 'penalty' + lam=0.5 # for method 'penalty' + ) + + if alg_params.get('net_list') is None: + num_hidden_layer = 2 # number of hidden layers for the networks + hidden_dim = 64 # dimension of hidden layers for the networks + with tf.name_scope('DPPO'): + with tf.name_scope('V_Net'): + v_net = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer) + with tf.name_scope('Policy'): + policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, + [hidden_dim] * num_hidden_layer) + + net_list = v_net, policy_net + alg_params['net_list'] = net_list + + if alg_params.get('optimizers_list') is None: + actor_lr = 1e-4 + critic_lr = 2e-4 + optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict(train_episodes=1000, + test_episodes=100, + max_steps=200, + save_interval=50, + gamma=0.9, + a_update_steps=10, + c_update_steps=10, + n_workers=num_env, + batch_size=32) + + return alg_params, learn_params + + +def dm_control(env, default_seed=True): + if default_seed: + assert isinstance(env, list) + seed = np.arange(len(env)).tolist() # a list of seeds for each env + set_seed(seed, env) # reproducible + + # for multi-threading + if isinstance(env, list): # judge if multiple envs are passed in for parallel computing + num_env = len(env) # number of envs passed in + env = env[0] # take one of the env as they are all the same + else: + num_env = 1 + + alg_params = dict(method='clip', # method can be clip or penalty + epsilon=0.2, # for method 'clip' + kl_target=0.01, # for method 'penalty' + lam=0.5 # for method 'penalty' + ) + + if alg_params.get('net_list') is None: + num_hidden_layer = 2 # number of hidden layers for the networks + hidden_dim = 64 # dimension of hidden layers for the networks + with tf.name_scope('DPPO'): + with tf.name_scope('V_Net'): + v_net = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer) + with tf.name_scope('Policy'): + policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, + [hidden_dim] * num_hidden_layer) + + net_list = v_net, policy_net + alg_params['net_list'] = net_list + + if alg_params.get('optimizers_list') is None: + actor_lr = 1e-4 + critic_lr = 2e-4 + 
optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict(train_episodes=1000, + test_episodes=100, + max_steps=200, + save_interval=50, + gamma=0.9, + a_update_steps=10, + c_update_steps=10, + n_workers=num_env, + batch_size=32) + + return alg_params, learn_params diff --git a/rlzoo/algorithms/dppo/dppo.py b/rlzoo/algorithms/dppo/dppo.py old mode 100644 new mode 100755 diff --git a/rlzoo/algorithms/dppo_clip/__init__.py b/rlzoo/algorithms/dppo_clip/__init__.py old mode 100644 new mode 100755 diff --git a/rlzoo/algorithms/dppo_clip/dppo_clip.py b/rlzoo/algorithms/dppo_clip/dppo_clip.py old mode 100644 new mode 100755 diff --git a/rlzoo/algorithms/dppo_clip/run_dppo_clip.py b/rlzoo/algorithms/dppo_clip/run_dppo_clip.py old mode 100644 new mode 100755 diff --git a/rlzoo/algorithms/dppo_clip_distributed/__init__.py b/rlzoo/algorithms/dppo_clip_distributed/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/rlzoo/algorithms/dppo_clip_distributed/dppo_clip.py b/rlzoo/algorithms/dppo_clip_distributed/dppo_clip.py new file mode 100755 index 0000000..343073c --- /dev/null +++ b/rlzoo/algorithms/dppo_clip_distributed/dppo_clip.py @@ -0,0 +1,279 @@ +from rlzoo.common.policy_networks import StochasticPolicyNetwork +from rlzoo.common.value_networks import ValueNetwork +from rlzoo.common.utils import * +import tensorflow as tf +import numpy as np +import copy +import pickle + + +def write_log(text: str): + pass + # print('infer server: '+text) + # with open('infer_server_log.txt', 'a') as f: + # f.write(str(text) + '\n') + + +EPS = 1e-8 + + +class RLAlgorithm: + def __init__(self): + self.state_buffer = [] # shape: (None, [n_env], [state_shape]) + self.action_buffer = [] + self.reward_buffer = [] + self.done_buffer = [] + self.next_state_buffer = [] + self.logp_buffer = [] + self.all_buffer = self.state_buffer, self.action_buffer, self.reward_buffer, self.done_buffer, \ + self.next_state_buffer, self.logp_buffer + self.traj_list = [] + self.gamma = 0.9 + self.name = 'NotNamed' + + @property + def all_weights(self): + raise NotImplementedError + + def update_model(self, params): + raise NotImplementedError + + def _get_value(self, batch_state): + """ + return: value: tf.Tensor + """ + raise NotImplementedError + + def _get_action(self, batch_state): + """ + return: action: tf.Tensor, log_p: tf.Tensor + """ + raise NotImplementedError + + @property + def logp_shape(self): + raise NotImplementedError + + def save_ckpt(self, env_name): + """ + save trained weights + + :return: None + """ + raise NotImplementedError + + def plot_save_log(self, running_reward, env_name): + plot_save_log(running_reward, algorithm_name=self.name, env_name=env_name) + + def collect_data(self, s, a, r, d, s_, log_p, batch_data=False): + if not batch_data: + s, a, r, d, s_, log_p = [s], [a], [r], [d], [s_], [log_p] + for i, data in enumerate([s, a, r, d, s_, log_p]): + self.all_buffer[i].append(data) + + def get_value(self, state, batch_data=False): + if not batch_data: + state = [state] + value = self._get_value(np.array(state)) + value_shape = np.shape(value) + value = tf.reshape(value, value_shape[:-1]) + return value + + def get_action(self, state, batch_data=False): + if not batch_data: + state = [state] + + state = np.array(state) + action, log_p = self._get_action(state) + action, log_p = action.numpy(), log_p.numpy() + action_shape = np.shape(action) + # 最后一维度是1 是batch但是len=1就不转, 是batch本来要转 + # 
不是batch时候len=1也要转 + if action_shape[-1] == 1 and batch_data ^ (len(action_shape) == 1): + # ((batch_data and not len(action_shape) == 1) or (not batch_data and len(action_shape) == 1)): + action = np.reshape(action, action_shape[:-1]) # 转换 + log_p = np.reshape(log_p, log_p.shape[:-1]) + return action, log_p + + # def _cal_discounted_r(self, state_list, reward_list, done_list, batch_data=False): + # discounted_r = [] + # for r in reward_list[::-1]: + # v_s_ = r + 0.9 * v_s_ + # discounted_r.append(v_s_) + + def _cal_discounted_r(self, next_state_list, reward_list, done_list, batch_data=False): + discounted_r = np.zeros_like(reward_list) # reward_buffer shape: [-1, n_env] + # done_list = np.array(done_list, dtype=np.int) + done_list = np.array(done_list) + v_s_ = self.get_value(next_state_list[-1], batch_data) * (1 - done_list[-1]) + for i in range(len(reward_list) - 1, -1, -1): + # discounted_r[i] = v_s_ = reward_list[i] + self.gamma * v_s_ + discounted_r[i] = v_s_ = reward_list[i] + (1 - done_list[i]) * self.gamma * v_s_ + return discounted_r + + def _cal_adv(self, state_list, reward_list, done_list, next_state_list, batch_data=False): + dc_r = self._cal_discounted_r(next_state_list, reward_list, done_list, batch_data) + # dc_r = np.array( + # [[6.5132155], [6.125795], [5.6953278], [5.217031], [4.68559], [4.0951], [3.439], [2.71], [1.9], [1.]]) + if batch_data: + s_shape = np.shape(self.state_buffer) # state_buffer shape: [-1, n_env, *obs_shape] + state_list = np.reshape(self.state_buffer, [-1, *s_shape[2:]]) + v = self.get_value(state_list, batch_data).numpy() + v = v.reshape(*s_shape[:2]) + else: + v = self.get_value(state_list, batch_data).numpy() + + dc_r = np.array(dc_r, dtype=np.float32) + advs = dc_r - v + # advs = (advs - np.mean(advs)) / (np.std(advs) + 1e-8) # norm all env data adv at the same time + return advs + + def _get_traj(self): + traj_list = [] + for element in [ + self.state_buffer, self.action_buffer, self.reward_buffer, self.done_buffer, self.next_state_buffer, + self._cal_adv(self.state_buffer, self.reward_buffer, self.done_buffer, self.next_state_buffer, True), + self.logp_buffer]: + axes = list(range(len(np.shape(element)))) + axes[0], axes[1] = 1, 0 + result = np.transpose(element, axes) + # print(result) + traj_list.append(result) + traj_list = list(zip(*traj_list)) # + return traj_list + + def update_traj_list(self): + self.traj_list.extend(self._get_traj()) + for buffer in self.all_buffer: + buffer.clear() + + +class DPPO_CLIP(RLAlgorithm): + def __init__(self, net_builder, opt_builder, n_step=100, gamma=0.9, epsilon=0.2): + super().__init__() + self.critic, self.actor = None, None + self.net_builder = net_builder + self.gamma = gamma + self.n_step = n_step + self._logp_shape = None + self.epsilon = epsilon + self.name = 'DPPO_CLIP' + self.acter_optimizer, self.critic_optimizer = opt_builder() + + def init_components(self): # todo init process should be placed + networks = self.net_builder() + assert len(networks) == 2 + self.critic, self.actor = networks + assert isinstance(self.critic, ValueNetwork) + assert isinstance(self.actor, StochasticPolicyNetwork) + + @property + def all_weights(self): + return self.critic.trainable_weights + self.actor.trainable_weights + + # api + def _get_action(self, state): + action = self.actor(state) + log_p = self.actor.policy_dist.logp(action) + return action, log_p + + def _get_value(self, state): + return self.critic(state) + + def save_ckpt(self, env_name): + """ + save trained weights + + :return: None + """ + 
save_model(self.actor, 'actor', self.name, env_name) + save_model(self.critic, 'critic', self.name, env_name) + + def load_ckpt(self, env_name): + """ + load trained weights + + :return: None + """ + load_model(self.actor, 'actor', self.name, env_name) + load_model(self.critic, 'critic', self.name, env_name) + + # api + def update_model(self, params): + for i, j in zip(self.all_weights, params): + i.assign(j) + for buffer in self.all_buffer: + buffer.clear() + + def a_train(self, s, a, adv, oldpi_logp): + oldpi_prob = tf.exp(oldpi_logp) + with tf.GradientTape() as tape: + _ = self.actor(s) + pi_prob = tf.exp(self.actor.policy_dist.logp(a)) + ratio = pi_prob / (oldpi_prob + EPS) + + surr = ratio * adv + aloss = -tf.reduce_mean( + tf.minimum(surr, tf.clip_by_value(ratio, 1. - self.epsilon, 1. + self.epsilon) * adv)) + a_gard = tape.gradient(aloss, self.actor.trainable_weights) + return a_gard + + def c_train(self, dc_r, s): + dc_r = np.array(dc_r, dtype=np.float32) + with tf.GradientTape() as tape: + v = self.critic(s) + advantage = dc_r - v + closs = tf.reduce_mean(tf.square(advantage)) + c_grad = tape.gradient(closs, self.critic.trainable_weights) + return c_grad + + def train(self, traj_list, dis_agent=None): + for traj in traj_list: + state_list, action_list, reward_list, done_list, next_state_list, adv_list, logp_list = traj + for _ in range(10): + a_grad = self.a_train(state_list, action_list, adv_list, logp_list) + if dis_agent: + a_grad = [dis_agent.role_all_reduce(grad) for grad in a_grad] + self.acter_optimizer.apply_gradients(zip(a_grad, self.actor.trainable_weights)) + + dc_r = self._cal_discounted_r(next_state_list, reward_list, done_list) + for _ in range(10): + c_grad = self.c_train(dc_r, state_list) + if dis_agent: + c_grad = [dis_agent.role_all_reduce(grad) for grad in c_grad] + self.critic_optimizer.apply_gradients(zip(c_grad, self.critic.trainable_weights)) + + +if __name__ == '__main__': + from rlzoo.distributed.training_components import net_builder, env_maker, opt_builder + from rlzoo.common.utils import set_seed + + env = env_maker() + # set_seed(1, env) + + agent = DPPO_CLIP(net_builder, opt_builder) + agent.init_components() + + running_reward = [] + curr_step, max_step, traj_len = 0, 500 * 200, 200 + s = env.reset() + d = False + cnt = 0 + while curr_step < max_step: + for _ in range(traj_len): + curr_step += 1 + a, logp = agent.get_action(s) + s_, r, d, _ = env.step(a) + agent.collect_data(s, a, r, d, s_, logp) + if d: + s = env.reset() + else: + s = s_ + agent.update_traj_list() + agent.train(agent.traj_list) + avg_eps_reward = min(sum(agent.traj_list[0][2]) / (sum(agent.traj_list[0][3] + 1e-10)), traj_len) + agent.traj_list.clear() + running_reward.append(avg_eps_reward) + cnt += 1 + print(cnt, curr_step, avg_eps_reward) + agent.plot_save_log(running_reward, env.spec.id) diff --git a/rlzoo/algorithms/dppo_penalty/__init__.py b/rlzoo/algorithms/dppo_penalty/__init__.py old mode 100644 new mode 100755 diff --git a/rlzoo/algorithms/dppo_penalty/dppo_penalty.py b/rlzoo/algorithms/dppo_penalty/dppo_penalty.py old mode 100644 new mode 100755 diff --git a/rlzoo/algorithms/dppo_penalty/run_dppo_penalty.py b/rlzoo/algorithms/dppo_penalty/run_dppo_penalty.py old mode 100644 new mode 100755 diff --git a/rlzoo/algorithms/dqn/__init__.py b/rlzoo/algorithms/dqn/__init__.py old mode 100644 new mode 100755 diff --git a/rlzoo/algorithms/dqn/default.py b/rlzoo/algorithms/dqn/default.py old mode 100644 new mode 100755 index 689afe3..aef6d77 --- a/rlzoo/algorithms/dqn/default.py +++ 
b/rlzoo/algorithms/dqn/default.py @@ -1,210 +1,210 @@ -from gym.spaces import Discrete - -from rlzoo.common.utils import set_seed -from rlzoo.common.value_networks import * - -""" -full list of algorithm parameters (alg_params) ------------------------------------------------ ------------------------------------------------ - -full list of learning parameters (learn_params) ------------------------------------------------ -double_q (bool): if True double DQN will be used -dueling (bool): if True dueling value estimation will be used -exploration_rate (float): fraction of entire training period over - which the exploration rate is annealed -exploration_final_eps (float): final value of random action probability -batch_size (int): size of a batched sampled from replay buffer for training -train_freq (int): update the model every `train_freq` steps -learning_starts (int): how many steps of the model to collect transitions - for before learning starts -target_network_update_freq (int): update the target network every - `target_network_update_freq` steps -buffer_size (int): size of the replay buffer -prioritized_replay (bool): if True prioritized replay buffer will be used. -prioritized_alpha (float): alpha parameter for prioritized replay -prioritized_beta0 (float): beta parameter for prioritized replay -mode (str): train or test ------------------------------------------------ -""" - - -def atari(env, default_seed=False, **kwargs): - if default_seed: - seed = 2 - set_seed(seed, env) # reproducible - - assert isinstance(env.action_space, Discrete) - - alg_params = dict( - dueling=True, - double_q=True, - buffer_size=1000, - prioritized_replay=True, - prioritized_alpha=0.6, - prioritized_beta0=0.4, - ) - alg_params.update(kwargs) - if alg_params.get('net_list') is None: - alg_params['net_list'] = [QNetwork(env.observation_space, env.action_space, [64], - state_only=True, dueling=alg_params['dueling'])] - - if alg_params.get('optimizers_list') is None: - alg_params['optimizers_list'] = tf.optimizers.Adam(1e-4, epsilon=1e-5, clipnorm=10), - - learn_params = dict( - train_episodes=int(1e5), - test_episodes=10, - max_steps=200, - save_interval=1e4, - batch_size=32, - exploration_rate=0.1, - exploration_final_eps=0.01, - train_freq=4, - learning_starts=10000, - target_network_update_freq=1000, - gamma=0.99, - ) - - return alg_params, learn_params - - -def classic_control(env, default_seed=False, **kwargs): - if default_seed: - seed = 2 - set_seed(seed, env) # reproducible - - assert isinstance(env.action_space, Discrete) - - alg_params = dict( - dueling=True, - double_q=True, - buffer_size=1000, - prioritized_replay=False, - prioritized_alpha=0.6, - prioritized_beta0=0.4, - ) - alg_params.update(kwargs) - if alg_params.get('net_list') is None: - alg_params['net_list'] = [QNetwork(env.observation_space, env.action_space, [64], activation=tf.nn.tanh, - state_only=True, dueling=alg_params['dueling'])] - - if alg_params.get('optimizers_list') is None: - alg_params['optimizers_list'] = tf.optimizers.Adam(5e-3, epsilon=1e-5), - - learn_params = dict( - train_episodes=int(1e3), - test_episodes=10, - max_steps=200, - save_interval=1e3, - batch_size=32, - exploration_rate=0.2, - exploration_final_eps=0.01, - train_freq=4, - learning_starts=200, - target_network_update_freq=50, - gamma=0.99, - ) - - return alg_params, learn_params - - -# class CNNQNet(tl.models.Model): -# def __init__(self, in_dim, act_dim, dueling): -# super().__init__() -# self._state_shape = in_dim -# self._action_shape = act_dim, -# 
self.dueling = dueling -# with tf.name_scope('DQN'): -# with tf.name_scope('CNN'): -# self.cnn = basic_nets.CNNModel(in_dim) -# mlp_in_shape = self.cnn.outputs[0].shape[0] -# with tf.name_scope('QValue'): -# hidden_dim = 256 -# self.preq = tl.layers.Dense( -# hidden_dim, tf.nn.relu, -# tf.initializers.Orthogonal(1.0), -# in_channels=mlp_in_shape -# ) -# self.qout = tl.layers.Dense( -# act_dim, None, -# tf.initializers.Orthogonal(1.0), -# in_channels=hidden_dim -# ) -# if dueling: -# with tf.name_scope('Value'): -# hidden_dim = 256 -# self.prev = tl.layers.Dense( -# hidden_dim, tf.nn.relu, -# tf.initializers.Orthogonal(1.0), -# in_channels=mlp_in_shape -# ) -# self.vout = tl.layers.Dense( -# 1, None, -# tf.initializers.Orthogonal(1.0), -# in_channels=hidden_dim -# ) -# -# def forward(self, obv): -# obv = tf.cast(obv, tf.float32) / 255.0 -# mlp_in = tl.layers.flatten_reshape(self.cnn(obv)) -# q_out = self.qout(self.preq(mlp_in)) -# if self.dueling: -# v_out = self.vout(self.prev(mlp_in)) -# q_out = v_out + q_out - tf.reduce_mean(q_out, 1, True) -# return q_out -# -# @property -# def state_shape(self): -# return copy.deepcopy(self._state_shape) -# -# @property -# def action_shape(self): -# return copy.deepcopy(self._action_shape) -# -# -# class MLPQNet(tl.models.Model): -# def __init__(self, in_dim, act_dim, dueling): -# super().__init__() -# self._state_shape = in_dim, -# self._action_shape = act_dim, -# self.dueling = dueling -# hidden_dim = 64 -# with tf.name_scope('DQN'): -# with tf.name_scope('MLP'): -# self.mlp = tl.layers.Dense( -# hidden_dim, tf.nn.tanh, -# tf.initializers.Orthogonal(1.0), -# in_channels=in_dim -# ) -# with tf.name_scope('QValue'): -# self.qmlp = tl.layers.Dense( -# act_dim, None, -# tf.initializers.Orthogonal(1.0), -# in_channels=hidden_dim -# ) -# if dueling: -# with tf.name_scope('Value'): -# self.vmlp = tl.layers.Dense( -# 1, None, -# tf.initializers.Orthogonal(1.0), -# in_channels=hidden_dim -# ) -# -# def forward(self, obv): -# obv = tf.cast(obv, tf.float32) -# latent = self.mlp(obv) -# q_out = self.qmlp(latent) -# if self.dueling: -# v_out = self.vmlp(latent) -# q_out = v_out + q_out - tf.reduce_mean(q_out, 1, True) -# return q_out -# -# @property -# def state_shape(self): -# return copy.deepcopy(self._state_shape) -# -# @property -# def action_shape(self): -# return copy.deepcopy(self._action_shape) +from gym.spaces import Discrete + +from rlzoo.common.utils import set_seed +from rlzoo.common.value_networks import * + +""" +full list of algorithm parameters (alg_params) +----------------------------------------------- +----------------------------------------------- + +full list of learning parameters (learn_params) +----------------------------------------------- +double_q (bool): if True double DQN will be used +dueling (bool): if True dueling value estimation will be used +exploration_rate (float): fraction of entire training period over + which the exploration rate is annealed +exploration_final_eps (float): final value of random action probability +batch_size (int): size of a batched sampled from replay buffer for training +train_freq (int): update the model every `train_freq` steps +learning_starts (int): how many steps of the model to collect transitions + for before learning starts +target_network_update_freq (int): update the target network every + `target_network_update_freq` steps +buffer_size (int): size of the replay buffer +prioritized_replay (bool): if True prioritized replay buffer will be used. 
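
The `alg_params` and `learn_params` dictionaries documented above are exactly what the per-environment default functions in this file return. A minimal usage sketch, assuming `gym` and the RLzoo DQN modules from this patch are importable, with `CartPole-v0` used purely as an example environment:

```python
# Minimal usage sketch; assumes gym and the RLzoo DQN modules from this patch
# are importable, and uses CartPole-v0 purely as an example environment.
import gym

from rlzoo.algorithms.dqn.dqn import DQN
from rlzoo.algorithms.dqn.default import classic_control

env = gym.make('CartPole-v0').unwrapped

# classic_control() fills in default networks/optimizers when none are given
# and returns the two dictionaries documented above.
alg_params, learn_params = classic_control(env, default_seed=True)

model = DQN(**alg_params)                       # build the agent from alg_params
model.learn(env, mode='train', **learn_params)  # train with the default schedule
model.learn(env, mode='test', render=False, test_episodes=10)
```

Any keyword passed to the default function (for example a custom `net_list`) overrides the corresponding entry before the dictionaries are returned.
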
+prioritized_alpha (float): alpha parameter for prioritized replay +prioritized_beta0 (float): beta parameter for prioritized replay +mode (str): train or test +----------------------------------------------- +""" + + +def atari(env, default_seed=False, **kwargs): + if default_seed: + seed = 2 + set_seed(seed, env) # reproducible + + assert isinstance(env.action_space, Discrete) + + alg_params = dict( + dueling=True, + double_q=True, + buffer_size=1000, + prioritized_replay=True, + prioritized_alpha=0.6, + prioritized_beta0=0.4, + ) + alg_params.update(kwargs) + if alg_params.get('net_list') is None: + alg_params['net_list'] = [QNetwork(env.observation_space, env.action_space, [64], + state_only=True, dueling=alg_params['dueling'])] + + if alg_params.get('optimizers_list') is None: + alg_params['optimizers_list'] = tf.optimizers.Adam(1e-4, epsilon=1e-5, clipnorm=10), + + learn_params = dict( + train_episodes=int(1e5), + test_episodes=10, + max_steps=200, + save_interval=1e4, + batch_size=32, + exploration_rate=0.1, + exploration_final_eps=0.01, + train_freq=4, + learning_starts=10000, + target_network_update_freq=1000, + gamma=0.99, + ) + + return alg_params, learn_params + + +def classic_control(env, default_seed=False, **kwargs): + if default_seed: + seed = 2 + set_seed(seed, env) # reproducible + + assert isinstance(env.action_space, Discrete) + + alg_params = dict( + dueling=True, + double_q=True, + buffer_size=1000, + prioritized_replay=False, + prioritized_alpha=0.6, + prioritized_beta0=0.4, + ) + alg_params.update(kwargs) + if alg_params.get('net_list') is None: + alg_params['net_list'] = [QNetwork(env.observation_space, env.action_space, [64], activation=tf.nn.tanh, + state_only=True, dueling=alg_params['dueling'])] + + if alg_params.get('optimizers_list') is None: + alg_params['optimizers_list'] = tf.optimizers.Adam(5e-3, epsilon=1e-5), + + learn_params = dict( + train_episodes=int(1e3), + test_episodes=10, + max_steps=200, + save_interval=1e3, + batch_size=32, + exploration_rate=0.2, + exploration_final_eps=0.01, + train_freq=4, + learning_starts=200, + target_network_update_freq=50, + gamma=0.99, + ) + + return alg_params, learn_params + + +# class CNNQNet(tl.models.Model): +# def __init__(self, in_dim, act_dim, dueling): +# super().__init__() +# self._state_shape = in_dim +# self._action_shape = act_dim, +# self.dueling = dueling +# with tf.name_scope('DQN'): +# with tf.name_scope('CNN'): +# self.cnn = basic_nets.CNNModel(in_dim) +# mlp_in_shape = self.cnn.outputs[0].shape[0] +# with tf.name_scope('QValue'): +# hidden_dim = 256 +# self.preq = tl.layers.Dense( +# hidden_dim, tf.nn.relu, +# tf.initializers.Orthogonal(1.0), +# in_channels=mlp_in_shape +# ) +# self.qout = tl.layers.Dense( +# act_dim, None, +# tf.initializers.Orthogonal(1.0), +# in_channels=hidden_dim +# ) +# if dueling: +# with tf.name_scope('Value'): +# hidden_dim = 256 +# self.prev = tl.layers.Dense( +# hidden_dim, tf.nn.relu, +# tf.initializers.Orthogonal(1.0), +# in_channels=mlp_in_shape +# ) +# self.vout = tl.layers.Dense( +# 1, None, +# tf.initializers.Orthogonal(1.0), +# in_channels=hidden_dim +# ) +# +# def forward(self, obv): +# obv = tf.cast(obv, tf.float32) / 255.0 +# mlp_in = tl.layers.flatten_reshape(self.cnn(obv)) +# q_out = self.qout(self.preq(mlp_in)) +# if self.dueling: +# v_out = self.vout(self.prev(mlp_in)) +# q_out = v_out + q_out - tf.reduce_mean(q_out, 1, True) +# return q_out +# +# @property +# def state_shape(self): +# return copy.deepcopy(self._state_shape) +# +# @property +# def 
action_shape(self): +# return copy.deepcopy(self._action_shape) +# +# +# class MLPQNet(tl.models.Model): +# def __init__(self, in_dim, act_dim, dueling): +# super().__init__() +# self._state_shape = in_dim, +# self._action_shape = act_dim, +# self.dueling = dueling +# hidden_dim = 64 +# with tf.name_scope('DQN'): +# with tf.name_scope('MLP'): +# self.mlp = tl.layers.Dense( +# hidden_dim, tf.nn.tanh, +# tf.initializers.Orthogonal(1.0), +# in_channels=in_dim +# ) +# with tf.name_scope('QValue'): +# self.qmlp = tl.layers.Dense( +# act_dim, None, +# tf.initializers.Orthogonal(1.0), +# in_channels=hidden_dim +# ) +# if dueling: +# with tf.name_scope('Value'): +# self.vmlp = tl.layers.Dense( +# 1, None, +# tf.initializers.Orthogonal(1.0), +# in_channels=hidden_dim +# ) +# +# def forward(self, obv): +# obv = tf.cast(obv, tf.float32) +# latent = self.mlp(obv) +# q_out = self.qmlp(latent) +# if self.dueling: +# v_out = self.vmlp(latent) +# q_out = v_out + q_out - tf.reduce_mean(q_out, 1, True) +# return q_out +# +# @property +# def state_shape(self): +# return copy.deepcopy(self._state_shape) +# +# @property +# def action_shape(self): +# return copy.deepcopy(self._action_shape) diff --git a/rlzoo/algorithms/dqn/dqn.py b/rlzoo/algorithms/dqn/dqn.py old mode 100644 new mode 100755 index 43507e6..efbb500 --- a/rlzoo/algorithms/dqn/dqn.py +++ b/rlzoo/algorithms/dqn/dqn.py @@ -1,248 +1,248 @@ -""" -Deep Q Network -""" -import random -from copy import deepcopy - -from rlzoo.common.utils import * -from rlzoo.common.buffer import ReplayBuffer, PrioritizedReplayBuffer -from rlzoo.common.value_networks import * - - -class DQN(object): - """ - Papers: - - Mnih V, Kavukcuoglu K, Silver D, et al. Human-level control through deep - reinforcement learning[J]. Nature, 2015, 518(7540): 529. - - Hessel M, Modayil J, Van Hasselt H, et al. Rainbow: Combining Improvements - in Deep Reinforcement Learning[J]. 2017. - """ - - def __init__(self, net_list, optimizers_list, double_q, dueling, buffer_size, - prioritized_replay, prioritized_alpha, prioritized_beta0, ): - """ - Parameters: - ---------- - :param net_list (list): a list of networks (value and policy) used in the algorithm, from common functions or customization - :param optimizers_list (list): a list of optimizers for all networks and differentiable variables - :param double_q (bool): if True double DQN will be used - :param dueling (bool): if True dueling value estimation will be used - :param buffer_size (int): size of the replay buffer - :param prioritized_replay (bool): if True prioritized replay buffer will be used. 
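
The `double_q` flag described here only changes how the bootstrap target is formed: the online network selects the greedy next action and the target network evaluates it, which tempers the overestimation that comes from taking a max over noisy estimates. A small numpy sketch with made-up Q-values:

```python
# Illustrative numpy sketch of the double-Q target used by _td_error();
# q_online / q_target stand in for the two networks' Q(s', .) outputs.
import numpy as np

q_online = np.array([[1.0, 3.0], [2.0, 0.5]])   # online network Q(s', .)
q_target = np.array([[0.8, 2.5], [1.9, 0.7]])   # target network Q(s', .)
rewards = np.array([1.0, 0.0])
dones = np.array([0.0, 1.0])                    # 1.0 where the episode ended
gamma = 0.99

# Vanilla DQN: the target network both selects and evaluates the action.
vanilla_target = rewards + gamma * (1.0 - dones) * q_target.max(axis=1)

# Double DQN: the online network selects the action, the target network scores it.
greedy_actions = q_online.argmax(axis=1)
double_target = rewards + gamma * (1.0 - dones) * q_target[np.arange(2), greedy_actions]

print(vanilla_target, double_target)
```
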
- :param prioritized_alpha (float): alpha parameter for prioritized replay - :param prioritized_beta0 (float): beta parameter for prioritized replay - """ - assert isinstance(net_list[0], QNetwork) - self.name = 'DQN' - if prioritized_replay: - self.buffer = PrioritizedReplayBuffer( - buffer_size, prioritized_alpha, prioritized_beta0) - else: - self.buffer = ReplayBuffer(buffer_size) - - self.network = net_list[0] - self.target_network = deepcopy(net_list[0]) - self.network.train() - self.target_network.infer() - self.optimizer = optimizers_list[0] - self.double_q = double_q - self.prioritized_replay = prioritized_replay - self.dueling = dueling - - def get_action(self, obv, eps=0.2): - out_dim = self.network.action_shape[0] - if random.random() < eps: - return int(random.random() * out_dim) - else: - obv = np.expand_dims(obv, 0).astype('float32') - return self.network(obv).numpy().argmax(1)[0] - - def get_action_greedy(self, obv): - obv = np.expand_dims(obv, 0).astype('float32') - return self.network(obv).numpy().argmax(1)[0] - - def sync(self): - """Copy q network to target q network""" - - for var, var_tar in zip(self.network.trainable_weights, - self.target_network.trainable_weights): - var_tar.assign(var) - - def save_ckpt(self, env_name): - """ - save trained weights - :return: None - """ - save_model(self.network, 'qnet', 'DQN', env_name) - - def load_ckpt(self, env_name): - """ - load trained weights - :return: None - """ - load_model(self.network, 'qnet', 'DQN', env_name) - - # @tf.function - def _td_error(self, transitions, reward_gamma): - b_o, b_a, b_r, b_o_, b_d = transitions - b_d = tf.cast(b_d, tf.float32) - b_a = tf.cast(b_a, tf.int64) - b_r = tf.cast(b_r, tf.float32) - if self.double_q: - b_a_ = tf.one_hot(tf.argmax(self.network(b_o_), 1), self.network.action_shape[0]) - b_q_ = (1 - b_d) * tf.reduce_sum(self.target_network(b_o_) * b_a_, 1) - else: - b_q_ = (1 - b_d) * tf.reduce_max(self.target_network(b_o_), 1) - - b_q = tf.reduce_sum(self.network(b_o) * tf.one_hot(b_a, self.network.action_shape[0]), 1) - return b_q - (b_r + reward_gamma * b_q_) - - def store_transition(self, s, a, r, s_, d): - self.buffer.push(s, a, r, s_, d) - - def update(self, batch_size, gamma): - if self.prioritized_replay: - # sample from prioritized replay buffer - *transitions, b_w, idxs = self.buffer.sample(batch_size) - # calculate weighted huber loss - with tf.GradientTape() as tape: - priorities = self._td_error(transitions, gamma) - huber_loss = tf.where(tf.abs(priorities) < 1, - tf.square(priorities) * 0.5, - tf.abs(priorities) - 0.5) - loss = tf.reduce_mean(huber_loss * b_w) - # backpropagate - grad = tape.gradient(loss, self.network.trainable_weights) - self.optimizer.apply_gradients(zip(grad, self.network.trainable_weights)) - # update priorities - priorities = np.clip(np.abs(priorities), 1e-6, None) - self.buffer.update_priorities(idxs, priorities) - else: - # sample from prioritized replay buffer - transitions = self.buffer.sample(batch_size) - # calculate huber loss - with tf.GradientTape() as tape: - td_errors = self._td_error(transitions, gamma) - huber_loss = tf.where(tf.abs(td_errors) < 1, - tf.square(td_errors) * 0.5, - tf.abs(td_errors) - 0.5) - loss = tf.reduce_mean(huber_loss) - # backpropagate - grad = tape.gradient(loss, self.network.trainable_weights) - self.optimizer.apply_gradients(zip(grad, self.network.trainable_weights)) - - def learn( - self, env, mode='train', render=False, - train_episodes=1000, test_episodes=10, max_steps=200, - save_interval=1000, gamma=0.99, - 
exploration_rate=0.2, exploration_final_eps=0.01, - target_network_update_freq=50, - batch_size=32, train_freq=4, learning_starts=200, - plot_func=None - ): - - """ - :param env: learning environment - :param mode: train or test - :param render: render each step - :param train_episodes: total number of episodes for training - :param test_episodes: total number of episodes for testing - :param max_steps: maximum number of steps for one episode - :param save_interval: time steps for saving - :param gamma: reward decay factor - :param exploration_rate (float): fraction of entire training period over - which the exploration rate is annealed - :param exploration_final_eps (float): final value of random action probability - :param target_network_update_freq (int): update the target network every - `target_network_update_freq` steps - :param batch_size (int): size of a batched sampled from replay buffer for training - :param train_freq (int): update the model every `train_freq` steps - :param learning_starts (int): how many steps of the model to collect transitions - for before learning starts - :param plot_func: additional function for interactive module - - """ - if mode == 'train': - print('Training... | Algorithm: {} | Environment: {}'.format(self.name, env.spec.id)) - reward_buffer = [] - i = 0 - for episode in range(1, train_episodes + 1): - o = env.reset() - ep_reward = 0 - for step in range(1, max_steps + 1): - i += 1 - if render: - env.render() - eps = 1 - (1 - exploration_final_eps) * \ - min(1, i / exploration_rate * (train_episodes * max_steps)) - a = self.get_action(o, eps) - - # execute action and feed to replay buffer - # note that `_` tail in var name means next - o_, r, done, info = env.step(a) - self.store_transition(o, a, r, o_, done) - ep_reward += r - - # update networks - if i >= learning_starts and i % train_freq == 0: - self.update(batch_size, gamma) - - if i % target_network_update_freq == 0: - self.sync() - - # reset current observation - if done: - break - else: - o = o_ - - # saving model - if i % save_interval == 0: - self.save_ckpt(env.spec.id) - print( - 'Time steps so far: {}, episode so far: {}, ' - 'episode reward: {:.4f}, episode length: {}' - .format(i, episode, ep_reward, step) - ) - reward_buffer.append(ep_reward) - if plot_func is not None: - plot_func(reward_buffer) - - elif mode == 'test': - print('Testing... | Algorithm: {} | Environment: {}'.format(self.name, env.spec.id)) - - self.load_ckpt(env.spec.id) - self.network.infer() - - reward_buffer = [] - for episode in range(1, test_episodes + 1): - o = env.reset() - ep_reward = 0 - for step in range(1, max_steps + 1): - if render: - env.render() - a = self.get_action_greedy(o) - - # execute action - # note that `_` tail in var name means next - o_, r, done, info = env.step(a) - ep_reward += r - - if done: - break - else: - o = o_ - - print( - 'episode so far: {}, ' - 'episode reward: {:.4f}, episode length: {}' - .format(episode, ep_reward, step) - ) - reward_buffer.append(ep_reward) - if plot_func is not None: - plot_func(reward_buffer) - - else: - print('unknown mode type') +""" +Deep Q Network +""" +import random +from copy import deepcopy + +from rlzoo.common.utils import * +from rlzoo.common.buffer import ReplayBuffer, PrioritizedReplayBuffer +from rlzoo.common.value_networks import * + + +class DQN(object): + """ + Papers: + + Mnih V, Kavukcuoglu K, Silver D, et al. Human-level control through deep + reinforcement learning[J]. Nature, 2015, 518(7540): 529. 
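
The `update()` method later in this class clips the TD error with a Huber-style loss before backpropagation: quadratic inside the unit interval, linear beyond it. A standalone numpy sketch of that piecewise form, using arbitrary example errors:

```python
# Standalone numpy sketch of the Huber-style loss applied to TD errors in
# update(): quadratic for |x| < 1, linear beyond (input values are arbitrary).
import numpy as np

def huber(td_errors):
    abs_err = np.abs(td_errors)
    return np.where(abs_err < 1.0, 0.5 * np.square(td_errors), abs_err - 0.5)

print(huber(np.array([-2.0, -0.5, 0.0, 0.3, 3.0])))
```
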
+ + Hessel M, Modayil J, Van Hasselt H, et al. Rainbow: Combining Improvements + in Deep Reinforcement Learning[J]. 2017. + """ + + def __init__(self, net_list, optimizers_list, double_q, dueling, buffer_size, + prioritized_replay, prioritized_alpha, prioritized_beta0, ): + """ + Parameters: + ---------- + :param net_list (list): a list of networks (value and policy) used in the algorithm, from common functions or customization + :param optimizers_list (list): a list of optimizers for all networks and differentiable variables + :param double_q (bool): if True double DQN will be used + :param dueling (bool): if True dueling value estimation will be used + :param buffer_size (int): size of the replay buffer + :param prioritized_replay (bool): if True prioritized replay buffer will be used. + :param prioritized_alpha (float): alpha parameter for prioritized replay + :param prioritized_beta0 (float): beta parameter for prioritized replay + """ + assert isinstance(net_list[0], QNetwork) + self.name = 'DQN' + if prioritized_replay: + self.buffer = PrioritizedReplayBuffer( + buffer_size, prioritized_alpha, prioritized_beta0) + else: + self.buffer = ReplayBuffer(buffer_size) + + self.network = net_list[0] + self.target_network = deepcopy(net_list[0]) + self.network.train() + self.target_network.infer() + self.optimizer = optimizers_list[0] + self.double_q = double_q + self.prioritized_replay = prioritized_replay + self.dueling = dueling + + def get_action(self, obv, eps=0.2): + out_dim = self.network.action_shape[0] + if random.random() < eps: + return int(random.random() * out_dim) + else: + obv = np.expand_dims(obv, 0).astype('float32') + return self.network(obv).numpy().argmax(1)[0] + + def get_action_greedy(self, obv): + obv = np.expand_dims(obv, 0).astype('float32') + return self.network(obv).numpy().argmax(1)[0] + + def sync(self): + """Copy q network to target q network""" + + for var, var_tar in zip(self.network.trainable_weights, + self.target_network.trainable_weights): + var_tar.assign(var) + + def save_ckpt(self, env_name): + """ + save trained weights + :return: None + """ + save_model(self.network, 'qnet', 'DQN', env_name) + + def load_ckpt(self, env_name): + """ + load trained weights + :return: None + """ + load_model(self.network, 'qnet', 'DQN', env_name) + + # @tf.function + def _td_error(self, transitions, reward_gamma): + b_o, b_a, b_r, b_o_, b_d = transitions + b_d = tf.cast(b_d, tf.float32) + b_a = tf.cast(b_a, tf.int64) + b_r = tf.cast(b_r, tf.float32) + if self.double_q: + b_a_ = tf.one_hot(tf.argmax(self.network(b_o_), 1), self.network.action_shape[0]) + b_q_ = (1 - b_d) * tf.reduce_sum(self.target_network(b_o_) * b_a_, 1) + else: + b_q_ = (1 - b_d) * tf.reduce_max(self.target_network(b_o_), 1) + + b_q = tf.reduce_sum(self.network(b_o) * tf.one_hot(b_a, self.network.action_shape[0]), 1) + return b_q - (b_r + reward_gamma * b_q_) + + def store_transition(self, s, a, r, s_, d): + self.buffer.push(s, a, r, s_, d) + + def update(self, batch_size, gamma): + if self.prioritized_replay: + # sample from prioritized replay buffer + *transitions, b_w, idxs = self.buffer.sample(batch_size) + # calculate weighted huber loss + with tf.GradientTape() as tape: + priorities = self._td_error(transitions, gamma) + huber_loss = tf.where(tf.abs(priorities) < 1, + tf.square(priorities) * 0.5, + tf.abs(priorities) - 0.5) + loss = tf.reduce_mean(huber_loss * b_w) + # backpropagate + grad = tape.gradient(loss, self.network.trainable_weights) + self.optimizer.apply_gradients(zip(grad, 
self.network.trainable_weights)) + # update priorities + priorities = np.clip(np.abs(priorities), 1e-6, None) + self.buffer.update_priorities(idxs, priorities) + else: + # sample from prioritized replay buffer + transitions = self.buffer.sample(batch_size) + # calculate huber loss + with tf.GradientTape() as tape: + td_errors = self._td_error(transitions, gamma) + huber_loss = tf.where(tf.abs(td_errors) < 1, + tf.square(td_errors) * 0.5, + tf.abs(td_errors) - 0.5) + loss = tf.reduce_mean(huber_loss) + # backpropagate + grad = tape.gradient(loss, self.network.trainable_weights) + self.optimizer.apply_gradients(zip(grad, self.network.trainable_weights)) + + def learn( + self, env, mode='train', render=False, + train_episodes=1000, test_episodes=10, max_steps=200, + save_interval=1000, gamma=0.99, + exploration_rate=0.2, exploration_final_eps=0.01, + target_network_update_freq=50, + batch_size=32, train_freq=4, learning_starts=200, + plot_func=None + ): + + """ + :param env: learning environment + :param mode: train or test + :param render: render each step + :param train_episodes: total number of episodes for training + :param test_episodes: total number of episodes for testing + :param max_steps: maximum number of steps for one episode + :param save_interval: time steps for saving + :param gamma: reward decay factor + :param exploration_rate (float): fraction of entire training period over + which the exploration rate is annealed + :param exploration_final_eps (float): final value of random action probability + :param target_network_update_freq (int): update the target network every + `target_network_update_freq` steps + :param batch_size (int): size of a batched sampled from replay buffer for training + :param train_freq (int): update the model every `train_freq` steps + :param learning_starts (int): how many steps of the model to collect transitions + for before learning starts + :param plot_func: additional function for interactive module + + """ + if mode == 'train': + print('Training... | Algorithm: {} | Environment: {}'.format(self.name, env.spec.id)) + reward_buffer = [] + i = 0 + for episode in range(1, train_episodes + 1): + o = env.reset() + ep_reward = 0 + for step in range(1, max_steps + 1): + i += 1 + if render: + env.render() + eps = 1 - (1 - exploration_final_eps) * \ + min(1, i / exploration_rate * (train_episodes * max_steps)) + a = self.get_action(o, eps) + + # execute action and feed to replay buffer + # note that `_` tail in var name means next + o_, r, done, info = env.step(a) + self.store_transition(o, a, r, o_, done) + ep_reward += r + + # update networks + if i >= learning_starts and i % train_freq == 0: + self.update(batch_size, gamma) + + if i % target_network_update_freq == 0: + self.sync() + + # reset current observation + if done: + break + else: + o = o_ + + # saving model + if i % save_interval == 0: + self.save_ckpt(env.spec.id) + print( + 'Time steps so far: {}, episode so far: {}, ' + 'episode reward: {:.4f}, episode length: {}' + .format(i, episode, ep_reward, step) + ) + reward_buffer.append(ep_reward) + if plot_func is not None: + plot_func(reward_buffer) + + elif mode == 'test': + print('Testing... 
| Algorithm: {} | Environment: {}'.format(self.name, env.spec.id)) + + self.load_ckpt(env.spec.id) + self.network.infer() + + reward_buffer = [] + for episode in range(1, test_episodes + 1): + o = env.reset() + ep_reward = 0 + for step in range(1, max_steps + 1): + if render: + env.render() + a = self.get_action_greedy(o) + + # execute action + # note that `_` tail in var name means next + o_, r, done, info = env.step(a) + ep_reward += r + + if done: + break + else: + o = o_ + + print( + 'episode so far: {}, ' + 'episode reward: {:.4f}, episode length: {}' + .format(episode, ep_reward, step) + ) + reward_buffer.append(ep_reward) + if plot_func is not None: + plot_func(reward_buffer) + + else: + print('unknown mode type') diff --git a/rlzoo/algorithms/dqn/run_dqn.py b/rlzoo/algorithms/dqn/run_dqn.py old mode 100644 new mode 100755 index 75f624c..6e8a031 --- a/rlzoo/algorithms/dqn/run_dqn.py +++ b/rlzoo/algorithms/dqn/run_dqn.py @@ -1,82 +1,82 @@ -import gym - -from rlzoo.algorithms.dqn.dqn import DQN -from rlzoo.algorithms.dqn.default import * -from rlzoo.common.value_networks import * -import gym - -""" load environment """ -env = gym.make('CartPole-v0').unwrapped - -obs_space = env.observation_space -act_space = env.action_space - -# reproducible -seed = 2 -set_seed(seed, env) - -in_dim = env.observation_space.shape[0] -act_dim = env.action_space.n -""" build networks for the algorithm """ -name = 'DQN' -Q_net = QNetwork(env.observation_space, env.action_space, [64], activation=tf.nn.tanh, - state_only=True, dueling=True) -net_list = [Q_net] - -""" create model """ -optimizer = tf.optimizers.Adam(5e-3, epsilon=1e-5) -optimizers_list = [optimizer] -model = DQN(net_list, optimizers_list, - double_q=True, - dueling=True, - buffer_size=10000, - prioritized_replay=False, - prioritized_alpha=0.6, - prioritized_beta0=0.4) -""" -full list of arguments for the algorithm ----------------------------------------- -net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization -optimizers_list: a list of optimizers for all networks and differentiable variables -replay_buffer_size: the size of buffer for storing explored samples -tau: soft update factor -""" - -model.learn(env, mode='train', render=False, - train_episodes=1000, - test_episodes=10, - max_steps=200, - save_interval=1e3, - batch_size=32, - exploration_rate=0.2, - exploration_final_eps=0.01, - train_freq=4, - learning_starts=200, - target_network_update_freq=50, - gamma=0.99, ) -""" -full list of parameters for training ---------------------------------------- -env: learning environment -train_episodes: total number of episodes for training -test_episodes: total number of episodes for testing -max_steps: maximum number of steps for one episode -save_interval: time steps for saving -explore_steps: for random action sampling in the beginning of training -mode: train or test mode -render: render each step -batch_size: update batch size -gamma: reward decay factor -noise_scale: range of action noise for exploration -noise_scale_decay: noise scale decay factor -""" - -model.learn(env, mode='test', render=True, - test_episodes=10, - batch_size=32, - exploration_rate=0.2, - exploration_final_eps=0.01, - train_freq=4, - learning_starts=200, - target_network_update_freq=50, - gamma=0.99, ) +import gym + +from rlzoo.algorithms.dqn.dqn import DQN +from rlzoo.algorithms.dqn.default import * +from rlzoo.common.value_networks import * +import gym + +""" load environment """ +env = 
gym.make('CartPole-v0').unwrapped + +obs_space = env.observation_space +act_space = env.action_space + +# reproducible +seed = 2 +set_seed(seed, env) + +in_dim = env.observation_space.shape[0] +act_dim = env.action_space.n +""" build networks for the algorithm """ +name = 'DQN' +Q_net = QNetwork(env.observation_space, env.action_space, [64], activation=tf.nn.tanh, + state_only=True, dueling=True) +net_list = [Q_net] + +""" create model """ +optimizer = tf.optimizers.Adam(5e-3, epsilon=1e-5) +optimizers_list = [optimizer] +model = DQN(net_list, optimizers_list, + double_q=True, + dueling=True, + buffer_size=10000, + prioritized_replay=False, + prioritized_alpha=0.6, + prioritized_beta0=0.4) +""" +full list of arguments for the algorithm +---------------------------------------- +net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization +optimizers_list: a list of optimizers for all networks and differentiable variables +replay_buffer_size: the size of buffer for storing explored samples +tau: soft update factor +""" + +model.learn(env, mode='train', render=False, + train_episodes=1000, + test_episodes=10, + max_steps=200, + save_interval=1e3, + batch_size=32, + exploration_rate=0.2, + exploration_final_eps=0.01, + train_freq=4, + learning_starts=200, + target_network_update_freq=50, + gamma=0.99, ) +""" +full list of parameters for training +--------------------------------------- +env: learning environment +train_episodes: total number of episodes for training +test_episodes: total number of episodes for testing +max_steps: maximum number of steps for one episode +save_interval: time steps for saving +explore_steps: for random action sampling in the beginning of training +mode: train or test mode +render: render each step +batch_size: update batch size +gamma: reward decay factor +noise_scale: range of action noise for exploration +noise_scale_decay: noise scale decay factor +""" + +model.learn(env, mode='test', render=True, + test_episodes=10, + batch_size=32, + exploration_rate=0.2, + exploration_final_eps=0.01, + train_freq=4, + learning_starts=200, + target_network_update_freq=50, + gamma=0.99, ) diff --git a/rlzoo/algorithms/pg/__init__.py b/rlzoo/algorithms/pg/__init__.py old mode 100644 new mode 100755 diff --git a/rlzoo/algorithms/pg/default.py b/rlzoo/algorithms/pg/default.py old mode 100644 new mode 100755 index 09361b6..40836c5 --- a/rlzoo/algorithms/pg/default.py +++ b/rlzoo/algorithms/pg/default.py @@ -1,259 +1,259 @@ -from rlzoo.common.policy_networks import * -from rlzoo.common.utils import set_seed - -""" -full list of algorithm parameters (alg_params) ------------------------------------------------ -net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization -optimizers_list: a list of optimizers for all networks and differentiable variables ------------------------------------------------ - -full list of learning parameters (learn_params) ------------------------------------------------ -train_episodes: total number of episodes for training -test_episodes: total number of episodes for testing -max_steps: maximum number of steps for one episode -save_interval: time steps for saving -mode: train or test -render: render each step -gamma: reward decay ------------------------------------------------ -""" - - -def atari(env, default_seed=True): - if default_seed: - seed = 2 - set_seed(seed, env) # reproducible - - alg_params = dict() - - if alg_params.get('net_list') is None: 
- num_hidden_layer = 1 # number of hidden layers for the networks - hidden_dim = 32 # dimension of hidden layers for the networks - with tf.name_scope('PG'): - with tf.name_scope('Policy'): - policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, - num_hidden_layer * [hidden_dim]) - net_list = [policy_net] - alg_params['net_list'] = net_list - - if alg_params.get('optimizers_list') is None: - learning_rate = 0.02 - policy_optimizer = tf.optimizers.Adam(learning_rate) - optimizers_list = [policy_optimizer] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict( - train_episodes=200, - test_episodes=100, - max_steps=200, - save_interval=20, - gamma=0.95 - ) - - return alg_params, learn_params - - -def classic_control(env, default_seed=True): - if default_seed: - seed = 2 - set_seed(seed, env) # reproducible - - alg_params = dict() - - if alg_params.get('net_list') is None: - num_hidden_layer = 1 # number of hidden layers for the networks - hidden_dim = 32 # dimension of hidden layers for the networks - with tf.name_scope('PG'): - with tf.name_scope('Policy'): - policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, - num_hidden_layer * [hidden_dim]) - net_list = [policy_net] - alg_params['net_list'] = net_list - - if alg_params.get('optimizers_list') is None: - learning_rate = 0.02 - policy_optimizer = tf.optimizers.Adam(learning_rate) - optimizers_list = [policy_optimizer] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict( - train_episodes=200, - test_episodes=100, - max_steps=200, - save_interval=20, - gamma=0.95 - ) - - return alg_params, learn_params - - -def box2d(env, default_seed=True): - if default_seed: - seed = 2 - set_seed(seed, env) # reproducible - - alg_params = dict() - - if alg_params.get('net_list') is None: - num_hidden_layer = 1 # number of hidden layers for the networks - hidden_dim = 32 # dimension of hidden layers for the networks - with tf.name_scope('PG'): - with tf.name_scope('Policy'): - policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, - num_hidden_layer * [hidden_dim]) - net_list = [policy_net] - alg_params['net_list'] = net_list - - if alg_params.get('optimizers_list') is None: - learning_rate = 0.02 - policy_optimizer = tf.optimizers.Adam(learning_rate) - optimizers_list = [policy_optimizer] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict( - train_episodes=200, - test_episodes=100, - max_steps=200, - save_interval=20, - gamma=0.95 - ) - - return alg_params, learn_params - - -def mujoco(env, default_seed=True): - if default_seed: - seed = 2 - set_seed(seed, env) # reproducible - - alg_params = dict() - - if alg_params.get('net_list') is None: - num_hidden_layer = 1 # number of hidden layers for the networks - hidden_dim = 32 # dimension of hidden layers for the networks - with tf.name_scope('PG'): - with tf.name_scope('Policy'): - policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, - num_hidden_layer * [hidden_dim]) - net_list = [policy_net] - alg_params['net_list'] = net_list - - if alg_params.get('optimizers_list') is None: - learning_rate = 0.02 - policy_optimizer = tf.optimizers.Adam(learning_rate) - optimizers_list = [policy_optimizer] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict( - train_episodes=200, - test_episodes=100, - max_steps=200, - save_interval=20, - gamma=0.95 - ) - - return alg_params, learn_params - - -def robotics(env, default_seed=True): - if 
default_seed: - seed = 2 - set_seed(seed, env) # reproducible - - alg_params = dict() - - if alg_params.get('net_list') is None: - num_hidden_layer = 1 # number of hidden layers for the networks - hidden_dim = 32 # dimension of hidden layers for the networks - with tf.name_scope('PG'): - with tf.name_scope('Policy'): - policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, - num_hidden_layer * [hidden_dim]) - net_list = [policy_net] - alg_params['net_list'] = net_list - - if alg_params.get('optimizers_list') is None: - learning_rate = 0.02 - policy_optimizer = tf.optimizers.Adam(learning_rate) - optimizers_list = [policy_optimizer] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict( - train_episodes=200, - test_episodes=100, - max_steps=200, - save_interval=20, - gamma=0.95 - ) - - return alg_params, learn_params - - -def dm_control(env, default_seed=True): - if default_seed: - seed = 2 - set_seed(seed, env) # reproducible - - alg_params = dict() - - if alg_params.get('net_list') is None: - num_hidden_layer = 1 # number of hidden layers for the networks - hidden_dim = 32 # dimension of hidden layers for the networks - with tf.name_scope('PG'): - with tf.name_scope('Policy'): - policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, - num_hidden_layer * [hidden_dim]) - net_list = [policy_net] - alg_params['net_list'] = net_list - - if alg_params.get('optimizers_list') is None: - learning_rate = 0.02 - policy_optimizer = tf.optimizers.Adam(learning_rate) - optimizers_list = [policy_optimizer] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict( - train_episodes=200, - test_episodes=100, - max_steps=200, - save_interval=20, - gamma=0.95 - ) - - return alg_params, learn_params - - -def rlbench(env, default_seed=True): - if default_seed: - seed = 2 - set_seed(seed, env) # reproducible - - alg_params = dict() - - if alg_params.get('net_list') is None: - num_hidden_layer = 1 # number of hidden layers for the networks - hidden_dim = 32 # dimension of hidden layers for the networks - with tf.name_scope('PG'): - with tf.name_scope('Policy'): - policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, - num_hidden_layer * [hidden_dim]) - net_list = [policy_net] - alg_params['net_list'] = net_list - - if alg_params.get('optimizers_list') is None: - learning_rate = 0.02 - policy_optimizer = tf.optimizers.Adam(learning_rate) - optimizers_list = [policy_optimizer] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict( - train_episodes=200, - test_episodes=100, - max_steps=200, - save_interval=20, - gamma=0.95 - ) - - return alg_params, learn_params +from rlzoo.common.policy_networks import * +from rlzoo.common.utils import set_seed + +""" +full list of algorithm parameters (alg_params) +----------------------------------------------- +net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization +optimizers_list: a list of optimizers for all networks and differentiable variables +----------------------------------------------- + +full list of learning parameters (learn_params) +----------------------------------------------- +train_episodes: total number of episodes for training +test_episodes: total number of episodes for testing +max_steps: maximum number of steps for one episode +save_interval: time steps for saving +mode: train or test +render: render each step +gamma: reward decay 
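
`gamma` here is the reward-decay factor applied when rolling rewards backwards through an episode; PG then normalizes the resulting returns before weighting the policy gradient. A short numpy sketch of that discount-and-normalize step, mirroring `PG._discount_and_norm_rewards` with a made-up reward sequence:

```python
import numpy as np

rewards = [1.0, 0.0, 0.0, 1.0]   # made-up episode rewards
gamma = 0.95

# Roll backwards: G_t = r_t + gamma * G_{t+1}
returns = np.zeros(len(rewards), dtype=np.float32)
running = 0.0
for t in reversed(range(len(rewards))):
    running = running * gamma + rewards[t]
    returns[t] = running

# Normalize to zero mean / unit variance before using the returns as weights
returns -= returns.mean()
if returns.std() != 0:
    returns /= returns.std()
print(returns)
```
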
+----------------------------------------------- +""" + + +def atari(env, default_seed=True): + if default_seed: + seed = 2 + set_seed(seed, env) # reproducible + + alg_params = dict() + + if alg_params.get('net_list') is None: + num_hidden_layer = 1 # number of hidden layers for the networks + hidden_dim = 32 # dimension of hidden layers for the networks + with tf.name_scope('PG'): + with tf.name_scope('Policy'): + policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, + num_hidden_layer * [hidden_dim]) + net_list = [policy_net] + alg_params['net_list'] = net_list + + if alg_params.get('optimizers_list') is None: + learning_rate = 0.02 + policy_optimizer = tf.optimizers.Adam(learning_rate) + optimizers_list = [policy_optimizer] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict( + train_episodes=200, + test_episodes=100, + max_steps=200, + save_interval=20, + gamma=0.95 + ) + + return alg_params, learn_params + + +def classic_control(env, default_seed=True): + if default_seed: + seed = 2 + set_seed(seed, env) # reproducible + + alg_params = dict() + + if alg_params.get('net_list') is None: + num_hidden_layer = 1 # number of hidden layers for the networks + hidden_dim = 32 # dimension of hidden layers for the networks + with tf.name_scope('PG'): + with tf.name_scope('Policy'): + policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, + num_hidden_layer * [hidden_dim]) + net_list = [policy_net] + alg_params['net_list'] = net_list + + if alg_params.get('optimizers_list') is None: + learning_rate = 0.02 + policy_optimizer = tf.optimizers.Adam(learning_rate) + optimizers_list = [policy_optimizer] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict( + train_episodes=200, + test_episodes=100, + max_steps=200, + save_interval=20, + gamma=0.95 + ) + + return alg_params, learn_params + + +def box2d(env, default_seed=True): + if default_seed: + seed = 2 + set_seed(seed, env) # reproducible + + alg_params = dict() + + if alg_params.get('net_list') is None: + num_hidden_layer = 1 # number of hidden layers for the networks + hidden_dim = 32 # dimension of hidden layers for the networks + with tf.name_scope('PG'): + with tf.name_scope('Policy'): + policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, + num_hidden_layer * [hidden_dim]) + net_list = [policy_net] + alg_params['net_list'] = net_list + + if alg_params.get('optimizers_list') is None: + learning_rate = 0.02 + policy_optimizer = tf.optimizers.Adam(learning_rate) + optimizers_list = [policy_optimizer] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict( + train_episodes=200, + test_episodes=100, + max_steps=200, + save_interval=20, + gamma=0.95 + ) + + return alg_params, learn_params + + +def mujoco(env, default_seed=True): + if default_seed: + seed = 2 + set_seed(seed, env) # reproducible + + alg_params = dict() + + if alg_params.get('net_list') is None: + num_hidden_layer = 1 # number of hidden layers for the networks + hidden_dim = 32 # dimension of hidden layers for the networks + with tf.name_scope('PG'): + with tf.name_scope('Policy'): + policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, + num_hidden_layer * [hidden_dim]) + net_list = [policy_net] + alg_params['net_list'] = net_list + + if alg_params.get('optimizers_list') is None: + learning_rate = 0.02 + policy_optimizer = tf.optimizers.Adam(learning_rate) + optimizers_list = [policy_optimizer] + alg_params['optimizers_list'] = 
optimizers_list + + learn_params = dict( + train_episodes=200, + test_episodes=100, + max_steps=200, + save_interval=20, + gamma=0.95 + ) + + return alg_params, learn_params + + +def robotics(env, default_seed=True): + if default_seed: + seed = 2 + set_seed(seed, env) # reproducible + + alg_params = dict() + + if alg_params.get('net_list') is None: + num_hidden_layer = 1 # number of hidden layers for the networks + hidden_dim = 32 # dimension of hidden layers for the networks + with tf.name_scope('PG'): + with tf.name_scope('Policy'): + policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, + num_hidden_layer * [hidden_dim]) + net_list = [policy_net] + alg_params['net_list'] = net_list + + if alg_params.get('optimizers_list') is None: + learning_rate = 0.02 + policy_optimizer = tf.optimizers.Adam(learning_rate) + optimizers_list = [policy_optimizer] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict( + train_episodes=200, + test_episodes=100, + max_steps=200, + save_interval=20, + gamma=0.95 + ) + + return alg_params, learn_params + + +def dm_control(env, default_seed=True): + if default_seed: + seed = 2 + set_seed(seed, env) # reproducible + + alg_params = dict() + + if alg_params.get('net_list') is None: + num_hidden_layer = 1 # number of hidden layers for the networks + hidden_dim = 32 # dimension of hidden layers for the networks + with tf.name_scope('PG'): + with tf.name_scope('Policy'): + policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, + num_hidden_layer * [hidden_dim]) + net_list = [policy_net] + alg_params['net_list'] = net_list + + if alg_params.get('optimizers_list') is None: + learning_rate = 0.02 + policy_optimizer = tf.optimizers.Adam(learning_rate) + optimizers_list = [policy_optimizer] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict( + train_episodes=200, + test_episodes=100, + max_steps=200, + save_interval=20, + gamma=0.95 + ) + + return alg_params, learn_params + + +def rlbench(env, default_seed=True): + if default_seed: + seed = 2 + set_seed(seed, env) # reproducible + + alg_params = dict() + + if alg_params.get('net_list') is None: + num_hidden_layer = 1 # number of hidden layers for the networks + hidden_dim = 32 # dimension of hidden layers for the networks + with tf.name_scope('PG'): + with tf.name_scope('Policy'): + policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, + num_hidden_layer * [hidden_dim]) + net_list = [policy_net] + alg_params['net_list'] = net_list + + if alg_params.get('optimizers_list') is None: + learning_rate = 0.02 + policy_optimizer = tf.optimizers.Adam(learning_rate) + optimizers_list = [policy_optimizer] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict( + train_episodes=200, + test_episodes=100, + max_steps=200, + save_interval=20, + gamma=0.95 + ) + + return alg_params, learn_params diff --git a/rlzoo/algorithms/pg/pg.py b/rlzoo/algorithms/pg/pg.py old mode 100644 new mode 100755 index cfbe671..5d7252a --- a/rlzoo/algorithms/pg/pg.py +++ b/rlzoo/algorithms/pg/pg.py @@ -1,217 +1,217 @@ -""" -Vanilla Policy Gradient(VPG or REINFORCE) ------------------------------------------ -The policy gradient algorithm works by updating policy parameters via stochastic gradient ascent on policy performance. -It's an on-policy algorithm can be used for environments with either discrete or continuous action spaces. -Here is an example on discrete action space game CartPole-v0. 
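
Concretely, the update in this file minimizes the negative log-probability of each taken action weighted by the normalized discounted return of its episode. A toy numpy sketch of that surrogate loss for a discrete (categorical) policy, with made-up logits, actions and returns:

```python
import numpy as np

def log_softmax(logits):
    # Numerically stable log-probabilities of a categorical policy.
    z = logits - logits.max(axis=1, keepdims=True)
    return z - np.log(np.exp(z).sum(axis=1, keepdims=True))

logits = np.array([[2.0, 0.5], [0.1, 1.2], [0.3, 0.3]])  # made-up policy outputs
actions = np.array([0, 1, 1])                            # actions actually taken
returns = np.array([1.2, -0.3, -0.9])                    # normalized discounted returns

neg_logp = -log_softmax(logits)[np.arange(len(actions)), actions]
loss = np.mean(neg_logp * returns)   # the "reward guided loss" minimized in PG.update()
print(loss)
```

For a continuous action space the weighting is identical; only the log-probability comes from a continuous distribution (for example a Gaussian) instead of a categorical one.
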
-To apply it on continuous action space, you need to change the last softmax layer and the get_action function. - -Reference ---------- -Cookbook: Barto A G, Sutton R S. Reinforcement Learning: An Introduction[J]. 1998. -MorvanZhou's tutorial page: https://morvanzhou.github.io/tutorials/ -MorvanZhou's code: https://github.com/MorvanZhou/Reinforcement-learning-with-tensorflow/ - -Prerequisites --------------- -tensorflow >=2.0.0a0 -tensorflow-probability 0.6.0 -tensorlayer >=2.0.0 - -""" -import time - -from rlzoo.common.utils import * -from rlzoo.common.policy_networks import * - - -############################### PG #################################### - - -class PG: - """ - PG class - """ - - def __init__(self, net_list, optimizers_list): - """ - :param net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization - :param optimizers_list: a list of optimizers for all networks and differentiable variables - - """ - assert len(net_list) == 1 - assert len(optimizers_list) == 1 - self.name = 'PG' - self.model = net_list[0] - assert isinstance(self.model, StochasticPolicyNetwork) - self.buffer = [] - print('Policy Network', self.model) - self.optimizer = optimizers_list[0] - - def get_action(self, s): - """ - choose action with probabilities. - - :param s: state - - :return: act - """ - return self.model([s])[0].numpy() - - def get_action_greedy(self, s): - """ - choose action with greedy policy - - :param s: state - - :return: act - """ - return self.model([s], greedy=True).numpy()[0] - - def store_transition(self, s, a, r): - """ - store data in memory buffer - - :param s: state - :param a: act - :param r: reward - - :return: - """ - self.buffer.append([s, np.array(a, np.float32), np.array(r, np.float32)]) - - def update(self, gamma): - """ - update policy parameters via stochastic gradient ascent - - :return: None - """ - # discount and normalize episode reward - s, a, r = zip(*self.buffer) - s, a, r = np.array(s), np.array(a), np.array(r).flatten() - discounted_ep_rs_norm = self._discount_and_norm_rewards(r, gamma) - - with tf.GradientTape() as tape: - self.model(s) - neg_log_prob = self.model.policy_dist.neglogp(a) - loss = tf.reduce_mean(neg_log_prob * discounted_ep_rs_norm) # reward guided loss - - grad = tape.gradient(loss, self.model.trainable_weights) - self.optimizer.apply_gradients(zip(grad, self.model.trainable_weights)) - - self.buffer = [] - return discounted_ep_rs_norm - - def _discount_and_norm_rewards(self, reward_list, gamma): - """ - compute discount_and_norm_rewards - - :return: discount_and_norm_rewards - """ - # discount episode rewards - discounted_ep_rs = np.zeros_like(reward_list) - running_add = 0 - for t in reversed(range(0, len(reward_list))): - running_add = running_add * gamma + reward_list[t] - discounted_ep_rs[t] = running_add - - # normalize episode rewards - discounted_ep_rs -= np.mean(discounted_ep_rs) - std = np.std(discounted_ep_rs) - if std != 0: - discounted_ep_rs /= np.std(discounted_ep_rs) - discounted_ep_rs = discounted_ep_rs[:, np.newaxis] - return discounted_ep_rs - - def save_ckpt(self, env_name): - """ - save trained weights - - :return: None - """ - save_model(self.model, 'model_policy', self.name, env_name) - - def load_ckpt(self, env_name): - """ - load trained weights - - :return: None - """ - load_model(self.model, 'model_policy', self.name, env_name) - - def learn(self, env, train_episodes=200, test_episodes=100, max_steps=200, save_interval=100, - mode='train', render=False, gamma=0.95, 
plot_func=None): - """ - :param env: learning environment - :param train_episodes: total number of episodes for training - :param test_episodes: total number of episodes for testing - :param max_steps: maximum number of steps for one episode - :param save_interval: time steps for saving - :param mode: train or test - :param render: render each step - :param gamma: reward decay - :param plot_func: additional function for interactive module - :return: None - """ - - if mode == 'train': - print('Training... | Algorithm: {} | Environment: {}'.format(self.name, env.spec.id)) - reward_buffer = [] - t0 = time.time() - - for i_episode in range(1, train_episodes + 1): - - observation = env.reset() - - ep_rs_sum = 0 - for step in range(max_steps): - if render: - env.render() - action = self.get_action(observation) - observation_, reward, done, info = env.step(action) - self.store_transition(observation, action, reward) - - ep_rs_sum += reward - observation = observation_ - - if done: - break - - print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format( - i_episode, train_episodes, ep_rs_sum, time.time() - t0) - ) - reward_buffer.append(ep_rs_sum) - if plot_func is not None: - plot_func(reward_buffer) - - self.update(gamma) - - if i_episode and i_episode % save_interval == 0: - self.save_ckpt(env_name=env.spec.id) - plot_save_log(reward_buffer, algorithm_name='PG', env_name=env.spec.id) - - self.save_ckpt(env_name=env.spec.id) - plot_save_log(reward_buffer, algorithm_name='PG', env_name=env.spec.id) - - elif mode == 'test': - # test - self.load_ckpt(env_name=env.spec.id) - print('Testing... | Algorithm: {} | Environment: {}'.format(self.name, env.spec.id)) - t0 = time.time() - for eps in range(test_episodes): - observation = env.reset() - ep_rs_sum = 0 - for step in range(max_steps): - if render: - env.render() - action = self.get_action_greedy(observation) - observation, reward, done, info = env.step(action) - ep_rs_sum += reward - if done: - break - print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format( - eps, test_episodes, ep_rs_sum, time.time() - t0) - ) - - else: - print('unknown mode type') +""" +Vanilla Policy Gradient(VPG or REINFORCE) +----------------------------------------- +The policy gradient algorithm works by updating policy parameters via stochastic gradient ascent on policy performance. +It's an on-policy algorithm can be used for environments with either discrete or continuous action spaces. +Here is an example on discrete action space game CartPole-v0. +To apply it on continuous action space, you need to change the last softmax layer and the get_action function. + +Reference +--------- +Cookbook: Barto A G, Sutton R S. Reinforcement Learning: An Introduction[J]. 1998. 
+MorvanZhou's tutorial page: https://morvanzhou.github.io/tutorials/ +MorvanZhou's code: https://github.com/MorvanZhou/Reinforcement-learning-with-tensorflow/ + +Prerequisites +-------------- +tensorflow >=2.0.0a0 +tensorflow-probability 0.6.0 +tensorlayer >=2.0.0 + +""" +import time + +from rlzoo.common.utils import * +from rlzoo.common.policy_networks import * + + +############################### PG #################################### + + +class PG: + """ + PG class + """ + + def __init__(self, net_list, optimizers_list): + """ + :param net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization + :param optimizers_list: a list of optimizers for all networks and differentiable variables + + """ + assert len(net_list) == 1 + assert len(optimizers_list) == 1 + self.name = 'PG' + self.model = net_list[0] + assert isinstance(self.model, StochasticPolicyNetwork) + self.buffer = [] + print('Policy Network', self.model) + self.optimizer = optimizers_list[0] + + def get_action(self, s): + """ + choose action with probabilities. + + :param s: state + + :return: act + """ + return self.model([s])[0].numpy() + + def get_action_greedy(self, s): + """ + choose action with greedy policy + + :param s: state + + :return: act + """ + return self.model([s], greedy=True).numpy()[0] + + def store_transition(self, s, a, r): + """ + store data in memory buffer + + :param s: state + :param a: act + :param r: reward + + :return: + """ + self.buffer.append([s, np.array(a, np.float32), np.array(r, np.float32)]) + + def update(self, gamma): + """ + update policy parameters via stochastic gradient ascent + + :return: None + """ + # discount and normalize episode reward + s, a, r = zip(*self.buffer) + s, a, r = np.array(s), np.array(a), np.array(r).flatten() + discounted_ep_rs_norm = self._discount_and_norm_rewards(r, gamma) + + with tf.GradientTape() as tape: + self.model(s) + neg_log_prob = self.model.policy_dist.neglogp(a) + loss = tf.reduce_mean(neg_log_prob * discounted_ep_rs_norm) # reward guided loss + + grad = tape.gradient(loss, self.model.trainable_weights) + self.optimizer.apply_gradients(zip(grad, self.model.trainable_weights)) + + self.buffer = [] + return discounted_ep_rs_norm + + def _discount_and_norm_rewards(self, reward_list, gamma): + """ + compute discount_and_norm_rewards + + :return: discount_and_norm_rewards + """ + # discount episode rewards + discounted_ep_rs = np.zeros_like(reward_list) + running_add = 0 + for t in reversed(range(0, len(reward_list))): + running_add = running_add * gamma + reward_list[t] + discounted_ep_rs[t] = running_add + + # normalize episode rewards + discounted_ep_rs -= np.mean(discounted_ep_rs) + std = np.std(discounted_ep_rs) + if std != 0: + discounted_ep_rs /= np.std(discounted_ep_rs) + discounted_ep_rs = discounted_ep_rs[:, np.newaxis] + return discounted_ep_rs + + def save_ckpt(self, env_name): + """ + save trained weights + + :return: None + """ + save_model(self.model, 'model_policy', self.name, env_name) + + def load_ckpt(self, env_name): + """ + load trained weights + + :return: None + """ + load_model(self.model, 'model_policy', self.name, env_name) + + def learn(self, env, train_episodes=200, test_episodes=100, max_steps=200, save_interval=100, + mode='train', render=False, gamma=0.95, plot_func=None): + """ + :param env: learning environment + :param train_episodes: total number of episodes for training + :param test_episodes: total number of episodes for testing + :param max_steps: maximum number of 
steps for one episode + :param save_interval: time steps for saving + :param mode: train or test + :param render: render each step + :param gamma: reward decay + :param plot_func: additional function for interactive module + :return: None + """ + + if mode == 'train': + print('Training... | Algorithm: {} | Environment: {}'.format(self.name, env.spec.id)) + reward_buffer = [] + t0 = time.time() + + for i_episode in range(1, train_episodes + 1): + + observation = env.reset() + + ep_rs_sum = 0 + for step in range(max_steps): + if render: + env.render() + action = self.get_action(observation) + observation_, reward, done, info = env.step(action) + self.store_transition(observation, action, reward) + + ep_rs_sum += reward + observation = observation_ + + if done: + break + + print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format( + i_episode, train_episodes, ep_rs_sum, time.time() - t0) + ) + reward_buffer.append(ep_rs_sum) + if plot_func is not None: + plot_func(reward_buffer) + + self.update(gamma) + + if i_episode and i_episode % save_interval == 0: + self.save_ckpt(env_name=env.spec.id) + plot_save_log(reward_buffer, algorithm_name='PG', env_name=env.spec.id) + + self.save_ckpt(env_name=env.spec.id) + plot_save_log(reward_buffer, algorithm_name='PG', env_name=env.spec.id) + + elif mode == 'test': + # test + self.load_ckpt(env_name=env.spec.id) + print('Testing... | Algorithm: {} | Environment: {}'.format(self.name, env.spec.id)) + t0 = time.time() + for eps in range(test_episodes): + observation = env.reset() + ep_rs_sum = 0 + for step in range(max_steps): + if render: + env.render() + action = self.get_action_greedy(observation) + observation, reward, done, info = env.step(action) + ep_rs_sum += reward + if done: + break + print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format( + eps, test_episodes, ep_rs_sum, time.time() - t0) + ) + + else: + print('unknown mode type') diff --git a/rlzoo/algorithms/pg/run_pg.py b/rlzoo/algorithms/pg/run_pg.py old mode 100644 new mode 100755 diff --git a/rlzoo/algorithms/ppo/__init__.py b/rlzoo/algorithms/ppo/__init__.py old mode 100644 new mode 100755 diff --git a/rlzoo/algorithms/ppo/default.py b/rlzoo/algorithms/ppo/default.py old mode 100644 new mode 100755 index a8305f8..3723871 --- a/rlzoo/algorithms/ppo/default.py +++ b/rlzoo/algorithms/ppo/default.py @@ -1,322 +1,322 @@ -from rlzoo.common.policy_networks import * -from rlzoo.common.value_networks import * -from rlzoo.common.utils import set_seed - -""" -full list of algorithm parameters (alg_params) ------------------------------------------------ -net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization -optimizers_list: a list of optimizers for all networks and differentiable variables -epsilon: clip parameter (for method 'clip') -kl_target: controls bounds of policy update and adaptive lambda (for method 'penalty') -lam: KL-regularization coefficient (for method 'penalty') ------------------------------------------------ - -full list of learning parameters (learn_params) ------------------------------------------------ -train_episodes: total number of episodes for training -test_episodes: total number of episodes for testing -max_steps: maximum number of steps for one episode -save_interval: time steps for saving -gamma: reward discount factor -mode: train or test -render: render each step -batch_size: UPDATE batch size -a_update_steps: actor update iteration steps -c_update_steps: critic update 
iteration steps ------------------------------------------------ -""" - - -def atari(env, default_seed=True): - if default_seed: - # reproducible - seed = 1 - set_seed(seed, env) - - alg_params = dict(method='clip', # method can be clip or penalty - epsilon=0.2, # for method 'clip' - kl_target=0.01, # for method 'penalty' - lam=0.5,) # for method 'penalty' - - if alg_params.get('net_list') is None: - num_hidden_layer = 2 # number of hidden layers for the networks - hidden_dim = 64 # dimension of hidden layers for the networks - with tf.name_scope('PPO'): - with tf.name_scope('V_Net'): - v_net = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer) - with tf.name_scope('Policy'): - policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, - [hidden_dim] * num_hidden_layer, - output_activation=tf.nn.tanh, trainable=True) - net_list = [v_net, policy_net] - alg_params['net_list'] = net_list - - if alg_params.get('optimizers_list') is None: - actor_lr = 1e-4 - critic_lr = 2e-4 - optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict(train_episodes=1000, - test_episodes=100, - max_steps=200, - save_interval=50, - gamma=0.9, - batch_size=32, - a_update_steps=10, - c_update_steps=10) - - return alg_params, learn_params - - -def classic_control(env, default_seed=True): - if default_seed: - # reproducible - seed = 1 - set_seed(seed, env) - - alg_params = dict(method='clip', # method can be clip or penalty - epsilon=0.2, # for method 'clip' - kl_target=0.01, # for method 'penalty' - lam=0.5,) # for method 'penalty' - - if alg_params.get('net_list') is None: - num_hidden_layer = 2 # number of hidden layers for the networks - hidden_dim = 64 # dimension of hidden layers for the networks - with tf.name_scope('PPO'): - with tf.name_scope('V_Net'): - v_net = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer) - with tf.name_scope('Policy'): - policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, - [hidden_dim] * num_hidden_layer, - output_activation=tf.nn.tanh, trainable=True) - net_list = [v_net, policy_net] - alg_params['net_list'] = net_list - - if alg_params.get('optimizers_list') is None: - actor_lr = 1e-4 - critic_lr = 2e-4 - optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict(train_episodes=1000, - test_episodes=100, - max_steps=200, - save_interval=50, - gamma=0.9, - batch_size=32, - a_update_steps=10, - c_update_steps=10) - - return alg_params, learn_params - - -def box2d(env, default_seed=True): - if default_seed: - # reproducible - seed = 1 - set_seed(seed, env) - - alg_params = dict(method='clip', # method can be clip or penalty - epsilon=0.2, # for method 'clip' - kl_target=0.01, # for method 'penalty' - lam=0.5,) # for method 'penalty' - - if alg_params.get('net_list') is None: - num_hidden_layer = 2 # number of hidden layers for the networks - hidden_dim = 64 # dimension of hidden layers for the networks - with tf.name_scope('PPO'): - with tf.name_scope('V_Net'): - v_net = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer) - with tf.name_scope('Policy'): - policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, - [hidden_dim] * num_hidden_layer, - output_activation=tf.nn.tanh, trainable=True) - net_list = [v_net, policy_net] - alg_params['net_list'] = net_list - - if 
alg_params.get('optimizers_list') is None: - actor_lr = 1e-4 - critic_lr = 2e-4 - optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict(train_episodes=1000, - test_episodes=100, - max_steps=200, - save_interval=50, - gamma=0.9, - batch_size=32, - a_update_steps=10, - c_update_steps=10) - - return alg_params, learn_params - - -def mujoco(env, default_seed=True): - if default_seed: - # reproducible - seed = 1 - set_seed(seed, env) - - alg_params = dict(method='clip', # method can be clip or penalty - epsilon=0.2, # for method 'clip' - kl_target=0.01, # for method 'penalty' - lam=0.5,) # for method 'penalty' - - if alg_params.get('net_list') is None: - num_hidden_layer = 2 # number of hidden layers for the networks - hidden_dim = 64 # dimension of hidden layers for the networks - with tf.name_scope('PPO'): - with tf.name_scope('V_Net'): - v_net = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer) - with tf.name_scope('Policy'): - policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, - [hidden_dim] * num_hidden_layer, - output_activation=tf.nn.tanh, trainable=True) - net_list = [v_net, policy_net] - alg_params['net_list'] = net_list - - if alg_params.get('optimizers_list') is None: - actor_lr = 1e-4 - critic_lr = 2e-4 - optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict(train_episodes=1000, - test_episodes=100, - max_steps=200, - save_interval=50, - gamma=0.9, - batch_size=32, - a_update_steps=10, - c_update_steps=10) - - return alg_params, learn_params - - -def robotics(env, default_seed=True): - if default_seed: - # reproducible - seed = 1 - set_seed(seed, env) - - alg_params = dict(method='clip', # method can be clip or penalty - epsilon=0.2, # for method 'clip' - kl_target=0.01, # for method 'penalty' - lam=0.5,) # for method 'penalty' - - if alg_params.get('net_list') is None: - num_hidden_layer = 2 # number of hidden layers for the networks - hidden_dim = 64 # dimension of hidden layers for the networks - with tf.name_scope('PPO'): - with tf.name_scope('V_Net'): - v_net = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer) - with tf.name_scope('Policy'): - policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, - [hidden_dim] * num_hidden_layer, - output_activation=tf.nn.tanh, trainable=True) - net_list = [v_net, policy_net] - alg_params['net_list'] = net_list - - if alg_params.get('optimizers_list') is None: - actor_lr = 1e-4 - critic_lr = 2e-4 - optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict(train_episodes=1000, - test_episodes=100, - max_steps=200, - save_interval=50, - gamma=0.9, - batch_size=32, - a_update_steps=10, - c_update_steps=10) - - return alg_params, learn_params - - -def dm_control(env, default_seed=True): - if default_seed: - # reproducible - seed = 1 - set_seed(seed, env) - - alg_params = dict(method='clip', # method can be clip or penalty - epsilon=0.2, # for method 'clip' - kl_target=0.01, # for method 'penalty' - lam=0.5,) # for method 'penalty' - - if alg_params.get('net_list') is None: - num_hidden_layer = 2 # number of hidden layers for the networks - hidden_dim = 64 # dimension of hidden layers for the networks - with tf.name_scope('PPO'): - with tf.name_scope('V_Net'): - v_net = 
ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer) - with tf.name_scope('Policy'): - policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, - [hidden_dim] * num_hidden_layer, - output_activation=tf.nn.tanh, trainable=True) - net_list = [v_net, policy_net] - alg_params['net_list'] = net_list - - if alg_params.get('optimizers_list') is None: - actor_lr = 1e-4 - critic_lr = 2e-4 - optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict(train_episodes=1000, - test_episodes=100, - max_steps=200, - save_interval=50, - gamma=0.9, - batch_size=32, - a_update_steps=10, - c_update_steps=10) - - return alg_params, learn_params - - -def rlbench(env, default_seed=True): - if default_seed: - # reproducible - seed = 1 - set_seed(seed, env) - - alg_params = dict(method='clip', # method can be clip or penalty - epsilon=0.2, # for method 'clip' - kl_target=0.01, # for method 'penalty' - lam=0.5,) # for method 'penalty' - - if alg_params.get('net_list') is None: - num_hidden_layer = 2 # number of hidden layers for the networks - hidden_dim = 64 # dimension of hidden layers for the networks - with tf.name_scope('PPO'): - with tf.name_scope('V_Net'): - v_net = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer) - with tf.name_scope('Policy'): - policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, - [hidden_dim] * num_hidden_layer, - output_activation=tf.nn.tanh, trainable=True) - net_list = [v_net, policy_net] - alg_params['net_list'] = net_list - - if alg_params.get('optimizers_list') is None: - actor_lr = 1e-4 - critic_lr = 2e-4 - optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict(train_episodes=1000, - test_episodes=100, - max_steps=200, - save_interval=50, - gamma=0.9, - batch_size=32, - a_update_steps=10, - c_update_steps=10) - - return alg_params, learn_params +from rlzoo.common.policy_networks import * +from rlzoo.common.value_networks import * +from rlzoo.common.utils import set_seed + +""" +full list of algorithm parameters (alg_params) +----------------------------------------------- +net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization +optimizers_list: a list of optimizers for all networks and differentiable variables +epsilon: clip parameter (for method 'clip') +kl_target: controls bounds of policy update and adaptive lambda (for method 'penalty') +lam: KL-regularization coefficient (for method 'penalty') +----------------------------------------------- + +full list of learning parameters (learn_params) +----------------------------------------------- +train_episodes: total number of episodes for training +test_episodes: total number of episodes for testing +max_steps: maximum number of steps for one episode +save_interval: time steps for saving +gamma: reward discount factor +mode: train or test +render: render each step +batch_size: UPDATE batch size +a_update_steps: actor update iteration steps +c_update_steps: critic update iteration steps +----------------------------------------------- +""" + + +def atari(env, default_seed=True): + if default_seed: + # reproducible + seed = 1 + set_seed(seed, env) + + alg_params = dict(method='clip', # method can be clip or penalty + epsilon=0.2, # for method 'clip' + kl_target=0.01, # for method 'penalty' + lam=0.5,) # for 
method 'penalty' + + if alg_params.get('net_list') is None: + num_hidden_layer = 2 # number of hidden layers for the networks + hidden_dim = 64 # dimension of hidden layers for the networks + with tf.name_scope('PPO'): + with tf.name_scope('V_Net'): + v_net = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer) + with tf.name_scope('Policy'): + policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, + [hidden_dim] * num_hidden_layer, + output_activation=tf.nn.tanh, trainable=True) + net_list = [v_net, policy_net] + alg_params['net_list'] = net_list + + if alg_params.get('optimizers_list') is None: + actor_lr = 1e-4 + critic_lr = 2e-4 + optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict(train_episodes=1000, + test_episodes=100, + max_steps=200, + save_interval=50, + gamma=0.9, + batch_size=32, + a_update_steps=10, + c_update_steps=10) + + return alg_params, learn_params + + +def classic_control(env, default_seed=True): + if default_seed: + # reproducible + seed = 1 + set_seed(seed, env) + + alg_params = dict(method='clip', # method can be clip or penalty + epsilon=0.2, # for method 'clip' + kl_target=0.01, # for method 'penalty' + lam=0.5,) # for method 'penalty' + + if alg_params.get('net_list') is None: + num_hidden_layer = 2 # number of hidden layers for the networks + hidden_dim = 64 # dimension of hidden layers for the networks + with tf.name_scope('PPO'): + with tf.name_scope('V_Net'): + v_net = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer) + with tf.name_scope('Policy'): + policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, + [hidden_dim] * num_hidden_layer, + output_activation=tf.nn.tanh, trainable=True) + net_list = [v_net, policy_net] + alg_params['net_list'] = net_list + + if alg_params.get('optimizers_list') is None: + actor_lr = 1e-4 + critic_lr = 2e-4 + optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict(train_episodes=1000, + test_episodes=100, + max_steps=200, + save_interval=50, + gamma=0.9, + batch_size=32, + a_update_steps=10, + c_update_steps=10) + + return alg_params, learn_params + + +def box2d(env, default_seed=True): + if default_seed: + # reproducible + seed = 1 + set_seed(seed, env) + + alg_params = dict(method='clip', # method can be clip or penalty + epsilon=0.2, # for method 'clip' + kl_target=0.01, # for method 'penalty' + lam=0.5,) # for method 'penalty' + + if alg_params.get('net_list') is None: + num_hidden_layer = 2 # number of hidden layers for the networks + hidden_dim = 64 # dimension of hidden layers for the networks + with tf.name_scope('PPO'): + with tf.name_scope('V_Net'): + v_net = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer) + with tf.name_scope('Policy'): + policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, + [hidden_dim] * num_hidden_layer, + output_activation=tf.nn.tanh, trainable=True) + net_list = [v_net, policy_net] + alg_params['net_list'] = net_list + + if alg_params.get('optimizers_list') is None: + actor_lr = 1e-4 + critic_lr = 2e-4 + optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict(train_episodes=1000, + test_episodes=100, + max_steps=200, + save_interval=50, + gamma=0.9, + batch_size=32, + 
a_update_steps=10, + c_update_steps=10) + + return alg_params, learn_params + + +def mujoco(env, default_seed=True): + if default_seed: + # reproducible + seed = 1 + set_seed(seed, env) + + alg_params = dict(method='clip', # method can be clip or penalty + epsilon=0.2, # for method 'clip' + kl_target=0.01, # for method 'penalty' + lam=0.5,) # for method 'penalty' + + if alg_params.get('net_list') is None: + num_hidden_layer = 2 # number of hidden layers for the networks + hidden_dim = 64 # dimension of hidden layers for the networks + with tf.name_scope('PPO'): + with tf.name_scope('V_Net'): + v_net = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer) + with tf.name_scope('Policy'): + policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, + [hidden_dim] * num_hidden_layer, + output_activation=tf.nn.tanh, trainable=True) + net_list = [v_net, policy_net] + alg_params['net_list'] = net_list + + if alg_params.get('optimizers_list') is None: + actor_lr = 1e-4 + critic_lr = 2e-4 + optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict(train_episodes=1000, + test_episodes=100, + max_steps=200, + save_interval=50, + gamma=0.9, + batch_size=32, + a_update_steps=10, + c_update_steps=10) + + return alg_params, learn_params + + +def robotics(env, default_seed=True): + if default_seed: + # reproducible + seed = 1 + set_seed(seed, env) + + alg_params = dict(method='clip', # method can be clip or penalty + epsilon=0.2, # for method 'clip' + kl_target=0.01, # for method 'penalty' + lam=0.5,) # for method 'penalty' + + if alg_params.get('net_list') is None: + num_hidden_layer = 2 # number of hidden layers for the networks + hidden_dim = 64 # dimension of hidden layers for the networks + with tf.name_scope('PPO'): + with tf.name_scope('V_Net'): + v_net = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer) + with tf.name_scope('Policy'): + policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, + [hidden_dim] * num_hidden_layer, + output_activation=tf.nn.tanh, trainable=True) + net_list = [v_net, policy_net] + alg_params['net_list'] = net_list + + if alg_params.get('optimizers_list') is None: + actor_lr = 1e-4 + critic_lr = 2e-4 + optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict(train_episodes=1000, + test_episodes=100, + max_steps=200, + save_interval=50, + gamma=0.9, + batch_size=32, + a_update_steps=10, + c_update_steps=10) + + return alg_params, learn_params + + +def dm_control(env, default_seed=True): + if default_seed: + # reproducible + seed = 1 + set_seed(seed, env) + + alg_params = dict(method='clip', # method can be clip or penalty + epsilon=0.2, # for method 'clip' + kl_target=0.01, # for method 'penalty' + lam=0.5,) # for method 'penalty' + + if alg_params.get('net_list') is None: + num_hidden_layer = 2 # number of hidden layers for the networks + hidden_dim = 64 # dimension of hidden layers for the networks + with tf.name_scope('PPO'): + with tf.name_scope('V_Net'): + v_net = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer) + with tf.name_scope('Policy'): + policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, + [hidden_dim] * num_hidden_layer, + output_activation=tf.nn.tanh, trainable=True) + net_list = [v_net, policy_net] + alg_params['net_list'] = net_list + + if 
alg_params.get('optimizers_list') is None: + actor_lr = 1e-4 + critic_lr = 2e-4 + optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict(train_episodes=1000, + test_episodes=100, + max_steps=200, + save_interval=50, + gamma=0.9, + batch_size=32, + a_update_steps=10, + c_update_steps=10) + + return alg_params, learn_params + + +def rlbench(env, default_seed=True): + if default_seed: + # reproducible + seed = 1 + set_seed(seed, env) + + alg_params = dict(method='clip', # method can be clip or penalty + epsilon=0.2, # for method 'clip' + kl_target=0.01, # for method 'penalty' + lam=0.5,) # for method 'penalty' + + if alg_params.get('net_list') is None: + num_hidden_layer = 2 # number of hidden layers for the networks + hidden_dim = 64 # dimension of hidden layers for the networks + with tf.name_scope('PPO'): + with tf.name_scope('V_Net'): + v_net = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer) + with tf.name_scope('Policy'): + policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, + [hidden_dim] * num_hidden_layer, + output_activation=tf.nn.tanh, trainable=True) + net_list = [v_net, policy_net] + alg_params['net_list'] = net_list + + if alg_params.get('optimizers_list') is None: + actor_lr = 1e-4 + critic_lr = 2e-4 + optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict(train_episodes=1000, + test_episodes=100, + max_steps=200, + save_interval=50, + gamma=0.9, + batch_size=32, + a_update_steps=10, + c_update_steps=10) + + return alg_params, learn_params diff --git a/rlzoo/algorithms/ppo/ppo.py b/rlzoo/algorithms/ppo/ppo.py old mode 100644 new mode 100755 diff --git a/rlzoo/algorithms/ppo_clip/__init__.py b/rlzoo/algorithms/ppo_clip/__init__.py old mode 100644 new mode 100755 diff --git a/rlzoo/algorithms/ppo_clip/ppo_clip.py b/rlzoo/algorithms/ppo_clip/ppo_clip.py old mode 100644 new mode 100755 diff --git a/rlzoo/algorithms/ppo_clip/run_ppo_clip.py b/rlzoo/algorithms/ppo_clip/run_ppo_clip.py old mode 100644 new mode 100755 index fff0853..99645ed --- a/rlzoo/algorithms/ppo_clip/run_ppo_clip.py +++ b/rlzoo/algorithms/ppo_clip/run_ppo_clip.py @@ -1,59 +1,59 @@ -from rlzoo.common.utils import make_env, set_seed -from rlzoo.algorithms.ppo_clip.ppo_clip import PPO_CLIP -from rlzoo.common.policy_networks import * -from rlzoo.common.value_networks import * -import gym - - -""" load environment """ -env = gym.make('Pendulum-v0').unwrapped - -# reproducible -seed = 1 -set_seed(seed, env) - -""" build networks for the algorithm """ -name = 'PPO_CLIP' -hidden_dim = 64 -num_hidden_layer = 2 -critic = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer, name=name + '_value') - -actor = StochasticPolicyNetwork(env.observation_space, env.action_space, [hidden_dim] * num_hidden_layer, - output_activation=tf.nn.tanh, name=name + '_policy') -net_list = critic, actor - -""" create model """ -actor_lr = 1e-4 -critic_lr = 2e-4 -optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)] - -model = PPO_CLIP(net_list, optimizers_list,) -""" -full list of arguments for the algorithm ----------------------------------------- -net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization -optimizers_list: a list of optimizers for all networks and differentiable variables -epsilon: clip parameter 
-""" - -model.learn(env, train_episodes=500, max_steps=200, save_interval=50, gamma=0.9, - mode='train', render=False, batch_size=32, a_update_steps=10, c_update_steps=10) - -""" -full list of parameters for training ---------------------------------------- -env: learning environment -train_episodes: total number of episodes for training -test_episodes: total number of episodes for testing -max_steps: maximum number of steps for one episode -save_interval: time steps for saving -gamma: reward discount factor -mode: train or test -render: render each step -batch_size: UPDATE batch size -a_update_steps: actor update iteration steps -c_update_steps: critic update iteration steps -:return: None -""" -model.learn(env, test_episodes=100, max_steps=200, mode='test', render=True) - +from rlzoo.common.utils import make_env, set_seed +from rlzoo.algorithms.ppo_clip.ppo_clip import PPO_CLIP +from rlzoo.common.policy_networks import * +from rlzoo.common.value_networks import * +import gym + + +""" load environment """ +env = gym.make('Pendulum-v0').unwrapped + +# reproducible +seed = 1 +set_seed(seed, env) + +""" build networks for the algorithm """ +name = 'PPO_CLIP' +hidden_dim = 64 +num_hidden_layer = 2 +critic = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer, name=name + '_value') + +actor = StochasticPolicyNetwork(env.observation_space, env.action_space, [hidden_dim] * num_hidden_layer, + output_activation=tf.nn.tanh, name=name + '_policy') +net_list = critic, actor + +""" create model """ +actor_lr = 1e-4 +critic_lr = 2e-4 +optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)] + +model = PPO_CLIP(net_list, optimizers_list,) +""" +full list of arguments for the algorithm +---------------------------------------- +net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization +optimizers_list: a list of optimizers for all networks and differentiable variables +epsilon: clip parameter +""" + +model.learn(env, train_episodes=500, max_steps=200, save_interval=50, gamma=0.9, + mode='train', render=False, batch_size=32, a_update_steps=10, c_update_steps=10) + +""" +full list of parameters for training +--------------------------------------- +env: learning environment +train_episodes: total number of episodes for training +test_episodes: total number of episodes for testing +max_steps: maximum number of steps for one episode +save_interval: time steps for saving +gamma: reward discount factor +mode: train or test +render: render each step +batch_size: UPDATE batch size +a_update_steps: actor update iteration steps +c_update_steps: critic update iteration steps +:return: None +""" +model.learn(env, test_episodes=100, max_steps=200, mode='test', render=True) + diff --git a/rlzoo/algorithms/ppo_penalty/__init__.py b/rlzoo/algorithms/ppo_penalty/__init__.py old mode 100644 new mode 100755 diff --git a/rlzoo/algorithms/ppo_penalty/ppo_penalty.py b/rlzoo/algorithms/ppo_penalty/ppo_penalty.py old mode 100644 new mode 100755 diff --git a/rlzoo/algorithms/ppo_penalty/run_ppo_penalty.py b/rlzoo/algorithms/ppo_penalty/run_ppo_penalty.py old mode 100644 new mode 100755 index 4e847d0..de6672b --- a/rlzoo/algorithms/ppo_penalty/run_ppo_penalty.py +++ b/rlzoo/algorithms/ppo_penalty/run_ppo_penalty.py @@ -1,60 +1,60 @@ -from rlzoo.common.utils import make_env, set_seed -from rlzoo.algorithms.ppo_penalty.ppo_penalty import PPO_PENALTY -from rlzoo.common.policy_networks import * -from rlzoo.common.value_networks import * 
-import gym - - -""" load environment """ -env = gym.make('Pendulum-v0').unwrapped - -# reproducible -seed = 1 -set_seed(seed, env) - -""" build networks for the algorithm """ -name = 'PPO_PENALTY' -hidden_dim = 64 -num_hidden_layer = 2 -critic = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer, name=name + '_value') - -actor = StochasticPolicyNetwork(env.observation_space, env.action_space, [hidden_dim] * num_hidden_layer, - output_activation=tf.nn.tanh, name=name + '_policy') -net_list = critic, actor - -""" create model """ -actor_lr = 1e-4 -critic_lr = 2e-4 -optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)] - -model = PPO_PENALTY(net_list, optimizers_list,) -""" -full list of arguments for the algorithm ----------------------------------------- -net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization -optimizers_list: a list of optimizers for all networks and differentiable variables -kl_target: controls bounds of policy update and adaptive lambda -lam: KL-regularization coefficient -""" - -model.learn(env, train_episodes=500, max_steps=200, save_interval=50, gamma=0.9, - mode='train', render=False, batch_size=32, a_update_steps=10, c_update_steps=10) - -""" -full list of parameters for training ---------------------------------------- -env: learning environment -train_episodes: total number of episodes for training -test_episodes: total number of episodes for testing -max_steps: maximum number of steps for one episode -save_interval: times teps for saving -gamma: reward discount factor -mode: train or test -render: render each step -batch_size: update batch size -a_update_steps: actor update iteration steps -c_update_steps: critic update iteration steps -:return: None -""" - -model.learn(env, test_episodes=100, max_steps=200, mode='test', render=True) +from rlzoo.common.utils import make_env, set_seed +from rlzoo.algorithms.ppo_penalty.ppo_penalty import PPO_PENALTY +from rlzoo.common.policy_networks import * +from rlzoo.common.value_networks import * +import gym + + +""" load environment """ +env = gym.make('Pendulum-v0').unwrapped + +# reproducible +seed = 1 +set_seed(seed, env) + +""" build networks for the algorithm """ +name = 'PPO_PENALTY' +hidden_dim = 64 +num_hidden_layer = 2 +critic = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer, name=name + '_value') + +actor = StochasticPolicyNetwork(env.observation_space, env.action_space, [hidden_dim] * num_hidden_layer, + output_activation=tf.nn.tanh, name=name + '_policy') +net_list = critic, actor + +""" create model """ +actor_lr = 1e-4 +critic_lr = 2e-4 +optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)] + +model = PPO_PENALTY(net_list, optimizers_list,) +""" +full list of arguments for the algorithm +---------------------------------------- +net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization +optimizers_list: a list of optimizers for all networks and differentiable variables +kl_target: controls bounds of policy update and adaptive lambda +lam: KL-regularization coefficient +""" + +model.learn(env, train_episodes=500, max_steps=200, save_interval=50, gamma=0.9, + mode='train', render=False, batch_size=32, a_update_steps=10, c_update_steps=10) + +""" +full list of parameters for training +--------------------------------------- +env: learning environment +train_episodes: total number of episodes for training +test_episodes: 
total number of episodes for testing +max_steps: maximum number of steps for one episode +save_interval: times teps for saving +gamma: reward discount factor +mode: train or test +render: render each step +batch_size: update batch size +a_update_steps: actor update iteration steps +c_update_steps: critic update iteration steps +:return: None +""" + +model.learn(env, test_episodes=100, max_steps=200, mode='test', render=True) diff --git a/rlzoo/algorithms/sac/__init__.py b/rlzoo/algorithms/sac/__init__.py old mode 100644 new mode 100755 diff --git a/rlzoo/algorithms/sac/default.py b/rlzoo/algorithms/sac/default.py old mode 100644 new mode 100755 index 913db21..8fb86ea --- a/rlzoo/algorithms/sac/default.py +++ b/rlzoo/algorithms/sac/default.py @@ -1,364 +1,364 @@ -from rlzoo.common.policy_networks import * -from rlzoo.common.value_networks import * -from rlzoo.common.utils import set_seed - -""" -full list of algorithm parameters (alg_params) ------------------------------------------------ -net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization -optimizers_list: a list of optimizers for all networks and differentiable variables -replay_buffer_capacity: the size of buffer for storing explored samples ------------------------------------------------ - -full list of learning parameters (learn_params) ------------------------------------------------ -train_episodes: total number of episodes for training -test_episodes: total number of episodes for testing -max_steps: maximum number of steps for one episode -batch_size: udpate batchsize -explore_steps: for random action sampling in the beginning of training -update_itr: repeated updates for single step -policy_target_update_interval: delayed update for the policy network and target networks -reward_scale: value range of reward -save_interval: timesteps for saving the weights and plotting the results -mode: 'train' or 'test' -AUTO_ENTROPY: automatically udpating variable alpha for entropy -render: if true, visualize the environment ------------------------------------------------ -""" - - -def classic_control(env, default_seed=True): - if default_seed: - seed = 2 - set_seed(seed, env) # reproducible - - alg_params = dict( - replay_buffer_capacity=5e5, - ) - if alg_params.get('net_list') is None: - num_hidden_layer = 2 # number of hidden layers for the networks - hidden_dim = 64 # dimension of hidden layers for the networks, default as the same for each layer here - with tf.name_scope('SAC'): - with tf.name_scope('Q_Net1'): - soft_q_net1 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Q_Net2'): - soft_q_net2 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Target_Q_Net1'): - target_soft_q_net1 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Target_Q_Net2'): - target_soft_q_net2 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Policy'): - policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim], - output_activation=None, - state_conditioned=True) - net_list = [soft_q_net1, soft_q_net2, target_soft_q_net1, target_soft_q_net2, policy_net] - alg_params['net_list'] = net_list - if alg_params.get('optimizers_list') is None: - 
soft_q_lr, policy_lr, alpha_lr = 3e-4, 3e-4, 3e-4 # soft_q_lr: learning rate of the Q network; policy_lr: learning rate of the policy network; alpha_lr: learning rate of the variable alpha - soft_q_optimizer1 = tf.optimizers.Adam(soft_q_lr) - soft_q_optimizer2 = tf.optimizers.Adam(soft_q_lr) - policy_optimizer = tf.optimizers.Adam(policy_lr) - alpha_optimizer = tf.optimizers.Adam(alpha_lr) - optimizers_list = [soft_q_optimizer1, soft_q_optimizer2, policy_optimizer, alpha_optimizer] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict( - max_steps=150, - batch_size=64, - explore_steps=200, - update_itr=3, - policy_target_update_interval=3, - reward_scale=1., - AUTO_ENTROPY=True, - train_episodes=100, - test_episodes=10, - save_interval=10, - ) - - return alg_params, learn_params - - -def box2d(env, default_seed=True): - if default_seed: - seed = 2 - set_seed(seed, env) # reproducible - - alg_params = dict( - replay_buffer_capacity=5e5, - ) - if alg_params.get('net_list') is None: - num_hidden_layer = 2 # number of hidden layers for the networks - hidden_dim = 64 # dimension of hidden layers for the networks, default as the same for each layer here - with tf.name_scope('SAC'): - with tf.name_scope('Q_Net1'): - soft_q_net1 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Q_Net2'): - soft_q_net2 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Target_Q_Net1'): - target_soft_q_net1 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Target_Q_Net2'): - target_soft_q_net2 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Policy'): - policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim], - output_activation=None, - state_conditioned=True) - net_list = [soft_q_net1, soft_q_net2, target_soft_q_net1, target_soft_q_net2, policy_net] - alg_params['net_list'] = net_list - if alg_params.get('optimizers_list') is None: - soft_q_lr, policy_lr, alpha_lr = 3e-4, 3e-4, 3e-4 # soft_q_lr: learning rate of the Q network; policy_lr: learning rate of the policy network; alpha_lr: learning rate of the variable alpha - soft_q_optimizer1 = tf.optimizers.Adam(soft_q_lr) - soft_q_optimizer2 = tf.optimizers.Adam(soft_q_lr) - policy_optimizer = tf.optimizers.Adam(policy_lr) - alpha_optimizer = tf.optimizers.Adam(alpha_lr) - optimizers_list = [soft_q_optimizer1, soft_q_optimizer2, policy_optimizer, alpha_optimizer] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict( - max_steps=150, - batch_size=64, - explore_steps=200, - update_itr=3, - policy_target_update_interval=3, - reward_scale=1., - AUTO_ENTROPY=True, - train_episodes=100, - test_episodes=10, - save_interval=10, - ) - - return alg_params, learn_params - - -def mujoco(env, default_seed=True): - if default_seed: - seed = 2 - set_seed(seed, env) # reproducible - - alg_params = dict( - replay_buffer_capacity=5e5, - ) - if alg_params.get('net_list') is None: - num_hidden_layer = 2 # number of hidden layers for the networks - hidden_dim = 64 # dimension of hidden layers for the networks, default as the same for each layer here - with tf.name_scope('SAC'): - with tf.name_scope('Q_Net1'): - soft_q_net1 = QNetwork(env.observation_space, env.action_space, - 
hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Q_Net2'): - soft_q_net2 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Target_Q_Net1'): - target_soft_q_net1 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Target_Q_Net2'): - target_soft_q_net2 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Policy'): - policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim], - output_activation=None, - state_conditioned=True) - net_list = [soft_q_net1, soft_q_net2, target_soft_q_net1, target_soft_q_net2, policy_net] - alg_params['net_list'] = net_list - if alg_params.get('optimizers_list') is None: - soft_q_lr, policy_lr, alpha_lr = 3e-4, 3e-4, 3e-4 # soft_q_lr: learning rate of the Q network; policy_lr: learning rate of the policy network; alpha_lr: learning rate of the variable alpha - soft_q_optimizer1 = tf.optimizers.Adam(soft_q_lr) - soft_q_optimizer2 = tf.optimizers.Adam(soft_q_lr) - policy_optimizer = tf.optimizers.Adam(policy_lr) - alpha_optimizer = tf.optimizers.Adam(alpha_lr) - optimizers_list = [soft_q_optimizer1, soft_q_optimizer2, policy_optimizer, alpha_optimizer] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict( - max_steps=150, - batch_size=64, - explore_steps=200, - update_itr=3, - policy_target_update_interval=3, - reward_scale=1., - AUTO_ENTROPY=True, - train_episodes=100, - test_episodes=10, - save_interval=10, - ) - - return alg_params, learn_params - - -def robotics(env, default_seed=True): - if default_seed: - seed = 2 - set_seed(seed, env) # reproducible - - alg_params = dict( - replay_buffer_capacity=5e5, - ) - if alg_params.get('net_list') is None: - num_hidden_layer = 2 # number of hidden layers for the networks - hidden_dim = 64 # dimension of hidden layers for the networks, default as the same for each layer here - with tf.name_scope('SAC'): - with tf.name_scope('Q_Net1'): - soft_q_net1 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Q_Net2'): - soft_q_net2 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Target_Q_Net1'): - target_soft_q_net1 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Target_Q_Net2'): - target_soft_q_net2 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Policy'): - policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim], - output_activation=None, - state_conditioned=True) - net_list = [soft_q_net1, soft_q_net2, target_soft_q_net1, target_soft_q_net2, policy_net] - alg_params['net_list'] = net_list - if alg_params.get('optimizers_list') is None: - soft_q_lr, policy_lr, alpha_lr = 3e-4, 3e-4, 3e-4 # soft_q_lr: learning rate of the Q network; policy_lr: learning rate of the policy network; alpha_lr: learning rate of the variable alpha - soft_q_optimizer1 = tf.optimizers.Adam(soft_q_lr) - soft_q_optimizer2 = tf.optimizers.Adam(soft_q_lr) - policy_optimizer = tf.optimizers.Adam(policy_lr) - alpha_optimizer = tf.optimizers.Adam(alpha_lr) - 
optimizers_list = [soft_q_optimizer1, soft_q_optimizer2, policy_optimizer, alpha_optimizer] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict( - max_steps=150, - batch_size=64, - explore_steps=200, - update_itr=3, - policy_target_update_interval=3, - reward_scale=1., - AUTO_ENTROPY=True, - train_episodes=100, - test_episodes=10, - save_interval=10, - ) - - return alg_params, learn_params - - -def dm_control(env, default_seed=True): - if default_seed: - seed = 2 - set_seed(seed, env) # reproducible - - alg_params = dict( - replay_buffer_capacity=5e5, - ) - if alg_params.get('net_list') is None: - num_hidden_layer = 2 # number of hidden layers for the networks - hidden_dim = 64 # dimension of hidden layers for the networks, default as the same for each layer here - with tf.name_scope('SAC'): - with tf.name_scope('Q_Net1'): - soft_q_net1 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Q_Net2'): - soft_q_net2 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Target_Q_Net1'): - target_soft_q_net1 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Target_Q_Net2'): - target_soft_q_net2 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Policy'): - policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim], - output_activation=None, - state_conditioned=True) - net_list = [soft_q_net1, soft_q_net2, target_soft_q_net1, target_soft_q_net2, policy_net] - alg_params['net_list'] = net_list - if alg_params.get('optimizers_list') is None: - soft_q_lr, policy_lr, alpha_lr = 3e-4, 3e-4, 3e-4 # soft_q_lr: learning rate of the Q network; policy_lr: learning rate of the policy network; alpha_lr: learning rate of the variable alpha - soft_q_optimizer1 = tf.optimizers.Adam(soft_q_lr) - soft_q_optimizer2 = tf.optimizers.Adam(soft_q_lr) - policy_optimizer = tf.optimizers.Adam(policy_lr) - alpha_optimizer = tf.optimizers.Adam(alpha_lr) - optimizers_list = [soft_q_optimizer1, soft_q_optimizer2, policy_optimizer, alpha_optimizer] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict( - max_steps=150, - batch_size=64, - explore_steps=200, - update_itr=3, - policy_target_update_interval=3, - reward_scale=1., - AUTO_ENTROPY=True, - train_episodes=100, - test_episodes=10, - save_interval=10, - ) - - return alg_params, learn_params - - -def rlbench(env, default_seed=True): - if default_seed: - seed = 2 - set_seed(seed, env) # reproducible - - alg_params = dict( - replay_buffer_capacity=5e5, - ) - if alg_params.get('net_list') is None: - num_hidden_layer = 2 # number of hidden layers for the networks - hidden_dim = 64 # dimension of hidden layers for the networks, default as the same for each layer here - with tf.name_scope('SAC'): - with tf.name_scope('Q_Net1'): - soft_q_net1 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Q_Net2'): - soft_q_net2 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Target_Q_Net1'): - target_soft_q_net1 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with 
tf.name_scope('Target_Q_Net2'): - target_soft_q_net2 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Policy'): - policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim], - output_activation=None, - state_conditioned=True) - net_list = [soft_q_net1, soft_q_net2, target_soft_q_net1, target_soft_q_net2, policy_net] - alg_params['net_list'] = net_list - if alg_params.get('optimizers_list') is None: - soft_q_lr, policy_lr, alpha_lr = 3e-4, 3e-4, 3e-4 # soft_q_lr: learning rate of the Q network; policy_lr: learning rate of the policy network; alpha_lr: learning rate of the variable alpha - soft_q_optimizer1 = tf.optimizers.Adam(soft_q_lr) - soft_q_optimizer2 = tf.optimizers.Adam(soft_q_lr) - policy_optimizer = tf.optimizers.Adam(policy_lr) - alpha_optimizer = tf.optimizers.Adam(alpha_lr) - optimizers_list = [soft_q_optimizer1, soft_q_optimizer2, policy_optimizer, alpha_optimizer] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict( - max_steps=150, - batch_size=64, - explore_steps=200, - update_itr=3, - policy_target_update_interval=3, - reward_scale=1., - AUTO_ENTROPY=True, - train_episodes=100, - test_episodes=10, - save_interval=10, - ) - - return alg_params, learn_params +from rlzoo.common.policy_networks import * +from rlzoo.common.value_networks import * +from rlzoo.common.utils import set_seed + +""" +full list of algorithm parameters (alg_params) +----------------------------------------------- +net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization +optimizers_list: a list of optimizers for all networks and differentiable variables +replay_buffer_capacity: the size of buffer for storing explored samples +----------------------------------------------- + +full list of learning parameters (learn_params) +----------------------------------------------- +train_episodes: total number of episodes for training +test_episodes: total number of episodes for testing +max_steps: maximum number of steps for one episode +batch_size: udpate batchsize +explore_steps: for random action sampling in the beginning of training +update_itr: repeated updates for single step +policy_target_update_interval: delayed update for the policy network and target networks +reward_scale: value range of reward +save_interval: timesteps for saving the weights and plotting the results +mode: 'train' or 'test' +AUTO_ENTROPY: automatically udpating variable alpha for entropy +render: if true, visualize the environment +----------------------------------------------- +""" + + +def classic_control(env, default_seed=True): + if default_seed: + seed = 2 + set_seed(seed, env) # reproducible + + alg_params = dict( + replay_buffer_capacity=5e5, + ) + if alg_params.get('net_list') is None: + num_hidden_layer = 2 # number of hidden layers for the networks + hidden_dim = 64 # dimension of hidden layers for the networks, default as the same for each layer here + with tf.name_scope('SAC'): + with tf.name_scope('Q_Net1'): + soft_q_net1 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Q_Net2'): + soft_q_net2 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Target_Q_Net1'): + target_soft_q_net1 = QNetwork(env.observation_space, env.action_space, + 
hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Target_Q_Net2'): + target_soft_q_net2 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Policy'): + policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim], + output_activation=None, + state_conditioned=True) + net_list = [soft_q_net1, soft_q_net2, target_soft_q_net1, target_soft_q_net2, policy_net] + alg_params['net_list'] = net_list + if alg_params.get('optimizers_list') is None: + soft_q_lr, policy_lr, alpha_lr = 3e-4, 3e-4, 3e-4 # soft_q_lr: learning rate of the Q network; policy_lr: learning rate of the policy network; alpha_lr: learning rate of the variable alpha + soft_q_optimizer1 = tf.optimizers.Adam(soft_q_lr) + soft_q_optimizer2 = tf.optimizers.Adam(soft_q_lr) + policy_optimizer = tf.optimizers.Adam(policy_lr) + alpha_optimizer = tf.optimizers.Adam(alpha_lr) + optimizers_list = [soft_q_optimizer1, soft_q_optimizer2, policy_optimizer, alpha_optimizer] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict( + max_steps=150, + batch_size=64, + explore_steps=200, + update_itr=3, + policy_target_update_interval=3, + reward_scale=1., + AUTO_ENTROPY=True, + train_episodes=100, + test_episodes=10, + save_interval=10, + ) + + return alg_params, learn_params + + +def box2d(env, default_seed=True): + if default_seed: + seed = 2 + set_seed(seed, env) # reproducible + + alg_params = dict( + replay_buffer_capacity=5e5, + ) + if alg_params.get('net_list') is None: + num_hidden_layer = 2 # number of hidden layers for the networks + hidden_dim = 64 # dimension of hidden layers for the networks, default as the same for each layer here + with tf.name_scope('SAC'): + with tf.name_scope('Q_Net1'): + soft_q_net1 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Q_Net2'): + soft_q_net2 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Target_Q_Net1'): + target_soft_q_net1 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Target_Q_Net2'): + target_soft_q_net2 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Policy'): + policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim], + output_activation=None, + state_conditioned=True) + net_list = [soft_q_net1, soft_q_net2, target_soft_q_net1, target_soft_q_net2, policy_net] + alg_params['net_list'] = net_list + if alg_params.get('optimizers_list') is None: + soft_q_lr, policy_lr, alpha_lr = 3e-4, 3e-4, 3e-4 # soft_q_lr: learning rate of the Q network; policy_lr: learning rate of the policy network; alpha_lr: learning rate of the variable alpha + soft_q_optimizer1 = tf.optimizers.Adam(soft_q_lr) + soft_q_optimizer2 = tf.optimizers.Adam(soft_q_lr) + policy_optimizer = tf.optimizers.Adam(policy_lr) + alpha_optimizer = tf.optimizers.Adam(alpha_lr) + optimizers_list = [soft_q_optimizer1, soft_q_optimizer2, policy_optimizer, alpha_optimizer] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict( + max_steps=150, + batch_size=64, + explore_steps=200, + update_itr=3, + policy_target_update_interval=3, + reward_scale=1., + AUTO_ENTROPY=True, + 
train_episodes=100, + test_episodes=10, + save_interval=10, + ) + + return alg_params, learn_params + + +def mujoco(env, default_seed=True): + if default_seed: + seed = 2 + set_seed(seed, env) # reproducible + + alg_params = dict( + replay_buffer_capacity=5e5, + ) + if alg_params.get('net_list') is None: + num_hidden_layer = 2 # number of hidden layers for the networks + hidden_dim = 64 # dimension of hidden layers for the networks, default as the same for each layer here + with tf.name_scope('SAC'): + with tf.name_scope('Q_Net1'): + soft_q_net1 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Q_Net2'): + soft_q_net2 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Target_Q_Net1'): + target_soft_q_net1 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Target_Q_Net2'): + target_soft_q_net2 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Policy'): + policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim], + output_activation=None, + state_conditioned=True) + net_list = [soft_q_net1, soft_q_net2, target_soft_q_net1, target_soft_q_net2, policy_net] + alg_params['net_list'] = net_list + if alg_params.get('optimizers_list') is None: + soft_q_lr, policy_lr, alpha_lr = 3e-4, 3e-4, 3e-4 # soft_q_lr: learning rate of the Q network; policy_lr: learning rate of the policy network; alpha_lr: learning rate of the variable alpha + soft_q_optimizer1 = tf.optimizers.Adam(soft_q_lr) + soft_q_optimizer2 = tf.optimizers.Adam(soft_q_lr) + policy_optimizer = tf.optimizers.Adam(policy_lr) + alpha_optimizer = tf.optimizers.Adam(alpha_lr) + optimizers_list = [soft_q_optimizer1, soft_q_optimizer2, policy_optimizer, alpha_optimizer] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict( + max_steps=150, + batch_size=64, + explore_steps=200, + update_itr=3, + policy_target_update_interval=3, + reward_scale=1., + AUTO_ENTROPY=True, + train_episodes=100, + test_episodes=10, + save_interval=10, + ) + + return alg_params, learn_params + + +def robotics(env, default_seed=True): + if default_seed: + seed = 2 + set_seed(seed, env) # reproducible + + alg_params = dict( + replay_buffer_capacity=5e5, + ) + if alg_params.get('net_list') is None: + num_hidden_layer = 2 # number of hidden layers for the networks + hidden_dim = 64 # dimension of hidden layers for the networks, default as the same for each layer here + with tf.name_scope('SAC'): + with tf.name_scope('Q_Net1'): + soft_q_net1 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Q_Net2'): + soft_q_net2 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Target_Q_Net1'): + target_soft_q_net1 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Target_Q_Net2'): + target_soft_q_net2 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Policy'): + policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim], + output_activation=None, 
+ state_conditioned=True) + net_list = [soft_q_net1, soft_q_net2, target_soft_q_net1, target_soft_q_net2, policy_net] + alg_params['net_list'] = net_list + if alg_params.get('optimizers_list') is None: + soft_q_lr, policy_lr, alpha_lr = 3e-4, 3e-4, 3e-4 # soft_q_lr: learning rate of the Q network; policy_lr: learning rate of the policy network; alpha_lr: learning rate of the variable alpha + soft_q_optimizer1 = tf.optimizers.Adam(soft_q_lr) + soft_q_optimizer2 = tf.optimizers.Adam(soft_q_lr) + policy_optimizer = tf.optimizers.Adam(policy_lr) + alpha_optimizer = tf.optimizers.Adam(alpha_lr) + optimizers_list = [soft_q_optimizer1, soft_q_optimizer2, policy_optimizer, alpha_optimizer] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict( + max_steps=150, + batch_size=64, + explore_steps=200, + update_itr=3, + policy_target_update_interval=3, + reward_scale=1., + AUTO_ENTROPY=True, + train_episodes=100, + test_episodes=10, + save_interval=10, + ) + + return alg_params, learn_params + + +def dm_control(env, default_seed=True): + if default_seed: + seed = 2 + set_seed(seed, env) # reproducible + + alg_params = dict( + replay_buffer_capacity=5e5, + ) + if alg_params.get('net_list') is None: + num_hidden_layer = 2 # number of hidden layers for the networks + hidden_dim = 64 # dimension of hidden layers for the networks, default as the same for each layer here + with tf.name_scope('SAC'): + with tf.name_scope('Q_Net1'): + soft_q_net1 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Q_Net2'): + soft_q_net2 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Target_Q_Net1'): + target_soft_q_net1 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Target_Q_Net2'): + target_soft_q_net2 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Policy'): + policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim], + output_activation=None, + state_conditioned=True) + net_list = [soft_q_net1, soft_q_net2, target_soft_q_net1, target_soft_q_net2, policy_net] + alg_params['net_list'] = net_list + if alg_params.get('optimizers_list') is None: + soft_q_lr, policy_lr, alpha_lr = 3e-4, 3e-4, 3e-4 # soft_q_lr: learning rate of the Q network; policy_lr: learning rate of the policy network; alpha_lr: learning rate of the variable alpha + soft_q_optimizer1 = tf.optimizers.Adam(soft_q_lr) + soft_q_optimizer2 = tf.optimizers.Adam(soft_q_lr) + policy_optimizer = tf.optimizers.Adam(policy_lr) + alpha_optimizer = tf.optimizers.Adam(alpha_lr) + optimizers_list = [soft_q_optimizer1, soft_q_optimizer2, policy_optimizer, alpha_optimizer] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict( + max_steps=150, + batch_size=64, + explore_steps=200, + update_itr=3, + policy_target_update_interval=3, + reward_scale=1., + AUTO_ENTROPY=True, + train_episodes=100, + test_episodes=10, + save_interval=10, + ) + + return alg_params, learn_params + + +def rlbench(env, default_seed=True): + if default_seed: + seed = 2 + set_seed(seed, env) # reproducible + + alg_params = dict( + replay_buffer_capacity=5e5, + ) + if alg_params.get('net_list') is None: + num_hidden_layer = 2 # number of hidden layers for the networks + hidden_dim = 64 # dimension 
of hidden layers for the networks, default as the same for each layer here + with tf.name_scope('SAC'): + with tf.name_scope('Q_Net1'): + soft_q_net1 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Q_Net2'): + soft_q_net2 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Target_Q_Net1'): + target_soft_q_net1 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Target_Q_Net2'): + target_soft_q_net2 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Policy'): + policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim], + output_activation=None, + state_conditioned=True) + net_list = [soft_q_net1, soft_q_net2, target_soft_q_net1, target_soft_q_net2, policy_net] + alg_params['net_list'] = net_list + if alg_params.get('optimizers_list') is None: + soft_q_lr, policy_lr, alpha_lr = 3e-4, 3e-4, 3e-4 # soft_q_lr: learning rate of the Q network; policy_lr: learning rate of the policy network; alpha_lr: learning rate of the variable alpha + soft_q_optimizer1 = tf.optimizers.Adam(soft_q_lr) + soft_q_optimizer2 = tf.optimizers.Adam(soft_q_lr) + policy_optimizer = tf.optimizers.Adam(policy_lr) + alpha_optimizer = tf.optimizers.Adam(alpha_lr) + optimizers_list = [soft_q_optimizer1, soft_q_optimizer2, policy_optimizer, alpha_optimizer] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict( + max_steps=150, + batch_size=64, + explore_steps=200, + update_itr=3, + policy_target_update_interval=3, + reward_scale=1., + AUTO_ENTROPY=True, + train_episodes=100, + test_episodes=10, + save_interval=10, + ) + + return alg_params, learn_params diff --git a/rlzoo/algorithms/sac/run_sac.py b/rlzoo/algorithms/sac/run_sac.py old mode 100644 new mode 100755 index 89efb0a..5a1edcb --- a/rlzoo/algorithms/sac/run_sac.py +++ b/rlzoo/algorithms/sac/run_sac.py @@ -1,82 +1,82 @@ -from rlzoo.algorithms.sac.sac import SAC -from rlzoo.common.policy_networks import * -from rlzoo.common.value_networks import * -import gym - -""" load environment """ -env = gym.make('Pendulum-v0').unwrapped -# env = DummyVecEnv([lambda: env]) # The algorithms require a vectorized/wrapped environment to run -action_shape = env.action_space.shape -state_shape = env.observation_space.shape -# reproducible -seed = 2 -np.random.seed(seed) -tf.random.set_seed(seed) -env.seed(seed) - -""" build networks for the algorithm """ -num_hidden_layer = 2 # number of hidden layers for the networks -hidden_dim = 64 # dimension of hidden layers for the networks, default as the same for each layer here -with tf.name_scope('SAC'): - with tf.name_scope('Q_Net1'): - soft_q_net1 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Q_Net2'): - soft_q_net2 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Target_Q_Net1'): - target_soft_q_net1 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Target_Q_Net2'): - target_soft_q_net2 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Policy'): - policy_net = 
StochasticPolicyNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim], - output_activation=None, - state_conditioned=True) -net_list = [soft_q_net1, soft_q_net2, target_soft_q_net1, target_soft_q_net2, policy_net] - -""" choose optimizers """ -soft_q_lr, policy_lr, alpha_lr = 3e-4, 3e-4, 3e-4 # soft_q_lr: learning rate of the Q network; policy_lr: learning rate of the policy network; alpha_lr: learning rate of the variable alpha -soft_q_optimizer1 = tf.optimizers.Adam(soft_q_lr) -soft_q_optimizer2 = tf.optimizers.Adam(soft_q_lr) -policy_optimizer = tf.optimizers.Adam(policy_lr) -alpha_optimizer = tf.optimizers.Adam(alpha_lr) -optimizers_list = [soft_q_optimizer1, soft_q_optimizer2, policy_optimizer, alpha_optimizer] - -model = SAC(net_list, optimizers_list) -""" -full list of arguments for the algorithm ----------------------------------------- -net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization -optimizers_list: a list of optimizers for all networks and differentiable variables -state_dim: dimension of state for the environment -action_dim: dimension of action for the environment -replay_buffer_capacity: the size of buffer for storing explored samples -action_range: value of each action in [-action_range, action_range] -""" - -model.learn(env, train_episodes=100, max_steps=150, batch_size=64, explore_steps=500, \ - update_itr=3, policy_target_update_interval=3, reward_scale=1., save_interval=10, \ - mode='train', AUTO_ENTROPY=True, render=False) -""" -full list of parameters for training ---------------------------------------- -env: learning environment -train_episodes: total number of episodes for training -test_episodes: total number of episodes for testing -max_steps: maximum number of steps for one episode -batch_size: udpate batchsize -explore_steps: for random action sampling in the beginning of training -update_itr: repeated updates for single step -policy_target_update_interval: delayed update for the policy network and target networks -reward_scale: value range of reward -save_interval: timesteps for saving the weights and plotting the results -mode: 'train' or 'test' -AUTO_ENTROPY: automatically udpating variable alpha for entropy -DETERMINISTIC: stochastic action policy if False, otherwise deterministic -render: if true, visualize the environment -""" -# test -model.learn(env, test_episodes=10, max_steps=150, mode='test', render=True) +from rlzoo.algorithms.sac.sac import SAC +from rlzoo.common.policy_networks import * +from rlzoo.common.value_networks import * +import gym + +""" load environment """ +env = gym.make('Pendulum-v0').unwrapped +# env = DummyVecEnv([lambda: env]) # The algorithms require a vectorized/wrapped environment to run +action_shape = env.action_space.shape +state_shape = env.observation_space.shape +# reproducible +seed = 2 +np.random.seed(seed) +tf.random.set_seed(seed) +env.seed(seed) + +""" build networks for the algorithm """ +num_hidden_layer = 2 # number of hidden layers for the networks +hidden_dim = 64 # dimension of hidden layers for the networks, default as the same for each layer here +with tf.name_scope('SAC'): + with tf.name_scope('Q_Net1'): + soft_q_net1 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Q_Net2'): + soft_q_net2 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Target_Q_Net1'): 
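+        # (editorial note) The two target networks built below are soft (Polyak-averaged) copies of
+        # Q_Net1/Q_Net2; SAC's update() in sac.py takes the minimum of their two estimates when
+        # forming the Bellman target, the usual clipped double-Q trick against value over-estimation.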
+ target_soft_q_net1 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Target_Q_Net2'): + target_soft_q_net2 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Policy'): + policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim], + output_activation=None, + state_conditioned=True) +net_list = [soft_q_net1, soft_q_net2, target_soft_q_net1, target_soft_q_net2, policy_net] + +""" choose optimizers """ +soft_q_lr, policy_lr, alpha_lr = 3e-4, 3e-4, 3e-4 # soft_q_lr: learning rate of the Q network; policy_lr: learning rate of the policy network; alpha_lr: learning rate of the variable alpha +soft_q_optimizer1 = tf.optimizers.Adam(soft_q_lr) +soft_q_optimizer2 = tf.optimizers.Adam(soft_q_lr) +policy_optimizer = tf.optimizers.Adam(policy_lr) +alpha_optimizer = tf.optimizers.Adam(alpha_lr) +optimizers_list = [soft_q_optimizer1, soft_q_optimizer2, policy_optimizer, alpha_optimizer] + +model = SAC(net_list, optimizers_list) +""" +full list of arguments for the algorithm +---------------------------------------- +net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization +optimizers_list: a list of optimizers for all networks and differentiable variables +state_dim: dimension of state for the environment +action_dim: dimension of action for the environment +replay_buffer_capacity: the size of buffer for storing explored samples +action_range: value of each action in [-action_range, action_range] +""" + +model.learn(env, train_episodes=100, max_steps=150, batch_size=64, explore_steps=500, \ + update_itr=3, policy_target_update_interval=3, reward_scale=1., save_interval=10, \ + mode='train', AUTO_ENTROPY=True, render=False) +""" +full list of parameters for training +--------------------------------------- +env: learning environment +train_episodes: total number of episodes for training +test_episodes: total number of episodes for testing +max_steps: maximum number of steps for one episode +batch_size: udpate batchsize +explore_steps: for random action sampling in the beginning of training +update_itr: repeated updates for single step +policy_target_update_interval: delayed update for the policy network and target networks +reward_scale: value range of reward +save_interval: timesteps for saving the weights and plotting the results +mode: 'train' or 'test' +AUTO_ENTROPY: automatically udpating variable alpha for entropy +DETERMINISTIC: stochastic action policy if False, otherwise deterministic +render: if true, visualize the environment +""" +# test +model.learn(env, test_episodes=10, max_steps=150, mode='test', render=True) diff --git a/rlzoo/algorithms/sac/sac.py b/rlzoo/algorithms/sac/sac.py old mode 100644 new mode 100755 index 8f7041b..84291a5 --- a/rlzoo/algorithms/sac/sac.py +++ b/rlzoo/algorithms/sac/sac.py @@ -1,286 +1,286 @@ -""" -Soft Actor-Critic -using target Q instead of V net: 2 Q net, 2 target Q net, 1 policy net -adding alpha loss -paper: https://arxiv.org/pdf/1812.05905.pdf -Actor policy is stochastic. 
-Env: Openai Gym Pendulum-v0, continuous action space -tensorflow 2.0.0a0 -tensorflow-probability 0.6.0 -tensorlayer 2.0.0 -&& -pip install box2d box2d-kengz --user -""" - -import time - -import tensorflow_probability as tfp -import tensorlayer as tl -from rlzoo.common.utils import * -from rlzoo.common.buffer import * -from rlzoo.common.policy_networks import * -from rlzoo.common.value_networks import * - -tfd = tfp.distributions -Normal = tfd.Normal - -tl.logging.set_verbosity(tl.logging.DEBUG) - - -class SAC(): - """ Soft Actor-Critic """ - - def __init__(self, net_list, optimizers_list, replay_buffer_capacity=5e5): - self.replay_buffer = ReplayBuffer(replay_buffer_capacity) - self.name = 'SAC' - - # get all networks - [self.soft_q_net1, self.soft_q_net2, self.target_soft_q_net1, self.target_soft_q_net2, - self.policy_net] = net_list - - assert isinstance(self.soft_q_net1, QNetwork) - assert isinstance(self.soft_q_net2, QNetwork) - assert isinstance(self.target_soft_q_net1, QNetwork) - assert isinstance(self.target_soft_q_net2, QNetwork) - assert isinstance(self.policy_net, StochasticPolicyNetwork) - assert isinstance(self.policy_net.action_space, gym.spaces.Box) - - self.action_dim = self.policy_net.action_shape[0] - - self.log_alpha = tf.Variable(0, dtype=np.float32, name='log_alpha') - self.alpha = tf.math.exp(self.log_alpha) - print('Soft Q Network (1,2): ', self.soft_q_net1) - print('Policy Network: ', self.policy_net) - - # initialize weights of target networks - self.target_soft_q_net1 = self.target_ini(self.soft_q_net1, self.target_soft_q_net1) - self.target_soft_q_net2 = self.target_ini(self.soft_q_net2, self.target_soft_q_net2) - - [self.soft_q_optimizer1, self.soft_q_optimizer2, self.policy_optimizer, self.alpha_optimizer] = optimizers_list - - def evaluate(self, state, epsilon=1e-6): - """ generate action with state for calculating gradients """ - _ = self.policy_net(state) - mean, log_std = self.policy_net.policy_dist.get_param() # as SAC uses TanhNorm instead of normal distribution, need original mean_std - std = tf.math.exp(log_std) # no clip in evaluation, clip affects gradients flow - - normal = Normal(0, 1) - z = normal.sample(mean.shape) - action_0 = tf.math.tanh(mean + std * z) # TanhNormal distribution as actions; reparameterization trick - # according to original paper, with an extra last term for normalizing different action range - log_prob = Normal(mean, std).log_prob(mean + std * z) - tf.math.log(1. - action_0 ** 2 + epsilon) - # both dims of normal.log_prob and -log(1-a**2) are (N,dim_of_action); - # the Normal.log_prob outputs the same dim of input features instead of 1 dim probability, - # needs sum up across the dim of actions to get 1 dim probability; or else use Multivariate Normal. 
- log_prob = tf.reduce_sum(log_prob, axis=1)[:, np.newaxis] # expand dim as reduce_sum causes 1 dim reduced - - action = action_0 * self.policy_net.policy_dist.action_scale + self.policy_net.policy_dist.action_mean - - return action, log_prob, z, mean, log_std - - def get_action(self, state): - """ generate action with state for interaction with envronment """ - action, _, _, _, _ = self.evaluate(np.array([state])) - return action.numpy()[0] - - def get_action_greedy(self, state): - """ generate action with state for interaction with envronment """ - mean = self.policy_net(np.array([state]), greedy=True).numpy()[0] - action = tf.math.tanh(mean) * self.policy_net.policy_dist.action_scale + self.policy_net.policy_dist.action_mean - return action - - def sample_action(self, ): - """ generate random actions for exploration """ - return self.policy_net.random_sample() - - def target_ini(self, net, target_net): - """ hard-copy update for initializing target networks """ - for target_param, param in zip(target_net.trainable_weights, net.trainable_weights): - target_param.assign(param) - return target_net - - def target_soft_update(self, net, target_net, soft_tau): - """ soft update the target net with Polyak averaging """ - for target_param, param in zip(target_net.trainable_weights, net.trainable_weights): - target_param.assign( # copy weight value into target parameters - target_param * (1.0 - soft_tau) + param * soft_tau - ) - return target_net - - def update(self, batch_size, reward_scale=10., auto_entropy=True, target_entropy=-2, gamma=0.99, soft_tau=1e-2): - """ update all networks in SAC """ - state, action, reward, next_state, done = self.replay_buffer.sample(batch_size) - - reward = reward[:, np.newaxis] # expand dim - done = done[:, np.newaxis] - - reward = reward_scale * (reward - - np.mean(reward, axis=0)) / ( - np.std(reward, axis=0) + 1e-6) # normalize with batch mean and std - - # Training Q Function - new_next_action, next_log_prob, _, _, _ = self.evaluate(next_state) - target_q_min = tf.minimum( - self.target_soft_q_net1([next_state, new_next_action]), - self.target_soft_q_net2([next_state, new_next_action]) - ) - self.alpha * next_log_prob - target_q_value = reward + (1 - done) * gamma * target_q_min # if done==1, only reward - - with tf.GradientTape() as q1_tape: - predicted_q_value1 = self.soft_q_net1([state, action]) - q_value_loss1 = tf.reduce_mean(tf.losses.mean_squared_error(predicted_q_value1, target_q_value)) - q1_grad = q1_tape.gradient(q_value_loss1, self.soft_q_net1.trainable_weights) - self.soft_q_optimizer1.apply_gradients(zip(q1_grad, self.soft_q_net1.trainable_weights)) - - with tf.GradientTape() as q2_tape: - predicted_q_value2 = self.soft_q_net2([state, action]) - q_value_loss2 = tf.reduce_mean(tf.losses.mean_squared_error(predicted_q_value2, target_q_value)) - q2_grad = q2_tape.gradient(q_value_loss2, self.soft_q_net2.trainable_weights) - self.soft_q_optimizer2.apply_gradients(zip(q2_grad, self.soft_q_net2.trainable_weights)) - - # Training Policy Function - with tf.GradientTape() as p_tape: - new_action, log_prob, z, mean, log_std = self.evaluate(state) - """ implementation 1 """ - predicted_new_q_value = tf.minimum(self.soft_q_net1([state, new_action]), - self.soft_q_net2([state, new_action])) - """ implementation 2 """ - # predicted_new_q_value = self.soft_q_net1([state, new_action]) - policy_loss = tf.reduce_mean(self.alpha * log_prob - predicted_new_q_value) - p_grad = p_tape.gradient(policy_loss, self.policy_net.trainable_weights) - 
self.policy_optimizer.apply_gradients(zip(p_grad, self.policy_net.trainable_weights)) - - # Updating alpha w.r.t entropy - # alpha: trade-off between exploration (max entropy) and exploitation (max Q) - if auto_entropy is True: - with tf.GradientTape() as alpha_tape: - alpha_loss = -tf.reduce_mean((self.log_alpha * (log_prob + target_entropy))) - alpha_grad = alpha_tape.gradient(alpha_loss, [self.log_alpha]) - self.alpha_optimizer.apply_gradients(zip(alpha_grad, [self.log_alpha])) - self.alpha = tf.math.exp(self.log_alpha) - else: # fixed alpha - self.alpha = 1. - alpha_loss = 0 - - # Soft update the target value nets - self.target_soft_q_net1 = self.target_soft_update(self.soft_q_net1, self.target_soft_q_net1, soft_tau) - self.target_soft_q_net2 = self.target_soft_update(self.soft_q_net2, self.target_soft_q_net2, soft_tau) - - def save_ckpt(self, env_name): - """ save trained weights """ - save_model(self.soft_q_net1, 'model_q_net1', self.name, env_name) - save_model(self.soft_q_net2, 'model_q_net2', self.name, env_name) - save_model(self.target_soft_q_net1, 'model_target_q_net1', self.name, env_name) - save_model(self.target_soft_q_net2, 'model_target_q_net2', self.name, env_name) - save_model(self.policy_net, 'model_policy_net', self.name, env_name) - - def load_ckpt(self, env_name): - """ load trained weights """ - load_model(self.soft_q_net1, 'model_q_net1', self.name, env_name) - load_model(self.soft_q_net2, 'model_q_net2', self.name, env_name) - load_model(self.target_soft_q_net1, 'model_target_q_net1', self.name, env_name) - load_model(self.target_soft_q_net2, 'model_target_q_net2', self.name, env_name) - load_model(self.policy_net, 'model_policy_net', self.name, env_name) - - def learn(self, env, train_episodes=1000, test_episodes=1000, max_steps=150, batch_size=64, explore_steps=500, - update_itr=3, policy_target_update_interval=3, reward_scale=1., save_interval=20, - mode='train', AUTO_ENTROPY=True, render=False, plot_func=None): - """ - :param env: learning environment - :param train_episodes: total number of episodes for training - :param test_episodes: total number of episodes for testing - :param max_steps: maximum number of steps for one episode - :param batch_size: udpate batchsize - :param explore_steps: for random action sampling in the beginning of training - :param update_itr: repeated updates for single step - :param policy_target_update_interval: delayed update for the policy network and target networks - :param reward_scale: value range of reward - :param save_interval: timesteps for saving the weights and plotting the results - :param mode: 'train' or 'test' - :param AUTO_ENTROPY: automatically updating variable alpha for entropy - :param render: if true, visualize the environment - :param plot_func: additional function for interactive module - """ - - # training loop - if mode == 'train': - print('Training... 
| Algorithm: {} | Environment: {}'.format(self.name, env.spec.id)) - frame_idx = 0 - rewards = [] - t0 = time.time() - for eps in range(train_episodes): - state = env.reset() - episode_reward = 0 - - for step in range(max_steps): - if frame_idx > explore_steps: - action = self.get_action(state) - else: - action = self.sample_action() - - next_state, reward, done, _ = env.step(action) - if render: env.render() - done = 1 if done == True else 0 - - self.replay_buffer.push(state, action, reward, next_state, done) - - state = next_state - episode_reward += reward - frame_idx += 1 - - if len(self.replay_buffer) > batch_size: - for i in range(update_itr): - self.update( - batch_size, reward_scale=reward_scale, auto_entropy=AUTO_ENTROPY, - target_entropy=-1. * self.action_dim - ) - - if done: - break - if eps % int(save_interval) == 0: - plot_save_log(rewards, algorithm_name=self.name, env_name=env.spec.id) - self.save_ckpt(env_name=env.spec.id) - print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}' \ - .format(eps, train_episodes, episode_reward, time.time() - t0)) - rewards.append(episode_reward) - if plot_func is not None: - plot_func(rewards) - plot_save_log(rewards, algorithm_name=self.name, env_name=env.spec.id) - self.save_ckpt(env_name=env.spec.id) - - elif mode == 'test': - frame_idx = 0 - rewards = [] - t0 = time.time() - self.load_ckpt(env_name=env.spec.id) - print('Testing... | Algorithm: {} | Environment: {}'.format(self.name, env.spec.id)) - # set test mode - self.soft_q_net1.eval() - self.soft_q_net2.eval() - self.target_soft_q_net1.eval() - self.target_soft_q_net2.eval() - self.policy_net.eval() - - for eps in range(test_episodes): - state = env.reset() - episode_reward = 0 - - for step in range(max_steps): - action = self.get_action_greedy(state) - next_state, reward, done, _ = env.step(action) - if render: env.render() - done = 1 if done == True else 0 - - state = next_state - episode_reward += reward - frame_idx += 1 - if done: - break - print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}' \ - .format(eps, test_episodes, episode_reward, time.time() - t0)) - rewards.append(episode_reward) - if plot_func: - plot_func(rewards) - - else: - print('unknow mode type') +""" +Soft Actor-Critic +using target Q instead of V net: 2 Q net, 2 target Q net, 1 policy net +adding alpha loss +paper: https://arxiv.org/pdf/1812.05905.pdf +Actor policy is stochastic. 
+Env: Openai Gym Pendulum-v0, continuous action space +tensorflow 2.0.0a0 +tensorflow-probability 0.6.0 +tensorlayer 2.0.0 +&& +pip install box2d box2d-kengz --user +""" + +import time + +import tensorflow_probability as tfp +import tensorlayer as tl +from rlzoo.common.utils import * +from rlzoo.common.buffer import * +from rlzoo.common.policy_networks import * +from rlzoo.common.value_networks import * + +tfd = tfp.distributions +Normal = tfd.Normal + +tl.logging.set_verbosity(tl.logging.DEBUG) + + +class SAC(): + """ Soft Actor-Critic """ + + def __init__(self, net_list, optimizers_list, replay_buffer_capacity=5e5): + self.replay_buffer = ReplayBuffer(replay_buffer_capacity) + self.name = 'SAC' + + # get all networks + [self.soft_q_net1, self.soft_q_net2, self.target_soft_q_net1, self.target_soft_q_net2, + self.policy_net] = net_list + + assert isinstance(self.soft_q_net1, QNetwork) + assert isinstance(self.soft_q_net2, QNetwork) + assert isinstance(self.target_soft_q_net1, QNetwork) + assert isinstance(self.target_soft_q_net2, QNetwork) + assert isinstance(self.policy_net, StochasticPolicyNetwork) + assert isinstance(self.policy_net.action_space, gym.spaces.Box) + + self.action_dim = self.policy_net.action_shape[0] + + self.log_alpha = tf.Variable(0, dtype=np.float32, name='log_alpha') + self.alpha = tf.math.exp(self.log_alpha) + print('Soft Q Network (1,2): ', self.soft_q_net1) + print('Policy Network: ', self.policy_net) + + # initialize weights of target networks + self.target_soft_q_net1 = self.target_ini(self.soft_q_net1, self.target_soft_q_net1) + self.target_soft_q_net2 = self.target_ini(self.soft_q_net2, self.target_soft_q_net2) + + [self.soft_q_optimizer1, self.soft_q_optimizer2, self.policy_optimizer, self.alpha_optimizer] = optimizers_list + + def evaluate(self, state, epsilon=1e-6): + """ generate action with state for calculating gradients """ + _ = self.policy_net(state) + mean, log_std = self.policy_net.policy_dist.get_param() # as SAC uses TanhNorm instead of normal distribution, need original mean_std + std = tf.math.exp(log_std) # no clip in evaluation, clip affects gradients flow + + normal = Normal(0, 1) + z = normal.sample(mean.shape) + action_0 = tf.math.tanh(mean + std * z) # TanhNormal distribution as actions; reparameterization trick + # according to original paper, with an extra last term for normalizing different action range + log_prob = Normal(mean, std).log_prob(mean + std * z) - tf.math.log(1. - action_0 ** 2 + epsilon) + # both dims of normal.log_prob and -log(1-a**2) are (N,dim_of_action); + # the Normal.log_prob outputs the same dim of input features instead of 1 dim probability, + # needs sum up across the dim of actions to get 1 dim probability; or else use Multivariate Normal. 
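+        # (editorial note) In formula form: with u ~ N(mean, std) and a0 = tanh(u),
+        #     log pi(a|s) = sum_i [ log N(u_i; mean_i, std_i) - log(1 - tanh(u_i)^2 + epsilon) ],
+        # where the second term is the change-of-variables correction for the tanh squashing;
+        # the reduce_sum on the next line carries out this sum over the action dimensions.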
+ log_prob = tf.reduce_sum(log_prob, axis=1)[:, np.newaxis] # expand dim as reduce_sum causes 1 dim reduced + + action = action_0 * self.policy_net.policy_dist.action_scale + self.policy_net.policy_dist.action_mean + + return action, log_prob, z, mean, log_std + + def get_action(self, state): + """ generate action with state for interaction with envronment """ + action, _, _, _, _ = self.evaluate(np.array([state])) + return action.numpy()[0] + + def get_action_greedy(self, state): + """ generate action with state for interaction with envronment """ + mean = self.policy_net(np.array([state]), greedy=True).numpy()[0] + action = tf.math.tanh(mean) * self.policy_net.policy_dist.action_scale + self.policy_net.policy_dist.action_mean + return action + + def sample_action(self, ): + """ generate random actions for exploration """ + return self.policy_net.random_sample() + + def target_ini(self, net, target_net): + """ hard-copy update for initializing target networks """ + for target_param, param in zip(target_net.trainable_weights, net.trainable_weights): + target_param.assign(param) + return target_net + + def target_soft_update(self, net, target_net, soft_tau): + """ soft update the target net with Polyak averaging """ + for target_param, param in zip(target_net.trainable_weights, net.trainable_weights): + target_param.assign( # copy weight value into target parameters + target_param * (1.0 - soft_tau) + param * soft_tau + ) + return target_net + + def update(self, batch_size, reward_scale=10., auto_entropy=True, target_entropy=-2, gamma=0.99, soft_tau=1e-2): + """ update all networks in SAC """ + state, action, reward, next_state, done = self.replay_buffer.sample(batch_size) + + reward = reward[:, np.newaxis] # expand dim + done = done[:, np.newaxis] + + reward = reward_scale * (reward - + np.mean(reward, axis=0)) / ( + np.std(reward, axis=0) + 1e-6) # normalize with batch mean and std + + # Training Q Function + new_next_action, next_log_prob, _, _, _ = self.evaluate(next_state) + target_q_min = tf.minimum( + self.target_soft_q_net1([next_state, new_next_action]), + self.target_soft_q_net2([next_state, new_next_action]) + ) - self.alpha * next_log_prob + target_q_value = reward + (1 - done) * gamma * target_q_min # if done==1, only reward + + with tf.GradientTape() as q1_tape: + predicted_q_value1 = self.soft_q_net1([state, action]) + q_value_loss1 = tf.reduce_mean(tf.losses.mean_squared_error(predicted_q_value1, target_q_value)) + q1_grad = q1_tape.gradient(q_value_loss1, self.soft_q_net1.trainable_weights) + self.soft_q_optimizer1.apply_gradients(zip(q1_grad, self.soft_q_net1.trainable_weights)) + + with tf.GradientTape() as q2_tape: + predicted_q_value2 = self.soft_q_net2([state, action]) + q_value_loss2 = tf.reduce_mean(tf.losses.mean_squared_error(predicted_q_value2, target_q_value)) + q2_grad = q2_tape.gradient(q_value_loss2, self.soft_q_net2.trainable_weights) + self.soft_q_optimizer2.apply_gradients(zip(q2_grad, self.soft_q_net2.trainable_weights)) + + # Training Policy Function + with tf.GradientTape() as p_tape: + new_action, log_prob, z, mean, log_std = self.evaluate(state) + """ implementation 1 """ + predicted_new_q_value = tf.minimum(self.soft_q_net1([state, new_action]), + self.soft_q_net2([state, new_action])) + """ implementation 2 """ + # predicted_new_q_value = self.soft_q_net1([state, new_action]) + policy_loss = tf.reduce_mean(self.alpha * log_prob - predicted_new_q_value) + p_grad = p_tape.gradient(policy_loss, self.policy_net.trainable_weights) + 
self.policy_optimizer.apply_gradients(zip(p_grad, self.policy_net.trainable_weights)) + + # Updating alpha w.r.t entropy + # alpha: trade-off between exploration (max entropy) and exploitation (max Q) + if auto_entropy is True: + with tf.GradientTape() as alpha_tape: + alpha_loss = -tf.reduce_mean((self.log_alpha * (log_prob + target_entropy))) + alpha_grad = alpha_tape.gradient(alpha_loss, [self.log_alpha]) + self.alpha_optimizer.apply_gradients(zip(alpha_grad, [self.log_alpha])) + self.alpha = tf.math.exp(self.log_alpha) + else: # fixed alpha + self.alpha = 1. + alpha_loss = 0 + + # Soft update the target value nets + self.target_soft_q_net1 = self.target_soft_update(self.soft_q_net1, self.target_soft_q_net1, soft_tau) + self.target_soft_q_net2 = self.target_soft_update(self.soft_q_net2, self.target_soft_q_net2, soft_tau) + + def save_ckpt(self, env_name): + """ save trained weights """ + save_model(self.soft_q_net1, 'model_q_net1', self.name, env_name) + save_model(self.soft_q_net2, 'model_q_net2', self.name, env_name) + save_model(self.target_soft_q_net1, 'model_target_q_net1', self.name, env_name) + save_model(self.target_soft_q_net2, 'model_target_q_net2', self.name, env_name) + save_model(self.policy_net, 'model_policy_net', self.name, env_name) + + def load_ckpt(self, env_name): + """ load trained weights """ + load_model(self.soft_q_net1, 'model_q_net1', self.name, env_name) + load_model(self.soft_q_net2, 'model_q_net2', self.name, env_name) + load_model(self.target_soft_q_net1, 'model_target_q_net1', self.name, env_name) + load_model(self.target_soft_q_net2, 'model_target_q_net2', self.name, env_name) + load_model(self.policy_net, 'model_policy_net', self.name, env_name) + + def learn(self, env, train_episodes=1000, test_episodes=1000, max_steps=150, batch_size=64, explore_steps=500, + update_itr=3, policy_target_update_interval=3, reward_scale=1., save_interval=20, + mode='train', AUTO_ENTROPY=True, render=False, plot_func=None): + """ + :param env: learning environment + :param train_episodes: total number of episodes for training + :param test_episodes: total number of episodes for testing + :param max_steps: maximum number of steps for one episode + :param batch_size: udpate batchsize + :param explore_steps: for random action sampling in the beginning of training + :param update_itr: repeated updates for single step + :param policy_target_update_interval: delayed update for the policy network and target networks + :param reward_scale: value range of reward + :param save_interval: timesteps for saving the weights and plotting the results + :param mode: 'train' or 'test' + :param AUTO_ENTROPY: automatically updating variable alpha for entropy + :param render: if true, visualize the environment + :param plot_func: additional function for interactive module + """ + + # training loop + if mode == 'train': + print('Training... 
| Algorithm: {} | Environment: {}'.format(self.name, env.spec.id)) + frame_idx = 0 + rewards = [] + t0 = time.time() + for eps in range(train_episodes): + state = env.reset() + episode_reward = 0 + + for step in range(max_steps): + if frame_idx > explore_steps: + action = self.get_action(state) + else: + action = self.sample_action() + + next_state, reward, done, _ = env.step(action) + if render: env.render() + done = 1 if done == True else 0 + + self.replay_buffer.push(state, action, reward, next_state, done) + + state = next_state + episode_reward += reward + frame_idx += 1 + + if len(self.replay_buffer) > batch_size: + for i in range(update_itr): + self.update( + batch_size, reward_scale=reward_scale, auto_entropy=AUTO_ENTROPY, + target_entropy=-1. * self.action_dim + ) + + if done: + break + if eps % int(save_interval) == 0: + plot_save_log(rewards, algorithm_name=self.name, env_name=env.spec.id) + self.save_ckpt(env_name=env.spec.id) + print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}' \ + .format(eps, train_episodes, episode_reward, time.time() - t0)) + rewards.append(episode_reward) + if plot_func is not None: + plot_func(rewards) + plot_save_log(rewards, algorithm_name=self.name, env_name=env.spec.id) + self.save_ckpt(env_name=env.spec.id) + + elif mode == 'test': + frame_idx = 0 + rewards = [] + t0 = time.time() + self.load_ckpt(env_name=env.spec.id) + print('Testing... | Algorithm: {} | Environment: {}'.format(self.name, env.spec.id)) + # set test mode + self.soft_q_net1.eval() + self.soft_q_net2.eval() + self.target_soft_q_net1.eval() + self.target_soft_q_net2.eval() + self.policy_net.eval() + + for eps in range(test_episodes): + state = env.reset() + episode_reward = 0 + + for step in range(max_steps): + action = self.get_action_greedy(state) + next_state, reward, done, _ = env.step(action) + if render: env.render() + done = 1 if done == True else 0 + + state = next_state + episode_reward += reward + frame_idx += 1 + if done: + break + print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}' \ + .format(eps, test_episodes, episode_reward, time.time() - t0)) + rewards.append(episode_reward) + if plot_func: + plot_func(rewards) + + else: + print('unknow mode type') diff --git a/rlzoo/algorithms/td3/__init__.py b/rlzoo/algorithms/td3/__init__.py old mode 100644 new mode 100755 diff --git a/rlzoo/algorithms/td3/default.py b/rlzoo/algorithms/td3/default.py old mode 100644 new mode 100755 index 6240103..8994450 --- a/rlzoo/algorithms/td3/default.py +++ b/rlzoo/algorithms/td3/default.py @@ -1,371 +1,371 @@ -from rlzoo.common.policy_networks import * -from rlzoo.common.value_networks import * -from rlzoo.common.utils import set_seed - -""" -full list of algorithm parameters (alg_params) ------------------------------------------------ -net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization -optimizers_list: a list of optimizers for all networks and differentiable variables -replay_buffer_capacity: the size of buffer for storing explored samples -policy_target_update_interval: delayed interval for updating the target policy ------------------------------------------------ - -full list of learning parameters (learn_params) ------------------------------------------------ -train_episodes: total number of episodes for training -test_episodes: total number of episodes for testing -max_steps: maximum number of steps for one episode -batch_size: udpate batchsize -explore_steps: for random action sampling 
in the beginning of training -update_itr: repeated updates for single step -reward_scale: value range of reward -save_interval: timesteps for saving the weights and plotting the results -explore_noise_scale: range of action noise for exploration -eval_noise_scale: range of action noise for evaluation of action value -mode: 'train' or 'test' -render: if true, visualize the environment ------------------------------------------------ -""" - - -def classic_control(env, default_seed=True): - if default_seed: - seed = 2 - set_seed(seed, env) # reproducible - - alg_params = dict( - replay_buffer_capacity=5e5, - policy_target_update_interval=5, - ) - if alg_params.get('net_list') is None: - num_hidden_layer = 2 # number of hidden layers for the networks - hidden_dim = 64 # dimension of hidden layers for the networks - with tf.name_scope('TD3'): - with tf.name_scope('Q_Net1'): - q_net1 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Q_Net2'): - q_net2 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Target_Q_Net1'): - target_q_net1 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Target_Q_Net2'): - target_q_net2 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Policy'): - policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Target_Policy'): - target_policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - net_list = [q_net1, q_net2, target_q_net1, target_q_net2, policy_net, target_policy_net] - alg_params['net_list'] = net_list - if alg_params.get('optimizers_list') is None: - q_lr, policy_lr = 3e-4, 3e-4 # q_lr: learning rate of the Q network; policy_lr: learning rate of the policy network - q_optimizer1 = tf.optimizers.Adam(q_lr) - q_optimizer2 = tf.optimizers.Adam(q_lr) - policy_optimizer = tf.optimizers.Adam(policy_lr) - optimizers_list = [q_optimizer1, q_optimizer2, policy_optimizer] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict( - max_steps=150, - batch_size=64, - explore_steps=500, - update_itr=3, - reward_scale=1., - explore_noise_scale=1.0, - eval_noise_scale=0.5, - train_episodes=100, - test_episodes=10, - save_interval=10, - ) - - return alg_params, learn_params - - -def box2d(env, default_seed=True): - if default_seed: - seed = 2 - set_seed(seed, env) # reproducible - - alg_params = dict( - replay_buffer_capacity=5e5, - policy_target_update_interval=5, - ) - if alg_params.get('net_list') is None: - num_hidden_layer = 2 # number of hidden layers for the networks - hidden_dim = 64 # dimension of hidden layers for the networks - with tf.name_scope('TD3'): - with tf.name_scope('Q_Net1'): - q_net1 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Q_Net2'): - q_net2 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Target_Q_Net1'): - target_q_net1 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Target_Q_Net2'): - target_q_net2 = QNetwork(env.observation_space, 
env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Policy'): - policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Target_Policy'): - target_policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - net_list = [q_net1, q_net2, target_q_net1, target_q_net2, policy_net, target_policy_net] - alg_params['net_list'] = net_list - if alg_params.get('optimizers_list') is None: - q_lr, policy_lr = 3e-4, 3e-4 # q_lr: learning rate of the Q network; policy_lr: learning rate of the policy network - q_optimizer1 = tf.optimizers.Adam(q_lr) - q_optimizer2 = tf.optimizers.Adam(q_lr) - policy_optimizer = tf.optimizers.Adam(policy_lr) - optimizers_list = [q_optimizer1, q_optimizer2, policy_optimizer] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict( - max_steps=150, - batch_size=64, - explore_steps=500, - update_itr=3, - reward_scale=1., - explore_noise_scale=1.0, - eval_noise_scale=0.5, - train_episodes=100, - test_episodes=10, - save_interval=10, - ) - - return alg_params, learn_params - - -def mujoco(env, default_seed=True): - if default_seed: - seed = 2 - set_seed(seed, env) # reproducible - - alg_params = dict( - replay_buffer_capacity=5e5, - policy_target_update_interval=5, - ) - if alg_params.get('net_list') is None: - num_hidden_layer = 2 # number of hidden layers for the networks - hidden_dim = 64 # dimension of hidden layers for the networks - with tf.name_scope('TD3'): - with tf.name_scope('Q_Net1'): - q_net1 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Q_Net2'): - q_net2 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Target_Q_Net1'): - target_q_net1 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Target_Q_Net2'): - target_q_net2 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Policy'): - policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Target_Policy'): - target_policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - net_list = [q_net1, q_net2, target_q_net1, target_q_net2, policy_net, target_policy_net] - alg_params['net_list'] = net_list - if alg_params.get('optimizers_list') is None: - q_lr, policy_lr = 3e-4, 3e-4 # q_lr: learning rate of the Q network; policy_lr: learning rate of the policy network - q_optimizer1 = tf.optimizers.Adam(q_lr) - q_optimizer2 = tf.optimizers.Adam(q_lr) - policy_optimizer = tf.optimizers.Adam(policy_lr) - optimizers_list = [q_optimizer1, q_optimizer2, policy_optimizer] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict( - max_steps=150, - batch_size=64, - explore_steps=500, - update_itr=3, - reward_scale=1., - explore_noise_scale=1.0, - eval_noise_scale=0.5, - train_episodes=100, - test_episodes=10, - save_interval=10, - ) - - return alg_params, learn_params - - -def robotics(env, default_seed=True): - if default_seed: - seed = 2 - set_seed(seed, env) # reproducible - - alg_params = dict( - replay_buffer_capacity=5e5, 
- policy_target_update_interval=5, - ) - if alg_params.get('net_list') is None: - num_hidden_layer = 2 # number of hidden layers for the networks - hidden_dim = 64 # dimension of hidden layers for the networks - with tf.name_scope('TD3'): - with tf.name_scope('Q_Net1'): - q_net1 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Q_Net2'): - q_net2 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Target_Q_Net1'): - target_q_net1 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Target_Q_Net2'): - target_q_net2 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Policy'): - policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Target_Policy'): - target_policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - net_list = [q_net1, q_net2, target_q_net1, target_q_net2, policy_net, target_policy_net] - alg_params['net_list'] = net_list - if alg_params.get('optimizers_list') is None: - q_lr, policy_lr = 3e-4, 3e-4 # q_lr: learning rate of the Q network; policy_lr: learning rate of the policy network - q_optimizer1 = tf.optimizers.Adam(q_lr) - q_optimizer2 = tf.optimizers.Adam(q_lr) - policy_optimizer = tf.optimizers.Adam(policy_lr) - optimizers_list = [q_optimizer1, q_optimizer2, policy_optimizer] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict( - max_steps=150, - batch_size=64, - explore_steps=500, - update_itr=3, - reward_scale=1., - explore_noise_scale=1.0, - eval_noise_scale=0.5, - train_episodes=100, - test_episodes=10, - save_interval=10, - ) - - return alg_params, learn_params - - -def dm_control(env, default_seed=True): - if default_seed: - seed = 2 - set_seed(seed, env) # reproducible - - alg_params = dict( - replay_buffer_capacity=5e5, - policy_target_update_interval=5, - ) - if alg_params.get('net_list') is None: - num_hidden_layer = 2 # number of hidden layers for the networks - hidden_dim = 64 # dimension of hidden layers for the networks - with tf.name_scope('TD3'): - with tf.name_scope('Q_Net1'): - q_net1 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Q_Net2'): - q_net2 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Target_Q_Net1'): - target_q_net1 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Target_Q_Net2'): - target_q_net2 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Policy'): - policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Target_Policy'): - target_policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - net_list = [q_net1, q_net2, target_q_net1, target_q_net2, policy_net, target_policy_net] - alg_params['net_list'] = net_list - if alg_params.get('optimizers_list') is None: - q_lr, policy_lr = 
3e-4, 3e-4 # q_lr: learning rate of the Q network; policy_lr: learning rate of the policy network - q_optimizer1 = tf.optimizers.Adam(q_lr) - q_optimizer2 = tf.optimizers.Adam(q_lr) - policy_optimizer = tf.optimizers.Adam(policy_lr) - optimizers_list = [q_optimizer1, q_optimizer2, policy_optimizer] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict( - max_steps=150, - batch_size=64, - explore_steps=500, - update_itr=3, - reward_scale=1., - explore_noise_scale=1.0, - eval_noise_scale=0.5, - train_episodes=100, - test_episodes=10, - save_interval=10, - ) - - return alg_params, learn_params - - -def rlbench(env, default_seed=True): - if default_seed: - seed = 2 - set_seed(seed, env) # reproducible - - alg_params = dict( - replay_buffer_capacity=5e5, - policy_target_update_interval=5, - ) - if alg_params.get('net_list') is None: - num_hidden_layer = 2 # number of hidden layers for the networks - hidden_dim = 64 # dimension of hidden layers for the networks - with tf.name_scope('TD3'): - with tf.name_scope('Q_Net1'): - q_net1 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Q_Net2'): - q_net2 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Target_Q_Net1'): - target_q_net1 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Target_Q_Net2'): - target_q_net2 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Policy'): - policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Target_Policy'): - target_policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - net_list = [q_net1, q_net2, target_q_net1, target_q_net2, policy_net, target_policy_net] - alg_params['net_list'] = net_list - if alg_params.get('optimizers_list') is None: - q_lr, policy_lr = 3e-4, 3e-4 # q_lr: learning rate of the Q network; policy_lr: learning rate of the policy network - q_optimizer1 = tf.optimizers.Adam(q_lr) - q_optimizer2 = tf.optimizers.Adam(q_lr) - policy_optimizer = tf.optimizers.Adam(policy_lr) - optimizers_list = [q_optimizer1, q_optimizer2, policy_optimizer] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict( - max_steps=150, - batch_size=64, - explore_steps=500, - update_itr=3, - reward_scale=1., - explore_noise_scale=1.0, - eval_noise_scale=0.5, - train_episodes=100, - test_episodes=10, - save_interval=10, - ) - - return alg_params, learn_params +from rlzoo.common.policy_networks import * +from rlzoo.common.value_networks import * +from rlzoo.common.utils import set_seed + +""" +full list of algorithm parameters (alg_params) +----------------------------------------------- +net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization +optimizers_list: a list of optimizers for all networks and differentiable variables +replay_buffer_capacity: the size of buffer for storing explored samples +policy_target_update_interval: delayed interval for updating the target policy +----------------------------------------------- + +full list of learning parameters (learn_params) +----------------------------------------------- +train_episodes: total number of episodes 
for training +test_episodes: total number of episodes for testing +max_steps: maximum number of steps for one episode +batch_size: udpate batchsize +explore_steps: for random action sampling in the beginning of training +update_itr: repeated updates for single step +reward_scale: value range of reward +save_interval: timesteps for saving the weights and plotting the results +explore_noise_scale: range of action noise for exploration +eval_noise_scale: range of action noise for evaluation of action value +mode: 'train' or 'test' +render: if true, visualize the environment +----------------------------------------------- +""" + + +def classic_control(env, default_seed=True): + if default_seed: + seed = 2 + set_seed(seed, env) # reproducible + + alg_params = dict( + replay_buffer_capacity=5e5, + policy_target_update_interval=5, + ) + if alg_params.get('net_list') is None: + num_hidden_layer = 2 # number of hidden layers for the networks + hidden_dim = 64 # dimension of hidden layers for the networks + with tf.name_scope('TD3'): + with tf.name_scope('Q_Net1'): + q_net1 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Q_Net2'): + q_net2 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Target_Q_Net1'): + target_q_net1 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Target_Q_Net2'): + target_q_net2 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Policy'): + policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Target_Policy'): + target_policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + net_list = [q_net1, q_net2, target_q_net1, target_q_net2, policy_net, target_policy_net] + alg_params['net_list'] = net_list + if alg_params.get('optimizers_list') is None: + q_lr, policy_lr = 3e-4, 3e-4 # q_lr: learning rate of the Q network; policy_lr: learning rate of the policy network + q_optimizer1 = tf.optimizers.Adam(q_lr) + q_optimizer2 = tf.optimizers.Adam(q_lr) + policy_optimizer = tf.optimizers.Adam(policy_lr) + optimizers_list = [q_optimizer1, q_optimizer2, policy_optimizer] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict( + max_steps=150, + batch_size=64, + explore_steps=500, + update_itr=3, + reward_scale=1., + explore_noise_scale=1.0, + eval_noise_scale=0.5, + train_episodes=100, + test_episodes=10, + save_interval=10, + ) + + return alg_params, learn_params + + +def box2d(env, default_seed=True): + if default_seed: + seed = 2 + set_seed(seed, env) # reproducible + + alg_params = dict( + replay_buffer_capacity=5e5, + policy_target_update_interval=5, + ) + if alg_params.get('net_list') is None: + num_hidden_layer = 2 # number of hidden layers for the networks + hidden_dim = 64 # dimension of hidden layers for the networks + with tf.name_scope('TD3'): + with tf.name_scope('Q_Net1'): + q_net1 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Q_Net2'): + q_net2 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Target_Q_Net1'): + target_q_net1 = 
QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Target_Q_Net2'): + target_q_net2 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Policy'): + policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Target_Policy'): + target_policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + net_list = [q_net1, q_net2, target_q_net1, target_q_net2, policy_net, target_policy_net] + alg_params['net_list'] = net_list + if alg_params.get('optimizers_list') is None: + q_lr, policy_lr = 3e-4, 3e-4 # q_lr: learning rate of the Q network; policy_lr: learning rate of the policy network + q_optimizer1 = tf.optimizers.Adam(q_lr) + q_optimizer2 = tf.optimizers.Adam(q_lr) + policy_optimizer = tf.optimizers.Adam(policy_lr) + optimizers_list = [q_optimizer1, q_optimizer2, policy_optimizer] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict( + max_steps=150, + batch_size=64, + explore_steps=500, + update_itr=3, + reward_scale=1., + explore_noise_scale=1.0, + eval_noise_scale=0.5, + train_episodes=100, + test_episodes=10, + save_interval=10, + ) + + return alg_params, learn_params + + +def mujoco(env, default_seed=True): + if default_seed: + seed = 2 + set_seed(seed, env) # reproducible + + alg_params = dict( + replay_buffer_capacity=5e5, + policy_target_update_interval=5, + ) + if alg_params.get('net_list') is None: + num_hidden_layer = 2 # number of hidden layers for the networks + hidden_dim = 64 # dimension of hidden layers for the networks + with tf.name_scope('TD3'): + with tf.name_scope('Q_Net1'): + q_net1 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Q_Net2'): + q_net2 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Target_Q_Net1'): + target_q_net1 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Target_Q_Net2'): + target_q_net2 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Policy'): + policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Target_Policy'): + target_policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + net_list = [q_net1, q_net2, target_q_net1, target_q_net2, policy_net, target_policy_net] + alg_params['net_list'] = net_list + if alg_params.get('optimizers_list') is None: + q_lr, policy_lr = 3e-4, 3e-4 # q_lr: learning rate of the Q network; policy_lr: learning rate of the policy network + q_optimizer1 = tf.optimizers.Adam(q_lr) + q_optimizer2 = tf.optimizers.Adam(q_lr) + policy_optimizer = tf.optimizers.Adam(policy_lr) + optimizers_list = [q_optimizer1, q_optimizer2, policy_optimizer] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict( + max_steps=150, + batch_size=64, + explore_steps=500, + update_itr=3, + reward_scale=1., + explore_noise_scale=1.0, + eval_noise_scale=0.5, + train_episodes=100, + test_episodes=10, + save_interval=10, + ) + + 
return alg_params, learn_params + + +def robotics(env, default_seed=True): + if default_seed: + seed = 2 + set_seed(seed, env) # reproducible + + alg_params = dict( + replay_buffer_capacity=5e5, + policy_target_update_interval=5, + ) + if alg_params.get('net_list') is None: + num_hidden_layer = 2 # number of hidden layers for the networks + hidden_dim = 64 # dimension of hidden layers for the networks + with tf.name_scope('TD3'): + with tf.name_scope('Q_Net1'): + q_net1 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Q_Net2'): + q_net2 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Target_Q_Net1'): + target_q_net1 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Target_Q_Net2'): + target_q_net2 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Policy'): + policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Target_Policy'): + target_policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + net_list = [q_net1, q_net2, target_q_net1, target_q_net2, policy_net, target_policy_net] + alg_params['net_list'] = net_list + if alg_params.get('optimizers_list') is None: + q_lr, policy_lr = 3e-4, 3e-4 # q_lr: learning rate of the Q network; policy_lr: learning rate of the policy network + q_optimizer1 = tf.optimizers.Adam(q_lr) + q_optimizer2 = tf.optimizers.Adam(q_lr) + policy_optimizer = tf.optimizers.Adam(policy_lr) + optimizers_list = [q_optimizer1, q_optimizer2, policy_optimizer] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict( + max_steps=150, + batch_size=64, + explore_steps=500, + update_itr=3, + reward_scale=1., + explore_noise_scale=1.0, + eval_noise_scale=0.5, + train_episodes=100, + test_episodes=10, + save_interval=10, + ) + + return alg_params, learn_params + + +def dm_control(env, default_seed=True): + if default_seed: + seed = 2 + set_seed(seed, env) # reproducible + + alg_params = dict( + replay_buffer_capacity=5e5, + policy_target_update_interval=5, + ) + if alg_params.get('net_list') is None: + num_hidden_layer = 2 # number of hidden layers for the networks + hidden_dim = 64 # dimension of hidden layers for the networks + with tf.name_scope('TD3'): + with tf.name_scope('Q_Net1'): + q_net1 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Q_Net2'): + q_net2 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Target_Q_Net1'): + target_q_net1 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Target_Q_Net2'): + target_q_net2 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Policy'): + policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Target_Policy'): + target_policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + 
net_list = [q_net1, q_net2, target_q_net1, target_q_net2, policy_net, target_policy_net] + alg_params['net_list'] = net_list + if alg_params.get('optimizers_list') is None: + q_lr, policy_lr = 3e-4, 3e-4 # q_lr: learning rate of the Q network; policy_lr: learning rate of the policy network + q_optimizer1 = tf.optimizers.Adam(q_lr) + q_optimizer2 = tf.optimizers.Adam(q_lr) + policy_optimizer = tf.optimizers.Adam(policy_lr) + optimizers_list = [q_optimizer1, q_optimizer2, policy_optimizer] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict( + max_steps=150, + batch_size=64, + explore_steps=500, + update_itr=3, + reward_scale=1., + explore_noise_scale=1.0, + eval_noise_scale=0.5, + train_episodes=100, + test_episodes=10, + save_interval=10, + ) + + return alg_params, learn_params + + +def rlbench(env, default_seed=True): + if default_seed: + seed = 2 + set_seed(seed, env) # reproducible + + alg_params = dict( + replay_buffer_capacity=5e5, + policy_target_update_interval=5, + ) + if alg_params.get('net_list') is None: + num_hidden_layer = 2 # number of hidden layers for the networks + hidden_dim = 64 # dimension of hidden layers for the networks + with tf.name_scope('TD3'): + with tf.name_scope('Q_Net1'): + q_net1 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Q_Net2'): + q_net2 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Target_Q_Net1'): + target_q_net1 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Target_Q_Net2'): + target_q_net2 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Policy'): + policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Target_Policy'): + target_policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + net_list = [q_net1, q_net2, target_q_net1, target_q_net2, policy_net, target_policy_net] + alg_params['net_list'] = net_list + if alg_params.get('optimizers_list') is None: + q_lr, policy_lr = 3e-4, 3e-4 # q_lr: learning rate of the Q network; policy_lr: learning rate of the policy network + q_optimizer1 = tf.optimizers.Adam(q_lr) + q_optimizer2 = tf.optimizers.Adam(q_lr) + policy_optimizer = tf.optimizers.Adam(policy_lr) + optimizers_list = [q_optimizer1, q_optimizer2, policy_optimizer] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict( + max_steps=150, + batch_size=64, + explore_steps=500, + update_itr=3, + reward_scale=1., + explore_noise_scale=1.0, + eval_noise_scale=0.5, + train_episodes=100, + test_episodes=10, + save_interval=10, + ) + + return alg_params, learn_params diff --git a/rlzoo/algorithms/td3/run_td3.py b/rlzoo/algorithms/td3/run_td3.py old mode 100644 new mode 100755 index 3dbd84c..c38bf8f --- a/rlzoo/algorithms/td3/run_td3.py +++ b/rlzoo/algorithms/td3/run_td3.py @@ -1,83 +1,83 @@ -from rlzoo.algorithms.td3.td3 import TD3 -from rlzoo.common.policy_networks import * -from rlzoo.common.value_networks import * -import gym - -""" load environment """ -env = gym.make('Pendulum-v0').unwrapped -# env = DummyVecEnv([lambda: env]) # The algorithms require a vectorized/wrapped environment to run -action_shape = env.action_space.shape 
-state_shape = env.observation_space.shape -# reproducible -seed = 2 -np.random.seed(seed) -tf.random.set_seed(seed) -env.seed(seed) - -""" build networks for the algorithm """ -num_hidden_layer = 2 # number of hidden layers for the networks -hidden_dim = 64 # dimension of hidden layers for the networks -with tf.name_scope('TD3'): - with tf.name_scope('Q_Net1'): - q_net1 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Q_Net2'): - q_net2 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Target_Q_Net1'): - target_q_net1 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Target_Q_Net2'): - target_q_net2 = QNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Policy'): - policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) - with tf.name_scope('Target_Policy'): - target_policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, - hidden_dim_list=num_hidden_layer * [hidden_dim]) -net_list = [q_net1, q_net2, target_q_net1, target_q_net2, policy_net, target_policy_net] - -""" choose optimizers """ -q_lr, policy_lr = 3e-4, 3e-4 # q_lr: learning rate of the Q network; policy_lr: learning rate of the policy network -q_optimizer1 = tf.optimizers.Adam(q_lr) -q_optimizer2 = tf.optimizers.Adam(q_lr) -policy_optimizer = tf.optimizers.Adam(policy_lr) -optimizers_list = [q_optimizer1, q_optimizer2, policy_optimizer] - -model = TD3(net_list, optimizers_list) -""" -full list of arguments for the algorithm ----------------------------------------- -net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization -optimizers_list: a list of optimizers for all networks and differentiable variables -state_dim: dimension of state for the environment -action_dim: dimension of action for the environment -replay_buffer_capacity: the size of buffer for storing explored samples -policy_target_update_interval: delayed interval for updating the target policy -action_range: value of each action in [-action_range, action_range] -""" - -model.learn(env, train_episodes=100, max_steps=150, batch_size=64, explore_steps=500, update_itr=3, - reward_scale=1., save_interval=10, explore_noise_scale=1.0, eval_noise_scale=0.5, mode='train', - render=False) -""" -full list of parameters for training ---------------------------------------- -env: learning environment -train_episodes: total number of episodes for training -test_episodes: total number of episodes for testing -max_steps: maximum number of steps for one episode -batch_size: udpate batchsize -explore_steps: for random action sampling in the beginning of training -update_itr: repeated updates for single step -reward_scale: value range of reward -save_interval: timesteps for saving the weights and plotting the results -explore_noise_scale: range of action noise for exploration -eval_noise_scale: range of action noise for evaluation of action value -mode: 'train' or 'test' -render: if true, visualize the environment - -""" -# test -model.learn(env, test_episodes=10, max_steps=150, mode='test', render=True) +from rlzoo.algorithms.td3.td3 import TD3 +from rlzoo.common.policy_networks import * +from rlzoo.common.value_networks import * +import 
gym + +""" load environment """ +env = gym.make('Pendulum-v0').unwrapped +# env = DummyVecEnv([lambda: env]) # The algorithms require a vectorized/wrapped environment to run +action_shape = env.action_space.shape +state_shape = env.observation_space.shape +# reproducible +seed = 2 +np.random.seed(seed) +tf.random.set_seed(seed) +env.seed(seed) + +""" build networks for the algorithm """ +num_hidden_layer = 2 # number of hidden layers for the networks +hidden_dim = 64 # dimension of hidden layers for the networks +with tf.name_scope('TD3'): + with tf.name_scope('Q_Net1'): + q_net1 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Q_Net2'): + q_net2 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Target_Q_Net1'): + target_q_net1 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Target_Q_Net2'): + target_q_net2 = QNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Policy'): + policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) + with tf.name_scope('Target_Policy'): + target_policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, + hidden_dim_list=num_hidden_layer * [hidden_dim]) +net_list = [q_net1, q_net2, target_q_net1, target_q_net2, policy_net, target_policy_net] + +""" choose optimizers """ +q_lr, policy_lr = 3e-4, 3e-4 # q_lr: learning rate of the Q network; policy_lr: learning rate of the policy network +q_optimizer1 = tf.optimizers.Adam(q_lr) +q_optimizer2 = tf.optimizers.Adam(q_lr) +policy_optimizer = tf.optimizers.Adam(policy_lr) +optimizers_list = [q_optimizer1, q_optimizer2, policy_optimizer] + +model = TD3(net_list, optimizers_list) +""" +full list of arguments for the algorithm +---------------------------------------- +net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization +optimizers_list: a list of optimizers for all networks and differentiable variables +state_dim: dimension of state for the environment +action_dim: dimension of action for the environment +replay_buffer_capacity: the size of buffer for storing explored samples +policy_target_update_interval: delayed interval for updating the target policy +action_range: value of each action in [-action_range, action_range] +""" + +model.learn(env, train_episodes=100, max_steps=150, batch_size=64, explore_steps=500, update_itr=3, + reward_scale=1., save_interval=10, explore_noise_scale=1.0, eval_noise_scale=0.5, mode='train', + render=False) +""" +full list of parameters for training +--------------------------------------- +env: learning environment +train_episodes: total number of episodes for training +test_episodes: total number of episodes for testing +max_steps: maximum number of steps for one episode +batch_size: udpate batchsize +explore_steps: for random action sampling in the beginning of training +update_itr: repeated updates for single step +reward_scale: value range of reward +save_interval: timesteps for saving the weights and plotting the results +explore_noise_scale: range of action noise for exploration +eval_noise_scale: range of action noise for evaluation of action value +mode: 'train' or 'test' +render: if true, visualize the environment + +""" +# test 
+model.learn(env, test_episodes=10, max_steps=150, mode='test', render=True) diff --git a/rlzoo/algorithms/td3/td3.py b/rlzoo/algorithms/td3/td3.py old mode 100644 new mode 100755 index 7c3deb9..637a6ac --- a/rlzoo/algorithms/td3/td3.py +++ b/rlzoo/algorithms/td3/td3.py @@ -1,314 +1,314 @@ -""" -Twin Delayed DDPG (TD3) ------------------------- -DDPG suffers from problems like overestimate of Q-values and sensitivity to hyper-parameters. -Twin Delayed DDPG (TD3) is a variant of DDPG with several tricks: -* Trick One: Clipped Double-Q Learning. TD3 learns two Q-functions instead of one (hence “twin”), -and uses the smaller of the two Q-values to form the targets in the Bellman error loss functions. - -* Trick Two: “Delayed” Policy Updates. TD3 updates the policy (and target networks) less frequently -than the Q-function. - -* Trick Three: Target Policy Smoothing. TD3 adds noise to the target action, to make it harder for -the policy to exploit Q-function errors by smoothing out Q along changes in action. - -The implementation of TD3 includes 6 networks: 2 Q-net, 2 target Q-net, 1 policy net, 1 target policy net -Actor policy in TD3 is deterministic, with Gaussian exploration noise. - -Reference ---------- -original paper: https://arxiv.org/pdf/1802.09477.pdf - - -Environment ---- -Openai Gym Pendulum-v0, continuous action space -https://gym.openai.com/envs/Pendulum-v0/ - -Prerequisites ---- -tensorflow >=2.0.0a0 -tensorflow-probability 0.6.0 -tensorlayer >=2.0.0 - -&& -pip install box2d box2d-kengz --user - - -""" -import time - -import tensorflow_probability as tfp -import tensorlayer as tl -from rlzoo.common.utils import * -from rlzoo.common.buffer import * -from rlzoo.common.policy_networks import * -from rlzoo.common.value_networks import * - -tfd = tfp.distributions -Normal = tfd.Normal - -tl.logging.set_verbosity(tl.logging.DEBUG) - - -############################### TD3 #################################### - - -class TD3(): - """ twin-delayed ddpg """ - - def __init__(self, net_list, optimizers_list, replay_buffer_capacity=5e5, policy_target_update_interval=5): - self.name = 'TD3' - self.replay_buffer = ReplayBuffer(replay_buffer_capacity) - - # get all networks - [self.q_net1, self.q_net2, self.target_q_net1, self.target_q_net2, self.policy_net, - self.target_policy_net] = net_list - - assert isinstance(self.q_net1, QNetwork) - assert isinstance(self.q_net2, QNetwork) - assert isinstance(self.target_q_net1, QNetwork) - assert isinstance(self.target_q_net2, QNetwork) - assert isinstance(self.policy_net, DeterministicPolicyNetwork) - assert isinstance(self.target_policy_net, DeterministicPolicyNetwork) - assert isinstance(self.policy_net.action_space, gym.spaces.Box) - - print('Q Network (1,2): ', self.q_net1) - print('Policy Network: ', self.policy_net) - - # initialize weights of target networks - self.target_q_net1 = self.target_ini(self.q_net1, self.target_q_net1) - self.target_q_net2 = self.target_ini(self.q_net2, self.target_q_net2) - self.target_policy_net = self.target_ini(self.policy_net, self.target_policy_net) - - self.update_cnt = 0 - self.policy_target_update_interval = policy_target_update_interval - - [self.q_optimizer1, self.q_optimizer2, self.policy_optimizer] = optimizers_list - - def evaluate(self, state, eval_noise_scale, target=False): - """ - generate action with state for calculating gradients; - - :param eval_noise_scale: as the trick of target policy smoothing, for generating noisy actions. 
- """ - if target: - action = self.target_policy_net(state) - else: - action = self.policy_net(state) - # add noise - normal = Normal(0, 1) - eval_noise_clip = 2 * eval_noise_scale - noise = normal.sample(action.shape) * eval_noise_scale - noise = tf.clip_by_value(noise, -eval_noise_clip, eval_noise_clip) - action = action + noise - - return action - - def get_action(self, state, explore_noise_scale): - """ generate action with state for interaction with envronment """ - action = self.policy_net(np.array([state])) - action = action.numpy()[0] - - # add noise - normal = Normal(0, 1) - noise = normal.sample(action.shape) * explore_noise_scale - action = action + noise - - return action.numpy() - - def get_action_greedy(self, state): - """ generate action with state for interaction with envronment """ - return self.policy_net(np.array([state])).numpy()[0] - - def sample_action(self): - """ generate random actions for exploration """ - return self.policy_net.random_sample() - - def target_ini(self, net, target_net): - """ hard-copy update for initializing target networks """ - for target_param, param in zip(target_net.trainable_weights, net.trainable_weights): - target_param.assign(param) - return target_net - - def target_soft_update(self, net, target_net, soft_tau): - """ soft update the target net with Polyak averaging """ - for target_param, param in zip(target_net.trainable_weights, net.trainable_weights): - target_param.assign( # copy weight value into target parameters - target_param * (1.0 - soft_tau) + param * soft_tau - ) - return target_net - - def update(self, batch_size, eval_noise_scale, reward_scale=1., gamma=0.9, soft_tau=1e-2): - """ update all networks in TD3 """ - self.update_cnt += 1 - state, action, reward, next_state, done = self.replay_buffer.sample(batch_size) - - reward = reward[:, np.newaxis] # expand dim - done = done[:, np.newaxis] - - new_next_action = self.evaluate( - next_state, eval_noise_scale=eval_noise_scale, target=True - ) # clipped normal noise - reward = reward_scale * (reward - - np.mean(reward, axis=0)) / (np.std(reward, - axis=0) + 1e-6) # normalize with batch mean and std; plus a small number to prevent numerical problem - - # Training Q Function - target_q_min = tf.minimum(self.target_q_net1([next_state, new_next_action]), - self.target_q_net2([next_state, new_next_action])) - - target_q_value = reward + (1 - done) * gamma * target_q_min # if done==1, only reward - - with tf.GradientTape() as q1_tape: - predicted_q_value1 = self.q_net1([state, action]) - q_value_loss1 = tf.reduce_mean(tf.square(predicted_q_value1 - target_q_value)) - q1_grad = q1_tape.gradient(q_value_loss1, self.q_net1.trainable_weights) - self.q_optimizer1.apply_gradients(zip(q1_grad, self.q_net1.trainable_weights)) - - with tf.GradientTape() as q2_tape: - predicted_q_value2 = self.q_net2([state, action]) - q_value_loss2 = tf.reduce_mean(tf.square(predicted_q_value2 - target_q_value)) - q2_grad = q2_tape.gradient(q_value_loss2, self.q_net2.trainable_weights) - self.q_optimizer2.apply_gradients(zip(q2_grad, self.q_net2.trainable_weights)) - - # Training Policy Function - if self.update_cnt % self.policy_target_update_interval == 0: - with tf.GradientTape() as p_tape: - new_action = self.evaluate( - state, eval_noise_scale=0.0, target=False - ) # no noise, deterministic policy gradients - # """ implementation 1 """ - # predicted_new_q_value = tf.minimum(self.q_net1([state, new_action]),self.q_net2([state, new_action])) - """ implementation 2 """ - predicted_new_q_value = 
self.q_net1([state, new_action]) - policy_loss = -tf.reduce_mean(predicted_new_q_value) - p_grad = p_tape.gradient(policy_loss, self.policy_net.trainable_weights) - self.policy_optimizer.apply_gradients(zip(p_grad, self.policy_net.trainable_weights)) - - # Soft update the target nets - self.target_q_net1 = self.target_soft_update(self.q_net1, self.target_q_net1, soft_tau) - self.target_q_net2 = self.target_soft_update(self.q_net2, self.target_q_net2, soft_tau) - self.target_policy_net = self.target_soft_update(self.policy_net, self.target_policy_net, soft_tau) - - def save_ckpt(self, env_name): # save trained weights - save_model(self.q_net1, 'model_q_net1', self.name, env_name) - save_model(self.q_net2, 'model_q_net2', self.name, env_name) - save_model(self.target_q_net1, 'model_target_q_net1', self.name, env_name) - save_model(self.target_q_net2, 'model_target_q_net2', self.name, env_name) - save_model(self.policy_net, 'model_policy_net', self.name, env_name) - save_model(self.target_policy_net, 'model_target_policy_net', self.name, env_name) - - def load_ckpt(self, env_name): # load trained weights - load_model(self.q_net1, 'model_q_net1', self.name, env_name) - load_model(self.q_net2, 'model_q_net2', self.name, env_name) - load_model(self.target_q_net1, 'model_target_q_net1', self.name, env_name) - load_model(self.target_q_net2, 'model_target_q_net2', self.name, env_name) - load_model(self.policy_net, 'model_policy_net', self.name, env_name) - load_model(self.target_policy_net, 'model_target_policy_net', self.name, env_name) - - def learn(self, env, train_episodes=1000, test_episodes=1000, max_steps=150, batch_size=64, explore_steps=500, - update_itr=3, - reward_scale=1., save_interval=10, explore_noise_scale=1.0, eval_noise_scale=0.5, mode='train', - render=False, plot_func=None): - """ - :param env: learning environment - :param train_episodes: total number of episodes for training - :param test_episodes: total number of episodes for testing - :param max_steps: maximum number of steps for one episode - :param batch_size: udpate batchsize - :param explore_steps: for random action sampling in the beginning of training - :param update_itr: repeated updates for single step - :param reward_scale: value range of reward - :param save_interval: timesteps for saving the weights and plotting the results - :param explore_noise_scale: range of action noise for exploration - :param eval_noise_scale: range of action noise for evaluation of action value - :param mode: 'train' or 'test' - :param render: if true, visualize the environment - :param plot_func: additional function for interactive module - """ - - # training loop - if mode == 'train': - print('Training... 
| Algorithm: {} | Environment: {}'.format(self.name, env.spec.id)) - frame_idx = 0 - rewards = [] - t0 = time.time() - for eps in range(train_episodes): - state = env.reset() - episode_reward = 0 - - for step in range(max_steps): - if frame_idx > explore_steps: - action = self.get_action(state, explore_noise_scale=explore_noise_scale) - else: - action = self.sample_action() - - next_state, reward, done, _ = env.step(action) - if render: env.render() - done = 1 if done == True else 0 - - self.replay_buffer.push(state, action, reward, next_state, done) - - state = next_state - episode_reward += reward - frame_idx += 1 - - if len(self.replay_buffer) > batch_size: - for i in range(update_itr): - self.update(batch_size, eval_noise_scale=eval_noise_scale, reward_scale=reward_scale) - - if done: - break - - if eps % int(save_interval) == 0: - plot_save_log(rewards, algorithm_name=self.name, env_name=env.spec.id) - self.save_ckpt(env_name=env.spec.id) - - print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}' \ - .format(eps, train_episodes, episode_reward, time.time() - t0)) - rewards.append(episode_reward) - if plot_func is not None: - plot_func(rewards) - plot_save_log(rewards, algorithm_name=self.name, env_name=env.spec.id) - self.save_ckpt(env_name=env.spec.id) - - elif mode == 'test': - frame_idx = 0 - rewards = [] - t0 = time.time() - - self.load_ckpt(env_name=env.spec.id) - print('Testing... | Algorithm: {} | Environment: {}'.format(self.name, env.spec.id)) - # set test mode - self.q_net1.eval() - self.q_net2.eval() - self.target_q_net1.eval() - self.target_q_net2.eval() - self.policy_net.eval() - self.target_policy_net.eval() - - for eps in range(test_episodes): - state = env.reset() - episode_reward = 0 - - for step in range(max_steps): - action = self.get_action_greedy(state) - next_state, reward, done, _ = env.step(action) - if render: env.render() - done = 1 if done == True else 0 - - state = next_state - episode_reward += reward - frame_idx += 1 - - if done: - break - print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}' \ - .format(eps, test_episodes, episode_reward, time.time() - t0)) - rewards.append(episode_reward) - if plot_func is not None: - plot_func(rewards) - - else: - print('unknow mode type, activate test mode as default') +""" +Twin Delayed DDPG (TD3) +------------------------ +DDPG suffers from problems like overestimate of Q-values and sensitivity to hyper-parameters. +Twin Delayed DDPG (TD3) is a variant of DDPG with several tricks: +* Trick One: Clipped Double-Q Learning. TD3 learns two Q-functions instead of one (hence “twin”), +and uses the smaller of the two Q-values to form the targets in the Bellman error loss functions. + +* Trick Two: “Delayed” Policy Updates. TD3 updates the policy (and target networks) less frequently +than the Q-function. + +* Trick Three: Target Policy Smoothing. TD3 adds noise to the target action, to make it harder for +the policy to exploit Q-function errors by smoothing out Q along changes in action. + +The implementation of TD3 includes 6 networks: 2 Q-net, 2 target Q-net, 1 policy net, 1 target policy net +Actor policy in TD3 is deterministic, with Gaussian exploration noise. 
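As a quick illustration of tricks one and three described above, here is a minimal numpy sketch of the target computation; the helper names and toy numbers are ours, while the real logic lives in `evaluate()` and `update()` below.

```python
import numpy as np

def smoothed_target_action(target_action, eval_noise_scale=0.5):
    """Trick three: add clipped Gaussian noise to the target action."""
    noise = np.random.normal(0.0, 1.0, size=np.shape(target_action)) * eval_noise_scale
    noise = np.clip(noise, -2 * eval_noise_scale, 2 * eval_noise_scale)
    return target_action + noise

def clipped_double_q_target(reward, done, target_q1, target_q2, gamma=0.9):
    """Trick one: bootstrap from the smaller of the two target Q estimates."""
    return reward + (1.0 - done) * gamma * np.minimum(target_q1, target_q2)

print(clipped_double_q_target(reward=1.0, done=0.0, target_q1=10.0, target_q2=8.0))  # 8.2
```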
+ +Reference +--------- +original paper: https://arxiv.org/pdf/1802.09477.pdf + + +Environment +--- +Openai Gym Pendulum-v0, continuous action space +https://gym.openai.com/envs/Pendulum-v0/ + +Prerequisites +--- +tensorflow >=2.0.0a0 +tensorflow-probability 0.6.0 +tensorlayer >=2.0.0 + +&& +pip install box2d box2d-kengz --user + + +""" +import time + +import tensorflow_probability as tfp +import tensorlayer as tl +from rlzoo.common.utils import * +from rlzoo.common.buffer import * +from rlzoo.common.policy_networks import * +from rlzoo.common.value_networks import * + +tfd = tfp.distributions +Normal = tfd.Normal + +tl.logging.set_verbosity(tl.logging.DEBUG) + + +############################### TD3 #################################### + + +class TD3(): + """ twin-delayed ddpg """ + + def __init__(self, net_list, optimizers_list, replay_buffer_capacity=5e5, policy_target_update_interval=5): + self.name = 'TD3' + self.replay_buffer = ReplayBuffer(replay_buffer_capacity) + + # get all networks + [self.q_net1, self.q_net2, self.target_q_net1, self.target_q_net2, self.policy_net, + self.target_policy_net] = net_list + + assert isinstance(self.q_net1, QNetwork) + assert isinstance(self.q_net2, QNetwork) + assert isinstance(self.target_q_net1, QNetwork) + assert isinstance(self.target_q_net2, QNetwork) + assert isinstance(self.policy_net, DeterministicPolicyNetwork) + assert isinstance(self.target_policy_net, DeterministicPolicyNetwork) + assert isinstance(self.policy_net.action_space, gym.spaces.Box) + + print('Q Network (1,2): ', self.q_net1) + print('Policy Network: ', self.policy_net) + + # initialize weights of target networks + self.target_q_net1 = self.target_ini(self.q_net1, self.target_q_net1) + self.target_q_net2 = self.target_ini(self.q_net2, self.target_q_net2) + self.target_policy_net = self.target_ini(self.policy_net, self.target_policy_net) + + self.update_cnt = 0 + self.policy_target_update_interval = policy_target_update_interval + + [self.q_optimizer1, self.q_optimizer2, self.policy_optimizer] = optimizers_list + + def evaluate(self, state, eval_noise_scale, target=False): + """ + generate action with state for calculating gradients; + + :param eval_noise_scale: as the trick of target policy smoothing, for generating noisy actions. 
+ """ + if target: + action = self.target_policy_net(state) + else: + action = self.policy_net(state) + # add noise + normal = Normal(0, 1) + eval_noise_clip = 2 * eval_noise_scale + noise = normal.sample(action.shape) * eval_noise_scale + noise = tf.clip_by_value(noise, -eval_noise_clip, eval_noise_clip) + action = action + noise + + return action + + def get_action(self, state, explore_noise_scale): + """ generate action with state for interaction with envronment """ + action = self.policy_net(np.array([state])) + action = action.numpy()[0] + + # add noise + normal = Normal(0, 1) + noise = normal.sample(action.shape) * explore_noise_scale + action = action + noise + + return action.numpy() + + def get_action_greedy(self, state): + """ generate action with state for interaction with envronment """ + return self.policy_net(np.array([state])).numpy()[0] + + def sample_action(self): + """ generate random actions for exploration """ + return self.policy_net.random_sample() + + def target_ini(self, net, target_net): + """ hard-copy update for initializing target networks """ + for target_param, param in zip(target_net.trainable_weights, net.trainable_weights): + target_param.assign(param) + return target_net + + def target_soft_update(self, net, target_net, soft_tau): + """ soft update the target net with Polyak averaging """ + for target_param, param in zip(target_net.trainable_weights, net.trainable_weights): + target_param.assign( # copy weight value into target parameters + target_param * (1.0 - soft_tau) + param * soft_tau + ) + return target_net + + def update(self, batch_size, eval_noise_scale, reward_scale=1., gamma=0.9, soft_tau=1e-2): + """ update all networks in TD3 """ + self.update_cnt += 1 + state, action, reward, next_state, done = self.replay_buffer.sample(batch_size) + + reward = reward[:, np.newaxis] # expand dim + done = done[:, np.newaxis] + + new_next_action = self.evaluate( + next_state, eval_noise_scale=eval_noise_scale, target=True + ) # clipped normal noise + reward = reward_scale * (reward - + np.mean(reward, axis=0)) / (np.std(reward, + axis=0) + 1e-6) # normalize with batch mean and std; plus a small number to prevent numerical problem + + # Training Q Function + target_q_min = tf.minimum(self.target_q_net1([next_state, new_next_action]), + self.target_q_net2([next_state, new_next_action])) + + target_q_value = reward + (1 - done) * gamma * target_q_min # if done==1, only reward + + with tf.GradientTape() as q1_tape: + predicted_q_value1 = self.q_net1([state, action]) + q_value_loss1 = tf.reduce_mean(tf.square(predicted_q_value1 - target_q_value)) + q1_grad = q1_tape.gradient(q_value_loss1, self.q_net1.trainable_weights) + self.q_optimizer1.apply_gradients(zip(q1_grad, self.q_net1.trainable_weights)) + + with tf.GradientTape() as q2_tape: + predicted_q_value2 = self.q_net2([state, action]) + q_value_loss2 = tf.reduce_mean(tf.square(predicted_q_value2 - target_q_value)) + q2_grad = q2_tape.gradient(q_value_loss2, self.q_net2.trainable_weights) + self.q_optimizer2.apply_gradients(zip(q2_grad, self.q_net2.trainable_weights)) + + # Training Policy Function + if self.update_cnt % self.policy_target_update_interval == 0: + with tf.GradientTape() as p_tape: + new_action = self.evaluate( + state, eval_noise_scale=0.0, target=False + ) # no noise, deterministic policy gradients + # """ implementation 1 """ + # predicted_new_q_value = tf.minimum(self.q_net1([state, new_action]),self.q_net2([state, new_action])) + """ implementation 2 """ + predicted_new_q_value = 
self.q_net1([state, new_action]) + policy_loss = -tf.reduce_mean(predicted_new_q_value) + p_grad = p_tape.gradient(policy_loss, self.policy_net.trainable_weights) + self.policy_optimizer.apply_gradients(zip(p_grad, self.policy_net.trainable_weights)) + + # Soft update the target nets + self.target_q_net1 = self.target_soft_update(self.q_net1, self.target_q_net1, soft_tau) + self.target_q_net2 = self.target_soft_update(self.q_net2, self.target_q_net2, soft_tau) + self.target_policy_net = self.target_soft_update(self.policy_net, self.target_policy_net, soft_tau) + + def save_ckpt(self, env_name): # save trained weights + save_model(self.q_net1, 'model_q_net1', self.name, env_name) + save_model(self.q_net2, 'model_q_net2', self.name, env_name) + save_model(self.target_q_net1, 'model_target_q_net1', self.name, env_name) + save_model(self.target_q_net2, 'model_target_q_net2', self.name, env_name) + save_model(self.policy_net, 'model_policy_net', self.name, env_name) + save_model(self.target_policy_net, 'model_target_policy_net', self.name, env_name) + + def load_ckpt(self, env_name): # load trained weights + load_model(self.q_net1, 'model_q_net1', self.name, env_name) + load_model(self.q_net2, 'model_q_net2', self.name, env_name) + load_model(self.target_q_net1, 'model_target_q_net1', self.name, env_name) + load_model(self.target_q_net2, 'model_target_q_net2', self.name, env_name) + load_model(self.policy_net, 'model_policy_net', self.name, env_name) + load_model(self.target_policy_net, 'model_target_policy_net', self.name, env_name) + + def learn(self, env, train_episodes=1000, test_episodes=1000, max_steps=150, batch_size=64, explore_steps=500, + update_itr=3, + reward_scale=1., save_interval=10, explore_noise_scale=1.0, eval_noise_scale=0.5, mode='train', + render=False, plot_func=None): + """ + :param env: learning environment + :param train_episodes: total number of episodes for training + :param test_episodes: total number of episodes for testing + :param max_steps: maximum number of steps for one episode + :param batch_size: udpate batchsize + :param explore_steps: for random action sampling in the beginning of training + :param update_itr: repeated updates for single step + :param reward_scale: value range of reward + :param save_interval: timesteps for saving the weights and plotting the results + :param explore_noise_scale: range of action noise for exploration + :param eval_noise_scale: range of action noise for evaluation of action value + :param mode: 'train' or 'test' + :param render: if true, visualize the environment + :param plot_func: additional function for interactive module + """ + + # training loop + if mode == 'train': + print('Training... 
| Algorithm: {} | Environment: {}'.format(self.name, env.spec.id)) + frame_idx = 0 + rewards = [] + t0 = time.time() + for eps in range(train_episodes): + state = env.reset() + episode_reward = 0 + + for step in range(max_steps): + if frame_idx > explore_steps: + action = self.get_action(state, explore_noise_scale=explore_noise_scale) + else: + action = self.sample_action() + + next_state, reward, done, _ = env.step(action) + if render: env.render() + done = 1 if done == True else 0 + + self.replay_buffer.push(state, action, reward, next_state, done) + + state = next_state + episode_reward += reward + frame_idx += 1 + + if len(self.replay_buffer) > batch_size: + for i in range(update_itr): + self.update(batch_size, eval_noise_scale=eval_noise_scale, reward_scale=reward_scale) + + if done: + break + + if eps % int(save_interval) == 0: + plot_save_log(rewards, algorithm_name=self.name, env_name=env.spec.id) + self.save_ckpt(env_name=env.spec.id) + + print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}' \ + .format(eps, train_episodes, episode_reward, time.time() - t0)) + rewards.append(episode_reward) + if plot_func is not None: + plot_func(rewards) + plot_save_log(rewards, algorithm_name=self.name, env_name=env.spec.id) + self.save_ckpt(env_name=env.spec.id) + + elif mode == 'test': + frame_idx = 0 + rewards = [] + t0 = time.time() + + self.load_ckpt(env_name=env.spec.id) + print('Testing... | Algorithm: {} | Environment: {}'.format(self.name, env.spec.id)) + # set test mode + self.q_net1.eval() + self.q_net2.eval() + self.target_q_net1.eval() + self.target_q_net2.eval() + self.policy_net.eval() + self.target_policy_net.eval() + + for eps in range(test_episodes): + state = env.reset() + episode_reward = 0 + + for step in range(max_steps): + action = self.get_action_greedy(state) + next_state, reward, done, _ = env.step(action) + if render: env.render() + done = 1 if done == True else 0 + + state = next_state + episode_reward += reward + frame_idx += 1 + + if done: + break + print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}' \ + .format(eps, test_episodes, episode_reward, time.time() - t0)) + rewards.append(episode_reward) + if plot_func is not None: + plot_func(rewards) + + else: + print('unknow mode type, activate test mode as default') diff --git a/rlzoo/algorithms/trpo/__init__.py b/rlzoo/algorithms/trpo/__init__.py old mode 100644 new mode 100755 diff --git a/rlzoo/algorithms/trpo/default.py b/rlzoo/algorithms/trpo/default.py old mode 100644 new mode 100755 index 6f71615..a1365bb --- a/rlzoo/algorithms/trpo/default.py +++ b/rlzoo/algorithms/trpo/default.py @@ -1,330 +1,330 @@ -from rlzoo.common.policy_networks import * -from rlzoo.common.value_networks import * -from rlzoo.common.utils import set_seed - -""" -full list of algorithm parameters (alg_params) ------------------------------------------------ -net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization -optimizers_list: a list of optimizers for all networks and differentiable variables -damping_coeff: Artifact for numerical stability -cg_iters: Number of iterations of conjugate gradient to perform -delta: KL-divergence limit for TRPO update. 
------------------------------------------------ - -full list of learning parameters (learn_params) ------------------------------------------------ -train_episodes: total number of episodes for training -test_episodes: total number of episodes for testing -max_steps: maximum number of steps for one episode -save_interval: time steps for saving -gamma: reward discount factor -mode: train or test -render: render each step -batch_size: update batch size -backtrack_iters: Maximum number of steps allowed in the backtracking line search -backtrack_coeff: How far back to step during backtracking line search -train_critic_iters: critic update iteration steps ------------------------------------------------ -""" - - -def atari(env, default_seed=True): - if default_seed: - # reproducible - seed = 2 - set_seed(seed, env) - - alg_params = dict( - damping_coeff=0.1, - cg_iters=10, - delta=0.01 - ) - - if alg_params.get('net_list') is None: - num_hidden_layer = 2 # number of hidden layers for the networks - hidden_dim = 64 # dimension of hidden layers for the networks - with tf.name_scope('TRPO'): - with tf.name_scope('V_Net'): - v_net = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer) - with tf.name_scope('Policy'): - policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, - [hidden_dim] * num_hidden_layer, output_activation=tf.nn.tanh) - - net_list = [v_net, policy_net] - alg_params['net_list'] = net_list - - if alg_params.get('optimizers_list') is None: - critic_lr = 1e-3 - optimizers_list = [tf.optimizers.Adam(critic_lr)] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict(train_episodes=2000, - test_episodes=100, - max_steps=200, - save_interval=100, - gamma=0.9, - batch_size=256, - backtrack_iters=10, - backtrack_coeff=0.8, - train_critic_iters=80) - - return alg_params, learn_params - - -def classic_control(env, default_seed=True): - if default_seed: - # reproducible - seed = 2 - set_seed(seed, env) - - alg_params = dict( - damping_coeff=0.1, - cg_iters=10, - delta=0.01 - ) - - if alg_params.get('net_list') is None: - num_hidden_layer = 2 # number of hidden layers for the networks - hidden_dim = 64 # dimension of hidden layers for the networks - with tf.name_scope('TRPO'): - with tf.name_scope('V_Net'): - v_net = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer) - with tf.name_scope('Policy'): - policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, - [hidden_dim] * num_hidden_layer, output_activation=tf.nn.tanh) - - net_list = [v_net, policy_net] - alg_params['net_list'] = net_list - - if alg_params.get('optimizers_list') is None: - critic_lr = 1e-3 - optimizers_list = [tf.optimizers.Adam(critic_lr)] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict(train_episodes=2000, - test_episodes=100, - max_steps=200, - save_interval=100, - gamma=0.9, - batch_size=256, - backtrack_iters=10, - backtrack_coeff=0.8, - train_critic_iters=80) - - return alg_params, learn_params - - -def box2d(env, default_seed=True): - if default_seed: - # reproducible - seed = 2 - set_seed(seed, env) - - alg_params = dict( - damping_coeff=0.1, - cg_iters=10, - delta=0.01 - ) - - if alg_params.get('net_list') is None: - num_hidden_layer = 2 # number of hidden layers for the networks - hidden_dim = 64 # dimension of hidden layers for the networks - with tf.name_scope('TRPO'): - with tf.name_scope('V_Net'): - v_net = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer) - with 
tf.name_scope('Policy'): - policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, - [hidden_dim] * num_hidden_layer, output_activation=tf.nn.tanh) - - net_list = [v_net, policy_net] - alg_params['net_list'] = net_list - - if alg_params.get('optimizers_list') is None: - critic_lr = 1e-3 - optimizers_list = [tf.optimizers.Adam(critic_lr)] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict(train_episodes=2000, - test_episodes=100, - max_steps=200, - save_interval=100, - gamma=0.9, - batch_size=256, - backtrack_iters=10, - backtrack_coeff=0.8, - train_critic_iters=80) - - return alg_params, learn_params - - -def mujoco(env, default_seed=True): - if default_seed: - # reproducible - seed = 2 - set_seed(seed, env) - - alg_params = dict( - damping_coeff=0.1, - cg_iters=10, - delta=0.01 - ) - - if alg_params.get('net_list') is None: - num_hidden_layer = 2 # number of hidden layers for the networks - hidden_dim = 64 # dimension of hidden layers for the networks - with tf.name_scope('TRPO'): - with tf.name_scope('V_Net'): - v_net = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer) - with tf.name_scope('Policy'): - policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, - [hidden_dim] * num_hidden_layer, output_activation=tf.nn.tanh) - - net_list = [v_net, policy_net] - alg_params['net_list'] = net_list - - if alg_params.get('optimizers_list') is None: - critic_lr = 1e-3 - optimizers_list = [tf.optimizers.Adam(critic_lr)] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict(train_episodes=2000, - test_episodes=100, - max_steps=200, - save_interval=100, - gamma=0.9, - batch_size=256, - backtrack_iters=10, - backtrack_coeff=0.8, - train_critic_iters=80) - - return alg_params, learn_params - - -def robotics(env, default_seed=True): - if default_seed: - # reproducible - seed = 2 - set_seed(seed, env) - - alg_params = dict( - damping_coeff=0.1, - cg_iters=10, - delta=0.01 - ) - - if alg_params.get('net_list') is None: - num_hidden_layer = 2 # number of hidden layers for the networks - hidden_dim = 64 # dimension of hidden layers for the networks - with tf.name_scope('TRPO'): - with tf.name_scope('V_Net'): - v_net = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer) - with tf.name_scope('Policy'): - policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, - [hidden_dim] * num_hidden_layer, output_activation=tf.nn.tanh) - - net_list = [v_net, policy_net] - alg_params['net_list'] = net_list - - if alg_params.get('optimizers_list') is None: - critic_lr = 1e-3 - optimizers_list = [tf.optimizers.Adam(critic_lr)] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict(train_episodes=2000, - test_episodes=100, - max_steps=200, - save_interval=100, - gamma=0.9, - batch_size=256, - backtrack_iters=10, - backtrack_coeff=0.8, - train_critic_iters=80) - - return alg_params, learn_params - - -def dm_control(env, default_seed=True): - if default_seed: - # reproducible - seed = 2 - set_seed(seed, env) - - alg_params = dict( - damping_coeff=0.1, - cg_iters=10, - delta=0.01 - ) - - if alg_params.get('net_list') is None: - num_hidden_layer = 2 # number of hidden layers for the networks - hidden_dim = 64 # dimension of hidden layers for the networks - with tf.name_scope('TRPO'): - with tf.name_scope('V_Net'): - v_net = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer) - with tf.name_scope('Policy'): - policy_net = 
StochasticPolicyNetwork(env.observation_space, env.action_space, - [hidden_dim] * num_hidden_layer, output_activation=tf.nn.tanh) - - net_list = [v_net, policy_net] - alg_params['net_list'] = net_list - - if alg_params.get('optimizers_list') is None: - critic_lr = 1e-3 - optimizers_list = [tf.optimizers.Adam(critic_lr)] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict(train_episodes=2000, - test_episodes=100, - max_steps=200, - save_interval=100, - gamma=0.9, - batch_size=256, - backtrack_iters=10, - backtrack_coeff=0.8, - train_critic_iters=80) - - return alg_params, learn_params - - -def rlbench(env, default_seed=True): - if default_seed: - # reproducible - seed = 2 - set_seed(seed, env) - - alg_params = dict( - damping_coeff=0.1, - cg_iters=10, - delta=0.01 - ) - - if alg_params.get('net_list') is None: - num_hidden_layer = 2 # number of hidden layers for the networks - hidden_dim = 64 # dimension of hidden layers for the networks - with tf.name_scope('TRPO'): - with tf.name_scope('V_Net'): - v_net = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer) - with tf.name_scope('Policy'): - policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, - [hidden_dim] * num_hidden_layer, output_activation=tf.nn.tanh) - - net_list = [v_net, policy_net] - alg_params['net_list'] = net_list - - if alg_params.get('optimizers_list') is None: - critic_lr = 1e-3 - optimizers_list = [tf.optimizers.Adam(critic_lr)] - alg_params['optimizers_list'] = optimizers_list - - learn_params = dict(train_episodes=2000, - test_episodes=100, - max_steps=200, - save_interval=100, - gamma=0.9, - batch_size=256, - backtrack_iters=10, - backtrack_coeff=0.8, - train_critic_iters=80) - - return alg_params, learn_params +from rlzoo.common.policy_networks import * +from rlzoo.common.value_networks import * +from rlzoo.common.utils import set_seed + +""" +full list of algorithm parameters (alg_params) +----------------------------------------------- +net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization +optimizers_list: a list of optimizers for all networks and differentiable variables +damping_coeff: Artifact for numerical stability +cg_iters: Number of iterations of conjugate gradient to perform +delta: KL-divergence limit for TRPO update. 
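`damping_coeff` and `cg_iters` above refer to the damped conjugate-gradient solve behind TRPO's natural-gradient direction. The routine below is a generic sketch of that idea; the function name, the `Avp` callback and the toy system are assumptions for illustration, not RLzoo's actual code.

```python
import numpy as np

def conjugate_gradient(Avp, b, cg_iters=10, damping_coeff=0.1, eps=1e-8):
    """Approximately solve (A + damping_coeff * I) x = b, where Avp(v) returns A @ v."""
    x = np.zeros_like(b)
    r = b.copy()                      # residual (x starts at zero)
    p = r.copy()                      # search direction
    r_dot_r = r @ r
    for _ in range(cg_iters):
        z = Avp(p) + damping_coeff * p
        alpha = r_dot_r / (p @ z + eps)
        x = x + alpha * p
        r = r - alpha * z
        new_r_dot_r = r @ r
        p = r + (new_r_dot_r / (r_dot_r + eps)) * p
        r_dot_r = new_r_dot_r
    return x

# Toy check on a small symmetric positive-definite system
A = np.array([[4.0, 1.0], [1.0, 3.0]])
b = np.array([1.0, 2.0])
x = conjugate_gradient(lambda v: A @ v, b, damping_coeff=0.0)
print(np.allclose(A @ x, b, atol=1e-6))  # True
```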
+----------------------------------------------- + +full list of learning parameters (learn_params) +----------------------------------------------- +train_episodes: total number of episodes for training +test_episodes: total number of episodes for testing +max_steps: maximum number of steps for one episode +save_interval: time steps for saving +gamma: reward discount factor +mode: train or test +render: render each step +batch_size: update batch size +backtrack_iters: Maximum number of steps allowed in the backtracking line search +backtrack_coeff: How far back to step during backtracking line search +train_critic_iters: critic update iteration steps +----------------------------------------------- +""" + + +def atari(env, default_seed=True): + if default_seed: + # reproducible + seed = 2 + set_seed(seed, env) + + alg_params = dict( + damping_coeff=0.1, + cg_iters=10, + delta=0.01 + ) + + if alg_params.get('net_list') is None: + num_hidden_layer = 2 # number of hidden layers for the networks + hidden_dim = 64 # dimension of hidden layers for the networks + with tf.name_scope('TRPO'): + with tf.name_scope('V_Net'): + v_net = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer) + with tf.name_scope('Policy'): + policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, + [hidden_dim] * num_hidden_layer, output_activation=tf.nn.tanh) + + net_list = [v_net, policy_net] + alg_params['net_list'] = net_list + + if alg_params.get('optimizers_list') is None: + critic_lr = 1e-3 + optimizers_list = [tf.optimizers.Adam(critic_lr)] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict(train_episodes=2000, + test_episodes=100, + max_steps=200, + save_interval=100, + gamma=0.9, + batch_size=256, + backtrack_iters=10, + backtrack_coeff=0.8, + train_critic_iters=80) + + return alg_params, learn_params + + +def classic_control(env, default_seed=True): + if default_seed: + # reproducible + seed = 2 + set_seed(seed, env) + + alg_params = dict( + damping_coeff=0.1, + cg_iters=10, + delta=0.01 + ) + + if alg_params.get('net_list') is None: + num_hidden_layer = 2 # number of hidden layers for the networks + hidden_dim = 64 # dimension of hidden layers for the networks + with tf.name_scope('TRPO'): + with tf.name_scope('V_Net'): + v_net = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer) + with tf.name_scope('Policy'): + policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, + [hidden_dim] * num_hidden_layer, output_activation=tf.nn.tanh) + + net_list = [v_net, policy_net] + alg_params['net_list'] = net_list + + if alg_params.get('optimizers_list') is None: + critic_lr = 1e-3 + optimizers_list = [tf.optimizers.Adam(critic_lr)] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict(train_episodes=2000, + test_episodes=100, + max_steps=200, + save_interval=100, + gamma=0.9, + batch_size=256, + backtrack_iters=10, + backtrack_coeff=0.8, + train_critic_iters=80) + + return alg_params, learn_params + + +def box2d(env, default_seed=True): + if default_seed: + # reproducible + seed = 2 + set_seed(seed, env) + + alg_params = dict( + damping_coeff=0.1, + cg_iters=10, + delta=0.01 + ) + + if alg_params.get('net_list') is None: + num_hidden_layer = 2 # number of hidden layers for the networks + hidden_dim = 64 # dimension of hidden layers for the networks + with tf.name_scope('TRPO'): + with tf.name_scope('V_Net'): + v_net = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer) + with 
tf.name_scope('Policy'): + policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, + [hidden_dim] * num_hidden_layer, output_activation=tf.nn.tanh) + + net_list = [v_net, policy_net] + alg_params['net_list'] = net_list + + if alg_params.get('optimizers_list') is None: + critic_lr = 1e-3 + optimizers_list = [tf.optimizers.Adam(critic_lr)] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict(train_episodes=2000, + test_episodes=100, + max_steps=200, + save_interval=100, + gamma=0.9, + batch_size=256, + backtrack_iters=10, + backtrack_coeff=0.8, + train_critic_iters=80) + + return alg_params, learn_params + + +def mujoco(env, default_seed=True): + if default_seed: + # reproducible + seed = 2 + set_seed(seed, env) + + alg_params = dict( + damping_coeff=0.1, + cg_iters=10, + delta=0.01 + ) + + if alg_params.get('net_list') is None: + num_hidden_layer = 2 # number of hidden layers for the networks + hidden_dim = 64 # dimension of hidden layers for the networks + with tf.name_scope('TRPO'): + with tf.name_scope('V_Net'): + v_net = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer) + with tf.name_scope('Policy'): + policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, + [hidden_dim] * num_hidden_layer, output_activation=tf.nn.tanh) + + net_list = [v_net, policy_net] + alg_params['net_list'] = net_list + + if alg_params.get('optimizers_list') is None: + critic_lr = 1e-3 + optimizers_list = [tf.optimizers.Adam(critic_lr)] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict(train_episodes=2000, + test_episodes=100, + max_steps=200, + save_interval=100, + gamma=0.9, + batch_size=256, + backtrack_iters=10, + backtrack_coeff=0.8, + train_critic_iters=80) + + return alg_params, learn_params + + +def robotics(env, default_seed=True): + if default_seed: + # reproducible + seed = 2 + set_seed(seed, env) + + alg_params = dict( + damping_coeff=0.1, + cg_iters=10, + delta=0.01 + ) + + if alg_params.get('net_list') is None: + num_hidden_layer = 2 # number of hidden layers for the networks + hidden_dim = 64 # dimension of hidden layers for the networks + with tf.name_scope('TRPO'): + with tf.name_scope('V_Net'): + v_net = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer) + with tf.name_scope('Policy'): + policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, + [hidden_dim] * num_hidden_layer, output_activation=tf.nn.tanh) + + net_list = [v_net, policy_net] + alg_params['net_list'] = net_list + + if alg_params.get('optimizers_list') is None: + critic_lr = 1e-3 + optimizers_list = [tf.optimizers.Adam(critic_lr)] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict(train_episodes=2000, + test_episodes=100, + max_steps=200, + save_interval=100, + gamma=0.9, + batch_size=256, + backtrack_iters=10, + backtrack_coeff=0.8, + train_critic_iters=80) + + return alg_params, learn_params + + +def dm_control(env, default_seed=True): + if default_seed: + # reproducible + seed = 2 + set_seed(seed, env) + + alg_params = dict( + damping_coeff=0.1, + cg_iters=10, + delta=0.01 + ) + + if alg_params.get('net_list') is None: + num_hidden_layer = 2 # number of hidden layers for the networks + hidden_dim = 64 # dimension of hidden layers for the networks + with tf.name_scope('TRPO'): + with tf.name_scope('V_Net'): + v_net = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer) + with tf.name_scope('Policy'): + policy_net = 
StochasticPolicyNetwork(env.observation_space, env.action_space, + [hidden_dim] * num_hidden_layer, output_activation=tf.nn.tanh) + + net_list = [v_net, policy_net] + alg_params['net_list'] = net_list + + if alg_params.get('optimizers_list') is None: + critic_lr = 1e-3 + optimizers_list = [tf.optimizers.Adam(critic_lr)] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict(train_episodes=2000, + test_episodes=100, + max_steps=200, + save_interval=100, + gamma=0.9, + batch_size=256, + backtrack_iters=10, + backtrack_coeff=0.8, + train_critic_iters=80) + + return alg_params, learn_params + + +def rlbench(env, default_seed=True): + if default_seed: + # reproducible + seed = 2 + set_seed(seed, env) + + alg_params = dict( + damping_coeff=0.1, + cg_iters=10, + delta=0.01 + ) + + if alg_params.get('net_list') is None: + num_hidden_layer = 2 # number of hidden layers for the networks + hidden_dim = 64 # dimension of hidden layers for the networks + with tf.name_scope('TRPO'): + with tf.name_scope('V_Net'): + v_net = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer) + with tf.name_scope('Policy'): + policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, + [hidden_dim] * num_hidden_layer, output_activation=tf.nn.tanh) + + net_list = [v_net, policy_net] + alg_params['net_list'] = net_list + + if alg_params.get('optimizers_list') is None: + critic_lr = 1e-3 + optimizers_list = [tf.optimizers.Adam(critic_lr)] + alg_params['optimizers_list'] = optimizers_list + + learn_params = dict(train_episodes=2000, + test_episodes=100, + max_steps=200, + save_interval=100, + gamma=0.9, + batch_size=256, + backtrack_iters=10, + backtrack_coeff=0.8, + train_critic_iters=80) + + return alg_params, learn_params diff --git a/rlzoo/algorithms/trpo/run_trpo.py b/rlzoo/algorithms/trpo/run_trpo.py old mode 100644 new mode 100755 index 9bfd2cf..37de726 --- a/rlzoo/algorithms/trpo/run_trpo.py +++ b/rlzoo/algorithms/trpo/run_trpo.py @@ -1,58 +1,58 @@ -from rlzoo.common.utils import set_seed -from rlzoo.algorithms.trpo.trpo import TRPO -from rlzoo.common.policy_networks import * -from rlzoo.common.value_networks import * -import gym - -""" load environment """ -env = gym.make('Pendulum-v0').unwrapped - -# reproducible -seed = 2 -set_seed(seed, env) - -""" build networks for the algorithm """ -name = 'TRPO' -hidden_dim = 64 -num_hidden_layer = 2 -critic = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer, name=name + '_value') - -actor = StochasticPolicyNetwork(env.observation_space, env.action_space, [hidden_dim] * num_hidden_layer, - output_activation=tf.nn.tanh, name=name + '_policy') -net_list = critic, actor - -critic_lr = 1e-3 -optimizers_list = [tf.optimizers.Adam(critic_lr)] - -""" create model """ -model = TRPO(net_list, optimizers_list, damping_coeff=0.1, cg_iters=10, delta=0.01) -""" -full list of arguments for the algorithm ----------------------------------------- -net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization -optimizers_list: a list of optimizers for all networks and differentiable variables -damping_coeff: Artifact for numerical stability -cg_iters: Number of iterations of conjugate gradient to perform -delta: KL-divergence limit for TRPO update. 
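`delta` above caps the KL divergence of each policy update; together with the `backtrack_iters` and `backtrack_coeff` parameters documented for `learn()` further down, it drives a backtracking line search. A hypothetical sketch of that interaction (the `set_and_eval` callback is an assumption for illustration, not RLzoo's actual interface):

```python
def backtracking_line_search(set_and_eval, backtrack_iters=10, backtrack_coeff=0.8, delta=0.01):
    """set_and_eval(step) is assumed to apply `step * full_step` to the policy
    and return (kl, surrogate_improvement)."""
    for j in range(backtrack_iters):
        kl, improve = set_and_eval(step=backtrack_coeff ** j)
        if kl <= delta and improve > 0:
            return True               # accept the largest step respecting the KL limit
    set_and_eval(step=0.0)            # line search failed: keep the old parameters
    return False

# Toy check: large steps violate the KL limit, small ones do not
print(backtracking_line_search(lambda step: (0.02 if step > 0.5 else 0.005, 1.0)))  # True
```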
-""" - -model.learn(env, mode='train', render=False, train_episodes=2000, max_steps=200, save_interval=100, - gamma=0.9, batch_size=256, backtrack_iters=10, backtrack_coeff=0.8, train_critic_iters=80) -""" -full list of parameters for training ---------------------------------------- -env: learning environment -train_episodes: total number of episodes for training -test_episodes: total number of episodes for testing -max_steps: maximum number of steps for one episode -save_interval: time steps for saving -gamma: reward discount factor -mode: train or test -render: render each step -batch_size: update batch size -backtrack_iters: Maximum number of steps allowed in the backtracking line search -backtrack_coeff: How far back to step during backtracking line search -train_critic_iters: critic update iteration steps -""" - -model.learn(env, test_episodes=100, max_steps=200, mode='test', render=True) +from rlzoo.common.utils import set_seed +from rlzoo.algorithms.trpo.trpo import TRPO +from rlzoo.common.policy_networks import * +from rlzoo.common.value_networks import * +import gym + +""" load environment """ +env = gym.make('Pendulum-v0').unwrapped + +# reproducible +seed = 2 +set_seed(seed, env) + +""" build networks for the algorithm """ +name = 'TRPO' +hidden_dim = 64 +num_hidden_layer = 2 +critic = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer, name=name + '_value') + +actor = StochasticPolicyNetwork(env.observation_space, env.action_space, [hidden_dim] * num_hidden_layer, + output_activation=tf.nn.tanh, name=name + '_policy') +net_list = critic, actor + +critic_lr = 1e-3 +optimizers_list = [tf.optimizers.Adam(critic_lr)] + +""" create model """ +model = TRPO(net_list, optimizers_list, damping_coeff=0.1, cg_iters=10, delta=0.01) +""" +full list of arguments for the algorithm +---------------------------------------- +net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization +optimizers_list: a list of optimizers for all networks and differentiable variables +damping_coeff: Artifact for numerical stability +cg_iters: Number of iterations of conjugate gradient to perform +delta: KL-divergence limit for TRPO update. 
+""" + +model.learn(env, mode='train', render=False, train_episodes=2000, max_steps=200, save_interval=100, + gamma=0.9, batch_size=256, backtrack_iters=10, backtrack_coeff=0.8, train_critic_iters=80) +""" +full list of parameters for training +--------------------------------------- +env: learning environment +train_episodes: total number of episodes for training +test_episodes: total number of episodes for testing +max_steps: maximum number of steps for one episode +save_interval: time steps for saving +gamma: reward discount factor +mode: train or test +render: render each step +batch_size: update batch size +backtrack_iters: Maximum number of steps allowed in the backtracking line search +backtrack_coeff: How far back to step during backtracking line search +train_critic_iters: critic update iteration steps +""" + +model.learn(env, test_episodes=100, max_steps=200, mode='test', render=True) diff --git a/rlzoo/algorithms/trpo/trpo.py b/rlzoo/algorithms/trpo/trpo.py old mode 100644 new mode 100755 diff --git a/rlzoo/common/__init__.py b/rlzoo/common/__init__.py old mode 100644 new mode 100755 diff --git a/rlzoo/common/basic_nets.py b/rlzoo/common/basic_nets.py old mode 100644 new mode 100755 index 4a9a272..d6b9510 --- a/rlzoo/common/basic_nets.py +++ b/rlzoo/common/basic_nets.py @@ -1,149 +1,149 @@ -"""Basic neural networks""" -import tensorflow as tf -import tensorlayer as tl -from tensorlayer.layers import Dense, Input -from gym import spaces -from collections import OrderedDict - - -def MLP(input_dim, hidden_dim_list, w_init=tf.initializers.Orthogonal(0.2), - activation=tf.nn.relu, *args, **kwargs): - """Multiple fully-connected layers for approximation - - :param input_dim: (int) size of input tensor - :param hidden_dim_list: (list[int]) a list of dimensions of hidden layers - :param w_init: (callable) initialization method for weights - :param activation: (callable) activation function of hidden layers - - Return: - input tensor, output tensor - """ - - l = inputs = Input([None, input_dim]) - for i in range(len(hidden_dim_list)): - l = Dense(n_units=hidden_dim_list[i], act=activation, W_init=w_init)(l) - outputs = l - - return inputs, outputs - - -def MLPModel(input_dim, hidden_dim_list, w_init=tf.initializers.Orthogonal(0.2), - activation=tf.nn.relu, *args, **kwargs): - """Multiple fully-connected layers for approximation - - :param input_dim: (int) size of input tensor - :param hidden_dim_list: (list[int]) a list of dimensions of hidden layers - :param w_init: (callable) initialization method for weights - :param activation: (callable) activation function of hidden layers - - Return: - input tensor, output tensor - """ - l = inputs = Input([None, input_dim], name='Input_Layer') - for i in range(len(hidden_dim_list)): - l = Dense(n_units=hidden_dim_list[i], act=activation, W_init=w_init, name='Hidden_Layer%d' % (i + 1))(l) - outputs = l - - return tl.models.Model(inputs=inputs, outputs=outputs) - - -def CNN(input_shape, conv_kwargs=None): - """Multiple convolutional layers for approximation - Default setting is equal to architecture used in DQN - - :param input_shape: (tuple[int]) (H, W, C) - :param conv_kwargs: (list[param]) list of conv parameters for tl.layers.Conv2d - - Return: - input tensor, output tensor - """ - if not conv_kwargs: - in_channels = input_shape[-1] - conv_kwargs = [ - { - 'in_channels': in_channels, 'n_filter': 32, 'act': tf.nn.relu, - 'filter_size': (8, 8), 'strides': (4, 4), 'padding': 'VALID', - 'W_init': tf.initializers.GlorotUniform() - }, - { - 
'in_channels': 32, 'n_filter': 64, 'act': tf.nn.relu, - 'filter_size': (4, 4), 'strides': (2, 2), 'padding': 'VALID', - 'W_init': tf.initializers.GlorotUniform() - }, - { - 'in_channels': 64, 'n_filter': 64, 'act': tf.nn.relu, - 'filter_size': (3, 3), 'strides': (1, 1), 'padding': 'VALID', - 'W_init': tf.initializers.GlorotUniform() - } - ] - l = inputs = tl.layers.Input((1,) + input_shape) - - for i, kwargs in enumerate(conv_kwargs): - # kwargs['name'] = kwargs.get('name', 'cnn_layer{}'.format(i + 1)) - l = tl.layers.Conv2d(**kwargs)(l) - outputs = tl.layers.Flatten()(l) - - return inputs, outputs - - -def CNNModel(input_shape, conv_kwargs=None): - """Multiple convolutional layers for approximation - Default setting is equal to architecture used in DQN - - :param input_shape: (tuple[int]) (H, W, C) - :param conv_kwargs: (list[param]) list of conv parameters for tl.layers.Conv2d - - Return: - tl.model.Model - """ - if not conv_kwargs: - in_channels = input_shape[-1] - conv_kwargs = [ - { - 'in_channels': in_channels, 'n_filter': 32, 'act': tf.nn.relu, - 'filter_size': (8, 8), 'strides': (4, 4), 'padding': 'VALID', - 'W_init': tf.initializers.GlorotUniform() - }, - { - 'in_channels': 32, 'n_filter': 64, 'act': tf.nn.relu, - 'filter_size': (4, 4), 'strides': (2, 2), 'padding': 'VALID', - 'W_init': tf.initializers.GlorotUniform() - }, - { - 'in_channels': 64, 'n_filter': 64, 'act': tf.nn.relu, - 'filter_size': (3, 3), 'strides': (1, 1), 'padding': 'VALID', - 'W_init': tf.initializers.GlorotUniform() - } - ] - - ni = tl.layers.Input((1,) + input_shape, name='CNN_Input') - hi = ni - - for i, kwargs in enumerate(conv_kwargs): - kwargs['name'] = kwargs.get('name', 'CNN_Layer{}'.format(i + 1)) - hi = tl.layers.Conv2d(**kwargs)(hi) - no = tl.layers.Flatten(name='Flatten_Layer')(hi) - - return tl.models.Model(inputs=ni, outputs=no) - - -def CreateInputLayer(state_space, conv_kwargs=None): - def CreateSingleInput(single_state_space): - single_state_shape = single_state_space.shape - # build structure - if len(single_state_shape) == 1: - l = inputs = Input((None,) + single_state_shape, name='input_layer') - else: - with tf.name_scope('CNN'): - inputs, l = CNN(single_state_shape, conv_kwargs=conv_kwargs) - return inputs, l, single_state_shape - - if isinstance(state_space, spaces.Dict): - input_dict, layer_dict, shape_dict = OrderedDict(), OrderedDict(), OrderedDict() - for k, v in state_space.spaces.items(): - input_dict[k], layer_dict[k], shape_dict[k] = CreateSingleInput(v) - return input_dict, layer_dict, shape_dict - if isinstance(state_space, spaces.Space): - return CreateSingleInput(state_space) - else: - raise ValueError('state space error') +"""Basic neural networks""" +import tensorflow as tf +import tensorlayer as tl +from tensorlayer.layers import Dense, Input +from gym import spaces +from collections import OrderedDict + + +def MLP(input_dim, hidden_dim_list, w_init=tf.initializers.Orthogonal(0.2), + activation=tf.nn.relu, *args, **kwargs): + """Multiple fully-connected layers for approximation + + :param input_dim: (int) size of input tensor + :param hidden_dim_list: (list[int]) a list of dimensions of hidden layers + :param w_init: (callable) initialization method for weights + :param activation: (callable) activation function of hidden layers + + Return: + input tensor, output tensor + """ + + l = inputs = Input([None, input_dim]) + for i in range(len(hidden_dim_list)): + l = Dense(n_units=hidden_dim_list[i], act=activation, W_init=w_init)(l) + outputs = l + + return inputs, outputs + + 
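The `MLP` helper above returns only the symbolic input and output tensors; a model object still has to be wrapped around them before a forward pass. A minimal usage sketch (editorial illustration, not part of this patch; assumes TensorFlow 2.x and TensorLayer 2.x as required by RLzoo):

```python
import numpy as np
import tensorlayer as tl
from rlzoo.common.basic_nets import MLP

# build a 2-layer, 64-unit MLP for 4-dimensional observations
inputs, outputs = MLP(input_dim=4, hidden_dim_list=[64, 64])
model = tl.models.Model(inputs=inputs, outputs=outputs)
model.eval()  # inference mode

obs = np.zeros([1, 4], dtype=np.float32)  # dummy batch of one observation
features = model(obs)                     # forward pass, shape (1, 64)
print(features.shape)
```

The `MLPModel` variant defined next performs this wrapping internally and returns the `tl.models.Model` directly.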
+def MLPModel(input_dim, hidden_dim_list, w_init=tf.initializers.Orthogonal(0.2), + activation=tf.nn.relu, *args, **kwargs): + """Multiple fully-connected layers for approximation + + :param input_dim: (int) size of input tensor + :param hidden_dim_list: (list[int]) a list of dimensions of hidden layers + :param w_init: (callable) initialization method for weights + :param activation: (callable) activation function of hidden layers + + Return: + input tensor, output tensor + """ + l = inputs = Input([None, input_dim], name='Input_Layer') + for i in range(len(hidden_dim_list)): + l = Dense(n_units=hidden_dim_list[i], act=activation, W_init=w_init, name='Hidden_Layer%d' % (i + 1))(l) + outputs = l + + return tl.models.Model(inputs=inputs, outputs=outputs) + + +def CNN(input_shape, conv_kwargs=None): + """Multiple convolutional layers for approximation + Default setting is equal to architecture used in DQN + + :param input_shape: (tuple[int]) (H, W, C) + :param conv_kwargs: (list[param]) list of conv parameters for tl.layers.Conv2d + + Return: + input tensor, output tensor + """ + if not conv_kwargs: + in_channels = input_shape[-1] + conv_kwargs = [ + { + 'in_channels': in_channels, 'n_filter': 32, 'act': tf.nn.relu, + 'filter_size': (8, 8), 'strides': (4, 4), 'padding': 'VALID', + 'W_init': tf.initializers.GlorotUniform() + }, + { + 'in_channels': 32, 'n_filter': 64, 'act': tf.nn.relu, + 'filter_size': (4, 4), 'strides': (2, 2), 'padding': 'VALID', + 'W_init': tf.initializers.GlorotUniform() + }, + { + 'in_channels': 64, 'n_filter': 64, 'act': tf.nn.relu, + 'filter_size': (3, 3), 'strides': (1, 1), 'padding': 'VALID', + 'W_init': tf.initializers.GlorotUniform() + } + ] + l = inputs = tl.layers.Input((1,) + input_shape) + + for i, kwargs in enumerate(conv_kwargs): + # kwargs['name'] = kwargs.get('name', 'cnn_layer{}'.format(i + 1)) + l = tl.layers.Conv2d(**kwargs)(l) + outputs = tl.layers.Flatten()(l) + + return inputs, outputs + + +def CNNModel(input_shape, conv_kwargs=None): + """Multiple convolutional layers for approximation + Default setting is equal to architecture used in DQN + + :param input_shape: (tuple[int]) (H, W, C) + :param conv_kwargs: (list[param]) list of conv parameters for tl.layers.Conv2d + + Return: + tl.model.Model + """ + if not conv_kwargs: + in_channels = input_shape[-1] + conv_kwargs = [ + { + 'in_channels': in_channels, 'n_filter': 32, 'act': tf.nn.relu, + 'filter_size': (8, 8), 'strides': (4, 4), 'padding': 'VALID', + 'W_init': tf.initializers.GlorotUniform() + }, + { + 'in_channels': 32, 'n_filter': 64, 'act': tf.nn.relu, + 'filter_size': (4, 4), 'strides': (2, 2), 'padding': 'VALID', + 'W_init': tf.initializers.GlorotUniform() + }, + { + 'in_channels': 64, 'n_filter': 64, 'act': tf.nn.relu, + 'filter_size': (3, 3), 'strides': (1, 1), 'padding': 'VALID', + 'W_init': tf.initializers.GlorotUniform() + } + ] + + ni = tl.layers.Input((1,) + input_shape, name='CNN_Input') + hi = ni + + for i, kwargs in enumerate(conv_kwargs): + kwargs['name'] = kwargs.get('name', 'CNN_Layer{}'.format(i + 1)) + hi = tl.layers.Conv2d(**kwargs)(hi) + no = tl.layers.Flatten(name='Flatten_Layer')(hi) + + return tl.models.Model(inputs=ni, outputs=no) + + +def CreateInputLayer(state_space, conv_kwargs=None): + def CreateSingleInput(single_state_space): + single_state_shape = single_state_space.shape + # build structure + if len(single_state_shape) == 1: + l = inputs = Input((None,) + single_state_shape, name='input_layer') + else: + with tf.name_scope('CNN'): + inputs, l = 
CNN(single_state_shape, conv_kwargs=conv_kwargs) + return inputs, l, single_state_shape + + if isinstance(state_space, spaces.Dict): + input_dict, layer_dict, shape_dict = OrderedDict(), OrderedDict(), OrderedDict() + for k, v in state_space.spaces.items(): + input_dict[k], layer_dict[k], shape_dict[k] = CreateSingleInput(v) + return input_dict, layer_dict, shape_dict + if isinstance(state_space, spaces.Space): + return CreateSingleInput(state_space) + else: + raise ValueError('state space error') diff --git a/rlzoo/common/buffer.py b/rlzoo/common/buffer.py old mode 100644 new mode 100755 index 6455a5d..88f7a7c --- a/rlzoo/common/buffer.py +++ b/rlzoo/common/buffer.py @@ -1,306 +1,306 @@ -""" -Functions for utilization. - -# Requirements -tensorflow==2.0.0a0 -tensorlayer==2.0.1 - -""" -import inspect -import operator -import random - -import numpy as np - - -class ReplayBuffer(object): - """A standard ring buffer for storing transitions and sampling for training""" - def __init__(self, capacity): - self.capacity = capacity # mamimum number of samples - self.buffer = [] - self.position = 0 # pointer - - def push(self, state, action, reward, next_state, done): - if len(self.buffer) < self.capacity: - self.buffer.append(None) - self.buffer[self.position] = (state, action, reward, next_state, done) - self.position = int((self.position + 1) % self.capacity) # as a ring buffer - - def sample(self, batch_size): - indexes = range(len(self)) - # sample with replacement - idxes = [random.choice(indexes) for _ in range(batch_size)] - return self._encode_sample(idxes) - - def _encode_sample(self, idxes): - states, actions, rewards, next_states, dones = [], [], [], [], [] - for i in idxes: - state, action, reward, next_state, done = self.buffer[i] - states.append(state) - actions.append(action) - rewards.append(reward) - next_states.append(next_state) - dones.append(done) - return ( - np.stack(states), - np.stack(actions), - np.stack(rewards), - np.stack(next_states), - np.stack(dones), - ) - - def __len__(self): - return len(self.buffer) - - -class SegmentTree(object): - def __init__(self, capacity, operation, neutral_element): - """Build a Segment Tree data structure. - - https://en.wikipedia.org/wiki/Segment_tree - - Can be used as regular array, but with two - important differences: - - a) setting item's value is slightly slower. - It is O(lg capacity) instead of O(1). - b) user has access to an efficient ( O(log segment size) ) - `reduce` operation which reduces `operation` over - a contiguous subsequence of items in the array. - - :param apacity: (int) - Total size of the array - must be a power of two. - :param operation: (lambda obj, obj -> obj) - and operation for combining elements (eg. sum, max) - must form a mathematical group together with the set of - possible values for array elements (i.e. be associative) - :param neutral_element: (obj) - neutral element for the operation above. eg. float('-inf') - for max and 0 for sum. - """ - assert capacity > 0 and capacity & (capacity - 1) == 0, \ - "capacity must be positive and a power of 2." 
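As a quick illustration of the `ReplayBuffer` interface defined at the top of this file (push one transition at a time, then sample a batch with replacement), a minimal sketch; editorial only, not part of the patch:

```python
import numpy as np
from rlzoo.common.buffer import ReplayBuffer

buffer = ReplayBuffer(capacity=1024)
obs = np.zeros(4, dtype=np.float32)

# store a few dummy (state, action, reward, next_state, done) transitions
for _ in range(8):
    buffer.push(obs, 0, 1.0, obs, False)

states, actions, rewards, next_states, dones = buffer.sample(batch_size=4)
print(states.shape, rewards.shape)  # (4, 4) (4,)
```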
- self._capacity = capacity - self._value = [neutral_element for _ in range(2 * capacity)] - self._operation = operation - - def _reduce_helper(self, start, end, node, node_start, node_end): - if start == node_start and end == node_end: - return self._value[node] - mid = (node_start + node_end) // 2 - if end <= mid: - return self._reduce_helper(start, end, 2 * node, node_start, mid) - else: - if mid + 1 <= start: - return self._reduce_helper(start, end, 2 * node + 1, mid + 1, node_end) - else: - return self._operation( - self._reduce_helper(start, mid, 2 * node, node_start, mid), - self._reduce_helper(mid + 1, end, 2 * node + 1, mid + 1, node_end) - ) - - def reduce(self, start=0, end=None): - """Returns result of applying `self.operation` - to a contiguous subsequence of the array. - - :param start: (int) beginning of the subsequence - :param end: (int) end of the subsequences - - Returns: - reduced: (obj) result of reducing self.operation over the specified range of array. - """ - if end is None: - end = self._capacity - if end < 0: - end += self._capacity - end -= 1 - return self._reduce_helper(start, end, 1, 0, self._capacity - 1) - - def __setitem__(self, idx, val): - # index of the leaf - idx += self._capacity - self._value[idx] = val - idx //= 2 - while idx >= 1: - self._value[idx] = self._operation(self._value[2 * idx], self._value[2 * idx + 1]) - idx //= 2 - - def __getitem__(self, idx): - assert 0 <= idx < self._capacity - return self._value[self._capacity + idx] - - -class SumSegmentTree(SegmentTree): - - def __init__(self, capacity): - super(SumSegmentTree, self).__init__(capacity=capacity, operation=operator.add, neutral_element=0.0) - - def sum(self, start=0, end=None): - """Returns arr[start] + ... + arr[end]""" - return super(SumSegmentTree, self).reduce(start, end) - - def find_prefixsum_idx(self, prefixsum): - """Find the highest index `i` in the array such that - sum(arr[0] + arr[1] + ... + arr[i - i]) <= prefixsum - - if array values are probabilities, this function - allows to sample indexes according to the discrete - probability efficiently. - - :param perfixsum: (float) - upperbound on the sum of array prefix - - Returns: - idx: (int) - highest index satisfying the prefixsum constraint - """ - assert 0 <= prefixsum <= self.sum() + 1e-5 - idx = 1 - while idx < self._capacity: # while non-leaf - if self._value[2 * idx] > prefixsum: - idx = 2 * idx - else: - prefixsum -= self._value[2 * idx] - idx = 2 * idx + 1 - return idx - self._capacity - - -class MinSegmentTree(SegmentTree): - - def __init__(self, capacity): - super(MinSegmentTree, self).__init__(capacity=capacity, operation=min, neutral_element=float('inf')) - - def min(self, start=0, end=None): - """Returns min(arr[start], ..., arr[end])""" - - return super(MinSegmentTree, self).reduce(start, end) - - -class PrioritizedReplayBuffer(ReplayBuffer): # is it succeed from the ReplayBuffer above? - def __init__(self, capacity, alpha, beta): - """Create Prioritized Replay buffer. - - :param capacity: (int) - Max number of transitions to store in the buffer. When the buffer - overflows the old memories are dropped. 
- :param alpha: (float) - how much prioritization is used - (0 - no prioritization, 1 - full prioritization) - - See Also: - ReplayBuffer.__init__ - """ - super(PrioritizedReplayBuffer, self).__init__(capacity) - assert alpha >= 0 - self._alpha = alpha - - it_capacity = 1 - while it_capacity < capacity: - it_capacity *= 2 - - self._it_sum = SumSegmentTree(it_capacity) - self._it_min = MinSegmentTree(it_capacity) - self._max_priority = 1.0 - self.beta = beta - - def push(self, *args): - """See ReplayBuffer.store_effect""" - idx = self.position - super().push(*args) - self._it_sum[idx] = self._max_priority ** self._alpha - self._it_min[idx] = self._max_priority ** self._alpha - - def _sample_proportional(self, batch_size): - res = [] - p_total = self._it_sum.sum(0, len(self.buffer) - 1) - every_range_len = p_total / batch_size - for i in range(batch_size): - mass = random.random() * every_range_len + i * every_range_len - idx = self._it_sum.find_prefixsum_idx(mass) - res.append(idx) - return res - - def sample(self, batch_size): - """Sample a batch of experiences""" - idxes = self._sample_proportional(batch_size) - - it_sum = self._it_sum.sum() - p_min = self._it_min.min() / it_sum - max_weight = (p_min * len(self.buffer))**(-self.beta) - - p_samples = np.asarray([self._it_sum[idx] for idx in idxes]) / it_sum - weights = (p_samples * len(self.buffer)) ** (-self.beta) / max_weight - encoded_sample = self._encode_sample(idxes) - return encoded_sample + (weights, idxes) - - def update_priorities(self, idxes, priorities): - """Update priorities of sampled transitions""" - assert len(idxes) == len(priorities) - for idx, priority in zip(idxes, priorities): - assert priority > 0 - assert 0 <= idx < len(self.buffer) - self._it_sum[idx] = priority ** self._alpha - self._it_min[idx] = priority ** self._alpha - - self._max_priority = max(self._max_priority, priority) - - -class HindsightReplayBuffer(ReplayBuffer): - """Hindsight Experience Replay - In this buffer, state is a tuple consists of (observation, goal) - """ - GOAL_FUTURE = 'future' - GOAL_EPISODE = 'episode' - GOAL_RANDOM = 'random' - - def __init__(self, capacity, hindsight_freq, goal_type, reward_func, done_func): - """ - :param hindsight_freq (int): How many hindsight transitions will be generated for each real transition - :param goal_type (str): The generatation method of hindsight goals. 
Should be HER_GOAL_* - :param reward_func (callable): goal (np.array) X next_state (np.array) -> reward (float) - :param done_func (callable): goal (np.array) X next_state (np.array) -> done_flag (bool) - """ - super().__init__(capacity) - self.hindsight_freq = hindsight_freq - self.goal_type = goal_type - self.reward_func = reward_func - self.done_func = done_func - - def _sample_goals(self, episode, t): - goals = [] - episode_len = len(episode) - for _ in range(self.hindsight_freq): - if self.goal_type == HindsightReplayBuffer.GOAL_FUTURE: - index = random.choice(range(t + 1, episode_len)) - source = episode - elif self.goal_type == HindsightReplayBuffer.GOAL_EPISODE: - index = random.choice(range(episode_len)) - source = episode - elif self.goal_type == HindsightReplayBuffer.GOAL_RANDOM: - index = random.choice(range(len(self))) - source = self.buffer - else: - raise ValueError("Invalid goal type %s" % self.goal_type) - goals.append(source[index][0][0]) # return the observation - return goals - - def push(self, *args, **kwargs): - if inspect.stack()[1][3] != 'push_episode': - raise ValueError("Please use `push_episode` methods in HER") - else: - super().push(*args, **kwargs) - - def push_episode(self, states, actions, rewards, next_states, dones): - episode = list(zip(states, actions, rewards, next_states, dones)) - episode_len = len(states) - for t, (state, action, reward, next_state, done) in enumerate(episode): - self.push(state, action, reward, next_state, done) - if self.goal_type == HindsightReplayBuffer.GOAL_FUTURE and t == episode_len - 1: - break - for goal in self._sample_goals(episode, t): - s = (state[0], goal) - a = action - r = self.reward_func(goal, next_state[0]) - s_ = (next_state[0], goal) - d = self.done_func(goal, next_state[0]) - self.push(s, a, r, s_, d) +""" +Functions for utilization. + +# Requirements +tensorflow==2.0.0a0 +tensorlayer==2.0.1 + +""" +import inspect +import operator +import random + +import numpy as np + + +class ReplayBuffer(object): + """A standard ring buffer for storing transitions and sampling for training""" + def __init__(self, capacity): + self.capacity = capacity # mamimum number of samples + self.buffer = [] + self.position = 0 # pointer + + def push(self, state, action, reward, next_state, done): + if len(self.buffer) < self.capacity: + self.buffer.append(None) + self.buffer[self.position] = (state, action, reward, next_state, done) + self.position = int((self.position + 1) % self.capacity) # as a ring buffer + + def sample(self, batch_size): + indexes = range(len(self)) + # sample with replacement + idxes = [random.choice(indexes) for _ in range(batch_size)] + return self._encode_sample(idxes) + + def _encode_sample(self, idxes): + states, actions, rewards, next_states, dones = [], [], [], [], [] + for i in idxes: + state, action, reward, next_state, done = self.buffer[i] + states.append(state) + actions.append(action) + rewards.append(reward) + next_states.append(next_state) + dones.append(done) + return ( + np.stack(states), + np.stack(actions), + np.stack(rewards), + np.stack(next_states), + np.stack(dones), + ) + + def __len__(self): + return len(self.buffer) + + +class SegmentTree(object): + def __init__(self, capacity, operation, neutral_element): + """Build a Segment Tree data structure. + + https://en.wikipedia.org/wiki/Segment_tree + + Can be used as regular array, but with two + important differences: + + a) setting item's value is slightly slower. + It is O(lg capacity) instead of O(1). 
+ b) user has access to an efficient ( O(log segment size) ) + `reduce` operation which reduces `operation` over + a contiguous subsequence of items in the array. + + :param apacity: (int) + Total size of the array - must be a power of two. + :param operation: (lambda obj, obj -> obj) + and operation for combining elements (eg. sum, max) + must form a mathematical group together with the set of + possible values for array elements (i.e. be associative) + :param neutral_element: (obj) + neutral element for the operation above. eg. float('-inf') + for max and 0 for sum. + """ + assert capacity > 0 and capacity & (capacity - 1) == 0, \ + "capacity must be positive and a power of 2." + self._capacity = capacity + self._value = [neutral_element for _ in range(2 * capacity)] + self._operation = operation + + def _reduce_helper(self, start, end, node, node_start, node_end): + if start == node_start and end == node_end: + return self._value[node] + mid = (node_start + node_end) // 2 + if end <= mid: + return self._reduce_helper(start, end, 2 * node, node_start, mid) + else: + if mid + 1 <= start: + return self._reduce_helper(start, end, 2 * node + 1, mid + 1, node_end) + else: + return self._operation( + self._reduce_helper(start, mid, 2 * node, node_start, mid), + self._reduce_helper(mid + 1, end, 2 * node + 1, mid + 1, node_end) + ) + + def reduce(self, start=0, end=None): + """Returns result of applying `self.operation` + to a contiguous subsequence of the array. + + :param start: (int) beginning of the subsequence + :param end: (int) end of the subsequences + + Returns: + reduced: (obj) result of reducing self.operation over the specified range of array. + """ + if end is None: + end = self._capacity + if end < 0: + end += self._capacity + end -= 1 + return self._reduce_helper(start, end, 1, 0, self._capacity - 1) + + def __setitem__(self, idx, val): + # index of the leaf + idx += self._capacity + self._value[idx] = val + idx //= 2 + while idx >= 1: + self._value[idx] = self._operation(self._value[2 * idx], self._value[2 * idx + 1]) + idx //= 2 + + def __getitem__(self, idx): + assert 0 <= idx < self._capacity + return self._value[self._capacity + idx] + + +class SumSegmentTree(SegmentTree): + + def __init__(self, capacity): + super(SumSegmentTree, self).__init__(capacity=capacity, operation=operator.add, neutral_element=0.0) + + def sum(self, start=0, end=None): + """Returns arr[start] + ... + arr[end]""" + return super(SumSegmentTree, self).reduce(start, end) + + def find_prefixsum_idx(self, prefixsum): + """Find the highest index `i` in the array such that + sum(arr[0] + arr[1] + ... + arr[i - i]) <= prefixsum + + if array values are probabilities, this function + allows to sample indexes according to the discrete + probability efficiently. 
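To make the prefix-sum sampling used by `PrioritizedReplayBuffer._sample_proportional` concrete, a small sketch (editorial illustration, not part of the patch): leaf values act as unnormalised priorities, and `find_prefixsum_idx` returns an index with probability roughly proportional to its priority.

```python
import random
from rlzoo.common.buffer import SumSegmentTree

tree = SumSegmentTree(capacity=4)        # capacity must be a power of two
for idx, priority in enumerate([1.0, 2.0, 3.0, 4.0]):
    tree[idx] = priority                 # set leaf values (priorities)

mass = random.random() * tree.sum()      # uniform in [0, total priority)
sampled = tree.find_prefixsum_idx(mass)  # index 3 comes back ~40% of the time
print(sampled)
```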
+ + :param perfixsum: (float) + upperbound on the sum of array prefix + + Returns: + idx: (int) + highest index satisfying the prefixsum constraint + """ + assert 0 <= prefixsum <= self.sum() + 1e-5 + idx = 1 + while idx < self._capacity: # while non-leaf + if self._value[2 * idx] > prefixsum: + idx = 2 * idx + else: + prefixsum -= self._value[2 * idx] + idx = 2 * idx + 1 + return idx - self._capacity + + +class MinSegmentTree(SegmentTree): + + def __init__(self, capacity): + super(MinSegmentTree, self).__init__(capacity=capacity, operation=min, neutral_element=float('inf')) + + def min(self, start=0, end=None): + """Returns min(arr[start], ..., arr[end])""" + + return super(MinSegmentTree, self).reduce(start, end) + + +class PrioritizedReplayBuffer(ReplayBuffer): # is it succeed from the ReplayBuffer above? + def __init__(self, capacity, alpha, beta): + """Create Prioritized Replay buffer. + + :param capacity: (int) + Max number of transitions to store in the buffer. When the buffer + overflows the old memories are dropped. + :param alpha: (float) + how much prioritization is used + (0 - no prioritization, 1 - full prioritization) + + See Also: + ReplayBuffer.__init__ + """ + super(PrioritizedReplayBuffer, self).__init__(capacity) + assert alpha >= 0 + self._alpha = alpha + + it_capacity = 1 + while it_capacity < capacity: + it_capacity *= 2 + + self._it_sum = SumSegmentTree(it_capacity) + self._it_min = MinSegmentTree(it_capacity) + self._max_priority = 1.0 + self.beta = beta + + def push(self, *args): + """See ReplayBuffer.store_effect""" + idx = self.position + super().push(*args) + self._it_sum[idx] = self._max_priority ** self._alpha + self._it_min[idx] = self._max_priority ** self._alpha + + def _sample_proportional(self, batch_size): + res = [] + p_total = self._it_sum.sum(0, len(self.buffer) - 1) + every_range_len = p_total / batch_size + for i in range(batch_size): + mass = random.random() * every_range_len + i * every_range_len + idx = self._it_sum.find_prefixsum_idx(mass) + res.append(idx) + return res + + def sample(self, batch_size): + """Sample a batch of experiences""" + idxes = self._sample_proportional(batch_size) + + it_sum = self._it_sum.sum() + p_min = self._it_min.min() / it_sum + max_weight = (p_min * len(self.buffer))**(-self.beta) + + p_samples = np.asarray([self._it_sum[idx] for idx in idxes]) / it_sum + weights = (p_samples * len(self.buffer)) ** (-self.beta) / max_weight + encoded_sample = self._encode_sample(idxes) + return encoded_sample + (weights, idxes) + + def update_priorities(self, idxes, priorities): + """Update priorities of sampled transitions""" + assert len(idxes) == len(priorities) + for idx, priority in zip(idxes, priorities): + assert priority > 0 + assert 0 <= idx < len(self.buffer) + self._it_sum[idx] = priority ** self._alpha + self._it_min[idx] = priority ** self._alpha + + self._max_priority = max(self._max_priority, priority) + + +class HindsightReplayBuffer(ReplayBuffer): + """Hindsight Experience Replay + In this buffer, state is a tuple consists of (observation, goal) + """ + GOAL_FUTURE = 'future' + GOAL_EPISODE = 'episode' + GOAL_RANDOM = 'random' + + def __init__(self, capacity, hindsight_freq, goal_type, reward_func, done_func): + """ + :param hindsight_freq (int): How many hindsight transitions will be generated for each real transition + :param goal_type (str): The generatation method of hindsight goals. 
Should be HER_GOAL_* + :param reward_func (callable): goal (np.array) X next_state (np.array) -> reward (float) + :param done_func (callable): goal (np.array) X next_state (np.array) -> done_flag (bool) + """ + super().__init__(capacity) + self.hindsight_freq = hindsight_freq + self.goal_type = goal_type + self.reward_func = reward_func + self.done_func = done_func + + def _sample_goals(self, episode, t): + goals = [] + episode_len = len(episode) + for _ in range(self.hindsight_freq): + if self.goal_type == HindsightReplayBuffer.GOAL_FUTURE: + index = random.choice(range(t + 1, episode_len)) + source = episode + elif self.goal_type == HindsightReplayBuffer.GOAL_EPISODE: + index = random.choice(range(episode_len)) + source = episode + elif self.goal_type == HindsightReplayBuffer.GOAL_RANDOM: + index = random.choice(range(len(self))) + source = self.buffer + else: + raise ValueError("Invalid goal type %s" % self.goal_type) + goals.append(source[index][0][0]) # return the observation + return goals + + def push(self, *args, **kwargs): + if inspect.stack()[1][3] != 'push_episode': + raise ValueError("Please use `push_episode` methods in HER") + else: + super().push(*args, **kwargs) + + def push_episode(self, states, actions, rewards, next_states, dones): + episode = list(zip(states, actions, rewards, next_states, dones)) + episode_len = len(states) + for t, (state, action, reward, next_state, done) in enumerate(episode): + self.push(state, action, reward, next_state, done) + if self.goal_type == HindsightReplayBuffer.GOAL_FUTURE and t == episode_len - 1: + break + for goal in self._sample_goals(episode, t): + s = (state[0], goal) + a = action + r = self.reward_func(goal, next_state[0]) + s_ = (next_state[0], goal) + d = self.done_func(goal, next_state[0]) + self.push(s, a, r, s_, d) diff --git a/rlzoo/common/build_rlbench_env.py b/rlzoo/common/build_rlbench_env.py old mode 100644 new mode 100755 index 19f6c84..ac1aafb --- a/rlzoo/common/build_rlbench_env.py +++ b/rlzoo/common/build_rlbench_env.py @@ -1,162 +1,162 @@ -import sys -from collections import OrderedDict - -import numpy as np -from gym import spaces - -from pyrep.const import RenderMode -from pyrep.objects.dummy import Dummy -from pyrep.objects.vision_sensor import VisionSensor -from rlbench.environment import Environment -from rlbench.action_modes import ArmActionMode, ActionMode -from rlbench.observation_config import ObservationConfig -from rlbench.tasks import * - - -# Don't forget to add: export PYTHONPATH=PATH_TO_YOUR_LOCAL_RLBENCH_REPO - -# list of state types -state_types = ['left_shoulder_rgb', - 'left_shoulder_depth', - 'left_shoulder_mask', - 'right_shoulder_rgb', - 'right_shoulder_depth', - 'right_shoulder_mask', - 'wrist_rgb', - 'wrist_depth', - 'wrist_mask', - 'joint_velocities', - 'joint_velocities_noise', - 'joint_positions', - 'joint_positions_noise', - 'joint_forces', - 'joint_forces_noise', - 'gripper_pose', - 'gripper_touch_forces', - 'task_low_dim_state'] - - -class RLBenchEnv(): - """ make RLBench env to have same interfaces as openai.gym """ - - def __init__(self, task_name: str, state_type: list = 'state', ): - # render_mode=None): - """ - create RL Bench environment - :param task_name: task names can be found in rlbench.tasks - :param state_type: state or vision or a sub list of state_types list like ['left_shoulder_rgb'] - """ - if state_type == 'state' or state_type == 'vision' or isinstance(state_type, list): - self._state_type = state_type - else: - raise ValueError('State type value error, your value is 
{}'.format(state_type)) - # self._render_mode = render_mode - self._render_mode = None - obs_config = ObservationConfig() - obs_config.set_all(True) - action_mode = ActionMode(ArmActionMode.ABS_JOINT_VELOCITY) - self.env = Environment( - action_mode, obs_config=obs_config, headless=True) - self.env.launch() - try: - self.task = self.env.get_task(getattr(sys.modules[__name__], task_name)) - except: - raise NotImplementedError - - _, obs = self.task.reset() - self.spec = Spec(task_name) - - if self._state_type == 'state': - self.observation_space = spaces.Box( - low=-np.inf, high=np.inf, shape=obs.get_low_dim_data().shape) - elif self._state_type == 'vision': - space_dict = OrderedDict() - space_dict["state"] = spaces.Box( - low=-np.inf, high=np.inf, shape=obs.get_low_dim_data().shape) - for i in ["left_shoulder_rgb", "right_shoulder_rgb", "wrist_rgb", "front_rgb"]: - space_dict[i] = spaces.Box( - low=0, high=1, shape=getattr(obs, i).shape) - self.observation_space = spaces.Dict(space_dict) - else: - space_dict = OrderedDict() - for name in self._state_type: - if name.split('_')[-1] in ('rgb', 'depth', 'mask'): - space_dict[name] = spaces.Box( - low=0, high=1, shape=getattr(obs, name).shape) - else: - space_dict[name] = spaces.Box( - low=-np.inf, high=np.inf, - shape=getattr(obs, name).shape) - self.observation_space = spaces.Dict(space_dict) - self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(self.env.action_size,), dtype=np.float32) - - # if render_mode is not None: - # # Add the camera to the scene - # cam_placeholder = Dummy('cam_cinematic_placeholder') - # self._gym_cam = VisionSensor.create([640, 360]) - # self._gym_cam.set_pose(cam_placeholder.get_pose()) - # if render_mode == 'human': - # self._gym_cam.set_render_mode(RenderMode.OPENGL3_WINDOWED) - # else: - # self._gym_cam.set_render_mode(RenderMode.OPENGL3) - - def _extract_obs(self, obs): - if self._state_type == 'state': - return np.array(obs.get_low_dim_data(), np.float32) - elif self._state_type == 'vision': - return np.array([np.array(obs.get_low_dim_data(), np.float32), - np.array(obs.left_shoulder_rgb, np.float32), - np.array(obs.right_shoulder_rgb, np.float32), - np.array(obs.wrist_rgb, np.float32), - np.array(obs.front_rgb, np.float32), ]) - else: - result = ['tag'] - for name in self._state_type: - result.append(np.array(getattr(obs, name), np.float32)) - return np.delete(np.array(result,), 0, 0) - - def seed(self, seed_value): - # set seed as in openai.gym env - pass - - def render(self, mode='human'): - # todo render available at any time - if self._render_mode is None: - self._render_mode = mode - # Add the camera to the scene - cam_placeholder = Dummy('cam_cinematic_placeholder') - self._gym_cam = VisionSensor.create([640, 360]) - self._gym_cam.set_pose(cam_placeholder.get_pose()) - if mode == 'human': - self._gym_cam.set_render_mode(RenderMode.OPENGL3_WINDOWED) - else: - self._gym_cam.set_render_mode(RenderMode.OPENGL3) - - if mode != self._render_mode: - raise ValueError( - 'The render mode must match the render mode selected in the ' - 'constructor. \nI.e. if you want "human" render mode, then ' - 'create the env by calling: ' - 'gym.make("reach_target-state-v0", render_mode="human").\n' - 'You passed in mode %s, but expected %s.' 
% ( - mode, self._render_mode)) - if mode == 'rgb_array': - return self._gym_cam.capture_rgb() - - def reset(self): - descriptions, obs = self.task.reset() - return self._extract_obs(obs) - - def step(self, action): - obs, reward, terminate = self.task.step(action) - return self._extract_obs(obs), reward, terminate, None - - def close(self): - self.env.shutdown() - - -class Spec(): - """ a fake spec """ - - def __init__(self, id_name): - self.id = id_name +import sys +from collections import OrderedDict + +import numpy as np +from gym import spaces + +from pyrep.const import RenderMode +from pyrep.objects.dummy import Dummy +from pyrep.objects.vision_sensor import VisionSensor +from rlbench.environment import Environment +from rlbench.action_modes import ArmActionMode, ActionMode +from rlbench.observation_config import ObservationConfig +from rlbench.tasks import * + + +# Don't forget to add: export PYTHONPATH=PATH_TO_YOUR_LOCAL_RLBENCH_REPO + +# list of state types +state_types = ['left_shoulder_rgb', + 'left_shoulder_depth', + 'left_shoulder_mask', + 'right_shoulder_rgb', + 'right_shoulder_depth', + 'right_shoulder_mask', + 'wrist_rgb', + 'wrist_depth', + 'wrist_mask', + 'joint_velocities', + 'joint_velocities_noise', + 'joint_positions', + 'joint_positions_noise', + 'joint_forces', + 'joint_forces_noise', + 'gripper_pose', + 'gripper_touch_forces', + 'task_low_dim_state'] + + +class RLBenchEnv(): + """ make RLBench env to have same interfaces as openai.gym """ + + def __init__(self, task_name: str, state_type: list = 'state', ): + # render_mode=None): + """ + create RL Bench environment + :param task_name: task names can be found in rlbench.tasks + :param state_type: state or vision or a sub list of state_types list like ['left_shoulder_rgb'] + """ + if state_type == 'state' or state_type == 'vision' or isinstance(state_type, list): + self._state_type = state_type + else: + raise ValueError('State type value error, your value is {}'.format(state_type)) + # self._render_mode = render_mode + self._render_mode = None + obs_config = ObservationConfig() + obs_config.set_all(True) + action_mode = ActionMode(ArmActionMode.ABS_JOINT_VELOCITY) + self.env = Environment( + action_mode, obs_config=obs_config, headless=True) + self.env.launch() + try: + self.task = self.env.get_task(getattr(sys.modules[__name__], task_name)) + except: + raise NotImplementedError + + _, obs = self.task.reset() + self.spec = Spec(task_name) + + if self._state_type == 'state': + self.observation_space = spaces.Box( + low=-np.inf, high=np.inf, shape=obs.get_low_dim_data().shape) + elif self._state_type == 'vision': + space_dict = OrderedDict() + space_dict["state"] = spaces.Box( + low=-np.inf, high=np.inf, shape=obs.get_low_dim_data().shape) + for i in ["left_shoulder_rgb", "right_shoulder_rgb", "wrist_rgb", "front_rgb"]: + space_dict[i] = spaces.Box( + low=0, high=1, shape=getattr(obs, i).shape) + self.observation_space = spaces.Dict(space_dict) + else: + space_dict = OrderedDict() + for name in self._state_type: + if name.split('_')[-1] in ('rgb', 'depth', 'mask'): + space_dict[name] = spaces.Box( + low=0, high=1, shape=getattr(obs, name).shape) + else: + space_dict[name] = spaces.Box( + low=-np.inf, high=np.inf, + shape=getattr(obs, name).shape) + self.observation_space = spaces.Dict(space_dict) + self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(self.env.action_size,), dtype=np.float32) + + # if render_mode is not None: + # # Add the camera to the scene + # cam_placeholder = 
Dummy('cam_cinematic_placeholder') + # self._gym_cam = VisionSensor.create([640, 360]) + # self._gym_cam.set_pose(cam_placeholder.get_pose()) + # if render_mode == 'human': + # self._gym_cam.set_render_mode(RenderMode.OPENGL3_WINDOWED) + # else: + # self._gym_cam.set_render_mode(RenderMode.OPENGL3) + + def _extract_obs(self, obs): + if self._state_type == 'state': + return np.array(obs.get_low_dim_data(), np.float32) + elif self._state_type == 'vision': + return np.array([np.array(obs.get_low_dim_data(), np.float32), + np.array(obs.left_shoulder_rgb, np.float32), + np.array(obs.right_shoulder_rgb, np.float32), + np.array(obs.wrist_rgb, np.float32), + np.array(obs.front_rgb, np.float32), ]) + else: + result = ['tag'] + for name in self._state_type: + result.append(np.array(getattr(obs, name), np.float32)) + return np.delete(np.array(result,), 0, 0) + + def seed(self, seed_value): + # set seed as in openai.gym env + pass + + def render(self, mode='human'): + # todo render available at any time + if self._render_mode is None: + self._render_mode = mode + # Add the camera to the scene + cam_placeholder = Dummy('cam_cinematic_placeholder') + self._gym_cam = VisionSensor.create([640, 360]) + self._gym_cam.set_pose(cam_placeholder.get_pose()) + if mode == 'human': + self._gym_cam.set_render_mode(RenderMode.OPENGL3_WINDOWED) + else: + self._gym_cam.set_render_mode(RenderMode.OPENGL3) + + if mode != self._render_mode: + raise ValueError( + 'The render mode must match the render mode selected in the ' + 'constructor. \nI.e. if you want "human" render mode, then ' + 'create the env by calling: ' + 'gym.make("reach_target-state-v0", render_mode="human").\n' + 'You passed in mode %s, but expected %s.' % ( + mode, self._render_mode)) + if mode == 'rgb_array': + return self._gym_cam.capture_rgb() + + def reset(self): + descriptions, obs = self.task.reset() + return self._extract_obs(obs) + + def step(self, action): + obs, reward, terminate = self.task.step(action) + return self._extract_obs(obs), reward, terminate, None + + def close(self): + self.env.shutdown() + + +class Spec(): + """ a fake spec """ + + def __init__(self, id_name): + self.id = id_name diff --git a/rlzoo/common/distributions.py b/rlzoo/common/distributions.py old mode 100644 new mode 100755 index 8c95036..b191290 --- a/rlzoo/common/distributions.py +++ b/rlzoo/common/distributions.py @@ -1,207 +1,207 @@ -"""Definition of parametrized distributions. Adapted from openai/baselines""" -import copy -from functools import wraps - -import numpy as np -import tensorflow as tf -from gym import spaces - - -def expand_dims(func): - @wraps(func) - def wrapper(*args, **kwargs): - result = func(*args, **kwargs) - result = tf.expand_dims(result, axis=-1) - return result - - return wrapper - - -class Distribution(object): - """A particular probability distribution""" - - def set_param(self, *args, **kwargs): - raise NotImplementedError - - def sample(self, *args, **kwargs): - """Sampling from distribution. 
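For completeness, a hypothetical usage sketch of the `RLBenchEnv` gym-style wrapper defined above (editorial only, not part of the patch; it requires a local RLBench/PyRep installation and the `PYTHONPATH` export noted in that file, and `ReachTarget` is one of the tasks exported by `rlbench.tasks`):

```python
from rlzoo.common.build_rlbench_env import RLBenchEnv

env = RLBenchEnv('ReachTarget', state_type='state')  # low-dimensional state vector

obs = env.reset()
action = env.action_space.sample()       # random joint-velocity action
obs, reward, done, _ = env.step(action)  # step returns (obs, reward, terminate, None)
env.close()
```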
Allow explore parameters.""" - raise NotImplementedError - - def logp(self, x): - """Calculate log probability of a sample.""" - return -self.neglogp(x) - - def neglogp(self, x): - """Calculate negative log probability of a sample.""" - raise NotImplementedError - - def kl(self, *parameters): - """Calculate Kullback–Leibler divergence""" - raise NotImplementedError - - def entropy(self): - """Calculate the entropy of distribution.""" - raise NotImplementedError - - -class Categorical(Distribution): - """Creates a categorical distribution""" - - def __init__(self, ndim, logits=None): - """ - Args: - ndim (int): total number of actions - logits (tensor): logits variables - """ - self._ndim = ndim - self._logits = logits - self.param = self._logits - - @property - def ndim(self): - return copy.copy(self._ndim) - - def set_param(self, logits): - """ - Args: - logits (tensor): logits variables to set - """ - self._logits = logits - self.param = self._logits - - def get_param(self): - return copy.deepcopy(self._logits) - - def sample(self): - """ Sample actions from distribution, using the Gumbel-Softmax trick """ - u = np.array(np.random.uniform(0, 1, size=np.shape(self._logits)), dtype=np.float32) - res = tf.argmax(self._logits - tf.math.log(-tf.math.log(u)), axis=-1) - return res - - def greedy_sample(self): - """ Get actions greedily """ - _probs = tf.nn.softmax(self._logits) - return tf.argmax(_probs, axis=-1) - - def logp(self, x): - return -self.neglogp(x) - - @expand_dims - def neglogp(self, x): - x = np.array(x) - if np.any(x % 1): - raise ValueError('Input float actions in discrete action space') - x = tf.convert_to_tensor(x, tf.int32) - x = tf.one_hot(x, self._ndim, axis=-1) - return tf.nn.softmax_cross_entropy_with_logits(x, self._logits) - - @expand_dims - def kl(self, logits): - """ - Args: - logits (tensor): logits variables of another distribution - """ - a0 = self._logits - tf.reduce_max(self._logits, axis=-1, keepdims=True) - a1 = logits - tf.reduce_max(logits, axis=-1, keepdims=True) - ea0 = tf.exp(a0) - ea1 = tf.exp(a1) - z0 = tf.reduce_sum(ea0, axis=-1, keepdims=True) - z1 = tf.reduce_sum(ea1, axis=-1, keepdims=True) - p0 = ea0 / z0 - return tf.reduce_sum( - p0 * (a0 - tf.math.log(z0) - a1 + tf.math.log(z1)), axis=-1) - - @expand_dims - def entropy(self): - a0 = self._logits - tf.reduce_max(self._logits, axis=-1, keepdims=True) - ea0 = tf.exp(a0) - z0 = tf.reduce_sum(ea0, axis=-1, keepdims=True) - p0 = ea0 / z0 - return tf.reduce_sum(p0 * (tf.math.log(z0) - a0), axis=-1) - - -class DiagGaussian(Distribution): - """Creates a diagonal Gaussian distribution """ - - def __init__(self, ndim, mean_logstd=None): - """ - Args: - ndim (int): the dimenstion of actions - mean_logstd (tensor): mean and logstd stacked on the last axis - """ - self._ndim = ndim - self.mean = None - self.logstd = None - self.std = None - self.action_mean = None - self.action_scale = None - self.param = self.mean, self.logstd - if mean_logstd is not None: - self.set_param(mean_logstd) - - @property - def ndim(self): - return copy.copy(self._ndim) - - def set_param(self, mean_logstd): - """ - Args: - mean_logstd (tensor): mean and log std - """ - self.mean, self.logstd = mean_logstd - self.std = tf.math.exp(self.logstd) - self.param = self.mean, self.logstd - - def get_param(self): - """ Get parameters """ - return copy.deepcopy(self.mean), copy.deepcopy(self.logstd) - - def sample(self): - """ Get actions in deterministic or stochastic manner """ - return self.mean, self.std * np.random.normal(0, 1, 
np.shape(self.mean)) - - def greedy_sample(self): - """ Get actions greedily/deterministically """ - return self.mean - - def logp(self, x): - return -self.neglogp(x) - - @expand_dims - def neglogp(self, x): - # here we reverse the action normalization to make the computation of negative log probability correct - x = (x - self.action_mean)/self.action_scale - - return 0.5 * tf.reduce_sum(tf.square((x - self.mean) / self.std), axis=-1) \ - + 0.5 * np.log(2.0 * np.pi) * float(self._ndim) + tf.reduce_sum(self.logstd, axis=-1) - - @expand_dims - def kl(self, mean_logstd): - """ - Args: - mean_logstd (tensor): mean and logstd of another distribution - """ - mean, logstd = mean_logstd - return tf.reduce_sum( - logstd - self.logstd + - (tf.square(self.std) + tf.square(self.mean - mean)) - / (2.0 * tf.square(tf.math.exp(logstd))) - 0.5, axis=-1) - - @expand_dims - def entropy(self): - return tf.reduce_sum( - self.logstd + 0.5 * np.log(2.0 * np.pi * np.e), axis=-1) - - -def make_dist(ac_space): - """Get distribution based on action space - - :param ac_space: gym.spaces.Space - """ - if isinstance(ac_space, spaces.Discrete): - return Categorical(ac_space.n) - elif isinstance(ac_space, spaces.Box): - assert len(ac_space.shape) == 1 - return DiagGaussian(ac_space.shape[0]) - else: - raise NotImplementedError +"""Definition of parametrized distributions. Adapted from openai/baselines""" +import copy +from functools import wraps + +import numpy as np +import tensorflow as tf +from gym import spaces + + +def expand_dims(func): + @wraps(func) + def wrapper(*args, **kwargs): + result = func(*args, **kwargs) + result = tf.expand_dims(result, axis=-1) + return result + + return wrapper + + +class Distribution(object): + """A particular probability distribution""" + + def set_param(self, *args, **kwargs): + raise NotImplementedError + + def sample(self, *args, **kwargs): + """Sampling from distribution. 
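As an aside, a minimal sketch of how these distribution objects are typically driven (editorial illustration, not part of the patch): `make_dist` selects the distribution class from the action space, `set_param` installs the network outputs, and `sample`/`logp` are then available.

```python
import numpy as np
import tensorflow as tf
from gym import spaces
from rlzoo.common.distributions import make_dist

dist = make_dist(spaces.Discrete(3))       # 3 discrete actions -> Categorical

logits = tf.constant([[0.5, 1.0, -0.3]])   # e.g. output of a policy network
dist.set_param(logits)

action = dist.sample()                     # stochastic sample, shape (1,)
logp = dist.logp(np.array([1]))            # log-probability of action 1
print(action.numpy(), logp.numpy())
```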
Allow explore parameters.""" + raise NotImplementedError + + def logp(self, x): + """Calculate log probability of a sample.""" + return -self.neglogp(x) + + def neglogp(self, x): + """Calculate negative log probability of a sample.""" + raise NotImplementedError + + def kl(self, *parameters): + """Calculate Kullback–Leibler divergence""" + raise NotImplementedError + + def entropy(self): + """Calculate the entropy of distribution.""" + raise NotImplementedError + + +class Categorical(Distribution): + """Creates a categorical distribution""" + + def __init__(self, ndim, logits=None): + """ + Args: + ndim (int): total number of actions + logits (tensor): logits variables + """ + self._ndim = ndim + self._logits = logits + self.param = self._logits + + @property + def ndim(self): + return copy.copy(self._ndim) + + def set_param(self, logits): + """ + Args: + logits (tensor): logits variables to set + """ + self._logits = logits + self.param = self._logits + + def get_param(self): + return copy.deepcopy(self._logits) + + def sample(self): + """ Sample actions from distribution, using the Gumbel-Softmax trick """ + u = np.array(np.random.uniform(0, 1, size=np.shape(self._logits)), dtype=np.float32) + res = tf.argmax(self._logits - tf.math.log(-tf.math.log(u)), axis=-1) + return res + + def greedy_sample(self): + """ Get actions greedily """ + _probs = tf.nn.softmax(self._logits) + return tf.argmax(_probs, axis=-1) + + def logp(self, x): + return -self.neglogp(x) + + @expand_dims + def neglogp(self, x): + x = np.array(x) + if np.any(x % 1): + raise ValueError('Input float actions in discrete action space') + x = tf.convert_to_tensor(x, tf.int32) + x = tf.one_hot(x, self._ndim, axis=-1) + return tf.nn.softmax_cross_entropy_with_logits(x, self._logits) + + @expand_dims + def kl(self, logits): + """ + Args: + logits (tensor): logits variables of another distribution + """ + a0 = self._logits - tf.reduce_max(self._logits, axis=-1, keepdims=True) + a1 = logits - tf.reduce_max(logits, axis=-1, keepdims=True) + ea0 = tf.exp(a0) + ea1 = tf.exp(a1) + z0 = tf.reduce_sum(ea0, axis=-1, keepdims=True) + z1 = tf.reduce_sum(ea1, axis=-1, keepdims=True) + p0 = ea0 / z0 + return tf.reduce_sum( + p0 * (a0 - tf.math.log(z0) - a1 + tf.math.log(z1)), axis=-1) + + @expand_dims + def entropy(self): + a0 = self._logits - tf.reduce_max(self._logits, axis=-1, keepdims=True) + ea0 = tf.exp(a0) + z0 = tf.reduce_sum(ea0, axis=-1, keepdims=True) + p0 = ea0 / z0 + return tf.reduce_sum(p0 * (tf.math.log(z0) - a0), axis=-1) + + +class DiagGaussian(Distribution): + """Creates a diagonal Gaussian distribution """ + + def __init__(self, ndim, mean_logstd=None): + """ + Args: + ndim (int): the dimenstion of actions + mean_logstd (tensor): mean and logstd stacked on the last axis + """ + self._ndim = ndim + self.mean = None + self.logstd = None + self.std = None + self.action_mean = None + self.action_scale = None + self.param = self.mean, self.logstd + if mean_logstd is not None: + self.set_param(mean_logstd) + + @property + def ndim(self): + return copy.copy(self._ndim) + + def set_param(self, mean_logstd): + """ + Args: + mean_logstd (tensor): mean and log std + """ + self.mean, self.logstd = mean_logstd + self.std = tf.math.exp(self.logstd) + self.param = self.mean, self.logstd + + def get_param(self): + """ Get parameters """ + return copy.deepcopy(self.mean), copy.deepcopy(self.logstd) + + def sample(self): + """ Get actions in deterministic or stochastic manner """ + return self.mean, self.std * np.random.normal(0, 1, 
np.shape(self.mean)) + + def greedy_sample(self): + """ Get actions greedily/deterministically """ + return self.mean + + def logp(self, x): + return -self.neglogp(x) + + @expand_dims + def neglogp(self, x): + # here we reverse the action normalization to make the computation of negative log probability correct + x = (x - self.action_mean)/self.action_scale + + return 0.5 * tf.reduce_sum(tf.square((x - self.mean) / self.std), axis=-1) \ + + 0.5 * np.log(2.0 * np.pi) * float(self._ndim) + tf.reduce_sum(self.logstd, axis=-1) + + @expand_dims + def kl(self, mean_logstd): + """ + Args: + mean_logstd (tensor): mean and logstd of another distribution + """ + mean, logstd = mean_logstd + return tf.reduce_sum( + logstd - self.logstd + + (tf.square(self.std) + tf.square(self.mean - mean)) + / (2.0 * tf.square(tf.math.exp(logstd))) - 0.5, axis=-1) + + @expand_dims + def entropy(self): + return tf.reduce_sum( + self.logstd + 0.5 * np.log(2.0 * np.pi * np.e), axis=-1) + + +def make_dist(ac_space): + """Get distribution based on action space + + :param ac_space: gym.spaces.Space + """ + if isinstance(ac_space, spaces.Discrete): + return Categorical(ac_space.n) + elif isinstance(ac_space, spaces.Box): + assert len(ac_space.shape) == 1 + return DiagGaussian(ac_space.shape[0]) + else: + raise NotImplementedError diff --git a/rlzoo/common/env_list.py b/rlzoo/common/env_list.py old mode 100644 new mode 100755 index 6be577b..540c343 --- a/rlzoo/common/env_list.py +++ b/rlzoo/common/env_list.py @@ -1,902 +1,902 @@ -def get_envlist(env_type): - """ get list of env names wrt the type of env """ - try: - l = all_env_list[env_type] - except: - print('Env Type {:s} Not Found!'.format(env_type)) - return l - - -all_env_list = { - ## Gym - # Atari - 'atari': ['AirRaid-v0', - 'AirRaid-v4', - 'AirRaidDeterministic-v0', - 'AirRaidDeterministic-v4', - 'AirRaidNoFrameskip-v0', - 'AirRaidNoFrameskip-v4', - 'AirRaid-ram-v0', - 'AirRaid-ram-v4', - 'AirRaid-ramDeterministic-v0', - 'AirRaid-ramDeterministic-v4', - 'AirRaid-ramNoFrameskip-v0', - 'AirRaid-ramNoFrameskip-v4', - 'Alien-v0', - 'Alien-v4', - 'AlienDeterministic-v0', - 'AlienDeterministic-v4', - 'AlienNoFrameskip-v0', - 'AlienNoFrameskip-v4', - 'Alien-ram-v0', - 'Alien-ram-v4', - 'Alien-ramDeterministic-v0', - 'Alien-ramDeterministic-v4', - 'Alien-ramNoFrameskip-v0', - 'Alien-ramNoFrameskip-v4', - 'Amidar-v0', - 'Amidar-v4', - 'AmidarDeterministic-v0', - 'AmidarDeterministic-v4', - 'AmidarNoFrameskip-v0', - 'AmidarNoFrameskip-v4', - 'Amidar-ram-v0', - 'Amidar-ram-v4', - 'Amidar-ramDeterministic-v0', - 'Amidar-ramDeterministic-v4', - 'Amidar-ramNoFrameskip-v0', - 'Amidar-ramNoFrameskip-v4', - 'Assault-v0', - 'Assault-v4', - 'AssaultDeterministic-v0', - 'AssaultDeterministic-v4', - 'AssaultNoFrameskip-v0', - 'AssaultNoFrameskip-v4', - 'Assault-ram-v0', - 'Assault-ram-v4', - 'Assault-ramDeterministic-v0', - 'Assault-ramDeterministic-v4', - 'Assault-ramNoFrameskip-v0', - 'Assault-ramNoFrameskip-v4', - 'Asterix-v0', - 'Asterix-v4', - 'AsterixDeterministic-v0', - 'AsterixDeterministic-v4', - 'AsterixNoFrameskip-v0', - 'AsterixNoFrameskip-v4', - 'Asterix-ram-v0', - 'Asterix-ram-v4', - 'Asterix-ramDeterministic-v0', - 'Asterix-ramDeterministic-v4', - 'Asterix-ramNoFrameskip-v0', - 'Asterix-ramNoFrameskip-v4', - 'Asteroids-v0', - 'Asteroids-v4', - 'AsteroidsDeterministic-v0', - 'AsteroidsDeterministic-v4', - 'AsteroidsNoFrameskip-v0', - 'AsteroidsNoFrameskip-v4', - 'Asteroids-ram-v0', - 'Asteroids-ram-v4', - 'Asteroids-ramDeterministic-v0', - 'Asteroids-ramDeterministic-v4', 
- 'Asteroids-ramNoFrameskip-v0', - 'Asteroids-ramNoFrameskip-v4', - 'Atlantis-v0', - 'Atlantis-v4', - 'AtlantisDeterministic-v0', - 'AtlantisDeterministic-v4', - 'AtlantisNoFrameskip-v0', - 'AtlantisNoFrameskip-v4', - 'Atlantis-ram-v0', - 'Atlantis-ram-v4', - 'Atlantis-ramDeterministic-v0', - 'Atlantis-ramDeterministic-v4', - 'Atlantis-ramNoFrameskip-v0', - 'Atlantis-ramNoFrameskip-v4', - 'BankHeist-v0', - 'BankHeist-v4', - 'BankHeistDeterministic-v0', - 'BankHeistDeterministic-v4', - 'BankHeistNoFrameskip-v0', - 'BankHeistNoFrameskip-v4', - 'BankHeist-ram-v0', - 'BankHeist-ram-v4', - 'BankHeist-ramDeterministic-v0', - 'BankHeist-ramDeterministic-v4', - 'BankHeist-ramNoFrameskip-v0', - 'BankHeist-ramNoFrameskip-v4', - 'BattleZone-v0', - 'BattleZone-v4', - 'BattleZoneDeterministic-v0', - 'BattleZoneDeterministic-v4', - 'BattleZoneNoFrameskip-v0', - 'BattleZoneNoFrameskip-v4', - 'BattleZone-ram-v0', - 'BattleZone-ram-v4', - 'BattleZone-ramDeterministic-v0', - 'BattleZone-ramDeterministic-v4', - 'BattleZone-ramNoFrameskip-v0', - 'BattleZone-ramNoFrameskip-v4', - 'BeamRider-v0', - 'BeamRider-v4', - 'BeamRiderDeterministic-v0', - 'BeamRiderDeterministic-v4', - 'BeamRiderNoFrameskip-v0', - 'BeamRiderNoFrameskip-v4', - 'BeamRider-ram-v0', - 'BeamRider-ram-v4', - 'BeamRider-ramDeterministic-v0', - 'BeamRider-ramDeterministic-v4', - 'BeamRider-ramNoFrameskip-v0', - 'BeamRider-ramNoFrameskip-v4', - 'Berzerk-v0', - 'Berzerk-v4', - 'BerzerkDeterministic-v0', - 'BerzerkDeterministic-v4', - 'BerzerkNoFrameskip-v0', - 'BerzerkNoFrameskip-v4', - 'Berzerk-ram-v0', - 'Berzerk-ram-v4', - 'Berzerk-ramDeterministic-v0', - 'Berzerk-ramDeterministic-v4', - 'Berzerk-ramNoFrameskip-v0', - 'Berzerk-ramNoFrameskip-v4', - 'Bowling-v0', - 'Bowling-v4', - 'BowlingDeterministic-v0', - 'BowlingDeterministic-v4', - 'BowlingNoFrameskip-v0', - 'BowlingNoFrameskip-v4', - 'Bowling-ram-v0', - 'Bowling-ram-v4', - 'Bowling-ramDeterministic-v0', - 'Bowling-ramDeterministic-v4', - 'Bowling-ramNoFrameskip-v0', - 'Bowling-ramNoFrameskip-v4', - 'Boxing-v0', - 'Boxing-v4', - 'BoxingDeterministic-v0', - 'BoxingDeterministic-v4', - 'BoxingNoFrameskip-v0', - 'BoxingNoFrameskip-v4', - 'Boxing-ram-v0', - 'Boxing-ram-v4', - 'Boxing-ramDeterministic-v0', - 'Boxing-ramDeterministic-v4', - 'Boxing-ramNoFrameskip-v0', - 'Boxing-ramNoFrameskip-v4', - 'Breakout-v0', - 'Breakout-v4', - 'BreakoutDeterministic-v0', - 'BreakoutDeterministic-v4', - 'BreakoutNoFrameskip-v0', - 'BreakoutNoFrameskip-v4', - 'Breakout-ram-v0', - 'Breakout-ram-v4', - 'Breakout-ramDeterministic-v0', - 'Breakout-ramDeterministic-v4', - 'Breakout-ramNoFrameskip-v0', - 'Breakout-ramNoFrameskip-v4', - 'Carnival-v0', - 'Carnival-v4', - 'CarnivalDeterministic-v0', - 'CarnivalDeterministic-v4', - 'CarnivalNoFrameskip-v0', - 'CarnivalNoFrameskip-v4', - 'Carnival-ram-v0', - 'Carnival-ram-v4', - 'Carnival-ramDeterministic-v0', - 'Carnival-ramDeterministic-v4', - 'Carnival-ramNoFrameskip-v0', - 'Carnival-ramNoFrameskip-v4', - 'Centipede-v0', - 'Centipede-v4', - 'CentipedeDeterministic-v0', - 'CentipedeDeterministic-v4', - 'CentipedeNoFrameskip-v0', - 'CentipedeNoFrameskip-v4', - 'Centipede-ram-v0', - 'Centipede-ram-v4', - 'Centipede-ramDeterministic-v0', - 'Centipede-ramDeterministic-v4', - 'Centipede-ramNoFrameskip-v0', - 'Centipede-ramNoFrameskip-v4', - 'ChopperCommand-v0', - 'ChopperCommand-v4', - 'ChopperCommandDeterministic-v0', - 'ChopperCommandDeterministic-v4', - 'ChopperCommandNoFrameskip-v0', - 'ChopperCommandNoFrameskip-v4', - 'ChopperCommand-ram-v0', - 
'ChopperCommand-ram-v4', - 'ChopperCommand-ramDeterministic-v0', - 'ChopperCommand-ramDeterministic-v4', - 'ChopperCommand-ramNoFrameskip-v0', - 'ChopperCommand-ramNoFrameskip-v4', - 'CrazyClimber-v0', - 'CrazyClimber-v4', - 'CrazyClimberDeterministic-v0', - 'CrazyClimberDeterministic-v4', - 'CrazyClimberNoFrameskip-v0', - 'CrazyClimberNoFrameskip-v4', - 'CrazyClimber-ram-v0', - 'CrazyClimber-ram-v4', - 'CrazyClimber-ramDeterministic-v0', - 'CrazyClimber-ramDeterministic-v4', - 'CrazyClimber-ramNoFrameskip-v0', - 'CrazyClimber-ramNoFrameskip-v4', - 'DemonAttack-v0', - 'DemonAttack-v4', - 'DemonAttackDeterministic-v0', - 'DemonAttackDeterministic-v4', - 'DemonAttackNoFrameskip-v0', - 'DemonAttackNoFrameskip-v4', - 'DemonAttack-ram-v0', - 'DemonAttack-ram-v4', - 'DemonAttack-ramDeterministic-v0', - 'DemonAttack-ramDeterministic-v4', - 'DemonAttack-ramNoFrameskip-v0', - 'DemonAttack-ramNoFrameskip-v4', - 'DoubleDunk-v0', - 'DoubleDunk-v4', - 'DoubleDunkDeterministic-v0', - 'DoubleDunkDeterministic-v4', - 'DoubleDunkNoFrameskip-v0', - 'DoubleDunkNoFrameskip-v4', - 'DoubleDunk-ram-v0', - 'DoubleDunk-ram-v4', - 'DoubleDunk-ramDeterministic-v0', - 'DoubleDunk-ramDeterministic-v4', - 'DoubleDunk-ramNoFrameskip-v0', - 'DoubleDunk-ramNoFrameskip-v4', - 'ElevatorAction-v0', - 'ElevatorAction-v4', - 'ElevatorActionDeterministic-v0', - 'ElevatorActionDeterministic-v4', - 'ElevatorActionNoFrameskip-v0', - 'ElevatorActionNoFrameskip-v4', - 'ElevatorAction-ram-v0', - 'ElevatorAction-ram-v4', - 'ElevatorAction-ramDeterministic-v0', - 'ElevatorAction-ramDeterministic-v4', - 'ElevatorAction-ramNoFrameskip-v0', - 'ElevatorAction-ramNoFrameskip-v4', - 'Enduro-v0', - 'Enduro-v4', - 'EnduroDeterministic-v0', - 'EnduroDeterministic-v4', - 'EnduroNoFrameskip-v0', - 'EnduroNoFrameskip-v4', - 'Enduro-ram-v0', - 'Enduro-ram-v4', - 'Enduro-ramDeterministic-v0', - 'Enduro-ramDeterministic-v4', - 'Enduro-ramNoFrameskip-v0', - 'Enduro-ramNoFrameskip-v4', - 'FishingDerby-v0', - 'FishingDerby-v4', - 'FishingDerbyDeterministic-v0', - 'FishingDerbyDeterministic-v4', - 'FishingDerbyNoFrameskip-v0', - 'FishingDerbyNoFrameskip-v4', - 'FishingDerby-ram-v0', - 'FishingDerby-ram-v4', - 'FishingDerby-ramDeterministic-v0', - 'FishingDerby-ramDeterministic-v4', - 'FishingDerby-ramNoFrameskip-v0', - 'FishingDerby-ramNoFrameskip-v4', - 'Freeway-v0', - 'Freeway-v4', - 'FreewayDeterministic-v0', - 'FreewayDeterministic-v4', - 'FreewayNoFrameskip-v0', - 'FreewayNoFrameskip-v4', - 'Freeway-ram-v0', - 'Freeway-ram-v4', - 'Freeway-ramDeterministic-v0', - 'Freeway-ramDeterministic-v4', - 'Freeway-ramNoFrameskip-v0', - 'Freeway-ramNoFrameskip-v4', - 'Frostbite-v0', - 'Frostbite-v4', - 'FrostbiteDeterministic-v0', - 'FrostbiteDeterministic-v4', - 'FrostbiteNoFrameskip-v0', - 'FrostbiteNoFrameskip-v4', - 'Frostbite-ram-v0', - 'Frostbite-ram-v4', - 'Frostbite-ramDeterministic-v0', - 'Frostbite-ramDeterministic-v4', - 'Frostbite-ramNoFrameskip-v0', - 'Frostbite-ramNoFrameskip-v4', - 'Gopher-v0', - 'Gopher-v4', - 'GopherDeterministic-v0', - 'GopherDeterministic-v4', - 'GopherNoFrameskip-v0', - 'GopherNoFrameskip-v4', - 'Gopher-ram-v0', - 'Gopher-ram-v4', - 'Gopher-ramDeterministic-v0', - 'Gopher-ramDeterministic-v4', - 'Gopher-ramNoFrameskip-v0', - 'Gopher-ramNoFrameskip-v4', - 'Gravitar-v0', - 'Gravitar-v4', - 'GravitarDeterministic-v0', - 'GravitarDeterministic-v4', - 'GravitarNoFrameskip-v0', - 'GravitarNoFrameskip-v4', - 'Gravitar-ram-v0', - 'Gravitar-ram-v4', - 'Gravitar-ramDeterministic-v0', - 'Gravitar-ramDeterministic-v4', - 
'Gravitar-ramNoFrameskip-v0', - 'Gravitar-ramNoFrameskip-v4', - 'Hero-v0', - 'Hero-v4', - 'HeroDeterministic-v0', - 'HeroDeterministic-v4', - 'HeroNoFrameskip-v0', - 'HeroNoFrameskip-v4', - 'Hero-ram-v0', - 'Hero-ram-v4', - 'Hero-ramDeterministic-v0', - 'Hero-ramDeterministic-v4', - 'Hero-ramNoFrameskip-v0', - 'Hero-ramNoFrameskip-v4', - 'IceHockey-v0', - 'IceHockey-v4', - 'IceHockeyDeterministic-v0', - 'IceHockeyDeterministic-v4', - 'IceHockeyNoFrameskip-v0', - 'IceHockeyNoFrameskip-v4', - 'IceHockey-ram-v0', - 'IceHockey-ram-v4', - 'IceHockey-ramDeterministic-v0', - 'IceHockey-ramDeterministic-v4', - 'IceHockey-ramNoFrameskip-v0', - 'IceHockey-ramNoFrameskip-v4', - 'Jamesbond-v0', - 'Jamesbond-v4', - 'JamesbondDeterministic-v0', - 'JamesbondDeterministic-v4', - 'JamesbondNoFrameskip-v0', - 'JamesbondNoFrameskip-v4', - 'Jamesbond-ram-v0', - 'Jamesbond-ram-v4', - 'Jamesbond-ramDeterministic-v0', - 'Jamesbond-ramDeterministic-v4', - 'Jamesbond-ramNoFrameskip-v0', - 'Jamesbond-ramNoFrameskip-v4', - 'JourneyEscape-v0', - 'JourneyEscape-v4', - 'JourneyEscapeDeterministic-v0', - 'JourneyEscapeDeterministic-v4', - 'JourneyEscapeNoFrameskip-v0', - 'JourneyEscapeNoFrameskip-v4', - 'JourneyEscape-ram-v0', - 'JourneyEscape-ram-v4', - 'JourneyEscape-ramDeterministic-v0', - 'JourneyEscape-ramDeterministic-v4', - 'JourneyEscape-ramNoFrameskip-v0', - 'JourneyEscape-ramNoFrameskip-v4', - 'Kangaroo-v0', - 'Kangaroo-v4', - 'KangarooDeterministic-v0', - 'KangarooDeterministic-v4', - 'KangarooNoFrameskip-v0', - 'KangarooNoFrameskip-v4', - 'Kangaroo-ram-v0', - 'Kangaroo-ram-v4', - 'Kangaroo-ramDeterministic-v0', - 'Kangaroo-ramDeterministic-v4', - 'Kangaroo-ramNoFrameskip-v0', - 'Kangaroo-ramNoFrameskip-v4', - 'Krull-v0', - 'Krull-v4', - 'KrullDeterministic-v0', - 'KrullDeterministic-v4', - 'KrullNoFrameskip-v0', - 'KrullNoFrameskip-v4', - 'Krull-ram-v0', - 'Krull-ram-v4', - 'Krull-ramDeterministic-v0', - 'Krull-ramDeterministic-v4', - 'Krull-ramNoFrameskip-v0', - 'Krull-ramNoFrameskip-v4', - 'KungFuMaster-v0', - 'KungFuMaster-v4', - 'KungFuMasterDeterministic-v0', - 'KungFuMasterDeterministic-v4', - 'KungFuMasterNoFrameskip-v0', - 'KungFuMasterNoFrameskip-v4', - 'KungFuMaster-ram-v0', - 'KungFuMaster-ram-v4', - 'KungFuMaster-ramDeterministic-v0', - 'KungFuMaster-ramDeterministic-v4', - 'KungFuMaster-ramNoFrameskip-v0', - 'KungFuMaster-ramNoFrameskip-v4', - 'MontezumaRevenge-v0', - 'MontezumaRevenge-v4', - 'MontezumaRevengeDeterministic-v0', - 'MontezumaRevengeDeterministic-v4', - 'MontezumaRevengeNoFrameskip-v0', - 'MontezumaRevengeNoFrameskip-v4', - 'MontezumaRevenge-ram-v0', - 'MontezumaRevenge-ram-v4', - 'MontezumaRevenge-ramDeterministic-v0', - 'MontezumaRevenge-ramDeterministic-v4', - 'MontezumaRevenge-ramNoFrameskip-v0', - 'MontezumaRevenge-ramNoFrameskip-v4', - 'MsPacman-v0', - 'MsPacman-v4', - 'MsPacmanDeterministic-v0', - 'MsPacmanDeterministic-v4', - 'MsPacmanNoFrameskip-v0', - 'MsPacmanNoFrameskip-v4', - 'MsPacman-ram-v0', - 'MsPacman-ram-v4', - 'MsPacman-ramDeterministic-v0', - 'MsPacman-ramDeterministic-v4', - 'MsPacman-ramNoFrameskip-v0', - 'MsPacman-ramNoFrameskip-v4', - 'NameThisGame-v0', - 'NameThisGame-v4', - 'NameThisGameDeterministic-v0', - 'NameThisGameDeterministic-v4', - 'NameThisGameNoFrameskip-v0', - 'NameThisGameNoFrameskip-v4', - 'NameThisGame-ram-v0', - 'NameThisGame-ram-v4', - 'NameThisGame-ramDeterministic-v0', - 'NameThisGame-ramDeterministic-v4', - 'NameThisGame-ramNoFrameskip-v0', - 'NameThisGame-ramNoFrameskip-v4', - 'Phoenix-v0', - 'Phoenix-v4', - 'PhoenixDeterministic-v0', 
- 'PhoenixDeterministic-v4', - 'PhoenixNoFrameskip-v0', - 'PhoenixNoFrameskip-v4', - 'Phoenix-ram-v0', - 'Phoenix-ram-v4', - 'Phoenix-ramDeterministic-v0', - 'Phoenix-ramDeterministic-v4', - 'Phoenix-ramNoFrameskip-v0', - 'Phoenix-ramNoFrameskip-v4', - 'Pitfall-v0', - 'Pitfall-v4', - 'PitfallDeterministic-v0', - 'PitfallDeterministic-v4', - 'PitfallNoFrameskip-v0', - 'PitfallNoFrameskip-v4', - 'Pitfall-ram-v0', - 'Pitfall-ram-v4', - 'Pitfall-ramDeterministic-v0', - 'Pitfall-ramDeterministic-v4', - 'Pitfall-ramNoFrameskip-v0', - 'Pitfall-ramNoFrameskip-v4', - 'Pong-v0', - 'Pong-v4', - 'PongDeterministic-v0', - 'PongDeterministic-v4', - 'PongNoFrameskip-v0', - 'PongNoFrameskip-v4', - 'Pong-ram-v0', - 'Pong-ram-v4', - 'Pong-ramDeterministic-v0', - 'Pong-ramDeterministic-v4', - 'Pong-ramNoFrameskip-v0', - 'Pong-ramNoFrameskip-v4', - 'Pooyan-v0', - 'Pooyan-v4', - 'PooyanDeterministic-v0', - 'PooyanDeterministic-v4', - 'PooyanNoFrameskip-v0', - 'PooyanNoFrameskip-v4', - 'Pooyan-ram-v0', - 'Pooyan-ram-v4', - 'Pooyan-ramDeterministic-v0', - 'Pooyan-ramDeterministic-v4', - 'Pooyan-ramNoFrameskip-v0', - 'Pooyan-ramNoFrameskip-v4', - 'PrivateEye-v0', - 'PrivateEye-v4', - 'PrivateEyeDeterministic-v0', - 'PrivateEyeDeterministic-v4', - 'PrivateEyeNoFrameskip-v0', - 'PrivateEyeNoFrameskip-v4', - 'PrivateEye-ram-v0', - 'PrivateEye-ram-v4', - 'PrivateEye-ramDeterministic-v0', - 'PrivateEye-ramDeterministic-v4', - 'PrivateEye-ramNoFrameskip-v0', - 'PrivateEye-ramNoFrameskip-v4', - 'Qbert-v0', - 'Qbert-v4', - 'QbertDeterministic-v0', - 'QbertDeterministic-v4', - 'QbertNoFrameskip-v0', - 'QbertNoFrameskip-v4', - 'Qbert-ram-v0', - 'Qbert-ram-v4', - 'Qbert-ramDeterministic-v0', - 'Qbert-ramDeterministic-v4', - 'Qbert-ramNoFrameskip-v0', - 'Qbert-ramNoFrameskip-v4', - 'Riverraid-v0', - 'Riverraid-v4', - 'RiverraidDeterministic-v0', - 'RiverraidDeterministic-v4', - 'RiverraidNoFrameskip-v0', - 'RiverraidNoFrameskip-v4', - 'Riverraid-ram-v0', - 'Riverraid-ram-v4', - 'Riverraid-ramDeterministic-v0', - 'Riverraid-ramDeterministic-v4', - 'Riverraid-ramNoFrameskip-v0', - 'Riverraid-ramNoFrameskip-v4', - 'RoadRunner-v0', - 'RoadRunner-v4', - 'RoadRunnerDeterministic-v0', - 'RoadRunnerDeterministic-v4', - 'RoadRunnerNoFrameskip-v0', - 'RoadRunnerNoFrameskip-v4', - 'RoadRunner-ram-v0', - 'RoadRunner-ram-v4', - 'RoadRunner-ramDeterministic-v0', - 'RoadRunner-ramDeterministic-v4', - 'RoadRunner-ramNoFrameskip-v0', - 'RoadRunner-ramNoFrameskip-v4', - 'Robotank-v0', - 'Robotank-v4', - 'RobotankDeterministic-v0', - 'RobotankDeterministic-v4', - 'RobotankNoFrameskip-v0', - 'RobotankNoFrameskip-v4', - 'Robotank-ram-v0', - 'Robotank-ram-v4', - 'Robotank-ramDeterministic-v0', - 'Robotank-ramDeterministic-v4', - 'Robotank-ramNoFrameskip-v0', - 'Robotank-ramNoFrameskip-v4', - 'Seaquest-v0', - 'Seaquest-v4', - 'SeaquestDeterministic-v0', - 'SeaquestDeterministic-v4', - 'SeaquestNoFrameskip-v0', - 'SeaquestNoFrameskip-v4', - 'Seaquest-ram-v0', - 'Seaquest-ram-v4', - 'Seaquest-ramDeterministic-v0', - 'Seaquest-ramDeterministic-v4', - 'Seaquest-ramNoFrameskip-v0', - 'Seaquest-ramNoFrameskip-v4', - 'Skiing-v0', - 'Skiing-v4', - 'SkiingDeterministic-v0', - 'SkiingDeterministic-v4', - 'SkiingNoFrameskip-v0', - 'SkiingNoFrameskip-v4', - 'Skiing-ram-v0', - 'Skiing-ram-v4', - 'Skiing-ramDeterministic-v0', - 'Skiing-ramDeterministic-v4', - 'Skiing-ramNoFrameskip-v0', - 'Skiing-ramNoFrameskip-v4', - 'Solaris-v0', - 'Solaris-v4', - 'SolarisDeterministic-v0', - 'SolarisDeterministic-v4', - 'SolarisNoFrameskip-v0', - 'SolarisNoFrameskip-v4', - 
'Solaris-ram-v0', - 'Solaris-ram-v4', - 'Solaris-ramDeterministic-v0', - 'Solaris-ramDeterministic-v4', - 'Solaris-ramNoFrameskip-v0', - 'Solaris-ramNoFrameskip-v4', - 'SpaceInvaders-v0', - 'SpaceInvaders-v4', - 'SpaceInvadersDeterministic-v0', - 'SpaceInvadersDeterministic-v4', - 'SpaceInvadersNoFrameskip-v0', - 'SpaceInvadersNoFrameskip-v4', - 'SpaceInvaders-ram-v0', - 'SpaceInvaders-ram-v4', - 'SpaceInvaders-ramDeterministic-v0', - 'SpaceInvaders-ramDeterministic-v4', - 'SpaceInvaders-ramNoFrameskip-v0', - 'SpaceInvaders-ramNoFrameskip-v4', - 'StarGunner-v0', - 'StarGunner-v4', - 'StarGunnerDeterministic-v0', - 'StarGunnerDeterministic-v4', - 'StarGunnerNoFrameskip-v0', - 'StarGunnerNoFrameskip-v4', - 'StarGunner-ram-v0', - 'StarGunner-ram-v4', - 'StarGunner-ramDeterministic-v0', - 'StarGunner-ramDeterministic-v4', - 'StarGunner-ramNoFrameskip-v0', - 'StarGunner-ramNoFrameskip-v4', - 'Tennis-v0', - 'Tennis-v4', - 'TennisDeterministic-v0', - 'TennisDeterministic-v4', - 'TennisNoFrameskip-v0', - 'TennisNoFrameskip-v4', - 'Tennis-ram-v0', - 'Tennis-ram-v4', - 'Tennis-ramDeterministic-v0', - 'Tennis-ramDeterministic-v4', - 'Tennis-ramNoFrameskip-v0', - 'Tennis-ramNoFrameskip-v4', - 'TimePilot-v0', - 'TimePilot-v4', - 'TimePilotDeterministic-v0', - 'TimePilotDeterministic-v4', - 'TimePilotNoFrameskip-v0', - 'TimePilotNoFrameskip-v4', - 'TimePilot-ram-v0', - 'TimePilot-ram-v4', - 'TimePilot-ramDeterministic-v0', - 'TimePilot-ramDeterministic-v4', - 'TimePilot-ramNoFrameskip-v0', - 'TimePilot-ramNoFrameskip-v4', - 'Tutankham-v0', - 'Tutankham-v4', - 'TutankhamDeterministic-v0', - 'TutankhamDeterministic-v4', - 'TutankhamNoFrameskip-v0', - 'TutankhamNoFrameskip-v4', - 'Tutankham-ram-v0', - 'Tutankham-ram-v4', - 'Tutankham-ramDeterministic-v0', - 'Tutankham-ramDeterministic-v4', - 'Tutankham-ramNoFrameskip-v0', - 'Tutankham-ramNoFrameskip-v4', - 'UpNDown-v0', - 'UpNDown-v4', - 'UpNDownDeterministic-v0', - 'UpNDownDeterministic-v4', - 'UpNDownNoFrameskip-v0', - 'UpNDownNoFrameskip-v4', - 'UpNDown-ram-v0', - 'UpNDown-ram-v4', - 'UpNDown-ramDeterministic-v0', - 'UpNDown-ramDeterministic-v4', - 'UpNDown-ramNoFrameskip-v0', - 'UpNDown-ramNoFrameskip-v4', - 'Venture-v0', - 'Venture-v4', - 'VentureDeterministic-v0', - 'VentureDeterministic-v4', - 'VentureNoFrameskip-v0', - 'VentureNoFrameskip-v4', - 'Venture-ram-v0', - 'Venture-ram-v4', - 'Venture-ramDeterministic-v0', - 'Venture-ramDeterministic-v4', - 'Venture-ramNoFrameskip-v0', - 'Venture-ramNoFrameskip-v4', - 'VideoPinball-v0', - 'VideoPinball-v4', - 'VideoPinballDeterministic-v0', - 'VideoPinballDeterministic-v4', - 'VideoPinballNoFrameskip-v0', - 'VideoPinballNoFrameskip-v4', - 'VideoPinball-ram-v0', - 'VideoPinball-ram-v4', - 'VideoPinball-ramDeterministic-v0', - 'VideoPinball-ramDeterministic-v4', - 'VideoPinball-ramNoFrameskip-v0', - 'VideoPinball-ramNoFrameskip-v4', - 'WizardOfWor-v0', - 'WizardOfWor-v4', - 'WizardOfWorDeterministic-v0', - 'WizardOfWorDeterministic-v4', - 'WizardOfWorNoFrameskip-v0', - 'WizardOfWorNoFrameskip-v4', - 'WizardOfWor-ram-v0', - 'WizardOfWor-ram-v4', - 'WizardOfWor-ramDeterministic-v0', - 'WizardOfWor-ramDeterministic-v4', - 'WizardOfWor-ramNoFrameskip-v0', - 'WizardOfWor-ramNoFrameskip-v4', - 'YarsRevenge-v0', - 'YarsRevenge-v4', - 'YarsRevengeDeterministic-v0', - 'YarsRevengeDeterministic-v4', - 'YarsRevengeNoFrameskip-v0', - 'YarsRevengeNoFrameskip-v4', - 'YarsRevenge-ram-v0', - 'YarsRevenge-ram-v4', - 'YarsRevenge-ramDeterministic-v0', - 'YarsRevenge-ramDeterministic-v4', - 'YarsRevenge-ramNoFrameskip-v0', - 
'YarsRevenge-ramNoFrameskip-v4', - 'Zaxxon-v0', - 'Zaxxon-v4', - 'ZaxxonDeterministic-v0', - 'ZaxxonDeterministic-v4', - 'ZaxxonNoFrameskip-v0', - 'ZaxxonNoFrameskip-v4', - 'Zaxxon-ram-v0', - 'Zaxxon-ram-v4', - 'Zaxxon-ramDeterministic-v0', - 'Zaxxon-ramDeterministic-v4', - 'Zaxxon-ramNoFrameskip-v0', - 'Zaxxon-ramNoFrameskip-v4'], - - # Classic control - 'classic_control': [ - 'Acrobot-v1', - 'CartPole-v1', - 'CartPole-v0', - 'MountainCar-v0', - 'MountainCarContinuous-v0', - 'Pendulum-v0' - ], - - # Box2D - 'box2d': [ - 'BipedalWalker-v2', - 'BipedalWalkerHardcore-v2', - 'CarRacing-v0', - 'LunarLander-v2', - 'LunarLanderContinuous-v2' - ], - - # MuJoCo - 'mujoco': [ - 'Ant-v2', - 'HalfCheetah-v2', - 'Hopper-v2', - 'Humanoid-v2', - 'HumanoidStandup-v2', - 'InvertedDoublePendulum-v2', - 'InvertedPendulum-v2', - 'Reacher-v2', - 'Swimmer-v2', - 'Walker2d-v2' - ], - - # Robotics - 'robotics': [ - 'FetchPickAndPlace-v1', - 'FetchPush-v1', - 'FetchReach-v1', - 'FetchSlide-v1', - 'HandManipulateBlock-v0', - 'HandManipulateEgg-v0', - 'HandManipulatePen-v0', - 'HandReach-v0' - ], - - ## Deepmind Control Suite (need check!) - 'dm_control': [ - 'AcrobotSparse-v0', - 'BallincupCatch-v0', - 'CartpoleSwingup-v0', - 'FingerTurn-v0', - 'FishSwim-v0', - 'CheetahRun-v0', - 'HopperHop-v0', - 'HumanoidStand-v0', - 'HumanoidWalk-v0', - 'HumanoidRun-v0', - 'ManipulatorBringball-v0', - 'PendulumSwingup-v0', - 'Pointmass-v0', - 'ReacherHard-v0', - 'Swimmer-v0', - 'WalkerRun-v0' - ], - - ## RLBench - 'rlbench': [ - 'BeatTheBuzz', - 'BlockPyramid', - 'ChangeChannel', - 'ChangeClock', - 'CloseBox', - 'CloseDoor', - 'CloseDrawer', - 'CloseFridge', - 'CloseGrill', - 'CloseJar', - 'CloseLaptopLid', - 'CloseMicrowave', - 'EmptyContainer', - 'EmptyDishwasher', - 'GetIceFromFridge', - 'HangFrameOnHanger', - 'HannoiSquare', - 'HitBallWithQueue', - 'Hockey', - 'InsertUsbInComputer', - 'LampOff', - 'LampOn', - 'LightBulbIn', - 'LightBulbOut', - 'MeatOffGrill', - 'MeatOnGrill', - 'MoveHanger', - 'OpenBox', - 'OpenDoor', - 'OpenDrawer', - 'OpenFridge', - 'OpenGrill', - 'OpenJar', - 'OpenMicrowave', - 'OpenOven', - 'OpenWindow', - 'OpenWineBottle', - 'PhoneOnBase', - 'PickAndLift', - 'PickUpCup', - 'PlaceCups', - 'PlaceHangerOnRack', - 'PlaceShapeInShapeSorter', - 'PlayJenga', - 'PlugChargerInPowerSupply', - 'PourFromCupToCup', - 'PressSwitch', - 'PushButton', - 'PushButtons', - 'PutBooksOnBookshelf', - 'PutBottleInFridge', - 'PutGroceriesInCupboard', - 'PutItemInDrawer', - 'PutKnifeInKnifeBlock', - 'PutKnifeOnChoppingBoard', - 'PutMoneyInSafe', - 'PutPlateInColoredDishRack', - 'PutRubbishInBin', - 'PutShoesInBox', - 'PutToiletRollOnStand', - 'PutTrayInOven', - 'PutUmbrellaInUmbrellaStand', - 'ReachAndDrag', - 'ReachTarget', - 'RemoveCups', - 'ScoopWithSpatula', - 'ScrewNail', - 'SetTheTable', - 'SetupCheckers', - 'SlideBlockToTarget', - 'SlideCabinetOpen', - 'SlideCabinetOpenAndPlaceCups', - 'SolvePuzzle', - 'StackBlocks', - 'StackCups', - 'StackWine', - 'StraightenRope', - 'SweepToDustpan', - 'TakeCupOutFromCabinet', - 'TakeFrameOffHanger', - 'TakeItemOutOfDrawer', - 'TakeLidOffSaucepan', - 'TakeMoneyOutSafe', - 'TakeOffWeighingScales', - 'TakePlateOffColoredDishRack', - 'TakeShoesOutOfBox', - 'TakeToiletRollOffStand', - 'TakeTrayOutOfOven', - 'TakeUmbrellaOutOfUmbrellaStand', - 'TakeUsbOutOfComputer', - 'ToiletSeatDown', - 'ToiletSeatUp', - 'TurnOvenOn', - 'TurnTap', - 'TvOff', - 'TvOn', - 'UnplugCharger', - 'WaterPlants', - 'WeighingScales', - 'WipeDesk' - ] -} +def get_envlist(env_type): + """ get list of env names wrt the 
type of env """ + try: + l = all_env_list[env_type] + except: + print('Env Type {:s} Not Found!'.format(env_type)) + return l + + +all_env_list = { + ## Gym + # Atari + 'atari': ['AirRaid-v0', + 'AirRaid-v4', + 'AirRaidDeterministic-v0', + 'AirRaidDeterministic-v4', + 'AirRaidNoFrameskip-v0', + 'AirRaidNoFrameskip-v4', + 'AirRaid-ram-v0', + 'AirRaid-ram-v4', + 'AirRaid-ramDeterministic-v0', + 'AirRaid-ramDeterministic-v4', + 'AirRaid-ramNoFrameskip-v0', + 'AirRaid-ramNoFrameskip-v4', + 'Alien-v0', + 'Alien-v4', + 'AlienDeterministic-v0', + 'AlienDeterministic-v4', + 'AlienNoFrameskip-v0', + 'AlienNoFrameskip-v4', + 'Alien-ram-v0', + 'Alien-ram-v4', + 'Alien-ramDeterministic-v0', + 'Alien-ramDeterministic-v4', + 'Alien-ramNoFrameskip-v0', + 'Alien-ramNoFrameskip-v4', + 'Amidar-v0', + 'Amidar-v4', + 'AmidarDeterministic-v0', + 'AmidarDeterministic-v4', + 'AmidarNoFrameskip-v0', + 'AmidarNoFrameskip-v4', + 'Amidar-ram-v0', + 'Amidar-ram-v4', + 'Amidar-ramDeterministic-v0', + 'Amidar-ramDeterministic-v4', + 'Amidar-ramNoFrameskip-v0', + 'Amidar-ramNoFrameskip-v4', + 'Assault-v0', + 'Assault-v4', + 'AssaultDeterministic-v0', + 'AssaultDeterministic-v4', + 'AssaultNoFrameskip-v0', + 'AssaultNoFrameskip-v4', + 'Assault-ram-v0', + 'Assault-ram-v4', + 'Assault-ramDeterministic-v0', + 'Assault-ramDeterministic-v4', + 'Assault-ramNoFrameskip-v0', + 'Assault-ramNoFrameskip-v4', + 'Asterix-v0', + 'Asterix-v4', + 'AsterixDeterministic-v0', + 'AsterixDeterministic-v4', + 'AsterixNoFrameskip-v0', + 'AsterixNoFrameskip-v4', + 'Asterix-ram-v0', + 'Asterix-ram-v4', + 'Asterix-ramDeterministic-v0', + 'Asterix-ramDeterministic-v4', + 'Asterix-ramNoFrameskip-v0', + 'Asterix-ramNoFrameskip-v4', + 'Asteroids-v0', + 'Asteroids-v4', + 'AsteroidsDeterministic-v0', + 'AsteroidsDeterministic-v4', + 'AsteroidsNoFrameskip-v0', + 'AsteroidsNoFrameskip-v4', + 'Asteroids-ram-v0', + 'Asteroids-ram-v4', + 'Asteroids-ramDeterministic-v0', + 'Asteroids-ramDeterministic-v4', + 'Asteroids-ramNoFrameskip-v0', + 'Asteroids-ramNoFrameskip-v4', + 'Atlantis-v0', + 'Atlantis-v4', + 'AtlantisDeterministic-v0', + 'AtlantisDeterministic-v4', + 'AtlantisNoFrameskip-v0', + 'AtlantisNoFrameskip-v4', + 'Atlantis-ram-v0', + 'Atlantis-ram-v4', + 'Atlantis-ramDeterministic-v0', + 'Atlantis-ramDeterministic-v4', + 'Atlantis-ramNoFrameskip-v0', + 'Atlantis-ramNoFrameskip-v4', + 'BankHeist-v0', + 'BankHeist-v4', + 'BankHeistDeterministic-v0', + 'BankHeistDeterministic-v4', + 'BankHeistNoFrameskip-v0', + 'BankHeistNoFrameskip-v4', + 'BankHeist-ram-v0', + 'BankHeist-ram-v4', + 'BankHeist-ramDeterministic-v0', + 'BankHeist-ramDeterministic-v4', + 'BankHeist-ramNoFrameskip-v0', + 'BankHeist-ramNoFrameskip-v4', + 'BattleZone-v0', + 'BattleZone-v4', + 'BattleZoneDeterministic-v0', + 'BattleZoneDeterministic-v4', + 'BattleZoneNoFrameskip-v0', + 'BattleZoneNoFrameskip-v4', + 'BattleZone-ram-v0', + 'BattleZone-ram-v4', + 'BattleZone-ramDeterministic-v0', + 'BattleZone-ramDeterministic-v4', + 'BattleZone-ramNoFrameskip-v0', + 'BattleZone-ramNoFrameskip-v4', + 'BeamRider-v0', + 'BeamRider-v4', + 'BeamRiderDeterministic-v0', + 'BeamRiderDeterministic-v4', + 'BeamRiderNoFrameskip-v0', + 'BeamRiderNoFrameskip-v4', + 'BeamRider-ram-v0', + 'BeamRider-ram-v4', + 'BeamRider-ramDeterministic-v0', + 'BeamRider-ramDeterministic-v4', + 'BeamRider-ramNoFrameskip-v0', + 'BeamRider-ramNoFrameskip-v4', + 'Berzerk-v0', + 'Berzerk-v4', + 'BerzerkDeterministic-v0', + 'BerzerkDeterministic-v4', + 'BerzerkNoFrameskip-v0', + 'BerzerkNoFrameskip-v4', + 'Berzerk-ram-v0', + 
'Berzerk-ram-v4', + 'Berzerk-ramDeterministic-v0', + 'Berzerk-ramDeterministic-v4', + 'Berzerk-ramNoFrameskip-v0', + 'Berzerk-ramNoFrameskip-v4', + 'Bowling-v0', + 'Bowling-v4', + 'BowlingDeterministic-v0', + 'BowlingDeterministic-v4', + 'BowlingNoFrameskip-v0', + 'BowlingNoFrameskip-v4', + 'Bowling-ram-v0', + 'Bowling-ram-v4', + 'Bowling-ramDeterministic-v0', + 'Bowling-ramDeterministic-v4', + 'Bowling-ramNoFrameskip-v0', + 'Bowling-ramNoFrameskip-v4', + 'Boxing-v0', + 'Boxing-v4', + 'BoxingDeterministic-v0', + 'BoxingDeterministic-v4', + 'BoxingNoFrameskip-v0', + 'BoxingNoFrameskip-v4', + 'Boxing-ram-v0', + 'Boxing-ram-v4', + 'Boxing-ramDeterministic-v0', + 'Boxing-ramDeterministic-v4', + 'Boxing-ramNoFrameskip-v0', + 'Boxing-ramNoFrameskip-v4', + 'Breakout-v0', + 'Breakout-v4', + 'BreakoutDeterministic-v0', + 'BreakoutDeterministic-v4', + 'BreakoutNoFrameskip-v0', + 'BreakoutNoFrameskip-v4', + 'Breakout-ram-v0', + 'Breakout-ram-v4', + 'Breakout-ramDeterministic-v0', + 'Breakout-ramDeterministic-v4', + 'Breakout-ramNoFrameskip-v0', + 'Breakout-ramNoFrameskip-v4', + 'Carnival-v0', + 'Carnival-v4', + 'CarnivalDeterministic-v0', + 'CarnivalDeterministic-v4', + 'CarnivalNoFrameskip-v0', + 'CarnivalNoFrameskip-v4', + 'Carnival-ram-v0', + 'Carnival-ram-v4', + 'Carnival-ramDeterministic-v0', + 'Carnival-ramDeterministic-v4', + 'Carnival-ramNoFrameskip-v0', + 'Carnival-ramNoFrameskip-v4', + 'Centipede-v0', + 'Centipede-v4', + 'CentipedeDeterministic-v0', + 'CentipedeDeterministic-v4', + 'CentipedeNoFrameskip-v0', + 'CentipedeNoFrameskip-v4', + 'Centipede-ram-v0', + 'Centipede-ram-v4', + 'Centipede-ramDeterministic-v0', + 'Centipede-ramDeterministic-v4', + 'Centipede-ramNoFrameskip-v0', + 'Centipede-ramNoFrameskip-v4', + 'ChopperCommand-v0', + 'ChopperCommand-v4', + 'ChopperCommandDeterministic-v0', + 'ChopperCommandDeterministic-v4', + 'ChopperCommandNoFrameskip-v0', + 'ChopperCommandNoFrameskip-v4', + 'ChopperCommand-ram-v0', + 'ChopperCommand-ram-v4', + 'ChopperCommand-ramDeterministic-v0', + 'ChopperCommand-ramDeterministic-v4', + 'ChopperCommand-ramNoFrameskip-v0', + 'ChopperCommand-ramNoFrameskip-v4', + 'CrazyClimber-v0', + 'CrazyClimber-v4', + 'CrazyClimberDeterministic-v0', + 'CrazyClimberDeterministic-v4', + 'CrazyClimberNoFrameskip-v0', + 'CrazyClimberNoFrameskip-v4', + 'CrazyClimber-ram-v0', + 'CrazyClimber-ram-v4', + 'CrazyClimber-ramDeterministic-v0', + 'CrazyClimber-ramDeterministic-v4', + 'CrazyClimber-ramNoFrameskip-v0', + 'CrazyClimber-ramNoFrameskip-v4', + 'DemonAttack-v0', + 'DemonAttack-v4', + 'DemonAttackDeterministic-v0', + 'DemonAttackDeterministic-v4', + 'DemonAttackNoFrameskip-v0', + 'DemonAttackNoFrameskip-v4', + 'DemonAttack-ram-v0', + 'DemonAttack-ram-v4', + 'DemonAttack-ramDeterministic-v0', + 'DemonAttack-ramDeterministic-v4', + 'DemonAttack-ramNoFrameskip-v0', + 'DemonAttack-ramNoFrameskip-v4', + 'DoubleDunk-v0', + 'DoubleDunk-v4', + 'DoubleDunkDeterministic-v0', + 'DoubleDunkDeterministic-v4', + 'DoubleDunkNoFrameskip-v0', + 'DoubleDunkNoFrameskip-v4', + 'DoubleDunk-ram-v0', + 'DoubleDunk-ram-v4', + 'DoubleDunk-ramDeterministic-v0', + 'DoubleDunk-ramDeterministic-v4', + 'DoubleDunk-ramNoFrameskip-v0', + 'DoubleDunk-ramNoFrameskip-v4', + 'ElevatorAction-v0', + 'ElevatorAction-v4', + 'ElevatorActionDeterministic-v0', + 'ElevatorActionDeterministic-v4', + 'ElevatorActionNoFrameskip-v0', + 'ElevatorActionNoFrameskip-v4', + 'ElevatorAction-ram-v0', + 'ElevatorAction-ram-v4', + 'ElevatorAction-ramDeterministic-v0', + 'ElevatorAction-ramDeterministic-v4', + 
'ElevatorAction-ramNoFrameskip-v0', + 'ElevatorAction-ramNoFrameskip-v4', + 'Enduro-v0', + 'Enduro-v4', + 'EnduroDeterministic-v0', + 'EnduroDeterministic-v4', + 'EnduroNoFrameskip-v0', + 'EnduroNoFrameskip-v4', + 'Enduro-ram-v0', + 'Enduro-ram-v4', + 'Enduro-ramDeterministic-v0', + 'Enduro-ramDeterministic-v4', + 'Enduro-ramNoFrameskip-v0', + 'Enduro-ramNoFrameskip-v4', + 'FishingDerby-v0', + 'FishingDerby-v4', + 'FishingDerbyDeterministic-v0', + 'FishingDerbyDeterministic-v4', + 'FishingDerbyNoFrameskip-v0', + 'FishingDerbyNoFrameskip-v4', + 'FishingDerby-ram-v0', + 'FishingDerby-ram-v4', + 'FishingDerby-ramDeterministic-v0', + 'FishingDerby-ramDeterministic-v4', + 'FishingDerby-ramNoFrameskip-v0', + 'FishingDerby-ramNoFrameskip-v4', + 'Freeway-v0', + 'Freeway-v4', + 'FreewayDeterministic-v0', + 'FreewayDeterministic-v4', + 'FreewayNoFrameskip-v0', + 'FreewayNoFrameskip-v4', + 'Freeway-ram-v0', + 'Freeway-ram-v4', + 'Freeway-ramDeterministic-v0', + 'Freeway-ramDeterministic-v4', + 'Freeway-ramNoFrameskip-v0', + 'Freeway-ramNoFrameskip-v4', + 'Frostbite-v0', + 'Frostbite-v4', + 'FrostbiteDeterministic-v0', + 'FrostbiteDeterministic-v4', + 'FrostbiteNoFrameskip-v0', + 'FrostbiteNoFrameskip-v4', + 'Frostbite-ram-v0', + 'Frostbite-ram-v4', + 'Frostbite-ramDeterministic-v0', + 'Frostbite-ramDeterministic-v4', + 'Frostbite-ramNoFrameskip-v0', + 'Frostbite-ramNoFrameskip-v4', + 'Gopher-v0', + 'Gopher-v4', + 'GopherDeterministic-v0', + 'GopherDeterministic-v4', + 'GopherNoFrameskip-v0', + 'GopherNoFrameskip-v4', + 'Gopher-ram-v0', + 'Gopher-ram-v4', + 'Gopher-ramDeterministic-v0', + 'Gopher-ramDeterministic-v4', + 'Gopher-ramNoFrameskip-v0', + 'Gopher-ramNoFrameskip-v4', + 'Gravitar-v0', + 'Gravitar-v4', + 'GravitarDeterministic-v0', + 'GravitarDeterministic-v4', + 'GravitarNoFrameskip-v0', + 'GravitarNoFrameskip-v4', + 'Gravitar-ram-v0', + 'Gravitar-ram-v4', + 'Gravitar-ramDeterministic-v0', + 'Gravitar-ramDeterministic-v4', + 'Gravitar-ramNoFrameskip-v0', + 'Gravitar-ramNoFrameskip-v4', + 'Hero-v0', + 'Hero-v4', + 'HeroDeterministic-v0', + 'HeroDeterministic-v4', + 'HeroNoFrameskip-v0', + 'HeroNoFrameskip-v4', + 'Hero-ram-v0', + 'Hero-ram-v4', + 'Hero-ramDeterministic-v0', + 'Hero-ramDeterministic-v4', + 'Hero-ramNoFrameskip-v0', + 'Hero-ramNoFrameskip-v4', + 'IceHockey-v0', + 'IceHockey-v4', + 'IceHockeyDeterministic-v0', + 'IceHockeyDeterministic-v4', + 'IceHockeyNoFrameskip-v0', + 'IceHockeyNoFrameskip-v4', + 'IceHockey-ram-v0', + 'IceHockey-ram-v4', + 'IceHockey-ramDeterministic-v0', + 'IceHockey-ramDeterministic-v4', + 'IceHockey-ramNoFrameskip-v0', + 'IceHockey-ramNoFrameskip-v4', + 'Jamesbond-v0', + 'Jamesbond-v4', + 'JamesbondDeterministic-v0', + 'JamesbondDeterministic-v4', + 'JamesbondNoFrameskip-v0', + 'JamesbondNoFrameskip-v4', + 'Jamesbond-ram-v0', + 'Jamesbond-ram-v4', + 'Jamesbond-ramDeterministic-v0', + 'Jamesbond-ramDeterministic-v4', + 'Jamesbond-ramNoFrameskip-v0', + 'Jamesbond-ramNoFrameskip-v4', + 'JourneyEscape-v0', + 'JourneyEscape-v4', + 'JourneyEscapeDeterministic-v0', + 'JourneyEscapeDeterministic-v4', + 'JourneyEscapeNoFrameskip-v0', + 'JourneyEscapeNoFrameskip-v4', + 'JourneyEscape-ram-v0', + 'JourneyEscape-ram-v4', + 'JourneyEscape-ramDeterministic-v0', + 'JourneyEscape-ramDeterministic-v4', + 'JourneyEscape-ramNoFrameskip-v0', + 'JourneyEscape-ramNoFrameskip-v4', + 'Kangaroo-v0', + 'Kangaroo-v4', + 'KangarooDeterministic-v0', + 'KangarooDeterministic-v4', + 'KangarooNoFrameskip-v0', + 'KangarooNoFrameskip-v4', + 'Kangaroo-ram-v0', + 'Kangaroo-ram-v4', + 
'Kangaroo-ramDeterministic-v0', + 'Kangaroo-ramDeterministic-v4', + 'Kangaroo-ramNoFrameskip-v0', + 'Kangaroo-ramNoFrameskip-v4', + 'Krull-v0', + 'Krull-v4', + 'KrullDeterministic-v0', + 'KrullDeterministic-v4', + 'KrullNoFrameskip-v0', + 'KrullNoFrameskip-v4', + 'Krull-ram-v0', + 'Krull-ram-v4', + 'Krull-ramDeterministic-v0', + 'Krull-ramDeterministic-v4', + 'Krull-ramNoFrameskip-v0', + 'Krull-ramNoFrameskip-v4', + 'KungFuMaster-v0', + 'KungFuMaster-v4', + 'KungFuMasterDeterministic-v0', + 'KungFuMasterDeterministic-v4', + 'KungFuMasterNoFrameskip-v0', + 'KungFuMasterNoFrameskip-v4', + 'KungFuMaster-ram-v0', + 'KungFuMaster-ram-v4', + 'KungFuMaster-ramDeterministic-v0', + 'KungFuMaster-ramDeterministic-v4', + 'KungFuMaster-ramNoFrameskip-v0', + 'KungFuMaster-ramNoFrameskip-v4', + 'MontezumaRevenge-v0', + 'MontezumaRevenge-v4', + 'MontezumaRevengeDeterministic-v0', + 'MontezumaRevengeDeterministic-v4', + 'MontezumaRevengeNoFrameskip-v0', + 'MontezumaRevengeNoFrameskip-v4', + 'MontezumaRevenge-ram-v0', + 'MontezumaRevenge-ram-v4', + 'MontezumaRevenge-ramDeterministic-v0', + 'MontezumaRevenge-ramDeterministic-v4', + 'MontezumaRevenge-ramNoFrameskip-v0', + 'MontezumaRevenge-ramNoFrameskip-v4', + 'MsPacman-v0', + 'MsPacman-v4', + 'MsPacmanDeterministic-v0', + 'MsPacmanDeterministic-v4', + 'MsPacmanNoFrameskip-v0', + 'MsPacmanNoFrameskip-v4', + 'MsPacman-ram-v0', + 'MsPacman-ram-v4', + 'MsPacman-ramDeterministic-v0', + 'MsPacman-ramDeterministic-v4', + 'MsPacman-ramNoFrameskip-v0', + 'MsPacman-ramNoFrameskip-v4', + 'NameThisGame-v0', + 'NameThisGame-v4', + 'NameThisGameDeterministic-v0', + 'NameThisGameDeterministic-v4', + 'NameThisGameNoFrameskip-v0', + 'NameThisGameNoFrameskip-v4', + 'NameThisGame-ram-v0', + 'NameThisGame-ram-v4', + 'NameThisGame-ramDeterministic-v0', + 'NameThisGame-ramDeterministic-v4', + 'NameThisGame-ramNoFrameskip-v0', + 'NameThisGame-ramNoFrameskip-v4', + 'Phoenix-v0', + 'Phoenix-v4', + 'PhoenixDeterministic-v0', + 'PhoenixDeterministic-v4', + 'PhoenixNoFrameskip-v0', + 'PhoenixNoFrameskip-v4', + 'Phoenix-ram-v0', + 'Phoenix-ram-v4', + 'Phoenix-ramDeterministic-v0', + 'Phoenix-ramDeterministic-v4', + 'Phoenix-ramNoFrameskip-v0', + 'Phoenix-ramNoFrameskip-v4', + 'Pitfall-v0', + 'Pitfall-v4', + 'PitfallDeterministic-v0', + 'PitfallDeterministic-v4', + 'PitfallNoFrameskip-v0', + 'PitfallNoFrameskip-v4', + 'Pitfall-ram-v0', + 'Pitfall-ram-v4', + 'Pitfall-ramDeterministic-v0', + 'Pitfall-ramDeterministic-v4', + 'Pitfall-ramNoFrameskip-v0', + 'Pitfall-ramNoFrameskip-v4', + 'Pong-v0', + 'Pong-v4', + 'PongDeterministic-v0', + 'PongDeterministic-v4', + 'PongNoFrameskip-v0', + 'PongNoFrameskip-v4', + 'Pong-ram-v0', + 'Pong-ram-v4', + 'Pong-ramDeterministic-v0', + 'Pong-ramDeterministic-v4', + 'Pong-ramNoFrameskip-v0', + 'Pong-ramNoFrameskip-v4', + 'Pooyan-v0', + 'Pooyan-v4', + 'PooyanDeterministic-v0', + 'PooyanDeterministic-v4', + 'PooyanNoFrameskip-v0', + 'PooyanNoFrameskip-v4', + 'Pooyan-ram-v0', + 'Pooyan-ram-v4', + 'Pooyan-ramDeterministic-v0', + 'Pooyan-ramDeterministic-v4', + 'Pooyan-ramNoFrameskip-v0', + 'Pooyan-ramNoFrameskip-v4', + 'PrivateEye-v0', + 'PrivateEye-v4', + 'PrivateEyeDeterministic-v0', + 'PrivateEyeDeterministic-v4', + 'PrivateEyeNoFrameskip-v0', + 'PrivateEyeNoFrameskip-v4', + 'PrivateEye-ram-v0', + 'PrivateEye-ram-v4', + 'PrivateEye-ramDeterministic-v0', + 'PrivateEye-ramDeterministic-v4', + 'PrivateEye-ramNoFrameskip-v0', + 'PrivateEye-ramNoFrameskip-v4', + 'Qbert-v0', + 'Qbert-v4', + 'QbertDeterministic-v0', + 'QbertDeterministic-v4', + 
'QbertNoFrameskip-v0', + 'QbertNoFrameskip-v4', + 'Qbert-ram-v0', + 'Qbert-ram-v4', + 'Qbert-ramDeterministic-v0', + 'Qbert-ramDeterministic-v4', + 'Qbert-ramNoFrameskip-v0', + 'Qbert-ramNoFrameskip-v4', + 'Riverraid-v0', + 'Riverraid-v4', + 'RiverraidDeterministic-v0', + 'RiverraidDeterministic-v4', + 'RiverraidNoFrameskip-v0', + 'RiverraidNoFrameskip-v4', + 'Riverraid-ram-v0', + 'Riverraid-ram-v4', + 'Riverraid-ramDeterministic-v0', + 'Riverraid-ramDeterministic-v4', + 'Riverraid-ramNoFrameskip-v0', + 'Riverraid-ramNoFrameskip-v4', + 'RoadRunner-v0', + 'RoadRunner-v4', + 'RoadRunnerDeterministic-v0', + 'RoadRunnerDeterministic-v4', + 'RoadRunnerNoFrameskip-v0', + 'RoadRunnerNoFrameskip-v4', + 'RoadRunner-ram-v0', + 'RoadRunner-ram-v4', + 'RoadRunner-ramDeterministic-v0', + 'RoadRunner-ramDeterministic-v4', + 'RoadRunner-ramNoFrameskip-v0', + 'RoadRunner-ramNoFrameskip-v4', + 'Robotank-v0', + 'Robotank-v4', + 'RobotankDeterministic-v0', + 'RobotankDeterministic-v4', + 'RobotankNoFrameskip-v0', + 'RobotankNoFrameskip-v4', + 'Robotank-ram-v0', + 'Robotank-ram-v4', + 'Robotank-ramDeterministic-v0', + 'Robotank-ramDeterministic-v4', + 'Robotank-ramNoFrameskip-v0', + 'Robotank-ramNoFrameskip-v4', + 'Seaquest-v0', + 'Seaquest-v4', + 'SeaquestDeterministic-v0', + 'SeaquestDeterministic-v4', + 'SeaquestNoFrameskip-v0', + 'SeaquestNoFrameskip-v4', + 'Seaquest-ram-v0', + 'Seaquest-ram-v4', + 'Seaquest-ramDeterministic-v0', + 'Seaquest-ramDeterministic-v4', + 'Seaquest-ramNoFrameskip-v0', + 'Seaquest-ramNoFrameskip-v4', + 'Skiing-v0', + 'Skiing-v4', + 'SkiingDeterministic-v0', + 'SkiingDeterministic-v4', + 'SkiingNoFrameskip-v0', + 'SkiingNoFrameskip-v4', + 'Skiing-ram-v0', + 'Skiing-ram-v4', + 'Skiing-ramDeterministic-v0', + 'Skiing-ramDeterministic-v4', + 'Skiing-ramNoFrameskip-v0', + 'Skiing-ramNoFrameskip-v4', + 'Solaris-v0', + 'Solaris-v4', + 'SolarisDeterministic-v0', + 'SolarisDeterministic-v4', + 'SolarisNoFrameskip-v0', + 'SolarisNoFrameskip-v4', + 'Solaris-ram-v0', + 'Solaris-ram-v4', + 'Solaris-ramDeterministic-v0', + 'Solaris-ramDeterministic-v4', + 'Solaris-ramNoFrameskip-v0', + 'Solaris-ramNoFrameskip-v4', + 'SpaceInvaders-v0', + 'SpaceInvaders-v4', + 'SpaceInvadersDeterministic-v0', + 'SpaceInvadersDeterministic-v4', + 'SpaceInvadersNoFrameskip-v0', + 'SpaceInvadersNoFrameskip-v4', + 'SpaceInvaders-ram-v0', + 'SpaceInvaders-ram-v4', + 'SpaceInvaders-ramDeterministic-v0', + 'SpaceInvaders-ramDeterministic-v4', + 'SpaceInvaders-ramNoFrameskip-v0', + 'SpaceInvaders-ramNoFrameskip-v4', + 'StarGunner-v0', + 'StarGunner-v4', + 'StarGunnerDeterministic-v0', + 'StarGunnerDeterministic-v4', + 'StarGunnerNoFrameskip-v0', + 'StarGunnerNoFrameskip-v4', + 'StarGunner-ram-v0', + 'StarGunner-ram-v4', + 'StarGunner-ramDeterministic-v0', + 'StarGunner-ramDeterministic-v4', + 'StarGunner-ramNoFrameskip-v0', + 'StarGunner-ramNoFrameskip-v4', + 'Tennis-v0', + 'Tennis-v4', + 'TennisDeterministic-v0', + 'TennisDeterministic-v4', + 'TennisNoFrameskip-v0', + 'TennisNoFrameskip-v4', + 'Tennis-ram-v0', + 'Tennis-ram-v4', + 'Tennis-ramDeterministic-v0', + 'Tennis-ramDeterministic-v4', + 'Tennis-ramNoFrameskip-v0', + 'Tennis-ramNoFrameskip-v4', + 'TimePilot-v0', + 'TimePilot-v4', + 'TimePilotDeterministic-v0', + 'TimePilotDeterministic-v4', + 'TimePilotNoFrameskip-v0', + 'TimePilotNoFrameskip-v4', + 'TimePilot-ram-v0', + 'TimePilot-ram-v4', + 'TimePilot-ramDeterministic-v0', + 'TimePilot-ramDeterministic-v4', + 'TimePilot-ramNoFrameskip-v0', + 'TimePilot-ramNoFrameskip-v4', + 'Tutankham-v0', + 'Tutankham-v4', + 
'TutankhamDeterministic-v0', + 'TutankhamDeterministic-v4', + 'TutankhamNoFrameskip-v0', + 'TutankhamNoFrameskip-v4', + 'Tutankham-ram-v0', + 'Tutankham-ram-v4', + 'Tutankham-ramDeterministic-v0', + 'Tutankham-ramDeterministic-v4', + 'Tutankham-ramNoFrameskip-v0', + 'Tutankham-ramNoFrameskip-v4', + 'UpNDown-v0', + 'UpNDown-v4', + 'UpNDownDeterministic-v0', + 'UpNDownDeterministic-v4', + 'UpNDownNoFrameskip-v0', + 'UpNDownNoFrameskip-v4', + 'UpNDown-ram-v0', + 'UpNDown-ram-v4', + 'UpNDown-ramDeterministic-v0', + 'UpNDown-ramDeterministic-v4', + 'UpNDown-ramNoFrameskip-v0', + 'UpNDown-ramNoFrameskip-v4', + 'Venture-v0', + 'Venture-v4', + 'VentureDeterministic-v0', + 'VentureDeterministic-v4', + 'VentureNoFrameskip-v0', + 'VentureNoFrameskip-v4', + 'Venture-ram-v0', + 'Venture-ram-v4', + 'Venture-ramDeterministic-v0', + 'Venture-ramDeterministic-v4', + 'Venture-ramNoFrameskip-v0', + 'Venture-ramNoFrameskip-v4', + 'VideoPinball-v0', + 'VideoPinball-v4', + 'VideoPinballDeterministic-v0', + 'VideoPinballDeterministic-v4', + 'VideoPinballNoFrameskip-v0', + 'VideoPinballNoFrameskip-v4', + 'VideoPinball-ram-v0', + 'VideoPinball-ram-v4', + 'VideoPinball-ramDeterministic-v0', + 'VideoPinball-ramDeterministic-v4', + 'VideoPinball-ramNoFrameskip-v0', + 'VideoPinball-ramNoFrameskip-v4', + 'WizardOfWor-v0', + 'WizardOfWor-v4', + 'WizardOfWorDeterministic-v0', + 'WizardOfWorDeterministic-v4', + 'WizardOfWorNoFrameskip-v0', + 'WizardOfWorNoFrameskip-v4', + 'WizardOfWor-ram-v0', + 'WizardOfWor-ram-v4', + 'WizardOfWor-ramDeterministic-v0', + 'WizardOfWor-ramDeterministic-v4', + 'WizardOfWor-ramNoFrameskip-v0', + 'WizardOfWor-ramNoFrameskip-v4', + 'YarsRevenge-v0', + 'YarsRevenge-v4', + 'YarsRevengeDeterministic-v0', + 'YarsRevengeDeterministic-v4', + 'YarsRevengeNoFrameskip-v0', + 'YarsRevengeNoFrameskip-v4', + 'YarsRevenge-ram-v0', + 'YarsRevenge-ram-v4', + 'YarsRevenge-ramDeterministic-v0', + 'YarsRevenge-ramDeterministic-v4', + 'YarsRevenge-ramNoFrameskip-v0', + 'YarsRevenge-ramNoFrameskip-v4', + 'Zaxxon-v0', + 'Zaxxon-v4', + 'ZaxxonDeterministic-v0', + 'ZaxxonDeterministic-v4', + 'ZaxxonNoFrameskip-v0', + 'ZaxxonNoFrameskip-v4', + 'Zaxxon-ram-v0', + 'Zaxxon-ram-v4', + 'Zaxxon-ramDeterministic-v0', + 'Zaxxon-ramDeterministic-v4', + 'Zaxxon-ramNoFrameskip-v0', + 'Zaxxon-ramNoFrameskip-v4'], + + # Classic control + 'classic_control': [ + 'Acrobot-v1', + 'CartPole-v1', + 'CartPole-v0', + 'MountainCar-v0', + 'MountainCarContinuous-v0', + 'Pendulum-v0' + ], + + # Box2D + 'box2d': [ + 'BipedalWalker-v2', + 'BipedalWalkerHardcore-v2', + 'CarRacing-v0', + 'LunarLander-v2', + 'LunarLanderContinuous-v2' + ], + + # MuJoCo + 'mujoco': [ + 'Ant-v2', + 'HalfCheetah-v2', + 'Hopper-v2', + 'Humanoid-v2', + 'HumanoidStandup-v2', + 'InvertedDoublePendulum-v2', + 'InvertedPendulum-v2', + 'Reacher-v2', + 'Swimmer-v2', + 'Walker2d-v2' + ], + + # Robotics + 'robotics': [ + 'FetchPickAndPlace-v1', + 'FetchPush-v1', + 'FetchReach-v1', + 'FetchSlide-v1', + 'HandManipulateBlock-v0', + 'HandManipulateEgg-v0', + 'HandManipulatePen-v0', + 'HandReach-v0' + ], + + ## Deepmind Control Suite (need check!) 
+ 'dm_control': [ + 'AcrobotSparse-v0', + 'BallincupCatch-v0', + 'CartpoleSwingup-v0', + 'FingerTurn-v0', + 'FishSwim-v0', + 'CheetahRun-v0', + 'HopperHop-v0', + 'HumanoidStand-v0', + 'HumanoidWalk-v0', + 'HumanoidRun-v0', + 'ManipulatorBringball-v0', + 'PendulumSwingup-v0', + 'Pointmass-v0', + 'ReacherHard-v0', + 'Swimmer-v0', + 'WalkerRun-v0' + ], + + ## RLBench + 'rlbench': [ + 'BeatTheBuzz', + 'BlockPyramid', + 'ChangeChannel', + 'ChangeClock', + 'CloseBox', + 'CloseDoor', + 'CloseDrawer', + 'CloseFridge', + 'CloseGrill', + 'CloseJar', + 'CloseLaptopLid', + 'CloseMicrowave', + 'EmptyContainer', + 'EmptyDishwasher', + 'GetIceFromFridge', + 'HangFrameOnHanger', + 'HannoiSquare', + 'HitBallWithQueue', + 'Hockey', + 'InsertUsbInComputer', + 'LampOff', + 'LampOn', + 'LightBulbIn', + 'LightBulbOut', + 'MeatOffGrill', + 'MeatOnGrill', + 'MoveHanger', + 'OpenBox', + 'OpenDoor', + 'OpenDrawer', + 'OpenFridge', + 'OpenGrill', + 'OpenJar', + 'OpenMicrowave', + 'OpenOven', + 'OpenWindow', + 'OpenWineBottle', + 'PhoneOnBase', + 'PickAndLift', + 'PickUpCup', + 'PlaceCups', + 'PlaceHangerOnRack', + 'PlaceShapeInShapeSorter', + 'PlayJenga', + 'PlugChargerInPowerSupply', + 'PourFromCupToCup', + 'PressSwitch', + 'PushButton', + 'PushButtons', + 'PutBooksOnBookshelf', + 'PutBottleInFridge', + 'PutGroceriesInCupboard', + 'PutItemInDrawer', + 'PutKnifeInKnifeBlock', + 'PutKnifeOnChoppingBoard', + 'PutMoneyInSafe', + 'PutPlateInColoredDishRack', + 'PutRubbishInBin', + 'PutShoesInBox', + 'PutToiletRollOnStand', + 'PutTrayInOven', + 'PutUmbrellaInUmbrellaStand', + 'ReachAndDrag', + 'ReachTarget', + 'RemoveCups', + 'ScoopWithSpatula', + 'ScrewNail', + 'SetTheTable', + 'SetupCheckers', + 'SlideBlockToTarget', + 'SlideCabinetOpen', + 'SlideCabinetOpenAndPlaceCups', + 'SolvePuzzle', + 'StackBlocks', + 'StackCups', + 'StackWine', + 'StraightenRope', + 'SweepToDustpan', + 'TakeCupOutFromCabinet', + 'TakeFrameOffHanger', + 'TakeItemOutOfDrawer', + 'TakeLidOffSaucepan', + 'TakeMoneyOutSafe', + 'TakeOffWeighingScales', + 'TakePlateOffColoredDishRack', + 'TakeShoesOutOfBox', + 'TakeToiletRollOffStand', + 'TakeTrayOutOfOven', + 'TakeUmbrellaOutOfUmbrellaStand', + 'TakeUsbOutOfComputer', + 'ToiletSeatDown', + 'ToiletSeatUp', + 'TurnOvenOn', + 'TurnTap', + 'TvOff', + 'TvOn', + 'UnplugCharger', + 'WaterPlants', + 'WeighingScales', + 'WipeDesk' + ] +} diff --git a/rlzoo/common/env_wrappers.py b/rlzoo/common/env_wrappers.py old mode 100644 new mode 100755 index c7f92cf..04c0345 --- a/rlzoo/common/env_wrappers.py +++ b/rlzoo/common/env_wrappers.py @@ -1,637 +1,637 @@ -"""Env wrappers -Most common wrappers can be checked from following links for usage: - -`https://pypi.org/project/gym-vec-env` - -`https://github.com/openai/baselines/blob/master/baselines/common/wrappers.py` -""" -from collections import deque -from functools import partial -from multiprocessing import Pipe, Process, cpu_count -from sys import platform - -import cv2 -import gym -import numpy as np -from gym import spaces -from gym.wrappers import FlattenDictWrapper - -from rlzoo.common.env_list import get_envlist - -__all__ = ( - 'build_env', # build env - 'TimeLimit', # Time limit wrapper - 'NoopResetEnv', # Run random number of no-ops on reset - 'FireResetEnv', # Reset wrapper for envs with fire action - 'EpisodicLifeEnv', # end-of-life == end-of-episode wrapper - 'MaxAndSkipEnv', # skip frame wrapper - 'ClipRewardEnv', # clip reward wrapper - 'WarpFrame', # warp observation wrapper - 'FrameStack', # stack frame wrapper - 'LazyFrames', # lazy store wrapper - 
'RewardShaping', # reward shaping - 'SubprocVecEnv', # vectorized env wrapper - 'VecFrameStack', # stack frames in vectorized env - 'Monitor', # Episode reward and length monitor - 'NormalizedActions', # normalized action to actual space - 'DmObsTrans', # translate observations in dm_control environments -) -cv2.ocl.setUseOpenCL(False) - - -def build_env(env_id, env_type, vectorized=False, - seed=0, reward_shaping=None, nenv=1, **kwargs): - """ - Build env based on options - - :param env_id: (str) environment id - :param env_type: (str) atari, classic_control, box2d - :param vectorized: (bool) whether sampling parrallel - :param seed: (int) random seed for env - :param reward_shaping: (callable) callable function for reward shaping - :param nenv: (int) how many processes will be used in sampling - :param kwargs: (dict) - :param max_episode_steps: (int) the maximum episode steps - """ - nenv = nenv or cpu_count() // (1 + (platform == 'darwin')) - stack = env_type == 'atari' - if nenv > 1: - if vectorized: - env = _make_vec_env(env_id, env_type, nenv, seed, - reward_shaping, stack, **kwargs) - else: - env = [] - for _ in range(nenv): - single_env = _make_env(env_id, env_type, seed, - reward_shaping, stack, **kwargs) - env.append(single_env) # get env as a list of same single env - - else: - env = _make_env(env_id, env_type, seed, - reward_shaping, stack, **kwargs) - - return env - - -def check_name_in_list(env_id, env_type): - """ Check if env_id exists in the env_type list """ - env_list = get_envlist(env_type) - if env_id not in env_list: - print('Env ID {:s} Not Found In {:s}!'.format(env_id, env_type)) - else: - print('Env ID {:s} Exists!'.format(env_id)) - - -def _make_env(env_id, env_type, seed, reward_shaping, frame_stack, **kwargs): - """Make single env""" - check_name_in_list(env_id, env_type) # check existence of env_id in env_type - if env_type == 'atari': - env = gym.make(env_id) - env = NoopResetEnv(env, noop_max=30) - if 'NoFrameskip' in env.spec.id: - env = MaxAndSkipEnv(env, skip=4) - env = Monitor(env) - # deepmind wrap - env = EpisodicLifeEnv(env) - if 'FIRE' in env.unwrapped.get_action_meanings(): - env = FireResetEnv(env) - env = WarpFrame(env) - env = ClipRewardEnv(env) - if frame_stack: - env = FrameStack(env, 4) - elif env_type in ['classic_control', 'box2d', 'mujoco']: - env = gym.make(env_id).unwrapped - max_episode_steps = kwargs.get('max_episode_steps') - if max_episode_steps is not None: - env = TimeLimit(env.unwrapped, max_episode_steps) - env = Monitor(env) - elif env_type == 'robotics': - env = gym.make(env_id) - env = FlattenDictWrapper(env, ['observation', 'desired_goal']) - env = Monitor(env, info_keywords=('is_success',)) - elif env_type == 'dm_control': - env = gym.make('dm2gym:' + env_id, environment_kwargs={'flat_observation': True}) - env = DmObsTrans(env) - elif env_type == 'rlbench': - from rlzoo.common.build_rlbench_env import RLBenchEnv - state_type = kwargs.get('state_type') - env = RLBenchEnv(env_id) if state_type is None else RLBenchEnv(env_id, state_type) - else: - raise NotImplementedError - - if reward_shaping is not None: - if callable(reward_shaping): - env = RewardShaping(env, reward_shaping) - else: - raise ValueError('reward_shaping parameter must be callable') - env.seed(seed) - return env - - -def _make_vec_env(env_id, env_type, nenv, seed, - reward_shaping, frame_stack, **kwargs): - """Make vectorized env""" - env = SubprocVecEnv([partial( - _make_env, env_id, env_type, seed + i, reward_shaping, False, **kwargs - ) for i in 
range(nenv)]) - if frame_stack: - env = VecFrameStack(env, 4) - return env - - -class DmObsTrans(gym.Wrapper): - """ Observation process for DeepMind Control Suite environments """ - - def __init__(self, env): - self.env = env - super(DmObsTrans, self).__init__(env) - self.__need_trans = False - if isinstance(self.observation_space, gym.spaces.dict.Dict): - self.observation_space = self.observation_space['observations'] - self.__need_trans = True - - def step(self, ac): - observation, reward, done, info = self.env.step(ac) - if self.__need_trans: - observation = observation['observations'] - return observation, reward, done, info - - def reset(self, **kwargs): - observation = self.env.reset(**kwargs) - if self.__need_trans: - observation = observation['observations'] - return observation - - -class TimeLimit(gym.Wrapper): - - def __init__(self, env, max_episode_steps=None): - self.env = env - super(TimeLimit, self).__init__(env) - self._max_episode_steps = max_episode_steps - self._elapsed_steps = 0 - - def step(self, ac): - observation, reward, done, info = self.env.step(ac) - self._elapsed_steps += 1 - if self._elapsed_steps >= self._max_episode_steps: - done = True - info['TimeLimit.truncated'] = True - return observation, reward, done, info - - def reset(self, **kwargs): - self._elapsed_steps = 0 - return self.env.reset(**kwargs) - - -class NoopResetEnv(gym.Wrapper): - - def __init__(self, env, noop_max=30): - """Sample initial states by taking random number of no-ops on reset. - No-op is assumed to be action 0. - """ - super(NoopResetEnv, self).__init__(env) - self.noop_max = noop_max - self.override_num_noops = None - self.noop_action = 0 - assert env.unwrapped.get_action_meanings()[0] == 'NOOP' - - def reset(self, **kwargs): - """ Do no-op action for a number of steps in [1, noop_max].""" - self.env.reset(**kwargs) - if self.override_num_noops is not None: - noops = self.override_num_noops - else: - noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) - assert noops > 0 - obs = None - for _ in range(noops): - obs, _, done, _ = self.env.step(self.noop_action) - if done: - obs = self.env.reset(**kwargs) - return obs - - def step(self, ac): - return self.env.step(ac) - - -class FireResetEnv(gym.Wrapper): - - def __init__(self, env): - """Take action on reset for environments that are fixed until firing.""" - super(FireResetEnv, self).__init__(env) - assert env.unwrapped.get_action_meanings()[1] == 'FIRE' - assert len(env.unwrapped.get_action_meanings()) >= 3 - - def reset(self, **kwargs): - self.env.reset(**kwargs) - obs, _, done, _ = self.env.step(1) - if done: - self.env.reset(**kwargs) - obs, _, done, _ = self.env.step(2) - if done: - self.env.reset(**kwargs) - return obs - - def step(self, ac): - return self.env.step(ac) - - -class EpisodicLifeEnv(gym.Wrapper): - - def __init__(self, env): - """Make end-of-life == end-of-episode, but only reset on true game over. - Done by DeepMind for the DQN and co. since it helps value estimation. - """ - super(EpisodicLifeEnv, self).__init__(env) - self.lives = 0 - self.was_real_done = True - - def step(self, action): - obs, reward, done, info = self.env.step(action) - self.was_real_done = done - # check current lives, make loss of life terminal, - # then update lives to handle bonus lives - lives = self.env.unwrapped.ale.lives() - if 0 < lives < self.lives: - # for Qbert sometimes we stay in lives == 0 condition for a few - # frames so it's important to keep lives > 0, so that we only reset - # once the environment advertises done. 
- done = True - self.lives = lives - return obs, reward, done, info - - def reset(self, **kwargs): - """Reset only when lives are exhausted. - This way all states are still reachable even though lives are episodic, - and the learner need not know about any of this behind-the-scenes. - """ - if self.was_real_done: - obs = self.env.reset(**kwargs) - else: - # no-op step to advance from terminal/lost life state - obs, _, _, _ = self.env.step(0) - self.lives = self.env.unwrapped.ale.lives() - return obs - - -class MaxAndSkipEnv(gym.Wrapper): - - def __init__(self, env, skip=4): - """Return only every `skip`-th frame""" - super(MaxAndSkipEnv, self).__init__(env) - # most recent raw observations (for max pooling across time steps) - shape = (2,) + env.observation_space.shape - self._obs_buffer = np.zeros(shape, dtype=np.uint8) - self._skip = skip - - def step(self, action): - """Repeat action, sum reward, and max over last observations.""" - total_reward = 0.0 - done = info = None - for i in range(self._skip): - obs, reward, done, info = self.env.step(action) - if i == self._skip - 2: - self._obs_buffer[0] = obs - if i == self._skip - 1: - self._obs_buffer[1] = obs - total_reward += reward - if done: - break - # Note that the observation on the done=True frame doesn't matter - max_frame = self._obs_buffer.max(axis=0) - - return max_frame, total_reward, done, info - - def reset(self, **kwargs): - return self.env.reset(**kwargs) - - -class ClipRewardEnv(gym.RewardWrapper): - - def __init__(self, env): - super(ClipRewardEnv, self).__init__(env) - - def reward(self, reward): - """Bin reward to {+1, 0, -1} by its sign.""" - return np.sign(reward) - - -class WarpFrame(gym.ObservationWrapper): - - def __init__(self, env, width=84, height=84, grayscale=True): - """Warp frames to 84x84 as done in the Nature paper and later work.""" - super(WarpFrame, self).__init__(env) - self.width = width - self.height = height - self.grayscale = grayscale - shape = (self.height, self.width, 1 if self.grayscale else 3) - self.observation_space = spaces.Box(low=0, high=255, shape=shape, dtype=np.uint8) - - def observation(self, frame): - if self.grayscale: - frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) - size = (self.width, self.height) - frame = cv2.resize(frame, size, interpolation=cv2.INTER_AREA) - if self.grayscale: - frame = np.expand_dims(frame, -1) - return frame - - -class FrameStack(gym.Wrapper): - - def __init__(self, env, k): - """Stack k last frames. - Returns lazy array, which is much more memory efficient. - See Also `LazyFrames` - """ - super(FrameStack, self).__init__(env) - self.k = k - self.frames = deque([], maxlen=k) - shp = env.observation_space.shape - shape = shp[:-1] + (shp[-1] * k,) - self.observation_space = spaces.Box(low=0, high=255, shape=shape, dtype=env.observation_space.dtype) - - def reset(self): - ob = self.env.reset() - for _ in range(self.k): - self.frames.append(ob) - return np.asarray(self._get_ob()) - - def step(self, action): - ob, reward, done, info = self.env.step(action) - self.frames.append(ob) - return np.asarray(self._get_ob()), reward, done, info - - def _get_ob(self): - assert len(self.frames) == self.k - return LazyFrames(list(self.frames)) - - -class LazyFrames(object): - - def __init__(self, frames): - """This object ensures that common frames between the observations are - only stored once. It exists purely to optimize memory usage which can be - huge for DQN's 1M frames replay buffers. 
- - This object should only be converted to numpy array before being passed - to the model. You'd not believe how complex the previous solution was. - """ - self._frames = frames - self._out = None - - def _force(self): - if self._out is None: - self._out = np.concatenate(self._frames, axis=-1) - self._frames = None - return self._out - - def __array__(self, dtype=None): - out = self._force() - if dtype is not None: - out = out.astype(dtype) - return out - - def __len__(self): - return len(self._force()) - - def __getitem__(self, i): - return self._force()[i] - - -class RewardShaping(gym.RewardWrapper): - """Shaping the reward - For reward scale, func can be `lambda r: r * scale` - """ - - def __init__(self, env, func): - super(RewardShaping, self).__init__(env) - self.func = func - - def reward(self, reward): - return self.func(reward) - - -class VecFrameStack(object): - - def __init__(self, env, k): - self.env = env - self.k = k - self.action_space = env.action_space - self.frames = deque([], maxlen=k) - shp = env.observation_space.shape - shape = shp[:-1] + (shp[-1] * k,) - self.observation_space = spaces.Box(low=0, high=255, shape=shape, dtype=env.observation_space.dtype) - - def reset(self): - ob = self.env.reset() - for _ in range(self.k): - self.frames.append(ob) - return np.asarray(self._get_ob()) - - def step(self, action): - ob, reward, done, info = self.env.step(action) - self.frames.append(ob) - return np.asarray(self._get_ob()), reward, done, info - - def _get_ob(self): - assert len(self.frames) == self.k - return LazyFrames(list(self.frames)) - - -def _worker(remote, parent_remote, env_fn_wrapper): - parent_remote.close() - env = env_fn_wrapper.x() - while True: - cmd, data = remote.recv() - if cmd == 'step': - ob, reward, done, info = env.step(data) - if done: - ob = env.reset() - remote.send((ob, reward, done, info)) - elif cmd == 'reset': - ob = env.reset() - remote.send(ob) - elif cmd == 'reset_task': - ob = env._reset_task() - remote.send(ob) - elif cmd == 'close': - remote.close() - break - elif cmd == 'get_spaces': - remote.send((env.observation_space, env.action_space)) - else: - raise NotImplementedError - - -class CloudpickleWrapper(object): - """ - Uses cloudpickle to serialize contents - """ - - def __init__(self, x): - self.x = x - - def __getstate__(self): - import cloudpickle - return cloudpickle.dumps(self.x) - - def __setstate__(self, ob): - import pickle - self.x = pickle.loads(ob) - - -class SubprocVecEnv(object): - - def __init__(self, env_fns): - """ - envs: list of gym environments to run in subprocesses - """ - self.num_envs = len(env_fns) - - self.waiting = False - self.closed = False - nenvs = len(env_fns) - self.nenvs = nenvs - self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)]) - zipped_args = zip(self.work_remotes, self.remotes, env_fns) - self.ps = [ - Process(target=_worker, args=(work_remote, remote, CloudpickleWrapper(env_fn))) - for (work_remote, remote, env_fn) in zipped_args - ] - - for p in self.ps: - # if the main process crashes, we should not cause things to hang - p.daemon = True - p.start() - for remote in self.work_remotes: - remote.close() - - self.remotes[0].send(('get_spaces', None)) - observation_space, action_space = self.remotes[0].recv() - self.observation_space = observation_space - self.action_space = action_space - - def _step_async(self, actions): - """ - Tell all the environments to start taking a step - with the given actions. - Call step_wait() to get the results of the step. 
- You should not call this if a step_async run is - already pending. - """ - for remote, action in zip(self.remotes, actions): - remote.send(('step', action)) - self.waiting = True - - def _step_wait(self): - """ - Wait for the step taken with step_async(). - Returns (obs, rews, dones, infos): - - obs: an array of observations, or a tuple of - arrays of observations. - - rews: an array of rewards - - dones: an array of "episode done" booleans - - infos: a sequence of info objects - """ - results = [remote.recv() for remote in self.remotes] - self.waiting = False - obs, rews, dones, infos = zip(*results) - return np.stack(obs), np.stack(rews), np.stack(dones), infos - - def reset(self): - """ - Reset all the environments and return an array of - observations, or a tuple of observation arrays. - If step_async is still doing work, that work will - be cancelled and step_wait() should not be called - until step_async() is invoked again. - """ - for remote in self.remotes: - remote.send(('reset', None)) - return np.stack([remote.recv() for remote in self.remotes]) - - def _reset_task(self): - for remote in self.remotes: - remote.send(('reset_task', None)) - return np.stack([remote.recv() for remote in self.remotes]) - - def close(self): - if self.closed: - return - if self.waiting: - for remote in self.remotes: - remote.recv() - for remote in self.remotes: - remote.send(('close', None)) - for p in self.ps: - p.join() - self.closed = True - - def __len__(self): - return self.nenvs - - def step(self, actions): - self._step_async(actions) - return self._step_wait() - - -class Monitor(gym.Wrapper): - - def __init__(self, env, info_keywords=None): - super(Monitor, self).__init__(env) - self._monitor_rewards = None - self._info_keywords = info_keywords or [] - - def reset(self, **kwargs): - self._monitor_rewards = [] - return self.env.reset(**kwargs) - - def step(self, action): - o_, r, done, info = self.env.step(action) - self._monitor_rewards.append(r) - if done: - info['episode'] = { - 'r': sum(self._monitor_rewards), - 'l': len(self._monitor_rewards) - } - for keyword in self._info_keywords: - info['episode'][keyword] = info[keyword] - return o_, r, done, info - - -class NormalizedActions(gym.ActionWrapper): - - def _action(self, action): - low = self.action_space.low - high = self.action_space.high - - action = low + (action + 1.0) * 0.5 * (high - low) - action = np.clip(action, low, high) - - return action - - def _reverse_action(self, action): - low = self.action_space.low - high = self.action_space.high - - action = 2 * (action - low) / (high - low) - 1 - action = np.clip(action, low, high) - - return action - - -def close_env(env): - """ - close environment or environment list - """ - try: - env.close() - except: - pass - try: - for e in env: - e.close() - except: - pass +"""Env wrappers +Most common wrappers can be checked from following links for usage: + +`https://pypi.org/project/gym-vec-env` + +`https://github.com/openai/baselines/blob/master/baselines/common/wrappers.py` +""" +from collections import deque +from functools import partial +from multiprocessing import Pipe, Process, cpu_count +from sys import platform + +import cv2 +import gym +import numpy as np +from gym import spaces +from gym.wrappers import FlattenDictWrapper + +from rlzoo.common.env_list import get_envlist + +__all__ = ( + 'build_env', # build env + 'TimeLimit', # Time limit wrapper + 'NoopResetEnv', # Run random number of no-ops on reset + 'FireResetEnv', # Reset wrapper for envs with fire action + 
+    'EpisodicLifeEnv',  # end-of-life == end-of-episode wrapper
+    'MaxAndSkipEnv',  # skip frame wrapper
+    'ClipRewardEnv',  # clip reward wrapper
+    'WarpFrame',  # warp observation wrapper
+    'FrameStack',  # stack frame wrapper
+    'LazyFrames',  # lazy store wrapper
+    'RewardShaping',  # reward shaping
+    'SubprocVecEnv',  # vectorized env wrapper
+    'VecFrameStack',  # stack frames in vectorized env
+    'Monitor',  # Episode reward and length monitor
+    'NormalizedActions',  # normalized action to actual space
+    'DmObsTrans',  # translate observations in dm_control environments
+)
+cv2.ocl.setUseOpenCL(False)
+
+
+def build_env(env_id, env_type, vectorized=False,
+              seed=0, reward_shaping=None, nenv=1, **kwargs):
+    """
+    Build env based on options
+
+    :param env_id: (str) environment id
+    :param env_type: (str) environment type: atari, classic_control, box2d, mujoco, robotics, dm_control or rlbench
+    :param vectorized: (bool) whether to sample in parallel with a vectorized env
+    :param seed: (int) random seed for env
+    :param reward_shaping: (callable) callable function for reward shaping
+    :param nenv: (int) how many processes will be used in sampling
+    :param kwargs: (dict)
+    :param max_episode_steps: (int) the maximum episode steps
+    """
+    nenv = nenv or cpu_count() // (1 + (platform == 'darwin'))
+    stack = env_type == 'atari'
+    if nenv > 1:
+        if vectorized:
+            env = _make_vec_env(env_id, env_type, nenv, seed,
+                                reward_shaping, stack, **kwargs)
+        else:
+            env = []
+            for _ in range(nenv):
+                single_env = _make_env(env_id, env_type, seed,
+                                       reward_shaping, stack, **kwargs)
+                env.append(single_env)  # get env as a list of the same single env
+
+    else:
+        env = _make_env(env_id, env_type, seed,
+                        reward_shaping, stack, **kwargs)
+
+    return env
+
+
+def check_name_in_list(env_id, env_type):
+    """ Check if env_id exists in the env_type list """
+    env_list = get_envlist(env_type)
+    if env_id not in env_list:
+        print('Env ID {:s} Not Found In {:s}!'.format(env_id, env_type))
+    else:
+        print('Env ID {:s} Exists!'.format(env_id))
+
+
+def _make_env(env_id, env_type, seed, reward_shaping, frame_stack, **kwargs):
+    """Make single env"""
+    check_name_in_list(env_id, env_type)  # check existence of env_id in env_type
+    if env_type == 'atari':
+        env = gym.make(env_id)
+        env = NoopResetEnv(env, noop_max=30)
+        if 'NoFrameskip' in env.spec.id:
+            env = MaxAndSkipEnv(env, skip=4)
+        env = Monitor(env)
+        # deepmind wrap
+        env = EpisodicLifeEnv(env)
+        if 'FIRE' in env.unwrapped.get_action_meanings():
+            env = FireResetEnv(env)
+        env = WarpFrame(env)
+        env = ClipRewardEnv(env)
+        if frame_stack:
+            env = FrameStack(env, 4)
+    elif env_type in ['classic_control', 'box2d', 'mujoco']:
+        env = gym.make(env_id).unwrapped
+        max_episode_steps = kwargs.get('max_episode_steps')
+        if max_episode_steps is not None:
+            env = TimeLimit(env.unwrapped, max_episode_steps)
+        env = Monitor(env)
+    elif env_type == 'robotics':
+        env = gym.make(env_id)
+        env = FlattenDictWrapper(env, ['observation', 'desired_goal'])
+        env = Monitor(env, info_keywords=('is_success',))
+    elif env_type == 'dm_control':
+        env = gym.make('dm2gym:' + env_id, environment_kwargs={'flat_observation': True})
+        env = DmObsTrans(env)
+    elif env_type == 'rlbench':
+        from rlzoo.common.build_rlbench_env import RLBenchEnv
+        state_type = kwargs.get('state_type')
+        env = RLBenchEnv(env_id) if state_type is None else RLBenchEnv(env_id, state_type)
+    else:
+        raise NotImplementedError
+
+    if reward_shaping is not None:
+        if callable(reward_shaping):
+            env = RewardShaping(env, reward_shaping)
+        else:
+            raise ValueError('reward_shaping parameter must be callable')
+    env.seed(seed)
return env + + +def _make_vec_env(env_id, env_type, nenv, seed, + reward_shaping, frame_stack, **kwargs): + """Make vectorized env""" + env = SubprocVecEnv([partial( + _make_env, env_id, env_type, seed + i, reward_shaping, False, **kwargs + ) for i in range(nenv)]) + if frame_stack: + env = VecFrameStack(env, 4) + return env + + +class DmObsTrans(gym.Wrapper): + """ Observation process for DeepMind Control Suite environments """ + + def __init__(self, env): + self.env = env + super(DmObsTrans, self).__init__(env) + self.__need_trans = False + if isinstance(self.observation_space, gym.spaces.dict.Dict): + self.observation_space = self.observation_space['observations'] + self.__need_trans = True + + def step(self, ac): + observation, reward, done, info = self.env.step(ac) + if self.__need_trans: + observation = observation['observations'] + return observation, reward, done, info + + def reset(self, **kwargs): + observation = self.env.reset(**kwargs) + if self.__need_trans: + observation = observation['observations'] + return observation + + +class TimeLimit(gym.Wrapper): + + def __init__(self, env, max_episode_steps=None): + self.env = env + super(TimeLimit, self).__init__(env) + self._max_episode_steps = max_episode_steps + self._elapsed_steps = 0 + + def step(self, ac): + observation, reward, done, info = self.env.step(ac) + self._elapsed_steps += 1 + if self._elapsed_steps >= self._max_episode_steps: + done = True + info['TimeLimit.truncated'] = True + return observation, reward, done, info + + def reset(self, **kwargs): + self._elapsed_steps = 0 + return self.env.reset(**kwargs) + + +class NoopResetEnv(gym.Wrapper): + + def __init__(self, env, noop_max=30): + """Sample initial states by taking random number of no-ops on reset. + No-op is assumed to be action 0. + """ + super(NoopResetEnv, self).__init__(env) + self.noop_max = noop_max + self.override_num_noops = None + self.noop_action = 0 + assert env.unwrapped.get_action_meanings()[0] == 'NOOP' + + def reset(self, **kwargs): + """ Do no-op action for a number of steps in [1, noop_max].""" + self.env.reset(**kwargs) + if self.override_num_noops is not None: + noops = self.override_num_noops + else: + noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) + assert noops > 0 + obs = None + for _ in range(noops): + obs, _, done, _ = self.env.step(self.noop_action) + if done: + obs = self.env.reset(**kwargs) + return obs + + def step(self, ac): + return self.env.step(ac) + + +class FireResetEnv(gym.Wrapper): + + def __init__(self, env): + """Take action on reset for environments that are fixed until firing.""" + super(FireResetEnv, self).__init__(env) + assert env.unwrapped.get_action_meanings()[1] == 'FIRE' + assert len(env.unwrapped.get_action_meanings()) >= 3 + + def reset(self, **kwargs): + self.env.reset(**kwargs) + obs, _, done, _ = self.env.step(1) + if done: + self.env.reset(**kwargs) + obs, _, done, _ = self.env.step(2) + if done: + self.env.reset(**kwargs) + return obs + + def step(self, ac): + return self.env.step(ac) + + +class EpisodicLifeEnv(gym.Wrapper): + + def __init__(self, env): + """Make end-of-life == end-of-episode, but only reset on true game over. + Done by DeepMind for the DQN and co. since it helps value estimation. 
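+        A lost life only marks done=True for the learner; the underlying emulator is truly reset only on a real game over (see reset() below).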
+ """ + super(EpisodicLifeEnv, self).__init__(env) + self.lives = 0 + self.was_real_done = True + + def step(self, action): + obs, reward, done, info = self.env.step(action) + self.was_real_done = done + # check current lives, make loss of life terminal, + # then update lives to handle bonus lives + lives = self.env.unwrapped.ale.lives() + if 0 < lives < self.lives: + # for Qbert sometimes we stay in lives == 0 condition for a few + # frames so it's important to keep lives > 0, so that we only reset + # once the environment advertises done. + done = True + self.lives = lives + return obs, reward, done, info + + def reset(self, **kwargs): + """Reset only when lives are exhausted. + This way all states are still reachable even though lives are episodic, + and the learner need not know about any of this behind-the-scenes. + """ + if self.was_real_done: + obs = self.env.reset(**kwargs) + else: + # no-op step to advance from terminal/lost life state + obs, _, _, _ = self.env.step(0) + self.lives = self.env.unwrapped.ale.lives() + return obs + + +class MaxAndSkipEnv(gym.Wrapper): + + def __init__(self, env, skip=4): + """Return only every `skip`-th frame""" + super(MaxAndSkipEnv, self).__init__(env) + # most recent raw observations (for max pooling across time steps) + shape = (2,) + env.observation_space.shape + self._obs_buffer = np.zeros(shape, dtype=np.uint8) + self._skip = skip + + def step(self, action): + """Repeat action, sum reward, and max over last observations.""" + total_reward = 0.0 + done = info = None + for i in range(self._skip): + obs, reward, done, info = self.env.step(action) + if i == self._skip - 2: + self._obs_buffer[0] = obs + if i == self._skip - 1: + self._obs_buffer[1] = obs + total_reward += reward + if done: + break + # Note that the observation on the done=True frame doesn't matter + max_frame = self._obs_buffer.max(axis=0) + + return max_frame, total_reward, done, info + + def reset(self, **kwargs): + return self.env.reset(**kwargs) + + +class ClipRewardEnv(gym.RewardWrapper): + + def __init__(self, env): + super(ClipRewardEnv, self).__init__(env) + + def reward(self, reward): + """Bin reward to {+1, 0, -1} by its sign.""" + return np.sign(reward) + + +class WarpFrame(gym.ObservationWrapper): + + def __init__(self, env, width=84, height=84, grayscale=True): + """Warp frames to 84x84 as done in the Nature paper and later work.""" + super(WarpFrame, self).__init__(env) + self.width = width + self.height = height + self.grayscale = grayscale + shape = (self.height, self.width, 1 if self.grayscale else 3) + self.observation_space = spaces.Box(low=0, high=255, shape=shape, dtype=np.uint8) + + def observation(self, frame): + if self.grayscale: + frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) + size = (self.width, self.height) + frame = cv2.resize(frame, size, interpolation=cv2.INTER_AREA) + if self.grayscale: + frame = np.expand_dims(frame, -1) + return frame + + +class FrameStack(gym.Wrapper): + + def __init__(self, env, k): + """Stack k last frames. + Returns lazy array, which is much more memory efficient. 
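+        For example, the Atari pipeline in build_env stacks k=4 frames.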
+ See Also `LazyFrames` + """ + super(FrameStack, self).__init__(env) + self.k = k + self.frames = deque([], maxlen=k) + shp = env.observation_space.shape + shape = shp[:-1] + (shp[-1] * k,) + self.observation_space = spaces.Box(low=0, high=255, shape=shape, dtype=env.observation_space.dtype) + + def reset(self): + ob = self.env.reset() + for _ in range(self.k): + self.frames.append(ob) + return np.asarray(self._get_ob()) + + def step(self, action): + ob, reward, done, info = self.env.step(action) + self.frames.append(ob) + return np.asarray(self._get_ob()), reward, done, info + + def _get_ob(self): + assert len(self.frames) == self.k + return LazyFrames(list(self.frames)) + + +class LazyFrames(object): + + def __init__(self, frames): + """This object ensures that common frames between the observations are + only stored once. It exists purely to optimize memory usage which can be + huge for DQN's 1M frames replay buffers. + + This object should only be converted to numpy array before being passed + to the model. You'd not believe how complex the previous solution was. + """ + self._frames = frames + self._out = None + + def _force(self): + if self._out is None: + self._out = np.concatenate(self._frames, axis=-1) + self._frames = None + return self._out + + def __array__(self, dtype=None): + out = self._force() + if dtype is not None: + out = out.astype(dtype) + return out + + def __len__(self): + return len(self._force()) + + def __getitem__(self, i): + return self._force()[i] + + +class RewardShaping(gym.RewardWrapper): + """Shaping the reward + For reward scale, func can be `lambda r: r * scale` + """ + + def __init__(self, env, func): + super(RewardShaping, self).__init__(env) + self.func = func + + def reward(self, reward): + return self.func(reward) + + +class VecFrameStack(object): + + def __init__(self, env, k): + self.env = env + self.k = k + self.action_space = env.action_space + self.frames = deque([], maxlen=k) + shp = env.observation_space.shape + shape = shp[:-1] + (shp[-1] * k,) + self.observation_space = spaces.Box(low=0, high=255, shape=shape, dtype=env.observation_space.dtype) + + def reset(self): + ob = self.env.reset() + for _ in range(self.k): + self.frames.append(ob) + return np.asarray(self._get_ob()) + + def step(self, action): + ob, reward, done, info = self.env.step(action) + self.frames.append(ob) + return np.asarray(self._get_ob()), reward, done, info + + def _get_ob(self): + assert len(self.frames) == self.k + return LazyFrames(list(self.frames)) + + +def _worker(remote, parent_remote, env_fn_wrapper): + parent_remote.close() + env = env_fn_wrapper.x() + while True: + cmd, data = remote.recv() + if cmd == 'step': + ob, reward, done, info = env.step(data) + if done: + ob = env.reset() + remote.send((ob, reward, done, info)) + elif cmd == 'reset': + ob = env.reset() + remote.send(ob) + elif cmd == 'reset_task': + ob = env._reset_task() + remote.send(ob) + elif cmd == 'close': + remote.close() + break + elif cmd == 'get_spaces': + remote.send((env.observation_space, env.action_space)) + else: + raise NotImplementedError + + +class CloudpickleWrapper(object): + """ + Uses cloudpickle to serialize contents + """ + + def __init__(self, x): + self.x = x + + def __getstate__(self): + import cloudpickle + return cloudpickle.dumps(self.x) + + def __setstate__(self, ob): + import pickle + self.x = pickle.loads(ob) + + +class SubprocVecEnv(object): + + def __init__(self, env_fns): + """ + envs: list of gym environments to run in subprocesses + """ + self.num_envs = 
len(env_fns) + + self.waiting = False + self.closed = False + nenvs = len(env_fns) + self.nenvs = nenvs + self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)]) + zipped_args = zip(self.work_remotes, self.remotes, env_fns) + self.ps = [ + Process(target=_worker, args=(work_remote, remote, CloudpickleWrapper(env_fn))) + for (work_remote, remote, env_fn) in zipped_args + ] + + for p in self.ps: + # if the main process crashes, we should not cause things to hang + p.daemon = True + p.start() + for remote in self.work_remotes: + remote.close() + + self.remotes[0].send(('get_spaces', None)) + observation_space, action_space = self.remotes[0].recv() + self.observation_space = observation_space + self.action_space = action_space + + def _step_async(self, actions): + """ + Tell all the environments to start taking a step + with the given actions. + Call step_wait() to get the results of the step. + You should not call this if a step_async run is + already pending. + """ + for remote, action in zip(self.remotes, actions): + remote.send(('step', action)) + self.waiting = True + + def _step_wait(self): + """ + Wait for the step taken with step_async(). + Returns (obs, rews, dones, infos): + - obs: an array of observations, or a tuple of + arrays of observations. + - rews: an array of rewards + - dones: an array of "episode done" booleans + - infos: a sequence of info objects + """ + results = [remote.recv() for remote in self.remotes] + self.waiting = False + obs, rews, dones, infos = zip(*results) + return np.stack(obs), np.stack(rews), np.stack(dones), infos + + def reset(self): + """ + Reset all the environments and return an array of + observations, or a tuple of observation arrays. + If step_async is still doing work, that work will + be cancelled and step_wait() should not be called + until step_async() is invoked again. 
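+        For standard Box observations the returned array has shape (num_envs,) + observation_space.shape.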
+ """ + for remote in self.remotes: + remote.send(('reset', None)) + return np.stack([remote.recv() for remote in self.remotes]) + + def _reset_task(self): + for remote in self.remotes: + remote.send(('reset_task', None)) + return np.stack([remote.recv() for remote in self.remotes]) + + def close(self): + if self.closed: + return + if self.waiting: + for remote in self.remotes: + remote.recv() + for remote in self.remotes: + remote.send(('close', None)) + for p in self.ps: + p.join() + self.closed = True + + def __len__(self): + return self.nenvs + + def step(self, actions): + self._step_async(actions) + return self._step_wait() + + +class Monitor(gym.Wrapper): + + def __init__(self, env, info_keywords=None): + super(Monitor, self).__init__(env) + self._monitor_rewards = None + self._info_keywords = info_keywords or [] + + def reset(self, **kwargs): + self._monitor_rewards = [] + return self.env.reset(**kwargs) + + def step(self, action): + o_, r, done, info = self.env.step(action) + self._monitor_rewards.append(r) + if done: + info['episode'] = { + 'r': sum(self._monitor_rewards), + 'l': len(self._monitor_rewards) + } + for keyword in self._info_keywords: + info['episode'][keyword] = info[keyword] + return o_, r, done, info + + +class NormalizedActions(gym.ActionWrapper): + + def _action(self, action): + low = self.action_space.low + high = self.action_space.high + + action = low + (action + 1.0) * 0.5 * (high - low) + action = np.clip(action, low, high) + + return action + + def _reverse_action(self, action): + low = self.action_space.low + high = self.action_space.high + + action = 2 * (action - low) / (high - low) - 1 + action = np.clip(action, low, high) + + return action + + +def close_env(env): + """ + close environment or environment list + """ + try: + env.close() + except: + pass + try: + for e in env: + e.close() + except: + pass diff --git a/rlzoo/common/math_utils.py b/rlzoo/common/math_utils.py old mode 100644 new mode 100755 index 5fef326..c9fad08 --- a/rlzoo/common/math_utils.py +++ b/rlzoo/common/math_utils.py @@ -1,15 +1,15 @@ -""" -Functions for mathematics utilization. - -# Requirements -tensorflow==2.0.0a0 -tensorlayer==2.0.1 - -""" - - -def flatten_dims(shapes): # will be moved to common - dim = 1 - for s in shapes: - dim *= s - return dim +""" +Functions for mathematics utilization. + +# Requirements +tensorflow==2.0.0a0 +tensorlayer==2.0.1 + +""" + + +def flatten_dims(shapes): # will be moved to common + dim = 1 + for s in shapes: + dim *= s + return dim diff --git a/rlzoo/common/policy_networks.py b/rlzoo/common/policy_networks.py old mode 100644 new mode 100755 index 1724e90..cc3d773 --- a/rlzoo/common/policy_networks.py +++ b/rlzoo/common/policy_networks.py @@ -1,366 +1,366 @@ -""" -Functions for utilization. 
- -# Requirements -tensorflow==2.0.0a0 -tensorlayer==2.0.1 - -""" -import copy -import numpy as np -import tensorlayer as tl -from tensorlayer.models import Model - -from rlzoo.common.basic_nets import * -from rlzoo.common.distributions import make_dist - - -class StochasticContinuousPolicyNetwork(Model): - def __init__(self, state_shape, action_shape, hidden_dim_list, w_init=tf.keras.initializers.glorot_normal(), - activation=tf.nn.relu, output_activation=None, log_std_min=-20, log_std_max=2, trainable=True): - """ - Stochastic continuous policy network with multiple fully-connected layers or convolutional layers (according to state shape) - - :param state_shape: (tuple[int]) shape of the state, for example, (state_dim, ) for single-dimensional state - :param action_shape: (tuple[int]) shape of the action, for example, (action_dim, ) for single-dimensional action - :param hidden_dim_list: (list[int]) a list of dimensions of hidden layers - :param w_init: (callable) weights initialization - :param activation: (callable) activation function - :param output_activation: (callable or None) output activation function - :param log_std_min: (float) lower bound of standard deviation of action - :param log_std_max: (float) upper bound of standard deviation of action - :param trainable: (bool) set training and evaluation mode - """ - - action_dim = action_shape[0] - if len(state_shape) == 1: - with tf.name_scope('MLP'): - state_dim = state_shape[0] - inputs, l = MLP(state_dim, hidden_dim_list, w_init, activation) - else: - with tf.name_scope('CNN'): - inputs, l = CNN(state_shape, conv_kwargs=None) - with tf.name_scope('Output_Mean'): - mean_linear = Dense(n_units=action_dim, act=output_activation, W_init=w_init)(l) - with tf.name_scope('Output_Std'): - log_std_linear = Dense(n_units=action_dim, act=output_activation, W_init=w_init)(l) - log_std_linear = tl.layers.Lambda(lambda x: tf.clip_by_value(x, log_std_min, log_std_max), name='Lambda')( - log_std_linear) - - super().__init__(inputs=inputs, outputs=[mean_linear, log_std_linear]) - if trainable: - self.train() - else: - self.eval() - - -class DeterministicContinuousPolicyNetwork(Model): - def __init__(self, state_shape, action_shape, hidden_dim_list, w_init=tf.keras.initializers.glorot_normal(), \ - activation=tf.nn.relu, output_activation=tf.nn.tanh, trainable=True): - """ - Deterministic continuous policy network with multiple fully-connected layers or convolutional layers (according to state shape) - - :param state_shape: (tuple[int]) shape of the state, for example, (state_dim, ) for single-dimensional state - :param action_shape: (tuple[int]) shape of the action, for example, (action_dim, ) for single-dimensional action - :param hidden_dim_list: (list[int]) a list of dimensions of hidden layers - :param w_init: (callable) weights initialization - :param activation: (callable) activation function - :param output_activation: (callable or None) output activation function - :param trainable: (bool) set training and evaluation mode - """ - - action_dim = action_shape[0] - - if len(state_shape) == 1: - with tf.name_scope('MLP'): - state_dim = state_shape[0] - inputs, l = MLP(state_dim, hidden_dim_list, w_init, activation) - else: - with tf.name_scope('CNN'): - inputs, l = CNN(state_shape, conv_kwargs=None) - - with tf.name_scope('Output'): - outputs = Dense(n_units=action_dim, act=output_activation, W_init=w_init)(l) - - super().__init__(inputs=inputs, outputs=outputs) - if trainable: - self.train() - else: - self.eval() - - -class 
DeterministicPolicyNetwork(Model): - def __init__(self, state_space, action_space, hidden_dim_list, w_init=tf.keras.initializers.glorot_normal(), - activation=tf.nn.relu, output_activation=tf.nn.tanh, trainable=True, name=None): - """ - Deterministic continuous/discrete policy network with multiple fully-connected layers - - :param state_space: (gym.spaces) space of the state from gym environments - :param action_space: (gym.spaces) space of the action from gym environments - :param hidden_dim_list: (list[int]) a list of dimensions of hidden layers - :param w_init: (callable) weights initialization - :param activation: (callable) activation function - :param output_activation: (callable or None) output activation function - :param trainable: (bool) set training and evaluation mode - """ - self._state_space, self._action_space = state_space, action_space - - if isinstance(self._action_space, spaces.Discrete): - self._action_shape = self._action_space.n, - - elif isinstance(self._action_space, spaces.Box): - assert len(self._action_space.shape) == 1 - self._action_shape = self._action_space.shape - - assert all(self._action_space.low < self._action_space.high) - action_bounds = [self._action_space.low, self._action_space.high] - self._action_mean = np.mean(action_bounds, 0) - self._action_scale = action_bounds[1] - self._action_mean - else: - raise NotImplementedError - - obs_inputs, current_layer, self._state_shape = CreateInputLayer(state_space) - - if isinstance(state_space, spaces.Dict): - assert isinstance(obs_inputs, dict) - assert isinstance(current_layer, dict) - self.input_dict = obs_inputs - obs_inputs = list(obs_inputs.values()) - current_layer = tl.layers.Concat(-1)(list(current_layer.values())) - - with tf.name_scope('MLP'): - for i, dim in enumerate(hidden_dim_list): - current_layer = Dense(n_units=dim, act=activation, W_init=w_init, name='hidden_layer%d' % (i + 1))(current_layer) - - with tf.name_scope('Output'): - outputs = Dense(n_units=self._action_shape[0], act=output_activation, W_init=w_init, name='outputs')(current_layer) - - if isinstance(self._action_space, spaces.Discrete): - outputs = tl.layers.Lambda(lambda x: tf.argmax(tf.nn.softmax(x), axis=-1))(outputs) - elif isinstance(self._action_space, spaces.Box): - outputs = tl.layers.Lambda(lambda x: x * self._action_scale + self._action_mean)(outputs) - outputs = tl.layers.Lambda(lambda x: tf.clip_by_value(x, self._action_space.low, - self._action_space.high))(outputs) - - # make model - super().__init__(inputs=obs_inputs, outputs=outputs, name=name) - print('Policy network created') - if trainable: - self.train() - else: - self.eval() - - def __call__(self, states, *args, **kwargs): - if isinstance(self._state_space, spaces.Dict): - states = np.array(states).transpose([1, 0]).tolist() - else: - if np.shape(states)[1:] != self.state_shape: - raise ValueError( - 'Input state shape error. 
shape can be {} but your shape is {}'.format((None,) + self.state_shape, - np.shape(states))) - states = np.array(states, dtype=np.float32) - return super().__call__(states, *args, **kwargs) - - def random_sample(self): - """ generate random actions for exploration """ - - if isinstance(self._action_space, spaces.Discrete): - return np.random.choice(self._action_space.n, 1)[0] - else: - return np.random.uniform(self._action_space.low, self._action_space.high, self._action_shape) - - @property - def state_space(self): - return copy.deepcopy(self._state_space) - - @property - def action_space(self): - return copy.deepcopy(self._action_space) - - @property - def state_shape(self): - return copy.deepcopy(self._state_shape) - - @property - def action_shape(self): - return copy.deepcopy(self._action_shape) - - -class StochasticPolicyNetwork(Model): - def __init__(self, state_space, action_space, hidden_dim_list, w_init=tf.keras.initializers.glorot_normal(), - activation=tf.nn.relu, output_activation=tf.nn.tanh, log_std_min=-20, log_std_max=2, trainable=True, - name=None, state_conditioned=False): - """ - Stochastic continuous/discrete policy network with multiple fully-connected layers - - :param state_space: (gym.spaces) space of the state from gym environments - :param action_space: (gym.spaces) space of the action from gym environments - :param hidden_dim_list: (list[int]) a list of dimensions of hidden layers - :param w_init: (callable) weights initialization - :param activation: (callable) activation function - :param output_activation: (callable or None) output activation function - :param log_std_min: (float) lower bound of standard deviation of action - :param log_std_max: (float) upper bound of standard deviation of action - :param trainable: (bool) set training and evaluation mode - - Tips: We recommend to use tf.nn.tanh for output_activation, especially for continuous action space, \ - to ensure the final action range is exactly the same as declared in action space after action normalization. 
- """ - self._state_space, self._action_space = state_space, action_space - - if isinstance(self._action_space, spaces.Discrete): - self._action_shape = self._action_space.n, - self.policy_dist = make_dist(self._action_space) # create action distribution - elif isinstance(self._action_space, spaces.Box): # normalize action - assert len(self._action_space.shape) == 1 - self._action_shape = self._action_space.shape - - assert all(self._action_space.low < self._action_space.high) - action_bounds = [self._action_space.low, self._action_space.high] - self._action_mean = np.mean(action_bounds, 0) - self._action_scale = action_bounds[1] - self._action_mean - - self.policy_dist = make_dist(self._action_space) # create action distribution - self.policy_dist.action_mean = self._action_mean - self.policy_dist.action_scale = self._action_scale - else: - raise NotImplementedError - - self._state_conditioned = state_conditioned - - obs_inputs, current_layer, self._state_shape = CreateInputLayer(state_space) - - # build structure - if isinstance(state_space, spaces.Dict): - assert isinstance(obs_inputs, dict) - assert isinstance(current_layer, dict) - self.input_dict = obs_inputs - obs_inputs = list(obs_inputs.values()) - current_layer = tl.layers.Concat(-1)(list(current_layer.values())) - - with tf.name_scope('MLP'): - for i, dim in enumerate(hidden_dim_list): - current_layer = Dense(n_units=dim, act=activation, - W_init=w_init, name='hidden_layer%d' % (i + 1))(current_layer) - - with tf.name_scope('Output'): - if isinstance(action_space, spaces.Discrete): - outputs = Dense(n_units=self.policy_dist.ndim, act=output_activation, W_init=w_init)(current_layer) - elif isinstance(action_space, spaces.Box): - mu = Dense(n_units=self.policy_dist.ndim, act=output_activation, W_init=w_init)(current_layer) - - if self._state_conditioned: - log_sigma = Dense(n_units=self.policy_dist.ndim, act=None, W_init=w_init)(current_layer) - log_sigma = tl.layers.Lambda(lambda x: tf.clip_by_value(x, log_std_min, log_std_max))(log_sigma) - outputs = [mu, log_sigma] - else: - outputs = mu - self._log_sigma = tf.Variable(np.zeros(self.policy_dist.ndim, dtype=np.float32)) - else: - raise NotImplementedError - - # make model - super().__init__(inputs=obs_inputs, outputs=outputs, name=name) - if isinstance(self._action_space, spaces.Box) and not self._state_conditioned: - self.trainable_weights.append(self._log_sigma) - - if trainable: - self.train() - else: - self.eval() - - def __call__(self, states, *args, greedy=False, **kwargs): - if isinstance(self._state_space, spaces.Dict): - states = np.array(states).transpose([1, 0]).tolist() - else: - if np.shape(states)[1:] != self.state_shape: - raise ValueError( - 'Input state shape error. 
Shape should be {} but your shape is {}'.format((None,) + self.state_shape, - np.shape(states))) - states = np.array(states, dtype=np.float32) - params = super().__call__(states, *args, **kwargs) - if isinstance(self._action_space, spaces.Box) and not self._state_conditioned: - params = params, self._log_sigma - self.policy_dist.set_param(params) - if greedy: - result = self.policy_dist.greedy_sample() - else: - result = self.policy_dist.sample() - - if isinstance(self._action_space, spaces.Box): # normalize action - if greedy: - result = result * self._action_scale + self._action_mean - else: - result, explore = result - result = result * self._action_scale + self._action_mean + explore - - result = tf.clip_by_value(result, self._action_space.low, self._action_space.high) - return result - - def random_sample(self): - """ generate random actions for exploration """ - - if isinstance(self._action_space, spaces.Discrete): - return np.random.choice(self._action_space.n, 1)[0] - else: - return np.random.uniform(self._action_space.low, self._action_space.high, self._action_shape) - - @property - def state_space(self): - return copy.deepcopy(self._state_space) - - @property - def action_space(self): - return copy.deepcopy(self._action_space) - - @property - def state_shape(self): - return copy.deepcopy(self._state_shape) - - @property - def action_shape(self): - return copy.deepcopy(self._action_shape) - - -if __name__ == '__main__': - import gym - from rlzoo.common.env_wrappers import * - from rlzoo.common.value_networks import * - # EnvName = 'PongNoFrameskip-v4' - # EnvName = 'Pong-v4' - # EnvType = 'atari' - - EnvName = 'CartPole-v0' - # EnvName = 'Pendulum-v0' - EnvType = 'classic_control' - - # EnvName = 'BipedalWalker-v2' - # EnvType = 'box2d' - - # EnvName = 'Ant-v2' - # EnvType = 'mujoco' - - # EnvName = 'FetchPush-v1' - # EnvType = 'robotics' - - # EnvName = 'FishSwim-v0' - # EnvType = 'dm_control' - - # EnvName = 'ReachTarget' - # EnvType = 'rlbench' - # env = build_env(EnvName, EnvType, nenv=2) - - # env = build_env(EnvName, EnvType, state_type='vision', nenv=2) - # env = build_env(EnvName, EnvType, state_type='vision') - env = build_env(EnvName, EnvType) - s = env.reset() - print(s) - - # policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, [64, 64]) - policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, [64, 64]) - a = policy_net([s, s]) - print(a) - # q_net = QNetwork(env.observation_space, env.action_space, [64, 64], state_only=False, dueling=False) - # q = q_net([[s], a]) - print('-'*100) - # print(q) +""" +Functions for utilization. 
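+Policy networks (deterministic and stochastic) for both discrete and continuous action spaces.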
+ +# Requirements +tensorflow==2.0.0a0 +tensorlayer==2.0.1 + +""" +import copy +import numpy as np +import tensorlayer as tl +from tensorlayer.models import Model + +from rlzoo.common.basic_nets import * +from rlzoo.common.distributions import make_dist + + +class StochasticContinuousPolicyNetwork(Model): + def __init__(self, state_shape, action_shape, hidden_dim_list, w_init=tf.keras.initializers.glorot_normal(), + activation=tf.nn.relu, output_activation=None, log_std_min=-20, log_std_max=2, trainable=True): + """ + Stochastic continuous policy network with multiple fully-connected layers or convolutional layers (according to state shape) + + :param state_shape: (tuple[int]) shape of the state, for example, (state_dim, ) for single-dimensional state + :param action_shape: (tuple[int]) shape of the action, for example, (action_dim, ) for single-dimensional action + :param hidden_dim_list: (list[int]) a list of dimensions of hidden layers + :param w_init: (callable) weights initialization + :param activation: (callable) activation function + :param output_activation: (callable or None) output activation function + :param log_std_min: (float) lower bound of standard deviation of action + :param log_std_max: (float) upper bound of standard deviation of action + :param trainable: (bool) set training and evaluation mode + """ + + action_dim = action_shape[0] + if len(state_shape) == 1: + with tf.name_scope('MLP'): + state_dim = state_shape[0] + inputs, l = MLP(state_dim, hidden_dim_list, w_init, activation) + else: + with tf.name_scope('CNN'): + inputs, l = CNN(state_shape, conv_kwargs=None) + with tf.name_scope('Output_Mean'): + mean_linear = Dense(n_units=action_dim, act=output_activation, W_init=w_init)(l) + with tf.name_scope('Output_Std'): + log_std_linear = Dense(n_units=action_dim, act=output_activation, W_init=w_init)(l) + log_std_linear = tl.layers.Lambda(lambda x: tf.clip_by_value(x, log_std_min, log_std_max), name='Lambda')( + log_std_linear) + + super().__init__(inputs=inputs, outputs=[mean_linear, log_std_linear]) + if trainable: + self.train() + else: + self.eval() + + +class DeterministicContinuousPolicyNetwork(Model): + def __init__(self, state_shape, action_shape, hidden_dim_list, w_init=tf.keras.initializers.glorot_normal(), \ + activation=tf.nn.relu, output_activation=tf.nn.tanh, trainable=True): + """ + Deterministic continuous policy network with multiple fully-connected layers or convolutional layers (according to state shape) + + :param state_shape: (tuple[int]) shape of the state, for example, (state_dim, ) for single-dimensional state + :param action_shape: (tuple[int]) shape of the action, for example, (action_dim, ) for single-dimensional action + :param hidden_dim_list: (list[int]) a list of dimensions of hidden layers + :param w_init: (callable) weights initialization + :param activation: (callable) activation function + :param output_activation: (callable or None) output activation function + :param trainable: (bool) set training and evaluation mode + """ + + action_dim = action_shape[0] + + if len(state_shape) == 1: + with tf.name_scope('MLP'): + state_dim = state_shape[0] + inputs, l = MLP(state_dim, hidden_dim_list, w_init, activation) + else: + with tf.name_scope('CNN'): + inputs, l = CNN(state_shape, conv_kwargs=None) + + with tf.name_scope('Output'): + outputs = Dense(n_units=action_dim, act=output_activation, W_init=w_init)(l) + + super().__init__(inputs=inputs, outputs=outputs) + if trainable: + self.train() + else: + self.eval() + + +class 
DeterministicPolicyNetwork(Model): + def __init__(self, state_space, action_space, hidden_dim_list, w_init=tf.keras.initializers.glorot_normal(), + activation=tf.nn.relu, output_activation=tf.nn.tanh, trainable=True, name=None): + """ + Deterministic continuous/discrete policy network with multiple fully-connected layers + + :param state_space: (gym.spaces) space of the state from gym environments + :param action_space: (gym.spaces) space of the action from gym environments + :param hidden_dim_list: (list[int]) a list of dimensions of hidden layers + :param w_init: (callable) weights initialization + :param activation: (callable) activation function + :param output_activation: (callable or None) output activation function + :param trainable: (bool) set training and evaluation mode + """ + self._state_space, self._action_space = state_space, action_space + + if isinstance(self._action_space, spaces.Discrete): + self._action_shape = self._action_space.n, + + elif isinstance(self._action_space, spaces.Box): + assert len(self._action_space.shape) == 1 + self._action_shape = self._action_space.shape + + assert all(self._action_space.low < self._action_space.high) + action_bounds = [self._action_space.low, self._action_space.high] + self._action_mean = np.mean(action_bounds, 0) + self._action_scale = action_bounds[1] - self._action_mean + else: + raise NotImplementedError + + obs_inputs, current_layer, self._state_shape = CreateInputLayer(state_space) + + if isinstance(state_space, spaces.Dict): + assert isinstance(obs_inputs, dict) + assert isinstance(current_layer, dict) + self.input_dict = obs_inputs + obs_inputs = list(obs_inputs.values()) + current_layer = tl.layers.Concat(-1)(list(current_layer.values())) + + with tf.name_scope('MLP'): + for i, dim in enumerate(hidden_dim_list): + current_layer = Dense(n_units=dim, act=activation, W_init=w_init, name='hidden_layer%d' % (i + 1))(current_layer) + + with tf.name_scope('Output'): + outputs = Dense(n_units=self._action_shape[0], act=output_activation, W_init=w_init, name='outputs')(current_layer) + + if isinstance(self._action_space, spaces.Discrete): + outputs = tl.layers.Lambda(lambda x: tf.argmax(tf.nn.softmax(x), axis=-1))(outputs) + elif isinstance(self._action_space, spaces.Box): + outputs = tl.layers.Lambda(lambda x: x * self._action_scale + self._action_mean)(outputs) + outputs = tl.layers.Lambda(lambda x: tf.clip_by_value(x, self._action_space.low, + self._action_space.high))(outputs) + + # make model + super().__init__(inputs=obs_inputs, outputs=outputs, name=name) + print('Policy network created') + if trainable: + self.train() + else: + self.eval() + + def __call__(self, states, *args, **kwargs): + if isinstance(self._state_space, spaces.Dict): + states = np.array(states).transpose([1, 0]).tolist() + else: + if np.shape(states)[1:] != self.state_shape: + raise ValueError( + 'Input state shape error. 
shape can be {} but your shape is {}'.format((None,) + self.state_shape, + np.shape(states))) + states = np.array(states, dtype=np.float32) + return super().__call__(states, *args, **kwargs) + + def random_sample(self): + """ generate random actions for exploration """ + + if isinstance(self._action_space, spaces.Discrete): + return np.random.choice(self._action_space.n, 1)[0] + else: + return np.random.uniform(self._action_space.low, self._action_space.high, self._action_shape) + + @property + def state_space(self): + return copy.deepcopy(self._state_space) + + @property + def action_space(self): + return copy.deepcopy(self._action_space) + + @property + def state_shape(self): + return copy.deepcopy(self._state_shape) + + @property + def action_shape(self): + return copy.deepcopy(self._action_shape) + + +class StochasticPolicyNetwork(Model): + def __init__(self, state_space, action_space, hidden_dim_list, w_init=tf.keras.initializers.glorot_normal(), + activation=tf.nn.relu, output_activation=tf.nn.tanh, log_std_min=-20, log_std_max=2, trainable=True, + name=None, state_conditioned=False): + """ + Stochastic continuous/discrete policy network with multiple fully-connected layers + + :param state_space: (gym.spaces) space of the state from gym environments + :param action_space: (gym.spaces) space of the action from gym environments + :param hidden_dim_list: (list[int]) a list of dimensions of hidden layers + :param w_init: (callable) weights initialization + :param activation: (callable) activation function + :param output_activation: (callable or None) output activation function + :param log_std_min: (float) lower bound of standard deviation of action + :param log_std_max: (float) upper bound of standard deviation of action + :param trainable: (bool) set training and evaluation mode + + Tips: We recommend to use tf.nn.tanh for output_activation, especially for continuous action space, \ + to ensure the final action range is exactly the same as declared in action space after action normalization. 
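+        :param name: (str) name the model
+        :param state_conditioned: (bool) if True, predict the log-std from the state; if False (default), learn a single state-independent log-std variable for Box action spaces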
+ """ + self._state_space, self._action_space = state_space, action_space + + if isinstance(self._action_space, spaces.Discrete): + self._action_shape = self._action_space.n, + self.policy_dist = make_dist(self._action_space) # create action distribution + elif isinstance(self._action_space, spaces.Box): # normalize action + assert len(self._action_space.shape) == 1 + self._action_shape = self._action_space.shape + + assert all(self._action_space.low < self._action_space.high) + action_bounds = [self._action_space.low, self._action_space.high] + self._action_mean = np.mean(action_bounds, 0) + self._action_scale = action_bounds[1] - self._action_mean + + self.policy_dist = make_dist(self._action_space) # create action distribution + self.policy_dist.action_mean = self._action_mean + self.policy_dist.action_scale = self._action_scale + else: + raise NotImplementedError + + self._state_conditioned = state_conditioned + + obs_inputs, current_layer, self._state_shape = CreateInputLayer(state_space) + + # build structure + if isinstance(state_space, spaces.Dict): + assert isinstance(obs_inputs, dict) + assert isinstance(current_layer, dict) + self.input_dict = obs_inputs + obs_inputs = list(obs_inputs.values()) + current_layer = tl.layers.Concat(-1)(list(current_layer.values())) + + with tf.name_scope('MLP'): + for i, dim in enumerate(hidden_dim_list): + current_layer = Dense(n_units=dim, act=activation, + W_init=w_init, name='hidden_layer%d' % (i + 1))(current_layer) + + with tf.name_scope('Output'): + if isinstance(action_space, spaces.Discrete): + outputs = Dense(n_units=self.policy_dist.ndim, act=output_activation, W_init=w_init)(current_layer) + elif isinstance(action_space, spaces.Box): + mu = Dense(n_units=self.policy_dist.ndim, act=output_activation, W_init=w_init)(current_layer) + + if self._state_conditioned: + log_sigma = Dense(n_units=self.policy_dist.ndim, act=None, W_init=w_init)(current_layer) + log_sigma = tl.layers.Lambda(lambda x: tf.clip_by_value(x, log_std_min, log_std_max))(log_sigma) + outputs = [mu, log_sigma] + else: + outputs = mu + self._log_sigma = tf.Variable(np.zeros(self.policy_dist.ndim, dtype=np.float32)) + else: + raise NotImplementedError + + # make model + super().__init__(inputs=obs_inputs, outputs=outputs, name=name) + if isinstance(self._action_space, spaces.Box) and not self._state_conditioned: + self.trainable_weights.append(self._log_sigma) + + if trainable: + self.train() + else: + self.eval() + + def __call__(self, states, *args, greedy=False, **kwargs): + if isinstance(self._state_space, spaces.Dict): + states = np.array(states).transpose([1, 0]).tolist() + else: + if np.shape(states)[1:] != self.state_shape: + raise ValueError( + 'Input state shape error. 
Shape should be {} but your shape is {}'.format((None,) + self.state_shape, + np.shape(states))) + states = np.array(states, dtype=np.float32) + params = super().__call__(states, *args, **kwargs) + if isinstance(self._action_space, spaces.Box) and not self._state_conditioned: + params = params, self._log_sigma + self.policy_dist.set_param(params) + if greedy: + result = self.policy_dist.greedy_sample() + else: + result = self.policy_dist.sample() + + if isinstance(self._action_space, spaces.Box): # normalize action + if greedy: + result = result * self._action_scale + self._action_mean + else: + result, explore = result + result = result * self._action_scale + self._action_mean + explore + + result = tf.clip_by_value(result, self._action_space.low, self._action_space.high) + return result + + def random_sample(self): + """ generate random actions for exploration """ + + if isinstance(self._action_space, spaces.Discrete): + return np.random.choice(self._action_space.n, 1)[0] + else: + return np.random.uniform(self._action_space.low, self._action_space.high, self._action_shape) + + @property + def state_space(self): + return copy.deepcopy(self._state_space) + + @property + def action_space(self): + return copy.deepcopy(self._action_space) + + @property + def state_shape(self): + return copy.deepcopy(self._state_shape) + + @property + def action_shape(self): + return copy.deepcopy(self._action_shape) + + +if __name__ == '__main__': + import gym + from rlzoo.common.env_wrappers import * + from rlzoo.common.value_networks import * + # EnvName = 'PongNoFrameskip-v4' + # EnvName = 'Pong-v4' + # EnvType = 'atari' + + EnvName = 'CartPole-v0' + # EnvName = 'Pendulum-v0' + EnvType = 'classic_control' + + # EnvName = 'BipedalWalker-v2' + # EnvType = 'box2d' + + # EnvName = 'Ant-v2' + # EnvType = 'mujoco' + + # EnvName = 'FetchPush-v1' + # EnvType = 'robotics' + + # EnvName = 'FishSwim-v0' + # EnvType = 'dm_control' + + # EnvName = 'ReachTarget' + # EnvType = 'rlbench' + # env = build_env(EnvName, EnvType, nenv=2) + + # env = build_env(EnvName, EnvType, state_type='vision', nenv=2) + # env = build_env(EnvName, EnvType, state_type='vision') + env = build_env(EnvName, EnvType) + s = env.reset() + print(s) + + # policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, [64, 64]) + policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, [64, 64]) + a = policy_net([s, s]) + print(a) + # q_net = QNetwork(env.observation_space, env.action_space, [64, 64], state_only=False, dueling=False) + # q = q_net([[s], a]) + print('-'*100) + # print(q) diff --git a/rlzoo/common/utils.py b/rlzoo/common/utils.py old mode 100644 new mode 100755 index 9b1ad7b..fc68c4d --- a/rlzoo/common/utils.py +++ b/rlzoo/common/utils.py @@ -1,145 +1,145 @@ -""" -Functions for utilization. 
- -# Requirements -tensorflow==2.0.0a0 -tensorlayer==2.0.1 - -""" -import os -import re - -import gym -import matplotlib.pyplot as plt -import numpy as np -import tensorlayer as tl -import tensorflow as tf -from importlib import import_module - - -def plot(episode_rewards, algorithm_name, env_name): - """ - plot the learning curve, saved as ./img/algorithm_name-env_name.png - - :param episode_rewards: array of floats - :param algorithm_name: string - :param env_name: string - """ - path = os.path.join('.', 'img') - name = algorithm_name + '-' + env_name - plt.figure(figsize=(10, 5)) - plt.title(name) - plt.plot(np.arange(len(episode_rewards)), episode_rewards) - plt.xlabel('Episode') - plt.ylabel('Episode Reward') - if not os.path.exists(path): - os.makedirs(path) - plt.savefig(os.path.join(path, name + '.png')) - plt.close() - - -def plot_save_log(episode_rewards, algorithm_name, env_name): - """ - plot the learning curve, saved as ./img/algorithm_name-env_name.png, - and save the rewards log as ./log/algorithm_name-env_name.npy - - :param episode_rewards: array of floats - :param algorithm_name: string - :param env_name: string - """ - path = os.path.join('.', 'log') - name = algorithm_name + '-' + env_name - plot(episode_rewards, algorithm_name, env_name) - if not os.path.exists(path): - os.makedirs(path) - np.save(os.path.join(path, name), episode_rewards) - - -def save_model(model, model_name, algorithm_name, env_name): - """ - save trained neural network model - - :param model: tensorlayer.models.Model - :param model_name: string, e.g. 'model_sac_q1' - :param algorithm_name: string, e.g. 'SAC' - """ - name = algorithm_name + '-' + env_name - path = os.path.join('.', 'model', name) - if not os.path.exists(path): - os.makedirs(path) - tl.files.save_npz(model.trainable_weights, os.path.join(path, model_name)) - - -def load_model(model, model_name, algorithm_name, env_name): - """ - load saved neural network model - - :param model: tensorlayer.models.Model - :param model_name: string, e.g. 'model_sac_q1' - :param algorithm_name: string, e.g. 
'SAC' - """ - name = algorithm_name + '-' + env_name - path = os.path.join('.', 'model', name) - try: - param = tl.files.load_npz(path, model_name + '.npz') - for p0, p1 in zip(model.trainable_weights, param): - p0.assign(p1) - except Exception as e: - print('Load Model Fails!') - raise e - - -def parse_all_args(parser): - """ Parse known and unknown args """ - common_options, other_args = parser.parse_known_args() - other_options = dict() - index = 0 - n = len(other_args) - float_pattern = re.compile(r'^[-+]?[-0-9]\d*\.\d*|[-+]?\.?[0-9]\d*$') - while index < n: # only str, int and float type will be parsed - if other_args[index].startswith('--'): - if other_args[index].__contains__('='): - key, value = other_args[index].split('=') - index += 1 - else: - key, value = other_args[index:index + 2] - index += 2 - if re.match(float_pattern, value): - value = float(value) - if value.is_integer(): - value = int(value) - other_options[key[2:]] = value - return common_options, other_options - - -def make_env(env_id): - env = gym.make(env_id).unwrapped - """ add env wrappers here """ - return env - - -def get_algorithm_module(algorithm, submodule): - """ Get algorithm module in the corresponding folder """ - return import_module('.'.join(['rlzoo', 'algorithms', algorithm, submodule])) - - -def call_default_params(env, envtype, alg, default_seed=True): - """ Get the default parameters for training from the default script """ - alg = alg.lower() - default = import_module('.'.join(['rlzoo', 'algorithms', alg, 'default'])) - params = getattr(default, envtype)(env, - default_seed) # need manually set seed in the main script if default_seed = False - return params - - -def set_seed(seed, env=None): - """ set random seed for reproduciblity """ - if isinstance(env, list): - assert isinstance(seed, list) - for i in range(len(env)): - env[i].seed(seed[i]) - seed = seed[0] # pick one seed for np and tf - elif env is not None: - env.seed(seed) - np.random.seed(seed) - tf.random.set_seed(seed) +""" +Functions for utilization. 
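+Includes plotting and logging helpers, model saving/loading, argument parsing, and random seeding.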
+ +# Requirements +tensorflow==2.0.0a0 +tensorlayer==2.0.1 + +""" +import os +import re + +import gym +import matplotlib.pyplot as plt +import numpy as np +import tensorlayer as tl +import tensorflow as tf +from importlib import import_module + + +def plot(episode_rewards, algorithm_name, env_name): + """ + plot the learning curve, saved as ./img/algorithm_name-env_name.png + + :param episode_rewards: array of floats + :param algorithm_name: string + :param env_name: string + """ + path = os.path.join('.', 'img') + name = algorithm_name + '-' + env_name + plt.figure(figsize=(10, 5)) + plt.title(name) + plt.plot(np.arange(len(episode_rewards)), episode_rewards) + plt.xlabel('Episode') + plt.ylabel('Episode Reward') + if not os.path.exists(path): + os.makedirs(path) + plt.savefig(os.path.join(path, name + '.png')) + plt.close() + + +def plot_save_log(episode_rewards, algorithm_name, env_name): + """ + plot the learning curve, saved as ./img/algorithm_name-env_name.png, + and save the rewards log as ./log/algorithm_name-env_name.npy + + :param episode_rewards: array of floats + :param algorithm_name: string + :param env_name: string + """ + path = os.path.join('.', 'log') + name = algorithm_name + '-' + env_name + plot(episode_rewards, algorithm_name, env_name) + if not os.path.exists(path): + os.makedirs(path) + np.save(os.path.join(path, name), episode_rewards) + + +def save_model(model, model_name, algorithm_name, env_name): + """ + save trained neural network model + + :param model: tensorlayer.models.Model + :param model_name: string, e.g. 'model_sac_q1' + :param algorithm_name: string, e.g. 'SAC' + """ + name = algorithm_name + '-' + env_name + path = os.path.join('.', 'model', name) + if not os.path.exists(path): + os.makedirs(path) + tl.files.save_npz(model.trainable_weights, os.path.join(path, model_name)) + + +def load_model(model, model_name, algorithm_name, env_name): + """ + load saved neural network model + + :param model: tensorlayer.models.Model + :param model_name: string, e.g. 'model_sac_q1' + :param algorithm_name: string, e.g. 
'SAC' + """ + name = algorithm_name + '-' + env_name + path = os.path.join('.', 'model', name) + try: + param = tl.files.load_npz(path, model_name + '.npz') + for p0, p1 in zip(model.trainable_weights, param): + p0.assign(p1) + except Exception as e: + print('Load Model Fails!') + raise e + + +def parse_all_args(parser): + """ Parse known and unknown args """ + common_options, other_args = parser.parse_known_args() + other_options = dict() + index = 0 + n = len(other_args) + float_pattern = re.compile(r'^[-+]?[-0-9]\d*\.\d*|[-+]?\.?[0-9]\d*$') + while index < n: # only str, int and float type will be parsed + if other_args[index].startswith('--'): + if other_args[index].__contains__('='): + key, value = other_args[index].split('=') + index += 1 + else: + key, value = other_args[index:index + 2] + index += 2 + if re.match(float_pattern, value): + value = float(value) + if value.is_integer(): + value = int(value) + other_options[key[2:]] = value + return common_options, other_options + + +def make_env(env_id): + env = gym.make(env_id).unwrapped + """ add env wrappers here """ + return env + + +def get_algorithm_module(algorithm, submodule): + """ Get algorithm module in the corresponding folder """ + return import_module('.'.join(['rlzoo', 'algorithms', algorithm, submodule])) + + +def call_default_params(env, envtype, alg, default_seed=True): + """ Get the default parameters for training from the default script """ + alg = alg.lower() + default = import_module('.'.join(['rlzoo', 'algorithms', alg, 'default'])) + params = getattr(default, envtype)(env, + default_seed) # need manually set seed in the main script if default_seed = False + return params + + +def set_seed(seed, env=None): + """ set random seed for reproduciblity """ + if isinstance(env, list): + assert isinstance(seed, list) + for i in range(len(env)): + env[i].seed(seed[i]) + seed = seed[0] # pick one seed for np and tf + elif env is not None: + env.seed(seed) + np.random.seed(seed) + tf.random.set_seed(seed) diff --git a/rlzoo/common/value_networks.py b/rlzoo/common/value_networks.py old mode 100644 new mode 100755 index 2e787c9..c73dde6 --- a/rlzoo/common/value_networks.py +++ b/rlzoo/common/value_networks.py @@ -1,386 +1,386 @@ -""" -Functions for utilization. 
- -# Requirements -tensorflow==2.0.0a0 -tensorlayer==2.0.1 - -""" -import copy - -import numpy as np -import tensorlayer as tl -from tensorlayer.layers import BatchNorm, Dense, Input -from tensorlayer.models import Model - -from rlzoo.common.basic_nets import * - - -class ValueNetwork(Model): - def __init__(self, state_space, hidden_dim_list, w_init=tf.keras.initializers.glorot_normal(), - activation=tf.nn.relu, output_activation=None, trainable=True, name=None): - """ - Value network with multiple fully-connected layers or convolutional layers (according to state shape) - - :param state_space: (gym.spaces) space of the state from gym environments - :param hidden_dim_list: (list[int]) a list of dimensions of hidden layers - :param w_init: (callable) weights initialization - :param activation: (callable) activation function - :param output_activation: (callable or None) output activation function - :param trainable: (bool) set training and evaluation mode - """ - self._state_space = state_space - - obs_inputs, current_layer, self._state_shape = CreateInputLayer(state_space) - - if isinstance(state_space, spaces.Dict): - assert isinstance(obs_inputs, OrderedDict) - assert isinstance(current_layer, OrderedDict) - self.input_dict = obs_inputs - obs_inputs = list(obs_inputs.values()) - current_layer = tl.layers.Concat(-1)(list(current_layer.values())) - - with tf.name_scope('MLP'): - for i, dim in enumerate(hidden_dim_list): - current_layer = Dense(n_units=dim, act=activation, W_init=w_init, name='hidden_layer%d' % (i + 1))( - current_layer) - - with tf.name_scope('Output'): - outputs = Dense(n_units=1, act=output_activation, W_init=w_init)(current_layer) - - super().__init__(inputs=obs_inputs, outputs=outputs, name=name) - if trainable: - self.train() - else: - self.eval() - - def __call__(self, states, *args, **kwargs): - if isinstance(self._state_space, spaces.Dict): - states = np.array(states).transpose([1, 0]).tolist() - else: - if np.shape(states)[1:] != self.state_shape: - raise ValueError( - 'Input state shape error. 
Shape can be {} but your shape is {}'.format((None,) + self.state_shape, - np.shape(states))) - states = np.array(states, dtype=np.float32) - return super().__call__(states, *args, **kwargs) - - @property - def state_space(self): - return copy.deepcopy(self._state_space) - - @property - def state_shape(self): - return copy.deepcopy(self._state_shape) - - -class MlpQNetwork(Model): - def __init__(self, state_shape, action_shape, hidden_dim_list, \ - w_init=tf.keras.initializers.glorot_normal(), activation=tf.nn.relu, output_activation=None, - trainable=True): - """ - Q-value network with multiple fully-connected layers - - Inputs: (state tensor, action tensor) - - :param state_shape: (tuple[int]) shape of the state, for example, (state_dim, ) for single-dimensional state - :param action_shape: (tuple[int]) shape of the action, for example, (action_dim, ) for single-dimensional action - :param hidden_dim_list: (list[int]) a list of dimensions of hidden layers - :param w_init: (callable) weights initialization - :param activation: (callable) activation function - :param output_activation: (callable or None) output activation function - :param trainable: (bool) set training and evaluation mode - """ - - input_shape = tuple(map(sum, zip(action_shape, state_shape))) - input_dim = input_shape[0] - - assert len(state_shape) == 1 - with tf.name_scope('MLP'): - inputs, l = MLP(input_dim, hidden_dim_list, w_init, activation) - - with tf.name_scope('Output'): - outputs = Dense(n_units=1, act=output_activation, W_init=w_init)(l) - - super().__init__(inputs=inputs, outputs=outputs) - if trainable: - self.train() - else: - self.eval() - - -class QNetwork(Model): - def __init__(self, state_space, action_space, hidden_dim_list, - w_init=tf.keras.initializers.glorot_normal(), activation=tf.nn.relu, output_activation=None, - trainable=True, name=None, state_only=False, dueling=False): - """ Q-value network with multiple fully-connected layers or convolutional layers (according to state shape) - - :param state_space: (gym.spaces) space of the state from gym environments - :param action_space: (gym.spaces) space of the action from gym environments - :param hidden_dim_list: (list[int]) a list of dimensions of hidden layers - :param w_init: (callable) weights initialization - :param activation: (callable) activation function - :param output_activation: (callable or None) output activation function - :param trainable: (bool) set training and evaluation mode - :param name: (str) name the model - :param state_only: (bool) only input state or not, available in discrete action space - :param dueling: (bool) whether use the dueling output or not, available in discrete action space - """ - self._state_space, self._action_space = state_space, action_space - self.state_only = state_only - self.dueling = dueling - - # create state input layer - obs_inputs, current_layer, self._state_shape = CreateInputLayer(state_space) - - # create action input layer - if isinstance(self._action_space, spaces.Discrete): - self._action_shape = self._action_space.n, - if not self.state_only: - act_inputs = Input((None,), name='Act_Input_Layer', dtype=tf.int64) - elif isinstance(self._action_space, spaces.Box): - self._action_shape = self._action_space.shape - assert len(self._action_shape) == 1 - act_inputs = Input((None,) + self._action_shape, name='Act_Input_Layer') - else: - raise NotImplementedError - - # concat multi-head state - if isinstance(state_space, spaces.Dict): - assert isinstance(obs_inputs, dict) - assert 
isinstance(current_layer, dict) - self.input_dict = obs_inputs - obs_inputs = list(obs_inputs.values()) - current_layer = tl.layers.Concat(-1)(list(current_layer.values())) - - if isinstance(self._action_space, spaces.Box): - current_layer = tl.layers.Concat(-1)([current_layer, act_inputs]) - - with tf.name_scope('QNet_MLP'): - for i, dim in enumerate(hidden_dim_list): - current_layer = Dense(n_units=dim, act=activation, W_init=w_init, - name='mlp_hidden_layer%d' % (i + 1))(current_layer) - - with tf.name_scope('Outputs'): - if isinstance(self._action_space, spaces.Discrete): - if self.dueling: - v = Dense(1, None, tf.initializers.Orthogonal(1.0))(current_layer) - q = Dense(n_units=self._action_shape[0], act=output_activation, W_init=w_init)( - current_layer) - mean_q = tl.layers.Lambda(lambda x: tf.reduce_mean(x, 1, True))(q) - current_layer = tl.layers.Lambda(lambda x: x[0] + x[1] - x[2])((v, q, mean_q)) - else: - current_layer = Dense(n_units=self._action_shape[0], act=output_activation, W_init=w_init)( - current_layer) - - if not self.state_only: - act_one_hot = tl.layers.OneHot(depth=self._action_shape[0], axis=1)( - act_inputs) # discrete action choice to one-hot vector - outputs = tl.layers.Lambda( - lambda x: tf.reduce_sum(tf.reduce_prod(x, axis=0), axis=1))((current_layer, act_one_hot)) - else: - outputs = current_layer - - elif isinstance(self._action_space, spaces.Box): - outputs = Dense(n_units=1, act=output_activation, W_init=w_init)(current_layer) - else: - raise ValueError("State Shape Not Accepted!") - - if isinstance(state_space, spaces.Dict): - if self.state_only: - super().__init__(inputs=obs_inputs, outputs=outputs, name=name) - else: - super().__init__(inputs=obs_inputs + [act_inputs], outputs=outputs, name=name) - else: - if self.state_only: - super().__init__(inputs=obs_inputs, outputs=outputs, name=name) - else: - super().__init__(inputs=[obs_inputs, act_inputs], outputs=outputs, name=name) - if trainable: - self.train() - else: - self.eval() - - def __call__(self, inputs, *args, **kwargs): - if self.state_only: - states = inputs - else: - states, actions = inputs - - # states and actions must have the same length - if not self.state_only and len(states) != len(actions): - raise ValueError( - 'Length of states and actions not match. States length is {} but actions length is {}'.format( - len(states), - len(actions))) - - if isinstance(self._state_space, spaces.Dict): - states = np.array(states).transpose([1, 0]).tolist() # batch states to multi-head - ssv = list(self._state_shape.values()) - # check state shape - for i, each_head in enumerate(states): - if np.shape(each_head)[1:] != ssv[i]: - raise ValueError('Input state shape error.') - - else: - if np.shape(states)[1:] != self.state_shape: - raise ValueError( - 'Input state shape error. 
Shape can be {} but your shape is {}'.format((None,) + self.state_shape, - np.shape(states))) - states = np.array(states, dtype=np.float32) - - if not self.state_only: - if isinstance(self._action_space, spaces.Discrete) and np.any(actions % 1): - raise ValueError('Input float actions in discrete action space') - if isinstance(self._action_space, spaces.Discrete): - actions = tf.convert_to_tensor(actions, dtype=tf.int64) - elif isinstance(self._action_space, spaces.Box): - actions = tf.convert_to_tensor(actions, dtype=tf.float32) - if isinstance(self._state_space, spaces.Dict): - return super().__call__(states + [actions], *args, **kwargs) - else: - return super().__call__([states, actions], *args, **kwargs) - else: - return super().__call__(states, *args, **kwargs) - - - @property - def state_space(self): - return copy.deepcopy(self._state_space) - - @property - def action_space(self): - return copy.deepcopy(self._action_space) - - @property - def state_shape(self): - return copy.deepcopy(self._state_shape) - - @property - def action_shape(self): - return copy.deepcopy(self._action_shape) - - -class NAFLayer(tl.layers.Layer): - def __init__(self, action_dim, name=None): - super(NAFLayer, self).__init__(name) - self.action_dim = action_dim - - def forward(self, inputs): - L, u, mu, value = inputs - pivot = 0 - rows = [] - for idx in range(self.action_dim): - offset = self.action_dim - idx - diag = tf.exp(tf.slice(L, (0, pivot), (-1, 1))) - nondiag = tf.slice(L, (0, pivot + 1), (-1, offset - 1)) - row = tf.pad(tf.concat([diag, nondiag], 1), ((0, 0), (idx, 0))) - pivot += offset - rows.append(row) - L_T = tf.stack(rows, axis=1) - P = tf.matmul(tf.transpose(L_T, (0, 2, 1)), L_T) # L L^T - temp = tf.expand_dims(u - mu, -1) - adv = tf.squeeze(-0.5 * tf.matmul(tf.transpose(temp, [0, 2, 1]), tf.matmul(P, temp)), -1) - return adv + value - - def build(self, inputs_shape=None): - pass - - -class NAFQNetwork(Model): - def __init__(self, state_space, action_space, hidden_dim_list, - w_init=tf.keras.initializers.glorot_normal(), activation=tf.nn.tanh, trainable=True, name=None): - """ NAF Q-value network with multiple fully-connected layers - - :param state_space: (gym.spaces) space of the state from gym environments - :param action_space: (gym.spaces) space of the action from gym environments - :param hidden_dim_list: (list[int]) a list of dimensions of hidden layers - :param w_init: (callable) weights initialization - :param activation: (callable) activation function - :param trainable: (bool) set training and evaluation mode - :param name: (str) name the model - """ - assert isinstance(action_space, spaces.Box) - self._state_space, self._action_space = state_space, action_space - self._action_shape = self._action_space.shape - assert len(self._action_shape) == 1 - act_inputs = Input((None,) + self._action_shape, name='Act_Input_Layer') - - # create state input layer - obs_inputs, current_layer, self._state_shape = CreateInputLayer(state_space) - - # concat multi-head state - if isinstance(state_space, spaces.Dict): - assert isinstance(obs_inputs, dict) - assert isinstance(current_layer, dict) - self.input_dict = obs_inputs - obs_inputs = list(obs_inputs.values()) - current_layer = tl.layers.Concat(-1)(list(current_layer.values())) - - # calculate value - current_layer = BatchNorm()(current_layer) - with tf.name_scope('NAF_VALUE_MLP'): - for i, dim in enumerate(hidden_dim_list): - current_layer = Dense(n_units=dim, act=activation, W_init=w_init, - name='mlp_hidden_layer%d' % (i + 1))(current_layer) 
- value = Dense(n_units=1, W_init=w_init, name='naf_value_mlp_output')(current_layer) - - # calculate advantange and Q-value - dim = self._action_shape[0] - with tf.name_scope('NAF_ADVANTAGE'): - mu = Dense(n_units=dim, act=activation, W_init=w_init, name='mu')(current_layer) - L = Dense(n_units=int((dim * (dim + 1)) / 2), W_init=w_init, name='L')(current_layer) - qvalue = NAFLayer(dim)([L, act_inputs, mu, value]) - - super().__init__(inputs=[obs_inputs, act_inputs], outputs=qvalue, name=name) - if trainable: - self.train() - else: - self.eval() - - def __call__(self, inputs, *args, **kwargs): - states, actions = inputs - - # states and actions must have the same length - if len(states) != len(actions): - raise ValueError( - 'Length of states and actions not match. States length is {} but actions length is {}'.format( - len(states), - len(actions))) - - if isinstance(self._state_space, spaces.Dict): - states = np.array(states).transpose([1, 0]).tolist() # batch states to multi-head - ssv = list(self._state_shape.values()) - # check state shape - for i, each_head in enumerate(states): - if np.shape(each_head)[1:] != ssv[i]: - raise ValueError('Input state shape error.') - - else: - if np.shape(states)[1:] != self.state_shape: - raise ValueError( - 'Input state shape error. Shape can be {} but your shape is {}'.format((None,) + self.state_shape, - np.shape(states))) - states = np.array(states, dtype=np.float32) - - actions = tf.convert_to_tensor(actions, dtype=tf.float32) - if isinstance(self._state_space, spaces.Dict): - return super().__call__(states + [actions], *args, **kwargs) - else: - return super().__call__([states, actions], *args, **kwargs) - - @property - def state_space(self): - return copy.deepcopy(self._state_space) - - @property - def action_space(self): - return copy.deepcopy(self._action_space) - - @property - def state_shape(self): - return copy.deepcopy(self._state_shape) - - @property - def action_shape(self): - return copy.deepcopy(self._action_shape) +""" +Functions for utilization. 
+ +# Requirements +tensorflow==2.0.0a0 +tensorlayer==2.0.1 + +""" +import copy + +import numpy as np +import tensorlayer as tl +from tensorlayer.layers import BatchNorm, Dense, Input +from tensorlayer.models import Model + +from rlzoo.common.basic_nets import * + + +class ValueNetwork(Model): + def __init__(self, state_space, hidden_dim_list, w_init=tf.keras.initializers.glorot_normal(), + activation=tf.nn.relu, output_activation=None, trainable=True, name=None): + """ + Value network with multiple fully-connected layers or convolutional layers (according to state shape) + + :param state_space: (gym.spaces) space of the state from gym environments + :param hidden_dim_list: (list[int]) a list of dimensions of hidden layers + :param w_init: (callable) weights initialization + :param activation: (callable) activation function + :param output_activation: (callable or None) output activation function + :param trainable: (bool) set training and evaluation mode + """ + self._state_space = state_space + + obs_inputs, current_layer, self._state_shape = CreateInputLayer(state_space) + + if isinstance(state_space, spaces.Dict): + assert isinstance(obs_inputs, OrderedDict) + assert isinstance(current_layer, OrderedDict) + self.input_dict = obs_inputs + obs_inputs = list(obs_inputs.values()) + current_layer = tl.layers.Concat(-1)(list(current_layer.values())) + + with tf.name_scope('MLP'): + for i, dim in enumerate(hidden_dim_list): + current_layer = Dense(n_units=dim, act=activation, W_init=w_init, name='hidden_layer%d' % (i + 1))( + current_layer) + + with tf.name_scope('Output'): + outputs = Dense(n_units=1, act=output_activation, W_init=w_init)(current_layer) + + super().__init__(inputs=obs_inputs, outputs=outputs, name=name) + if trainable: + self.train() + else: + self.eval() + + def __call__(self, states, *args, **kwargs): + if isinstance(self._state_space, spaces.Dict): + states = np.array(states).transpose([1, 0]).tolist() + else: + if np.shape(states)[1:] != self.state_shape: + raise ValueError( + 'Input state shape error. 
Shape can be {} but your shape is {}'.format((None,) + self.state_shape, + np.shape(states))) + states = np.array(states, dtype=np.float32) + return super().__call__(states, *args, **kwargs) + + @property + def state_space(self): + return copy.deepcopy(self._state_space) + + @property + def state_shape(self): + return copy.deepcopy(self._state_shape) + + +class MlpQNetwork(Model): + def __init__(self, state_shape, action_shape, hidden_dim_list, \ + w_init=tf.keras.initializers.glorot_normal(), activation=tf.nn.relu, output_activation=None, + trainable=True): + """ + Q-value network with multiple fully-connected layers + + Inputs: (state tensor, action tensor) + + :param state_shape: (tuple[int]) shape of the state, for example, (state_dim, ) for single-dimensional state + :param action_shape: (tuple[int]) shape of the action, for example, (action_dim, ) for single-dimensional action + :param hidden_dim_list: (list[int]) a list of dimensions of hidden layers + :param w_init: (callable) weights initialization + :param activation: (callable) activation function + :param output_activation: (callable or None) output activation function + :param trainable: (bool) set training and evaluation mode + """ + + input_shape = tuple(map(sum, zip(action_shape, state_shape))) + input_dim = input_shape[0] + + assert len(state_shape) == 1 + with tf.name_scope('MLP'): + inputs, l = MLP(input_dim, hidden_dim_list, w_init, activation) + + with tf.name_scope('Output'): + outputs = Dense(n_units=1, act=output_activation, W_init=w_init)(l) + + super().__init__(inputs=inputs, outputs=outputs) + if trainable: + self.train() + else: + self.eval() + + +class QNetwork(Model): + def __init__(self, state_space, action_space, hidden_dim_list, + w_init=tf.keras.initializers.glorot_normal(), activation=tf.nn.relu, output_activation=None, + trainable=True, name=None, state_only=False, dueling=False): + """ Q-value network with multiple fully-connected layers or convolutional layers (according to state shape) + + :param state_space: (gym.spaces) space of the state from gym environments + :param action_space: (gym.spaces) space of the action from gym environments + :param hidden_dim_list: (list[int]) a list of dimensions of hidden layers + :param w_init: (callable) weights initialization + :param activation: (callable) activation function + :param output_activation: (callable or None) output activation function + :param trainable: (bool) set training and evaluation mode + :param name: (str) name the model + :param state_only: (bool) only input state or not, available in discrete action space + :param dueling: (bool) whether use the dueling output or not, available in discrete action space + """ + self._state_space, self._action_space = state_space, action_space + self.state_only = state_only + self.dueling = dueling + + # create state input layer + obs_inputs, current_layer, self._state_shape = CreateInputLayer(state_space) + + # create action input layer + if isinstance(self._action_space, spaces.Discrete): + self._action_shape = self._action_space.n, + if not self.state_only: + act_inputs = Input((None,), name='Act_Input_Layer', dtype=tf.int64) + elif isinstance(self._action_space, spaces.Box): + self._action_shape = self._action_space.shape + assert len(self._action_shape) == 1 + act_inputs = Input((None,) + self._action_shape, name='Act_Input_Layer') + else: + raise NotImplementedError + + # concat multi-head state + if isinstance(state_space, spaces.Dict): + assert isinstance(obs_inputs, dict) + assert 
isinstance(current_layer, dict) + self.input_dict = obs_inputs + obs_inputs = list(obs_inputs.values()) + current_layer = tl.layers.Concat(-1)(list(current_layer.values())) + + if isinstance(self._action_space, spaces.Box): + current_layer = tl.layers.Concat(-1)([current_layer, act_inputs]) + + with tf.name_scope('QNet_MLP'): + for i, dim in enumerate(hidden_dim_list): + current_layer = Dense(n_units=dim, act=activation, W_init=w_init, + name='mlp_hidden_layer%d' % (i + 1))(current_layer) + + with tf.name_scope('Outputs'): + if isinstance(self._action_space, spaces.Discrete): + if self.dueling: + v = Dense(1, None, tf.initializers.Orthogonal(1.0))(current_layer) + q = Dense(n_units=self._action_shape[0], act=output_activation, W_init=w_init)( + current_layer) + mean_q = tl.layers.Lambda(lambda x: tf.reduce_mean(x, 1, True))(q) + current_layer = tl.layers.Lambda(lambda x: x[0] + x[1] - x[2])((v, q, mean_q)) + else: + current_layer = Dense(n_units=self._action_shape[0], act=output_activation, W_init=w_init)( + current_layer) + + if not self.state_only: + act_one_hot = tl.layers.OneHot(depth=self._action_shape[0], axis=1)( + act_inputs) # discrete action choice to one-hot vector + outputs = tl.layers.Lambda( + lambda x: tf.reduce_sum(tf.reduce_prod(x, axis=0), axis=1))((current_layer, act_one_hot)) + else: + outputs = current_layer + + elif isinstance(self._action_space, spaces.Box): + outputs = Dense(n_units=1, act=output_activation, W_init=w_init)(current_layer) + else: + raise ValueError("State Shape Not Accepted!") + + if isinstance(state_space, spaces.Dict): + if self.state_only: + super().__init__(inputs=obs_inputs, outputs=outputs, name=name) + else: + super().__init__(inputs=obs_inputs + [act_inputs], outputs=outputs, name=name) + else: + if self.state_only: + super().__init__(inputs=obs_inputs, outputs=outputs, name=name) + else: + super().__init__(inputs=[obs_inputs, act_inputs], outputs=outputs, name=name) + if trainable: + self.train() + else: + self.eval() + + def __call__(self, inputs, *args, **kwargs): + if self.state_only: + states = inputs + else: + states, actions = inputs + + # states and actions must have the same length + if not self.state_only and len(states) != len(actions): + raise ValueError( + 'Length of states and actions not match. States length is {} but actions length is {}'.format( + len(states), + len(actions))) + + if isinstance(self._state_space, spaces.Dict): + states = np.array(states).transpose([1, 0]).tolist() # batch states to multi-head + ssv = list(self._state_shape.values()) + # check state shape + for i, each_head in enumerate(states): + if np.shape(each_head)[1:] != ssv[i]: + raise ValueError('Input state shape error.') + + else: + if np.shape(states)[1:] != self.state_shape: + raise ValueError( + 'Input state shape error. 
Shape can be {} but your shape is {}'.format((None,) + self.state_shape, + np.shape(states))) + states = np.array(states, dtype=np.float32) + + if not self.state_only: + if isinstance(self._action_space, spaces.Discrete) and np.any(actions % 1): + raise ValueError('Input float actions in discrete action space') + if isinstance(self._action_space, spaces.Discrete): + actions = tf.convert_to_tensor(actions, dtype=tf.int64) + elif isinstance(self._action_space, spaces.Box): + actions = tf.convert_to_tensor(actions, dtype=tf.float32) + if isinstance(self._state_space, spaces.Dict): + return super().__call__(states + [actions], *args, **kwargs) + else: + return super().__call__([states, actions], *args, **kwargs) + else: + return super().__call__(states, *args, **kwargs) + + + @property + def state_space(self): + return copy.deepcopy(self._state_space) + + @property + def action_space(self): + return copy.deepcopy(self._action_space) + + @property + def state_shape(self): + return copy.deepcopy(self._state_shape) + + @property + def action_shape(self): + return copy.deepcopy(self._action_shape) + + +class NAFLayer(tl.layers.Layer): + def __init__(self, action_dim, name=None): + super(NAFLayer, self).__init__(name) + self.action_dim = action_dim + + def forward(self, inputs): + L, u, mu, value = inputs + pivot = 0 + rows = [] + for idx in range(self.action_dim): + offset = self.action_dim - idx + diag = tf.exp(tf.slice(L, (0, pivot), (-1, 1))) + nondiag = tf.slice(L, (0, pivot + 1), (-1, offset - 1)) + row = tf.pad(tf.concat([diag, nondiag], 1), ((0, 0), (idx, 0))) + pivot += offset + rows.append(row) + L_T = tf.stack(rows, axis=1) + P = tf.matmul(tf.transpose(L_T, (0, 2, 1)), L_T) # L L^T + temp = tf.expand_dims(u - mu, -1) + adv = tf.squeeze(-0.5 * tf.matmul(tf.transpose(temp, [0, 2, 1]), tf.matmul(P, temp)), -1) + return adv + value + + def build(self, inputs_shape=None): + pass + + +class NAFQNetwork(Model): + def __init__(self, state_space, action_space, hidden_dim_list, + w_init=tf.keras.initializers.glorot_normal(), activation=tf.nn.tanh, trainable=True, name=None): + """ NAF Q-value network with multiple fully-connected layers + + :param state_space: (gym.spaces) space of the state from gym environments + :param action_space: (gym.spaces) space of the action from gym environments + :param hidden_dim_list: (list[int]) a list of dimensions of hidden layers + :param w_init: (callable) weights initialization + :param activation: (callable) activation function + :param trainable: (bool) set training and evaluation mode + :param name: (str) name the model + """ + assert isinstance(action_space, spaces.Box) + self._state_space, self._action_space = state_space, action_space + self._action_shape = self._action_space.shape + assert len(self._action_shape) == 1 + act_inputs = Input((None,) + self._action_shape, name='Act_Input_Layer') + + # create state input layer + obs_inputs, current_layer, self._state_shape = CreateInputLayer(state_space) + + # concat multi-head state + if isinstance(state_space, spaces.Dict): + assert isinstance(obs_inputs, dict) + assert isinstance(current_layer, dict) + self.input_dict = obs_inputs + obs_inputs = list(obs_inputs.values()) + current_layer = tl.layers.Concat(-1)(list(current_layer.values())) + + # calculate value + current_layer = BatchNorm()(current_layer) + with tf.name_scope('NAF_VALUE_MLP'): + for i, dim in enumerate(hidden_dim_list): + current_layer = Dense(n_units=dim, act=activation, W_init=w_init, + name='mlp_hidden_layer%d' % (i + 1))(current_layer) 
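# --- Editor's illustrative sketch (assumption-labelled, not taken from the patch) ---
# NAFLayer.forward, defined just above, rebuilds a lower-triangular matrix L from a
# flat output vector, forms P = L L^T, and returns the quadratic advantage
# A(s, a) = -0.5 * (a - mu)^T P (a - mu) added to V(s). The standalone NumPy snippet
# below mirrors that computation for a batch of one 2-dimensional action; every
# variable name here (flat_L, action, mu, value) is illustrative only.
import numpy as np

action_dim = 2
flat_L = np.array([[0.1, 0.3, -0.2]])   # dim * (dim + 1) / 2 = 3 entries per sample
action = np.array([[0.5, -0.4]])        # sampled action u
mu = np.array([[0.2, 0.1]])             # predicted action mean
value = np.array([[1.0]])               # predicted state value V(s)

rows, pivot = [], 0
for idx in range(action_dim):
    offset = action_dim - idx
    diag = np.exp(flat_L[:, pivot:pivot + 1])       # exp keeps the diagonal positive
    nondiag = flat_L[:, pivot + 1:pivot + offset]   # remaining entries of this row
    rows.append(np.pad(np.concatenate([diag, nondiag], 1), ((0, 0), (idx, 0))))
    pivot += offset
L_T = np.stack(rows, axis=1)                        # upper-triangular L^T, shape (1, 2, 2)
P = np.transpose(L_T, (0, 2, 1)) @ L_T              # P = L L^T, positive definite
diff = np.expand_dims(action - mu, -1)
adv = np.squeeze(-0.5 * np.transpose(diff, (0, 2, 1)) @ P @ diff, -1)
q = adv + value                                     # Q(s, a) = V(s) + A(s, a)
# Because P is positive definite, Q is maximised exactly at a = mu, which is what
# makes the greedy action trivial to read off in NAF.
# -------------------------------------------------------------------------------------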
+ value = Dense(n_units=1, W_init=w_init, name='naf_value_mlp_output')(current_layer) + + # calculate advantange and Q-value + dim = self._action_shape[0] + with tf.name_scope('NAF_ADVANTAGE'): + mu = Dense(n_units=dim, act=activation, W_init=w_init, name='mu')(current_layer) + L = Dense(n_units=int((dim * (dim + 1)) / 2), W_init=w_init, name='L')(current_layer) + qvalue = NAFLayer(dim)([L, act_inputs, mu, value]) + + super().__init__(inputs=[obs_inputs, act_inputs], outputs=qvalue, name=name) + if trainable: + self.train() + else: + self.eval() + + def __call__(self, inputs, *args, **kwargs): + states, actions = inputs + + # states and actions must have the same length + if len(states) != len(actions): + raise ValueError( + 'Length of states and actions not match. States length is {} but actions length is {}'.format( + len(states), + len(actions))) + + if isinstance(self._state_space, spaces.Dict): + states = np.array(states).transpose([1, 0]).tolist() # batch states to multi-head + ssv = list(self._state_shape.values()) + # check state shape + for i, each_head in enumerate(states): + if np.shape(each_head)[1:] != ssv[i]: + raise ValueError('Input state shape error.') + + else: + if np.shape(states)[1:] != self.state_shape: + raise ValueError( + 'Input state shape error. Shape can be {} but your shape is {}'.format((None,) + self.state_shape, + np.shape(states))) + states = np.array(states, dtype=np.float32) + + actions = tf.convert_to_tensor(actions, dtype=tf.float32) + if isinstance(self._state_space, spaces.Dict): + return super().__call__(states + [actions], *args, **kwargs) + else: + return super().__call__([states, actions], *args, **kwargs) + + @property + def state_space(self): + return copy.deepcopy(self._state_space) + + @property + def action_space(self): + return copy.deepcopy(self._action_space) + + @property + def state_shape(self): + return copy.deepcopy(self._state_shape) + + @property + def action_shape(self): + return copy.deepcopy(self._action_shape) diff --git a/rlzoo/distributed/__init__.py b/rlzoo/distributed/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/rlzoo/distributed/dis_components.py b/rlzoo/distributed/dis_components.py new file mode 100755 index 0000000..d8ee1dd --- /dev/null +++ b/rlzoo/distributed/dis_components.py @@ -0,0 +1,128 @@ +import enum + +import tensorflow as tf +from kungfu.python import current_cluster_size, current_rank +from kungfu.tensorflow.ops import (barrier, request_variable, + request_variable_with_template, + save_variable, subset_all_reduce) +from kungfu.tensorflow.ops.queue import new_queue + + +class Role(enum.Enum): + Learner = 1 + Actor = 2 + Server = 3 + + +def show_role_name(role): + return { + Role.Learner: 'learner', + Role.Actor: 'actor', + Role.Server: 'server', + }[role] + + +def _interval(n, offset=0): + return list(range(offset, offset + n)) + + +class Agent: + def __init__(self, n_learners=1, n_actors=1, n_servers=1): + rank = current_rank() + size = current_cluster_size() + if n_learners + n_actors + n_servers != size: + raise RuntimeError('invalid cluster size') + self._n_learners = n_learners + self._n_actors = n_actors + self._n_servers = n_servers + self._global_rank = rank + self._global_size = size + roles = [Role.Learner] * n_learners + [Role.Actor] * n_actors + [Role.Server] * n_servers + rank2role = dict(enumerate(roles)) + self._role = rank2role[rank] + self._roles = { + Role.Learner: _interval(n_learners), + Role.Actor: _interval(n_actors, n_learners), + Role.Server: _interval(n_servers, 
n_learners + n_actors), + } + self._role_sizes = { + Role.Learner: n_learners, + Role.Actor: n_actors, + Role.Server: n_servers, + } + self._role_offsets = { + Role.Learner: 0, + Role.Actor: n_learners, + Role.Server: n_learners + n_actors, + } + self._role_rank = self._global_rank - self._role_offsets[self._role] + self._role_size = self._role_sizes[self._role] + + def _to_global_rank(self, role, role_rank): + return int(self._role_offsets[role] + int(role_rank)) + + # metadata APIs + def role(self): + return self._role + + def role_rank(self): + return self._role_rank + + def role_size(self, role=None): + if role is None: + return self._role_size + else: + return self._role_sizes[role] + + # collective APIs + def barrier(self): + return barrier() + + def role_all_reduce(self, x): + role_ranks = self._roles[self._role] + topology = [i for i in range(self._global_size)] + for i in role_ranks: + topology[i] = role_ranks[0] + # TODO: generate subset topology + return subset_all_reduce(x, topology) + + # p2p APIs + def save(self, x, name=None): + return save_variable(x, name=name) + + def request(self, role: Role, role_rank, name, shape, dtype): + role_size = self._role_sizes[role] + assert (0 <= role_rank and role_rank < role_size) + target = self._to_global_rank(role, role_rank) + return request_variable( + target, + name=name, + shape=shape, + dtype=dtype, + ) + + def new_queue(self, src, dst): + """create a uni-direction queue.""" + role1, rank1 = src + role2, rank2 = dst + srcRank = self._to_global_rank(role1, rank1) + dstRank = self._to_global_rank(role2, rank2) + return new_queue(srcRank, dstRank) + + def new_queue_pair(self, a, b): + """create a pair of queues.""" + q1 = self.new_queue(a, b) + q2 = self.new_queue(b, a) + return q1, q2 + + +class LearnerExample: + pass + + +class ActorExample: + pass + + +class ServerExample: + pass diff --git a/rlzoo/distributed/run_dis_train.sh b/rlzoo/distributed/run_dis_train.sh new file mode 100755 index 0000000..bfab290 --- /dev/null +++ b/rlzoo/distributed/run_dis_train.sh @@ -0,0 +1,46 @@ +#!/bin/sh +set -e + +cd $(dirname $0) + +kungfu_flags() { + echo -q + echo -logdir logs + + local ip1=127.0.0.1 + local np1=$np + + local ip2=127.0.0.10 + local np2=$np + local H=$ip1:$np1,$ip2:$np2 + local m=cpu,gpu + + echo -H $ip1:$np1 +} + +prun() { + local np=$1 + shift + kungfu-run $(kungfu_flags) -np $np $@ +} + +n_learner=2 +n_actor=2 +n_server=1 + +flags() { + echo -l $n_learner + echo -a $n_actor + echo -s $n_server +} + +rl_run() { + local n=$((n_learner + n_actor + n_server)) + prun $n python3 training_components.py $(flags) +} + +main() { + rl_run +} + +main diff --git a/rlzoo/distributed/start_dis_role.py b/rlzoo/distributed/start_dis_role.py new file mode 100755 index 0000000..7716c96 --- /dev/null +++ b/rlzoo/distributed/start_dis_role.py @@ -0,0 +1,206 @@ +import argparse + +from rlzoo.distributed.dis_components import * +import tensorflow as tf +import numpy as np + + +def parse_args(): + p = argparse.ArgumentParser() + p.add_argument('-l', type=int, default=1) + p.add_argument('-a', type=int, default=1) + p.add_argument('-s', type=int, default=1) + p.add_argument('-f', type=str, default='') # config.json + + args = p.parse_args() + return args + + +def run_learner(agent, args, training_conf, env_conf, agent_conf): + agent_generator = agent_conf['agent_generator'] + total_step, traj_len, train_n_traj = training_conf['total_step'], training_conf['traj_len'], training_conf['train_n_traj'], + obs_shape, act_shape = env_conf['obs_shape'], 
env_conf['act_shape'] + + if agent.role_rank() == 0: + param_q = agent.new_queue((Role.Learner, 0), (Role.Server, 0)) + + traj_q = agent.new_queue((Role.Server, 0), (Role.Learner, agent.role_rank())) + + rl_agent = agent_generator() + rl_agent.init_components() + + # init model + rl_agent.update_model([agent.role_all_reduce(weights) for weights in rl_agent.all_weights]) + + if agent.role_rank() == 0: + for weight in rl_agent.all_weights: + param_q.put(tf.Variable(weight, dtype=tf.float32)) + + n_update = total_step // (traj_len * agent.role_size(Role.Learner) * train_n_traj) + for i in range(n_update): + traj_list = [[traj_q.get(dtype=tf.float32, shape=(traj_len, *shape)) for shape in [ + obs_shape, act_shape, (), (), obs_shape, (), (1,)]] for _ in range(train_n_traj)] + + rl_agent.train(traj_list, dis_agent=agent) + + # send weights to server + if agent.role_rank() == 0: + for weight in rl_agent.all_weights: + param_q.put(tf.Variable(weight, dtype=tf.float32)) + print('learner finished') + + +def run_actor(agent, args, training_conf, env_conf): # sampler + env_maker, total_step = env_conf['env_maker'], training_conf['total_step'] + + from gym import spaces + + env = env_maker() + action_q, step_data_q = agent.new_queue_pair((Role.Server, 0), (Role.Actor, agent.role_rank())) + + state, reward, done = env.reset(), 0, False + each_total_step = int(total_step/agent.role_size(Role.Actor)) + action_dtype = tf.int32 if isinstance(env.action_space, spaces.Discrete) else tf.float32 + for i in range(each_total_step): + step_data_q.put(tf.Variable(state, dtype=tf.float32)) + a = action_q.get(dtype=action_dtype, shape=env.action_space.shape).numpy() + next_state, reward, done, _ = env.step(a) + for data in (reward, done, next_state): + step_data_q.put(tf.Variable(data, dtype=tf.float32)) + if done: + state = env.reset() + else: + state = next_state + print('actor finished') + + +def run_server(agent, args, training_conf, env_conf, agent_conf): + total_step, traj_len, train_n_traj, save_interval = training_conf['total_step'], training_conf['traj_len'], \ + training_conf['train_n_traj'], training_conf['save_interval'], + obs_shape, env_name = env_conf['obs_shape'], env_conf['env_name'] + agent_generator = agent_conf['agent_generator'] + + from rlzoo.algorithms.dppo_clip_distributed.dppo_clip import DPPO_CLIP + from rlzoo.distributed.dis_components import Role + from gym import spaces + + learner_size = agent.role_size(Role.Learner) + rl_agent: DPPO_CLIP = agent_generator() + rl_agent.init_components() + + # queue to actor + q_list = [agent.new_queue_pair((Role.Server, 0), (Role.Actor, i)) for i in + range(agent.role_size(Role.Actor))] + action_q_list, step_data_q_list = zip(*q_list) + + # queue to learner + param_q = agent.new_queue((Role.Learner, 0), (Role.Server, 0)) + traj_q_list = [agent.new_queue((Role.Server, 0), (Role.Learner, i)) for i in + range(agent.role_size(Role.Learner))] + + # syn net weights from learner + all_weights = [param_q.get(dtype=weight.dtype, shape=weight.shape) for weight in rl_agent.all_weights] + rl_agent.update_model(all_weights) + + train_cnt = 0 + action_dtype = tf.int32 if isinstance(rl_agent.actor.action_space, spaces.Discrete) else tf.float32 + + curr_step = 0 + + total_reward_list = [] + curr_reward_list = [] + tmp_eps_reward = 0 + while curr_step < total_step: + # tmp_eps_reward = 0 # todo env with no end + for _ in range(traj_len): + curr_step += agent.role_size(Role.Actor) + + state_list = [] + for step_data_q in step_data_q_list: + 
state_list.append(step_data_q.get(dtype=tf.float32, shape=obs_shape)) + + action_list, log_p_list = rl_agent.get_action(state_list, batch_data=True) + + for action_q, action in zip(action_q_list, action_list): + action_q.put(tf.Variable(action, dtype=action_dtype)) + reward_list, done_list, next_state_list = [], [], [], + for i, step_data_q in enumerate(step_data_q_list): + reward = step_data_q.get(dtype=tf.float32, shape=()) + if i == 0: + tmp_eps_reward += reward + reward_list.append(reward) + done = step_data_q.get(dtype=tf.float32, shape=()) + if i == 0 and done: + curr_reward_list.append(tmp_eps_reward) + tmp_eps_reward = 0 + done_list.append(done) + next_state_list.append(step_data_q.get(dtype=tf.float32, shape=obs_shape)) + rl_agent.collect_data(state_list, action_list, reward_list, done_list, next_state_list, log_p_list, True) + + rl_agent.update_traj_list() + + # send traj to each learner and update weight + learn_traj_len = learner_size * train_n_traj + if len(rl_agent.traj_list) >= learn_traj_len: + train_cnt += 1 + + # todo env with end + avg_eps_reward = None + if curr_reward_list: + avg_eps_reward = np.mean(curr_reward_list) + curr_reward_list.clear() + total_reward_list.append(avg_eps_reward) + + # todo env with no end + # avg_eps_reward = tmp_eps_reward + # total_reward_list.append(np.array(avg_eps_reward)) + + print('Training iters: {}, steps so far: {}, average eps reward: {}'.format( + train_cnt, curr_step, np.array(avg_eps_reward))) + + rl_agent.plot_save_log(total_reward_list, env_name) + + traj_iter = iter(rl_agent.traj_list[:learn_traj_len]) + rl_agent.traj_list = rl_agent.traj_list[learn_traj_len:] + + # send traj data to each learner + for i, traj_q in enumerate(traj_q_list): + for _ in range(train_n_traj): + try: + traj_data = next(traj_iter) + except StopIteration: + break + for data in traj_data: + traj_q.put(tf.Variable(data, dtype=tf.float32)) + + # syn net weights from learner + all_weights = [param_q.get(dtype=weight.dtype, shape=weight.shape) for weight in rl_agent.all_weights] + rl_agent.update_model(all_weights) + + # save model + if not train_cnt % save_interval: + rl_agent.save_ckpt(env_name) + + # save the final model + rl_agent.save_ckpt(env_name) + print('Server Finished.') + + +def main(training_conf, env_conf, agent_conf): + args = parse_args() + agent = Agent(n_learners=args.l, n_actors=args.a, n_servers=args.s) + + print('%s : %d/%d' % (agent.role(), agent.role_rank(), agent.role_size())) + + agent.barrier() + + if agent.role() == Role.Learner: + run_learner(agent, args, training_conf, env_conf, agent_conf) + elif agent.role() == Role.Actor: + run_actor(agent, args, training_conf, env_conf) + elif agent.role() == Role.Server: + run_server(agent, args, training_conf, env_conf, agent_conf) + else: + raise RuntimeError('Invalid Role.') + + agent.barrier() diff --git a/rlzoo/distributed/training_components.py b/rlzoo/distributed/training_components.py new file mode 100755 index 0000000..a975245 --- /dev/null +++ b/rlzoo/distributed/training_components.py @@ -0,0 +1,63 @@ +from rlzoo.common.env_wrappers import build_env +from rlzoo.common.policy_networks import * +from rlzoo.common.value_networks import * +from rlzoo.algorithms.dppo_clip_distributed.dppo_clip import DPPO_CLIP +from functools import partial + +# Specify the training configurations +training_conf = { + 'total_step': int(1e7), # overall training timesteps + 'traj_len': 200, # length of the rollout trajectory + 'train_n_traj': 2, # update the models after every certain number of 
trajectories for each learner + 'save_interval': 10, # saving the models after every certain number of updates +} + +# Specify the environment and launch it +env_name, env_type = 'CartPole-v0', 'classic_control' +env_maker = partial(build_env, env_name, env_type) +temp_env = env_maker() +obs_shape, act_shape = temp_env.observation_space.shape, temp_env.action_space.shape + +env_conf = { + 'env_name': env_name, + 'env_type': env_type, + 'env_maker': env_maker, + 'obs_shape': obs_shape, + 'act_shape': act_shape, +} + + +def build_network(observation_space, action_space, name='DPPO_CLIP'): + """ build networks for the algorithm """ + hidden_dim = 256 + num_hidden_layer = 2 + critic = ValueNetwork(observation_space, [hidden_dim] * num_hidden_layer, name=name + '_value') + + actor = StochasticPolicyNetwork(observation_space, action_space, + [hidden_dim] * num_hidden_layer, + trainable=True, + name=name + '_policy') + return critic, actor + + +def build_opt(actor_lr=1e-4, critic_lr=2e-4): + """ choose the optimizer for learning """ + import tensorflow as tf + return [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)] + + +net_builder = partial(build_network, temp_env.observation_space, temp_env.action_space) +opt_builder = partial(build_opt, ) + +agent_conf = { + 'net_builder': net_builder, + 'opt_builder': opt_builder, + 'agent_generator': partial(DPPO_CLIP, net_builder, opt_builder), +} +del temp_env + +from rlzoo.distributed.start_dis_role import main + +print('Start Training.') +main(training_conf, env_conf, agent_conf) +print('Training Finished.') diff --git a/rlzoo/interactive/.gitignore b/rlzoo/interactive/.gitignore old mode 100644 new mode 100755 index bbb7590..bd325b2 --- a/rlzoo/interactive/.gitignore +++ b/rlzoo/interactive/.gitignore @@ -1,3 +1,3 @@ -img/ -log/ -model/ +img/ +log/ +model/ diff --git a/rlzoo/interactive/common.py b/rlzoo/interactive/common.py old mode 100644 new mode 100755 index be2d625..a4e0257 --- a/rlzoo/interactive/common.py +++ b/rlzoo/interactive/common.py @@ -1,124 +1,124 @@ -import decimal - -import ipywidgets as widgets -import numpy as np - -border_list = [None, 'hidden', 'dotted', 'dashed', 'solid', 'double', - 'groove', 'ridge', 'inset', 'outset', 'inherit'] - - -class NumInput(widgets.HBox): - - def __init__(self, init_value, step=None, range_min=None, range_max=None): - self.range = [range_min, range_max] - range_min = 0 if range_min is None else range_min - range_max = init_value * 2 if range_max is None else range_max - self.range_size = max([range_max - init_value, init_value - range_min]) - if step is None: - fs = decimal.Decimal(str(init_value)).as_tuple().exponent - self.decimals = -fs - step = np.round(np.power(0.1, self.decimals), self.decimals) - else: - fs = decimal.Decimal(str(step)).as_tuple().exponent - fv = decimal.Decimal(str(init_value)).as_tuple().exponent - self.decimals = -min(fs, fv) - - self.step = step - - self.slider = widgets.FloatSlider( - value=init_value, - min=range_min, - max=range_max, - step=step, - description='Slider input:', - disabled=False, - continuous_update=False, - orientation='horizontal', - readout=True, - readout_format='.' 
+ str(self.decimals) + 'f' - ) - - self.text = widgets.FloatText( - value=self.slider.value, - description='Manual input:', - disabled=False - ) - - def __extend_max(change): - num_new = np.around(change['new'], decimals=self.decimals) - num_old = change['old'] - if num_new > num_old: - if num_new - num_old > (self.slider.max - num_old) / 2: - self.range_size *= 2 - else: - self.range_size *= 0.5 - else: - if num_old - num_new > (num_old - self.slider.min) / 2: - self.range_size *= 2 - else: - self.range_size *= 0.5 - - if self.range_size < self.step * 10: - self.range_size = self.step * 10 - - self.slider.min = num_new - self.range_size if self.range[0] is None else self.range[0] - self.slider.max = num_new + self.range_size if self.range[1] is None else self.range[1] - self.slider.value = num_new - self.text.value = num_new - - self.slider.observe(__extend_max, names='value') - self.text.observe(__extend_max, names='value') - box_layout = widgets.Layout(display='flex', - align_items='stretch', - justify_content='center', ) - # self.frame = widgets.HBox([self.slider, self.text], layout=box_layout) - super().__init__([self.slider, self.text], layout=box_layout) - self._int_type = False - if (isinstance(init_value, int) or isinstance(init_value, np.int16) \ - or isinstance(init_value, np.int32) or isinstance(init_value, np.int64)) \ - and step % 1 == 0: - self._int_type = True - - @property - def value(self): - result = self.slider.value - if self._int_type: - result = int(result) - return result - - -class Border: - def __init__(self, element_list, description=None, size=5, style=0): - if not isinstance(element_list, list): - element_list = [element_list] - - box_layout = widgets.Layout(display='flex', - flex_flow='column', - align_items='flex-start', - align_content='flex-start', - # justify_content='center', - justify_content='space-around', - border=border_list[2] - ) - frame = widgets.Box(children=element_list, layout=box_layout) - - if description is not None: - caption = widgets.HTML(value=""+description+"") - children = [caption, frame] - else: - children = [frame] - - box_layout = widgets.Layout(display='flex', - flex_flow='column', - align_items='center', - justify_content='center', - border=border_list[style], ) - self.frame = widgets.Box(children=children, layout=box_layout) - - -class InfoDisplay: - def __init__(self, description, detail): - label = widgets.Label(description) - self.data = widgets.Label(detail) - self.frame = widgets.HBox([label, self.data], layout=widgets.Layout(justify_content='flex-start', )) -# border=border_list[2])) +import decimal + +import ipywidgets as widgets +import numpy as np + +border_list = [None, 'hidden', 'dotted', 'dashed', 'solid', 'double', + 'groove', 'ridge', 'inset', 'outset', 'inherit'] + + +class NumInput(widgets.HBox): + + def __init__(self, init_value, step=None, range_min=None, range_max=None): + self.range = [range_min, range_max] + range_min = 0 if range_min is None else range_min + range_max = init_value * 2 if range_max is None else range_max + self.range_size = max([range_max - init_value, init_value - range_min]) + if step is None: + fs = decimal.Decimal(str(init_value)).as_tuple().exponent + self.decimals = -fs + step = np.round(np.power(0.1, self.decimals), self.decimals) + else: + fs = decimal.Decimal(str(step)).as_tuple().exponent + fv = decimal.Decimal(str(init_value)).as_tuple().exponent + self.decimals = -min(fs, fv) + + self.step = step + + self.slider = widgets.FloatSlider( + value=init_value, + min=range_min, + 
max=range_max, + step=step, + description='Slider input:', + disabled=False, + continuous_update=False, + orientation='horizontal', + readout=True, + readout_format='.' + str(self.decimals) + 'f' + ) + + self.text = widgets.FloatText( + value=self.slider.value, + description='Manual input:', + disabled=False + ) + + def __extend_max(change): + num_new = np.around(change['new'], decimals=self.decimals) + num_old = change['old'] + if num_new > num_old: + if num_new - num_old > (self.slider.max - num_old) / 2: + self.range_size *= 2 + else: + self.range_size *= 0.5 + else: + if num_old - num_new > (num_old - self.slider.min) / 2: + self.range_size *= 2 + else: + self.range_size *= 0.5 + + if self.range_size < self.step * 10: + self.range_size = self.step * 10 + + self.slider.min = num_new - self.range_size if self.range[0] is None else self.range[0] + self.slider.max = num_new + self.range_size if self.range[1] is None else self.range[1] + self.slider.value = num_new + self.text.value = num_new + + self.slider.observe(__extend_max, names='value') + self.text.observe(__extend_max, names='value') + box_layout = widgets.Layout(display='flex', + align_items='stretch', + justify_content='center', ) + # self.frame = widgets.HBox([self.slider, self.text], layout=box_layout) + super().__init__([self.slider, self.text], layout=box_layout) + self._int_type = False + if (isinstance(init_value, int) or isinstance(init_value, np.int16) \ + or isinstance(init_value, np.int32) or isinstance(init_value, np.int64)) \ + and step % 1 == 0: + self._int_type = True + + @property + def value(self): + result = self.slider.value + if self._int_type: + result = int(result) + return result + + +class Border: + def __init__(self, element_list, description=None, size=5, style=0): + if not isinstance(element_list, list): + element_list = [element_list] + + box_layout = widgets.Layout(display='flex', + flex_flow='column', + align_items='flex-start', + align_content='flex-start', + # justify_content='center', + justify_content='space-around', + border=border_list[2] + ) + frame = widgets.Box(children=element_list, layout=box_layout) + + if description is not None: + caption = widgets.HTML(value=""+description+"") + children = [caption, frame] + else: + children = [frame] + + box_layout = widgets.Layout(display='flex', + flex_flow='column', + align_items='center', + justify_content='center', + border=border_list[style], ) + self.frame = widgets.Box(children=children, layout=box_layout) + + +class InfoDisplay: + def __init__(self, description, detail): + label = widgets.Label(description) + self.data = widgets.Label(detail) + self.frame = widgets.HBox([label, self.data], layout=widgets.Layout(justify_content='flex-start', )) +# border=border_list[2])) diff --git a/rlzoo/interactive/components.py b/rlzoo/interactive/components.py old mode 100644 new mode 100755 index d79993c..405609e --- a/rlzoo/interactive/components.py +++ b/rlzoo/interactive/components.py @@ -1,463 +1,463 @@ -from __future__ import print_function -import copy -from collections import OrderedDict - -from ipywidgets import Layout -from ipywidgets import GridspecLayout - -from IPython.display import clear_output -from IPython.core.interactiveshell import InteractiveShell -from gym import spaces - -from rlzoo.common.env_list import all_env_list -from rlzoo.common.utils import * -from rlzoo.interactive.common import * - -all_env_list = OrderedDict(sorted(all_env_list.items())) - - -class EnvironmentSelector(widgets.VBox): - def __init__(self): - env_list = 
all_env_list - # al = list(env_list.keys()) - al = ['atari', 'classic_control', 'box2d', 'mujoco', 'robotics', 'dm_control', 'rlbench'] - description = 'Environment Selector' - caption = widgets.HTML(value="" + description + "") - - text_0 = widgets.Label("Choose your environment") - - self.env_type = widgets.Dropdown( - options=al, - value=al[0], - description='env type:', - disabled=False, - ) - - self.env_name = widgets.Dropdown( - options=env_list[al[0]], - value=env_list[al[0]][0], - description='env name:', - disabled=False, - ) - env_select_box = widgets.VBox([text_0, self.env_type, self.env_name]) - - text_1 = widgets.Label(value="Environment settings") - - self.env_num = widgets.IntText( - value=1, - description='multi envs:', - disabled=False, - min=1, - # layout=Layout(width='150px') - ) - - self.env_state = widgets.Dropdown( - options=['default'], - value='default', - description='state type:', - disabled=False, - ) - - # self.create_button = widgets.Button( - # description='Create!', - # disabled=False, - # tooltip='Create', - # icon='check' - # ) - - # multi_box = widgets.HBox([self.env_multi, self.env_num], layout=Layout(justify_content='flex-start')) - env_setting_box = widgets.VBox([text_1, self.env_num, self.env_state]) - - select_box = widgets.HBox([env_select_box, env_setting_box], - layout=Layout(justify_content='Center')) - - # self.frame = widgets.VBox([select_box, widgets.Box([self.create_button], - # layout=Layout(justify_content='Center'))]) - # self.frame = widgets.AppLayout(left_sidebar=select_box, center=info_border.frame) - - def env_type_change(change): - d = env_list[self.env_type.value] - self.env_name.options = d - self.env_name.value = d[0] - if self.env_type.value == 'rlbench': - self.env_state.options = ['state', 'vision'] - self.env_state.value = 'state' - self.env_num.value = 1 - self.env_num.disabled = True - else: - self.env_state.options = ['default'] - self.env_state.value = 'default' - self.env_num.disabled = False - - # def create_env(c): # todo the program will be blocked if rlbench env is created here - # if self.env_type.value == 'rlbench': - # print(self.env_name.value, self.env_type.value, self.env_num.value, self.env_state.value) - # self._env = build_env(self.env_name.value, self.env_type.value, - # nenv=self.env_num.value, state_type=self.env_state.value) - # self._env = build_env(self.env_name.value, self.env_type.value, nenv=self.env_num.value) - # print('Environment created successfully!') - - def change_nenv(c): - if self.env_num.value < 1: - self.env_num.value = 1 - - self.env_num.observe(change_nenv, names='value') - self.env_type.observe(env_type_change, names='value') - - # self.create_button.on_click(create_env) - - super().__init__([caption, select_box], layout=widgets.Layout(align_items='center', )) - - @property - def value(self): - return {'env_id': self.env_name.value, - 'env_type': self.env_type.value, - 'nenv': self.env_num.value, - 'state_type': self.env_state.value} - - -# @property -# def env(self): -# return self._env - -class SpaceInfoViewer(widgets.Box): - def __init__(self, sp): - assert isinstance(sp, spaces.Space) - if isinstance(sp, spaces.Dict): - it = list(sp.spaces.items()) - info = GridspecLayout(len(it), 2) - for i, v in enumerate(it): - info[i, 0], info[i, 1] = widgets.Label(v[0]), widgets.Label(str(v[1])) - else: - info = widgets.Label(str(sp)) - super().__init__([info]) - - -class EnvInfoViewer(widgets.VBox): - def __init__(self, env): - if isinstance(env, list): - env = env[0] - env_obs = 
SpaceInfoViewer(env.observation_space) - env_act = SpaceInfoViewer(env.action_space) - tips = None - if isinstance(env.action_space, gym.spaces.Discrete): - tips = 'The action space is discrete.' - elif isinstance(env.action_space, gym.spaces.Box): - tips = 'The action space is continuous.' - - description = 'Environment Information' - caption = widgets.HTML(value="" + description + "") - - a00, a01 = widgets.Label('Environment name:'), widgets.Label(env.spec.id) - a10, a11 = widgets.Label('Observation space:'), env_obs - a20, a21 = widgets.Label('Action space:'), env_act - - if tips is None: - # use GirdBox instead of GridspecLayout to ensure each row has a different height - info = widgets.GridBox(children=[a00, a01, a10, a11, a20, a21], - layout=Layout(grid_template_areas=""" - "a00 a01" - "a10 a11" - "a20 a21" - """)) - else: - t0 = widgets.Label('Tips:') - t1 = widgets.Label(tips) - info = widgets.GridBox(children=[a00, a01, a10, a11, a20, a21, t0, t1], - layout=Layout(grid_template_areas=""" - "a00 a01" - "a10 a11" - "a20 a21" - "t0 t1" - """)) - - super().__init__([caption, info], layout=widgets.Layout(align_items='center', )) - - -all_alg_list = ['A3C', 'AC', 'DDPG', 'DPPO', 'DQN', 'PG', 'PPO', 'SAC', 'TD3', 'TRPO'] -all_alg_dict = {'discrete_action_space': ['AC', 'DQN', 'PG', 'PPO', 'TRPO'], - 'continuous_action_space': ['AC', 'DDPG', 'PG', 'PPO', 'SAC', 'TD3', 'TRPO'], - 'multi_env': ['A3C', 'DPPO'] - } - - -class AlgorithmSelector(widgets.VBox): - def __init__(self, env): - description = 'Algorithm Selector' - caption = widgets.HTML(value="" + description + "") - info = 'Supported algorithms are shown below' - if isinstance(env, list): - # info = 'Distributed algorithms are shown below' - table = all_alg_dict['multi_env'] - self.env_id = env[0].spec.id - elif isinstance(env.action_space, gym.spaces.Discrete): - # info = 'Algorithms which support discrete action space are shown below' - table = all_alg_dict['discrete_action_space'] - self.env_id = env.spec.id - elif isinstance(env.action_space, gym.spaces.Box): - # info = 'Algorithms which support continuous action space are shown below' - table = all_alg_dict['continuous_action_space'] - self.env_id = env.spec.id - else: - raise ValueError('Unsupported environment') - - self.algo_name = widgets.Dropdown( - options=table, - value=table[0], - description='Algorithms:', - disabled=False, - ) - - super().__init__([caption, widgets.Label(info), self.algo_name], - layout=widgets.Layout(align_items='center', )) - - @property - def value(self): - return self.algo_name.value - - -def TransInput(value): - if isinstance(value, bool): - return widgets.Checkbox(value=value, description='', disabled=False, indent=False) - elif isinstance(value, int) or isinstance(value, float) \ - or isinstance(value, np.int16) or isinstance(value, np.float16) \ - or isinstance(value, np.int32) or isinstance(value, np.float32) \ - or isinstance(value, np.int64) or isinstance(value, np.float64) \ - or isinstance(value, np.float128): - return NumInput(value) - else: - return widgets.Label(value) - - -class AlgoInfoViewer(widgets.VBox): - def __init__(self, alg_selector, org_alg_params, org_learn_params): - alg_params, learn_params = copy.deepcopy(org_alg_params), copy.deepcopy(org_learn_params) - - # ---------------- alg_params --------------- # - description = 'Algorithm Parameters' - alg_caption = widgets.HTML(value="" + description + "") - net_label = widgets.Label('Network information:') - show_net = lambda net: widgets.VBox([widgets.Label(str(layer)) for 
layer in net.all_layers]) - - n = np.ndim(alg_params['net_list']) - if n == 1: - model_net = alg_params['net_list'] - elif n == 2: - model_net = alg_params['net_list'][0] - - net_info = widgets.VBox([widgets.VBox([widgets.Label(str(net.__class__.__name__)), - show_net(net), ], - layout=widgets.Layout(border=border_list[2], - align_items='center', - align_content='center' - ) - ) for net in model_net]) - self._net_list = alg_params['net_list'] - del alg_params['net_list'] - - opt_label = widgets.Label('Optimizer information:') - - def show_params(params): - params = copy.deepcopy(params) - n = len(params) - frame = widgets.GridspecLayout(n, 2, layout=widgets.Layout(border=border_list[2], )) - show_info = lambda k: [widgets.Label(str(k)), widgets.Label(str(params[k]))] - frame[0, 0], frame[0, 1] = show_info('name') - frame[1, 0], frame[1, 1] = show_info('learning_rate') - del params['name'] - del params['learning_rate'] - for i, k in enumerate(sorted(params.keys())): - if k != 'name' and k != 'learning_rate': - frame[2 + i, 0], frame[2 + i, 1] = show_info(k) - return frame - - opt_info = widgets.VBox([show_params(n.get_config()) for n in alg_params['optimizers_list']]) - self._optimizers_list = alg_params['optimizers_list'] - del alg_params['optimizers_list'] - - stu_frame = widgets.GridBox(children=[net_label, net_info, opt_label, opt_info], - layout=Layout(grid_template_areas=""" - "net_label net_info" - "opt_label opt_info" - """)) - - alg_sel_dict = dict() - sk = sorted(alg_params.keys()) - n = len(sk) + 1 - alg_param_sel = widgets.GridspecLayout(n, 2) - b = 0 - if 'method' in sk: - module = widgets.RadioButtons(options=['penalty', 'clip'], disabled=False) - alg_param_sel[0, 0], alg_param_sel[0, 1] = widgets.Label('method'), module - alg_sel_dict['method'] = module - sk.remove('method') - b += 1 - - for i, k in enumerate(sk): - module = TransInput(alg_params[k]) - alg_sel_dict[k] = module - if k == 'dueling': - module.disabled = True - alg_param_sel[i + b, 0], alg_param_sel[i + b, 1] = widgets.Label(k), module - - alg_param_box = widgets.VBox([alg_caption, stu_frame, alg_param_sel], ) - name = alg_selector.value + '-' + alg_selector.env_id - path = os.path.join('.', 'model', name) - alg_param_sel[n - 1, 0] = widgets.Label('model save path') - alg_param_sel[n - 1, 1] = widgets.Label(path) - - self.alg_sel_dict = alg_sel_dict - # ================== alg_params ================= # - - # ----------------- learn_params ---------------- # - description = 'Learn Parameters' - learn_caption = widgets.HTML(value="" + description + "") - - learn_sel_dict = dict() - sk = sorted(learn_params.keys()) - - n = len(sk) - if 'mode' not in sk: n += 1 - if 'render' not in sk: n += 1 - learn_param_sel = widgets.GridspecLayout(n, 2) - - module = widgets.RadioButtons(options=['train', 'test'], disabled=False) - learn_param_sel[0, 0], learn_param_sel[0, 1] = widgets.Label('mode'), module - learn_sel_dict['mode'] = module - try: - sk.remove('mode') - except: - pass - - module = widgets.Checkbox(value=False, description='', disabled=False, indent=False) - learn_param_sel[1, 0], learn_param_sel[1, 1] = widgets.Label('render'), module - learn_sel_dict['render'] = module - try: - sk.remove('render') - except: - pass - - for i, k in enumerate(sk): - module = TransInput(learn_params[k]) - learn_sel_dict[k] = module - learn_param_sel[i + 2, 0], learn_param_sel[i + 2, 1] = widgets.Label(k), module - learn_param_box = widgets.VBox([learn_caption, learn_param_sel], - # layout=Layout(align_items='center',) - ) - 
self.learn_sel_dict = learn_sel_dict - # ================= learn_params ================ # - - b = widgets.Output(layout=widgets.Layout(border='solid')) - - self.smooth_factor_slider = widgets.FloatSlider( - value=0.8, - min=0, - max=1, - step=0.01, - description='learning curve smooth factor', - disabled=False, - continuous_update=False, - orientation='horizontal', - readout=True, - readout_format='.2f', - style={'description_width': 'initial'}, - ) - super().__init__([alg_param_box, b, learn_param_box, b, self.smooth_factor_slider]) - - @property - def alg_params(self): - result = {'net_list': self._net_list, 'optimizers_list': self._optimizers_list} - for k in self.alg_sel_dict.keys(): - result[k] = self.alg_sel_dict[k].value - return result - - @property - def smooth_factor(self): - return self.smooth_factor_slider.value - - @property - def learn_params(self): - result = dict() - for k in self.learn_sel_dict.keys(): - result[k] = self.learn_sel_dict[k].value - return result - - -class RevOutput(widgets.Output): - def _append_stream_output(self, text, stream_name): - """Append a stream output.""" - self.outputs = ( - {'output_type': 'stream', 'name': stream_name, 'text': text}, - ) + self.outputs - - def append_display_data(self, display_object): - """Append a display object as an output. - - Parameters - ---------- - display_object : IPython.core.display.DisplayObject - The object to display (e.g., an instance of - `IPython.display.Markdown` or `IPython.display.Image`). - """ - fmt = InteractiveShell.instance().display_formatter.format - data, metadata = fmt(display_object) - self.outputs = ( - { - 'output_type': 'display_data', - 'data': data, - 'metadata': metadata - }, - ) + self.outputs - - -class OutputMonitor(widgets.HBox): - def __init__(self, learn_params, smooth_factor): - max_num = learn_params['train_episodes'] if learn_params['mode'] == 'train' else learn_params['test_episodes'] - self.progress = widgets.FloatProgress(value=0.0, min=0.0, max=max_num, description='Progress') - - self.plot_out = widgets.Output(layout=widgets.Layout(width='350px', - height='250px', )) - self.smooth_factor = smooth_factor - # self.smooth_factor = widgets.FloatSlider( - # value=self.sf, - # min=0, - # max=1, - # step=0.01, - # description='smooth factor', - # disabled=False, - # continuous_update=False, - # orientation='horizontal', - # readout=True, - # readout_format='.2f' - # ) - - # def link(c): - # self.sf = self.smooth_factor.value - - # self.smooth_factor.observe(link, 'value') - # plot_out = widgets.VBox([widgets.Label('Learning curve'), self.plot_out, self.smooth_factor]) - plot_out = widgets.VBox([widgets.Label('Learning curve'), self.plot_out]) - - self.print_out = RevOutput(layout=widgets.Layout(overflow='scroll', - width='60%', - height='300px', - # display='flex', - # positioning='bottom', - border='1px solid black', - )) - self.plot_func([]) - super().__init__([widgets.VBox([plot_out, self.progress]), self.print_out]) - - def plot_func(self, datas): - # datas = signal.lfilter([1 - self.smooth_factor], [1, -self.smooth_factor], datas, axis=0) - if datas: - disD = [datas[0]] - for d in datas[1:]: - disD.append(disD[-1] * self.smooth_factor + d * (1 - self.smooth_factor)) - else: - disD = datas - with self.plot_out: - self.progress.value = len(disD) - plt.plot(disD) - clear_output(wait=True) - plt.show() +from __future__ import print_function +import copy +from collections import OrderedDict + +from ipywidgets import Layout +from ipywidgets import GridspecLayout + +from 
IPython.display import clear_output +from IPython.core.interactiveshell import InteractiveShell +from gym import spaces + +from rlzoo.common.env_list import all_env_list +from rlzoo.common.utils import * +from rlzoo.interactive.common import * + +all_env_list = OrderedDict(sorted(all_env_list.items())) + + +class EnvironmentSelector(widgets.VBox): + def __init__(self): + env_list = all_env_list + # al = list(env_list.keys()) + al = ['atari', 'classic_control', 'box2d', 'mujoco', 'robotics', 'dm_control', 'rlbench'] + description = 'Environment Selector' + caption = widgets.HTML(value="" + description + "") + + text_0 = widgets.Label("Choose your environment") + + self.env_type = widgets.Dropdown( + options=al, + value=al[0], + description='env type:', + disabled=False, + ) + + self.env_name = widgets.Dropdown( + options=env_list[al[0]], + value=env_list[al[0]][0], + description='env name:', + disabled=False, + ) + env_select_box = widgets.VBox([text_0, self.env_type, self.env_name]) + + text_1 = widgets.Label(value="Environment settings") + + self.env_num = widgets.IntText( + value=1, + description='multi envs:', + disabled=False, + min=1, + # layout=Layout(width='150px') + ) + + self.env_state = widgets.Dropdown( + options=['default'], + value='default', + description='state type:', + disabled=False, + ) + + # self.create_button = widgets.Button( + # description='Create!', + # disabled=False, + # tooltip='Create', + # icon='check' + # ) + + # multi_box = widgets.HBox([self.env_multi, self.env_num], layout=Layout(justify_content='flex-start')) + env_setting_box = widgets.VBox([text_1, self.env_num, self.env_state]) + + select_box = widgets.HBox([env_select_box, env_setting_box], + layout=Layout(justify_content='Center')) + + # self.frame = widgets.VBox([select_box, widgets.Box([self.create_button], + # layout=Layout(justify_content='Center'))]) + # self.frame = widgets.AppLayout(left_sidebar=select_box, center=info_border.frame) + + def env_type_change(change): + d = env_list[self.env_type.value] + self.env_name.options = d + self.env_name.value = d[0] + if self.env_type.value == 'rlbench': + self.env_state.options = ['state', 'vision'] + self.env_state.value = 'state' + self.env_num.value = 1 + self.env_num.disabled = True + else: + self.env_state.options = ['default'] + self.env_state.value = 'default' + self.env_num.disabled = False + + # def create_env(c): # todo the program will be blocked if rlbench env is created here + # if self.env_type.value == 'rlbench': + # print(self.env_name.value, self.env_type.value, self.env_num.value, self.env_state.value) + # self._env = build_env(self.env_name.value, self.env_type.value, + # nenv=self.env_num.value, state_type=self.env_state.value) + # self._env = build_env(self.env_name.value, self.env_type.value, nenv=self.env_num.value) + # print('Environment created successfully!') + + def change_nenv(c): + if self.env_num.value < 1: + self.env_num.value = 1 + + self.env_num.observe(change_nenv, names='value') + self.env_type.observe(env_type_change, names='value') + + # self.create_button.on_click(create_env) + + super().__init__([caption, select_box], layout=widgets.Layout(align_items='center', )) + + @property + def value(self): + return {'env_id': self.env_name.value, + 'env_type': self.env_type.value, + 'nenv': self.env_num.value, + 'state_type': self.env_state.value} + + +# @property +# def env(self): +# return self._env + +class SpaceInfoViewer(widgets.Box): + def __init__(self, sp): + assert isinstance(sp, spaces.Space) + if isinstance(sp, 
spaces.Dict): + it = list(sp.spaces.items()) + info = GridspecLayout(len(it), 2) + for i, v in enumerate(it): + info[i, 0], info[i, 1] = widgets.Label(v[0]), widgets.Label(str(v[1])) + else: + info = widgets.Label(str(sp)) + super().__init__([info]) + + +class EnvInfoViewer(widgets.VBox): + def __init__(self, env): + if isinstance(env, list): + env = env[0] + env_obs = SpaceInfoViewer(env.observation_space) + env_act = SpaceInfoViewer(env.action_space) + tips = None + if isinstance(env.action_space, gym.spaces.Discrete): + tips = 'The action space is discrete.' + elif isinstance(env.action_space, gym.spaces.Box): + tips = 'The action space is continuous.' + + description = 'Environment Information' + caption = widgets.HTML(value="" + description + "") + + a00, a01 = widgets.Label('Environment name:'), widgets.Label(env.spec.id) + a10, a11 = widgets.Label('Observation space:'), env_obs + a20, a21 = widgets.Label('Action space:'), env_act + + if tips is None: + # use GirdBox instead of GridspecLayout to ensure each row has a different height + info = widgets.GridBox(children=[a00, a01, a10, a11, a20, a21], + layout=Layout(grid_template_areas=""" + "a00 a01" + "a10 a11" + "a20 a21" + """)) + else: + t0 = widgets.Label('Tips:') + t1 = widgets.Label(tips) + info = widgets.GridBox(children=[a00, a01, a10, a11, a20, a21, t0, t1], + layout=Layout(grid_template_areas=""" + "a00 a01" + "a10 a11" + "a20 a21" + "t0 t1" + """)) + + super().__init__([caption, info], layout=widgets.Layout(align_items='center', )) + + +all_alg_list = ['A3C', 'AC', 'DDPG', 'DPPO', 'DQN', 'PG', 'PPO', 'SAC', 'TD3', 'TRPO'] +all_alg_dict = {'discrete_action_space': ['AC', 'DQN', 'PG', 'PPO', 'TRPO'], + 'continuous_action_space': ['AC', 'DDPG', 'PG', 'PPO', 'SAC', 'TD3', 'TRPO'], + 'multi_env': ['A3C', 'DPPO'] + } + + +class AlgorithmSelector(widgets.VBox): + def __init__(self, env): + description = 'Algorithm Selector' + caption = widgets.HTML(value="" + description + "") + info = 'Supported algorithms are shown below' + if isinstance(env, list): + # info = 'Distributed algorithms are shown below' + table = all_alg_dict['multi_env'] + self.env_id = env[0].spec.id + elif isinstance(env.action_space, gym.spaces.Discrete): + # info = 'Algorithms which support discrete action space are shown below' + table = all_alg_dict['discrete_action_space'] + self.env_id = env.spec.id + elif isinstance(env.action_space, gym.spaces.Box): + # info = 'Algorithms which support continuous action space are shown below' + table = all_alg_dict['continuous_action_space'] + self.env_id = env.spec.id + else: + raise ValueError('Unsupported environment') + + self.algo_name = widgets.Dropdown( + options=table, + value=table[0], + description='Algorithms:', + disabled=False, + ) + + super().__init__([caption, widgets.Label(info), self.algo_name], + layout=widgets.Layout(align_items='center', )) + + @property + def value(self): + return self.algo_name.value + + +def TransInput(value): + if isinstance(value, bool): + return widgets.Checkbox(value=value, description='', disabled=False, indent=False) + elif isinstance(value, int) or isinstance(value, float) \ + or isinstance(value, np.int16) or isinstance(value, np.float16) \ + or isinstance(value, np.int32) or isinstance(value, np.float32) \ + or isinstance(value, np.int64) or isinstance(value, np.float64) \ + or isinstance(value, np.float128): + return NumInput(value) + else: + return widgets.Label(value) + + +class AlgoInfoViewer(widgets.VBox): + def __init__(self, alg_selector, org_alg_params, 
org_learn_params): + alg_params, learn_params = copy.deepcopy(org_alg_params), copy.deepcopy(org_learn_params) + + # ---------------- alg_params --------------- # + description = 'Algorithm Parameters' + alg_caption = widgets.HTML(value="" + description + "") + net_label = widgets.Label('Network information:') + show_net = lambda net: widgets.VBox([widgets.Label(str(layer)) for layer in net.all_layers]) + + n = np.ndim(alg_params['net_list']) + if n == 1: + model_net = alg_params['net_list'] + elif n == 2: + model_net = alg_params['net_list'][0] + + net_info = widgets.VBox([widgets.VBox([widgets.Label(str(net.__class__.__name__)), + show_net(net), ], + layout=widgets.Layout(border=border_list[2], + align_items='center', + align_content='center' + ) + ) for net in model_net]) + self._net_list = alg_params['net_list'] + del alg_params['net_list'] + + opt_label = widgets.Label('Optimizer information:') + + def show_params(params): + params = copy.deepcopy(params) + n = len(params) + frame = widgets.GridspecLayout(n, 2, layout=widgets.Layout(border=border_list[2], )) + show_info = lambda k: [widgets.Label(str(k)), widgets.Label(str(params[k]))] + frame[0, 0], frame[0, 1] = show_info('name') + frame[1, 0], frame[1, 1] = show_info('learning_rate') + del params['name'] + del params['learning_rate'] + for i, k in enumerate(sorted(params.keys())): + if k != 'name' and k != 'learning_rate': + frame[2 + i, 0], frame[2 + i, 1] = show_info(k) + return frame + + opt_info = widgets.VBox([show_params(n.get_config()) for n in alg_params['optimizers_list']]) + self._optimizers_list = alg_params['optimizers_list'] + del alg_params['optimizers_list'] + + stu_frame = widgets.GridBox(children=[net_label, net_info, opt_label, opt_info], + layout=Layout(grid_template_areas=""" + "net_label net_info" + "opt_label opt_info" + """)) + + alg_sel_dict = dict() + sk = sorted(alg_params.keys()) + n = len(sk) + 1 + alg_param_sel = widgets.GridspecLayout(n, 2) + b = 0 + if 'method' in sk: + module = widgets.RadioButtons(options=['penalty', 'clip'], disabled=False) + alg_param_sel[0, 0], alg_param_sel[0, 1] = widgets.Label('method'), module + alg_sel_dict['method'] = module + sk.remove('method') + b += 1 + + for i, k in enumerate(sk): + module = TransInput(alg_params[k]) + alg_sel_dict[k] = module + if k == 'dueling': + module.disabled = True + alg_param_sel[i + b, 0], alg_param_sel[i + b, 1] = widgets.Label(k), module + + alg_param_box = widgets.VBox([alg_caption, stu_frame, alg_param_sel], ) + name = alg_selector.value + '-' + alg_selector.env_id + path = os.path.join('.', 'model', name) + alg_param_sel[n - 1, 0] = widgets.Label('model save path') + alg_param_sel[n - 1, 1] = widgets.Label(path) + + self.alg_sel_dict = alg_sel_dict + # ================== alg_params ================= # + + # ----------------- learn_params ---------------- # + description = 'Learn Parameters' + learn_caption = widgets.HTML(value="" + description + "") + + learn_sel_dict = dict() + sk = sorted(learn_params.keys()) + + n = len(sk) + if 'mode' not in sk: n += 1 + if 'render' not in sk: n += 1 + learn_param_sel = widgets.GridspecLayout(n, 2) + + module = widgets.RadioButtons(options=['train', 'test'], disabled=False) + learn_param_sel[0, 0], learn_param_sel[0, 1] = widgets.Label('mode'), module + learn_sel_dict['mode'] = module + try: + sk.remove('mode') + except: + pass + + module = widgets.Checkbox(value=False, description='', disabled=False, indent=False) + learn_param_sel[1, 0], learn_param_sel[1, 1] = widgets.Label('render'), module + 
learn_sel_dict['render'] = module + try: + sk.remove('render') + except: + pass + + for i, k in enumerate(sk): + module = TransInput(learn_params[k]) + learn_sel_dict[k] = module + learn_param_sel[i + 2, 0], learn_param_sel[i + 2, 1] = widgets.Label(k), module + learn_param_box = widgets.VBox([learn_caption, learn_param_sel], + # layout=Layout(align_items='center',) + ) + self.learn_sel_dict = learn_sel_dict + # ================= learn_params ================ # + + b = widgets.Output(layout=widgets.Layout(border='solid')) + + self.smooth_factor_slider = widgets.FloatSlider( + value=0.8, + min=0, + max=1, + step=0.01, + description='learning curve smooth factor', + disabled=False, + continuous_update=False, + orientation='horizontal', + readout=True, + readout_format='.2f', + style={'description_width': 'initial'}, + ) + super().__init__([alg_param_box, b, learn_param_box, b, self.smooth_factor_slider]) + + @property + def alg_params(self): + result = {'net_list': self._net_list, 'optimizers_list': self._optimizers_list} + for k in self.alg_sel_dict.keys(): + result[k] = self.alg_sel_dict[k].value + return result + + @property + def smooth_factor(self): + return self.smooth_factor_slider.value + + @property + def learn_params(self): + result = dict() + for k in self.learn_sel_dict.keys(): + result[k] = self.learn_sel_dict[k].value + return result + + +class RevOutput(widgets.Output): + def _append_stream_output(self, text, stream_name): + """Append a stream output.""" + self.outputs = ( + {'output_type': 'stream', 'name': stream_name, 'text': text}, + ) + self.outputs + + def append_display_data(self, display_object): + """Append a display object as an output. + + Parameters + ---------- + display_object : IPython.core.display.DisplayObject + The object to display (e.g., an instance of + `IPython.display.Markdown` or `IPython.display.Image`). 
+ """ + fmt = InteractiveShell.instance().display_formatter.format + data, metadata = fmt(display_object) + self.outputs = ( + { + 'output_type': 'display_data', + 'data': data, + 'metadata': metadata + }, + ) + self.outputs + + +class OutputMonitor(widgets.HBox): + def __init__(self, learn_params, smooth_factor): + max_num = learn_params['train_episodes'] if learn_params['mode'] == 'train' else learn_params['test_episodes'] + self.progress = widgets.FloatProgress(value=0.0, min=0.0, max=max_num, description='Progress') + + self.plot_out = widgets.Output(layout=widgets.Layout(width='350px', + height='250px', )) + self.smooth_factor = smooth_factor + # self.smooth_factor = widgets.FloatSlider( + # value=self.sf, + # min=0, + # max=1, + # step=0.01, + # description='smooth factor', + # disabled=False, + # continuous_update=False, + # orientation='horizontal', + # readout=True, + # readout_format='.2f' + # ) + + # def link(c): + # self.sf = self.smooth_factor.value + + # self.smooth_factor.observe(link, 'value') + # plot_out = widgets.VBox([widgets.Label('Learning curve'), self.plot_out, self.smooth_factor]) + plot_out = widgets.VBox([widgets.Label('Learning curve'), self.plot_out]) + + self.print_out = RevOutput(layout=widgets.Layout(overflow='scroll', + width='60%', + height='300px', + # display='flex', + # positioning='bottom', + border='1px solid black', + )) + self.plot_func([]) + super().__init__([widgets.VBox([plot_out, self.progress]), self.print_out]) + + def plot_func(self, datas): + # datas = signal.lfilter([1 - self.smooth_factor], [1, -self.smooth_factor], datas, axis=0) + if datas: + disD = [datas[0]] + for d in datas[1:]: + disD.append(disD[-1] * self.smooth_factor + d * (1 - self.smooth_factor)) + else: + disD = datas + with self.plot_out: + self.progress.value = len(disD) + plt.plot(disD) + clear_output(wait=True) + plt.show() diff --git a/rlzoo/interactive/main.ipynb b/rlzoo/interactive/main.ipynb old mode 100644 new mode 100755 index d5459a9..42c30a7 --- a/rlzoo/interactive/main.ipynb +++ b/rlzoo/interactive/main.ipynb @@ -1,6056 +1,6056 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Interactive Hyper-parameter Configuration\n", - "This is a use case provided by RLzoo to support an interactive hyper-parameter configuration process. It is built with *ipywidgets* package, so make sure you have the package installed:\n", - "\n", - "```! pip3 install ipywidgets==7.5.1```\n", - "\n", - "You just need to **run** each cell (Shift+Enter) and **select** the sliders or dropdown lists to configure the hyper-parameters for the learning process, for whichever algorithm and environment supported in RLzoo. \n", - "\n", - "It follows four steps:\n", - "1. Environment Configuration\n", - "2. Environment Information Display and Algorithm Configuration\n", - "3. Algorithm Parameters Display and Learning Parameters Configuration\n", - "4. Launch Learning with Visualization \n", - "\n", - "Tips:\n", - "To stop the learning process and start a new one, you needs to restart the kernel (always work) or interrupt the kernel (not always work). \n", - "\n", - "Have fun!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\"\"\"\n", - "1. 
Environment Configuration\n", - "-----------------------------\n", - "Run an environment selector and select an environment you like.\n", - "\n", - "Tips: no need to rerun after selection; go directly to the next cell.\n", - "\"\"\"\n", - "\n", - "from rlzoo.interactive.common import *\n", - "from rlzoo.interactive.components import *\n", - "from rlzoo.algorithms import *\n", - "from rlzoo.common.env_wrappers import build_env, close_env\n", - "env_sel = EnvironmentSelector()\n", - "display(env_sel)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\"\"\"\n", - "2. Environment Information Display and Algorithm Configuration\n", - "--------------------------------------------------------------\n", - "Run this code to create the environment instance.\n", - "\n", - "Tips: rerun every time you select a new environment in the cell above, \\\n", - "because this cell builds the environment.\n", - "\"\"\"\n", - "\n", - "try:\n", - " close_env(env) # close the previous environment\n", - "except:\n", - " pass\n", - "env = build_env(**env_sel.value)\n", - "print('Environment created!')\n", - "display(EnvInfoViewer(env))\n", - "\n", - "# run an algorithm selector and select an RL algorithm\n", - "alog_sel = AlgorithmSelector(env)\n", - "display(alog_sel)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\"\"\"\n", - "3. Algorithm Parameters Display and Learning Parameters Configuration\n", - "----------------------------------------------------------------------\n", - "Load the default parameters of the selected algorithm for this environment and display them, \\\n", - "then select the learning parameters.\n", - "\n", - "Tips: rerun after you create a different algorithm or environment.\n", - "\"\"\"\n", - "\n", - "EnvType, AlgName = env_sel.value['env_type'], alog_sel.value\n", - "alg_params, learn_params = call_default_params(env, EnvType, AlgName)\n", - "print('Default parameters loaded!')\n", - "\n", - "# see the networks, optimizers and adjust other parameters\n", - "algiv = AlgoInfoViewer(alog_sel, alg_params, learn_params)\n", - "display(algiv)\n", - "\n", - "# run this to generate the algorithm instance with the algorithm parameter settings above\n", - "alg_params = algiv.alg_params\n", - "alg = eval(AlgName+'(**alg_params)')\n", - "print('Algorithm instance created!')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\"\"\"\n", - "4. 
Launch Learning with Visualization \n", - "---------------------------------------\n", - "Run the cell to train the algorithm with the configurations above.\n", - "\"\"\"\n", - "\n", - "learn_params = algiv.learn_params\n", - "om = OutputMonitor(learn_params, smooth_factor=algiv.smooth_factor)\n", - "display(om)\n", - "with om.print_out:\n", - " alg.learn(env=env, plot_func=om.plot_func, **learn_params)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# whenever you leave the page, please remember to close the environment\n", - "close_env(env)\n", - "print('Environment closed')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.10" - },
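For reference, the four notebook steps above can also be run as a plain Python script without any widgets. The sketch below strings together the same calls that the notebook cells use (`build_env`, `call_default_params`, building the algorithm instance, then `learn`); it assumes `call_default_params` is importable from `rlzoo.common.utils` and that `AC` is exported by `rlzoo.algorithms`, as the notebook's wildcard imports suggest, and it picks `CartPole-v0` with the `AC` algorithm purely as an illustration.

```python
# Minimal, non-interactive sketch of the notebook workflow above.
# Assumptions: call_default_params lives in rlzoo.common.utils and AC is
# exported by rlzoo.algorithms; the environment/algorithm choice is only an example.
from rlzoo.common.env_wrappers import build_env, close_env
from rlzoo.common.utils import call_default_params
from rlzoo.algorithms import AC

EnvName, EnvType, AlgName = 'CartPole-v0', 'classic_control', 'AC'

env = build_env(EnvName, EnvType)                                       # choose the environment
alg_params, learn_params = call_default_params(env, EnvType, AlgName)   # load default hyper-parameters
alg = AC(**alg_params)                                                   # create the algorithm instance
alg.learn(env=env, mode='train', render=False, **learn_params)          # launch learning
close_env(env)                                                           # close the environment when done
```

A test run would reuse the same `learn` call with `mode='test'` and, if desired, `render=True`.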
"@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "04461564de8c45d6af4c6055f7b4c17f": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "RadioButtonsModel", - "state": { - "_options_labels": [ - "train", - "test" - ], - "index": 0, - "layout": "IPY_MODEL_520b2e1af36547edbae1352d82099fda", - "style": "IPY_MODEL_2c9a721e0f084f8f8f437a5d4d875e3f" - } - }, - "04abdee05e514880bb74dfe64bca36ff": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_9384c24875c24e5b8be37d4c55e04820", - "style": "IPY_MODEL_bebb739676c74aacb396889de39592e6", - "value": "0.9" - } - }, - "0580852520e142a89d7b42c50bfef6a1": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "066c122ea5f64991b7347279a79e8061": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "SliderStyleModel", - "state": { - "description_width": "" - } - }, - "06d5c4249f3d404793fe2defc8eb0051": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "VBoxModel", - "state": { - "children": [ - "IPY_MODEL_5af1a3e17ac64264905701b109c013e2", - "IPY_MODEL_691c17934ca3435eb36a2d84d15ecdf7" - ], - "layout": "IPY_MODEL_7d163d682d5744d6ac7be041fb66c158" - } - }, - "070bc781a91449c6a7fb227586d347e6": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_494deb5503e842b78948ed2c14e28e3e", - "style": "IPY_MODEL_2d1f0d1b81ee4e1f85ae2f777dcd0db9", - "value": "beta_2" - } - }, - "07377f1ec0e74dd4897d484914a44f99": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "077609b632e64492acbc9a009222e086": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "078c44ca72d24661bbeb9921196ddace": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "07b040199f664673b2cb1b45c5a5af34": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_47513573787c4ab1bfafee8a38450355", - "style": "IPY_MODEL_0abdf6aca8e44b2f96d9e278ce60a016", - "value": "Dense(n_units=1, tanh, in_channels='64', name='dense_2')" - } - }, - "07b0e1377c414989a1d7ce1bf1da1c4e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_0bd6e0b89391415fa6fc2c7f7fbf3bd3", - "style": "IPY_MODEL_da04b8e9a4464f7ea141e41904fa3b0f", - "value": "0.999" - } - }, - "080346c4f0ae457182549d3c68aaaaea": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "layout": "IPY_MODEL_23d66d78336541bf8b3f863dc3e554d4", - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": "Training... 
| Algorithm: AC | Environment: Pendulum-v0\nEpisode: 0/10 | Episode Reward: -1730.5698 | Running Time: 1.6412\n[TL] [*] Saving TL weights into ./model/AC-Pendulum-v0/model_actor\n[TL] [*] Saved\n[TL] [*] Saving TL weights into ./model/AC-Pendulum-v0/model_critic\n[TL] [*] Saved\nEpisode: 1/10 | Episode Reward: -1738.3357 | Running Time: 3.3340\nEpisode: 2/10 | Episode Reward: -1744.1233 | Running Time: 4.9608\nEpisode: 3/10 | Episode Reward: -1854.8743 | Running Time: 6.5518\nEpisode: 4/10 | Episode Reward: -1678.3274 | Running Time: 8.1632\nEpisode: 5/10 | Episode Reward: -1833.9245 | Running Time: 9.7298\nEpisode: 6/10 | Episode Reward: -1805.7677 | Running Time: 11.3628\nEpisode: 7/10 | Episode Reward: -1822.8594 | Running Time: 12.9569\nEpisode: 8/10 | Episode Reward: -1409.2653 | Running Time: 14.5867\nEpisode: 9/10 | Episode Reward: -1752.4231 | Running Time: 16.2574\n[TL] [*] Saving TL weights into ./model/AC-Pendulum-v0/model_actor\n[TL] [*] Saved\n[TL] [*] Saving TL weights into ./model/AC-Pendulum-v0/model_critic\n[TL] [*] Saved\n" - } - ] - } - }, - "081136f1075542a3999ce83eba68fdb5": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "VBoxModel", - "state": { - "children": [ - "IPY_MODEL_1ef9aa26484548e99e94bb3d8aae3cce", - "IPY_MODEL_45847f561d154d999d93f170524e2bdf", - "IPY_MODEL_9ce0362f9fac4e45a87ebe7a085a24af" - ], - "layout": "IPY_MODEL_ab2e3b3dc5024debb0c00c3d27d48a8b" - } - }, - "08f5684d8e194916ac04ed379e2bf022": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatTextModel", - "state": { - "description": "Manual input:", - "layout": "IPY_MODEL_019cd764de374cb382236f88a5d204af", - "step": null, - "style": "IPY_MODEL_c4662ffdadef4c7d82aba5ddca1fbfda", - "value": 0.9 - } - }, - "093fd11986764d78ad5dcf1429a496c9": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "layout": "IPY_MODEL_0b19536128d34993b9a3354b2a05e2dc", - "msg_id": "8f19b370e7f641249abb608a3c84b213", - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": "Training... 
| Algorithm: AC | Environment: Pendulum-v0\nEpisode: 0/100 | Episode Reward: -1730.5698 | Running Time: 1.6647\n[TL] [*] Saving TL weights into ./model/AC-Pendulum-v0/model_actor\n[TL] [*] Saved\n[TL] [*] Saving TL weights into ./model/AC-Pendulum-v0/model_critic\n[TL] [*] Saved\nEpisode: 1/100 | Episode Reward: -1738.3357 | Running Time: 3.3156\nEpisode: 2/100 | Episode Reward: -1744.1233 | Running Time: 4.9611\nEpisode: 3/100 | Episode Reward: -1854.8743 | Running Time: 6.5757\nEpisode: 4/100 | Episode Reward: -1678.3274 | Running Time: 8.2029\nEpisode: 5/100 | Episode Reward: -1833.9245 | Running Time: 9.7915\nEpisode: 6/100 | Episode Reward: -1805.7677 | Running Time: 11.3793\nEpisode: 7/100 | Episode Reward: -1822.8594 | Running Time: 12.9897\nEpisode: 8/100 | Episode Reward: -1409.2653 | Running Time: 14.5941\nEpisode: 9/100 | Episode Reward: -1752.4231 | Running Time: 16.2545\nEpisode: 10/100 | Episode Reward: -1595.9812 | Running Time: 17.8784\nEpisode: 11/100 | Episode Reward: -1750.5559 | Running Time: 19.4594\nEpisode: 12/100 | Episode Reward: -1780.9001 | Running Time: 21.0874\nEpisode: 13/100 | Episode Reward: -1645.4007 | Running Time: 22.7261\nEpisode: 14/100 | Episode Reward: -1684.3441 | Running Time: 24.3810\nEpisode: 15/100 | Episode Reward: -1764.5074 | Running Time: 25.9965\nEpisode: 16/100 | Episode Reward: -1688.8096 | Running Time: 27.6359\nEpisode: 17/100 | Episode Reward: -1582.7040 | Running Time: 29.2999\n" - } - ] - } - }, - "094d34956035446984a6cb8a6efc22a7": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "09c74a8b5d1a43828034e148d2edfbfc": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "09eb8f946d00416dace2ee661ad55fbd": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget001" - } - }, - "0a179f0e33df4522b9286a546e181b60": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_f2ffc80dd5074916b1a69e9de91149f9", - "style": "IPY_MODEL_8784dbc322c7455aaef2b352bae2f205", - "value": "name" - } - }, - "0a21d0f35913467a9b266a75d2af8db0": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "0a575cd57803474a9574922e07d3d316": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_5532430429754176a10d6ab53ba4b6d9", - "style": "IPY_MODEL_e35bce23c28f4af3b0d4dce2266ed2e8", - "value": "Learning curve" - } - }, - "0abdf6aca8e44b2f96d9e278ce60a016": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "0af6103ca9e44bb4a44c62b84b39415f": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "VBoxModel", - "state": { - "children": [ - "IPY_MODEL_4112e1653afc41a795418fc54377af6c", - "IPY_MODEL_10d4f1af65b0492594efc926d9976e59" - ], - "layout": "IPY_MODEL_1e197bc7d05a4518969ee7d3f97f211c" - } - }, - "0b081708649d446ab37f522f5a019e19": { - "model_module": "@jupyter-widgets/base", - "model_module_version": 
"1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "0b19536128d34993b9a3354b2a05e2dc": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "border": "1px solid black", - "height": "300px", - "overflow": "scroll", - "width": "60%" - } - }, - "0b1a53d081f547f8ab913cd15fe70058": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "layout": "IPY_MODEL_d99dceda8ae6483f8df298525d45be82" - } - }, - "0bd6e0b89391415fa6fc2c7f7fbf3bd3": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget010" - } - }, - "0c0d922d9ed14199ab9b8f48b9e8ba1d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "IntTextModel", - "state": { - "description": "multi envs:", - "layout": "IPY_MODEL_f2db93e6094b47d0bfce3821b33d707a", - "step": 1, - "style": "IPY_MODEL_454f999c2ca44e7b86263594806f6191", - "value": 1 - } - }, - "0c64eb2046714b6c885261124bcb09f8": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_167de1c7956c4ede9fa6a584404bc568", - "style": "IPY_MODEL_5469680f21e44e77b1092b8354d9aee0", - "value": "Dense(n_units=1, No Activation, in_channels='64', name='dense_1')" - } - }, - "0cabfd585d5d4421a05805698bc1c8ad": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "0d95601931d94f8cac55349f5886038a": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "0dc03ae5db46484a85272ce1899e53c0": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "VBoxModel", - "state": { - "children": [ - "IPY_MODEL_8ca1f8992583484a8a0ff2f7f46afee2", - "IPY_MODEL_99ac959475eb4f75b586ed6599b99113", - "IPY_MODEL_2ab7b4c8b49a4163b5521127d8329674", - "IPY_MODEL_9689f9977c7f455282a9831bcd81905c" - ], - "layout": "IPY_MODEL_eb5fdb48aa1d483fa9acf05a229ef307" - } - }, - "0e74af77352a4b40b0f9e5163d92a836": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget007" - } - }, - "0eb34e6e2b07401dae9a2bfa4f1d49df": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_47ed36f4da904759bb9adcf9f1f1685b", - "style": "IPY_MODEL_7dc1333733194435934e6ca098ede1ad", - "value": "False" - } - }, - "0ec6f6b7c7c84bb4b54e92db8342ce85": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DropdownModel", - "state": { - "_options_labels": [ - "atari", - "classic_control", - "box2d", - "mujoco", - "robotics", - "dm_control", - "rlbench" - ], - "description": "env type:", - "index": 1, - "layout": "IPY_MODEL_bfdfc9d77a654743a9ebdfc08ab167da", - "style": "IPY_MODEL_ce5b0166c393435a840819472b761b8c" - } - }, - "0fb529fd883648edb15d72a94813126e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "VBoxModel", - "state": { - "children": [ - "IPY_MODEL_a01f34500cfc486289f3334e3cd222df", - "IPY_MODEL_d6ddb43e654a421ead72beacfae7145e", - 
"IPY_MODEL_b106f6f6a7f047a4a11ec9f9a23804e2" - ], - "layout": "IPY_MODEL_ffce2434eb114cd1a7f6961dd71ff755" - } - }, - "1022056a831a477e91366a9deda960de": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_ecc6da99cf7944f5a5a6cfd1f0516aa6", - "style": "IPY_MODEL_ebff747fea3f4cf2abb9efcd9f998ddb", - "value": "Dense(n_units=64, relu, in_channels='3', name='hidden_layer1')" - } - }, - "10685777c5384041b62b4ce3aa26bf6e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "10b2a84971164564ac50d9f53bd98579": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "10d4f1af65b0492594efc926d9976e59": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "GridBoxModel", - "state": { - "children": [ - "IPY_MODEL_50ce374ed2fc4f2ebc2c156c16ba4f38", - "IPY_MODEL_11337137fc3b4e19b06d48508495d2ce", - "IPY_MODEL_fc6a2f4827034d64b99a15547f3d9f43", - "IPY_MODEL_1846a28797b64a7a8266f33f497550d4", - "IPY_MODEL_00ead8f3c1ea4020930b11c3bde3dd48", - "IPY_MODEL_89ae5379ee8b4e2d92f116a018b9420e", - "IPY_MODEL_d6a04d9b77b54ae89af21fa5551e205e", - "IPY_MODEL_b42c755dec514e6fa26ca97f3f0ef923", - "IPY_MODEL_d915d378018e4bd085cf4a0a935e2aaa", - "IPY_MODEL_162bfef08113403d82be4e50b362acb9", - "IPY_MODEL_30d87705b48648089aaa078817a89da2", - "IPY_MODEL_bdb404863da84bdf870e550898f54848" - ], - "layout": "IPY_MODEL_81a50427a5384feeaaee374a19ad5931" - } - }, - "11337137fc3b4e19b06d48508495d2ce": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "RadioButtonsModel", - "state": { - "_options_labels": [ - "train", - "test" - ], - "index": 0, - "layout": "IPY_MODEL_da5694fd870b41e79f41ebc7d7b8db5e", - "style": "IPY_MODEL_3a389cd3e9254722a3bef185d92c9ac4" - } - }, - "1202663af1bf4653bc967824c8574e1a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_98f2c9b34e884cada9e2eedac93e1912", - "style": "IPY_MODEL_67a79ba4cbf84418967857e237a5a1be", - "value": "Environment name:" - } - }, - "1222c8a942134f83aa262d9b321ee413": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "125f5c3fd35e49339e558a30a39a9f8a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_fc83fd9df36b4c0fa6ee544fe520cde7", - "style": "IPY_MODEL_3f7607f9884f482498bb28a91df5ab02", - "value": "beta_1" - } - }, - "12a0f20f2ecd423889594f36b15647f1": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "SliderStyleModel", - "state": { - "description_width": "" - } - }, - "12e50eba7f3e4e9f888416f46172b60f": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "145001c5826a41cd989997ea61244ca1": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "14a01344ad1b48b3becfe74fa709a0c6": { - "model_module": 
"@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_40848c8562dc485fa88be8cf89c7a5e2", - "style": "IPY_MODEL_a7d8b17ff9fd43298bc30e0471ade94f", - "value": "Input(shape=(None, 3), name='input_layer')" - } - }, - "1537ab75a9dd4f429ffb3812c485116f": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "GridBoxModel", - "state": { - "children": [ - "IPY_MODEL_0a179f0e33df4522b9286a546e181b60", - "IPY_MODEL_91d86c9ddbfa4acdaf18e13d8adf3862", - "IPY_MODEL_b4d945e45eae41ceb40de345939615ad", - "IPY_MODEL_715b10d741354c8db506fb8ba945a074", - "IPY_MODEL_b92bc4065ee4473aa6e1b4051e044dee", - "IPY_MODEL_c2160078393b421d9f3a4343f37307e2", - "IPY_MODEL_125f5c3fd35e49339e558a30a39a9f8a", - "IPY_MODEL_04abdee05e514880bb74dfe64bca36ff", - "IPY_MODEL_070bc781a91449c6a7fb227586d347e6", - "IPY_MODEL_2bb83c7012914171b4b76d559b92034c", - "IPY_MODEL_fa3877a284354fd08f33d320314b6765", - "IPY_MODEL_5446746816dd4edf8dffb29995d15715", - "IPY_MODEL_3755df840c214a33941879b316489adf", - "IPY_MODEL_776cdbcecc004924a856eb45ec0a5699" - ], - "layout": "IPY_MODEL_4b5dc49fbc1743c8abe6cded3f9ed703" - } - }, - "159f94f25de5436aafa6fec3c88e3356": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "VBoxModel", - "state": { - "children": [ - "IPY_MODEL_0106cced0fe54fbb9a3a261b11941cce", - "IPY_MODEL_cfb6b6bcedad4f61893206fb1eb28385" - ], - "layout": "IPY_MODEL_89880b2c3e03469da53b8a7e9e2e930b" - } - }, - "15ae64b32d794189a34bba91e2f7a15b": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "162bfef08113403d82be4e50b362acb9": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "children": [ - "IPY_MODEL_474e0de897334eb69236cc05ae69f164", - "IPY_MODEL_aafbebe0ec5b4425acf54f0ad9f6c80f" - ], - "layout": "IPY_MODEL_66bc7fd58a2743a0960e9dd5df378998" - } - }, - "167816e5912f4ea18d96b6e468d82ae7": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "167de1c7956c4ede9fa6a584404bc568": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "1764805129704afcb7c170e877b81788": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "children": [ - "IPY_MODEL_fe547223f16e423fa8493d4c6ae577ba", - "IPY_MODEL_093fd11986764d78ad5dcf1429a496c9" - ], - "layout": "IPY_MODEL_2bea049f9ec74da0bcf2a7eeffce8720" - } - }, - "182107ee16aa4bfba497dd033e347d65": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_e6958eae462d43d8bdb9c6227deddcc7", - "style": "IPY_MODEL_f9a9a8529629435f926e28c9e2ff6d21", - "value": "Observation space:" - } - }, - "1826b147229c4a96b6603cc13978a090": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "182c5797541f4476bb02c95a710f1bca": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - 
"1846a28797b64a7a8266f33f497550d4": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "CheckboxModel", - "state": { - "disabled": false, - "indent": false, - "layout": "IPY_MODEL_e8b87d816ccb409083b0c522ef0bd9dd", - "style": "IPY_MODEL_167816e5912f4ea18d96b6e468d82ae7", - "value": false - } - }, - "18470dca56a94ced8388c8eec402515f": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "GridBoxModel", - "state": { - "children": [ - "IPY_MODEL_294896e2ec5f413e9e23d9ec81e6bbbf", - "IPY_MODEL_8c59866961674911b2157bded443e366", - "IPY_MODEL_261d86e673814c6b9c6ed7b921861867", - "IPY_MODEL_6d5b0a5b26874cfd874c4a0bdf307eff" - ], - "layout": "IPY_MODEL_b58381d8050044ee9df6c0857e3a06e4" - } - }, - "18a7121ba72e42af9a496a39fb8c6f6a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "18ea002dd43344a5864f8a8651ceeaeb": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget007" - } - }, - "19b0d8173d9141e0a0db8d0b2110c98c": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_template_areas": "\"widget001 widget002\"\n\"widget003 widget004\"\n\"widget005 widget006\"\n\"widget007 widget008\"\n\"widget009 widget010\"\n\"widget011 widget012\"", - "grid_template_columns": "repeat(2, 1fr)", - "grid_template_rows": "repeat(6, 1fr)" - } - }, - "1a3aa6da2cad4cfd9696b32125ab645b": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "1adbcde168d04bcdaed1c410feae74ac": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "1b48b0f90cef4077aa20b9ee8be52e9b": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget002" - } - }, - "1c09f9523eb2469ab864ddcd5f15f417": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "1c75d4a07143476588ce4826116ea8ee": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "1cb1d8e98bef410e85502ad2edb46c45": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_template_areas": "\"widget001 widget002\"\n\"widget003 widget004\"", - "grid_template_columns": "repeat(2, 1fr)", - "grid_template_rows": "repeat(2, 1fr)" - } - }, - "1cb88e139a0642afb2f3c958dff539aa": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_9705108e9dd540fa8e02c1933e03eadd", - "style": "IPY_MODEL_2126fce329534e2b98f039a35e99344a", - "value": "Dense(n_units=64, relu, in_channels='3', name='hidden_layer1')" - } - }, - "1d03aaf95d45497ca74e337a82632cee": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_2ee89b46bdc146f9b9f4f48f5874a349", - "style": "IPY_MODEL_e0a1f12f4f0e4e31adc281b1fe6dee11", - "value": "0.9" - } - }, - 
"1db128fafd984258b040b5295b477f0d": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "1dbbcf0744194117b3463d5ae8af00ef": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "1e197bc7d05a4518969ee7d3f97f211c": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "1e327c8e4b844c2fbb017a5544fa678e": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget004" - } - }, - "1e6d0c80ceaa4e58846e9f554371b363": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "1eec2203d3bf49c2876604c21291cc18": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_d20f2266d6fc44df988c78b63b202a81", - "style": "IPY_MODEL_5228a7a8160f421f846e2d7d06c9d159", - "value": "1e-07" - } - }, - "1ef9aa26484548e99e94bb3d8aae3cce": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_5ac9e6a121a3488ea93f85f5589429a0", - "style": "IPY_MODEL_698f9329e3754e7482dc32690ba58f4a", - "value": "Environment settings" - } - }, - "1f0e424278554da08fbb15138e571a62": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "1f37fdacb85646a1b5ff9a2b1d6ab38a": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "2126fce329534e2b98f039a35e99344a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "214c87e57eb641bb89644c9f465889ca": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget007" - } - }, - "2205db5769754bf0948d81dde160eab4": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "22126658c9d54cfab48b63029798c705": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "SliderStyleModel", - "state": { - "description_width": "" - } - }, - "223fd915d3a5472aabdde3b5dd47a5f1": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "22ff0e7129b04334b71044d77e3c9298": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "BoxModel", - "state": { - "children": [ - "IPY_MODEL_eef437964b4e4fa29ea42afc6b9a69ce" - ], - "layout": "IPY_MODEL_759c11789beb46f798f3b48c4cf88577" - } - }, - "23424247d797485dba0788eb6b7614aa": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "235af533ab1c41a6b82350c6f3a88426": { - "model_module": "@jupyter-widgets/base", - 
"model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget010" - } - }, - "23d66d78336541bf8b3f863dc3e554d4": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "border": "1px solid black", - "height": "300px", - "overflow": "scroll", - "width": "60%" - } - }, - "24f450d31f2d47a68aa2c58be28170fb": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_09eb8f946d00416dace2ee661ad55fbd", - "style": "IPY_MODEL_f8a20f2f4b8b4c03857bcd85bf96b136", - "value": "name" - } - }, - "254576dd293543d384c9e5620c3db225": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget005" - } - }, - "26036b1a064245a6a1cef60ec7d39376": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "VBoxModel", - "state": { - "children": [ - "IPY_MODEL_b18ac7a05b7c4d58813a3e735173a3ca", - "IPY_MODEL_0ec6f6b7c7c84bb4b54e92db8342ce85", - "IPY_MODEL_467644544d33439284f04fe2a9883182" - ], - "layout": "IPY_MODEL_f9b983bef3a14087b6d1f966b8b041ed" - } - }, - "261d86e673814c6b9c6ed7b921861867": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_e9794b57be6c4c0e981a017d3fa82a36", - "style": "IPY_MODEL_946c2a2e7e8f4e36b0311e922520272f", - "value": "Optimizer information:" - } - }, - "266e10703ed340a78b259c7d3ddc8836": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "layout": "IPY_MODEL_6d6739242111448eaf1e80a8962f1aac", - "style": "IPY_MODEL_bf620c54949846b49135585c61101b19", - "value": "Environment Information" - } - }, - "26c0e699dae643b58817819a3d134e6f": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "VBoxModel", - "state": { - "children": [ - "IPY_MODEL_f29a7f4ff2a74bbf8d6485cbfb086152", - "IPY_MODEL_bb5d38052b40427585a8ec928bdef7b5" - ], - "layout": "IPY_MODEL_d02f0cd6f8f94156ac86605286a6ee78" - } - }, - "27fbf57b093b4444b8990601eaddca26": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "GridBoxModel", - "state": { - "children": [ - "IPY_MODEL_01cece59d650454b9cf09d03e85a6a10", - "IPY_MODEL_e09e0ff65ebf454b80a965aaa0f61d32", - "IPY_MODEL_83c18b3b4c374f70947e47230ffe4f82", - "IPY_MODEL_06d5c4249f3d404793fe2defc8eb0051" - ], - "layout": "IPY_MODEL_ff06931e66b544389c8f409734b472e3" - } - }, - "283080f17fcf4286b2e6e059bcda3370": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_4b23820dcff647a6ad204c7c4a596248", - "style": "IPY_MODEL_1826b147229c4a96b6603cc13978a090", - "value": "mode" - } - }, - "28ad6172b7f34ba9923847d24dd555b3": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "294896e2ec5f413e9e23d9ec81e6bbbf": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_98824ad5eda8475394e9fb13819502a9", - "style": "IPY_MODEL_79953b3e59c048548c96bb197d46a7ea", - "value": "Network information:" - } - }, - "2982ccca674f4bfc839557e06cde9993": { 
- "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "VBoxModel", - "state": { - "children": [ - "IPY_MODEL_14a01344ad1b48b3becfe74fa709a0c6", - "IPY_MODEL_1022056a831a477e91366a9deda960de", - "IPY_MODEL_814eef7fa97a4fa2b4c5f1ed1b3728f3", - "IPY_MODEL_0c64eb2046714b6c885261124bcb09f8" - ], - "layout": "IPY_MODEL_223fd915d3a5472aabdde3b5dd47a5f1" - } - }, - "298f572cd2ec4a9ca5a6feafaf334040": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget002" - } - }, - "29a207365d934cc4a402ed72a19194ca": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "border": "solid" - } - }, - "2a9fb576ef6145abaf95398bf620cd8d": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "align_items": "stretch", - "display": "flex", - "grid_area": "widget010", - "justify_content": "center" - } - }, - "2ab7b4c8b49a4163b5521127d8329674": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_7d70e416e925499f93e5837aabc6afc2", - "style": "IPY_MODEL_69268529fca5425e9f11506c968490e7", - "value": "Dense(n_units=64, relu, in_channels='64', name='hidden_layer2')" - } - }, - "2b0d8567d4aa4e53a5837284b315cc58": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "SliderStyleModel", - "state": { - "description_width": "" - } - }, - "2bb83c7012914171b4b76d559b92034c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_235af533ab1c41a6b82350c6f3a88426", - "style": "IPY_MODEL_75c167ca66774581880b2500d5176a36", - "value": "0.999" - } - }, - "2bea049f9ec74da0bcf2a7eeffce8720": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "2c0353597c114ba184977dac607510c3": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget006" - } - }, - "2c48650276864e79a7b82413ddd8c6fa": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_d34c7789bb974de1a36ef3cc45737b52", - "style": "IPY_MODEL_626ae439ee1f4ce4895764fb66f9c6d3", - "value": "0.999" - } - }, - "2c9a721e0f084f8f8f437a5d4d875e3f": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "2d1f0d1b81ee4e1f85ae2f777dcd0db9": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "2da2537f2e444e16ad634693e684af58": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "BoxModel", - "state": { - "children": [ - "IPY_MODEL_e4665eee9731436a839eaebea246f048" - ], - "layout": "IPY_MODEL_e944a76d793541058cf5f32563847fb3" - } - }, - "2dab24721ba34bd789afa55d1479464b": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "2dece16eb4994e5082a1cbeeea4163d0": { - 
"model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget002" - } - }, - "2e65a763e5db40ca8969c36950c0d9bd": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_1c75d4a07143476588ce4826116ea8ee", - "style": "IPY_MODEL_15ae64b32d794189a34bba91e2f7a15b", - "value": "Supported algorithms are shown below" - } - }, - "2e6e71650a6a48878fce055c8e563538": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_f1985e262a7d401ea97c903091713789", - "style": "IPY_MODEL_2205db5769754bf0948d81dde160eab4", - "value": "Dense(n_units=64, relu, in_channels='64', name='hidden_layer2')" - } - }, - "2e8b3025623248e2a92daa5a7750997f": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "2ece943ff83c48e8b69e0b2396b6064c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_aeecfc3325ec482ebd31ced3fc2e6839", - "style": "IPY_MODEL_b979276c5b584ebab1400eea707b2c39", - "value": "Pendulum-v0" - } - }, - "2ee89b46bdc146f9b9f4f48f5874a349": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget008" - } - }, - "2f93a27048a44beda22771c8249fba0d": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "border": "dotted", - "grid_template_areas": "\"widget001 widget002\"\n\"widget003 widget004\"\n\"widget005 widget006\"\n\"widget007 widget008\"\n\"widget009 widget010\"\n\"widget011 widget012\"\n\"widget013 widget014\"", - "grid_template_columns": "repeat(2, 1fr)", - "grid_template_rows": "repeat(7, 1fr)" - } - }, - "3025ff51115247eebfcfe7e2a18e414e": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget004" - } - }, - "3044da8a1f89485398f1ea9d4965bc55": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "align_items": "stretch", - "display": "flex", - "grid_area": "widget006", - "justify_content": "center" - } - }, - "304f4dcdb42b4bca91451ccfe7eba639": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "30d87705b48648089aaa078817a89da2": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_c3c09aa3ecea45eda2b142c857c5d7c5", - "style": "IPY_MODEL_e3adb676dd9b48a6bd4e895ac644b653", - "value": "train_episodes" - } - }, - "31276a604cf14bcd82297907c46c17f8": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatSliderModel", - "state": { - "continuous_update": false, - "description": "Slider input:", - "layout": "IPY_MODEL_a899edcecbcf49d1a1f57b48bed97865", - "max": 400, - "readout_format": ".0f", - "style": "IPY_MODEL_4711e3b757ae4ba08ece2d994aa46c2a", - "value": 200 - } - }, - "31f3ea5f445a4342b1a4db664f61eb93": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": 
"1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "31fe17808d8e4f7ead5964af2e4f5894": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "border": "dotted", - "grid_template_areas": "\"widget001 widget002\"\n\"widget003 widget004\"\n\"widget005 widget006\"\n\"widget007 widget008\"\n\"widget009 widget010\"\n\"widget011 widget012\"\n\"widget013 widget014\"", - "grid_template_columns": "repeat(2, 1fr)", - "grid_template_rows": "repeat(7, 1fr)" - } - }, - "329f804132904f47a73d10b3ccba4b4d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DropdownModel", - "state": { - "_options_labels": [ - "atari", - "classic_control", - "box2d", - "mujoco", - "robotics", - "dm_control", - "rlbench" - ], - "description": "env type:", - "index": 1, - "layout": "IPY_MODEL_8ae2c037e98f420486a61a8570daf106", - "style": "IPY_MODEL_df84370f89e949518569f900854e2510" - } - }, - "334d1a726d2347db82e42df5760618b3": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "CheckboxModel", - "state": { - "disabled": false, - "indent": false, - "layout": "IPY_MODEL_c3d17e5a575344968f8b84a174b26ba9", - "style": "IPY_MODEL_31f3ea5f445a4342b1a4db664f61eb93", - "value": false - } - }, - "33ecf71f75a649a285ea6a8211b5acbd": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "3488ba4c7374447794395c4c315a1193": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "351ae05c16d040dab9a578c06a78858c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "35525c0fbffa497eb43f7d5bd081bb0b": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "SliderStyleModel", - "state": { - "description_width": "initial" - } - }, - "3556d6d1fe0c4e558b21b70b8c7b9395": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget003" - } - }, - "3755df840c214a33941879b316489adf": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_e835260b70924edd959ac38cbdaa50d3", - "style": "IPY_MODEL_7aa2babe24dc4fab84bfbd511f0b5e98", - "value": "epsilon" - } - }, - "379d32750a8c4e88b3b6a8d76c3ee91b": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget012" - } - }, - "383cf0cb101341d4bdfb65604a24a4d5": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget003" - } - }, - "38484ea61c3449a1b809d8526ead582d": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget006" - } - }, - "389174ab87e24a48a23ad5f81a32da61": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_d2ba7f491ec94768be174bba323aff6d", - "style": 
"IPY_MODEL_a32e41356969452abe56558608109dc8", - "value": "test_episodes" - } - }, - "38f46c0b84c84233a228758c9b306a79": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "3909591203bd4321b62ed4e0aa575a3e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "VBoxModel", - "state": { - "children": [ - "IPY_MODEL_a2bb633318304f79a811eb07e18da7f5", - "IPY_MODEL_4ab1ce52edf54c879f2ee002e94c98f1", - "IPY_MODEL_159f94f25de5436aafa6fec3c88e3356", - "IPY_MODEL_4ab1ce52edf54c879f2ee002e94c98f1", - "IPY_MODEL_88b977df9d82476298ff3c70d714afe0" - ], - "layout": "IPY_MODEL_886c73a1052a4a2da9ec06c958855a51" - } - }, - "39219af0b9a34c03a11682fdbaf85b04": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "VBoxModel", - "state": { - "children": [ - "IPY_MODEL_e9d6d91ceda64a63b9fe358e90337820", - "IPY_MODEL_9694a75a41e543a3b2642aee3572857d" - ], - "layout": "IPY_MODEL_ed746bfae28741e9ae1d450dd1394423" - } - }, - "39c394badc7246fdb12032649f71a1b6": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget003" - } - }, - "3a389cd3e9254722a3bef185d92c9ac4": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "3a3916bde1e849aeae0e2701258ddc34": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_fc20a5f1e967425c840960c1948f00c8", - "style": "IPY_MODEL_c75a9640bb26465785ca214520007519", - "value": "train_episodes" - } - }, - "3a96e3ae233940e18c75f004da9e0459": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_bf7a578fb6204ce694235598a0f00ea2", - "style": "IPY_MODEL_f2612900bd944258af3be77cacc7a46b", - "value": "name" - } - }, - "3b0358464a32494ea410b866646b79b1": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_dd51349042bc4341b061da02df9f8be2", - "style": "IPY_MODEL_63c30e87411c45dd8d58dfa485850fc2", - "value": "learning_rate" - } - }, - "3c695e15ebbd4ecfb555b0fe5221ad10": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_9a247aedcd64492d9b4ddf9d76c13062", - "style": "IPY_MODEL_96fc368f69794e5baa9433c3a31b1ec1", - "value": "amsgrad" - } - }, - "3c77984eb49f4b3fbf5b78b313af8071": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget013" - } - }, - "3cfd11894b514078901081bddd35c83d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "3d9166fc4fcf43f3b930ebc7f996a5bf": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "3e9c9dcc814b47f8b2b392074c83d853": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": 
"LayoutModel", - "state": { - "grid_template_areas": "\n \"a00 a01\"\n \"a10 a11\"\n \"a20 a21\"\n \"t0 t1\"\n " - } - }, - "3f7607f9884f482498bb28a91df5ab02": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "40747ee3248e4cbca2b22e3201e7ae52": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "4080aa3475b94001b5324fd14d18816c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "SliderStyleModel", - "state": { - "description_width": "initial" - } - }, - "40848c8562dc485fa88be8cf89c7a5e2": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "40c1e5560977460b86028ca09ee94662": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget005" - } - }, - "4112e1653afc41a795418fc54377af6c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "layout": "IPY_MODEL_808fb0e5d6b940388d588196c927564d", - "style": "IPY_MODEL_9b276e72efa44a7e911ee209d08859b6", - "value": "Learn Parameters" - } - }, - "413fd706b68148a099ed9af1a952ec6d": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "41425cf814dc44c49ac901aeec4c668f": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "420cda5d7fd34a05b48fa845558987c4": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatTextModel", - "state": { - "description": "Manual input:", - "layout": "IPY_MODEL_cfc4c351d9da4a2bbe36bb1288f74e82", - "step": null, - "style": "IPY_MODEL_9b5f3fd4ebd341ac91227f9ded9fab19", - "value": 200 - } - }, - "42f8297b00d240308e7403a004a1c6b4": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget003" - } - }, - "432a3a690b36409192aa3ee4dd5fedf8": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatSliderModel", - "state": { - "continuous_update": false, - "description": "Slider input:", - "layout": "IPY_MODEL_45b014170b1e4c6b8efc9d245b587b48", - "max": 1.8, - "readout_format": ".1f", - "step": 0.1, - "style": "IPY_MODEL_4c528854314c4df18a84eafa4f1a7404", - "value": 0.9 - } - }, - "434eec441fb94a30bcb70bec50c60d78": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "VBoxModel", - "state": { - "children": [ - "IPY_MODEL_681fa50d92ed4da0afda87805d2383ca", - "IPY_MODEL_18470dca56a94ced8388c8eec402515f", - "IPY_MODEL_da5536ed85464ee5a97c44660b985348" - ], - "layout": "IPY_MODEL_74dc8e60490943c8b9601232bf24f608" - } - }, - "43730220bf8e489cae588fcf375d08cf": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "children": [ - "IPY_MODEL_31276a604cf14bcd82297907c46c17f8", - "IPY_MODEL_420cda5d7fd34a05b48fa845558987c4" - ], - "layout": "IPY_MODEL_ddba268ea0db428898643ae0f9a259a3" - } - }, - "43ca75c41e054155b5ad51e493b3b990": { - "model_module": "@jupyter-widgets/controls", - 
"model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_e53d3b32848c4872a5e1254a2ed080f1", - "style": "IPY_MODEL_e467ed3285684035a013df63ebb6b422", - "value": "Tips:" - } - }, - "43f9446733e242f1977bbe394ddc479b": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "452324b6d7cc4cf28d456787efc23b8f": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "454021a337164bae8a96f5a5a7749b78": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "454f999c2ca44e7b86263594806f6191": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "455c6fed537d48b188edef0200ab0fb1": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "layout": "IPY_MODEL_07377f1ec0e74dd4897d484914a44f99", - "style": "IPY_MODEL_a5d8986e9aad47b1ba7821ddf2850c7a", - "value": "Algorithm Selector" - } - }, - "45847f561d154d999d93f170524e2bdf": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "IntTextModel", - "state": { - "description": "multi envs:", - "layout": "IPY_MODEL_4cff6dcb31874722a4fcd9052bb1f9b6", - "step": 1, - "style": "IPY_MODEL_e41fe8ee1bf04764abe02428057a540a", - "value": 1 - } - }, - "45850b0512424834a6d4c70e60892ae8": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "45b014170b1e4c6b8efc9d245b587b48": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "45e906bdfe7a464d848f9c972f536d31": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "467644544d33439284f04fe2a9883182": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DropdownModel", - "state": { - "_options_labels": [ - "Acrobot-v1", - "CartPole-v1", - "CartPole-v0", - "MountainCar-v0", - "MountainCarContinuous-v0", - "Pendulum-v0" - ], - "description": "env name:", - "index": 5, - "layout": "IPY_MODEL_e210fdbc53d246a2ae55da6a3689745b", - "style": "IPY_MODEL_f29ba87ee02f4fc38760b98a32e20581" - } - }, - "469da089cf804101a4cbc570975a1aed": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_ac4da45cf7d84d5fa0ea8963afbe5c12", - "style": "IPY_MODEL_dc12042cc1bb40c98a69bef90468797a", - "value": "gamma" - } - }, - "4711e3b757ae4ba08ece2d994aa46c2a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "SliderStyleModel", - "state": { - "description_width": "" - } - }, - "4749f46df2c4438e874ed6912a4d7ef1": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget009" - } - }, - "474e0de897334eb69236cc05ae69f164": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatSliderModel", - "state": { 
- "continuous_update": false, - "description": "Slider input:", - "layout": "IPY_MODEL_c234ed19a3204e1d9452d6686e014efb", - "max": 200, - "readout_format": ".0f", - "style": "IPY_MODEL_22126658c9d54cfab48b63029798c705", - "value": 100 - } - }, - "47513573787c4ab1bfafee8a38450355": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "47d275b36e704a74a22098c38f14f301": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "47ed36f4da904759bb9adcf9f1f1685b": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget006" - } - }, - "48392da1f6c64d3fad859465d0d0095b": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "align_items": "stretch", - "display": "flex", - "grid_area": "widget002", - "justify_content": "center" - } - }, - "48a97cf1c4a44a858c3376f962060321": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "48d65f9009904854b076047201074a2c": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_template_areas": "\n \"a00 a01\"\n \"a10 a11\"\n \"a20 a21\"\n \"t0 t1\"\n " - } - }, - "494deb5503e842b78948ed2c14e28e3e": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget009" - } - }, - "49c009585e524d98af99d984cf65a85b": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "4a1bc5d7007848cb89e08eff1479ddf8": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "4a2a0ec5e8f641f489d58e31f3f5fcef": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_d1b7a611e0ea474991c6034e7e7a9e98", - "style": "IPY_MODEL_60104c359482485eaa44f621628fb667", - "value": "Box(3,)" - } - }, - "4a88a99c974d47da993c8bde3faab362": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "justify_content": "center" - } - }, - "4ab1ce52edf54c879f2ee002e94c98f1": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "layout": "IPY_MODEL_29a207365d934cc4a402ed72a19194ca" - } - }, - "4b23820dcff647a6ad204c7c4a596248": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget001" - } - }, - "4b5dc49fbc1743c8abe6cded3f9ed703": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "border": "dotted", - "grid_template_areas": "\"widget001 widget002\"\n\"widget003 widget004\"\n\"widget005 widget006\"\n\"widget007 widget008\"\n\"widget009 widget010\"\n\"widget011 widget012\"\n\"widget013 widget014\"", - "grid_template_columns": "repeat(2, 1fr)", - "grid_template_rows": "repeat(7, 1fr)" - } - }, - 
"4b9184b437ac441e8c485894889e7fd4": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "GridBoxModel", - "state": { - "children": [ - "IPY_MODEL_8865f419c3a04323907d8e9d11f06c24", - "IPY_MODEL_c60dc42b295c47138b76205df9071217", - "IPY_MODEL_85165a2de0d64a2bb9baf9b64b3ffa38", - "IPY_MODEL_bffd75c7e90346ebb8214c6fe0ce2ab4" - ], - "layout": "IPY_MODEL_1cb1d8e98bef410e85502ad2edb46c45" - } - }, - "4bbe95c5e6b34795a2058cc7bf7416f9": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_615934e58366458ea65a907cae98c64e", - "style": "IPY_MODEL_570c4f6867da492cafc6318dd145f87d", - "value": "Dense(n_units=64, relu, in_channels='3', name='hidden_layer1')" - } - }, - "4c528854314c4df18a84eafa4f1a7404": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "SliderStyleModel", - "state": { - "description_width": "" - } - }, - "4cff6dcb31874722a4fcd9052bb1f9b6": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "4d8d22e583c64179817ad9c514bd4490": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget004" - } - }, - "4e6414fcd34b454e94c982f7233402a7": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "4ee9cbafcaad44de9f9e7453ee765047": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "children": [ - "IPY_MODEL_f74c2a3b52114bbc80056d7097731209", - "IPY_MODEL_7fbbe1851a944d69a568c06875de2b0f" - ], - "layout": "IPY_MODEL_2a9fb576ef6145abaf95398bf620cd8d" - } - }, - "4fa0861e758940d9b9c2775304ebb140": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "50ce374ed2fc4f2ebc2c156c16ba4f38": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_5ee808e0128f4e85921b2855f4ff3831", - "style": "IPY_MODEL_6a001a1bb11844d0b85857486c544879", - "value": "mode" - } - }, - "510e33d521264ac387af97dbbb46dd39": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "align_content": "center", - "align_items": "center", - "border": "dotted" - } - }, - "516cc7132ca94faab3023ffcd1ed4cd4": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_5af150388cac4ebc96775a3696923399", - "style": "IPY_MODEL_81621cd1e69f47a1868bf499caac5824", - "value": "Choose your environment" - } - }, - "520b2e1af36547edbae1352d82099fda": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget002" - } - }, - "5228a7a8160f421f846e2d7d06c9d159": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "532ea00fd94045298f69a3917ced39c7": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": 
{ - "align_items": "stretch", - "display": "flex", - "grid_area": "widget008", - "justify_content": "center" - } - }, - "53c0481b6b294cf888f2b3abdc33a95c": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "543b543dd8bb4fcb9dc9f4a16ac4bd6e": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget005" - } - }, - "5446746816dd4edf8dffb29995d15715": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_6406ec864c1848d88b92c9b5248a9c9e", - "style": "IPY_MODEL_891e2bdcc12d4314affa4fd372ed7ade", - "value": "0.0" - } - }, - "5469680f21e44e77b1092b8354d9aee0": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "547d2113aae04e20ba41d30deb33ec5f": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget014" - } - }, - "54927f9f2cde4416bf0e3b782fbd5118": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "5526ed8ea7b4499eadc0bbb165d7bbc4": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "VBoxModel", - "state": { - "children": [ - "IPY_MODEL_965b9a99694b4227a43121ae2e974290", - "IPY_MODEL_e57f860aafca4775a03574208f4944b7" - ], - "layout": "IPY_MODEL_510e33d521264ac387af97dbbb46dd39" - } - }, - "5532430429754176a10d6ab53ba4b6d9": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "55790721852a4ac38f0bf04e1016c16a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_28ad6172b7f34ba9923847d24dd555b3", - "style": "IPY_MODEL_c35cf89d5b4c42c886c9c83fdc93c8e6", - "value": "Environment name:" - } - }, - "55abe6fb296b491ba2e2a09a492b5ae8": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "570c4f6867da492cafc6318dd145f87d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "575f6d3a87c041e4a3005385d7ec75b4": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "VBoxModel", - "state": { - "children": [ - "IPY_MODEL_b20aaab10e6a49138d9cf0a414321c49", - "IPY_MODEL_c2aa94c81efc4f3f826adcb847fbdb89" - ], - "layout": "IPY_MODEL_8173f889450249d58f18acfe83d63ddd" - } - }, - "57f97e2ebec542f8b297365916bf571e": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget011" - } - }, - "58201f662dc74741bcdeb0e7753843c4": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "593926166a704759992244f9732d0f8d": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "595aeae2634948268510587998ec9587": { - 
"model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "children": [ - "IPY_MODEL_575f6d3a87c041e4a3005385d7ec75b4", - "IPY_MODEL_080346c4f0ae457182549d3c68aaaaea" - ], - "layout": "IPY_MODEL_b9743661bbd24d94969c463e1f77d6e8" - } - }, - "59da397a7faa43c79c633dd523b6f07b": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "5ac9e6a121a3488ea93f85f5589429a0": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "5adceaf568da4a1d88d6bf7b379965c2": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_0e74af77352a4b40b0f9e5163d92a836", - "style": "IPY_MODEL_c7a9f23b553e43a78d5c0ced37526327", - "value": "beta_1" - } - }, - "5af150388cac4ebc96775a3696923399": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "5af1a3e17ac64264905701b109c013e2": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "GridBoxModel", - "state": { - "children": [ - "IPY_MODEL_24f450d31f2d47a68aa2c58be28170fb", - "IPY_MODEL_a7d002d3e5454965af1d9cdb2e54e7ca", - "IPY_MODEL_3b0358464a32494ea410b866646b79b1", - "IPY_MODEL_b4047180a5aa44479c358d8c12f0c5d5", - "IPY_MODEL_9fd6a74ce4e54ae38816e55d19327281", - "IPY_MODEL_0eb34e6e2b07401dae9a2bfa4f1d49df", - "IPY_MODEL_5fc0273b28ca4f42b441948986c98e99", - "IPY_MODEL_bd7afa2132154beebd89e4320ebcad26", - "IPY_MODEL_d48e8464b37c4f0099d42e59369dbab6", - "IPY_MODEL_07b0e1377c414989a1d7ce1bf1da1c4e", - "IPY_MODEL_b04b868ce504489c82bd8818501b3ac3", - "IPY_MODEL_d1ba6fbf21674589b3f585f6e0f9638b", - "IPY_MODEL_c083a4b8f36848ed9f277f423ae18084", - "IPY_MODEL_8c168f5c8ecc4d0ba203b60193856d1c" - ], - "layout": "IPY_MODEL_2f93a27048a44beda22771c8249fba0d" - } - }, - "5afcc13ec3d94e6299bd06fb87ed7885": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget007" - } - }, - "5b759ba6fc8f451c97ee15467069a6ed": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "5b87473fb6cc473a89998a285388f4da": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "5bced3d11d4a41a4b3e1c712f83b98e4": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DropdownModel", - "state": { - "_options_labels": [ - "default" - ], - "description": "state type:", - "index": 0, - "layout": "IPY_MODEL_f4d0297192f5464bac7ab02b3dabed2c", - "style": "IPY_MODEL_7fea48aa29c24b4b94784890589e01e4" - } - }, - "5caab83d7d4d4658ac739d02b56e9fd6": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "5daa3bcd6829495cb223328230f0f8e4": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "5ee808e0128f4e85921b2855f4ff3831": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": 
"widget001" - } - }, - "5efb085669c2400a909ac37b5cb4e45e": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "align_items": "stretch", - "display": "flex", - "grid_area": "widget008", - "justify_content": "center" - } - }, - "5f1fda7eb4ac4ce694f721e312e205ab": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "5fc0273b28ca4f42b441948986c98e99": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_18ea002dd43344a5864f8a8651ceeaeb", - "style": "IPY_MODEL_e14f5611fa9242af879512207669394f", - "value": "beta_1" - } - }, - "60104c359482485eaa44f621628fb667": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "611b3bc2e8e749a38fe77bbdab064670": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_3c77984eb49f4b3fbf5b78b313af8071", - "style": "IPY_MODEL_b64d5e345cb5482595aa92662c8f162c", - "value": "epsilon" - } - }, - "615934e58366458ea65a907cae98c64e": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "6187b72c80f64272a6c33c90cb582c4c": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "align_items": "center" - } - }, - "626ae439ee1f4ce4895764fb66f9c6d3": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "629ece3b43ac4c8a8c2f83733a180978": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget005" - } - }, - "62a5e4f04f554e6580d63bb32f36b3be": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "align_items": "stretch", - "display": "flex", - "grid_area": "widget012", - "justify_content": "center" - } - }, - "63c30e87411c45dd8d58dfa485850fc2": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "63d55c74d6ed493abe58361958b23046": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "6406ec864c1848d88b92c9b5248a9c9e": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget012" - } - }, - "64750206fa3a48119aa85e75f5ff2de8": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "GridBoxModel", - "state": { - "children": [ - "IPY_MODEL_55790721852a4ac38f0bf04e1016c16a", - "IPY_MODEL_2ece943ff83c48e8b69e0b2396b6064c", - "IPY_MODEL_7a5d99612efa45acb82149814a4a7e82", - "IPY_MODEL_87b22017505c4d14a335692f09abd816", - "IPY_MODEL_8f5e2c19238240c38947f1a5d8e72792", - "IPY_MODEL_2da2537f2e444e16ad634693e684af58", - "IPY_MODEL_6e144126a66b48f9a22641284932ad73", - 
"IPY_MODEL_ef95b43fb5cd436cb6f737f2defc8e38" - ], - "layout": "IPY_MODEL_48d65f9009904854b076047201074a2c" - } - }, - "660e8c250f974ff685128c61b3d57fe3": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "661fd55473c0431aa9dffd6876d1d559": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "66bc7fd58a2743a0960e9dd5df378998": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "align_items": "stretch", - "display": "flex", - "grid_area": "widget010", - "justify_content": "center" - } - }, - "677e2010d7ce45eb9adc6f26a8977636": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_3556d6d1fe0c4e558b21b70b8c7b9395", - "style": "IPY_MODEL_0580852520e142a89d7b42c50bfef6a1", - "value": "learning_rate" - } - }, - "67a79ba4cbf84418967857e237a5a1be": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "681fa50d92ed4da0afda87805d2383ca": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "layout": "IPY_MODEL_fb19638e8a38465f844aaf06c6378b29", - "style": "IPY_MODEL_47d275b36e704a74a22098c38f14f301", - "value": "Algorithm Parameters" - } - }, - "683e3afa65604f1b85604a79ec228a2b": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "689e8f05af2f4f159239a896e7e9843a": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "68d4eab6f1cf4e2fa0e229ecdce8d392": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget005" - } - }, - "68fcf5652dd14e5fad220fcbe777ddbb": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "description_width": "" - } - }, - "691c17934ca3435eb36a2d84d15ecdf7": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "GridBoxModel", - "state": { - "children": [ - "IPY_MODEL_3a96e3ae233940e18c75f004da9e0459", - "IPY_MODEL_8d18e0fa10b94372a3edf64edb4814bc", - "IPY_MODEL_677e2010d7ce45eb9adc6f26a8977636", - "IPY_MODEL_e224793bc1524f0c91ce3d7ef0e98f8e", - "IPY_MODEL_c34d5f3024f24951b4f478bca62dd7c7", - "IPY_MODEL_6bb0b7ee0cdf49ca97bb0c3b528131e8", - "IPY_MODEL_5adceaf568da4a1d88d6bf7b379965c2", - "IPY_MODEL_6c1a4850cad844f4bd144b78177e6d31", - "IPY_MODEL_c12ffb6b4533460bbdfc7404ff89d807", - "IPY_MODEL_e6c798aa900740009741c67dfccb0d92", - "IPY_MODEL_75b1aa83fa184214aecc8ea858858cd3", - "IPY_MODEL_e1f03c622ff64b3bb4e59fc54e7898a6", - "IPY_MODEL_611b3bc2e8e749a38fe77bbdab064670", - "IPY_MODEL_eb54eb7b3c674e67b10610ce2aaf309a" - ], - "layout": "IPY_MODEL_fb06877af7ae451baefc12dfd27d9348" - } - }, - "6923c73eeac747fdbe41b2062e257a58": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": 
"widget007" - } - }, - "69268529fca5425e9f11506c968490e7": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "698f9329e3754e7482dc32690ba58f4a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "6a001a1bb11844d0b85857486c544879": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "6ab9513a615a4551a596a3d2e637d181": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "6bb0b7ee0cdf49ca97bb0c3b528131e8": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_2c0353597c114ba184977dac607510c3", - "style": "IPY_MODEL_82c3b758724944d0b02d17ecfdd05698", - "value": "False" - } - }, - "6c1a4850cad844f4bd144b78177e6d31": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_e255dc6e7af7487e8a2729f670bffd8a", - "style": "IPY_MODEL_012eeb7c3bab46d9baa05356cd4ff0f6", - "value": "0.9" - } - }, - "6c751fa2c2aa415ea57d3c9b0e11b22d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_254576dd293543d384c9e5620c3db225", - "style": "IPY_MODEL_304f4dcdb42b4bca91451ccfe7eba639", - "value": "max_steps" - } - }, - "6caef128e4df40ebb76ef90ad9a40d41": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "VBoxModel", - "state": { - "children": [ - "IPY_MODEL_a496bd2aabab465fbcf0022dc1acd19f", - "IPY_MODEL_2982ccca674f4bfc839557e06cde9993" - ], - "layout": "IPY_MODEL_fbd450c8b01f4ab9ae7ea1caa129bd66" - } - }, - "6cb628f08ae2469db2ee42e38ca4de74": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "BoxModel", - "state": { - "children": [ - "IPY_MODEL_90d52d8b63c342f087384246a76680d7" - ], - "layout": "IPY_MODEL_759fddd650134c46bbbbd4b4c6f8c744" - } - }, - "6d5b0a5b26874cfd874c4a0bdf307eff": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "VBoxModel", - "state": { - "children": [ - "IPY_MODEL_1537ab75a9dd4f429ffb3812c485116f", - "IPY_MODEL_a18265de326b4d399e760f9d2e5bb238" - ], - "layout": "IPY_MODEL_7208b8f21c77462dad67124eb0fd8164" - } - }, - "6d6739242111448eaf1e80a8962f1aac": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "6db9105409df4485909f169fc6e6d696": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget003" - } - }, - "6dc0399123f94dd1831a2b2cfb6c3078": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "6e144126a66b48f9a22641284932ad73": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": 
"IPY_MODEL_53c0481b6b294cf888f2b3abdc33a95c", - "style": "IPY_MODEL_a8e550f371f94677a29e238776be2cdb", - "value": "Tips:" - } - }, - "6efa143c4b9d43aa94ed8cfe56824583": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "6f0bd8ffadf44461a70b1031b3f65064": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatSliderModel", - "state": { - "continuous_update": false, - "description": "learning curve smooth factor", - "layout": "IPY_MODEL_145001c5826a41cd989997ea61244ca1", - "max": 1, - "step": 0.01, - "style": "IPY_MODEL_4080aa3475b94001b5324fd14d18816c", - "value": 0.8 - } - }, - "6f525160109d45299758550c08196bd9": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "description_width": "" - } - }, - "70c300868924433094e74b74d260a4a2": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "justify_content": "center" - } - }, - "715b10d741354c8db506fb8ba945a074": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_1e327c8e4b844c2fbb017a5544fa678e", - "style": "IPY_MODEL_6ab9513a615a4551a596a3d2e637d181", - "value": "0.0001" - } - }, - "7208b8f21c77462dad67124eb0fd8164": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "731d299fb9dd45c1a41a5d4df4f41f94": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "SliderStyleModel", - "state": { - "description_width": "" - } - }, - "747e88ebfefc4efb95f60f63e725dcc1": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "74d03d1491d4451d879384ab357f33a9": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "align_items": "center" - } - }, - "74dc8e60490943c8b9601232bf24f608": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "7532b84aea3a4f4290efa4b0369e846a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "759c11789beb46f798f3b48c4cf88577": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "759fddd650134c46bbbbd4b4c6f8c744": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "75b1aa83fa184214aecc8ea858858cd3": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_7cdb0eb01b9b434ca4c08fd25f243f09", - "style": "IPY_MODEL_3cfd11894b514078901081bddd35c83d", - "value": "decay" - } - }, - "75c167ca66774581880b2500d5176a36": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "76c7ceb7a42e44048e694b71f27f56eb": { - "model_module": "@jupyter-widgets/base", - 
"model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "76d1b335a0134c19852090005ae135c4": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_e8260cb1f55049a49bdaf024528d43c4", - "style": "IPY_MODEL_def02ee29d9a44b19a1fd20f8a4be1a0", - "value": "name" - } - }, - "76dec90334724f3ba9e51ba05856ff79": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "776cdbcecc004924a856eb45ec0a5699": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_98eeb6cc7ac643ac882d54fab647de04", - "style": "IPY_MODEL_a02320673c484c46848d7aeb6fda6e18", - "value": "1e-07" - } - }, - "78f5897896d144fe839fafd65e76816e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "79611f87c64c431794f17eccbbd60f38": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget012" - } - }, - "79953b3e59c048548c96bb197d46a7ea": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "7a4be7c4229640b18c29d60d30cc0e70": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "7a5d99612efa45acb82149814a4a7e82": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_7e40917d81264ee9986d07bae8291022", - "style": "IPY_MODEL_1e6d0c80ceaa4e58846e9f554371b363", - "value": "Observation space:" - } - }, - "7a6c0819e1344119aae9ef136830ad44": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "7a7ebee6dcf34f36b1d55d2cb443e387": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "7a807eea55d14bae96d792b1e475adcb": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "7aa2babe24dc4fab84bfbd511f0b5e98": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "7aba7921241e41af9a32cbe042699485": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "7af9623e94c64555b01efa581f338e60": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "children": [ - "IPY_MODEL_841b7f5d915e4f639784140b23610d75", - "IPY_MODEL_e904337542fd4e5d8187b9b9190b7522" - ], - "layout": "IPY_MODEL_532ea00fd94045298f69a3917ced39c7" - } - }, - "7b48f1fae96e40519787018ed628b99b": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - 
"7cc3bf6293494425b70569d1eca3af03": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "7cdb0eb01b9b434ca4c08fd25f243f09": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget011" - } - }, - "7d163d682d5744d6ac7be041fb66c158": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "7d64c7c8f2dc4d4eb6218e55ae44bfbe": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "7d70e416e925499f93e5837aabc6afc2": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "7dc1333733194435934e6ca098ede1ad": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "7df23ef826fb4c568071b0667bafcd3b": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_6db9105409df4485909f169fc6e6d696", - "style": "IPY_MODEL_84111028e0ea4937a6fea8f96b279bec", - "value": "model save path" - } - }, - "7e128d275e3c4e88829167514cec3bc6": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "7e40917d81264ee9986d07bae8291022": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "7f3f44cbaac94755810c0e589d048490": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "7f9233b831cc448a97a909e398122bb9": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "7f94bb571172453a920e7bd6d7a9050f": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatSliderModel", - "state": { - "continuous_update": false, - "description": "Slider input:", - "layout": "IPY_MODEL_58201f662dc74741bcdeb0e7753843c4", - "max": 600, - "min": -400, - "readout_format": ".0f", - "style": "IPY_MODEL_b5dd447dec9c48bc8b1bb664c9553912", - "value": 100 - } - }, - "7fbbe1851a944d69a568c06875de2b0f": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatTextModel", - "state": { - "description": "Manual input:", - "layout": "IPY_MODEL_12e50eba7f3e4e9f888416f46172b60f", - "step": null, - "style": "IPY_MODEL_18a7121ba72e42af9a496a39fb8c6f6a", - "value": 100 - } - }, - "7fea48aa29c24b4b94784890589e01e4": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "7ff9e3e9f09b40d398b6c898e5ee9653": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "808fb0e5d6b940388d588196c927564d": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": 
"LayoutModel", - "state": {} - }, - "80d9bf94c37c49708820ccb5a2aa8f8b": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "814eef7fa97a4fa2b4c5f1ed1b3728f3": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_2e8b3025623248e2a92daa5a7750997f", - "style": "IPY_MODEL_bb04f52581bb496e9a6931ce291714c9", - "value": "Dense(n_units=64, relu, in_channels='64', name='hidden_layer2')" - } - }, - "81621cd1e69f47a1868bf499caac5824": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "8173f889450249d58f18acfe83d63ddd": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "8178676fb5e441ec92464938695643a8": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "81a50427a5384feeaaee374a19ad5931": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_template_areas": "\"widget001 widget002\"\n\"widget003 widget004\"\n\"widget005 widget006\"\n\"widget007 widget008\"\n\"widget009 widget010\"\n\"widget011 widget012\"", - "grid_template_columns": "repeat(2, 1fr)", - "grid_template_rows": "repeat(6, 1fr)" - } - }, - "81d1f55272ef4977b06be173bdd59b8c": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "81f34a95028440608c8a5a307cd7ee9b": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "align_content": "center", - "align_items": "center", - "border": "dotted" - } - }, - "82c3b758724944d0b02d17ecfdd05698": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "831ed45407f74193acc07dacada162a9": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "835ef9a1125846679a65d679afb62013": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "8387714984af4e9cbaf16cbff2a45cbb": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget001" - } - }, - "83c18b3b4c374f70947e47230ffe4f82": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_be4d4fbbc53d4705963f9b343aff399f", - "style": "IPY_MODEL_8efed772f09f4ea1a1dabf91598fd49a", - "value": "Optimizer information:" - } - }, - "84111028e0ea4937a6fea8f96b279bec": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "841b7f5d915e4f639784140b23610d75": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatSliderModel", - "state": { - "continuous_update": false, - "description": 
"Slider input:", - "layout": "IPY_MODEL_0b081708649d446ab37f522f5a019e19", - "readout_format": ".0f", - "style": "IPY_MODEL_12a0f20f2ecd423889594f36b15647f1", - "value": 50 - } - }, - "842ea79123034275adec1df392a4846d": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget009" - } - }, - "84f7291061b34bfaaaec0711bd0cca56": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_ae877e1e2a554a19b78fb9a12f60e5d3", - "style": "IPY_MODEL_1f0e424278554da08fbb15138e571a62", - "value": "The action space is continuous." - } - }, - "85165a2de0d64a2bb9baf9b64b3ffa38": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_383cf0cb101341d4bdfb65604a24a4d5", - "style": "IPY_MODEL_23424247d797485dba0788eb6b7614aa", - "value": "model save path" - } - }, - "85514e8a938240e7b2df7c2a8ad6b6e8": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "85d35dbed0594a3a837f536309af0b59": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatSliderModel", - "state": { - "continuous_update": false, - "description": "Slider input:", - "layout": "IPY_MODEL_1db128fafd984258b040b5295b477f0d", - "max": 74, - "min": -26, - "readout_format": ".0f", - "style": "IPY_MODEL_066c122ea5f64991b7347279a79e8061", - "value": 24 - } - }, - "86e357397076415ba3ac239b26a8bc8f": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget006" - } - }, - "8784dbc322c7455aaef2b352bae2f205": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "87b22017505c4d14a335692f09abd816": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "BoxModel", - "state": { - "children": [ - "IPY_MODEL_4a2a0ec5e8f641f489d58e31f3f5fcef" - ], - "layout": "IPY_MODEL_1f37fdacb85646a1b5ff9a2b1d6ab38a" - } - }, - "885608d7df064c51ac0523ef9928e6b6": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_689e8f05af2f4f159239a896e7e9843a", - "style": "IPY_MODEL_b85dbc19731e4b84bb6122ea52367809", - "value": "Action space:" - } - }, - "8865f419c3a04323907d8e9d11f06c24": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_8387714984af4e9cbaf16cbff2a45cbb", - "style": "IPY_MODEL_5daa3bcd6829495cb223328230f0f8e4", - "value": "gamma" - } - }, - "886c73a1052a4a2da9ec06c958855a51": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "88aafdf648784ac7954ce933431f9a3a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "children": [ - "IPY_MODEL_8d80128792d44bf1a0467b7e86df0b54", - "IPY_MODEL_d91d58d65e864faa90c9cc7bfd2959b0" - ], - "layout": "IPY_MODEL_8ff956034aa047d0a8809922cbefa856" - } - }, - "88b977df9d82476298ff3c70d714afe0": { 
- "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatSliderModel", - "state": { - "continuous_update": false, - "description": "learning curve smooth factor", - "layout": "IPY_MODEL_7f9233b831cc448a97a909e398122bb9", - "max": 1, - "step": 0.01, - "style": "IPY_MODEL_35525c0fbffa497eb43f7d5bd081bb0b", - "value": 0.8 - } - }, - "88fc41c33c024f4eb22b13e0ea98e605": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget003" - } - }, - "891909eab8204a4bb78c9a468bc20112": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "layout": "IPY_MODEL_e1f175e02edf40f39585c485ec11cbff", - "outputs": [ - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYkAAAD8CAYAAACCRVh7AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAIABJREFUeJzt3Xl8VOXZ//HPlX0hJBACgbBDAiJCkMiiwqOCilsRKluR2tY+tFar1tpq26e/9nm6qK11a91otaUtKopasKi4F0VLDcgShEzCmgAZEoQwScg61++PDBgwIcssJ8v1fr3mxcx9zpm5MmK+nHPf575FVTHGGGMaE+Z0AcYYY9ovCwljjDFNspAwxhjTJAsJY4wxTbKQMMYY0yQLCWOMMU2ykDDGGNMkCwljjDFNspAwxhjTpAinC/BXr169dPDgwU6XYYwxHcqGDRtKVDWluf06fEgMHjyY7Oxsp8swxpgORUT2tmQ/u9xkjDGmSRYSxhhjmmQhYYwxpkkWEsYYY5pkIWGMMaZJFhLGGGOaZCFhjDGmSRYSxhjTCq9tPciu4jKnywgZCwljjGmhQ55Kblq2kase+YDlH+9DVZ0uKegsJIwxpoVcRfVnEH26R3PXi1v57rOfcKyyxuGqgstCwhhjWijX7QHg+W9N5geXj+C1nCKufPh9Nu474nBlwWMhYYwxLeQq8tAzPoqUhGhuvng4z39rMqow54mPePTdfLzeznf5yULCGGNaKNftIaNPN0QEgPGDevDqbVOYMTqV367JZdHT63Efq3S4ysCykDDGmBZQVfLcHkb0STilPTE2kj8sGMd9Xz6HDXuPcMXD7/PODrdDVQaehYQxxrTA/qPHKa+uIyM14QvbRIR55w3kn9+9kN4J0XzjL9n83yufUlVb50ClgeVXSIjIHBHZJiJeEclq0L5QRDY1eHhFJNO37VciUiAiZae9V7SILBeRfBFZLyKD/anNGGMCyeXrtM7o88WQOGF47wT+cfMFfO38wTy9bjezHv2QnR38ngp/zyRygNnA2oaNqrpMVTNVNRNYBOxW1U2+za8AExp5rxuBI6o6HHgQuM/P2owxJmByfcNfM3o3HRIAMZHh/PxLZ/Onr2ZxsPQ41/z+A17ILuiw91T4FRKqul1Vc5vZbQHwXINj/q2qBxvZbyaw1Pd8BTBNTvQOGWOMw1xuD6ndY0iMi2zR/tNH9eG126Yypn8iP1ixhdue24SnA95TEYo+iXnAsy3YLw0oAFDVWqAUSG5sRxFZLCLZIpJdXFwcsEKNMaYpLren0f6IM0lNjGHZNyfx/UszWL31IFc+8j6bCo4GqcLgaDYkROQtEclp5DGzBcdOBCpUNScg1fqo6hJVzVLVrJSUZtfxNsYYv9R5lbxDZYzo063Vx4aHCd+dls7yxZPweuG6xz/kiX/t7DD3VEQ0t4OqTvfj/efTsrMIgP3AAKBQRCKAROCwH59tjDEBsfdwOdW1XtLP0GndnKzBPXn11inc/dIW7n1tBx/klfDAvLH0TogJYKWBF7TLTSISBsylQX9EM1YBN/ieXwe8ox21p8cY06mcGNl0+j0SrZUYF8ljC8/lntnnkL33M6546H3ezT0UiBKDxt8hsLNEpBCYDKwWkTUNNk8FClR112nH/MZ3TJyIFIrIz32bngKSRSQfuAO425/ajDEmUE6MbEpvw+Wm04kICyYM5JVbLqRXt2i+/ueP+eU/P6W61uv3eweDdPR/rGdlZWl2drbTZRhjOrGbn9nI1sJS1v7w4oC+b2VNHb9avZ2//Xsv56Ql8siCcQzpFR/Qz2iKiGxQ1azm9rM7ro0xphmuIs8Zb6Jrq5jIcH5x7WieXDSefZ9VcPUj7/PSxsKAf44/LCSMMeYMqmrr2F1STkYALjU15fKzU3nttimc3S+RO57fzPeWb6KsqjZon9caFhLGGHMGu0vKqfUqI1p5j0Rr9UuK5dnFk/je9AxWbtrPVY+8z5ZC5++psJAwxpgzcLl903EE4XLT6cLDhNump/Pc4snU1HqZ/diHLFnr7D0VFhLGGHMGriIP4WHC0JTQdCgDTBjSk1dvm8K0s3rz61d38LW/fEyxpypkn9+QhYQxxpxBrtvDkF7xREeEh/Rzk+KieOL68fzy2tGs33WYKx5ey1pX6KchspAwxpgzcPlWo3OCiHD9pEGsuuVCesZH8dWn/8M9r24P6T0VFhLGGNOE49V17PusIiT9EWcyIjWBlTdfyFcmDuTJtbuY88SH7D1cHpLPtpAwxpgm5B8qQ9X/6TgCITYqnF/POofHF57L7pJyrnrkA9bllwT9c5ud4M8YY7qq3BOr0QV5+GtrXHFOX8YMSOL/XtkWkjMcCwljjGmCy+0hKiKMQT3jnC7lFGlJsTy5qNkZNQLCLjcZY0wTcos8DEvpRkR41/1V2XV/cmOMaYbL7WnTQkOdiYWEMcY04lhlDQdLK9tVf4QTLCSMMaYReQFaaKijs5AwxphGnFhoyOl7JJxmIWGMMY1wuT3ERYWTlhTrdCmO8nf50jkisk1EvCKS1aB9oYhsavDwikimiMSJyGoR2eE77t4Gx0SLyHIRyReR9SIy2J/ajDHGH7lFHtL7JBAWJk6X4ih/zyRygNnA2oaNqrpMVTNVNRNYBOxW1U2+zfer6khgHHCBiFzha78ROKKqw4EHgfv8rM0YY9o
"DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "c34d5f3024f24951b4f478bca62dd7c7": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_40c1e5560977460b86028ca09ee94662", - "style": "IPY_MODEL_e00c049b23f34848a62ee225b63ec0b7", - "value": "amsgrad" - } - }, - "c35cf89d5b4c42c886c9c83fdc93c8e6": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "c3c09aa3ecea45eda2b142c857c5d7c5": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget011" - } - }, - "c3d17e5a575344968f8b84a174b26ba9": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget004" - } - }, - "c3ef353dd171416da3dc55582107fa67": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "VBoxModel", - "state": { - "children": [ - "IPY_MODEL_266e10703ed340a78b259c7d3ddc8836", - "IPY_MODEL_64750206fa3a48119aa85e75f5ff2de8" - ], - "layout": "IPY_MODEL_a517b57a04ed49bf82a0820df4bcf3b2" - } - }, - "c4662ffdadef4c7d82aba5ddca1fbfda": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "c480ff00167c4205a51065548cbea855": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "c60dc42b295c47138b76205df9071217": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "children": [ - "IPY_MODEL_432a3a690b36409192aa3ee4dd5fedf8", - "IPY_MODEL_08f5684d8e194916ac04ed379e2bf022" - ], - "layout": "IPY_MODEL_48392da1f6c64d3fad859465d0d0095b" - } - }, - "c726054bb59f40aab21ea2d4485ce77e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "c75a9640bb26465785ca214520007519": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "c7a9f23b553e43a78d5c0ced37526327": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "c90e24c07a754360836c2acc6f3a7e22": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "VBoxModel", - "state": { - "children": [ - "IPY_MODEL_9ac98c15de5a4548a99d80e8ea3004c9", - "IPY_MODEL_f10d3787733a4ece9120c3641017114b" - ], - "layout": "IPY_MODEL_6187b72c80f64272a6c33c90cb582c4c" - } - }, - "ca41ace6e197496b8d0e375f53b92729": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "VBoxModel", - "state": { - "children": [ - "IPY_MODEL_516cc7132ca94faab3023ffcd1ed4cd4", - "IPY_MODEL_329f804132904f47a73d10b3ccba4b4d", - "IPY_MODEL_a0371ec3949944198211395dc7848ba6" - ], - "layout": "IPY_MODEL_9c226167c8fb4cfab3a7161a87588ae1" - } - }, - "ce069bda2c504adabddf4308b196d410": { - "model_module": 
"@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "ce5b0166c393435a840819472b761b8c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "ce5b912531614dfe90ee3e20fa7ba467": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "height": "250px", - "width": "350px" - } - }, - "ce777268358f48608666122680449e3c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "ce96b4fa2ae14c6f8f4af830f9442000": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "cefe9c21582d46dc9471bee195b466b7": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "cf3de6c59d124068af4aef37293c26e2": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget003" - } - }, - "cfb6b6bcedad4f61893206fb1eb28385": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "GridBoxModel", - "state": { - "children": [ - "IPY_MODEL_283080f17fcf4286b2e6e059bcda3370", - "IPY_MODEL_04461564de8c45d6af4c6055f7b4c17f", - "IPY_MODEL_9dfcd5e4ec744ed4a0a9091bed5ed2d8", - "IPY_MODEL_334d1a726d2347db82e42df5760618b3", - "IPY_MODEL_6c751fa2c2aa415ea57d3c9b0e11b22d", - "IPY_MODEL_43730220bf8e489cae588fcf375d08cf", - "IPY_MODEL_a038c2e1def5473484b4d9bbc5393145", - "IPY_MODEL_7af9623e94c64555b01efa581f338e60", - "IPY_MODEL_389174ab87e24a48a23ad5f81a32da61", - "IPY_MODEL_4ee9cbafcaad44de9f9e7453ee765047", - "IPY_MODEL_3a3916bde1e849aeae0e2701258ddc34", - "IPY_MODEL_88aafdf648784ac7954ce933431f9a3a" - ], - "layout": "IPY_MODEL_19b0d8173d9141e0a0db8d0b2110c98c" - } - }, - "cfc4c351d9da4a2bbe36bb1288f74e82": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "d02f0cd6f8f94156ac86605286a6ee78": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "align_items": "center" - } - }, - "d16d026731104f40ad77f1c7b8f77bf6": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget009" - } - }, - "d1b7a611e0ea474991c6034e7e7a9e98": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "d1ba6fbf21674589b3f585f6e0f9638b": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_79611f87c64c431794f17eccbbd60f38", - "style": "IPY_MODEL_a2bf112fa96c4e8aba14a96af2788dbc", - "value": "0.0" - } - }, - "d20f2266d6fc44df988c78b63b202a81": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget014" - } - }, - "d21ecfeb69a54154ad0c0cadf69db4fa": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - 
"state": { - "description_width": "" - } - }, - "d220d182817c44408e2df2a364760e43": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "d2ba7f491ec94768be174bba323aff6d": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget009" - } - }, - "d34c7789bb974de1a36ef3cc45737b52": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget010" - } - }, - "d439f3de7aeb4f059483dedb8aca131a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "d466ecd3ea76446fa72d90acf2d7c5ba": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "d48e8464b37c4f0099d42e59369dbab6": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_b672fea2d3ac4732a92e992eaaef260e", - "style": "IPY_MODEL_f834d6547a954a478d9e755653e4f5a1", - "value": "beta_2" - } - }, - "d4c91e304ca34f88a4c959ecc4683678": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "d5a3129aed5d47718c478523d35359ad": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "d6a04d9b77b54ae89af21fa5551e205e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_214c87e57eb641bb89644c9f465889ca", - "style": "IPY_MODEL_7a807eea55d14bae96d792b1e475adcb", - "value": "save_interval" - } - }, - "d6ddb43e654a421ead72beacfae7145e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_2dab24721ba34bd789afa55d1479464b", - "style": "IPY_MODEL_0a21d0f35913467a9b266a75d2af8db0", - "value": "Supported algorithms are shown below" - } - }, - "d915d378018e4bd085cf4a0a935e2aaa": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_d16d026731104f40ad77f1c7b8f77bf6", - "style": "IPY_MODEL_7aba7921241e41af9a32cbe042699485", - "value": "test_episodes" - } - }, - "d91d58d65e864faa90c9cc7bfd2959b0": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatTextModel", - "state": { - "description": "Manual input:", - "layout": "IPY_MODEL_182c5797541f4476bb02c95a710f1bca", - "step": null, - "style": "IPY_MODEL_6dc0399123f94dd1831a2b2cfb6c3078", - "value": 10 - } - }, - "d932e823fc31419d9d00cb89736f8a5f": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "VBoxModel", - "state": { - "children": [ - "IPY_MODEL_a0b2c18704554c60bfb62c5c7ea46e34", - "IPY_MODEL_f80bd1f80d99494595e88c9fc5f055d2" - ], - "layout": "IPY_MODEL_f3645a595f8c4e1f82d71ed6f97e7dd6" - } - }, - "d9864398064d4a4ea93f2f985bf70bb5": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", 
- "model_name": "LayoutModel", - "state": { - "grid_area": "widget004" - } - }, - "d99dceda8ae6483f8df298525d45be82": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "border": "solid" - } - }, - "d9b467355fa940af8f164b0b53137582": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "da04b8e9a4464f7ea141e41904fa3b0f": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "da5536ed85464ee5a97c44660b985348": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "GridBoxModel", - "state": { - "children": [ - "IPY_MODEL_469da089cf804101a4cbc570975a1aed", - "IPY_MODEL_dc4226a0086147b29ba43f099ccad551", - "IPY_MODEL_7df23ef826fb4c568071b0667bafcd3b", - "IPY_MODEL_f5879b9ebaab4df9b53830cef8c25e62" - ], - "layout": "IPY_MODEL_de78a9211dba417182808fc83d0ebbf8" - } - }, - "da5694fd870b41e79f41ebc7d7b8db5e": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget002" - } - }, - "dc12042cc1bb40c98a69bef90468797a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "dc4226a0086147b29ba43f099ccad551": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "children": [ - "IPY_MODEL_f5c5c8e022aa4f239006a40e2ac8b990", - "IPY_MODEL_b5214d589d704727964cdb67261b2d47" - ], - "layout": "IPY_MODEL_b2ed3221465c4c7097b79683b8e5c5f0" - } - }, - "dca0afd22296462f8a0e11b82566f289": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "dceb338b27c742cd8733350448a2e798": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "dd3cb8ec44e2454a9fd787b26a794aa2": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_39c394badc7246fdb12032649f71a1b6", - "style": "IPY_MODEL_ce96b4fa2ae14c6f8f4af830f9442000", - "value": "learning_rate" - } - }, - "dd51349042bc4341b061da02df9f8be2": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget003" - } - }, - "dd631605869640d9b8564da50fd7f14e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_2dece16eb4994e5082a1cbeeea4163d0", - "style": "IPY_MODEL_d439f3de7aeb4f059483dedb8aca131a", - "value": "Adam" - } - }, - "ddaf2150308c4af2876f9f423d0b803d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "ddba268ea0db428898643ae0f9a259a3": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "align_items": "stretch", - "display": "flex", - "grid_area": "widget006", - 
"justify_content": "center" - } - }, - "de78a9211dba417182808fc83d0ebbf8": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_template_areas": "\"widget001 widget002\"\n\"widget003 widget004\"", - "grid_template_columns": "repeat(2, 1fr)", - "grid_template_rows": "repeat(2, 1fr)" - } - }, - "de8a6e2e9cb447439055e987582fc63e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "def02ee29d9a44b19a1fd20f8a4be1a0": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "df228d4f3b644bb081011555c9f36485": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget010" - } - }, - "df84370f89e949518569f900854e2510": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "e00c049b23f34848a62ee225b63ec0b7": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "e09e0ff65ebf454b80a965aaa0f61d32": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "VBoxModel", - "state": { - "children": [ - "IPY_MODEL_5526ed8ea7b4499eadc0bbb165d7bbc4", - "IPY_MODEL_d932e823fc31419d9d00cb89736f8a5f" - ], - "layout": "IPY_MODEL_54927f9f2cde4416bf0e3b782fbd5118" - } - }, - "e0a1f12f4f0e4e31adc281b1fe6dee11": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "e14f5611fa9242af879512207669394f": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "e1d753092ae3420ead7a3086b9405f2a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_e9add15a402448ee8f55d0a65f2d460c", - "style": "IPY_MODEL_ddaf2150308c4af2876f9f423d0b803d", - "value": "Pendulum-v0" - } - }, - "e1f03c622ff64b3bb4e59fc54e7898a6": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_b5bcfb7873f44eba8f8f90e018f09b6a", - "style": "IPY_MODEL_c3233dc4967548279ff54f73e91e27a0", - "value": "0.0" - } - }, - "e1f175e02edf40f39585c485ec11cbff": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "height": "250px", - "width": "350px" - } - }, - "e210fdbc53d246a2ae55da6a3689745b": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "e224793bc1524f0c91ce3d7ef0e98f8e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_4d8d22e583c64179817ad9c514bd4490", - "style": "IPY_MODEL_f91418c725364297a60aa4983253ae07", - "value": "0.0002" - } - }, - "e255dc6e7af7487e8a2729f670bffd8a": { - "model_module": 
"@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget008" - } - }, - "e27f2db74f874171acd272cf848ddc80": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget011" - } - }, - "e2d50772ac80494ea293f047efb33527": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_a592a51f7f3d40cf81de06ff0c9e1546", - "style": "IPY_MODEL_d5a3129aed5d47718c478523d35359ad", - "value": "0.0002" - } - }, - "e2ecea0189544c41a0ca172743cf16a1": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatTextModel", - "state": { - "description": "Manual input:", - "layout": "IPY_MODEL_1adbcde168d04bcdaed1c410feae74ac", - "step": null, - "style": "IPY_MODEL_4e6414fcd34b454e94c982f7233402a7", - "value": 100 - } - }, - "e318e3ad8e11430d840261e7eb1b540e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "SliderStyleModel", - "state": { - "description_width": "" - } - }, - "e35bce23c28f4af3b0d4dce2266ed2e8": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "e3adb676dd9b48a6bd4e895ac644b653": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "e41fe8ee1bf04764abe02428057a540a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "e4665eee9731436a839eaebea246f048": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_0d95601931d94f8cac55349f5886038a", - "style": "IPY_MODEL_ee84c4f73d284618aa3241fcb758da9f", - "value": "Box(1,)" - } - }, - "e467ed3285684035a013df63ebb6b422": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "e527873f8829445dbdb49e0710132c63": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "e53d3b32848c4872a5e1254a2ed080f1": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "e57f860aafca4775a03574208f4944b7": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "VBoxModel", - "state": { - "children": [ - "IPY_MODEL_fd1693effce0420c8f4bbbebde0ef7c3", - "IPY_MODEL_4bbe95c5e6b34795a2058cc7bf7416f9", - "IPY_MODEL_9ee876553e424052a509a2daed8da1c6", - "IPY_MODEL_07b040199f664673b2cb1b45c5a5af34" - ], - "layout": "IPY_MODEL_41425cf814dc44c49ac901aeec4c668f" - } - }, - "e62a214128d34799be2e1cc2cdb98b8c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "e6958eae462d43d8bdb9c6227deddcc7": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - 
"e6c798aa900740009741c67dfccb0d92": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_df228d4f3b644bb081011555c9f36485", - "style": "IPY_MODEL_63d55c74d6ed493abe58361958b23046", - "value": "0.999" - } - }, - "e8260cb1f55049a49bdaf024528d43c4": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget001" - } - }, - "e835260b70924edd959ac38cbdaa50d3": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget013" - } - }, - "e8b87d816ccb409083b0c522ef0bd9dd": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget004" - } - }, - "e904337542fd4e5d8187b9b9190b7522": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatTextModel", - "state": { - "description": "Manual input:", - "layout": "IPY_MODEL_b50b99192c944a348df722c9f5cdaa90", - "step": null, - "style": "IPY_MODEL_831ed45407f74193acc07dacada162a9", - "value": 50 - } - }, - "e923a0f829b14a6b83f8ef159b7e1e67": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_e27f2db74f874171acd272cf848ddc80", - "style": "IPY_MODEL_b3a43d5f73df48299fdf24a855c623a7", - "value": "decay" - } - }, - "e944a76d793541058cf5f32563847fb3": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "e9794b57be6c4c0e981a017d3fa82a36": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "e9add15a402448ee8f55d0a65f2d460c": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "e9d6d91ceda64a63b9fe358e90337820": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "layout": "IPY_MODEL_fe785154b75c4badbab0d946f05802cf", - "style": "IPY_MODEL_78f5897896d144fe839fafd65e76816e", - "value": "Environment Information" - } - }, - "eb54eb7b3c674e67b10610ce2aaf309a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_547d2113aae04e20ba41d30deb33ec5f", - "style": "IPY_MODEL_7b48f1fae96e40519787018ed628b99b", - "value": "1e-07" - } - }, - "eb5620a9d421450a9c0b629c52d3d8ba": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "eb5fdb48aa1d483fa9acf05a229ef307": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "ebff747fea3f4cf2abb9efcd9f998ddb": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "ec1d469669a2411f9a5a7a1774480576": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_4749f46df2c4438e874ed6912a4d7ef1", - "style": "IPY_MODEL_7cc3bf6293494425b70569d1eca3af03", - "value": "beta_2" - } - 
}, - "ec6b04eac2cd4e5a821244a954846a39": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "ecc6da99cf7944f5a5a6cfd1f0516aa6": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "ed746bfae28741e9ae1d450dd1394423": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "align_items": "center" - } - }, - "ee84c4f73d284618aa3241fcb758da9f": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "eef437964b4e4fa29ea42afc6b9a69ce": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_8f01f6cb90754bcb8b2e64809505291d", - "style": "IPY_MODEL_ce777268358f48608666122680449e3c", - "value": "Box(1,)" - } - }, - "ef95b43fb5cd436cb6f737f2defc8e38": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_747e88ebfefc4efb95f60f63e725dcc1", - "style": "IPY_MODEL_078c44ca72d24661bbeb9921196ddace", - "value": "The action space is continuous." - } - }, - "f10d3787733a4ece9120c3641017114b": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "children": [ - "IPY_MODEL_26036b1a064245a6a1cef60ec7d39376", - "IPY_MODEL_af4e53453b1a434e9426fd63d61888c5" - ], - "layout": "IPY_MODEL_70c300868924433094e74b74d260a4a2" - } - }, - "f1888922c93c435f8bac11033ae325e9": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "VBoxModel", - "state": { - "children": [ - "IPY_MODEL_455c6fed537d48b188edef0200ab0fb1", - "IPY_MODEL_2e65a763e5db40ca8969c36950c0d9bd", - "IPY_MODEL_8c27b4b759354d64b25bcb3462c444ef" - ], - "layout": "IPY_MODEL_74d03d1491d4451d879384ab357f33a9" - } - }, - "f1985e262a7d401ea97c903091713789": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "f2612900bd944258af3be77cacc7a46b": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "f29a7f4ff2a74bbf8d6485cbfb086152": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "layout": "IPY_MODEL_bf3a856d0c5f4d47abf596f528a2d947", - "style": "IPY_MODEL_10685777c5384041b62b4ce3aa26bf6e", - "value": "Environment Selector" - } - }, - "f29ba87ee02f4fc38760b98a32e20581": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "f2db93e6094b47d0bfce3821b33d707a": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "f2ffc80dd5074916b1a69e9de91149f9": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget001" - } - }, - "f3645a595f8c4e1f82d71ed6f97e7dd6": { - "model_module": "@jupyter-widgets/base", - 
"model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "align_content": "center", - "align_items": "center", - "border": "dotted" - } - }, - "f401d82a291f4cdb9d44cf62f1c48978": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_86e357397076415ba3ac239b26a8bc8f", - "style": "IPY_MODEL_faea715cb8894b8ca444f80d17c07e12", - "value": "False" - } - }, - "f48e72d8d0b5470798d5faeed3dc8e40": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "f4d0297192f5464bac7ab02b3dabed2c": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "f5879b9ebaab4df9b53830cef8c25e62": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_afeba836a14d4fb6a7c5407794848b80", - "style": "IPY_MODEL_9fc5c513843a4c0fa7ae9c8b37c3b4ff", - "value": "./model/AC-Pendulum-v0" - } - }, - "f5c5c8e022aa4f239006a40e2ac8b990": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatSliderModel", - "state": { - "continuous_update": false, - "description": "Slider input:", - "layout": "IPY_MODEL_e527873f8829445dbdb49e0710132c63", - "max": 1.8, - "readout_format": ".1f", - "step": 0.1, - "style": "IPY_MODEL_2b0d8567d4aa4e53a5837284b315cc58", - "value": 0.9 - } - }, - "f63f7fca433e4d32ad6252416895155b": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "f6f23b9ba55946d0aa626d62ba4bbdf5": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "f72ef10c1acd44608d2db2b932f2b167": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "f74c2a3b52114bbc80056d7097731209": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatSliderModel", - "state": { - "continuous_update": false, - "description": "Slider input:", - "layout": "IPY_MODEL_80d9bf94c37c49708820ccb5a2aa8f8b", - "max": 200, - "readout_format": ".0f", - "style": "IPY_MODEL_731d299fb9dd45c1a41a5d4df4f41f94", - "value": 100 - } - }, - "f77e6fff86704faea6c01e0262104c70": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "VBoxModel", - "state": { - "children": [ - "IPY_MODEL_0a575cd57803474a9574922e07d3d316", - "IPY_MODEL_8d025735275c4dfdbbbf2d491e727c08" - ], - "layout": "IPY_MODEL_5b759ba6fc8f451c97ee15467069a6ed" - } - }, - "f80bd1f80d99494595e88c9fc5f055d2": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "VBoxModel", - "state": { - "children": [ - "IPY_MODEL_9dd1d4acaad44f16b1bbf0693ee9fad5", - "IPY_MODEL_1cb88e139a0642afb2f3c958dff539aa", - "IPY_MODEL_2e6e71650a6a48878fce055c8e563538", - "IPY_MODEL_fe6a7094bdd649e6b5270a701e12253a" - ], - "layout": "IPY_MODEL_bfa16a837ebd4ec795d5aa0a893d5298" - } - }, - "f834d6547a954a478d9e755653e4f5a1": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": 
"" - } - }, - "f8a20f2f4b8b4c03857bcd85bf96b136": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "f8eb99b0291b45dda1b391805141e984": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_5afcc13ec3d94e6299bd06fb87ed7885", - "style": "IPY_MODEL_d4c91e304ca34f88a4c959ecc4683678", - "value": "beta_1" - } - }, - "f91418c725364297a60aa4983253ae07": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "f9a9a8529629435f926e28c9e2ff6d21": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "f9b983bef3a14087b6d1f966b8b041ed": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "f9cd83ba01bb440b9510e0ada3cfd4aa": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "fa3877a284354fd08f33d320314b6765": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_57f97e2ebec542f8b297365916bf571e", - "style": "IPY_MODEL_454021a337164bae8a96f5a5a7749b78", - "value": "decay" - } - }, - "faea715cb8894b8ca444f80d17c07e12": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "fb06877af7ae451baefc12dfd27d9348": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "border": "dotted", - "grid_template_areas": "\"widget001 widget002\"\n\"widget003 widget004\"\n\"widget005 widget006\"\n\"widget007 widget008\"\n\"widget009 widget010\"\n\"widget011 widget012\"\n\"widget013 widget014\"", - "grid_template_columns": "repeat(2, 1fr)", - "grid_template_rows": "repeat(7, 1fr)" - } - }, - "fb19638e8a38465f844aaf06c6378b29": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "fbd450c8b01f4ab9ae7ea1caa129bd66": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "align_content": "center", - "align_items": "center", - "border": "dotted" - } - }, - "fc20a5f1e967425c840960c1948f00c8": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget011" - } - }, - "fc69d16aa7e547b09859e2ca7dbfbde8": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "VBoxModel", - "state": { - "children": [ - "IPY_MODEL_97f58376ed524fab85dde1ea5f67ee17", - "IPY_MODEL_0dc03ae5db46484a85272ce1899e53c0" - ], - "layout": "IPY_MODEL_81f34a95028440608c8a5a307cd7ee9b" - } - }, - "fc6a2f4827034d64b99a15547f3d9f43": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_cf3de6c59d124068af4aef37293c26e2", - "style": "IPY_MODEL_1222c8a942134f83aa262d9b321ee413", - "value": "render" - } 
- }, - "fc83fd9df36b4c0fa6ee544fe520cde7": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_area": "widget007" - } - }, - "fca1d8802f264b48aa3f7bef2b5f5b81": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "layout": "IPY_MODEL_c096b60cb96b4aa68be8728e6feb2366", - "style": "IPY_MODEL_7532b84aea3a4f4290efa4b0369e846a", - "value": "Algorithm Parameters" - } - }, - "fca98009fe56433b97f1fd16969f9a35": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_43f9446733e242f1977bbe394ddc479b", - "style": "IPY_MODEL_660e8c250f974ff685128c61b3d57fe3", - "value": "Environment settings" - } - }, - "fd1693effce0420c8f4bbbebde0ef7c3": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_4fa0861e758940d9b9c2775304ebb140", - "style": "IPY_MODEL_661fd55473c0431aa9dffd6876d1d559", - "value": "Input(shape=(None, 3), name='input_layer')" - } - }, - "fe547223f16e423fa8493d4c6ae577ba": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "VBoxModel", - "state": { - "children": [ - "IPY_MODEL_f77e6fff86704faea6c01e0262104c70", - "IPY_MODEL_9e37b046f2d841dd9572b2284a729bf5" - ], - "layout": "IPY_MODEL_48a97cf1c4a44a858c3376f962060321" - } - }, - "fe6a7094bdd649e6b5270a701e12253a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "layout": "IPY_MODEL_a860d9c958c646aa89ae598dc67eaa08", - "style": "IPY_MODEL_85514e8a938240e7b2df7c2a8ad6b6e8", - "value": "Dense(n_units=1, No Activation, in_channels='64', name='dense_1')" - } - }, - "fe785154b75c4badbab0d946f05802cf": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": {} - }, - "ff06931e66b544389c8f409734b472e3": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "grid_template_areas": "\n \"net_label net_info\"\n \"opt_label opt_info\"\n " - } - }, - "ff0e9f4940eb4b57bd99d96059b5e194": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "ffce2434eb114cd1a7f6961dd71ff755": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "align_items": "center" - } - } - }, - "version_major": 2, - "version_minor": 0 - } - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Interactive Hyper-parameter Configuration\n", + "This is a use case provided by RLzoo to support an interactive hyper-parameter configuration process. It is built with *ipywidgets* package, so make sure you have the package installed:\n", + "\n", + "```! pip3 install ipywidgets==7.5.1```\n", + "\n", + "You just need to **run** each cell (Shift+Enter) and **select** the sliders or dropdown lists to configure the hyper-parameters for the learning process, for whichever algorithm and environment supported in RLzoo. \n", + "\n", + "It follows four steps:\n", + "1. Environment Configuration\n", + "2. 
Environment Information Display and Algorithm Configuration\n", + "3. Algorithm Parameters Display and Learning Parameters Configuration\n", + "4. Launch Learning with Visualization\n", + "\n", + "Tips:\n", + "To stop the learning process and start a new one, you need to restart the kernel (always works) or interrupt the kernel (does not always work).\n", + "\n", + "Have fun!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "1. Environment Configuration\n", + "-----------------------------\n", + "Run an environment selector and select an environment you like.\n", + "\n", + "Tips: no need to rerun after selection; go directly to the next cell.\n", + "\"\"\"\n", + "\n", + "from rlzoo.interactive.common import *\n", + "from rlzoo.interactive.components import *\n", + "from rlzoo.algorithms import *\n", + "from rlzoo.common.env_wrappers import build_env, close_env\n", + "env_sel = EnvironmentSelector()\n", + "display(env_sel)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "2. Environment Information Display and Algorithm Configuration\n", + "--------------------------------------------------------------\n", + "Run this cell to create the environment instance.\n", + "\n", + "Tips: rerun this cell every time you select a new environment in the cell above, \\\n", + "because this cell builds the environment.\n", + "\"\"\"\n", + "\n", + "try:\n", + " close_env(env) # close the previous environment, if any\n", + "except Exception:\n", + " pass\n", + "env = build_env(**env_sel.value)\n", + "print('Environment created!')\n", + "display(EnvInfoViewer(env))\n", + "\n", + "# run an algorithm selector and select an RL algorithm\n", + "alog_sel = AlgorithmSelector(env)\n", + "display(alog_sel)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "3. Algorithm Parameters Display and Learning Parameters Configuration\n", + "----------------------------------------------------------------------\n", + "Load the default parameters of the selected algorithm for the chosen environment and display them, \\\n", + "then select the learning parameters.\n", + "\n", + "Tips: rerun this cell after you create a different algorithm or environment.\n", + "\"\"\"\n", + "\n", + "EnvType, AlgName = env_sel.value['env_type'], alog_sel.value\n", + "alg_params, learn_params = call_default_params(env, EnvType, AlgName)\n", + "print('Default parameters loaded!')\n", + "\n", + "# view the networks and optimizers, and adjust other parameters\n", + "algiv = AlgoInfoViewer(alog_sel, alg_params, learn_params)\n", + "display(algiv)\n", + "\n", + "# run this to generate the algorithm instance with the parameter settings above\n", + "alg_params = algiv.alg_params\n", + "alg = eval(AlgName+'(**alg_params)')\n", + "print('Algorithm instance created!')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "4. 
Launch Learning with Visualization \n", + "---------------------------------------\n", + "Run the cell to train the algorithm with the configurations above.\n", + "\"\"\"\n", + "\n", + "learn_params = algiv.learn_params\n", + "om = OutputMonitor(learn_params, smooth_factor=algiv.smooth_factor)\n", + "display(om)\n", + "with om.print_out:\n", + " alg.learn(env=env, plot_func=om.plot_func, **learn_params)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# whenever leaving the page, please close the environment by the way\n", + "close_env(env)\n", + "print('Environment closed')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.10" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": { + "00663174be1342fbbd29bc99cdd6d3aa": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "00ead8f3c1ea4020930b11c3bde3dd48": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "LabelModel", + "state": { + "layout": "IPY_MODEL_543b543dd8bb4fcb9dc9f4a16ac4bd6e", + "style": "IPY_MODEL_f63f7fca433e4d32ad6252416895155b", + "value": "max_steps" + } + }, + "0106cced0fe54fbb9a3a261b11941cce": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_593926166a704759992244f9732d0f8d", + "style": "IPY_MODEL_4a1bc5d7007848cb89e08eff1479ddf8", + "value": "Learn Parameters" + } + }, + "012eeb7c3bab46d9baa05356cd4ff0f6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "0143906a10054b1594675c3674642d83": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "014bf4270fea44b6aad4c80c7a5979b7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "019cd764de374cb382236f88a5d204af": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "01cece59d650454b9cf09d03e85a6a10": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "LabelModel", + "state": { + "layout": "IPY_MODEL_81d1f55272ef4977b06be173bdd59b8c", + "style": "IPY_MODEL_e62a214128d34799be2e1cc2cdb98b8c", + "value": "Network information:" + } + }, + "0201bde3e922471d9bb86857be61df95": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatTextModel", + "state": { + "description": "Manual input:", + "layout": "IPY_MODEL_8178676fb5e441ec92464938695643a8", + "step": null, + "style": "IPY_MODEL_0143906a10054b1594675c3674642d83", + "value": 24 + } + }, + "02904d8bc2d442deb3da0b5e6e0363a9": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "04461564de8c45d6af4c6055f7b4c17f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "RadioButtonsModel", + "state": { + "_options_labels": [ + "train", + "test" + ], + "index": 0, + "layout": "IPY_MODEL_520b2e1af36547edbae1352d82099fda", + "style": "IPY_MODEL_2c9a721e0f084f8f8f437a5d4d875e3f" + } + }, + "04abdee05e514880bb74dfe64bca36ff": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "LabelModel", + "state": { + "layout": "IPY_MODEL_9384c24875c24e5b8be37d4c55e04820", + "style": "IPY_MODEL_bebb739676c74aacb396889de39592e6", + "value": "0.9" + } + }, + "0580852520e142a89d7b42c50bfef6a1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "066c122ea5f64991b7347279a79e8061": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "SliderStyleModel", + "state": { + "description_width": "" + } + }, + "06d5c4249f3d404793fe2defc8eb0051": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "VBoxModel", + "state": { + "children": [ + "IPY_MODEL_5af1a3e17ac64264905701b109c013e2", + "IPY_MODEL_691c17934ca3435eb36a2d84d15ecdf7" + ], + "layout": "IPY_MODEL_7d163d682d5744d6ac7be041fb66c158" + } + }, + "070bc781a91449c6a7fb227586d347e6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "LabelModel", + "state": { + "layout": "IPY_MODEL_494deb5503e842b78948ed2c14e28e3e", + "style": "IPY_MODEL_2d1f0d1b81ee4e1f85ae2f777dcd0db9", + "value": "beta_2" + } + }, + "07377f1ec0e74dd4897d484914a44f99": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "077609b632e64492acbc9a009222e086": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "078c44ca72d24661bbeb9921196ddace": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "07b040199f664673b2cb1b45c5a5af34": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "LabelModel", + "state": { + "layout": "IPY_MODEL_47513573787c4ab1bfafee8a38450355", + "style": "IPY_MODEL_0abdf6aca8e44b2f96d9e278ce60a016", + "value": "Dense(n_units=1, tanh, in_channels='64', name='dense_2')" + } + }, + "07b0e1377c414989a1d7ce1bf1da1c4e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "LabelModel", + "state": { + "layout": "IPY_MODEL_0bd6e0b89391415fa6fc2c7f7fbf3bd3", + "style": "IPY_MODEL_da04b8e9a4464f7ea141e41904fa3b0f", + "value": "0.999" + } + }, + "080346c4f0ae457182549d3c68aaaaea": { + "model_module": "@jupyter-widgets/output", + "model_module_version": "1.0.0", + "model_name": "OutputModel", + "state": { + "layout": "IPY_MODEL_23d66d78336541bf8b3f863dc3e554d4", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": "Training... 
| Algorithm: AC | Environment: Pendulum-v0\nEpisode: 0/10 | Episode Reward: -1730.5698 | Running Time: 1.6412\n[TL] [*] Saving TL weights into ./model/AC-Pendulum-v0/model_actor\n[TL] [*] Saved\n[TL] [*] Saving TL weights into ./model/AC-Pendulum-v0/model_critic\n[TL] [*] Saved\nEpisode: 1/10 | Episode Reward: -1738.3357 | Running Time: 3.3340\nEpisode: 2/10 | Episode Reward: -1744.1233 | Running Time: 4.9608\nEpisode: 3/10 | Episode Reward: -1854.8743 | Running Time: 6.5518\nEpisode: 4/10 | Episode Reward: -1678.3274 | Running Time: 8.1632\nEpisode: 5/10 | Episode Reward: -1833.9245 | Running Time: 9.7298\nEpisode: 6/10 | Episode Reward: -1805.7677 | Running Time: 11.3628\nEpisode: 7/10 | Episode Reward: -1822.8594 | Running Time: 12.9569\nEpisode: 8/10 | Episode Reward: -1409.2653 | Running Time: 14.5867\nEpisode: 9/10 | Episode Reward: -1752.4231 | Running Time: 16.2574\n[TL] [*] Saving TL weights into ./model/AC-Pendulum-v0/model_actor\n[TL] [*] Saved\n[TL] [*] Saving TL weights into ./model/AC-Pendulum-v0/model_critic\n[TL] [*] Saved\n" + } + ] + } + }, + "081136f1075542a3999ce83eba68fdb5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "VBoxModel", + "state": { + "children": [ + "IPY_MODEL_1ef9aa26484548e99e94bb3d8aae3cce", + "IPY_MODEL_45847f561d154d999d93f170524e2bdf", + "IPY_MODEL_9ce0362f9fac4e45a87ebe7a085a24af" + ], + "layout": "IPY_MODEL_ab2e3b3dc5024debb0c00c3d27d48a8b" + } + }, + "08f5684d8e194916ac04ed379e2bf022": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatTextModel", + "state": { + "description": "Manual input:", + "layout": "IPY_MODEL_019cd764de374cb382236f88a5d204af", + "step": null, + "style": "IPY_MODEL_c4662ffdadef4c7d82aba5ddca1fbfda", + "value": 0.9 + } + }, + "093fd11986764d78ad5dcf1429a496c9": { + "model_module": "@jupyter-widgets/output", + "model_module_version": "1.0.0", + "model_name": "OutputModel", + "state": { + "layout": "IPY_MODEL_0b19536128d34993b9a3354b2a05e2dc", + "msg_id": "8f19b370e7f641249abb608a3c84b213", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": "Training... 
[Machine-generated content from `rlzoo/interactive/main.ipynb` is omitted here. It comprises (1) captured cell output from an example AC run on `Pendulum-v0` (episode rewards between roughly -1400 and -1860 over the first 18 of 100 training episodes, with checkpoints saved under `./model/AC-Pendulum-v0/`) and (2) *ipywidgets* widget-state metadata for the interactive configuration UI: dropdowns for the environment type (atari, classic_control, box2d, mujoco, robotics, dm_control, rlbench) and environment name, read-only labels for the selected networks (Input and Dense layers) and the Adam optimizer parameters (learning_rate, beta_1, beta_2, epsilon, amsgrad), and sliders and text inputs for learning settings such as mode (train/test), train_episodes, test_episodes, max_steps, gamma, and the number of parallel environments.]
"IPY_MODEL_53c0481b6b294cf888f2b3abdc33a95c", + "style": "IPY_MODEL_a8e550f371f94677a29e238776be2cdb", + "value": "Tips:" + } + }, + "6efa143c4b9d43aa94ed8cfe56824583": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "6f0bd8ffadf44461a70b1031b3f65064": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatSliderModel", + "state": { + "continuous_update": false, + "description": "learning curve smooth factor", + "layout": "IPY_MODEL_145001c5826a41cd989997ea61244ca1", + "max": 1, + "step": 0.01, + "style": "IPY_MODEL_4080aa3475b94001b5324fd14d18816c", + "value": 0.8 + } + }, + "6f525160109d45299758550c08196bd9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "description_width": "" + } + }, + "70c300868924433094e74b74d260a4a2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "justify_content": "center" + } + }, + "715b10d741354c8db506fb8ba945a074": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "LabelModel", + "state": { + "layout": "IPY_MODEL_1e327c8e4b844c2fbb017a5544fa678e", + "style": "IPY_MODEL_6ab9513a615a4551a596a3d2e637d181", + "value": "0.0001" + } + }, + "7208b8f21c77462dad67124eb0fd8164": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "731d299fb9dd45c1a41a5d4df4f41f94": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "SliderStyleModel", + "state": { + "description_width": "" + } + }, + "747e88ebfefc4efb95f60f63e725dcc1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "74d03d1491d4451d879384ab357f33a9": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "align_items": "center" + } + }, + "74dc8e60490943c8b9601232bf24f608": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "7532b84aea3a4f4290efa4b0369e846a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "759c11789beb46f798f3b48c4cf88577": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "759fddd650134c46bbbbd4b4c6f8c744": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "75b1aa83fa184214aecc8ea858858cd3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "LabelModel", + "state": { + "layout": "IPY_MODEL_7cdb0eb01b9b434ca4c08fd25f243f09", + "style": "IPY_MODEL_3cfd11894b514078901081bddd35c83d", + "value": "decay" + } + }, + "75c167ca66774581880b2500d5176a36": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "76c7ceb7a42e44048e694b71f27f56eb": { + "model_module": "@jupyter-widgets/base", + 
"model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "76d1b335a0134c19852090005ae135c4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "LabelModel", + "state": { + "layout": "IPY_MODEL_e8260cb1f55049a49bdaf024528d43c4", + "style": "IPY_MODEL_def02ee29d9a44b19a1fd20f8a4be1a0", + "value": "name" + } + }, + "76dec90334724f3ba9e51ba05856ff79": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "776cdbcecc004924a856eb45ec0a5699": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "LabelModel", + "state": { + "layout": "IPY_MODEL_98eeb6cc7ac643ac882d54fab647de04", + "style": "IPY_MODEL_a02320673c484c46848d7aeb6fda6e18", + "value": "1e-07" + } + }, + "78f5897896d144fe839fafd65e76816e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "79611f87c64c431794f17eccbbd60f38": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "grid_area": "widget012" + } + }, + "79953b3e59c048548c96bb197d46a7ea": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "7a4be7c4229640b18c29d60d30cc0e70": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "7a5d99612efa45acb82149814a4a7e82": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "LabelModel", + "state": { + "layout": "IPY_MODEL_7e40917d81264ee9986d07bae8291022", + "style": "IPY_MODEL_1e6d0c80ceaa4e58846e9f554371b363", + "value": "Observation space:" + } + }, + "7a6c0819e1344119aae9ef136830ad44": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "7a7ebee6dcf34f36b1d55d2cb443e387": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "7a807eea55d14bae96d792b1e475adcb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "7aa2babe24dc4fab84bfbd511f0b5e98": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "7aba7921241e41af9a32cbe042699485": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "7af9623e94c64555b01efa581f338e60": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "children": [ + "IPY_MODEL_841b7f5d915e4f639784140b23610d75", + "IPY_MODEL_e904337542fd4e5d8187b9b9190b7522" + ], + "layout": "IPY_MODEL_532ea00fd94045298f69a3917ced39c7" + } + }, + "7b48f1fae96e40519787018ed628b99b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + 
"7cc3bf6293494425b70569d1eca3af03": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "7cdb0eb01b9b434ca4c08fd25f243f09": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "grid_area": "widget011" + } + }, + "7d163d682d5744d6ac7be041fb66c158": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "7d64c7c8f2dc4d4eb6218e55ae44bfbe": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "7d70e416e925499f93e5837aabc6afc2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "7dc1333733194435934e6ca098ede1ad": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "7df23ef826fb4c568071b0667bafcd3b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "LabelModel", + "state": { + "layout": "IPY_MODEL_6db9105409df4485909f169fc6e6d696", + "style": "IPY_MODEL_84111028e0ea4937a6fea8f96b279bec", + "value": "model save path" + } + }, + "7e128d275e3c4e88829167514cec3bc6": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "7e40917d81264ee9986d07bae8291022": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "7f3f44cbaac94755810c0e589d048490": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "7f9233b831cc448a97a909e398122bb9": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "7f94bb571172453a920e7bd6d7a9050f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatSliderModel", + "state": { + "continuous_update": false, + "description": "Slider input:", + "layout": "IPY_MODEL_58201f662dc74741bcdeb0e7753843c4", + "max": 600, + "min": -400, + "readout_format": ".0f", + "style": "IPY_MODEL_b5dd447dec9c48bc8b1bb664c9553912", + "value": 100 + } + }, + "7fbbe1851a944d69a568c06875de2b0f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatTextModel", + "state": { + "description": "Manual input:", + "layout": "IPY_MODEL_12e50eba7f3e4e9f888416f46172b60f", + "step": null, + "style": "IPY_MODEL_18a7121ba72e42af9a496a39fb8c6f6a", + "value": 100 + } + }, + "7fea48aa29c24b4b94784890589e01e4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "7ff9e3e9f09b40d398b6c898e5ee9653": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "808fb0e5d6b940388d588196c927564d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": 
"LayoutModel", + "state": {} + }, + "80d9bf94c37c49708820ccb5a2aa8f8b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "814eef7fa97a4fa2b4c5f1ed1b3728f3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "LabelModel", + "state": { + "layout": "IPY_MODEL_2e8b3025623248e2a92daa5a7750997f", + "style": "IPY_MODEL_bb04f52581bb496e9a6931ce291714c9", + "value": "Dense(n_units=64, relu, in_channels='64', name='hidden_layer2')" + } + }, + "81621cd1e69f47a1868bf499caac5824": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "8173f889450249d58f18acfe83d63ddd": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "8178676fb5e441ec92464938695643a8": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "81a50427a5384feeaaee374a19ad5931": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "grid_template_areas": "\"widget001 widget002\"\n\"widget003 widget004\"\n\"widget005 widget006\"\n\"widget007 widget008\"\n\"widget009 widget010\"\n\"widget011 widget012\"", + "grid_template_columns": "repeat(2, 1fr)", + "grid_template_rows": "repeat(6, 1fr)" + } + }, + "81d1f55272ef4977b06be173bdd59b8c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "81f34a95028440608c8a5a307cd7ee9b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "align_content": "center", + "align_items": "center", + "border": "dotted" + } + }, + "82c3b758724944d0b02d17ecfdd05698": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "831ed45407f74193acc07dacada162a9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "835ef9a1125846679a65d679afb62013": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "8387714984af4e9cbaf16cbff2a45cbb": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "grid_area": "widget001" + } + }, + "83c18b3b4c374f70947e47230ffe4f82": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "LabelModel", + "state": { + "layout": "IPY_MODEL_be4d4fbbc53d4705963f9b343aff399f", + "style": "IPY_MODEL_8efed772f09f4ea1a1dabf91598fd49a", + "value": "Optimizer information:" + } + }, + "84111028e0ea4937a6fea8f96b279bec": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "841b7f5d915e4f639784140b23610d75": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatSliderModel", + "state": { + "continuous_update": false, + "description": 
"Slider input:", + "layout": "IPY_MODEL_0b081708649d446ab37f522f5a019e19", + "readout_format": ".0f", + "style": "IPY_MODEL_12a0f20f2ecd423889594f36b15647f1", + "value": 50 + } + }, + "842ea79123034275adec1df392a4846d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "grid_area": "widget009" + } + }, + "84f7291061b34bfaaaec0711bd0cca56": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "LabelModel", + "state": { + "layout": "IPY_MODEL_ae877e1e2a554a19b78fb9a12f60e5d3", + "style": "IPY_MODEL_1f0e424278554da08fbb15138e571a62", + "value": "The action space is continuous." + } + }, + "85165a2de0d64a2bb9baf9b64b3ffa38": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "LabelModel", + "state": { + "layout": "IPY_MODEL_383cf0cb101341d4bdfb65604a24a4d5", + "style": "IPY_MODEL_23424247d797485dba0788eb6b7614aa", + "value": "model save path" + } + }, + "85514e8a938240e7b2df7c2a8ad6b6e8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "85d35dbed0594a3a837f536309af0b59": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatSliderModel", + "state": { + "continuous_update": false, + "description": "Slider input:", + "layout": "IPY_MODEL_1db128fafd984258b040b5295b477f0d", + "max": 74, + "min": -26, + "readout_format": ".0f", + "style": "IPY_MODEL_066c122ea5f64991b7347279a79e8061", + "value": 24 + } + }, + "86e357397076415ba3ac239b26a8bc8f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "grid_area": "widget006" + } + }, + "8784dbc322c7455aaef2b352bae2f205": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "87b22017505c4d14a335692f09abd816": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "BoxModel", + "state": { + "children": [ + "IPY_MODEL_4a2a0ec5e8f641f489d58e31f3f5fcef" + ], + "layout": "IPY_MODEL_1f37fdacb85646a1b5ff9a2b1d6ab38a" + } + }, + "885608d7df064c51ac0523ef9928e6b6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "LabelModel", + "state": { + "layout": "IPY_MODEL_689e8f05af2f4f159239a896e7e9843a", + "style": "IPY_MODEL_b85dbc19731e4b84bb6122ea52367809", + "value": "Action space:" + } + }, + "8865f419c3a04323907d8e9d11f06c24": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "LabelModel", + "state": { + "layout": "IPY_MODEL_8387714984af4e9cbaf16cbff2a45cbb", + "style": "IPY_MODEL_5daa3bcd6829495cb223328230f0f8e4", + "value": "gamma" + } + }, + "886c73a1052a4a2da9ec06c958855a51": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "88aafdf648784ac7954ce933431f9a3a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "children": [ + "IPY_MODEL_8d80128792d44bf1a0467b7e86df0b54", + "IPY_MODEL_d91d58d65e864faa90c9cc7bfd2959b0" + ], + "layout": "IPY_MODEL_8ff956034aa047d0a8809922cbefa856" + } + }, + "88b977df9d82476298ff3c70d714afe0": { 
+ "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatSliderModel", + "state": { + "continuous_update": false, + "description": "learning curve smooth factor", + "layout": "IPY_MODEL_7f9233b831cc448a97a909e398122bb9", + "max": 1, + "step": 0.01, + "style": "IPY_MODEL_35525c0fbffa497eb43f7d5bd081bb0b", + "value": 0.8 + } + }, + "88fc41c33c024f4eb22b13e0ea98e605": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "grid_area": "widget003" + } + }, + "891909eab8204a4bb78c9a468bc20112": { + "model_module": "@jupyter-widgets/output", + "model_module_version": "1.0.0", + "model_name": "OutputModel", + "state": { + "layout": "IPY_MODEL_e1f175e02edf40f39585c485ec11cbff", + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYkAAAD8CAYAAACCRVh7AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAIABJREFUeJzt3Xl8VOXZ//HPlX0hJBACgbBDAiJCkMiiwqOCilsRKluR2tY+tFar1tpq26e/9nm6qK11a91otaUtKopasKi4F0VLDcgShEzCmgAZEoQwScg61++PDBgwIcssJ8v1fr3mxcx9zpm5MmK+nHPf575FVTHGGGMaE+Z0AcYYY9ovCwljjDFNspAwxhjTJAsJY4wxTbKQMMYY0yQLCWOMMU2ykDDGGNMkCwljjDFNspAwxhjTpAinC/BXr169dPDgwU6XYYwxHcqGDRtKVDWluf06fEgMHjyY7Oxsp8swxpgORUT2tmQ/u9xkjDGmSRYSxhhjmmQhYYwxpkkWEsYYY5pkIWGMMaZJFhLGGGOaZCFhjDGmSRYSxhjTCq9tPciu4jKnywgZCwljjGmhQ55Kblq2kase+YDlH+9DVZ0uKegsJIwxpoVcRfVnEH26R3PXi1v57rOfcKyyxuGqgstCwhhjWijX7QHg+W9N5geXj+C1nCKufPh9Nu474nBlwWMhYYwxLeQq8tAzPoqUhGhuvng4z39rMqow54mPePTdfLzeznf5yULCGGNaKNftIaNPN0QEgPGDevDqbVOYMTqV367JZdHT63Efq3S4ysCykDDGmBZQVfLcHkb0STilPTE2kj8sGMd9Xz6HDXuPcMXD7/PODrdDVQaehYQxxrTA/qPHKa+uIyM14QvbRIR55w3kn9+9kN4J0XzjL9n83yufUlVb50ClgeVXSIjIHBHZJiJeEclq0L5QRDY1eHhFJNO37VciUiAiZae9V7SILBeRfBFZLyKD/anNGGMCyeXrtM7o88WQOGF47wT+cfMFfO38wTy9bjezHv2QnR38ngp/zyRygNnA2oaNqrpMVTNVNRNYBOxW1U2+za8AExp5rxuBI6o6HHgQuM/P2owxJmByfcNfM3o3HRIAMZHh/PxLZ/Onr2ZxsPQ41/z+A17ILuiw91T4FRKqul1Vc5vZbQHwXINj/q2qBxvZbyaw1Pd8BTBNTvQOGWOMw1xuD6ndY0iMi2zR/tNH9eG126Yypn8iP1ixhdue24SnA95TEYo+iXnAsy3YLw0oAFDVWqAUSG5sRxFZLCLZIpJdXFwcsEKNMaYpLren0f6IM0lNjGHZNyfx/UszWL31IFc+8j6bCo4GqcLgaDYkROQtEclp5DGzBcdOBCpUNScg1fqo6hJVzVLVrJSUZtfxNsYYv9R5lbxDZYzo063Vx4aHCd+dls7yxZPweuG6xz/kiX/t7DD3VEQ0t4OqTvfj/efTsrMIgP3AAKBQRCKAROCwH59tjDEBsfdwOdW1XtLP0GndnKzBPXn11inc/dIW7n1tBx/klfDAvLH0TogJYKWBF7TLTSISBsylQX9EM1YBN/ieXwe8ox21p8cY06mcGNl0+j0SrZUYF8ljC8/lntnnkL33M6546H3ezT0UiBKDxt8hsLNEpBCYDKwWkTUNNk8FClR112nH/MZ3TJyIFIrIz32bngKSRSQfuAO425/ajDEmUE6MbEpvw+Wm04kICyYM5JVbLqRXt2i+/ueP+eU/P6W61uv3eweDdPR/rGdlZWl2drbTZRhjOrGbn9nI1sJS1v7w4oC+b2VNHb9avZ2//Xsv56Ql8siCcQzpFR/Qz2iKiGxQ1azm9rM7ro0xphmuIs8Zb6Jrq5jIcH5x7WieXDSefZ9VcPUj7/PSxsKAf44/LCSMMeYMqmrr2F1STkYALjU15fKzU3nttimc3S+RO57fzPeWb6KsqjZon9caFhLGGHMGu0vKqfUqI1p5j0Rr9UuK5dnFk/je9AxWbtrPVY+8z5ZC5++psJAwxpgzcLl903EE4XLT6cLDhNump/Pc4snU1HqZ/diHLFnr7D0VFhLGGHMGriIP4WHC0JTQdCgDTBjSk1dvm8K0s3rz61d38LW/fEyxpypkn9+QhYQxxpxBrtvDkF7xREeEh/Rzk+KieOL68fzy2tGs33WYKx5ey1pX6KchspAwxpgzcPlWo3OCiHD9pEGsuuVCesZH8dWn/8M9r24P6T0VFhLGGNOE49V17PusIiT9EWcyIjWBlTdfyFcmDuTJtbuY88SH7D1cHpLPtpAwxpgm5B8qQ9X/6TgCITYqnF/POofHF57L7pJyrnrkA9bllwT9c5ud4M8YY7qq3BOr0QV5+GtrXHFOX8YMSOL/XtkWkjMcCwljjGmCy+0hKiKMQT3jnC7lFGlJsTy5qNkZNQLCLjcZY0wTcos8DEvpRkR41/1V2XV/cmOMaYbL7WnTQkOdiYWEMcY04lhlDQdLK9tVf4QTLCSMMaYReQFaaKijs5AwxphGnFhoyOl7JJxmIWGMMY1wuT3ERYWTlhTrdCmO8nf50jkisk1EvCKS1aB9oYhsavDwikimiMSJyGoR2eE77t4Gx0SLyHIRyReR9SIy2J/ajDHGH7lFHtL7JBAWJk6X4ih/zyRygNnA2oaNqrpMVTNVNRNYBOxW1U2+zfer6khgHHCBiFzha78ROKKqw4EHgfv8rM0YY9o
s75CNbAI/Q0JVt6tqbjO7LQCe8+1foarv+p5XAxuB/r79ZgJLfc9XANNEpGtHuDHGESVlVZSUVXf5/ggITZ/EPODZ0xtFJAm4Bnjb15QGFACoai1QCiSHoD5jjDmF68TIpi4+/BVaMC2HiLwFpDay6SequrKZYycCFaqac1p7BPXB8Yiq7mpFvSeOXwwsBhg4cGBrDzfGmDNyFfnmbLIzieZDQlWn+/H+82nkLAJYAuSp6kMN2vYDA4BCX4gkAoebqGmJ7z3Iyspybl0/Y0ynlOsuIzE2kt4J0U6X4rigXW4SkTBgLr7+iAbtv6Q+AG4/7ZBVwA2+59cB76iqBYAxJuTy3B5G9EnAukX9HwI7S0QKgcnAahFZ02DzVKCg4eUkEekP/AQYBWz0DY/9pm/zU0CyiOQDdwB3+1ObMca0haqS6/aQkWojm8DPqcJV9WXg5Sa2vQdMOq2tEGg0mlW1EpjjTz3GGOOvomOVeCpru/x0HCfYHdfGGNNArq/TOt1CArCQMMaYU5wY/mojm+pZSBhjTAMudxkpCdH0jI9yupR2wULCGGMacPlGNpl6FhLGGOPj9Sout8cuNTVgIWGMMT4FRyqorPGSYRP7nWQhYYwxPidGNnX1JUsbspAwxhifvEP1q9Gl97YziRMsJIwxxie3yENaUiwJMZFOl9JuWEgYY4yPy+2x6cFPYyFhjDFATZ2XncVlNrLpNBYSxhgD7Ckpp6ZObWTTaSwkjDGG+jutwabjOJ2FhDHGALluD2ECw21k0yksJIwxhvolSwcnxxMTGe50Ke2KhYQxxoBNx9EECwljTJdXWVPHnsPl1mndCAsJY0yXt7O4DK/adByN8XeN6zkisk1EvCKS1aB9oW/96hMPr4hk+ra9LiKbfcc9ISLhvvaeIvKmiOT5/uzh349mjDEtc2KhIZsi/Iv8PZPIAWYDaxs2quoyVc1U1UxgEbBbVTf5Ns9V1bHAaCCFz9e1vht4W1XTgbd9r40xJuhyi8qIDBcG94p3upR2x6+QUNXtqprbzG4LgOcaHHPM9zQCiALU93omsNT3fClwrT+1GWNMS7ncHoaldCMy3K7Any4U38g84NmGDSKyBjgEeIAVvuY+qnrQ97wI6BOC2owxhtwiD+l2qalREc3tICJvAamNbPqJqq5s5tiJQIWq5jRsV9XLRSQGWAZcArx52nYVEaUJIrIYWAwwcODA5n6ERv1zywG2FJaSlhRL/x6xpPWItdkfjemCyqpq2X/0OAsmDHC6lHap2ZBQ1el+vP98TjuLaPC+lSKykvrLTG8CbhHpq6oHRaQv9WcaTdW0BFgCkJWV1WSYnMnWwlL+8uEeqmu9p7R3j4mgf4+4k6HRv4cvRJLq23rERSIibflIY0w7lOfrtLZ7JBrXbEi0lYiEAXOBKQ3augEJviCIAK4C3vdtXgXcANzr+/OMZyn++tGVZ3HXjJGUlFex/8hx9h89TuGR4yef7z1czof5JZRX151yXFxUOGlJ9WceDcMjLSmWAT1i6dUtmrAwCxFjOoqTI5ts+Guj/AoJEZkF/J76UUqrRWSTql7u2zwVKFDVXQ0OiQdWiUg09f0h7wJP+LbdCzwvIjcCe6kPmKAKCxN6J8TQOyGGcQO/OOJWVSk9XkPhEV+AHK0PkcIjFew/epxNBUc5WlFzyjFR4WH0S4qpPxtJij3ljCStRyyp3WOIsM4xY9qN3KIyYiLDGNAjzulS2iW/QkJVXwZebmLbe8Ck09rcwHlN7H8YmOZPPYEmIiTFRZEUF8XotMRG9ymrqvWdfVT4AuQ4hb4weXvHIUrKqk7ZPzxMSO0eU38m4guRoSnxjOqbyLCUeAsQY0LM5faQ3jvBrgA0IWiXm7qKbtERjEhNaPJUtbKmjgNHTz0Tqb+0VcG/dx2m6FglXl+vSlREGCNTExjVtztn9+vOqH7dGZnanfho+89kTLC43B6mpKc4XUa7Zb99giwmMpyhKd0YmtL4nDA1dV52l5Sz7UApnx44xqcHj/H6tiKe+7gAABEYkhzPWf18wdG3Pjx6J8SE8scwplM6Ul7NIU8VI1JtzqamWEg4LDI8jIw+CWT0SWDWuPo2VeVgaSXbDhzzBUcpmwuOsnrLwZPHpSREn3LGMapvdwYnx9spszGt4LKRTc2ykGiHRIR+SbH0S4rl0lGf31NYerzm5NnGpweOse1AKevyS6j1Xa+KiwrnrL6nnnFk9Emw+fGNaYKNbGqehUQHkhgbyeRhyUwelnyyraq2jjx32cnw2HaglJc27uevVXuB+o7y4SndTjnjGNWvO0lxUU79GMa0Gy53GQnREaR2t8u3TbGQ6OCiI8IZnZZ4yugrr1cpOFLR4HLVMdbtLOGlT/af3CctKfaU0Di7X3f62xBA08Xkuj1kpCbYDbJnYCHRCYWFCYOS4xmUHM+V5/Q92V5SVvWFy1VvbXejvtFVt09P5/bpGQ5VbUxoqSout4crRvdtfucuzEKiC+nVLZqpGSlMzfh8uF9FdS07ijw89m4+j723k3nnDaBvYqyDVRoTGsWeKo5W1DDCVqM7I7tzq4uLi4rg3IE9+Nk1Z6OqPPJ2ntMlGRMSuSdGNlmn9RlZSBgABvSMY+HEQTyfXciu4jKnyzEm6Fzu+r/nNvz1zCwkzEk3Xzyc6IgwHnjT5XQpxgSdq8hDcnwUvbpFO11Ku2YhYU5KSYjmGxcM4Z9bDpKzv9TpcowJqly3x84iWsBCwpziv6cOJTE2kvvfaG5VWmM6Lq9XyXN77Ca6FrCQMKdIjI3kpouG8V5uMet3HXa6HGOCYv/R45RX19mZRAtYSJgvuGHyYHonRPObNbmotmnhP2PatbxDJ+ZssuGvzbGQMF8QGxXOrdPS2bD3CO/saHIVWWM6rNyi+pFN6XYm0SwLCdOoeecNYFByHL9dk4vXa2cTpnNxuT30TYwhMTbS6VLaPQsJ06jI8DDuuDSDHUUeXtlywOlyjAmo3CIb2dRSfoWEiMwRkW0i4hWRrAbtC0VkU4OHV0QyTzt2lYjkNHjdU0TeFJE8359fXHTahNQ1Y/oxMjWBB950UVPndbocYwKizqvkF5fZyKYW8vdMIgeYDaxt2Kiqy1Q1U1UzgUXAblXddGK7iMwGTr+t927gbVVNB972vTYOCgsTfnD5CPYermC5b6U8Yzq6vYfLqa71kt7bOq1bwq+QUNXtqtrcgPoFwHMnXohIN+AO4Jen7TcTWOp7vhS41p/aTGBcMrI34wf14JG38zheXed0Ocb4zRYaap1Q9EnMA55t8PoXwO+AitP266OqJ9bnLAL60AQRWSwi2SKSXVxcHNBizalEhLtmjOSQp4qlH+1xuhxj/JZbVIYIDLcziRZpNiRE5C0RyWnkMbMFx04EKlQ1x/c6Eximqi+f6TitH5zf5JAaVV2iqlmqmpWSktLUbiZAJgzpyUUjUnj8vZ2UHq9xuhxj/OJyexjYM464KFspoSWaDQlVna6qoxt5rGzB+8/n1LOIyUCWiOwBPgAyROQ93za3iPQF8P1pA/TbkTsvG0Hp8Rr+uHaX06UY4x
ebs6l1gna5SUTCgLk06I9Q1cdVtZ+qDgYuBFyqepFv8yrgBt/zG4CWhJAJkdFpiVw9pi9Pr9tNsafK6XKMaZOq2jr2lJTbndat4O8Q2FkiUkj9GcJqEVnTYPNUoEBVW/pPz3uBS0UkD5jue23ake9fNoKqWi+PvpvvdCnGtMnuknJqvWpnEq3g10U5X99Co/0LqvoeMOkMx+4BRjd4fRiY5k89JriG9IpnblZ/lq3fy40XDmFAzzinSzKmVXKLbGRTa9kd16ZVbp2Wjojw0Fudd5nTvYfL+dnKHI6UVztdigkwl9tDRJgwtJddbmopCwnTKn0TY7lh8iBe+qTw5HjzzqS8qpZvLs1m6Ud7+fHLW20W3E4mt6iMIb3iiYqwX30tZd+UabWbLhpOfFQE96/pXAsTqSo/XLGFncVlXDWmL6/lFPHyJ/udLssEUN4hG9nUWhYSptV6xkfx31OG8sanbj7Zd8TpcgJmydpdrN56kLtmjOSR+ePIGtSDn63cRuGR0+/7NB1RRXUt+z6rsJBoJQsJ0yY3ThlCcnwUv+0kZxPr8ku47/UdXHVOXxZPHUp4mPDA3Ey8qtz5wmabLr0TyD9UhiqMSLX+iNawkDBt0i06gu9cPJwPdx7mg7wSp8vxS+GRCm55ZiPDUrrxm+vGICIADEyO42fXnM2/d33G0+t2O1yl8deJkU12JtE6FhKmzRZOHEhaUiy/XbOjw3bwVtbU8e2/b6C2Tnly0Xjio08dFT4nqz+XjurDb17PPflLxnRMLreHqIgwBiXHO11Kh2IhYdosJjKc26ans7mwlDXbipwup9VUlf/5Rw45+4/x4LxMhqZ88TKEiHDP7HPoHhvB7cs3UVVrM+F2VC53GcNTuhEeJk6X0qFYSBi/zB6XxrCUeO5/w0VdB7tu//f1+1ixoZBbp6UzfVSTkw7Tq1s0984ew/aDx3jwzc57f0hn53J77Ca6NrCQMH6JCA/jzstGkH+ojJc2FjpdTott2PsZ//fKNi4ekcLt09Kb3X/6qD4smDCAJ9fu5D+7PwtBhSaQSo/XcLC00voj2sBCwvhtxuhUxvRP5KG38jrE5ZhDxyr59t830i8plofmjSOshZcf/ueqUQzoEccdz2/CU2lTpnckeScXGrKRTa1lIWH8JlK/zOn+o8d5Zv0+p8s5o+paL99ZtpGyylqeXDSexLjIFh8bHx3Bg/PGcuDocf7vlU+DWKUJtFy3jWxqKwsJExAXDu/F5KHJ/OGdfMqrap0up0m/XP0p2XuPcN91YxiZ2r3Vx48f1JObLhrGCxsKO2RnfVeV5y4jPiqctKRYp0vpcCwkTECICD+YMYLD5dU8/UH7vKdgxYZC/vrRXv57yhC+NLZfm9/ntmkZnN2vOz96aautrdFB5BZ5SO+TcPIeGNNyFhImYM4d2INLR/Vhydpd7W4G1Zz9pfzk5a1MHprMXTNG+vVeURFhPDQvk7KqWu5+cUuHvUekK3G5PYywS01tYiFhAurOy0ZQVl3L4//a6XQpJ31WXs23/raB5Pgo/vCVcUSE+//XPr1PAnfPGMnbOw7x3McFAajSBEtJWRWHy6vJsOGvbWIhYQJqRGoCszLTWPrhHopKK50uh9o6L7c++wnFZVU8fv14krtFB+y9v3b+YC4Ynswv/vkpe0rKA/a+JrBcJxYasjOJNvF3+dI5IrJNRLwiktWgfaGIbGrw8IpIpm/beyKS22Bbb197tIgsF5F8EVkvIoP9qc0453uXZuBV5eG3nb/x7P43XHyQX8IvZ45m7ICkgL53WJhw/5yxRIQJdzy/ido6b0Df3wSG6+TIJhv+2hb+nknkALOBtQ0bVXWZqmaqaiawCNitqpsa7LLwxHZVPeRruxE4oqrDgQeB+/yszThkQM84FkwYyPPZBex28F/Yr249yBP/2slXJg5k7nkDgvIZfRNj+cW1o9m47yhPtKNLbOZzue4ykuIiSUkI3FlkV+JXSKjqdlVtbq7oBcBzLXi7mcBS3/MVwDSxoQgd1i2XDCcqPIwH3nQ58vl5bg93vrCZcQOT+Nk1o4L6WTMz07hmbD8eeiuPrYWlQf0s03oud/1CQ/brpG1C0ScxD3j2tLY/+y41/bRBEKQBBQCqWguUAskhqM8EQe+EGL5x4WBe2XyAbQdC+4vzWGUNi/+2gbioCB5fOJ7oiPCgf+YvZp5Ncrcobl/+CZU17f+u865CVXEV2cgmfzQbEiLylojkNPKY2YJjJwIVqprToHmhqp4DTPE9FrW2aBFZLCLZIpJdXFzc2sNNiCyeOozE2MiQLnPq9Sp3LN9MwWcVPLbwXFITY0LyuUlxUdw/Zyw7i8u57/UdIflM07yDpZV4qmptZJMfmg0JVZ2uqqMbeaxswfvP57SzCFXd7/vTAzwDTPBt2g8MABCRCCARONxETUtUNUtVs1JSUlpQhnFCYmwk3/6vYbybW8zHe0IzKd4f3s3nre1ufnLVWUwY0jMkn3nClPQUvnb+YP68bk+HX4ipszjZad3bOq3bKmiXm0QkDJhLg/4IEYkQkV6+55HA1dR3fgOsAm7wPb8OeEftLqUO72vnD6Z3QjS/eT34CxO9u+MQD77lYta4NL52/uCgflZT7poxkmEp8dz5wmZKK2wSQKe5bM4mv/k7BHaWiBQCk4HVIrKmweapQIGq7mrQFg2sEZEtwCbqzx7+6Nv2FJAsIvnAHcDd/tRm2ofYqHC+Oy2dj/cc4b3c4F0a3FNSzm3PfcJZqd359axzHOukjI0K58F5mZSUVfHTlTnNH2CCKreojN4J0fSIj3K6lA7L39FNL6tqf1WNVtU+qnp5g23vqeqk0/YvV9XxqjpGVc9W1dtUtc63rVJV56jqcFWdcFq4mA5sXtYABvaM4zdrcvEGYWGiiupavv33DYgITy4aT2xU8Duqz2RM/yRum5bOqs0HWLlpv6O1dHW20JD/7I5rE3RREWHccWkG2w8e459bDwb0vVWVu17cSq7bwyMLxjGgZ1xA37+tbrpoGOMGJvHTf+RwsPS40+V0SV6vknfIY5ea/GQhYULiS2P7MTI1gQfeyKUmgHcmP/XBbl7ZfIA7LxvBf2W0n0EMEeFhPDg3k5o65QcvbAnKGZQ5s4IjFVTWeO1Oaz9ZSJiQCAsT7rxsBHsOV/BCdmCWOf1wZwn3vLaDy8/uw3cuGhaQ9wykwb3i+enVo/ggv4S/frTH6XK6nNwi67QOBAsJEzLTzurNuQOTePhtl983nB04epzvPvMJg5PjuH/O2HZ7N+2CCQO4ZGRv7nltB/mHPE6X06WcGNmUbiHhFwsJEzIiwg9njMR9rIqlH+5p8/tU1tRx0983UFXr5clFWSTEtHwJ0lATEe798jnERYVz+/JNVNfaJIChkusuo3+PWLpFRzhdSodmIWFCatLQZKZmpPD4v3ZyrLJt9xH8fNU2NheW8ru5YxneAW6S6p0Qwz2zzyFn/zF+/47zM+N2FXm20FBAWEiYkPvh5SM4WlHDH9e2fpTzM+v38dzHBdxy8XAuPzs1CNUFx4zRfblufH8efTefDXuPO
F1Op1dT52VncZldagoACwkTcqPTErlqTF+e+mB3q9aI3rjvCD9blcPUjBS+d2lGECsMjp9dM4q+ibHc8fwmyqtqnS6nU9tTUk5NnTIitf2fabZ3FhLGEd+/NIOqWi+Pvpvfov2LPVV85+8bSU2M4ZH5mYSHtc+O6jNJiInkgblj2fdZBb9cvd3pcjq1XJuOI2AsJIwjhqZ0Y874/jyzfh+FRyrOuG9NnZebn9nI0ePVPHl9FklxHXeKhYlDk1k8dSjP/mcfb293O11Op+Uq8hAmMCzFziT8ZSFhHHPb9HQQeOitM3fm/vrV7fxn92fcO3sMo/p1D1F1wXPHpRmMTE3grhe3cris5ZfbTMu53GUM7hVPTKSzU7R0BhYSxjF9E2P56qRBvLSxkDx34/cQ/OOT/fx53R6+fsFgrh2XFuIKgyM6IpyH5mdy7HgNP3ppa9Bnx+2KXG4PGb3tUlMgWEgYR33n4uHERUXwuze+uMzptgOl3P3SFiYM6cmPrzzLgeqCZ2Rqd+68PIM3PnWzYkNg7kA39Spr6thzuNwWGgoQCwnjqJ7xUXxzyhBe31bE5oKjJ9uPVlTz7b9vICk2ike/ci6R4Z3vr+qNFw5l4pCe/O8rn1Lw2Zn7ZUzL5R8qw6vYPRIB0vn+zzMdzjenDKVnfBS/9S1zWudVbn1uE+7SKh6//lxSEqIdrjA4wsOE380dC8D3n99MnU0CGBAnpuOw4a+BYSFhHNctOoLvXDSMD/JL+DC/hAffdLHWVczPv3Q24wb2cLq8oOrfI47//dLZ/GfPZ/zxfVtCJRBc7jKiwsMYlBzvdCmdgoWEaReunzSIfokx3PnCZv7wbj7zzxvAVyYOdLqskJh9bhpXjE7ld2/k8umBY06X0+G53B6GpsR3ykuUTrBv0bQLMZHh3DY9nQOllYztn8jPv3S20yWFjIjwq1nnkBQXxfeWb/J7htyuLrfIFhoKJH/XuJ4jIttExCsiWQ3aF4rIpgYPr4hk+rZFicgSEXGJyA4R+bKvPVpElotIvoisF5HB/tRmOp4vn9ufX80azR9vyOpy49t7xkfxm+vGkOv28MCbXxzpZVrGU1nD/qPHbcnSAPL3TCIHmA2sbdioqstUNVNVM4FFwG5V3eTb/BPgkKpmAKOAf/nabwSOqOpw4EHgPj9rMx1MRHgYCycOondCjNOlOOLiEb25ftJA/vj+Lj7aedjpcjqkvENlgE3HEUh+hYSqblfV3GZ2WwA81+D1N4B7fMd7VbXE1z4TWOp7vgKYJu11JRljguTHV57F4OR47nydNTO5AAAPXklEQVRhc5unUu/KTtyUacNfAycUfRLzgGcBRCTJ1/YLEdkoIi+ISB9fWxpQAKCqtUApkByC+oxpN+KiInhg7liKjlXy81XbnC6nw8ktKiM2Mpz+PWKdLqXTaDYkROQtEclp5DGzBcdOBCpUNcfXFAH0Bz5U1XOBj4D7W1u0iCwWkWwRyS4uLm7t4ca0a+MG9uDmi4fz0sb9vLr1oNPldCgut4f0Pt0I64CzBLdXza7rp6rT/Xj/+fjOInwOAxXAS77XL1DfFwGwHxgAFIpIBJDo27+xmpYASwCysrLsDiTT6Xz3kuG8l3uIu1ZsQYArzunrdEkdQq7bw39lpDhdRqcStMtNIhIGzKVBf4TWz2T2CnCRr2ka8Knv+SrgBt/z64B31GY+M11UZHgYjy08lyEp8dy0bCM/XLHZFipqxpHyaoo9VdYfEWD+DoGdJSKFwGRgtYisabB5KlCgqqffRnoX8HMR2UL9yKfv+9qfApJFJB+4A7jbn9qM6ej694jjxZvO5+aLh/HChkKufOR9PtlnS5825cR0HDaxX2A1e7npTFT1ZeDlJra9B0xqpH0v9QFyenslMMefeozpbCLDw/jB5SP5r4zefG/5Jq574iNuvSSdmy8eRoTdUXyKkyHRx+ZsCiT7W2ZMBzBhSE9evW0KV4/py4NvuZi35N/sO2wzxzaU6/aQEBNBaveueZ9NsFhIGNNBJMZG8vD8cTw8PxNXkYcrH3mfFzcU2qJFPq6iMkb0ScBurwosCwljOpiZmWm8dvsURvXtzvdf2Mwtz35CaUXXvvFOVcl1e6w/IggsJIzpgPr3iOPZxZP4weUjWJNTxIyH1/LhzpLmD+ykij1VlB6vsZFNQWAhYUwHFR4m9Tfdfed8YiLDWfin9dzz2naqa71OlxZyub5O63TrtA44CwljOrgx/ZNYfeuFzD9vIE/+axezHltH/iGP02WFVG6RzdkULBYSxnQCcVER3DP7HJYsGs+Bo8e5+vcf8LeP9nSZTm2X20OvblEkd+ucS906yULCmE7ksrNTWXP7VCYMSeanK7dx49JsSsqqnC4r6HLdZTY9eJBYSBjTyfTuHsNfvnYeP7tmFB/klzDjobW8u+OQ02UFjder5LttNbpgsZAwphMKCxO+fsEQXrnlQnp1i+brf/mY/7cyp1Mujbr/6HHKq+ssJILEQsKYTmxEagL/uPkCbrxwCH/9aC9X//4Dth0odbqsgDoxHceIVBvZFAwWEsZ0cjGR4fz06lH87cYJHDtew7WPruPJf+3E6+0cndqfD3+1M4lgsJAwpouYkp7CmtuncsnI3tzz2g6uf2o9B0uPO12W31xFHvolxtA9JtLpUjolCwljupAe8VE8cf147vvyOWwqOMqMh95n9ZaOvfqdy11m03EEkYWEMV2MiDDvvIGsvnUKg3vFc/MzG/n+85sp64CLGtXWeckvtuGvwWQhYUwXNaRXPCu+PZlbLxnOy58UcuXD77Nhb8da1GjvZxVU13otJILIQsKYLiwyPIw7LhvB8m9NxqvK3Cc/4sE3XdTWdYz5n1w2HUfQWUgYYzhvcP2iRjPH9uPht/OY8+RH7D1c7nRZzcp1exCB4b1t+Guw+LvG9RwR2SYiXhHJatC+UEQ2NXh4RSRTRBJOay8RkYd8x0SLyHIRyReR9SIy2L8fzRjTGt1jInlgXiaPLBhH/qEyrnz4fV7ILmjX8z/lucsY1DOO2Khwp0vptPw9k8gBZgNrGzaq6jJVzVTVTGARsFtVN6mq50S7b9te4CXfYTcCR1R1OPAgcJ+ftRlj2uBLY/vx+u1TGZ2WyA9WbOHmZzZytKLa6bIalev22P0RQeZXSKjqdlXNbWa3BcBzpzeKSAbQG3jf1zQTWOp7vgKYJrYOoTGOSEuK5Zn/nsRdM0byxjY3Vz3yAdsPHnO6rFNU1daxu6Tc+iOCLBR9EvOAZxtpnw8s18/PZdOAAgBVrQVKgeTG3lBEFotItohkFxcXB6FkY0x4mHDTRcN48abzqfV6+fLjH/Lmp26nyzppV3E5dV61eySCrNmQEJG3RCSnkcfMFhw7EahQ1ZxGNs+n8fBolqouUdUsVc1KSUlpy1sYY1po7IAkVt1yIcN7d2Px37J58l8720U/xck5m+xMIqgimttBVaf78f6NBoGIjAUiVHVDg+b9wACgUEQigETgsB+fbYwJkD7dY1i+eDJ3rtjMPa/tIO9QGb+aNZroCOc6jF1uDxFhwpBe8Y7V
0BU0GxJtJSJhwFxgSiObF/DF8FgF3AB8BFwHvKPt4Z8rxhgAYqPC+cOCcQxP6cbDb+ex93A5T1w/3rHV4HKLyhjSK56oCBvJH0z+DoGdJSKFwGRgtYisabB5KlCgqrsaOXQuXwyJp4BkEckH7gDu9qc2Y0zgiQjfuzSD3y8Yx5bCUmY+uu7k+tKh5nJ7rD8iBPwd3fSyqvZX1WhV7aOqlzfY9p6qTmriuKGquuO0tkpVnaOqw1V1QhPhYoxpB64Z24/nvzWZ6lovsx9bxzs7QtuhXVFdy77PKqw/IgTsPM0Y0yZjBySx8pYLGNwrnhuXZvOn93eFrEM7z10GYHM2hYCFhDGmzfomxvLCtycz4+xUfrl6O3e/uJXq2uDP+/T5anQWEsFmIWGM8UtcVASPfuVcvnvJcJZnF3D9U+v5rDy4d2i73B6iI8IY2DMuqJ9jLCSMMQEQFiZ8/7IRPDw/k00FR7n20XXkuYPXoZ3rLmN4726Eh9mkDMFmIWGMCZiZmWksXzyJiuo6Zj/2Ie/lHgrK57iKPNZpHSIWEsaYgBo3sAcrb7mA/j3j+MZfPubP63YHtEO79HgNRccqbfhriFhIGGMCLi0plhXfnsz0s/rwv698yo9fzqEmQAsZ5dl0HCFlIWGMCYr46AieuH4837loGM/+Zx9ffeo/AZlyPNcXEul9bKGhULCQMMYETViY8MMZI3lg7lg27D3CtY+uI/9QmV/v6SryEB8VTlpSbICqNGdiIWGMCbrZ5/bn2cUT8VTWMuuxdbyf1/Yp/nN903HYcjOhYSFhjAmJ8YN6svKWC0hLiuVrf/6Yv360p03vk+cus/6IELKQMMaETP8ecay46XwuHpHC/1u5jZ/+o3Ud2iVlVRwur7bpOELIQsIYE1LdoiN4clEW35o6lL/9ey9f//PHlFbUtOhYl2/GWQuJ0LGQMMaEXHiY8KMrz+I3141h/e7DzHpsHbuKm+/QPjGyKSPVRjaFioWEMcYxc7MGsOybkzh6vIZrH13HuvySM+7vcnvoERdJikMLHXVFFhLGGEdNGNKTlTdfQGpiDF99+j8sW7+3yX1d7jIy+tjIplCykDDGOG5AzzhevOl8pqb34icv5/DzVduoPa1DW1Xr52yy6ThCyt/lS+eIyDYR8YpIVoP2hSKyqcHDKyKZvm0LRGSriGwRkddFpJevvaeIvCkieb4/e/j3oxljOpKEmEj+dMN5fPPCIfzlwz18/S8fU3r88w7tg6WVeKpqSbdO65Dy90wiB5gNrG3YqKrLVDVTVTOBRcBuVd0kIhHAw8DFqjoG2ALc4jvsbuBtVU0H3sbWuDamywkPE/7n6lHcO/scPtp5mNmPrWNPSTnweae13SMRWv6ucb1dVXOb2W0B8Jzvufge8VJ/UbE7cMC3bSaw1Pd8KXCtP7UZYzqu+RMG8rcbJ3K4vJprH1vHRzsPNxj+aiObQikUfRLzgGcBVLUGuAnYSn04jAKe8u3XR1UP+p4XAX1CUJsxpp2aPCyZlTdfQHJ8FIueWs9zHxfQp3s0SXFRTpfWpTQbEiLylojkNPKY2YJjJwIVqprjex1JfUiMA/pRf7npR6cfp/WTzzc5Ab2ILBaRbBHJLi5u+xwwxpj2bVByPC/ffAHnD+/F7pJyu4nOARHN7aCq0/14//n4ziJ8Mn3vuRNARJ7n874Ht4j0VdWDItIXaHJJK1VdAiwByMrKCtxqJsaYdqd7TCRP35DF0+t2Mzot0elyupygXW4SkTBgLp/3RwDsB0aJSIrv9aXAdt/zVcANvuc3ACuDVZsxpmOJCA9j8dRhnD+sl9OldDn+DoGdJSKFwGRgtYisabB5KlCgqrtONKjqAeB/gbUisoX6M4tf+zbfC1wqInnAdN9rY4wxDpJArj3rhKysLM3Ozna6DGOM6VBEZIOqZjW3n91xbYwxpkkWEsYYY5pkIWGMMaZJFhLGGGOaZCFhjDGmSRYSxhhjmtThh8CKSDHQ9ColZ9YLOPNSWF2LfR+fs+/iVPZ9nKozfB+DVDWluZ06fEj4Q0SyWzJOuKuw7+Nz9l2cyr6PU3Wl78MuNxljjGmShYQxxpgmdfWQWOJ0Ae2MfR+fs+/iVPZ9nKrLfB9duk/CGGPMmXX1MwljjDFn0GVDQkRmiEiuiOSLyN3NH9E5icgAEXlXRD4VkW0icpvTNbUHIhIuIp+IyD+drsVpIpIkIitEZIeIbBeRyU7X5BQR+Z7v/5McEXlWRGKcrinYumRIiEg48ChwBfXrbC8QkVHOVuWYWuD7qjoKmATc3IW/i4Zu4/MFsbq6h4HXVXUkMJYu+r2ISBpwK5ClqqOBcOpX3+zUumRIABOAfFXdparV1K+e1+ya3Z2Rqh5U1Y2+5x7qfwGkOVuVs0SkP3AV8Cena3GaiCRSv4DYUwCqWq2qR52tylERQKyIRABxwAGH6wm6rhoSaUBBg9eFdPFfjAAiMhgYB6x3thLHPQT8EPA6XUg7MAQoBv7su/z2JxGJd7ooJ6jqfuB+YB9wEChV1TecrSr4umpImNOISDfgReB2VT3mdD1OEZGrgUOqusHpWtqJCOBc4HFVHQeUA12yD09EelB/xWEI0A+IF5Hrna0q+LpqSOwHBjR43d/X1iWJSCT1AbFMVV9yuh6HXQB8SUT2UH8Z8hIR+buzJTmqEChU1RNnlyuoD42uaDqwW1WLVbUGeAk43+Gagq6rhsTHQLqIDBGRKOo7n1Y5XJMjRESov968XVUfcLoep6nqj1S1v6oOpv7vxTuq2un/tdgUVS0CCkRkhK9pGvCpgyU5aR8wSUTifP/fTKMLdOJHOF2AE1S1VkRuAdZQP0LhaVXd5nBZTrkAWARsFZFNvrYfq+qrDtZk2pfvAst8/6DaBXzd4XocoarrRWQFsJH6UYGf0AXuvLY7ro0xxjSpq15uMsYY0wIWEsYYY5pkIWGMMaZJFhLGGGOaZCFhjDGmSRYSxhhjmmQhYYwxpkkWEsYYY5r0/wGKtE2Kfdh95gAAAABJRU5ErkJggg==\n", + "text/plain": "
" + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ] + } + }, + "891e2bdcc12d4314affa4fd372ed7ade": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "89880b2c3e03469da53b8a7e9e2e930b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "8991ca296f464086aab8e12cc644430c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "grid_area": "widget011" + } + }, + "89ae5379ee8b4e2d92f116a018b9420e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "children": [ + "IPY_MODEL_c2eca071d21942c98a47aaf881130883", + "IPY_MODEL_a6a4d48baea44d659e3b2dd7e54fcd17" + ], + "layout": "IPY_MODEL_3044da8a1f89485398f1ea9d4965bc55" + } + }, + "8ae2c037e98f420486a61a8570daf106": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "8b14eeb5b78e4e4cb98441ffaeccf4fb": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "grid_area": "widget002" + } + }, + "8c168f5c8ecc4d0ba203b60193856d1c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "LabelModel", + "state": { + "layout": "IPY_MODEL_b9ad33908a4f4a6ba687c820c123c37a", + "style": "IPY_MODEL_094d34956035446984a6cb8a6efc22a7", + "value": "1e-07" + } + }, + "8c27b4b759354d64b25bcb3462c444ef": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DropdownModel", + "state": { + "_options_labels": [ + "AC", + "DDPG", + "PG", + "PPO", + "SAC", + "TD3", + "TRPO" + ], + "description": "Algorithms:", + "index": 0, + "layout": "IPY_MODEL_b5ac8df291f9438bacc64a6cb2805620", + "style": "IPY_MODEL_45850b0512424834a6d4c70e60892ae8" + } + }, + "8c59866961674911b2157bded443e366": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "VBoxModel", + "state": { + "children": [ + "IPY_MODEL_fc69d16aa7e547b09859e2ca7dbfbde8", + "IPY_MODEL_6caef128e4df40ebb76ef90ad9a40d41" + ], + "layout": "IPY_MODEL_00663174be1342fbbd29bc99cdd6d3aa" + } + }, + "8ca1f8992583484a8a0ff2f7f46afee2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "LabelModel", + "state": { + "layout": "IPY_MODEL_76c7ceb7a42e44048e694b71f27f56eb", + "style": "IPY_MODEL_97b119b9f8fc4a5f80b7f35b2fbc20dd", + "value": "Input(shape=(None, 3), name='input_layer')" + } + }, + "8d025735275c4dfdbbbf2d491e727c08": { + "model_module": "@jupyter-widgets/output", + "model_module_version": "1.0.0", + "model_name": "OutputModel", + "state": { + "layout": "IPY_MODEL_ce5b912531614dfe90ee3e20fa7ba467", + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYoAAAD8CAYAAABpcuN4AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAIABJREFUeJzt3Xd81fW9+PHXO5uEkJCQhAz23hAiiANBUXEiWHGPum613ttbtdba3229HVZba/WqtUWte1WFgsUFCOICDSHskYCMnJMJWWQn5/P743yjB8g44ZycQd7Px+M88j3f+eabw3nn+5lijEEppZRqT4i/A1BKKRXYNFEopZTqkCYKpZRSHdJEoZRSqkOaKJRSSnVIE4VSSqkOaaJQSinVIU0USimlOqSJQimlVIfCPDlYRK4AHgTGANOMMdku2yYCfwf6AA7gFGNMvYhcDTwAGMAOXGeMKRORBOAtYDCwD1hojCnvLIZ+/fqZwYMHe/LPUEqpHmfDhg1lxpgkd/YVT4bwEJExOJPA34F7WxOFiIQBOcD1xphNIpIIVACCMzmMtZLDH4FaY8yD1vJhY8zDInI/0NcY8/POYsjKyjLZ2dmd7aaUUsqFiGwwxmS5s69HRU/GmB3GmF1tbDoP2GyM2WTtd8gY04IzUQgQIyKC82nDbh0zD3jJWn4JuMyT2JRSSnlHd9VRjASMiHwkIjkich+AMaYJuAPYgvVkATxvHZNijCm0louAlG6KTSmlVBd0mihEZKWIbG3jNa+Dw8KAM4BrrZ/zReQcEQnHmSimAGnAZuAXxx5snOVh7ZaJicjtIpItItmlpaWd/ROUUkp5oNPKbGPMnBM4bwGw1hhTBiAi7wOZQJV1zj3W+n8C91vHFItIqjGmUERSgZIOYloELAJnHcUJxKeUUspN3VX09BEwQUSirYrts4DtgA0YKyKtNe3nAjus5WXAjdbyjcDSbopNKaVUF3jaPHY+8CSQBCwXkVxjzPnGmHIReQz4BmcR0vvGmOXWMf8LrBWRJmA/cJN1uoeBf4rILdb6hZ7EppRSyjs8ah4bCLR5rFJKdZ3PmscqpZTyvZqGZn777+3sK6vxyfU0USilVJBZuaOY5z//ltIjDT65niYKpZQKMktz7aTH92LqwL4+uZ4mCqWUCiKHaxpZu7uUiyelEhIiPrmmJgqllAoi728ppNlhmDcp3WfX1EShlFJBZFmunRHJvRmTGuuza2qiUEqpIGGrqOPrfYeZNzkN57iqvqGJQimlgsR7m5yDbV/qw2In0EShlFJBY2munSkD4xmYGO3T62qiUEqpIJBXXM2OwirmTUrz+bU1USilVBBYtslOiMBFEzVRKKWUOoYxhqW5dk4f3o+k2EifX18ThVJKBbjcgxUcOFzLpX4odgJNFEopFfCW5tqJCAvh/PH9/XJ9TRRKKRXAmlsc/HtzIeeMTqZPVLhfYtBEoZRSAeyrvYcoO9LAvMn+KXYCTRRKKRXQlubaiY0MY9aoZL/FoIlCqSBXXd/EVlulv8NQ3aC+qYUPtxYxd3x/osJD/RaHJgqlgtyvl27j0qc+Z93eQ/4ORXnZ6p0lHGloZt5k3w7ZcSxNFEoFsSMNzby/tRCHgf96YyNlPprxTPnGsk12+vWOZMawRL/GoYlCqSD2/pZC6pscPDR/AhV1Tfz0rVwcDuPvsJQXVNU3sWpnCZdMSiXURxMUtUcThVJBbHFOAUP7xXD1tAH8+pKxfJZXxjOf7vF3WMoLPtpaRGOzw2+d7FxpolAqSB08XMu6vYdZkJmOiHDNtIFcPDGVx1bs5utvD/s7POWhZZvsDEyIZvKAeH+HoolCqWD1r402AC6b4qzoFBH+sGACA/r24r/e2MjhmkZ/hqc8UFJdzxf5ZT6foKg9miiUCkLGGBZvtDFjaCIZfb+fmyA2KpynrsnkcE0jd/9T6yuC1fLNzgYK/uxk50oThVJBKOdABd+W1bAg8/hmk+PT4/ifi8ewZlcpiz7b64folKeW5toZm9qH4cm+mxe7I5oolApCi3MK6BUeygUTUtvcft2pg7hwQn/+9NEuNuzX+opgsv9QDbkHKwLmaQI0USgVdOqbWnhvk5254/vTOzKszX1EhIcvn0h6fC/+8/WNlGt9RdBYluucF/uSAGjt1EoThVJB5pOdJVTVN3N5ZkaH+/WJCufpazIpO9LIvW9vwhitrwh0xhiWbrIzbUgCafG9/B3OdzRRKBVk3t1QQP8+UW711p2QEccDF45m1c4SnvvsWx9Epzyxo7Ca/JIjAdF3wpUmCqWCSNmRBtbsLmV+ZrrbvXVvPG0wc8f155EPd5JzoLybI1SeWLrJRliIcGE7dU/+oolCqSCyNNdOi8OwYIr7g8SJCI/8YCL946L4z9c3Ulnb1I0RqhPlcBjey7Uzc2QSCTER/g7nKB4lChG5QkS2iYhDRLJc1l8rIrkuL4eITLa2TRWRLSKSLyL/J1ZvEhFJEJEVIpJn/ezr2T9NqZPP4pwCJmbEMSKla80m43o5+1eUVNdz7ztaXxGIsveXY6+sD6jWTq08faLYCiwA1rquNMa8ZoyZbIyZDFwPfGuMybU2PwPcBoywXnOt9fcDq4wxI4BV1nullGVnURXb7FWdVmK3Z/KAeH4+dzQrthfzwhf7vBuc8tjSXBu9wkOZMybF36Ecx6NEYYzZYYzZ1cluVwNvAohIKtDHGLPOOP+keRm4zNpvHvCStfySy3qlFLA4x0Z4qHjUbPKWM4YwZ0wKf/hgB5sOVngxOuWJxmYHy7cUcu7YFGLaafLsT76oo7gSeMNaTgcKXLYVWOsAUowxhdZyERB4aVUpP2lucbBko43Zo5I9Kr8WER69YiLJsVH8+PUcKutO3voKYwxrd5fyo1c2sGyT3d/hdOjz/FIqapsCstgJoNPUJSIrgf5tbPqlMWZpJ8dOB2qNMVu7EpQxxohIu4WoInI7cDvAwIEDu3JqpYLS5/lllFY3sOAEi51cxUdH8OQ1U1j4t6/4+Tubeea6zIAYeM5bGpsdLNtk57nP9rKzqJoQgU93lzIxPY7B/WL8HV6blubaiY8O58wRSf4OpU2dPlEYY+YYY8a38eowSViu4vunCQAb4PpJz7DWARRbRVOtRVQlHcS0yBiTZYzJSkoKzBurlDctzrERHx3O2aOTvXK+zIF9uW/uKD7cVsTLX+33yjn9raK2kadX53PGI59YHQzhTz+YyJp7ZxMeKtzz9iZaAnCQxNrGZlZsL+bCCalEhAVmQ9RuKwwTkRBgIXBm6zpjTKGIVInIqcB64AbgSWvzMuBG4GHrpzuJSKmTXlV9Ex9tK+LKUwZ49Yvk1jOGsm7vYX6/fAeZA/syISPOa+f2pf2HavjH59/yz+wC6ppaOHNEP/50xSRmjuj33ZPSby8bz0/ezGXR2r3cMWuYnyM+2sodJdQ2tgRcJztXnjaPnS8iBcAMYLmIfOSyeSZw0Bhz7PCVdwLPAfnAHuADa/3DwLkikgfMsd4r1eN9sKWQhmaHV4qdXIWECH++YhKJvS
P48es5VNUHV33Fhv3l3PHqBmY/uobXvz7AhRNS+eAnZ/LKLdM5a2TSUcVpl05K48IJ/XlsxS52FFb5MerjLcu10b9PFNMGJ/g7lHZJsLenzsrKMtnZ2f4OQ6lus/BvX3GopoGVd5/VLXUJG/YfZuHf1zF3XH+eumZKQNdXtDgMH28r4tnP9pJzoIK4XuFcO30gN542mJQ+UR0ee7imkfP+spak2EiW/vj0gCjmKa9p5JTfr+TmM4bwwIVjfHptEdlgjMnqfE/tma1UQDtwqJav9x1mQWZGt32BTx2UwL3njWL5lkJeXX+gW67hqdrGZl76ch+zH13DHa/lUHakkf+9dBxf3n82980d3WmSAEiIieDhBRPYUVjFE6t2+yDqzn2wtYhmhwnoYifoxjoKpZTnFm8sQATmd2HIjhPxHzOH8tXeQzy0fAfnjU1x64vXF0qq6nnxy328tv4AlXVNZA6M5xcXjOa8cf3dHuvK1ZyxKSzMyuCZNXs4e3QKUwf5dwCIpbk2hiXFMC6tj1/j6Iw+USgVoIwxLM6xcdqwxG4fcjokRPjdvPG0OAx//LCzPrS+8ehHuzj9kU/426d7OG1YIu/ecRqL7zydCyaknlCSaPU/F48lNa4X9769idrGZi9G3DX2ijq+3neYeZPTA7q4DzRRKBWwNuwv58Dh2hMesqOrBiZG88MzBvNuTgFbCip9cs32fLmnjKdW53P+uP6svncWz1w31Wt//cdGhfPoFZP4tqyGhz/Y6ZVznoh/b7ZjDAFf7ASaKJQKWO/mFBAdEcr549rq79o97po9nMSYCH7z721+GziwqcXBr5ZuY0BCLx69YhKDEr3fSW7GsERuOWMIL3+1n8/ySr1+fncszbUzaUB8wHYCdKWJQqkAVN/Uwr83FzJ3fH+fjv0TGxXOPeeN4pt95Xywtchn13X1whffkl9yhAcvGUdUeGi3Xedn549ieHJvfvb2Zp8PZZJfcoRt9irmBcHTBGiiUKrLSqrq+eOHO6nuxn4HK7YXU13fzA98VOzk6spTBjC6fywPvb+D+qYWn167qLKeJ1bmcc7oZM7p5lFUo8JDeWzhJEqPNPC/y7Z167WOtWyTnRCBiycG1gRF7dFEoVQXfby9mL+u2cMtL2ZT19g9X6SLcwpIi4vi1KGdT3fqbaEhwv9cPJaC8jqfD0f++/d30OQw/PqScT653sSMeO6aPZzFG218uLWw8wO8wBjDslwbM4Ylkhwgrcs6o4lCqS6yV9QhAtn7D3P7K9k0NHs3WZRU17M2r4z5memEeNC6xxOnD+/HnDHJPL06n9LqBp9c88s9Zby3yc4dZw1jYGK0T64JcNfZw5mQHscDS7b65N+6uaCSfYdqmTepe5s8e5MmCqW6yFZRR0bfXjy8YCKf5ZVx1+sbaWpxeO38y1qnO/VDsZOrBy4cQ31TC4+t6P7msk0tDn5tVWD7eiym8NAQHls4iSMNzTywZEu3V+IvzbUTERrC+eN910jBU5oolOoie0UdaXG9WHjKAB68ZCwrthdzrxdHJn1nQwGTB8QzLKm3V853ooYm9eaGGYN585uDbLd37/hIL36xjzwfVGC3Z0RKLPedP4oV24t5Z0NB5wecoIraRt7bbGf26CTieoV323W8TROFUl1kr6gnva+zA9xNpw/hvrmjWJpr55de+Gt0u72KnUXVXJ4ZGMUSPzlnBHG9wvntv7d321/aRZX1PL5yt08qsDty8+lDmD4kgd+8t52C8lqvnru+qYVFa/cw84+rKTvSwLXTB3n1/N1NE4VSXdDc4qCoqp50l57Sd84azl2zh/PmNwf5jYdfqO/mFHg83ak3xUWH89M5I/lq7yFWbC/ulmv4ugK7PSEhwqNXTMJhDD97ezMOLzwhOhyGpbk2zvnzpzz0/k4yB/Xlg5+cycyRwTWPjiYKpbqguLqBFoc5bkiNe84byQ9PH8wLX+zjsRUnNuBcc4vD+aUyOoX46BOf7tTbrpk+kOHJvXno/R00NnuvLgb8V4HdngEJ0fzqkrF8tfcQL321z6NzfbXnEPOe/oKfvJlLXK9wXrt1Oi/+cBqj+wf2uE5t0UShVBfYyusAjnqiAOdc1L+6eCxXnTKAJz/J569r8rt87s/yyig70siCACl2ahUeGsIvLxrDvkO1vPzVPq+d158V2B1ZmDWAs0cn8/AHO8kvOdLl4/OKq7nlxW+4+tl1HDrSwGMLJ/Hv/zyD04f364ZofUMThVJdYK9wJoq2BukTEX4/fwKXTkrjjx/u4qUv93Xp3O/kFJAQE8GsUd6Z7tSbZo9KZubIJJ5YlcfhmkavnLO1AvvXF/unArs9IsLDl08gOiKUe97eRLObLdpKqur5xeLNnP/4Wr7+9jA/nzuaT+6dxYLMDL81c/YWTRRKdYGtou0nilahIcKfF07i3LEp/HrZNv6ZfdCt81bWNbFiezGXTkoLiAl12vL/LhpDbWMLj6/0fC6H4qrvK7DnjPVfBXZ7kmOj+N1lE9h0sIJn1uzpcN+ahmb+smI3sx5dwzsbCrjxtMF8et9s7pg1LKASoCcC8xOpVICyVdSREBNBr4j2vwDCQ0N46popnDmiH/e/u5n3Ntk7Pe/yzYU0Njt8NlLsiRiZEss10wby2voD5BVXe3Su3y8PjArsjlw0MZVLJ6XxxKo8ttqOH023ucXB6+sPMOvRNTyxKo/Zo5JZefdZ/PqScSTEBE4dkzdoolCqC+wVdaTFdz7sQmRYKIuuzyJrUAI/fSuXlZ20GFqcU8CI5N6MTw/sis6fnjuS6IhQfrd8xwmf48s9ZSwLoArsjvxm3jgSe0dw9z9zvxv3yhjDqh3FzH3iMx5YsoVBCdEsvvM0nr42s1tGug0EmiiU6gJbeV27xU7H6hURyvM3ZTE2rQ93vp7D53llbe63r6yG7P3lXD61+6Y79ZaEmAh+cs4IPt1dyupdJV0+PlArsNsTHx3BI5dPZHfxER5bsZvNBRVc/ew6bnkpmxaH4W/XTeXtH80gc6B/Z8rrbpoolHKTMcZ6onB/trnYqHBevnkaQ/vFcNvL2WTvO3zcPos32hCByyYHVmun9twwYzCDE6OdxUddHLokUCuwOzJrVDLXTh/Is5/t5dKnviCv+Ai/mTeOj386k7nj+wd8cvcGTRRKuamqrpmaxha3nyhaxUdH8Mot00mNi+KHL3xz1OxxDodhcU4BZwzvR/+44BhJNCIshAcuHEN+yRFeX3/A7eMCvQK7Iw9cOIazRyXz49nDWPOzWdwwYzDhoT3n67Pn/EuV8lBBhXNYh64mCoCk2EhevXU6fXqFc/0/1rOryFkZ/M2+wxSU1wV0JXZbzh2bwmnDEvnLyt1U1ro3L0cwVGC3JyYyjOdvOoWfnT+a2KjgGaPJWzRRKOUme0U90HYfCnekxffi9dumExEawnXPr+fbshoW59iIiQjlvHHB9Re2iPD/LhpLZV0TT6zK63T/YKrAVsfTRKGUm1o727UOCHgiBiXG8Nqt02lxGK59dh3LtxRy4YRUoiN8N92pt4xN68OVWQN4+at97C1tvwdzs
FVgq+NpolDKTbaKOiLCQkj0sI38iJRYXr55GtUNzRxpaPb7vBOeuOe8UUSFh/LQ++03lw3GCmx1NE0USrnJVuFsGuuNVi7j0+N4/dZTufe8kUwfkuCF6PwjKTaSO2cPY+WOkjab/7ZWYJ8dhBXY6nuaKJRyU1f6ULhjQkYcd509IujHAbr59CFk9O3F75ZvP27ypu8rsMf6KTrlDZoolHKTu72ye5qo8FB+ccEYdhZV89Y3349t5VqBfbL2WO4pNFEo5YaG5hZKqhtIj9cWO225cEJ/Thnclz9/vIuq+iatwD7JaKJQyg1Fla1NY/WJoi0iwv9cPJZDNY08vTpfK7BPMsHXJk8pP+hseHEFEzPiuTwzgxc+30d4qGgF9knEoycKEblCRLaJiENEslzWXysiuS4vh4hMFpFoEVkuIjut4x52OSZSRN4SkXwRWS8igz2JTSlv+m5mOw/6UPQE980dRWiIaAX2ScbTJ4qtwALg764rjTGvAa8BiMgE4F/GmFwRiQYeNcasFpEIYJWIXGCM+QC4BSg3xgwXkauAR4ArPYxPKa9o7ZUdLOMx+UtKnyj+el0mLS1GK7BPIh4lCmPMDqCzduVXA29a+9cCq63lRhHJAVp7G80DHrSW3wGeEhExxhzd3k4pP7BX1JEcG0lkmJa3d2Z2AE7lqjzji8rsK4E3jl0pIvHAJcAqa1U6cBDAGNMMVAKJbZ1QRG4XkWwRyS4tLe2WoJVyZevi8OJKnUw6TRQislJEtrbxmufGsdOBWmPM1mPWh+FMHv9njNnb1aCNMYuMMVnGmKykpKSuHq5Ul9krvNvZTqlg0mnRkzFmjgfnv4o2niaARUCeMeZxl3U2YABQYCWSOOCQB9dWyiuMMdgq6rQFj+qxuq3oSURCgIVY9RMu63+HMwn89zGHLANutJZ/AHyi9RMqEByqaaSh2UGaVmSrHsrT5rHzRaQAmAEsF5GPXDbPBA66Fi2JSAbwS2AskGM1nb3V2vw8kCgi+cDdwP2exKaUt3w/vLj2ylY9k6etnpYAS9rZtgY49Zh1BUCbTaSMMfXAFZ7Eo1R3aO1Dob2yVU+lQ3go1Qntla16Ok0USnXCXlFPTEQocb163lzJSoEmCqU6ZauoJc1LExYpFYw0USjVCXtFvY7xpHo0TRRKdUJ7ZaueThOFUh2oa2zhcE2jVmSrHk0ThVIdsFdqiyelNFEo1YHv+1BoolA9lyYKpTrwfa9sTRSq59JEoVQHbBV1hAikxEb6OxSl/EYThVIdsFXU0b9PFGGh+l9F9Vz66VeqA/aKOi12Uj2eJgqlOqB9KJTSRKFUu1ochqLKek0UqsfrsYmi7EgD1fVN/g5DBbDS6gaaWoz2oVA9nkfzUQSzpz7J58Uv95Ee34vR/WMZZb1G9+/D0KQYwrXyssfT4cWVcuqxieKSSWkkxUayq6iaXUXVfLq7lGaHc+bV8FBhWFLv7xJHayJJjYvSEUR7EO1DoZRTj00UUwf1Zeqgvt+9b2x2sKf0CLuKqtlZVM2uoiq++fYwS3Pt3+3TJyrsuyePUS4JpE+UzlNwMmp9okjVubJVD9djE8WxIsJCGJPahzGpfY5aX1nbxK5iZ+LYaT19LN1op7rhwHf7DOkXwxnD+3HmiH7MGJZIrCaOk4K9oo4+UWH6+1Q9niaKTsRFhzNtSALThiR8t84Yg72ynp2FzuSxYX8572wo4JV1+wkNEaYMiOfMEUmcObIfE9PjtLNWkLKV15HeN9rfYSjld5ooToCIkB7fi/T4XpwzJgWAhuYWcvZX8Hl+KZ/llfH4qt38ZeVu+kSFcdqwfpw5sh8zRyQxIEG/eIKFraKODK2fUEoThbdEhoUyY1giM4Yl8rPzobymkS/2lPHZ7jI+yyvlw21FAAxKjObMEf04c0QSM4Ylav1GALNX1DHd5UlSqZ5KE0U36RsTwcUT07h4YhrGGPaW1fDZ7lI+zy9jSY6NV9cdIDREmDwg/rvEMXlAPKEh2qoqEFTXN1FV36yd7ZRCE4VPiDib2w5L6s1Npw+hsdnBxgPlfJ5fxtq8Mp5YlcfjK/O4bHIaj181xd/hKpzzZIPOQ6EUaKLwi4iwEKYPTWT60ETuOW8UFbWN/OH9nbyTU8AvLxpLkg5p7Xfah0Kp72lznAAQHx3BbTOH0OIwLNtk7/wA1e0KtFe2Ut/RRBEghifHMjEjjsU5Bf4OReF8oggPFZJ669OdUpooAsiCKelss1exq6ja36H0eLbyOlLjehGijQuU0kQRSC6ZlEZYiLB4oz5V+Ju9oo60eB26QynQRBFQEntHMmtUEv/aaKPFGqBQ+Ye9oo70eO0cqRRoogg4CzIzKK5q4Ms9Zf4OpcdqanFQVFVPuj5RKAV4mChE5AoR2SYiDhHJcll/rYjkurwcIjL5mGOXichWl/cJIrJCRPKsn33pgc4enUyfqDCW5Nj8HUqPVVxVj8No01ilWnn6RLEVWACsdV1pjHnNGDPZGDMZuB741hiT27pdRBYAR4451/3AKmPMCGCV9b7HiQoP5aKJaXywtYiahmZ/h9Mj2cqdTWO1s51STh4lCmPMDmPMrk52uxp4s/WNiPQG7gZ+d8x+84CXrOWXgMs8iS2YXZ6ZTl1TCx9uLfJ3KD2SvVIThVKufFFHcSXwhsv73wJ/BmqP2S/FGFNoLRcBKT6ILSBNHdSXgQnR2vrJT1qH79DOdko5dZooRGSliGxt4zXPjWOnA7XGmK3W+8nAMGPMko6OM8YYoN1mPyJyu4hki0h2aWlpZ2EEHRFhQWY6X+45RKH1163ynYLyOhJjIogKD/V3KEoFhE4ThTFmjjFmfBuvpW6c/yqOfpqYAWSJyD7gc2CkiKyxthWLSCqA9bOkg5gWGWOyjDFZSUlJboQRfOZPSccY+NdGHdLD1+wVdVqRrZSLbit6EpEQYCEu9RPGmGeMMWnGmMHAGcBuY8wsa/My4EZr+UbAnUR00hqUGEPWoL4szinA+YClfMVWUUdanCYKpVp52jx2vogU4HxSWC4iH7lsngkcNMbsdfN0DwPnikgeMMd636MtyMwgr+QIW21V/g6lxzDGWL2yNVEo1crTVk9LjDEZxphIY0yKMeZ8l21rjDGndnDsPmPMeJf3h4wx5xhjRljFXYc9ie1kcNGEVCLCQnxeqe1wGH7+zmY+2Vns0+sGgsq6JmobW7ToSSkX2jM7gMVFhzNnTDLLcu00tTh8dt0lG228lX2Q37y3nWYfXjcQFJS3Di+uvbKVaqWJIsAtmJLBoZpG1u72Teuu+qYW/vzxLvpGh7PvUC3LtxR2ftBJpHXCIi16Uup7migC3FmjkkiIiWCxj4b0eOGLfdgr63n6mkxGpvTm6dX5OHrQAIU2nbBIqeNooghw4aEhXDopjRU7iqmsa+rWax2uaeSvq/M5Z3Qypw3vx49nD2d38RE+3t5z6irsFXVEhYeQEBPh71CUChiaKILAgsx0GpsdvN/NxUBPfpJHTWMz918wGnBWpg9KjObp1fk9pomuvaKetPheiOiERUq10kQRBCakxzE8uXe3TpO6
"" + } + }, + "f8a20f2f4b8b4c03857bcd85bf96b136": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "f8eb99b0291b45dda1b391805141e984": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "LabelModel", + "state": { + "layout": "IPY_MODEL_5afcc13ec3d94e6299bd06fb87ed7885", + "style": "IPY_MODEL_d4c91e304ca34f88a4c959ecc4683678", + "value": "beta_1" + } + }, + "f91418c725364297a60aa4983253ae07": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "f9a9a8529629435f926e28c9e2ff6d21": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "f9b983bef3a14087b6d1f966b8b041ed": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "f9cd83ba01bb440b9510e0ada3cfd4aa": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "fa3877a284354fd08f33d320314b6765": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "LabelModel", + "state": { + "layout": "IPY_MODEL_57f97e2ebec542f8b297365916bf571e", + "style": "IPY_MODEL_454021a337164bae8a96f5a5a7749b78", + "value": "decay" + } + }, + "faea715cb8894b8ca444f80d17c07e12": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "fb06877af7ae451baefc12dfd27d9348": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "border": "dotted", + "grid_template_areas": "\"widget001 widget002\"\n\"widget003 widget004\"\n\"widget005 widget006\"\n\"widget007 widget008\"\n\"widget009 widget010\"\n\"widget011 widget012\"\n\"widget013 widget014\"", + "grid_template_columns": "repeat(2, 1fr)", + "grid_template_rows": "repeat(7, 1fr)" + } + }, + "fb19638e8a38465f844aaf06c6378b29": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "fbd450c8b01f4ab9ae7ea1caa129bd66": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "align_content": "center", + "align_items": "center", + "border": "dotted" + } + }, + "fc20a5f1e967425c840960c1948f00c8": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "grid_area": "widget011" + } + }, + "fc69d16aa7e547b09859e2ca7dbfbde8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "VBoxModel", + "state": { + "children": [ + "IPY_MODEL_97f58376ed524fab85dde1ea5f67ee17", + "IPY_MODEL_0dc03ae5db46484a85272ce1899e53c0" + ], + "layout": "IPY_MODEL_81f34a95028440608c8a5a307cd7ee9b" + } + }, + "fc6a2f4827034d64b99a15547f3d9f43": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "LabelModel", + "state": { + "layout": "IPY_MODEL_cf3de6c59d124068af4aef37293c26e2", + "style": "IPY_MODEL_1222c8a942134f83aa262d9b321ee413", + "value": "render" + } 
+ }, + "fc83fd9df36b4c0fa6ee544fe520cde7": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "grid_area": "widget007" + } + }, + "fca1d8802f264b48aa3f7bef2b5f5b81": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_c096b60cb96b4aa68be8728e6feb2366", + "style": "IPY_MODEL_7532b84aea3a4f4290efa4b0369e846a", + "value": "Algorithm Parameters" + } + }, + "fca98009fe56433b97f1fd16969f9a35": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "LabelModel", + "state": { + "layout": "IPY_MODEL_43f9446733e242f1977bbe394ddc479b", + "style": "IPY_MODEL_660e8c250f974ff685128c61b3d57fe3", + "value": "Environment settings" + } + }, + "fd1693effce0420c8f4bbbebde0ef7c3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "LabelModel", + "state": { + "layout": "IPY_MODEL_4fa0861e758940d9b9c2775304ebb140", + "style": "IPY_MODEL_661fd55473c0431aa9dffd6876d1d559", + "value": "Input(shape=(None, 3), name='input_layer')" + } + }, + "fe547223f16e423fa8493d4c6ae577ba": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "VBoxModel", + "state": { + "children": [ + "IPY_MODEL_f77e6fff86704faea6c01e0262104c70", + "IPY_MODEL_9e37b046f2d841dd9572b2284a729bf5" + ], + "layout": "IPY_MODEL_48a97cf1c4a44a858c3376f962060321" + } + }, + "fe6a7094bdd649e6b5270a701e12253a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "LabelModel", + "state": { + "layout": "IPY_MODEL_a860d9c958c646aa89ae598dc67eaa08", + "style": "IPY_MODEL_85514e8a938240e7b2df7c2a8ad6b6e8", + "value": "Dense(n_units=1, No Activation, in_channels='64', name='dense_1')" + } + }, + "fe785154b75c4badbab0d946f05802cf": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": {} + }, + "ff06931e66b544389c8f409734b472e3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "grid_template_areas": "\n \"net_label net_info\"\n \"opt_label opt_info\"\n " + } + }, + "ff0e9f4940eb4b57bd99d96059b5e194": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "ffce2434eb114cd1a7f6961dd71ff755": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "align_items": "center" + } + } + }, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/rlzoo/run_rlzoo.py b/rlzoo/run_rlzoo.py old mode 100644 new mode 100755 index 4ad2581..3618434 --- a/rlzoo/run_rlzoo.py +++ b/rlzoo/run_rlzoo.py @@ -1,60 +1,60 @@ -from rlzoo.common.env_wrappers import * -from rlzoo.common.utils import * -from rlzoo.algorithms import * - -# EnvName = 'PongNoFrameskip-v4' -# EnvType = 'atari' - -# EnvName = 'CartPole-v0' -EnvName = 'Pendulum-v0' -EnvType = 'classic_control' - -# EnvName = 'BipedalWalker-v2' -# EnvType = 'box2d' - -# EnvName = 'Ant-v2' -# EnvType = 'mujoco' - -# EnvName = 'FetchPush-v1' -# EnvType = 'robotics' - -# EnvName = 'FishSwim-v0' -# EnvType = 'dm_control' - -# EnvName = 'ReachTarget' -# EnvType = 'rlbench' -# env = build_env(EnvName, 
EnvType, state_type='vision') - -AlgName = 'SAC' -env = build_env(EnvName, EnvType) -alg_params, learn_params = call_default_params(env, EnvType, AlgName) -alg = eval(AlgName+'(**alg_params)') -alg.learn(env=env, mode='train', render=False, **learn_params) -alg.learn(env=env, mode='test', render=True, **learn_params) - -# AlgName = 'DPPO' -# number_workers = 2 # need to specify number of parallel workers in parallel algorithms like A3C and DPPO -# env = build_env(EnvName, EnvType, nenv=number_workers) -# alg_params, learn_params = call_default_params(env, EnvType, AlgName) -# alg_params['method'] = 'clip' # specify 'clip' or 'penalty' method for different version of PPO and DPPO -# alg = eval(AlgName+'(**alg_params)') -# alg.learn(env=env, mode='train', render=False, **learn_params) -# alg.learn(env=env, mode='test', render=True, **learn_params) - -# AlgName = 'PPO' -# env = build_env(EnvName, EnvType) -# alg_params, learn_params = call_default_params(env, EnvType, AlgName) -# alg_params['method'] = 'clip' # specify 'clip' or 'penalty' method for different version of PPO and DPPO -# alg = eval(AlgName+'(**alg_params)') -# alg.learn(env=env, mode='train', render=False, **learn_params) -# alg.learn(env=env, mode='test', render=True, **learn_params) - -# AlgName = 'A3C' -# number_workers = 2 # need to specify number of parallel workers -# env = build_env(EnvName, EnvType, nenv=number_workers) -# alg_params, learn_params = call_default_params(env, EnvType, 'A3C') -# alg = eval(AlgName+'(**alg_params)') -# alg.learn(env=env, mode='train', render=False, **learn_params) -# alg.learn(env=env, mode='test', render=True, **learn_params) - -env.close() +from rlzoo.common.env_wrappers import * +from rlzoo.common.utils import * +from rlzoo.algorithms import * + +# EnvName = 'PongNoFrameskip-v4' +# EnvType = 'atari' + +# EnvName = 'CartPole-v0' +EnvName = 'Pendulum-v0' +EnvType = 'classic_control' + +# EnvName = 'BipedalWalker-v2' +# EnvType = 'box2d' + +# EnvName = 'Ant-v2' +# EnvType = 'mujoco' + +# EnvName = 'FetchPush-v1' +# EnvType = 'robotics' + +# EnvName = 'FishSwim-v0' +# EnvType = 'dm_control' + +# EnvName = 'ReachTarget' +# EnvType = 'rlbench' +# env = build_env(EnvName, EnvType, state_type='vision') + +AlgName = 'SAC' +env = build_env(EnvName, EnvType) +alg_params, learn_params = call_default_params(env, EnvType, AlgName) +alg = eval(AlgName+'(**alg_params)') +alg.learn(env=env, mode='train', render=False, **learn_params) +alg.learn(env=env, mode='test', render=True, **learn_params) + +# AlgName = 'DPPO' +# number_workers = 2 # need to specify number of parallel workers in parallel algorithms like A3C and DPPO +# env = build_env(EnvName, EnvType, nenv=number_workers) +# alg_params, learn_params = call_default_params(env, EnvType, AlgName) +# alg_params['method'] = 'clip' # specify 'clip' or 'penalty' method for different version of PPO and DPPO +# alg = eval(AlgName+'(**alg_params)') +# alg.learn(env=env, mode='train', render=False, **learn_params) +# alg.learn(env=env, mode='test', render=True, **learn_params) + +# AlgName = 'PPO' +# env = build_env(EnvName, EnvType) +# alg_params, learn_params = call_default_params(env, EnvType, AlgName) +# alg_params['method'] = 'clip' # specify 'clip' or 'penalty' method for different version of PPO and DPPO +# alg = eval(AlgName+'(**alg_params)') +# alg.learn(env=env, mode='train', render=False, **learn_params) +# alg.learn(env=env, mode='test', render=True, **learn_params) + +# AlgName = 'A3C' +# number_workers = 2 # need to specify number of parallel 
workers +# env = build_env(EnvName, EnvType, nenv=number_workers) +# alg_params, learn_params = call_default_params(env, EnvType, 'A3C') +# alg = eval(AlgName+'(**alg_params)') +# alg.learn(env=env, mode='train', render=False, **learn_params) +# alg.learn(env=env, mode='test', render=True, **learn_params) + +env.close() diff --git a/setup.py b/setup.py index e69c424..36aa0d4 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ setup( name = "rlzoo", - version = "1.0.3", + version = "1.0.4", include_package_data=True, author='Zihan Ding, Tianyang Yu, Yanhua Huang, Hongming Zhang, Hao Dong', author_email='zhding@mail.ustc.edu.cn',