leduc_holdem_nfsp_save_model.py
''' An example of training an NFSP agent on Leduc Hold'em.

The hyperparameters are slightly tuned for better performance, based on
play against the agent through the human interfaces, so they may differ
from those in the version of this example without the model-saving code.
Please contact us if you find better hyperparameters. Models can be saved
with a similar procedure for other games/algorithms. For loading saved
models, refer to
https://github.com/datamllab/rlcard/blob/master/rlcard/models/pretrained_models.py
'''
import os

import tensorflow as tf

import rlcard
from rlcard.agents.nfsp_agent import NFSPAgent
from rlcard.agents.random_agent import RandomAgent
from rlcard.utils.utils import set_global_seed
from rlcard.utils.logger import Logger

# Make environment
env = rlcard.make('leduc-holdem')
eval_env = rlcard.make('leduc-holdem')

# Set the number of training episodes and how often we evaluate and save the plot
evaluate_every = 100000
save_plot_every = 100000
evaluate_num = 10000
episode_num = 100000000

# Set the number of steps for collecting normalization statistics
# and the initial memory size
memory_init_size = 1000
norm_step = 1000

# The paths for saving the logs and learning curves
root_path = './experiments/leduc_holdem_nfsp_result/'
log_path = root_path + 'log.txt'
csv_path = root_path + 'performance.csv'
figure_path = root_path + 'figures/'

# Model save path (create the checkpoint directory first; the Saver does not create it)
save_dir = 'models/leduc_holdem_nfsp'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
model_path = os.path.join(save_dir, 'model')

# Set a global seed
set_global_seed(0)

with tf.Session() as sess:

    # Set agents
    global_step = tf.Variable(0, name='global_step', trainable=False)
    agents = []
    for i in range(env.player_num):
        agent = NFSPAgent(sess,
                          scope='nfsp' + str(i),
                          anticipatory_param=0.1,
                          action_num=env.action_num,
                          state_shape=env.state_shape,
                          rl_learning_rate=0.1,
                          sl_learning_rate=0.005,
                          hidden_layers_sizes=[128, 128],
                          min_buffer_size_to_learn=memory_init_size,
                          q_replay_memory_init_size=memory_init_size,
                          q_epsilon_start=0.06,
                          q_epsilon_end=0.0,
                          q_norm_step=norm_step,
                          q_mlp_layers=[128, 128])
        agents.append(agent)

    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()

    random_agent = RandomAgent(action_num=eval_env.action_num)

    env.set_agents(agents)
    eval_env.set_agents([agents[0], random_agent])

    # Count the number of steps
    step_counters = [0 for _ in range(env.player_num)]

    # Init a Logger to plot the learning curve
    logger = Logger(xlabel='timestep', ylabel='reward', legend='NFSP on Leduc Holdem', log_path=log_path, csv_path=csv_path)
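
    # Note: NFSP keeps two policies per agent, a DQN best response and a
    # supervised average policy. sample_episode_policy() below chooses which
    # one acts for the whole episode (the best response is picked with
    # probability anticipatory_param, 0.1 here).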
    for episode in range(episode_num):

        # First sample a policy for the episode
        for agent in agents:
            agent.sample_episode_policy()

        # Generate data from the environment
        trajectories, _ = env.run(is_training=True)

        # Feed transitions into agent memory, and train the agent
        for i in range(env.player_num):
            for ts in trajectories[i]:
                agents[i].feed(ts)
                step_counters[i] += 1

                # Train the agent every 64 steps once the memories are warm
                train_count = step_counters[i] - (memory_init_size + norm_step)
                if train_count > 0 and train_count % 64 == 0:
                    rl_loss = agents[i].train_rl()
                    sl_loss = agents[i].train_sl()
                    print('\rINFO - Agent {}, step {}, rl-loss: {}, sl-loss: {}'.format(i, step_counters[i], rl_loss, sl_loss), end='')
        # Evaluate the performance. Play with random agents.
        if episode % evaluate_every == 0:
            # Save the model checkpoint
            saver.save(sess, model_path)

            reward = 0
            for eval_episode in range(evaluate_num):
                _, payoffs = eval_env.run(is_training=False)
                reward += payoffs[0]

            logger.log('\n########## Evaluation ##########')
            logger.log('Timestep: {} Average reward is {}'.format(env.timestep, float(reward) / evaluate_num))

            # Add point to logger
            logger.add_point(x=env.timestep, y=float(reward) / evaluate_num)

        # Make plot
        if episode % save_plot_every == 0 and episode > 0:
            logger.make_plot(save_path=figure_path + str(episode) + '.png')

    # Make the final plot
    logger.make_plot(save_path=figure_path + 'final_' + str(episode) + '.png')
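
# A minimal sketch of how the checkpoint saved above could be restored later,
# assuming the same graph-construction code (the NFSPAgent setup above) is run
# first so that variable names and scopes match the checkpoint. For the
# maintained loading utilities, see the pretrained_models.py link in the
# module docstring.
#
#     with tf.Session() as sess:
#         # Rebuild `agents` exactly as above, then:
#         saver = tf.train.Saver()
#         saver.restore(sess, model_path)
#         # The agents now act with the trained weights, e.g.:
#         # eval_env.run(is_training=False)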