07_1_q_net_cartpole.py
'''
This code is based on
https://github.com/hunkim/DeepRL-Agents
'''
import gym
import numpy as np
import matplotlib.pyplot as plt
import time
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # default value = 0; see http://stackoverflow.com/questions/35911252/disable-tensorflow-debugging-information
import tensorflow as tf
env = gym.make('CartPole-v0')
# Constants defining our neural network
learning_rate = 1e-1
input_size = env.observation_space.shape[0]
output_size = env.action_space.n

X = tf.placeholder(tf.float32, [None, input_size], name="input_x")
# First layer of weights
W1 = tf.get_variable("W1", shape=[input_size, output_size],
                     initializer=tf.contrib.layers.xavier_initializer())  # weight
Qpred = tf.matmul(X, W1)
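# The "network" is a single linear layer with no bias and no hidden units:
# it maps the 4-dimensional CartPole observation directly to one Q-value
# per action (output_size = 2: push left / push right).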
# We need to define the parts of the network needed for learning a policy
Y = tf.placeholder(shape=[1, output_size], dtype=tf.float32) # Y label
# Loss function
loss = tf.reduce_sum(tf.square(Y - Qpred))
# Learning
train = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)
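# The loss is the squared error between the target Q-values fed in through Y
# and the network's current prediction Qpred; Adam adjusts W1 to reduce it,
# one (state, target) pair at a time.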
# Values for q-learning
dis = .99
num_episodes = 2000
rList = []
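# dis is the discount factor used in the Q-learning target; rList records
# how many steps each episode lasted (CartPole gives +1 reward per step).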
start_time = time.time()
init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    for i in range(num_episodes):
        e = 1. / ((i / 10) + 1)
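        # The exploration rate e decays as 1 / (i/10 + 1): early episodes act
        # mostly at random, later episodes mostly follow the Q-network.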
        rAll = 0
        step_count = 0
        s = env.reset()
        done = False

        # The Q-network learning algorithm for one episode
        while not done:
            step_count += 1
            x = np.reshape(s, [1, input_size])
            # Choose an action greedily (with a chance of random action) from the Q-network
            Qs = sess.run(Qpred, feed_dict={X: x})
            if np.random.rand(1) < e:
                a = env.action_space.sample()
            else:
                a = np.argmax(Qs)

            # Get new state and reward from environment
            s1, reward, done, _ = env.step(a)
            if done:
                # Update Q; there is no next-state Q, since s1 is a terminal state.
                # -100 is a fixed penalty for letting the episode end.
                Qs[0, a] = -100
            else:
                x1 = np.reshape(s1, [1, input_size])
                # Obtain the Q-values of the next state by feeding it through our network
                Qs1 = sess.run(Qpred, feed_dict={X: x1})
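                # Standard Q-learning target for the chosen action:
                #   Q(s, a) <- reward + dis * max_a' Q(s1, a')
                # Only the entry for action a is overwritten; the other entries
                # of Qs keep the network's own predictions, so they contribute
                # no error to the squared loss.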
                Qs[0, a] = reward + dis * np.max(Qs1)

            # Train our network using target (Y) and predicted Q (Qpred) values
            sess.run(train, feed_dict={X: x, Y: Qs})
            s = s1

        rList.append(step_count)
        print("Episode: {}  steps: {}".format(i, step_count))
        # Stop early once the average of the last 10 episodes exceeds 500 steps
        if len(rList) > 10 and np.mean(rList[-10:]) > 500:
            break
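        # Note: gym's CartPole-v0 caps episodes at 200 steps (TimeLimit wrapper),
        # so this 500-step threshold is effectively never met and training
        # normally runs for all 2000 episodes.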
    print("--- %s seconds ---" % (time.time() - start_time))

    # See our trained network in action
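    # At test time the action is always the argmax of Qpred; the epsilon-random
    # exploration used during training is dropped.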
    observation = env.reset()
    reward_sum = 0
    while True:
        env.render()
        x = np.reshape(observation, [1, input_size])
        Qs = sess.run(Qpred, feed_dict={X: x})
        a = np.argmax(Qs)
        observation, reward, done, _ = env.step(a)
        reward_sum += reward
        if done:
            print("Total score: {}".format(reward_sum))
            break