# forked from krasheninnikov/max-causal-ent-irl
import numpy as np


class MDP(object):
    '''
    MDP object

    Attributes
    ----------
    self.nS : int
        Number of states in the MDP.
    self.nA : int
        Number of actions in the MDP.
    self.P : two-level dict of lists of tuples
        First key is the state and the second key is the action.
        self.P[state][action] is a list of tuples (prob, nextstate, reward).
    self.T : 3D numpy array
        The transition probability matrix of the MDP: p(s'|s,a) = self.T[s,a,s'].
    '''

    def __init__(self, env):
        P, nS, nA, desc = MDP.env2mdp(env)
        self.P = P        # state transition and reward probabilities, see class docstring
        self.nS = nS      # number of states
        self.nA = nA      # number of actions
        self.desc = desc  # 2D array specifying what each grid cell means
        self.env = env
        self.T = self.get_transition_matrix()
        self.s = self.reset()

    @staticmethod
    def env2mdp(env):
        '''Extract (P, nS, nA, desc) from a gym-style discrete env, dropping the 'done' flag.'''
        return ({s: {a: [tup[:3] for tup in tups]
                     for (a, tups) in a2d.items()} for (s, a2d) in env.P.items()},
                env.nS, env.nA, env.desc)

    def get_transition_matrix(self):
        '''Return a matrix with index S,A,S' -> P(S'|S,A).'''
        T = np.zeros([self.nS, self.nA, self.nS])
        for s in range(self.nS):
            for a in range(self.nA):
                for (prob, s_prime, _reward) in self.P[s][a]:
                    # Accumulate: several tuples may lead to the same next state.
                    T[s, a, s_prime] += prob
        return T

    def reset(self):
        '''Reset the MDP to the initial state (state 0) and return it.'''
        self.s = 0
        return self.s

    def step(self, a, s=None):
        '''Take action a from state s (defaults to the current state);
        sample, store, and return the next state.'''
        if s is None:
            s = self.s
        transitions = self.P[s][a]
        if len(transitions) == 1:
            # Deterministic transition.
            self.s = transitions[0][1]
        else:
            # Sample the next state according to the transition probabilities.
            p_s_sa = np.asarray(transitions)[:, 0]
            next_state_index = np.random.choice(len(p_s_sa), p=p_s_sa)
            self.s = transitions[next_state_index][1]
        return self.s
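
# Illustrative example of the self.P / self.T layout (an assumption for
# exposition, not taken from the original repository): for a 2-state,
# 1-action MDP where action 0 from state 0 moves to state 1 with
# probability 0.9 and stays put with probability 0.1, and state 1 is
# absorbing with reward 1.0, self.P would be
#
#     {0: {0: [(0.9, 1, 0.0), (0.1, 0, 0.0)]},
#      1: {0: [(1.0, 1, 1.0)]}}
#
# and get_transition_matrix() would produce
#
#     T[0, 0, :] = [0.1, 0.9]
#     T[1, 0, :] = [0.0, 1.0]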


class MDPOneTimeR(MDP):
    '''
    MDP with a one-time reward: an extra absorbing state with zero reward is
    added, to which the agent deterministically transitions after collecting
    the reward once; it then stays in that state forever.

    Attributes
    ----------
    self.nS : int
        Number of states in the MDP.
    self.nA : int
        Number of actions in the MDP.
    self.P : two-level dict of lists of tuples
        First key is the state and the second key is the action.
        self.P[state][action] is a list of tuples (prob, nextstate, reward).
    self.T : 3D numpy array
        The transition probability matrix of the MDP: p(s'|s,a) = self.T[s,a,s'].
    '''

    def __init__(self, env):
        super().__init__(env)
        # The last original state (assumed to be the one where the reward is
        # collected) now transitions deterministically to the new absorbing state.
        self.P.update({self.nS - 1: {a: [(1.0, self.nS, 0.0)]
                                     for a in range(self.nA)}})
        # The absorbing state transitions to itself with zero reward for every action.
        self.P.update({self.nS: {a: [(1.0, self.nS, 0.0)]
                                 for a in range(self.nA)}})
        self.nS += 1
        self.T = self.get_transition_matrix()
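

# Usage sketch (an assumption, not part of the original file): build the MDP
# wrappers from a gym-style discrete environment. Classic gym's FrozenLake-v0
# exposes env.P, env.nS, env.nA and env.desc on the unwrapped env; newer
# gym/gymnasium releases organize these differently, so treat this as
# illustrative only.
if __name__ == '__main__':
    import gym

    env = gym.make('FrozenLake-v0').unwrapped
    mdp = MDP(env)
    print('states:', mdp.nS, 'actions:', mdp.nA)
    print('T shape:', mdp.T.shape)  # (nS, nA, nS); each (s, a) row sums to 1

    # Short random rollout from the start state.
    s = mdp.reset()
    for _ in range(5):
        s = mdp.step(np.random.randint(mdp.nA))
    print('state after 5 random steps:', s)

    # The one-time-reward variant adds a single absorbing state.
    mdp_otr = MDPOneTimeR(env)
    print('one-time-reward MDP has', mdp_otr.nS, 'states')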