algorithms.planner

Author: Miguel Morales
BSD 3-Clause License

Copyright (c) 2018, Miguel Morales. All rights reserved.
https://github.com/mimoralea/gdrl/blob/master/LICENSE

  1"""
  2Author: Miguel Morales
  3BSD 3-Clause License
  4
  5Copyright (c) 2018, Miguel Morales
  6All rights reserved.
  7https://github.com/mimoralea/gdrl/blob/master/LICENSE
  8"""
  9
 10"""
 11modified by: John Mansfield
 12
 13documentation added by: Gagandeep Randhawa
 14"""
 15
 16"""
 17Class that contains functions related to planning algorithms (Value Iteration, Policy Iteration). 
 18Planner init expects a reward and transitions matrix P, which is nested dictionary gym style discrete environment 
 19where P[state][action] is a list of tuples (probability, next state, reward, terminal).
 20
 21Model-based learning algorithms: Value Iteration and Policy Iteration
 22"""

import warnings

import numpy as np


class Planner:
    def __init__(self, P):
        self.P = P

    def value_iteration(self, gamma=1.0, n_iters=1000, theta=1e-10, dtype=np.float32):
        """
        PARAMETERS:

        gamma {float}:
            Discount factor

        n_iters {int}:
            Maximum number of iterations

        theta {float}:
            Convergence criterion for value iteration.
            State values are considered converged when the maximum difference between new and previous state values is less than theta.
            Stops at n_iters or theta convergence - whichever comes first.

        dtype {numpy dtype}:
            Data type of the value arrays (defaults to np.float32)


        RETURNS:

        V {numpy array}, shape (nS,):
            State values array

        V_track {numpy array}, shape (n_iters, nS):
            Log of V(s) for each iteration

        pi {dict}:
            Policy mapping each state to its greedy action
        """
        V = np.zeros(len(self.P), dtype=dtype)
        V_track = np.zeros((n_iters, len(self.P)), dtype=dtype)
        i = 0
        converged = False
        while i < n_iters - 1 and not converged:
            i += 1
            Q = np.zeros((len(self.P), len(self.P[0])), dtype=dtype)
            for s in range(len(self.P)):
                for a in range(len(self.P[s])):
                    for prob, next_state, reward, done in self.P[s][a]:
                        Q[s][a] += prob * (reward + gamma * V[next_state] * (not done))
            if np.max(np.abs(V - np.max(Q, axis=1))) < theta:
                converged = True
            V = np.max(Q, axis=1)
            V_track[i] = V
        if not converged:
            warnings.warn("Max iterations reached before convergence. Check n_iters.")

        pi = {s: a for s, a in enumerate(np.argmax(Q, axis=1))}
        return V, V_track, pi

    def value_iteration_vectorized(
        self, gamma=1.0, n_iters=1000, theta=1e-10, dtype=np.float32
    ):
        """
        PARAMETERS:

        gamma {float}:
            Discount factor

        n_iters {int}:
            Maximum number of iterations

        theta {float}:
            Convergence criterion for value iteration.
            State values are considered converged when the maximum difference between new and previous state values is less than theta.
            Stops at n_iters or theta convergence - whichever comes first.

        dtype {numpy dtype}:
            Data type of the value arrays (defaults to np.float32)


        RETURNS:

        V {numpy array}, shape (nS,):
            State values array

        V_track {numpy array}, shape (n_iters, nS):
            Log of V(s) for each iteration

        pi {dict}:
            Policy mapping each state to its greedy action
        """
        S = len(self.P)
        A = len(self.P[0])

        max_K = max(len(self.P[s][a]) for s in range(S) for a in range(A))

        prob_array = np.zeros((S, A, max_K), dtype=dtype)
        next_state_array = np.zeros((S, A, max_K), dtype=np.int32)
        reward_array = np.zeros((S, A, max_K), dtype=dtype)
        done_array = np.zeros((S, A, max_K), dtype=bool)
        mask_array = np.zeros((S, A, max_K), dtype=bool)

        for s in range(S):
            for a in range(A):
                transitions = self.P[s][a]
                for k, (prob, next_state, reward, done) in enumerate(transitions):
                    prob_array[s, a, k] = prob
                    next_state_array[s, a, k] = next_state
                    reward_array[s, a, k] = reward
                    done_array[s, a, k] = done
                    mask_array[s, a, k] = True

        V = np.zeros(S, dtype=dtype)
        V_track = np.zeros((n_iters, S), dtype=dtype)
        converged = False
        i = 0

        # Simpler way to handle done states
        not_done_array = 1 - done_array

        while i < n_iters - 1 and not converged:
            i += 1

            Q = np.sum(
                prob_array
                * (reward_array + gamma * V[next_state_array] * not_done_array)
                * mask_array,
                axis=2,
            )
            V_new = np.max(Q, axis=1)

            if np.max(np.abs(V - V_new)) < theta:
                converged = True

            V = V_new
            V_track[i] = V

        if not converged:
            warnings.warn("Max iterations reached before convergence. Check n_iters.")

        return V, V_track, dict(enumerate(np.argmax(Q, axis=1)))

    def policy_iteration(self, gamma=1.0, n_iters=50, theta=1e-10, dtype=np.float32):
        """
        PARAMETERS:

        gamma {float}:
            Discount factor

        n_iters {int}:
            Maximum number of iterations

        theta {float}:
            Convergence criterion for policy evaluation.
            State values are considered converged when the maximum difference between new and previous state
            values is less than theta.
            Stops at n_iters or when the policy is stable - whichever comes first.

        dtype {numpy dtype}:
            Data type of the value arrays (defaults to np.float32)


        RETURNS:

        V {numpy array}, shape (nS,):
            State values array

        V_track {numpy array}, shape (n_iters, nS):
            Log of V(s) for each iteration

        pi {dict}:
            Policy mapping each state to its greedy action
        """
        random_actions = np.random.choice(tuple(self.P[0].keys()), len(self.P))

        pi = {s: a for s, a in enumerate(random_actions)}
        # initial V to give to `policy_evaluation` for the first time
        V = np.zeros(len(self.P), dtype=dtype)
        V_track = np.zeros((n_iters, len(self.P)), dtype=dtype)
        i = 0
        converged = False
        while i < n_iters - 1 and not converged:
            i += 1
            old_pi = pi
            V = self.policy_evaluation(pi, V, gamma=gamma, theta=theta, dtype=dtype)
            V_track[i] = V
            pi = self.policy_improvement(V, gamma=gamma, dtype=dtype)
            if old_pi == pi:
                converged = True
        if not converged:
            warnings.warn("Max iterations reached before convergence. Check n_iters.")
        return V, V_track, pi

    def policy_evaluation(self, pi, prev_V, gamma=1.0, theta=1e-10, dtype=np.float32):
        # Iteratively evaluate the fixed policy pi, sweeping over all states until the
        # maximum change in V(s) between sweeps is less than theta.
        while True:
            V = np.zeros(len(self.P), dtype=dtype)
            for s in range(len(self.P)):
                for prob, next_state, reward, done in self.P[s][pi[s]]:
                    V[s] += prob * (reward + gamma * prev_V[next_state] * (not done))
            if np.max(np.abs(prev_V - V)) < theta:
                break
            prev_V = V.copy()
        return V

    def policy_improvement(self, V, gamma=1.0, dtype=np.float32):
        # Build the action-value function Q from V, then return the greedy policy
        # as a dict mapping each state to its argmax action.
        Q = np.zeros((len(self.P), len(self.P[0])), dtype=dtype)
        for s in range(len(self.P)):
            for a in range(len(self.P[s])):
                for prob, next_state, reward, done in self.P[s][a]:
                    Q[s][a] += prob * (reward + gamma * V[next_state] * (not done))

        return dict(enumerate(np.argmax(Q, axis=1)))

class Planner:
Planner(P)

P {dict}: Gym-style transition matrix for a discrete environment, where P[state][action] is a list of (probability, next_state, reward, terminal) tuples.
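
A minimal construction sketch. The 2-state MDP below is hypothetical, and the import path assumes the module is importable as algorithms.planner (the name shown above):

from algorithms.planner import Planner

# Hypothetical 2-state, 2-action MDP: action 1 in state 0 reaches the terminal
# state 1 with probability 0.7 and reward 1; state 1 is absorbing.
P = {
    0: {0: [(1.0, 0, 0.0, False)],
        1: [(0.7, 1, 1.0, True), (0.3, 0, 0.0, False)]},
    1: {0: [(1.0, 1, 0.0, True)],
        1: [(1.0, 1, 0.0, True)]},
}

planner = Planner(P)
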
def value_iteration(self, gamma=1.0, n_iters=1000, theta=1e-10, dtype=np.float32):

PARAMETERS:

gamma {float}: Discount factor

n_iters {int}: Maximum number of iterations

theta {float}: Convergence criterion for value iteration. State values are considered converged when the maximum difference between new and previous state values is less than theta. Stops at n_iters or theta convergence - whichever comes first.

dtype {numpy dtype}: Data type of the value arrays (defaults to np.float32)

RETURNS:

V {numpy array}, shape (nS,): State values array

V_track {numpy array}, shape (n_iters, nS): Log of V(s) for each iteration

pi {dict}: Policy mapping each state to its greedy action
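
A usage sketch on a hypothetical toy MDP (not taken from the library): value_iteration returns the converged state values, the per-iteration log, and the greedy policy.

from algorithms.planner import Planner

# Hypothetical 2-state MDP: in state 0, action 1 reaches the terminal state with reward 1.
P = {0: {0: [(1.0, 0, 0.0, False)], 1: [(1.0, 1, 1.0, True)]},
     1: {0: [(1.0, 1, 0.0, True)], 1: [(1.0, 1, 0.0, True)]}}

V, V_track, pi = Planner(P).value_iteration(gamma=0.9)
print(V)              # state values, [1. 0.] for this toy MDP
print(pi[0])          # 1 -- the greedy action in state 0
print(V_track.shape)  # (1000, 2): V logged per iteration; rows after convergence remain zero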

def value_iteration_vectorized(self, gamma=1.0, n_iters=1000, theta=1e-10, dtype=np.float32):

PARAMETERS:

gamma {float}: Discount factor

n_iters {int}: Maximum number of iterations

theta {float}: Convergence criterion for value iteration. State values are considered converged when the maximum difference between new and previous state values is less than theta. Stops at n_iters or theta convergence - whichever comes first.

dtype {numpy dtype}: Data type of the value arrays (defaults to np.float32)

RETURNS:

V {numpy array}, shape (nS,): State values array

V_track {numpy array}, shape (n_iters, nS): Log of V(s) for each iteration

pi {dict}: Policy mapping each state to its greedy action
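
The vectorized variant precomputes dense (S, A, K) transition tensors and should produce the same values and policy as the loop-based value_iteration. A quick check on a hypothetical toy MDP:

import numpy as np

from algorithms.planner import Planner

P = {0: {0: [(1.0, 0, 0.0, False)], 1: [(1.0, 1, 1.0, True)]},
     1: {0: [(1.0, 1, 0.0, True)], 1: [(1.0, 1, 0.0, True)]}}

planner = Planner(P)
V_loop, _, pi_loop = planner.value_iteration(gamma=0.9)
V_vec, _, pi_vec = planner.value_iteration_vectorized(gamma=0.9)

print(np.allclose(V_loop, V_vec))  # True -- both solvers agree on state values
print(pi_loop == pi_vec)           # True -- and on the greedy policy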

def policy_iteration(self, gamma=1.0, n_iters=50, theta=1e-10, dtype=np.float32):

PARAMETERS:

gamma {float}: Discount factor

n_iters {int}: Maximum number of iterations

theta {float}: Convergence criterion for policy evaluation. State values are considered converged when the maximum difference between new and previous state values is less than theta. Stops at n_iters or when the policy is stable - whichever comes first.

dtype {numpy dtype}: Data type of the value arrays (defaults to np.float32)

RETURNS:

V {numpy array}, shape (nS,): State values array

V_track {numpy array}, shape (n_iters, nS): Log of V(s) for each iteration

pi {dict}: Policy mapping each state to its greedy action
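
A usage sketch on the same kind of hypothetical toy MDP; policy_iteration alternates policy_evaluation and policy_improvement until the policy stops changing:

from algorithms.planner import Planner

P = {0: {0: [(1.0, 0, 0.0, False)], 1: [(1.0, 1, 1.0, True)]},
     1: {0: [(1.0, 1, 0.0, True)], 1: [(1.0, 1, 0.0, True)]}}

V, V_track, pi = Planner(P).policy_iteration(gamma=0.9, n_iters=50)
print(pi[0])  # 1 -- the action that reaches the rewarding terminal state
print(V[0])   # 1.0 for this toy MDP with gamma=0.9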

def policy_evaluation(self, pi, prev_V, gamma=1.0, theta=1e-10, dtype=np.float32):

Iteratively evaluates the fixed policy pi, starting from prev_V and sweeping over all states until the maximum change in state values between sweeps is less than theta. Returns the state-value array V for that policy.
def policy_improvement(self, V, gamma=1.0, dtype=np.float32):

Computes the action-value function Q from V and returns the greedy policy as a dict mapping each state to its argmax action.
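
policy_evaluation and policy_improvement are the two halves of policy_iteration and can also be called directly. A sketch of one manual evaluation/improvement step on a hypothetical toy MDP:

import numpy as np

from algorithms.planner import Planner

P = {0: {0: [(1.0, 0, 0.0, False)], 1: [(1.0, 1, 1.0, True)]},
     1: {0: [(1.0, 1, 0.0, True)], 1: [(1.0, 1, 0.0, True)]}}

planner = Planner(P)
pi = {0: 0, 1: 0}  # start from the "always take action 0" policy
V = planner.policy_evaluation(pi, np.zeros(len(P), dtype=np.float32), gamma=0.9)
pi_new = planner.policy_improvement(V, gamma=0.9)
print(pi_new[0])   # 1 -- the improved policy switches state 0 to the rewarding action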