algorithms.planner
Author: Miguel Morales
BSD 3-Clause License
Copyright (c) 2018, Miguel Morales. All rights reserved.
https://github.com/mimoralea/gdrl/blob/master/LICENSE
1""" 2Author: Miguel Morales 3BSD 3-Clause License 4 5Copyright (c) 2018, Miguel Morales 6All rights reserved. 7https://github.com/mimoralea/gdrl/blob/master/LICENSE 8""" 9 10""" 11modified by: John Mansfield 12 13documentation added by: Gagandeep Randhawa 14""" 15 16""" 17Class that contains functions related to planning algorithms (Value Iteration, Policy Iteration). 18Planner init expects a reward and transitions matrix P, which is nested dictionary gym style discrete environment 19where P[state][action] is a list of tuples (probability, next state, reward, terminal). 20 21Model-based learning algorithms: Value Iteration and Policy Iteration 22""" 23 24import warnings 25 26import numpy as np 27 28 29class Planner: 30 def __init__(self, P): 31 self.P = P 32 33 def value_iteration(self, gamma=1.0, n_iters=1000, theta=1e-10, dtype=np.float32): 34 """ 35 PARAMETERS: 36 37 gamma {float}: 38 Discount factor 39 40 n_iters {int}: 41 Number of iterations 42 43 theta {float}: 44 Convergence criterion for value iteration. 45 State values are considered to be converged when the maximum difference between new and previous state values is less than theta. 46 Stops at n_iters or theta convergence - whichever comes first. 47 48 49 RETURNS: 50 51 V {numpy array}, shape(possible states): 52 State values array 53 54 V_track {numpy array}, shape(n_episodes, nS): 55 Log of V(s) for each iteration 56 57 pi {lambda}, input state value, output action value: 58 Policy mapping states to actions. 59 """ 60 V = np.zeros(len(self.P), dtype=dtype) 61 V_track = np.zeros((n_iters, len(self.P)), dtype=dtype) 62 i = 0 63 converged = False 64 while i < n_iters - 1 and not converged: 65 i += 1 66 Q = np.zeros((len(self.P), len(self.P[0])), dtype=dtype) 67 for s in range(len(self.P)): 68 for a in range(len(self.P[s])): 69 for prob, next_state, reward, done in self.P[s][a]: 70 Q[s][a] += prob * (reward + gamma * V[next_state] * (not done)) 71 if np.max(np.abs(V - np.max(Q, axis=1))) < theta: 72 converged = True 73 V = np.max(Q, axis=1) 74 V_track[i] = V 75 if not converged: 76 warnings.warn("Max iterations reached before convergence. Check n_iters.") 77 78 pi = {s: a for s, a in enumerate(np.argmax(Q, axis=1))} 79 return V, V_track, pi 80 81 def value_iteration_vectorized( 82 self, gamma=1.0, n_iters=1000, theta=1e-10, dtype=np.float32 83 ): 84 """ 85 PARAMETERS: 86 87 gamma {float}: 88 Discount factor 89 90 n_iters {int}: 91 Number of iterations 92 93 theta {float}: 94 Convergence criterion for value iteration. 95 State values are considered to be converged when the maximum difference between new and previous state values is less than theta. 96 Stops at n_iters or theta convergence - whichever comes first. 97 98 99 RETURNS: 100 101 V {numpy array}, shape(possible states): 102 State values array 103 104 V_track {numpy array}, shape(n_episodes, nS): 105 Log of V(s) for each iteration 106 107 pi {lambda}, input state value, output action value: 108 Policy mapping states to actions. 
109 """ 110 S = len(self.P) 111 A = len(self.P[0]) 112 113 max_K = max(len(self.P[s][a]) for s in range(S) for a in range(A)) 114 115 prob_array = np.zeros((S, A, max_K), dtype=dtype) 116 next_state_array = np.zeros((S, A, max_K), dtype=np.int32) 117 reward_array = np.zeros((S, A, max_K), dtype=dtype) 118 done_array = np.zeros((S, A, max_K), dtype=bool) 119 mask_array = np.zeros((S, A, max_K), dtype=bool) 120 121 for s in range(S): 122 for a in range(A): 123 transitions = self.P[s][a] 124 for k, (prob, next_state, reward, done) in enumerate(transitions): 125 prob_array[s, a, k] = prob 126 next_state_array[s, a, k] = next_state 127 reward_array[s, a, k] = reward 128 done_array[s, a, k] = done 129 mask_array[s, a, k] = True 130 131 V = np.zeros(S, dtype=dtype) 132 V_track = np.zeros((n_iters, S), dtype=dtype) 133 converged = False 134 i = 0 135 136 # Simpler way to handle done states 137 not_done_array = 1 - done_array 138 139 while i < n_iters - 1 and not converged: 140 i += 1 141 142 Q = np.sum( 143 prob_array 144 * (reward_array + gamma * V[next_state_array] * not_done_array) 145 * mask_array, 146 axis=2, 147 ) 148 V_new = np.max(Q, axis=1) 149 150 if np.max(np.abs(V - V_new)) < theta: 151 converged = True 152 153 V = V_new 154 V_track[i] = V 155 156 if not converged: 157 warnings.warn("Max iterations reached before convergence. Check n_iters.") 158 159 return V, V_track, dict(enumerate(np.argmax(Q, axis=1))) 160 161 def policy_iteration(self, gamma=1.0, n_iters=50, theta=1e-10, dtype=np.float32): 162 """ 163 PARAMETERS: 164 165 gamma {float}: 166 Discount factor 167 168 n_iters {int}: 169 Number of iterations 170 171 theta {float}: 172 Convergence criterion for policy evaluation. 173 State values are considered to be converged when the maximum difference between new and previous state 174 values is less than theta. 175 176 177 RETURNS: 178 179 V {numpy array}, shape(possible states): 180 State values array 181 182 V_track {numpy array}, shape(n_episodes, nS): 183 Log of V(s) for each iteration 184 185 pi {lambda}, input state value, output action value: 186 Policy mapping states to actions. 187 """ 188 random_actions = np.random.choice(tuple(self.P[0].keys()), len(self.P)) 189 190 pi = {s: a for s, a in enumerate(random_actions)} 191 # initial V to give to `policy_evaluation` for the first time 192 V = np.zeros(len(self.P), dtype=dtype) 193 V_track = np.zeros((n_iters, len(self.P)), dtype=dtype) 194 i = 0 195 converged = False 196 while i < n_iters - 1 and not converged: 197 i += 1 198 old_pi = pi 199 V = self.policy_evaluation(pi, V, gamma=gamma, theta=theta, dtype=dtype) 200 V_track[i] = V 201 pi = self.policy_improvement(V, gamma=gamma, dtype=dtype) 202 if old_pi == pi: 203 converged = True 204 if not converged: 205 warnings.warn("Max iterations reached before convergence. 
Check n_iters.") 206 return V, V_track, pi 207 208 def policy_evaluation(self, pi, prev_V, gamma=1.0, theta=1e-10, dtype=np.float32): 209 while True: 210 V = np.zeros(len(self.P), dtype=dtype) 211 for s in range(len(self.P)): 212 for prob, next_state, reward, done in self.P[s][pi[s]]: 213 V[s] += prob * (reward + gamma * prev_V[next_state] * (not done)) 214 if np.max(np.abs(prev_V - V)) < theta: 215 break 216 prev_V = V.copy() 217 return V 218 219 def policy_improvement(self, V, gamma=1.0, dtype=np.float32): 220 Q = np.zeros((len(self.P), len(self.P[0])), dtype=dtype) 221 for s in range(len(self.P)): 222 for a in range(len(self.P[s])): 223 for prob, next_state, reward, done in self.P[s][a]: 224 Q[s][a] += prob * (reward + gamma * V[next_state] * (not done)) 225 226 return dict(enumerate(np.argmax(Q, axis=1)))
    def value_iteration(self, gamma=1.0, n_iters=1000, theta=1e-10, dtype=np.float32):
        """
        PARAMETERS:

        gamma {float}:
            Discount factor

        n_iters {int}:
            Number of iterations

        theta {float}:
            Convergence criterion for value iteration.
            State values are considered to be converged when the maximum difference between new and previous state
            values is less than theta.
            Stops at n_iters or theta convergence - whichever comes first.

        RETURNS:

        V {numpy array}, shape(nS,):
            State values array

        V_track {numpy array}, shape(n_iters, nS):
            Log of V(s) for each iteration

        pi {dict}:
            Policy mapping each state to an action
        """
        V = np.zeros(len(self.P), dtype=dtype)
        V_track = np.zeros((n_iters, len(self.P)), dtype=dtype)
        i = 0
        converged = False
        while i < n_iters - 1 and not converged:
            i += 1
            # Bellman optimality backup: Q(s, a) = sum_k p * (r + gamma * V(s') * (not done))
            Q = np.zeros((len(self.P), len(self.P[0])), dtype=dtype)
            for s in range(len(self.P)):
                for a in range(len(self.P[s])):
                    for prob, next_state, reward, done in self.P[s][a]:
                        Q[s][a] += prob * (reward + gamma * V[next_state] * (not done))
            # Converged when the greedy value estimate changes by less than theta.
            if np.max(np.abs(V - np.max(Q, axis=1))) < theta:
                converged = True
            V = np.max(Q, axis=1)
            V_track[i] = V
        if not converged:
            warnings.warn("Max iterations reached before convergence. Check n_iters.")

        pi = {s: a for s, a in enumerate(np.argmax(Q, axis=1))}
        return V, V_track, pi
PARAMETERS:
gamma {float}: Discount factor
n_iters {int}: Number of iterations
theta {float}: Convergence criterion for value iteration. State values are considered to be converged when the maximum difference between new and previous state values is less than theta. Stops at n_iters or theta convergence - whichever comes first.
RETURNS:
V {numpy array}, shape(nS,): State values array
V_track {numpy array}, shape(n_iters, nS): Log of V(s) for each iteration
pi {dict}: Policy mapping each state to an action
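A minimal usage sketch, run against the hypothetical planner built from the toy P above (the gamma value is arbitrary). Note that V_track is pre-allocated with n_iters rows and only rows 1 through the final iteration are written; remaining rows stay zero when convergence is reached early.

V, V_track, pi = planner.value_iteration(gamma=0.99)

print(V)      # state values, shape (nS,)
print(pi[0])  # greedy action selected for state 0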
    def value_iteration_vectorized(
        self, gamma=1.0, n_iters=1000, theta=1e-10, dtype=np.float32
    ):
        """
        PARAMETERS:

        gamma {float}:
            Discount factor

        n_iters {int}:
            Number of iterations

        theta {float}:
            Convergence criterion for value iteration.
            State values are considered to be converged when the maximum difference between new and previous state
            values is less than theta.
            Stops at n_iters or theta convergence - whichever comes first.

        RETURNS:

        V {numpy array}, shape(nS,):
            State values array

        V_track {numpy array}, shape(n_iters, nS):
            Log of V(s) for each iteration

        pi {dict}:
            Policy mapping each state to an action
        """
        S = len(self.P)
        A = len(self.P[0])

        max_K = max(len(self.P[s][a]) for s in range(S) for a in range(A))

        # Pack the transition lists into dense (S, A, K) arrays, padded to the
        # longest transition list and masked where no transition exists.
        prob_array = np.zeros((S, A, max_K), dtype=dtype)
        next_state_array = np.zeros((S, A, max_K), dtype=np.int32)
        reward_array = np.zeros((S, A, max_K), dtype=dtype)
        done_array = np.zeros((S, A, max_K), dtype=bool)
        mask_array = np.zeros((S, A, max_K), dtype=bool)

        for s in range(S):
            for a in range(A):
                transitions = self.P[s][a]
                for k, (prob, next_state, reward, done) in enumerate(transitions):
                    prob_array[s, a, k] = prob
                    next_state_array[s, a, k] = next_state
                    reward_array[s, a, k] = reward
                    done_array[s, a, k] = done
                    mask_array[s, a, k] = True

        V = np.zeros(S, dtype=dtype)
        V_track = np.zeros((n_iters, S), dtype=dtype)
        converged = False
        i = 0

        # Simpler way to handle done states
        not_done_array = 1 - done_array

        while i < n_iters - 1 and not converged:
            i += 1

            # Bellman optimality backup over all (s, a, k) at once.
            Q = np.sum(
                prob_array
                * (reward_array + gamma * V[next_state_array] * not_done_array)
                * mask_array,
                axis=2,
            )
            V_new = np.max(Q, axis=1)

            if np.max(np.abs(V - V_new)) < theta:
                converged = True

            V = V_new
            V_track[i] = V

        if not converged:
            warnings.warn("Max iterations reached before convergence. Check n_iters.")

        return V, V_track, dict(enumerate(np.argmax(Q, axis=1)))
PARAMETERS:
gamma {float}: Discount factor
n_iters {int}: Number of iterations
theta {float}: Convergence criterion for value iteration. State values are considered to be converged when the maximum difference between new and previous state values is less than theta. Stops at n_iters or theta convergence - whichever comes first.
RETURNS:
V {numpy array}, shape(nS,): State values array
V_track {numpy array}, shape(n_iters, nS): Log of V(s) for each iteration
pi {dict}: Policy mapping each state to an action
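The vectorized implementation pre-packs P into dense arrays and performs the same Bellman backup with NumPy broadcasting, so it should agree with the loop-based value_iteration up to floating-point tolerance. A quick consistency check against the same hypothetical planner:

import numpy as np

V_loop, _, pi_loop = planner.value_iteration(gamma=0.99)
V_vec, _, pi_vec = planner.value_iteration_vectorized(gamma=0.99)

# State values should match up to floating-point tolerance; the greedy
# policies should also match except where Q-values are exactly tied.
assert np.allclose(V_loop, V_vec, atol=1e-5)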
    def policy_iteration(self, gamma=1.0, n_iters=50, theta=1e-10, dtype=np.float32):
        """
        PARAMETERS:

        gamma {float}:
            Discount factor

        n_iters {int}:
            Number of iterations

        theta {float}:
            Convergence criterion for policy evaluation.
            State values are considered to be converged when the maximum difference between new and previous state
            values is less than theta.

        RETURNS:

        V {numpy array}, shape(nS,):
            State values array

        V_track {numpy array}, shape(n_iters, nS):
            Log of V(s) for each iteration

        pi {dict}:
            Policy mapping each state to an action
        """
        random_actions = np.random.choice(tuple(self.P[0].keys()), len(self.P))

        pi = {s: a for s, a in enumerate(random_actions)}
        # initial V to give to `policy_evaluation` for the first time
        V = np.zeros(len(self.P), dtype=dtype)
        V_track = np.zeros((n_iters, len(self.P)), dtype=dtype)
        i = 0
        converged = False
        while i < n_iters - 1 and not converged:
            i += 1
            old_pi = pi
            V = self.policy_evaluation(pi, V, gamma=gamma, theta=theta, dtype=dtype)
            V_track[i] = V
            pi = self.policy_improvement(V, gamma=gamma, dtype=dtype)
            # Converged once greedy improvement no longer changes the policy.
            if old_pi == pi:
                converged = True
        if not converged:
            warnings.warn("Max iterations reached before convergence. Check n_iters.")
        return V, V_track, pi
PARAMETERS:
gamma {float}: Discount factor
n_iters {int}: Number of iterations
theta {float}: Convergence criterion for policy evaluation. State values are considered to be converged when the maximum difference between new and previous state values is less than theta.
RETURNS:
V {numpy array}, shape(nS,): State values array
V_track {numpy array}, shape(n_iters, nS): Log of V(s) for each iteration
pi {dict}: Policy mapping each state to an action
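Usage mirrors value_iteration; a sketch against the same hypothetical planner. Here n_iters bounds the number of evaluate-improve sweeps, and for small MDPs the policy typically stabilizes well before the default of 50.

V, V_track, pi = planner.policy_iteration(gamma=0.99)

print(pi)  # dict mapping each state to its greedy action under the converged policy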
    def policy_evaluation(self, pi, prev_V, gamma=1.0, theta=1e-10, dtype=np.float32):
        # Iterative policy evaluation: repeatedly apply the Bellman expectation
        # backup for the fixed policy pi until the state values change by less
        # than theta between sweeps.
        while True:
            V = np.zeros(len(self.P), dtype=dtype)
            for s in range(len(self.P)):
                for prob, next_state, reward, done in self.P[s][pi[s]]:
                    V[s] += prob * (reward + gamma * prev_V[next_state] * (not done))
            if np.max(np.abs(prev_V - V)) < theta:
                break
            prev_V = V.copy()
        return V
    def policy_improvement(self, V, gamma=1.0, dtype=np.float32):
        # Greedy policy improvement: compute Q(s, a) from the current state
        # values and pick the highest-valued action in each state.
        Q = np.zeros((len(self.P), len(self.P[0])), dtype=dtype)
        for s in range(len(self.P)):
            for a in range(len(self.P[s])):
                for prob, next_state, reward, done in self.P[s][a]:
                    Q[s][a] += prob * (reward + gamma * V[next_state] * (not done))

        return dict(enumerate(np.argmax(Q, axis=1)))
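policy_iteration is simply these two helpers alternated until the greedy policy stops changing. A minimal hand-rolled version of that loop, illustrative only and using the same hypothetical planner as above (the real method also logs V_track and warns if the iteration cap is hit):

import numpy as np

nS = len(planner.P)
pi = {s: 0 for s in range(nS)}        # start from an arbitrary deterministic policy
V = np.zeros(nS, dtype=np.float32)

while True:
    V = planner.policy_evaluation(pi, V, gamma=0.99)    # evaluate the current policy
    new_pi = planner.policy_improvement(V, gamma=0.99)  # act greedily w.r.t. V
    if new_pi == pi:                                    # policy stable -> optimal
        break
    pi = new_pi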