bettermdptools.algorithms.rl
Author: Miguel Morales. BSD 3-Clause License.
Copyright (c) 2018, Miguel Morales. All rights reserved. https://github.com/mimoralea/gdrl/blob/master/LICENSE
1""" 2Author: Miguel Morales 3BSD 3-Clause License 4 5Copyright (c) 2018, Miguel Morales 6All rights reserved. 7https://github.com/mimoralea/gdrl/blob/master/LICENSE 8""" 9 10""" 11modified by: John Mansfield 12 13documentation added by: Gagandeep Randhawa 14""" 15 16""" 17Class that contains functions related to reinforcement learning algorithms. RL init expects an OpenAI environment (env). 18 19Model-free learning algorithms: Q-Learning and SARSA 20work out of the box with any gymnasium environments that 21have single discrete valued state spaces, like frozen lake. A lambda function 22is required to convert state spaces not in this format. 23""" 24 25import warnings 26 27import numpy as np 28from tqdm.auto import tqdm 29 30from bettermdptools.utils.callbacks import MyCallbacks 31 32 33class RL: 34 def __init__(self, env): 35 self.env = env 36 self.callbacks = MyCallbacks() 37 self.render = False 38 # Explanation of lambda: 39 # def select_action(state, Q, epsilon): 40 # if np.random.random() > epsilon: 41 # max_val = np.max(Q[state]) 42 # indxs_selector = np.isclose(Q[state], max_val) 43 # indxs = np.arange(len(Q[state]))[indxs_selector] 44 # return np.random.choice(indxs) 45 # else: 46 # return np.random.randint(len(Q[state])) 47 self.select_action = ( 48 lambda state, Q, epsilon: np.random.choice( 49 np.arange(len(Q[state]))[np.isclose(Q[state], np.max(Q[state]))] 50 ) 51 if np.random.random() > epsilon 52 else np.random.randint(len(Q[state])) 53 ) 54 55 @staticmethod 56 def decay_schedule( 57 init_value, min_value, decay_ratio, max_steps, log_start=-2, log_base=10 58 ): 59 """ 60 Generates a decay schedule for a given initial value. 61 62 Parameters 63 ---------- 64 init_value : float 65 Initial value of the quantity being decayed. 66 min_value : float 67 Minimum value init_value is allowed to decay to. 68 decay_ratio : float 69 The exponential factor exp(decay_ratio). 70 max_steps : int 71 Max iteration steps for decaying init_value. 72 log_start : float, optional 73 Starting value of the decay sequence, by default -2. 74 log_base : float, optional 75 Base of the log space, by default 10. 76 77 Returns 78 ------- 79 np.ndarray 80 Decay values where values[i] is the value used at i-th step. 81 """ 82 decay_steps = int(max_steps * decay_ratio) 83 rem_steps = max_steps - decay_steps 84 values = np.logspace(log_start, 0, decay_steps, base=log_base, endpoint=True)[ 85 ::-1 86 ] 87 values = (values - values.min()) / (values.max() - values.min()) 88 values = (init_value - min_value) * values + min_value 89 values = np.pad(values, (0, rem_steps), "edge") 90 return values 91 92 def q_learning( 93 self, 94 nS=None, 95 nA=None, 96 convert_state_obs=lambda state: state, 97 gamma=0.99, 98 init_alpha=0.5, 99 min_alpha=0.01, 100 alpha_decay_ratio=0.5, 101 init_epsilon=1.0, 102 min_epsilon=0.1, 103 epsilon_decay_ratio=0.9, 104 n_episodes=10000, 105 ): 106 """ 107 Q-Learning algorithm. 108 109 Parameters 110 ---------- 111 nS : int, optional 112 Number of states, by default None. 113 nA : int, optional 114 Number of available actions, by default None. 115 convert_state_obs : function, optional 116 Converts state into an integer, by default lambda state: state. 117 gamma : float, optional 118 Discount factor, by default 0.99. 119 init_alpha : float, optional 120 Initial learning rate, by default 0.5. 121 min_alpha : float, optional 122 Minimum learning rate, by default 0.01. 123 alpha_decay_ratio : float, optional 124 Decay schedule of learning rate for future iterations, by default 0.5. 
125 init_epsilon : float, optional 126 Initial epsilon value for epsilon greedy strategy, by default 1.0. 127 min_epsilon : float, optional 128 Minimum epsilon, by default 0.1. 129 epsilon_decay_ratio : float, optional 130 Decay schedule of epsilon for future iterations, by default 0.9. 131 n_episodes : int, optional 132 Number of episodes for the agent, by default 10000. 133 134 Returns 135 ------- 136 tuple 137 Q : np.ndarray 138 Final action-value function Q(s,a). 139 V : np.ndarray 140 State values array. 141 pi : dict 142 Policy mapping states to actions. 143 Q_track : np.ndarray 144 Log of Q(s,a) for each episode. 145 pi_track : list 146 Log of complete policy for each episode. 147 rewards : np.ndarray 148 Rewards obtained in each episode. 149 """ 150 if nS is None: 151 nS = self.env.observation_space.n 152 if nA is None: 153 nA = self.env.action_space.n 154 pi_track = [] 155 Q = np.zeros((nS, nA), dtype=np.float32) 156 Q_track = np.zeros((n_episodes, nS, nA), dtype=np.float32) 157 alphas = RL.decay_schedule(init_alpha, min_alpha, alpha_decay_ratio, n_episodes) 158 epsilons = RL.decay_schedule( 159 init_epsilon, min_epsilon, epsilon_decay_ratio, n_episodes 160 ) 161 rewards = np.zeros(n_episodes, dtype=np.float32) 162 for e in tqdm(range(n_episodes), leave=False): 163 self.callbacks.on_episode_begin(self) 164 self.callbacks.on_episode(self, episode=e) 165 state, info = self.env.reset() 166 done = False 167 state = convert_state_obs(state) 168 total_reward = 0 169 while not done: 170 if self.render: 171 warnings.warn( 172 "Occasional render has been deprecated by openAI. Use test_env.py to render." 173 ) 174 action = self.select_action(state, Q, epsilons[e]) 175 next_state, reward, terminated, truncated, _ = self.env.step(action) 176 if truncated: 177 warnings.warn( 178 "Episode was truncated. TD target value may be incorrect." 179 ) 180 done = terminated or truncated 181 self.callbacks.on_env_step(self) 182 next_state = convert_state_obs(next_state) 183 td_target = reward + gamma * Q[next_state].max() * (not done) 184 td_error = td_target - Q[state][action] 185 Q[state][action] = Q[state][action] + alphas[e] * td_error 186 state = next_state 187 total_reward += reward 188 rewards[e] = total_reward 189 Q_track[e] = Q 190 pi_track.append(np.argmax(Q, axis=1)) 191 self.render = False 192 self.callbacks.on_episode_end(self) 193 194 V = np.max(Q, axis=1) 195 196 pi = {s: a for s, a in enumerate(np.argmax(Q, axis=1))} 197 return Q, V, pi, Q_track, pi_track, rewards 198 199 def sarsa( 200 self, 201 nS=None, 202 nA=None, 203 convert_state_obs=lambda state: state, 204 gamma=0.99, 205 init_alpha=0.5, 206 min_alpha=0.01, 207 alpha_decay_ratio=0.5, 208 init_epsilon=1.0, 209 min_epsilon=0.1, 210 epsilon_decay_ratio=0.9, 211 n_episodes=10000, 212 ): 213 """ 214 SARSA algorithm. 215 216 Parameters 217 ---------- 218 nS : int, optional 219 Number of states, by default None. 220 nA : int, optional 221 Number of available actions, by default None. 222 convert_state_obs : function, optional 223 Converts state into an integer, by default lambda state: state. 224 gamma : float, optional 225 Discount factor, by default 0.99. 226 init_alpha : float, optional 227 Initial learning rate, by default 0.5. 228 min_alpha : float, optional 229 Minimum learning rate, by default 0.01. 230 alpha_decay_ratio : float, optional 231 Decay schedule of learning rate for future iterations, by default 0.5. 232 init_epsilon : float, optional 233 Initial epsilon value for epsilon greedy strategy, by default 1.0. 
234 min_epsilon : float, optional 235 Minimum epsilon, by default 0.1. 236 epsilon_decay_ratio : float, optional 237 Decay schedule of epsilon for future iterations, by default 0.9. 238 n_episodes : int, optional 239 Number of episodes for the agent, by default 10000. 240 241 Returns 242 ------- 243 tuple 244 Q : np.ndarray 245 Final action-value function Q(s,a). 246 V : np.ndarray 247 State values array. 248 pi : dict 249 Policy mapping states to actions. 250 Q_track : np.ndarray 251 Log of Q(s,a) for each episode. 252 pi_track : list 253 Log of complete policy for each episode. 254 rewards : np.ndarray 255 Rewards obtained in each episode. 256 """ 257 if nS is None: 258 nS = self.env.observation_space.n 259 if nA is None: 260 nA = self.env.action_space.n 261 pi_track = [] 262 Q = np.zeros((nS, nA), dtype=np.float32) 263 Q_track = np.zeros((n_episodes, nS, nA), dtype=np.float32) 264 rewards = np.zeros(n_episodes, dtype=np.float32) 265 alphas = RL.decay_schedule(init_alpha, min_alpha, alpha_decay_ratio, n_episodes) 266 epsilons = RL.decay_schedule( 267 init_epsilon, min_epsilon, epsilon_decay_ratio, n_episodes 268 ) 269 270 for e in tqdm(range(n_episodes), leave=False): 271 self.callbacks.on_episode_begin(self) 272 self.callbacks.on_episode(self, episode=e) 273 state, info = self.env.reset() 274 done = False 275 state = convert_state_obs(state) 276 action = self.select_action(state, Q, epsilons[e]) 277 total_reward = 0 278 while not done: 279 if self.render: 280 warnings.warn( 281 "Occasional render has been deprecated by openAI. Use test_env.py to render." 282 ) 283 next_state, reward, terminated, truncated, _ = self.env.step(action) 284 if truncated: 285 warnings.warn( 286 "Episode was truncated. TD target value may be incorrect." 287 ) 288 done = terminated or truncated 289 self.callbacks.on_env_step(self) 290 next_state = convert_state_obs(next_state) 291 next_action = self.select_action(next_state, Q, epsilons[e]) 292 td_target = reward + gamma * Q[next_state][next_action] * (not done) 293 td_error = td_target - Q[state][action] 294 Q[state][action] = Q[state][action] + alphas[e] * td_error 295 state, action = next_state, next_action 296 total_reward += reward 297 rewards[e] = total_reward 298 Q_track[e] = Q 299 pi_track.append(np.argmax(Q, axis=1)) 300 self.render = False 301 self.callbacks.on_episode_end(self) 302 303 V = np.max(Q, axis=1) 304 305 pi = {s: a for s, a in enumerate(np.argmax(Q, axis=1))} 306 return Q, V, pi, Q_track, pi_track, rewards
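As an illustration of such a conversion, the sketch below flattens Blackjack's tuple observation (player sum, dealer card, usable ace) into a single integer index. The environment choice and the index arithmetic are assumptions made for this example, not part of the module.

    import gymnasium as gym

    env = gym.make("Blackjack-v1")

    # Flatten (player_sum 0-31, dealer_card 1-10, usable_ace 0/1) into one index in [0, 32 * 10 * 2).
    convert_state_obs = lambda obs: obs[0] * 10 * 2 + (obs[1] - 1) * 2 + int(obs[2])

    n_states = 32 * 10 * 2            # size of the flattened state space to pass as nS
    n_actions = env.action_space.n    # 2 actions in Blackjack: stick or hit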
class RL:
RL(env)
    def __init__(self, env):
        self.env = env
        self.callbacks = MyCallbacks()
        self.render = False
        # Explanation of lambda:
        # def select_action(state, Q, epsilon):
        #     if np.random.random() > epsilon:
        #         max_val = np.max(Q[state])
        #         indxs_selector = np.isclose(Q[state], max_val)
        #         indxs = np.arange(len(Q[state]))[indxs_selector]
        #         return np.random.choice(indxs)
        #     else:
        #         return np.random.randint(len(Q[state]))
        self.select_action = (
            lambda state, Q, epsilon: np.random.choice(
                np.arange(len(Q[state]))[np.isclose(Q[state], np.max(Q[state]))]
            )
            if np.random.random() > epsilon
            else np.random.randint(len(Q[state]))
        )
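A minimal construction sketch, assuming FrozenLake-v1 from gymnasium. Since the constructor only stores the environment, the callbacks, and the epsilon-greedy selector, select_action can be exercised directly with a hand-built Q table:

    import gymnasium as gym
    import numpy as np

    from bettermdptools.algorithms.rl import RL

    env = gym.make("FrozenLake-v1")
    agent = RL(env)

    # Hand-built Q table (16 states x 4 actions) just to exercise the selector.
    Q = np.zeros((env.observation_space.n, env.action_space.n))
    Q[0] = [0.1, 0.5, 0.5, 0.2]   # two tied greedy actions in state 0

    greedy_a = agent.select_action(0, Q, epsilon=0.0)   # action 1 or 2, chosen at random among ties
    random_a = agent.select_action(0, Q, epsilon=1.0)   # uniform over all 4 actions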
@staticmethod
def decay_schedule(init_value, min_value, decay_ratio, max_steps, log_start=-2, log_base=10):
    @staticmethod
    def decay_schedule(
        init_value, min_value, decay_ratio, max_steps, log_start=-2, log_base=10
    ):
        """Generates a decay schedule for a given initial value."""
        decay_steps = int(max_steps * decay_ratio)
        rem_steps = max_steps - decay_steps
        values = np.logspace(log_start, 0, decay_steps, base=log_base, endpoint=True)[
            ::-1
        ]
        values = (values - values.min()) / (values.max() - values.min())
        values = (init_value - min_value) * values + min_value
        values = np.pad(values, (0, rem_steps), "edge")
        return values
Generates a decay schedule for a given initial value.
Parameters
- init_value (float): Initial value of the quantity being decayed.
- min_value (float): Minimum value init_value is allowed to decay to.
- decay_ratio (float): Fraction of max_steps over which the value decays from init_value to min_value; the remaining steps are held at min_value.
- max_steps (int): Max iteration steps for decaying init_value.
- log_start (float, optional): Starting exponent of the log space used to build the decay curve, by default -2.
- log_base (float, optional): Base of the log space, by default 10.
Returns
- np.ndarray: Decay values, where values[i] is the value used at the i-th step.
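For example, with the epsilon defaults that q_learning and sarsa pass to this method, the schedule decays over the first 90% of the steps and then holds the minimum (a minimal sketch; the printed values follow from the code above):

    from bettermdptools.algorithms.rl import RL

    # Epsilon schedule with the defaults that q_learning/sarsa use, over 10 steps.
    eps = RL.decay_schedule(init_value=1.0, min_value=0.1, decay_ratio=0.9, max_steps=10)
    print(eps[0])    # 1.0 -- starts at init_value
    print(eps[-2:])  # [0.1 0.1] -- decays to min_value, then pads the remaining steps with it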
def q_learning(self, nS=None, nA=None, convert_state_obs=lambda state: state, gamma=0.99, init_alpha=0.5, min_alpha=0.01, alpha_decay_ratio=0.5, init_epsilon=1.0, min_epsilon=0.1, epsilon_decay_ratio=0.9, n_episodes=10000):
    def q_learning(
        self,
        nS=None,
        nA=None,
        convert_state_obs=lambda state: state,
        gamma=0.99,
        init_alpha=0.5,
        min_alpha=0.01,
        alpha_decay_ratio=0.5,
        init_epsilon=1.0,
        min_epsilon=0.1,
        epsilon_decay_ratio=0.9,
        n_episodes=10000,
    ):
        """Q-Learning algorithm."""
        if nS is None:
            nS = self.env.observation_space.n
        if nA is None:
            nA = self.env.action_space.n
        pi_track = []
        Q = np.zeros((nS, nA), dtype=np.float32)
        Q_track = np.zeros((n_episodes, nS, nA), dtype=np.float32)
        alphas = RL.decay_schedule(init_alpha, min_alpha, alpha_decay_ratio, n_episodes)
        epsilons = RL.decay_schedule(
            init_epsilon, min_epsilon, epsilon_decay_ratio, n_episodes
        )
        rewards = np.zeros(n_episodes, dtype=np.float32)
        for e in tqdm(range(n_episodes), leave=False):
            self.callbacks.on_episode_begin(self)
            self.callbacks.on_episode(self, episode=e)
            state, info = self.env.reset()
            done = False
            state = convert_state_obs(state)
            total_reward = 0
            while not done:
                if self.render:
                    warnings.warn(
                        "Occasional render has been deprecated by openAI. Use test_env.py to render."
                    )
                action = self.select_action(state, Q, epsilons[e])
                next_state, reward, terminated, truncated, _ = self.env.step(action)
                if truncated:
                    warnings.warn(
                        "Episode was truncated. TD target value may be incorrect."
                    )
                done = terminated or truncated
                self.callbacks.on_env_step(self)
                next_state = convert_state_obs(next_state)
                td_target = reward + gamma * Q[next_state].max() * (not done)
                td_error = td_target - Q[state][action]
                Q[state][action] = Q[state][action] + alphas[e] * td_error
                state = next_state
                total_reward += reward
            rewards[e] = total_reward
            Q_track[e] = Q
            pi_track.append(np.argmax(Q, axis=1))
            self.render = False
            self.callbacks.on_episode_end(self)

        V = np.max(Q, axis=1)

        pi = {s: a for s, a in enumerate(np.argmax(Q, axis=1))}
        return Q, V, pi, Q_track, pi_track, rewards
Q-Learning algorithm.
Parameters
- nS (int, optional): Number of states; if None, it is inferred from env.observation_space.n. By default None.
- nA (int, optional): Number of available actions; if None, it is inferred from env.action_space.n. By default None.
- convert_state_obs (function, optional): Converts state into an integer, by default lambda state: state.
- gamma (float, optional): Discount factor, by default 0.99.
- init_alpha (float, optional): Initial learning rate, by default 0.5.
- min_alpha (float, optional): Minimum learning rate, by default 0.01.
- alpha_decay_ratio (float, optional): Fraction of n_episodes over which the learning rate decays from init_alpha to min_alpha, by default 0.5.
- init_epsilon (float, optional): Initial epsilon value for epsilon greedy strategy, by default 1.0.
- min_epsilon (float, optional): Minimum epsilon, by default 0.1.
- epsilon_decay_ratio (float, optional): Fraction of n_episodes over which epsilon decays from init_epsilon to min_epsilon, by default 0.9.
- n_episodes (int, optional): Number of episodes for the agent, by default 10000.
Returns
- tuple: (Q, V, pi, Q_track, pi_track, rewards)
  - Q (np.ndarray): Final action-value function Q(s,a).
  - V (np.ndarray): State values array.
  - pi (dict): Policy mapping states to actions.
  - Q_track (np.ndarray): Log of Q(s,a) for each episode.
  - pi_track (list): Log of the complete policy for each episode.
  - rewards (np.ndarray): Rewards obtained in each episode.
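A minimal usage sketch, assuming a FrozenLake-v1 environment from gymnasium; everything else uses the documented defaults:

    import gymnasium as gym

    from bettermdptools.algorithms.rl import RL

    env = gym.make("FrozenLake-v1")

    # nS and nA are inferred from the environment's discrete spaces.
    Q, V, pi, Q_track, pi_track, rewards = RL(env).q_learning(n_episodes=10000)

    print(V.shape)                 # (16,) -- one value per FrozenLake state
    print(pi[0])                   # greedy action chosen in the start state
    print(rewards[-100:].mean())   # average reward over the last 100 episodes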
def sarsa(self, nS=None, nA=None, convert_state_obs=lambda state: state, gamma=0.99, init_alpha=0.5, min_alpha=0.01, alpha_decay_ratio=0.5, init_epsilon=1.0, min_epsilon=0.1, epsilon_decay_ratio=0.9, n_episodes=10000):
    def sarsa(
        self,
        nS=None,
        nA=None,
        convert_state_obs=lambda state: state,
        gamma=0.99,
        init_alpha=0.5,
        min_alpha=0.01,
        alpha_decay_ratio=0.5,
        init_epsilon=1.0,
        min_epsilon=0.1,
        epsilon_decay_ratio=0.9,
        n_episodes=10000,
    ):
        """SARSA algorithm."""
        if nS is None:
            nS = self.env.observation_space.n
        if nA is None:
            nA = self.env.action_space.n
        pi_track = []
        Q = np.zeros((nS, nA), dtype=np.float32)
        Q_track = np.zeros((n_episodes, nS, nA), dtype=np.float32)
        rewards = np.zeros(n_episodes, dtype=np.float32)
        alphas = RL.decay_schedule(init_alpha, min_alpha, alpha_decay_ratio, n_episodes)
        epsilons = RL.decay_schedule(
            init_epsilon, min_epsilon, epsilon_decay_ratio, n_episodes
        )

        for e in tqdm(range(n_episodes), leave=False):
            self.callbacks.on_episode_begin(self)
            self.callbacks.on_episode(self, episode=e)
            state, info = self.env.reset()
            done = False
            state = convert_state_obs(state)
            action = self.select_action(state, Q, epsilons[e])
            total_reward = 0
            while not done:
                if self.render:
                    warnings.warn(
                        "Occasional render has been deprecated by openAI. Use test_env.py to render."
                    )
                next_state, reward, terminated, truncated, _ = self.env.step(action)
                if truncated:
                    warnings.warn(
                        "Episode was truncated. TD target value may be incorrect."
                    )
                done = terminated or truncated
                self.callbacks.on_env_step(self)
                next_state = convert_state_obs(next_state)
                next_action = self.select_action(next_state, Q, epsilons[e])
                td_target = reward + gamma * Q[next_state][next_action] * (not done)
                td_error = td_target - Q[state][action]
                Q[state][action] = Q[state][action] + alphas[e] * td_error
                state, action = next_state, next_action
                total_reward += reward
            rewards[e] = total_reward
            Q_track[e] = Q
            pi_track.append(np.argmax(Q, axis=1))
            self.render = False
            self.callbacks.on_episode_end(self)

        V = np.max(Q, axis=1)

        pi = {s: a for s, a in enumerate(np.argmax(Q, axis=1))}
        return Q, V, pi, Q_track, pi_track, rewards
SARSA algorithm.
Parameters
- nS (int, optional): Number of states; if None, it is inferred from env.observation_space.n. By default None.
- nA (int, optional): Number of available actions; if None, it is inferred from env.action_space.n. By default None.
- convert_state_obs (function, optional): Converts state into an integer, by default lambda state: state.
- gamma (float, optional): Discount factor, by default 0.99.
- init_alpha (float, optional): Initial learning rate, by default 0.5.
- min_alpha (float, optional): Minimum learning rate, by default 0.01.
- alpha_decay_ratio (float, optional): Fraction of n_episodes over which the learning rate decays from init_alpha to min_alpha, by default 0.5.
- init_epsilon (float, optional): Initial epsilon value for epsilon greedy strategy, by default 1.0.
- min_epsilon (float, optional): Minimum epsilon, by default 0.1.
- epsilon_decay_ratio (float, optional): Fraction of n_episodes over which epsilon decays from init_epsilon to min_epsilon, by default 0.9.
- n_episodes (int, optional): Number of episodes for the agent, by default 10000.
Returns
- tuple: (Q, V, pi, Q_track, pi_track, rewards)
  - Q (np.ndarray): Final action-value function Q(s,a).
  - V (np.ndarray): State values array.
  - pi (dict): Policy mapping states to actions.
  - Q_track (np.ndarray): Log of Q(s,a) for each episode.
  - pi_track (list): Log of the complete policy for each episode.
  - rewards (np.ndarray): Rewards obtained in each episode.
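SARSA is invoked the same way as q_learning. The sketch below (again assuming FrozenLake-v1) trains both and compares recent episode rewards; the bootstrapping difference is the only algorithmic change, and the comparison is illustrative rather than a performance claim.

    import gymnasium as gym
    import numpy as np

    from bettermdptools.algorithms.rl import RL

    env = gym.make("FrozenLake-v1")
    agent = RL(env)

    # SARSA bootstraps with Q[next_state][next_action] (the action actually taken);
    # Q-learning bootstraps with max_a Q[next_state][a].
    *_, sarsa_rewards = agent.sarsa(n_episodes=5000)
    *_, ql_rewards = agent.q_learning(n_episodes=5000)

    print("SARSA mean reward, last 500 episodes:     ", np.mean(sarsa_rewards[-500:]))
    print("Q-learning mean reward, last 500 episodes:", np.mean(ql_rewards[-500:]))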