algorithms.rl
Author: Miguel Morales
BSD 3-Clause License
Copyright (c) 2018, Miguel Morales. All rights reserved.
https://github.com/mimoralea/gdrl/blob/master/LICENSE
1""" 2Author: Miguel Morales 3BSD 3-Clause License 4 5Copyright (c) 2018, Miguel Morales 6All rights reserved. 7https://github.com/mimoralea/gdrl/blob/master/LICENSE 8""" 9 10""" 11modified by: John Mansfield 12 13documentation added by: Gagandeep Randhawa 14""" 15 16""" 17Class that contains functions related to reinforcement learning algorithms. RL init expects an OpenAI environment (env). 18 19Model-free learning algorithms: Q-Learning and SARSA 20work out of the box with any gymnasium environments that 21have single discrete valued state spaces, like frozen lake. A lambda function 22is required to convert state spaces not in this format. 23""" 24 25import numpy as np 26from tqdm.auto import tqdm 27from bettermdptools.utils.callbacks import MyCallbacks 28import warnings 29 30 31class RL: 32 def __init__(self, env): 33 self.env = env 34 self.callbacks = MyCallbacks() 35 self.render = False 36 # Explanation of lambda: 37 # def select_action(state, Q, epsilon): 38 # if np.random.random() > epsilon: 39 # max_val = np.max(Q[state]) 40 # indxs_selector = np.isclose(Q[state], max_val) 41 # indxs = np.arange(len(Q[state]))[indxs_selector] 42 # return np.random.choice(indxs) 43 # else: 44 # return np.random.randint(len(Q[state])) 45 self.select_action = lambda state, Q, epsilon: \ 46 np.random.choice(np.arange(len(Q[state]))[np.isclose(Q[state], np.max(Q[state]))]) \ 47 if np.random.random() > epsilon \ 48 else np.random.randint(len(Q[state])) 49 50 @staticmethod 51 def decay_schedule(init_value, min_value, decay_ratio, max_steps, log_start=-2, log_base=10): 52 """ 53 Parameters 54 ---------------------------- 55 init_value {float}: 56 Initial value of the quantity being decayed 57 58 min_value {float}: 59 Minimum value init_value is allowed to decay to 60 61 decay_ratio {float}: 62 The exponential factor exp(decay_ratio). 63 Updated decayed value is calculated as 64 65 max_steps {int}: 66 Max iteration steps for decaying init_value 67 68 log_start {array-like}, default = -2: 69 Starting value of the decay sequence. 70 Default value starts it at 0.01 71 72 log_base {array-like}, default = 10: 73 Base of the log space. 74 75 76 Returns 77 ---------------------------- 78 values {array-like}, shape(max_steps): 79 Decay values where values[i] is the value used at i-th step 80 """ 81 decay_steps = int(max_steps * decay_ratio) 82 rem_steps = max_steps - decay_steps 83 values = np.logspace(log_start, 0, decay_steps, base=log_base, endpoint=True)[::-1] 84 values = (values - values.min()) / (values.max() - values.min()) 85 values = (init_value - min_value) * values + min_value 86 values = np.pad(values, (0, rem_steps), 'edge') 87 return values 88 89 def q_learning(self, 90 nS=None, 91 nA=None, 92 convert_state_obs=lambda state: state, 93 gamma=.99, 94 init_alpha=0.5, 95 min_alpha=0.01, 96 alpha_decay_ratio=0.5, 97 init_epsilon=1.0, 98 min_epsilon=0.1, 99 epsilon_decay_ratio=0.9, 100 n_episodes=10000): 101 """ 102 Parameters 103 ---------------------------- 104 nS {int}: 105 Number of states 106 107 nA {int}: 108 Number of available actions 109 110 convert_state_obs {lambda}: 111 Converts state into an integer 112 113 gamma {float}, default = 0.99: 114 Discount factor 115 116 init_alpha {float}, default = 0.5: 117 Learning rate 118 119 min_alpha {float}, default = 0.01: 120 Minimum learning rate 121 122 alpha_decay_ratio {float}, default = 0.5: 123 Decay schedule of learing rate for future iterations 124 125 init_epsilon {float}, default = 1.0: 126 Initial epsilon value for epsilon greedy strategy. 
127 Chooses max(Q) over available actions with probability 1-epsilon. 128 129 min_epsilon {float}, default = 0.1: 130 Minimum epsilon. Used to balance exploration in later stages. 131 132 epsilon_decay_ratio {float}, default = 0.9: 133 Decay schedule of epsilon for future iterations 134 135 n_episodes {int}, default = 10000: 136 Number of episodes for the agent 137 138 139 Returns 140 ---------------------------- 141 Q {numpy array}, shape(nS, nA): 142 Final action-value function Q(s,a) 143 144 pi {lambda}, input state value, output action value: 145 Policy mapping states to actions. 146 147 V {numpy array}, shape(nS): 148 State values array 149 150 Q_track {numpy array}, shape(n_episodes, nS, nA): 151 Log of Q(s,a) for each episode 152 153 pi_track {list}, len(n_episodes): 154 Log of complete policy for each episode 155 """ 156 if nS is None: 157 nS=self.env.observation_space.n 158 if nA is None: 159 nA=self.env.action_space.n 160 pi_track = [] 161 Q = np.zeros((nS, nA), dtype=np.float32) 162 Q_track = np.zeros((n_episodes, nS, nA), dtype=np.float32) 163 alphas = RL.decay_schedule(init_alpha, 164 min_alpha, 165 alpha_decay_ratio, 166 n_episodes) 167 epsilons = RL.decay_schedule(init_epsilon, 168 min_epsilon, 169 epsilon_decay_ratio, 170 n_episodes) 171 rewards = np.zeros(n_episodes, dtype=np.float32) 172 for e in tqdm(range(n_episodes), leave=False): 173 self.callbacks.on_episode_begin(self) 174 self.callbacks.on_episode(self, episode=e) 175 state, info = self.env.reset() 176 done = False 177 state = convert_state_obs(state) 178 total_reward = 0 179 while not done: 180 if self.render: 181 warnings.warn("Occasional render has been deprecated by openAI. Use test_env.py to render.") 182 action = self.select_action(state, Q, epsilons[e]) 183 next_state, reward, terminated, truncated, _ = self.env.step(action) 184 if truncated: 185 warnings.warn("Episode was truncated. TD target value may be incorrect.") 186 done = terminated or truncated 187 self.callbacks.on_env_step(self) 188 next_state = convert_state_obs(next_state) 189 td_target = reward + gamma * Q[next_state].max() * (not done) 190 td_error = td_target - Q[state][action] 191 Q[state][action] = Q[state][action] + alphas[e] * td_error 192 state = next_state 193 total_reward += reward 194 rewards[e] = total_reward 195 Q_track[e] = Q 196 pi_track.append(np.argmax(Q, axis=1)) 197 self.render = False 198 self.callbacks.on_episode_end(self) 199 200 V = np.max(Q, axis=1) 201 202 pi = {s: a for s, a in enumerate(np.argmax(Q, axis=1))} 203 return Q, V, pi, Q_track, pi_track, rewards 204 205 def sarsa(self, 206 nS=None, 207 nA=None, 208 convert_state_obs=lambda state: state, 209 gamma=.99, 210 init_alpha=0.5, 211 min_alpha=0.01, 212 alpha_decay_ratio=0.5, 213 init_epsilon=1.0, 214 min_epsilon=0.1, 215 epsilon_decay_ratio=0.9, 216 n_episodes=10000): 217 """ 218 Parameters 219 ---------------------------- 220 nS {int}: 221 Number of states 222 223 nA {int}: 224 Number of available actions 225 226 convert_state_obs {lambda}: 227 Converts state into an integer 228 229 gamma {float}, default = 0.99: 230 Discount factor 231 232 init_alpha {float}, default = 0.5: 233 Learning rate 234 235 min_alpha {float}, default = 0.01: 236 Minimum learning rate 237 238 alpha_decay_ratio {float}, default = 0.5: 239 Decay schedule of learing rate for future iterations 240 241 init_epsilon {float}, default = 1.0: 242 Initial epsilon value for epsilon greedy strategy. 243 Chooses max(Q) over available actions with probability 1-epsilon. 
244 245 min_epsilon {float}, default = 0.1: 246 Minimum epsilon. Used to balance exploration in later stages. 247 248 epsilon_decay_ratio {float}, default = 0.9: 249 Decay schedule of epsilon for future iterations 250 251 n_episodes {int}, default = 10000: 252 Number of episodes for the agent 253 254 255 Returns 256 ---------------------------- 257 Q {numpy array}, shape(nS, nA): 258 Final action-value function Q(s,a) 259 260 pi {lambda}, input state value, output action value: 261 Policy mapping states to actions. 262 263 V {numpy array}, shape(nS): 264 State values array 265 266 Q_track {numpy array}, shape(n_episodes, nS, nA): 267 Log of Q(s,a) for each episode 268 269 pi_track {list}, len(n_episodes): 270 Log of complete policy for each episode 271 """ 272 if nS is None: 273 nS = self.env.observation_space.n 274 if nA is None: 275 nA = self.env.action_space.n 276 pi_track = [] 277 Q = np.zeros((nS, nA), dtype=np.float32) 278 Q_track = np.zeros((n_episodes, nS, nA), dtype=np.float32) 279 rewards = np.zeros(n_episodes, dtype=np.float32) 280 alphas = RL.decay_schedule(init_alpha, 281 min_alpha, 282 alpha_decay_ratio, 283 n_episodes) 284 epsilons = RL.decay_schedule(init_epsilon, 285 min_epsilon, 286 epsilon_decay_ratio, 287 n_episodes) 288 289 for e in tqdm(range(n_episodes), leave=False): 290 self.callbacks.on_episode_begin(self) 291 self.callbacks.on_episode(self, episode=e) 292 state, info = self.env.reset() 293 done = False 294 state = convert_state_obs(state) 295 action = self.select_action(state, Q, epsilons[e]) 296 total_reward = 0 297 while not done: 298 if self.render: 299 warnings.warn("Occasional render has been deprecated by openAI. Use test_env.py to render.") 300 next_state, reward, terminated, truncated, _ = self.env.step(action) 301 if truncated: 302 warnings.warn("Episode was truncated. TD target value may be incorrect.") 303 done = terminated or truncated 304 self.callbacks.on_env_step(self) 305 next_state = convert_state_obs(next_state) 306 next_action = self.select_action(next_state, Q, epsilons[e]) 307 td_target = reward + gamma * Q[next_state][next_action] * (not done) 308 td_error = td_target - Q[state][action] 309 Q[state][action] = Q[state][action] + alphas[e] * td_error 310 state, action = next_state, next_action 311 total_reward += reward 312 rewards[e] = total_reward 313 Q_track[e] = Q 314 pi_track.append(np.argmax(Q, axis=1)) 315 self.render = False 316 self.callbacks.on_episode_end(self) 317 318 V = np.max(Q, axis=1) 319 320 pi = {s: a for s, a in enumerate(np.argmax(Q, axis=1))} 321 return Q, V, pi, Q_track, pi_track, rewards
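For example, gymnasium's Blackjack-v1 returns a tuple observation rather than a single integer. Below is a minimal sketch of such a conversion lambda, assuming the module is importable as bettermdptools.algorithms.rl; the specific encoding is illustrative and not part of this module.

    import gymnasium as gym
    from bettermdptools.algorithms.rl import RL  # assumed import path

    # Blackjack-v1 observations are (player_sum, dealer_card, usable_ace) tuples;
    # flatten them into one integer so Q can be indexed as Q[state][action].
    convert_state_obs = lambda state: (state[0] * 11 + state[1]) * 2 + int(state[2])

    env = gym.make("Blackjack-v1")
    # 32 * 11 * 2 = 704 covers every possible encoded index.
    Q, V, pi, Q_track, pi_track, rewards = RL(env).q_learning(
        nS=704, nA=env.action_space.n, convert_state_obs=convert_state_obs)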
class RL:
RL(env)
def __init__(self, env):
    self.env = env
    self.callbacks = MyCallbacks()
    self.render = False
    # Explanation of lambda:
    # def select_action(state, Q, epsilon):
    #     if np.random.random() > epsilon:
    #         max_val = np.max(Q[state])
    #         indxs_selector = np.isclose(Q[state], max_val)
    #         indxs = np.arange(len(Q[state]))[indxs_selector]
    #         return np.random.choice(indxs)
    #     else:
    #         return np.random.randint(len(Q[state]))
    self.select_action = lambda state, Q, epsilon: \
        np.random.choice(np.arange(len(Q[state]))[np.isclose(Q[state], np.max(Q[state]))]) \
        if np.random.random() > epsilon \
        else np.random.randint(len(Q[state]))
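A quick illustration of the epsilon-greedy behavior of select_action, as a sketch with a hand-built Q-table (import path assumed):

    import gymnasium as gym
    import numpy as np
    from bettermdptools.algorithms.rl import RL  # assumed import path

    agent = RL(gym.make("FrozenLake-v1"))
    Q = np.zeros((16, 4))
    Q[0] = [0.1, 0.9, 0.9, 0.0]  # actions 1 and 2 tie for the max in state 0
    # epsilon=0.0: the greedy branch fires and ties are broken uniformly at random,
    # so this returns 1 or 2 with equal probability.
    print(agent.select_action(0, Q, 0.0))
    # epsilon=1.0: the exploratory branch fires, returning a uniform random action.
    print(agent.select_action(0, Q, 1.0))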
@staticmethod
def decay_schedule(init_value, min_value, decay_ratio, max_steps, log_start=-2, log_base=10):
    decay_steps = int(max_steps * decay_ratio)
    rem_steps = max_steps - decay_steps
    values = np.logspace(log_start, 0, decay_steps, base=log_base, endpoint=True)[::-1]
    values = (values - values.min()) / (values.max() - values.min())
    values = (init_value - min_value) * values + min_value
    values = np.pad(values, (0, rem_steps), 'edge')
    return values
Parameters
- init_value {float}:: Initial value of the quantity being decayed
- min_value {float}:: Minimum value init_value is allowed to decay to
- decay_ratio {float}:: Fraction of max_steps over which the value decays from init_value to min_value; the remaining steps hold min_value
- max_steps {int}:: Max iteration steps for decaying init_value
- log_start {float}, default = -2:: Exponent at which the log-spaced decay sequence starts. The default of -2 starts it at 0.01
- log_base {float}, default = 10:: Base of the log space.
Returns
- values {array-like}, shape(max_steps):: Decay values where values[i] is the value used at i-th step
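A minimal sketch of how a schedule produced by this method behaves (import path and hyperparameters assumed for illustration):

    import numpy as np
    from bettermdptools.algorithms.rl import RL  # assumed import path

    # Decay epsilon from 1.0 to 0.1 over the first half of 10 steps,
    # then hold it at 0.1 for the remaining steps.
    epsilons = RL.decay_schedule(init_value=1.0, min_value=0.1,
                                 decay_ratio=0.5, max_steps=10)
    print(np.round(epsilons, 3))
    # The first entry is 1.0, the entry at index 4 is 0.1, and indices 5-9 stay at 0.1.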
def q_learning(self, nS=None, nA=None, convert_state_obs=lambda state: state,
               gamma=0.99, init_alpha=0.5, min_alpha=0.01, alpha_decay_ratio=0.5,
               init_epsilon=1.0, min_epsilon=0.1, epsilon_decay_ratio=0.9,
               n_episodes=10000):
    if nS is None:
        nS = self.env.observation_space.n
    if nA is None:
        nA = self.env.action_space.n
    pi_track = []
    Q = np.zeros((nS, nA), dtype=np.float32)
    Q_track = np.zeros((n_episodes, nS, nA), dtype=np.float32)
    alphas = RL.decay_schedule(init_alpha,
                               min_alpha,
                               alpha_decay_ratio,
                               n_episodes)
    epsilons = RL.decay_schedule(init_epsilon,
                                 min_epsilon,
                                 epsilon_decay_ratio,
                                 n_episodes)
    rewards = np.zeros(n_episodes, dtype=np.float32)
    for e in tqdm(range(n_episodes), leave=False):
        self.callbacks.on_episode_begin(self)
        self.callbacks.on_episode(self, episode=e)
        state, info = self.env.reset()
        done = False
        state = convert_state_obs(state)
        total_reward = 0
        while not done:
            if self.render:
                warnings.warn("Occasional render has been deprecated by openAI. Use test_env.py to render.")
            action = self.select_action(state, Q, epsilons[e])
            next_state, reward, terminated, truncated, _ = self.env.step(action)
            if truncated:
                warnings.warn("Episode was truncated. TD target value may be incorrect.")
            done = terminated or truncated
            self.callbacks.on_env_step(self)
            next_state = convert_state_obs(next_state)
            # Off-policy TD target: bootstrap from the greedy (max) action value of the next state.
            td_target = reward + gamma * Q[next_state].max() * (not done)
            td_error = td_target - Q[state][action]
            Q[state][action] = Q[state][action] + alphas[e] * td_error
            state = next_state
            total_reward += reward
        rewards[e] = total_reward
        Q_track[e] = Q
        pi_track.append(np.argmax(Q, axis=1))
        self.render = False
        self.callbacks.on_episode_end(self)

    V = np.max(Q, axis=1)

    pi = {s: a for s, a in enumerate(np.argmax(Q, axis=1))}
    return Q, V, pi, Q_track, pi_track, rewards
Parameters
- nS {int}:: Number of states
- nA {int}:: Number of available actions
- convert_state_obs {lambda}:: Converts state into an integer
- gamma {float}, default = 0.99:: Discount factor
- init_alpha {float}, default = 0.5:: Learning rate
- min_alpha {float}, default = 0.01:: Minimum learning rate
- alpha_decay_ratio {float}, default = 0.5:: Fraction of n_episodes over which the learning rate decays from init_alpha to min_alpha
- init_epsilon {float}, default = 1.0:: Initial epsilon value for epsilon greedy strategy. Chooses max(Q) over available actions with probability 1-epsilon.
- min_epsilon {float}, default = 0.1:: Minimum epsilon. Used to balance exploration in later stages.
- epsilon_decay_ratio {float}, default = 0.9:: Fraction of n_episodes over which epsilon decays from init_epsilon to min_epsilon
- n_episodes {int}, default = 10000:: Number of episodes for the agent
Returns
- Q {numpy array}, shape(nS, nA):: Final action-value function Q(s,a)
- V {numpy array}, shape(nS):: State values array
- pi {dict}, maps state {int} to action {int}:: Greedy policy mapping states to actions
- Q_track {numpy array}, shape(n_episodes, nS, nA):: Log of Q(s,a) for each episode
- pi_track {list}, len(n_episodes):: Log of the greedy policy for each episode
- rewards {numpy array}, shape(n_episodes):: Total reward collected in each episode
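A minimal end-to-end sketch on FrozenLake (import path and hyperparameters are assumptions for illustration):

    import gymnasium as gym
    from bettermdptools.algorithms.rl import RL  # assumed import path

    env = gym.make("FrozenLake-v1", map_name="4x4")
    # nS and nA default to env.observation_space.n and env.action_space.n.
    Q, V, pi, Q_track, pi_track, rewards = RL(env).q_learning(gamma=0.99, n_episodes=10000)

    print("Greedy action in the start state:", pi[0])
    print("Mean reward over the last 1000 episodes:", rewards[-1000:].mean())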
def sarsa(self, nS=None, nA=None, convert_state_obs=lambda state: state,
          gamma=0.99, init_alpha=0.5, min_alpha=0.01, alpha_decay_ratio=0.5,
          init_epsilon=1.0, min_epsilon=0.1, epsilon_decay_ratio=0.9,
          n_episodes=10000):
    if nS is None:
        nS = self.env.observation_space.n
    if nA is None:
        nA = self.env.action_space.n
    pi_track = []
    Q = np.zeros((nS, nA), dtype=np.float32)
    Q_track = np.zeros((n_episodes, nS, nA), dtype=np.float32)
    rewards = np.zeros(n_episodes, dtype=np.float32)
    alphas = RL.decay_schedule(init_alpha,
                               min_alpha,
                               alpha_decay_ratio,
                               n_episodes)
    epsilons = RL.decay_schedule(init_epsilon,
                                 min_epsilon,
                                 epsilon_decay_ratio,
                                 n_episodes)

    for e in tqdm(range(n_episodes), leave=False):
        self.callbacks.on_episode_begin(self)
        self.callbacks.on_episode(self, episode=e)
        state, info = self.env.reset()
        done = False
        state = convert_state_obs(state)
        action = self.select_action(state, Q, epsilons[e])
        total_reward = 0
        while not done:
            if self.render:
                warnings.warn("Occasional render has been deprecated by openAI. Use test_env.py to render.")
            next_state, reward, terminated, truncated, _ = self.env.step(action)
            if truncated:
                warnings.warn("Episode was truncated. TD target value may be incorrect.")
            done = terminated or truncated
            self.callbacks.on_env_step(self)
            next_state = convert_state_obs(next_state)
            next_action = self.select_action(next_state, Q, epsilons[e])
            # On-policy TD target: bootstrap from the action actually selected next,
            # rather than the max over Q[next_state] as in q_learning.
            td_target = reward + gamma * Q[next_state][next_action] * (not done)
            td_error = td_target - Q[state][action]
            Q[state][action] = Q[state][action] + alphas[e] * td_error
            state, action = next_state, next_action
            total_reward += reward
        rewards[e] = total_reward
        Q_track[e] = Q
        pi_track.append(np.argmax(Q, axis=1))
        self.render = False
        self.callbacks.on_episode_end(self)

    V = np.max(Q, axis=1)

    pi = {s: a for s, a in enumerate(np.argmax(Q, axis=1))}
    return Q, V, pi, Q_track, pi_track, rewards
Parameters
- nS {int}:: Number of states
- nA {int}:: Number of available actions
- convert_state_obs {lambda}:: Converts state into an integer
- gamma {float}, default = 0.99:: Discount factor
- init_alpha {float}, default = 0.5:: Learning rate
- min_alpha {float}, default = 0.01:: Minimum learning rate
- alpha_decay_ratio {float}, default = 0.5:: Fraction of n_episodes over which the learning rate decays from init_alpha to min_alpha
- init_epsilon {float}, default = 1.0:: Initial epsilon value for epsilon greedy strategy. Chooses max(Q) over available actions with probability 1-epsilon.
- min_epsilon {float}, default = 0.1:: Minimum epsilon. Used to balance exploration in later stages.
- epsilon_decay_ratio {float}, default = 0.9:: Fraction of n_episodes over which epsilon decays from init_epsilon to min_epsilon
- n_episodes {int}, default = 10000:: Number of episodes for the agent
Returns
- Q {numpy array}, shape(nS, nA):: Final action-value function Q(s,a)
- V {numpy array}, shape(nS):: State values array
- pi {dict}, maps state {int} to action {int}:: Greedy policy mapping states to actions
- Q_track {numpy array}, shape(n_episodes, nS, nA):: Log of Q(s,a) for each episode
- pi_track {list}, len(n_episodes):: Log of the greedy policy for each episode
- rewards {numpy array}, shape(n_episodes):: Total reward collected in each episode
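A minimal sketch comparing the two learners on the same environment (import path assumed; results vary from run to run):

    import gymnasium as gym
    from bettermdptools.algorithms.rl import RL  # assumed import path

    env = gym.make("FrozenLake-v1", map_name="4x4")
    agent = RL(env)

    # Same hyperparameters, different TD targets: off-policy max() vs. on-policy next action.
    *_, rewards_q = agent.q_learning(n_episodes=10000)
    *_, rewards_s = agent.sarsa(n_episodes=10000)
    print("Q-learning mean reward, last 1000 episodes:", rewards_q[-1000:].mean())
    print("SARSA mean reward, last 1000 episodes:     ", rewards_s[-1000:].mean())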