algorithms.rl

Author: Miguel Morales
BSD 3-Clause License

Copyright (c) 2018, Miguel Morales. All rights reserved.
https://github.com/mimoralea/gdrl/blob/master/LICENSE

  1"""
  2Author: Miguel Morales
  3BSD 3-Clause License
  4
  5Copyright (c) 2018, Miguel Morales
  6All rights reserved.
  7https://github.com/mimoralea/gdrl/blob/master/LICENSE
  8"""
  9
 10"""
 11modified by: John Mansfield
 12
 13documentation added by: Gagandeep Randhawa
 14"""
 15
 16"""
 17Class that contains functions related to reinforcement learning algorithms. RL init expects an OpenAI environment (env).
 18
 19Model-free learning algorithms: Q-Learning and SARSA
 20work out of the box with any gymnasium environments that 
 21have single discrete valued state spaces, like frozen lake. A lambda function 
 22is required to convert state spaces not in this format.
 23"""
 24
import numpy as np
from tqdm.auto import tqdm
from bettermdptools.utils.callbacks import MyCallbacks
import warnings


class RL:
    def __init__(self, env):
        self.env = env
        self.callbacks = MyCallbacks()
        self.render = False
        # Explanation of lambda:
        # def select_action(state, Q, epsilon):
        #   if np.random.random() > epsilon:
        #       max_val = np.max(Q[state])
        #       indxs_selector = np.isclose(Q[state], max_val)
        #       indxs = np.arange(len(Q[state]))[indxs_selector]
        #       return np.random.choice(indxs)
        #   else:
        #       return np.random.randint(len(Q[state]))
        self.select_action = lambda state, Q, epsilon: \
            np.random.choice(np.arange(len(Q[state]))[np.isclose(Q[state], np.max(Q[state]))]) \
            if np.random.random() > epsilon \
            else np.random.randint(len(Q[state]))

    @staticmethod
    def decay_schedule(init_value, min_value, decay_ratio, max_steps, log_start=-2, log_base=10):
        """
        Parameters
        ----------------------------
        init_value {float}:
            Initial value of the quantity being decayed

        min_value {float}:
            Minimum value init_value is allowed to decay to

        decay_ratio {float}:
            Fraction of max_steps over which init_value decays to min_value.
            The remaining steps are held at min_value.

        max_steps {int}:
            Max iteration steps for decaying init_value

        log_start {float}, default = -2:
            Starting exponent of the log-spaced decay sequence.
            The default starts the decay at 0.01 (before rescaling)

        log_base {float}, default = 10:
            Base of the log space.


        Returns
        ----------------------------
        values {array-like}, shape(max_steps):
            Decay values where values[i] is the value used at i-th step
        """
        decay_steps = int(max_steps * decay_ratio)
        rem_steps = max_steps - decay_steps
        values = np.logspace(log_start, 0, decay_steps, base=log_base, endpoint=True)[::-1]
        values = (values - values.min()) / (values.max() - values.min())
        values = (init_value - min_value) * values + min_value
        values = np.pad(values, (0, rem_steps), 'edge')
        return values

    def q_learning(self,
                   nS=None,
                   nA=None,
                   convert_state_obs=lambda state: state,
                   gamma=.99,
                   init_alpha=0.5,
                   min_alpha=0.01,
                   alpha_decay_ratio=0.5,
                   init_epsilon=1.0,
                   min_epsilon=0.1,
                   epsilon_decay_ratio=0.9,
                   n_episodes=10000):
        """
        Parameters
        ----------------------------
        nS {int}:
            Number of states

        nA {int}:
            Number of available actions

        convert_state_obs {lambda}:
            Converts state into an integer

        gamma {float}, default = 0.99:
            Discount factor

        init_alpha {float}, default = 0.5:
            Initial learning rate

        min_alpha {float}, default = 0.01:
            Minimum learning rate

        alpha_decay_ratio {float}, default = 0.5:
            Decay schedule of learning rate for future iterations

        init_epsilon {float}, default = 1.0:
            Initial epsilon value for epsilon greedy strategy.
            Chooses max(Q) over available actions with probability 1-epsilon.

        min_epsilon {float}, default = 0.1:
            Minimum epsilon. Used to balance exploration in later stages.

        epsilon_decay_ratio {float}, default = 0.9:
            Decay schedule of epsilon for future iterations

        n_episodes {int}, default = 10000:
            Number of episodes for the agent


        Returns
        ----------------------------
        Q {numpy array}, shape(nS, nA):
            Final action-value function Q(s,a)

        V {numpy array}, shape(nS):
            State values array

        pi {dict}, maps state index to action index:
            Greedy policy derived from Q

        Q_track {numpy array}, shape(n_episodes, nS, nA):
            Log of Q(s,a) for each episode

        pi_track {list}, len(n_episodes):
            Log of complete policy for each episode

        rewards {numpy array}, shape(n_episodes):
            Total reward obtained in each episode
        """
        if nS is None:
            nS = self.env.observation_space.n
        if nA is None:
            nA = self.env.action_space.n
        pi_track = []
        Q = np.zeros((nS, nA), dtype=np.float32)
        Q_track = np.zeros((n_episodes, nS, nA), dtype=np.float32)
        alphas = RL.decay_schedule(init_alpha,
                                   min_alpha,
                                   alpha_decay_ratio,
                                   n_episodes)
        epsilons = RL.decay_schedule(init_epsilon,
                                     min_epsilon,
                                     epsilon_decay_ratio,
                                     n_episodes)
        rewards = np.zeros(n_episodes, dtype=np.float32)
        for e in tqdm(range(n_episodes), leave=False):
            self.callbacks.on_episode_begin(self)
            self.callbacks.on_episode(self, episode=e)
            state, info = self.env.reset()
            done = False
            state = convert_state_obs(state)
            total_reward = 0
            while not done:
                if self.render:
                    warnings.warn("Occasional render has been deprecated by OpenAI. Use test_env.py to render.")
                action = self.select_action(state, Q, epsilons[e])
                next_state, reward, terminated, truncated, _ = self.env.step(action)
                if truncated:
                    warnings.warn("Episode was truncated. TD target value may be incorrect.")
                done = terminated or truncated
                self.callbacks.on_env_step(self)
                next_state = convert_state_obs(next_state)
                td_target = reward + gamma * Q[next_state].max() * (not done)
                td_error = td_target - Q[state][action]
                Q[state][action] = Q[state][action] + alphas[e] * td_error
                state = next_state
                total_reward += reward
            rewards[e] = total_reward
            Q_track[e] = Q
            pi_track.append(np.argmax(Q, axis=1))
            self.render = False
            self.callbacks.on_episode_end(self)

        V = np.max(Q, axis=1)

        pi = {s: a for s, a in enumerate(np.argmax(Q, axis=1))}
        return Q, V, pi, Q_track, pi_track, rewards

    def sarsa(self,
              nS=None,
              nA=None,
              convert_state_obs=lambda state: state,
              gamma=.99,
              init_alpha=0.5,
              min_alpha=0.01,
              alpha_decay_ratio=0.5,
              init_epsilon=1.0,
              min_epsilon=0.1,
              epsilon_decay_ratio=0.9,
              n_episodes=10000):
        """
        Parameters
        ----------------------------
        nS {int}:
            Number of states

        nA {int}:
            Number of available actions

        convert_state_obs {lambda}:
            Converts state into an integer

        gamma {float}, default = 0.99:
            Discount factor

        init_alpha {float}, default = 0.5:
            Initial learning rate

        min_alpha {float}, default = 0.01:
            Minimum learning rate

        alpha_decay_ratio {float}, default = 0.5:
            Decay schedule of learning rate for future iterations

        init_epsilon {float}, default = 1.0:
            Initial epsilon value for epsilon greedy strategy.
            Chooses max(Q) over available actions with probability 1-epsilon.

        min_epsilon {float}, default = 0.1:
            Minimum epsilon. Used to balance exploration in later stages.

        epsilon_decay_ratio {float}, default = 0.9:
            Decay schedule of epsilon for future iterations

        n_episodes {int}, default = 10000:
            Number of episodes for the agent


        Returns
        ----------------------------
        Q {numpy array}, shape(nS, nA):
            Final action-value function Q(s,a)

        V {numpy array}, shape(nS):
            State values array

        pi {dict}, maps state index to action index:
            Greedy policy derived from Q

        Q_track {numpy array}, shape(n_episodes, nS, nA):
            Log of Q(s,a) for each episode

        pi_track {list}, len(n_episodes):
            Log of complete policy for each episode

        rewards {numpy array}, shape(n_episodes):
            Total reward obtained in each episode
        """
        if nS is None:
            nS = self.env.observation_space.n
        if nA is None:
            nA = self.env.action_space.n
        pi_track = []
        Q = np.zeros((nS, nA), dtype=np.float32)
        Q_track = np.zeros((n_episodes, nS, nA), dtype=np.float32)
        rewards = np.zeros(n_episodes, dtype=np.float32)
        alphas = RL.decay_schedule(init_alpha,
                                   min_alpha,
                                   alpha_decay_ratio,
                                   n_episodes)
        epsilons = RL.decay_schedule(init_epsilon,
                                     min_epsilon,
                                     epsilon_decay_ratio,
                                     n_episodes)

        for e in tqdm(range(n_episodes), leave=False):
            self.callbacks.on_episode_begin(self)
            self.callbacks.on_episode(self, episode=e)
            state, info = self.env.reset()
            done = False
            state = convert_state_obs(state)
            action = self.select_action(state, Q, epsilons[e])
            total_reward = 0
            while not done:
                if self.render:
                    warnings.warn("Occasional render has been deprecated by OpenAI. Use test_env.py to render.")
                next_state, reward, terminated, truncated, _ = self.env.step(action)
                if truncated:
                    warnings.warn("Episode was truncated. TD target value may be incorrect.")
                done = terminated or truncated
                self.callbacks.on_env_step(self)
                next_state = convert_state_obs(next_state)
                next_action = self.select_action(next_state, Q, epsilons[e])
                td_target = reward + gamma * Q[next_state][next_action] * (not done)
                td_error = td_target - Q[state][action]
                Q[state][action] = Q[state][action] + alphas[e] * td_error
                state, action = next_state, next_action
                total_reward += reward
            rewards[e] = total_reward
            Q_track[e] = Q
            pi_track.append(np.argmax(Q, axis=1))
            self.render = False
            self.callbacks.on_episode_end(self)

        V = np.max(Q, axis=1)

        pi = {s: a for s, a in enumerate(np.argmax(Q, axis=1))}
        return Q, V, pi, Q_track, pi_track, rewards
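
The module docstring notes that a lambda is needed when observations are not already single integers. Below is a minimal sketch of that conversion for gymnasium's Blackjack-v1, whose observations are (player_sum, dealer_card, usable_ace) tuples over Discrete(32) x Discrete(11) x Discrete(2); the flattening and the import path are illustrative assumptions, not part of this module.

import gymnasium as gym
from bettermdptools.algorithms.rl import RL  # import path assumed from the module name algorithms.rl

blackjack_env = gym.make("Blackjack-v1")
# Flatten the 3-part tuple observation into a single index in [0, 32*11*2).
convert = lambda obs: obs[0] * 11 * 2 + obs[1] * 2 + obs[2]

agent = RL(blackjack_env)
Q, V, pi, Q_track, pi_track, rewards = agent.q_learning(
    nS=32 * 11 * 2,                      # observation_space is a Tuple, so nS must be passed explicitly
    nA=blackjack_env.action_space.n,
    convert_state_obs=convert,
    n_episodes=5000,
)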
RL(env)
Attributes
  • env:: The gymnasium environment passed to the constructor
  • callbacks:: MyCallbacks instance whose hooks run at episode begin/end and at each env step
  • render:: Render flag; defaults to False and is reset to False after each episode
  • select_action:: Epsilon-greedy action selector; breaks ties among maximal Q-values at random (sketched below)
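
select_action implements epsilon-greedy selection with random tie-breaking among actions whose Q-values are (nearly) maximal. A small standalone sketch of the same rule on a toy Q-table, for illustration only:

import numpy as np

Q = np.array([[0.5, 0.5, 0.1]])   # toy table: one state, two tied greedy actions
state, epsilon = 0, 0.1

if np.random.random() > epsilon:
    # greedy branch: pick uniformly among all argmax ties
    ties = np.arange(len(Q[state]))[np.isclose(Q[state], np.max(Q[state]))]
    action = np.random.choice(ties)          # 0 or 1, each with probability ~0.5
else:
    # exploration branch: uniform random action
    action = np.random.randint(len(Q[state]))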
@staticmethod
def decay_schedule(init_value, min_value, decay_ratio, max_steps, log_start=-2, log_base=10):
Parameters
  • init_value {float}:: Initial value of the quantity being decayed
  • min_value {float}:: Minimum value init_value is allowed to decay to
  • decay_ratio {float}:: Fraction of max_steps over which init_value decays to min_value; the remaining steps are held at min_value
  • max_steps {int}:: Max iteration steps for decaying init_value
  • log_start {float}, default = -2:: Starting exponent of the log-spaced decay sequence. The default starts the decay at 0.01 (before rescaling)
  • log_base {float}, default = 10:: Base of the log space.
Returns
  • values {array-like}, shape(max_steps):: Decay values where values[i] is the value used at i-th step (see the sketch after this list)
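
For illustration, the epsilon defaults used by q_learning and sarsa decay log-spaced from init_value toward min_value over the first decay_ratio fraction of steps and then hold at min_value. A quick sketch (printed values are approximate; the import path is assumed from the module name):

import numpy as np
from bettermdptools.algorithms.rl import RL  # import path assumed

# Epsilon schedule over 10 episodes with the q_learning/sarsa defaults.
eps = RL.decay_schedule(init_value=1.0, min_value=0.1, decay_ratio=0.9, max_steps=10)
print(np.round(eps, 3))
# roughly: [1.0, 0.602, 0.378, 0.253, 0.182, 0.142, 0.12, 0.107, 0.1, 0.1]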
def q_learning(self, nS=None, nA=None, convert_state_obs=lambda state: state, gamma=0.99, init_alpha=0.5, min_alpha=0.01, alpha_decay_ratio=0.5, init_epsilon=1.0, min_epsilon=0.1, epsilon_decay_ratio=0.9, n_episodes=10000):
Parameters
  • nS {int}:: Number of states
  • nA {int}:: Number of available actions
  • convert_state_obs {lambda}:: Converts state into an integer
  • gamma {float}, default = 0.99:: Discount factor
  • init_alpha {float}, default = 0.5:: Initial learning rate
  • min_alpha {float}, default = 0.01:: Minimum learning rate
  • alpha_decay_ratio {float}, default = 0.5:: Decay schedule of learning rate for future iterations
  • init_epsilon {float}, default = 1.0:: Initial epsilon value for epsilon greedy strategy. Chooses max(Q) over available actions with probability 1-epsilon.
  • min_epsilon {float}, default = 0.1:: Minimum epsilon. Used to balance exploration in later stages.
  • epsilon_decay_ratio {float}, default = 0.9:: Decay schedule of epsilon for future iterations
  • n_episodes {int}, default = 10000:: Number of episodes for the agent
Returns
  • Q {numpy array}, shape(nS, nA):: Final action-value function Q(s,a)
  • V {numpy array}, shape(nS):: State values array
  • pi {dict}, maps state index to action index:: Greedy policy derived from Q (see the usage sketch after this list)
  • Q_track {numpy array}, shape(n_episodes, nS, nA):: Log of Q(s,a) for each episode
  • pi_track {list}, len(n_episodes):: Log of complete policy for each episode
  • rewards {numpy array}, shape(n_episodes):: Total reward obtained in each episode
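
A minimal end-to-end sketch on FrozenLake, which per the module docstring works out of the box with no convert_state_obs; the environment id and import path are assumed standard names, not prescribed by this page:

import gymnasium as gym
from bettermdptools.algorithms.rl import RL  # import path assumed

env = gym.make("FrozenLake-v1", is_slippery=True)
Q, V, pi, Q_track, pi_track, rewards = RL(env).q_learning(n_episodes=10000)

# pi is a dict mapping each state index to its greedy action
print("greedy action at start state:", pi[0])
print("mean reward over last 100 episodes:", rewards[-100:].mean())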
def sarsa(self, nS=None, nA=None, convert_state_obs=lambda state: state, gamma=0.99, init_alpha=0.5, min_alpha=0.01, alpha_decay_ratio=0.5, init_epsilon=1.0, min_epsilon=0.1, epsilon_decay_ratio=0.9, n_episodes=10000):
Parameters
  • nS {int}:: Number of states
  • nA {int}:: Number of available actions
  • convert_state_obs {lambda}:: Converts state into an integer
  • gamma {float}, default = 0.99:: Discount factor
  • init_alpha {float}, default = 0.5:: Initial learning rate
  • min_alpha {float}, default = 0.01:: Minimum learning rate
  • alpha_decay_ratio {float}, default = 0.5:: Decay schedule of learning rate for future iterations
  • init_epsilon {float}, default = 1.0:: Initial epsilon value for epsilon greedy strategy. Chooses max(Q) over available actions with probability 1-epsilon.
  • min_epsilon {float}, default = 0.1:: Minimum epsilon. Used to balance exploration in later stages.
  • epsilon_decay_ratio {float}, default = 0.9:: Decay schedule of epsilon for future iterations
  • n_episodes {int}, default = 10000:: Number of episodes for the agent
Returns
  • Q {numpy array}, shape(nS, nA):: Final action-value function Q(s,a)
  • V {numpy array}, shape(nS):: State values array
  • pi {dict}, maps state index to action index:: Greedy policy derived from Q
  • Q_track {numpy array}, shape(n_episodes, nS, nA):: Log of Q(s,a) for each episode (see the convergence sketch after this list)
  • pi_track {list}, len(n_episodes):: Log of complete policy for each episode
  • rewards {numpy array}, shape(n_episodes):: Total reward obtained in each episode
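
sarsa returns the same structures as q_learning, so the per-episode logs can be used to inspect convergence. A sketch under the same assumptions about environment id and import path:

import gymnasium as gym
import numpy as np
from bettermdptools.algorithms.rl import RL  # import path assumed

env = gym.make("FrozenLake-v1")
Q, V, pi, Q_track, pi_track, rewards = RL(env).sarsa(n_episodes=10000)

# Q_track has shape (n_episodes, nS, nA); track the greedy value estimate of the
# start state across episodes to see when learning levels off.
v_start = Q_track[:, 0, :].max(axis=1)
print("V(start) after 1k, 5k, 10k episodes:", v_start[999], v_start[4999], v_start[-1])

# pi_track logs the full greedy policy each episode; count how often it changed.
changes = sum(not np.array_equal(a, b) for a, b in zip(pi_track[:-1], pi_track[1:]))
print("episodes where the greedy policy changed:", changes)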