bettermdptools.algorithms.rl

Author: Miguel Morales
BSD 3-Clause License

Copyright (c) 2018, Miguel Morales. All rights reserved.
https://github.com/mimoralea/gdrl/blob/master/LICENSE

  1"""
  2Author: Miguel Morales
  3BSD 3-Clause License
  4
  5Copyright (c) 2018, Miguel Morales
  6All rights reserved.
  7https://github.com/mimoralea/gdrl/blob/master/LICENSE
  8"""
  9
 10"""
 11modified by: John Mansfield
 12
 13documentation added by: Gagandeep Randhawa
 14"""
 15
 16"""
 17Class that contains functions related to reinforcement learning algorithms. RL init expects an OpenAI environment (env).
 18
 19Model-free learning algorithms: Q-Learning and SARSA
 20work out of the box with any gymnasium environments that 
 21have single discrete valued state spaces, like frozen lake. A lambda function 
 22is required to convert state spaces not in this format.
 23"""
 24
 25import warnings
 26
 27import numpy as np
 28from tqdm.auto import tqdm
 29
 30from bettermdptools.utils.callbacks import MyCallbacks
 31
 32
 33class RL:
 34    def __init__(self, env):
 35        self.env = env
 36        self.callbacks = MyCallbacks()
 37        self.render = False
 38        # Explanation of lambda:
 39        # def select_action(state, Q, epsilon):
 40        #   if np.random.random() > epsilon:
 41        #       max_val = np.max(Q[state])
 42        #       indxs_selector = np.isclose(Q[state], max_val)
 43        #       indxs = np.arange(len(Q[state]))[indxs_selector]
 44        #       return np.random.choice(indxs)
 45        #   else:
 46        #       return np.random.randint(len(Q[state]))
 47        self.select_action = (
 48            lambda state, Q, epsilon: np.random.choice(
 49                np.arange(len(Q[state]))[np.isclose(Q[state], np.max(Q[state]))]
 50            )
 51            if np.random.random() > epsilon
 52            else np.random.randint(len(Q[state]))
 53        )
 54
 55    @staticmethod
 56    def decay_schedule(
 57        init_value, min_value, decay_ratio, max_steps, log_start=-2, log_base=10
 58    ):
 59        """
 60        Generates a decay schedule for a given initial value.
 61
 62        Parameters
 63        ----------
 64        init_value : float
 65            Initial value of the quantity being decayed.
 66        min_value : float
 67            Minimum value init_value is allowed to decay to.
 68        decay_ratio : float
 69            The exponential factor exp(decay_ratio).
 70        max_steps : int
 71            Max iteration steps for decaying init_value.
 72        log_start : float, optional
 73            Starting value of the decay sequence, by default -2.
 74        log_base : float, optional
 75            Base of the log space, by default 10.
 76
 77        Returns
 78        -------
 79        np.ndarray
 80            Decay values where values[i] is the value used at the i-th step.
 81        """
 82        decay_steps = int(max_steps * decay_ratio)
 83        rem_steps = max_steps - decay_steps
 84        values = np.logspace(log_start, 0, decay_steps, base=log_base, endpoint=True)[
 85            ::-1
 86        ]
 87        values = (values - values.min()) / (values.max() - values.min())
 88        values = (init_value - min_value) * values + min_value
 89        values = np.pad(values, (0, rem_steps), "edge")
 90        return values
 91
 92    def q_learning(
 93        self,
 94        nS=None,
 95        nA=None,
 96        convert_state_obs=lambda state: state,
 97        gamma=0.99,
 98        init_alpha=0.5,
 99        min_alpha=0.01,
100        alpha_decay_ratio=0.5,
101        init_epsilon=1.0,
102        min_epsilon=0.1,
103        epsilon_decay_ratio=0.9,
104        n_episodes=10000,
105    ):
106        """
107        Q-Learning algorithm.
108
109        Parameters
110        ----------
111        nS : int, optional
112            Number of states, by default None.
113        nA : int, optional
114            Number of available actions, by default None.
115        convert_state_obs : function, optional
116            Converts state into an integer, by default lambda state: state.
117        gamma : float, optional
118            Discount factor, by default 0.99.
119        init_alpha : float, optional
120            Initial learning rate, by default 0.5.
121        min_alpha : float, optional
122            Minimum learning rate, by default 0.01.
123        alpha_decay_ratio : float, optional
124            Decay schedule of learning rate for future iterations, by default 0.5.
125        init_epsilon : float, optional
126            Initial epsilon value for epsilon greedy strategy, by default 1.0.
127        min_epsilon : float, optional
128            Minimum epsilon, by default 0.1.
129        epsilon_decay_ratio : float, optional
130            Decay schedule of epsilon for future iterations, by default 0.9.
131        n_episodes : int, optional
132            Number of episodes for the agent, by default 10000.
133
134        Returns
135        -------
136        tuple
137            Q : np.ndarray
138                Final action-value function Q(s,a).
139            V : np.ndarray
140                State values array.
141            pi : dict
142                Policy mapping states to actions.
143            Q_track : np.ndarray
144                Log of Q(s,a) for each episode.
145            pi_track : list
146                Log of complete policy for each episode.
147            rewards : np.ndarray
148                Rewards obtained in each episode.
149        """
150        if nS is None:
151            nS = self.env.observation_space.n
152        if nA is None:
153            nA = self.env.action_space.n
154        pi_track = []
155        Q = np.zeros((nS, nA), dtype=np.float32)
156        Q_track = np.zeros((n_episodes, nS, nA), dtype=np.float32)
157        alphas = RL.decay_schedule(init_alpha, min_alpha, alpha_decay_ratio, n_episodes)
158        epsilons = RL.decay_schedule(
159            init_epsilon, min_epsilon, epsilon_decay_ratio, n_episodes
160        )
161        rewards = np.zeros(n_episodes, dtype=np.float32)
162        for e in tqdm(range(n_episodes), leave=False):
163            self.callbacks.on_episode_begin(self)
164            self.callbacks.on_episode(self, episode=e)
165            state, info = self.env.reset()
166            done = False
167            state = convert_state_obs(state)
168            total_reward = 0
169            while not done:
170                if self.render:
171                    warnings.warn(
172                        "Occasional render has been deprecated by openAI.  Use test_env.py to render."
173                    )
174                action = self.select_action(state, Q, epsilons[e])
175                next_state, reward, terminated, truncated, _ = self.env.step(action)
176                if truncated:
177                    warnings.warn(
178                        "Episode was truncated.  TD target value may be incorrect."
179                    )
180                done = terminated or truncated
181                self.callbacks.on_env_step(self)
182                next_state = convert_state_obs(next_state)
183                td_target = reward + gamma * Q[next_state].max() * (not done)
184                td_error = td_target - Q[state][action]
185                Q[state][action] = Q[state][action] + alphas[e] * td_error
186                state = next_state
187                total_reward += reward
188            rewards[e] = total_reward
189            Q_track[e] = Q
190            pi_track.append(np.argmax(Q, axis=1))
191            self.render = False
192            self.callbacks.on_episode_end(self)
193
194        V = np.max(Q, axis=1)
195
196        pi = {s: a for s, a in enumerate(np.argmax(Q, axis=1))}
197        return Q, V, pi, Q_track, pi_track, rewards
198
199    def sarsa(
200        self,
201        nS=None,
202        nA=None,
203        convert_state_obs=lambda state: state,
204        gamma=0.99,
205        init_alpha=0.5,
206        min_alpha=0.01,
207        alpha_decay_ratio=0.5,
208        init_epsilon=1.0,
209        min_epsilon=0.1,
210        epsilon_decay_ratio=0.9,
211        n_episodes=10000,
212    ):
213        """
214        SARSA algorithm.
215
216        Parameters
217        ----------
218        nS : int, optional
219            Number of states, by default None.
220        nA : int, optional
221            Number of available actions, by default None.
222        convert_state_obs : function, optional
223            Converts state into an integer, by default lambda state: state.
224        gamma : float, optional
225            Discount factor, by default 0.99.
226        init_alpha : float, optional
227            Initial learning rate, by default 0.5.
228        min_alpha : float, optional
229            Minimum learning rate, by default 0.01.
230        alpha_decay_ratio : float, optional
231            Decay schedule of learning rate for future iterations, by default 0.5.
232        init_epsilon : float, optional
233            Initial epsilon value for epsilon greedy strategy, by default 1.0.
234        min_epsilon : float, optional
235            Minimum epsilon, by default 0.1.
236        epsilon_decay_ratio : float, optional
237            Decay schedule of epsilon for future iterations, by default 0.9.
238        n_episodes : int, optional
239            Number of episodes for the agent, by default 10000.
240
241        Returns
242        -------
243        tuple
244            Q : np.ndarray
245                Final action-value function Q(s,a).
246            V : np.ndarray
247                State values array.
248            pi : dict
249                Policy mapping states to actions.
250            Q_track : np.ndarray
251                Log of Q(s,a) for each episode.
252            pi_track : list
253                Log of complete policy for each episode.
254            rewards : np.ndarray
255                Rewards obtained in each episode.
256        """
257        if nS is None:
258            nS = self.env.observation_space.n
259        if nA is None:
260            nA = self.env.action_space.n
261        pi_track = []
262        Q = np.zeros((nS, nA), dtype=np.float32)
263        Q_track = np.zeros((n_episodes, nS, nA), dtype=np.float32)
264        rewards = np.zeros(n_episodes, dtype=np.float32)
265        alphas = RL.decay_schedule(init_alpha, min_alpha, alpha_decay_ratio, n_episodes)
266        epsilons = RL.decay_schedule(
267            init_epsilon, min_epsilon, epsilon_decay_ratio, n_episodes
268        )
269
270        for e in tqdm(range(n_episodes), leave=False):
271            self.callbacks.on_episode_begin(self)
272            self.callbacks.on_episode(self, episode=e)
273            state, info = self.env.reset()
274            done = False
275            state = convert_state_obs(state)
276            action = self.select_action(state, Q, epsilons[e])
277            total_reward = 0
278            while not done:
279                if self.render:
280                    warnings.warn(
281                        "Occasional render has been deprecated by openAI.  Use test_env.py to render."
282                    )
283                next_state, reward, terminated, truncated, _ = self.env.step(action)
284                if truncated:
285                    warnings.warn(
286                        "Episode was truncated.  TD target value may be incorrect."
287                    )
288                done = terminated or truncated
289                self.callbacks.on_env_step(self)
290                next_state = convert_state_obs(next_state)
291                next_action = self.select_action(next_state, Q, epsilons[e])
292                td_target = reward + gamma * Q[next_state][next_action] * (not done)
293                td_error = td_target - Q[state][action]
294                Q[state][action] = Q[state][action] + alphas[e] * td_error
295                state, action = next_state, next_action
296                total_reward += reward
297            rewards[e] = total_reward
298            Q_track[e] = Q
299            pi_track.append(np.argmax(Q, axis=1))
300            self.render = False
301            self.callbacks.on_episode_end(self)
302
303        V = np.max(Q, axis=1)
304
305        pi = {s: a for s, a in enumerate(np.argmax(Q, axis=1))}
306        return Q, V, pi, Q_track, pi_track, rewards
class RL:

Model-free reinforcement learning agent providing Q-Learning and SARSA; see the module docstring above for environment requirements.
RL(env)
Instance attributes (set in __init__):
  • env: The Gymnasium environment passed to the constructor.
  • callbacks: MyCallbacks instance whose hooks run at episode begin/end and on each environment step.
  • render: Render flag, initialized to False (occasional rendering is deprecated; use test_env.py instead).
  • select_action: Epsilon-greedy action selector with random tie-breaking (see the commented lambda in the source above).
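
The select_action lambda implements epsilon-greedy selection: with probability 1 - epsilon it picks among the highest-valued actions (breaking ties at random), otherwise it picks a uniformly random action. A minimal standalone sketch of the same logic, using a made-up toy Q-table:

    import numpy as np

    # Toy Q-table: 2 states x 4 actions. State 0 has a tie between actions 1 and 3.
    Q = np.array([[0.1, 0.5, 0.2, 0.5],
                  [0.9, 0.1, 0.1, 0.1]], dtype=np.float32)

    def select_action(state, Q, epsilon):
        # Exploit with probability 1 - epsilon, breaking ties uniformly at random;
        # otherwise explore by sampling any action uniformly.
        if np.random.random() > epsilon:
            greedy = np.arange(len(Q[state]))[np.isclose(Q[state], np.max(Q[state]))]
            return np.random.choice(greedy)
        return np.random.randint(len(Q[state]))

    select_action(0, Q, epsilon=0.0)  # 1 or 3 (tied greedy actions)
    select_action(1, Q, epsilon=0.0)  # 0 (single greedy action)
    select_action(1, Q, epsilon=1.0)  # uniform over all 4 actions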
@staticmethod
def decay_schedule( init_value, min_value, decay_ratio, max_steps, log_start=-2, log_base=10):

Generates a decay schedule for a given initial value.

Parameters
  • init_value (float): Initial value of the quantity being decayed.
  • min_value (float): Minimum value init_value is allowed to decay to.
  • decay_ratio (float): Fraction of max_steps over which init_value decays to min_value.
  • max_steps (int): Max iteration steps for decaying init_value.
  • log_start (float, optional): Starting value of the decay sequence, by default -2.
  • log_base (float, optional): Base of the log space, by default 10.
Returns
  • np.ndarray: Decay values where values[i] is the value used at the i-th step.
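
Because decay_schedule is a @staticmethod, it can be called directly to build learning-rate and epsilon schedules before training. A short sketch (the import path follows this module's name):

    from bettermdptools.algorithms.rl import RL

    # Decay from 1.0 toward 0.1 over the first 90% of 10 steps,
    # then hold the minimum for the remaining steps ("edge" padding).
    epsilons = RL.decay_schedule(init_value=1.0, min_value=0.1,
                                 decay_ratio=0.9, max_steps=10)
    epsilons.shape   # (10,)
    epsilons[0]      # 1.0 -- starts at init_value
    epsilons[-1]     # 0.1 -- held at min_value after the decay portion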
def q_learning( self, nS=None, nA=None, convert_state_obs=lambda state: state, gamma=0.99, init_alpha=0.5, min_alpha=0.01, alpha_decay_ratio=0.5, init_epsilon=1.0, min_epsilon=0.1, epsilon_decay_ratio=0.9, n_episodes=10000):

Q-Learning algorithm.

Parameters
  • nS (int, optional): Number of states, by default None.
  • nA (int, optional): Number of available actions, by default None.
  • convert_state_obs (function, optional): Converts state into an integer, by default lambda state: state.
  • gamma (float, optional): Discount factor, by default 0.99.
  • init_alpha (float, optional): Initial learning rate, by default 0.5.
  • min_alpha (float, optional): Minimum learning rate, by default 0.01.
  • alpha_decay_ratio (float, optional): Decay schedule of learning rate for future iterations, by default 0.5.
  • init_epsilon (float, optional): Initial epsilon value for epsilon greedy strategy, by default 1.0.
  • min_epsilon (float, optional): Minimum epsilon, by default 0.1.
  • epsilon_decay_ratio (float, optional): Decay schedule of epsilon for future iterations, by default 0.9.
  • n_episodes (int, optional): Number of episodes for the agent, by default 10000.
Returns
  • tuple:
    • Q (np.ndarray): Final action-value function Q(s,a).
    • V (np.ndarray): State values array.
    • pi (dict): Policy mapping states to actions.
    • Q_track (np.ndarray): Log of Q(s,a) for each episode.
    • pi_track (list): Log of complete policy for each episode.
    • rewards (np.ndarray): Rewards obtained in each episode.
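
A minimal usage sketch for q_learning, assuming gymnasium's FrozenLake-v1 (a single discrete observation space, so the default convert_state_obs is sufficient). Since nS and nA default to None, they are read from env.observation_space.n and env.action_space.n:

    import gymnasium as gym
    from bettermdptools.algorithms.rl import RL

    env = gym.make("FrozenLake-v1", map_name="8x8")
    Q, V, pi, Q_track, pi_track, rewards = RL(env).q_learning(n_episodes=10000)

    # pi maps each state index to its greedy action; V holds the corresponding state values.
    print(pi[0], V[0])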
def sarsa( self, nS=None, nA=None, convert_state_obs=lambda state: state, gamma=0.99, init_alpha=0.5, min_alpha=0.01, alpha_decay_ratio=0.5, init_epsilon=1.0, min_epsilon=0.1, epsilon_decay_ratio=0.9, n_episodes=10000):

SARSA algorithm.

Parameters
  • nS (int, optional): Number of states, by default None.
  • nA (int, optional): Number of available actions, by default None.
  • convert_state_obs (function, optional): Converts state into an integer, by default lambda state: state.
  • gamma (float, optional): Discount factor, by default 0.99.
  • init_alpha (float, optional): Initial learning rate, by default 0.5.
  • min_alpha (float, optional): Minimum learning rate, by default 0.01.
  • alpha_decay_ratio (float, optional): Decay schedule of learning rate for future iterations, by default 0.5.
  • init_epsilon (float, optional): Initial epsilon value for epsilon greedy strategy, by default 1.0.
  • min_epsilon (float, optional): Minimum epsilon, by default 0.1.
  • epsilon_decay_ratio (float, optional): Decay schedule of epsilon for future iterations, by default 0.9.
  • n_episodes (int, optional): Number of episodes for the agent, by default 10000.
Returns
  • tuple:
    • Q (np.ndarray): Final action-value function Q(s,a).
    • V (np.ndarray): State values array.
    • pi (dict): Policy mapping states to actions.
    • Q_track (np.ndarray): Log of Q(s,a) for each episode.
    • pi_track (list): Log of complete policy for each episode.
    • rewards (np.ndarray): Rewards obtained in each episode.
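
sarsa is called the same way as q_learning. The sketch below exercises the convert_state_obs hook on gymnasium's Blackjack-v1, whose observations are tuples (player_sum, dealer_showing, usable_ace); the flattening lambda and the nS/nA values are illustrative assumptions based on Blackjack's Tuple(Discrete(32), Discrete(11), Discrete(2)) observation space and Discrete(2) action space:

    import gymnasium as gym
    from bettermdptools.algorithms.rl import RL

    env = gym.make("Blackjack-v1")
    # Flatten (player_sum, dealer_showing, usable_ace) into a single integer index.
    flatten = lambda obs: obs[0] * 11 * 2 + obs[1] * 2 + int(obs[2])

    Q, V, pi, Q_track, pi_track, rewards = RL(env).sarsa(
        nS=32 * 11 * 2,            # 704 flattened states
        nA=2,                      # hit or stick
        convert_state_obs=flatten,
        n_episodes=50000,
    )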