bettermdptools.utils.grid_search

# -*- coding: utf-8 -*-

import itertools

import numpy as np

from bettermdptools.algorithms.planner import Planner
from bettermdptools.algorithms.rl import RL
from bettermdptools.utils.test_env import TestEnv


class GridSearch:
    """Exhaustive hyperparameter sweeps for bettermdptools RL algorithms and planners."""

    @staticmethod
    def q_learning_grid_search(env, gamma, epsilon_decay, iters, verbose=True):
        """Run Q-learning for every combination of gamma, epsilon_decay, and iters.

        Each learned policy is evaluated over 100 episodes with TestEnv.test_env.
        Returns (rewards_and_params_results, highest_avg_reward, best_params).
        """
        highest_avg_reward = -np.inf
        best_params = None
        rewards_and_params_results = []

        for i in itertools.product(gamma, epsilon_decay, iters):
            if verbose:
                print(
                    "running q_learning with gamma:",
                    i[0],
                    "epsilon decay:",
                    i[1],
                    " iterations:",
                    i[2],
                )

            Q, V, pi, Q_track, pi_track = RL(env).q_learning(
                gamma=i[0], epsilon_decay_ratio=i[1], n_episodes=i[2]
            )
            episode_rewards = TestEnv.test_env(env=env, n_iters=100, pi=pi)
            avg_reward = np.mean(episode_rewards)
            rewards_and_params_results.append({"avg_reward": avg_reward, "params": i})
            if avg_reward > highest_avg_reward:
                highest_avg_reward = avg_reward
                best_params = i

            if verbose:
                print("Avg. episode reward: ", avg_reward)
                print("###################")

        return rewards_and_params_results, highest_avg_reward, best_params

    @staticmethod
    def sarsa_grid_search(env, gamma, epsilon_decay, iters, verbose=True):
        """Run SARSA for every combination of gamma, epsilon_decay, and iters.

        Each learned policy is evaluated over 100 episodes with TestEnv.test_env.
        Returns (rewards_and_params_results, highest_avg_reward, best_params).
        """
        highest_avg_reward = -np.inf
        best_params = None
        rewards_and_params_results = []

        for i in itertools.product(gamma, epsilon_decay, iters):
            if verbose:
                print(
                    "running sarsa with gamma:",
                    i[0],
                    "epsilon decay:",
                    i[1],
                    " iterations:",
                    i[2],
                )

            Q, V, pi, Q_track, pi_track = RL(env).sarsa(
                gamma=i[0], epsilon_decay_ratio=i[1], n_episodes=i[2]
            )
            episode_rewards = TestEnv.test_env(env=env, n_iters=100, pi=pi)
            avg_reward = np.mean(episode_rewards)
            rewards_and_params_results.append({"avg_reward": avg_reward, "params": i})
            if avg_reward > highest_avg_reward:
                highest_avg_reward = avg_reward
                best_params = i

            if verbose:
                print("Avg. episode reward: ", avg_reward)
                print("###################")

        return rewards_and_params_results, highest_avg_reward, best_params

    @staticmethod
    def pi_grid_search(env, gamma, n_iters, theta, verbose=True):
        """Run policy iteration for every combination of gamma, n_iters, and theta.

        Each resulting policy is evaluated over 100 episodes with TestEnv.test_env.
        Returns (rewards_and_params_results, highest_avg_reward, best_params).
        """
        highest_avg_reward = -np.inf
        best_params = None
        rewards_and_params_results = []

        for i in itertools.product(gamma, n_iters, theta):
            if verbose:
                print(
                    "running PI with gamma:", i[0], " n_iters:", i[1], " theta:", i[2]
                )

            V, V_track, pi = Planner(env.P).policy_iteration(
                gamma=i[0], n_iters=i[1], theta=i[2]
            )
            episode_rewards = TestEnv.test_env(env=env, n_iters=100, pi=pi)
            avg_reward = np.mean(episode_rewards)
            rewards_and_params_results.append({"avg_reward": avg_reward, "params": i})
            if avg_reward > highest_avg_reward:
                highest_avg_reward = avg_reward
                best_params = i

            if verbose:
                print("Avg. episode reward: ", avg_reward)
                print("###################")

        return rewards_and_params_results, highest_avg_reward, best_params

    @staticmethod
    def vi_grid_search(env, gamma, n_iters, theta, verbose=True):
        """Run value iteration for every combination of gamma, n_iters, and theta.

        Each resulting policy is evaluated over 100 episodes with TestEnv.test_env.
        Returns (rewards_and_params_results, highest_avg_reward, best_params).
        """
        highest_avg_reward = -np.inf
        best_params = None
        rewards_and_params_results = []

        for i in itertools.product(gamma, n_iters, theta):
            if verbose:
                print(
                    "running VI with gamma:", i[0], " n_iters:", i[1], " theta:", i[2]
                )

            V, V_track, pi = Planner(env.P).value_iteration(
                gamma=i[0], n_iters=i[1], theta=i[2]
            )
            episode_rewards = TestEnv.test_env(env=env, n_iters=100, pi=pi)
            avg_reward = np.mean(episode_rewards)
            rewards_and_params_results.append({"avg_reward": avg_reward, "params": i})
            if avg_reward > highest_avg_reward:
                highest_avg_reward = avg_reward
                best_params = i

            if verbose:
                print("Avg. episode reward: ", avg_reward)
                print("###################")

        return rewards_and_params_results, highest_avg_reward, best_params
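
Example usage (a minimal sketch, not part of this module): each grid search takes lists of hyperparameter values, runs every combination, and returns all tried combinations together with the best-scoring one. The FrozenLake environment and the specific value lists below are illustrative assumptions, not library defaults.

import gymnasium as gym

from bettermdptools.utils.grid_search import GridSearch

# Any discrete Gymnasium environment works; FrozenLake is assumed here for illustration.
frozen_lake = gym.make("FrozenLake8x8-v1", render_mode=None)

# Model-free sweep: every (gamma, epsilon_decay, iters) combination is trained with
# Q-learning, and each greedy policy is scored over 100 test episodes.
results, best_reward, best_params = GridSearch.q_learning_grid_search(
    env=frozen_lake,
    gamma=[0.9, 0.99],
    epsilon_decay=[0.7, 0.9],
    iters=[5000, 10000],
    verbose=False,
)
print("best avg reward:", best_reward, "for (gamma, epsilon_decay, iters):", best_params)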
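
The planner sweeps follow the same pattern but iterate over gamma, n_iters, and theta, and expect an environment exposing a transition model env.P (continuing the FrozenLake sketch above; depending on your gymnasium version you may need to pass env.unwrapped so that .P resolves):

# Planner sweep: policy iteration runs on env.P for every (gamma, n_iters, theta)
# combination, and each resulting policy is scored over 100 test episodes.
results, best_reward, best_params = GridSearch.pi_grid_search(
    env=frozen_lake,
    gamma=[0.9, 0.99],
    n_iters=[50],
    theta=[1e-10],
    verbose=False,
)

# vi_grid_search has the same signature and sweeps value iteration instead.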