utils.grid_search
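
Grid search helpers for bettermdptools: each method sweeps every combination of the supplied candidate hyperparameter values, trains or plans a policy (Q-learning, SARSA, policy iteration, or value iteration), scores it by its average reward over 100 test episodes via TestEnv.test_env, and returns the per-combination results together with the highest average reward and the best parameter tuple.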

# -*- coding: utf-8 -*-

from bettermdptools.algorithms.rl import RL
from bettermdptools.algorithms.planner import Planner
from bettermdptools.utils.test_env import TestEnv
import numpy as np
import itertools


class GridSearch:
    @staticmethod
    def q_learning_grid_search(env, gamma, epsilon_decay, iters, verbose=True):
        """Run Q-learning for every (gamma, epsilon_decay, iters) combination and score
        each learned policy by its average reward over 100 test episodes."""
        highest_avg_reward = -np.inf
        best_params = None
        rewards_and_params_results = []

        for i in itertools.product(gamma, epsilon_decay, iters):
            if verbose:
                print("running q_learning with gamma:", i[0], "epsilon decay:", i[1], "iterations:", i[2])

            Q, V, pi, Q_track, pi_track = RL(env).q_learning(gamma=i[0], epsilon_decay_ratio=i[1], n_episodes=i[2])
            episode_rewards = TestEnv.test_env(env=env, n_iters=100, pi=pi)
            avg_reward = np.mean(episode_rewards)
            rewards_and_params_results.append({
                'avg_reward': avg_reward,
                'params': i
            })
            if avg_reward > highest_avg_reward:
                highest_avg_reward = avg_reward
                best_params = i

            if verbose:
                print("Avg. episode reward:", avg_reward)
                print("###################")

        return rewards_and_params_results, highest_avg_reward, best_params

    @staticmethod
    def sarsa_grid_search(env, gamma, epsilon_decay, iters, verbose=True):
        """Run SARSA for every (gamma, epsilon_decay, iters) combination and score
        each learned policy by its average reward over 100 test episodes."""
        highest_avg_reward = -np.inf
        best_params = None
        rewards_and_params_results = []

        for i in itertools.product(gamma, epsilon_decay, iters):
            if verbose:
                print("running sarsa with gamma:", i[0], "epsilon decay:", i[1], "iterations:", i[2])

            Q, V, pi, Q_track, pi_track = RL(env).sarsa(gamma=i[0], epsilon_decay_ratio=i[1], n_episodes=i[2])
            episode_rewards = TestEnv.test_env(env=env, n_iters=100, pi=pi)
            avg_reward = np.mean(episode_rewards)
            rewards_and_params_results.append({
                'avg_reward': avg_reward,
                'params': i
            })
            if avg_reward > highest_avg_reward:
                highest_avg_reward = avg_reward
                best_params = i

            if verbose:
                print("Avg. episode reward:", avg_reward)
                print("###################")

        return rewards_and_params_results, highest_avg_reward, best_params

    @staticmethod
    def pi_grid_search(env, gamma, n_iters, theta, verbose=True):
        """Run policy iteration for every (gamma, n_iters, theta) combination and score
        each resulting policy by its average reward over 100 test episodes."""
        highest_avg_reward = -np.inf
        best_params = None
        rewards_and_params_results = []

        for i in itertools.product(gamma, n_iters, theta):
            if verbose:
                print("running PI with gamma:", i[0], "n_iters:", i[1], "theta:", i[2])

            V, V_track, pi = Planner(env.P).policy_iteration(gamma=i[0], n_iters=i[1], theta=i[2])
            episode_rewards = TestEnv.test_env(env=env, n_iters=100, pi=pi)
            avg_reward = np.mean(episode_rewards)
            rewards_and_params_results.append({
                'avg_reward': avg_reward,
                'params': i
            })
            if avg_reward > highest_avg_reward:
                highest_avg_reward = avg_reward
                best_params = i

            if verbose:
                print("Avg. episode reward:", avg_reward)
                print("###################")

        return rewards_and_params_results, highest_avg_reward, best_params

    @staticmethod
    def vi_grid_search(env, gamma, n_iters, theta, verbose=True):
        """Run value iteration for every (gamma, n_iters, theta) combination and score
        each resulting policy by its average reward over 100 test episodes."""
        highest_avg_reward = -np.inf
        best_params = None
        rewards_and_params_results = []

        for i in itertools.product(gamma, n_iters, theta):
            if verbose:
                print("running VI with gamma:", i[0], "n_iters:", i[1], "theta:", i[2])

            V, V_track, pi = Planner(env.P).value_iteration(gamma=i[0], n_iters=i[1], theta=i[2])
            episode_rewards = TestEnv.test_env(env=env, n_iters=100, pi=pi)
            avg_reward = np.mean(episode_rewards)
            rewards_and_params_results.append({
                'avg_reward': avg_reward,
                'params': i
            })
            if avg_reward > highest_avg_reward:
                highest_avg_reward = avg_reward
                best_params = i

            if verbose:
                print("Avg. episode reward:", avg_reward)
                print("###################")

        return rewards_and_params_results, highest_avg_reward, best_params
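
A minimal usage sketch follows. The FrozenLake environment, the candidate values, and the import path (inferred from the module name utils.grid_search) are illustrative assumptions, not defaults shipped with the library.

import gymnasium as gym

from bettermdptools.utils.grid_search import GridSearch

# Illustrative environment and candidate grids (assumptions, not library defaults).
frozen_lake = gym.make("FrozenLake8x8-v1", render_mode=None)

gammas = [0.9, 0.99]
epsilon_decays = [0.5, 0.9]
episode_counts = [5000, 10000]

# Each call returns (per-combination results, best average reward, best params).
results, best_reward, best_params = GridSearch.q_learning_grid_search(
    frozen_lake, gammas, epsilon_decays, episode_counts, verbose=True
)
print("best avg. reward:", best_reward, "with (gamma, epsilon_decay, n_episodes):", best_params)

# The planner searches read env.P; some gymnasium versions only expose the
# transition matrix on the unwrapped environment (frozen_lake.unwrapped).
vi_results, vi_best_reward, vi_best_params = GridSearch.vi_grid_search(
    frozen_lake, gammas, n_iters=[1000], theta=[1e-10]
)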