utils.grid_search
# -*- coding: utf-8 -*-

from bettermdptools.algorithms.rl import RL
from bettermdptools.algorithms.planner import Planner
from bettermdptools.utils.test_env import TestEnv
import numpy as np
import itertools

class GridSearch:
    @staticmethod
    def q_learning_grid_search(env, gamma, epsilon_decay, iters, verbose=True):
        highest_avg_reward = -np.inf
        best_params = None
        rewards_and_params_results = []

        for i in itertools.product(gamma, epsilon_decay, iters):
            if verbose:
                print("running q_learning with gamma:", i[0], "epsilon decay:", i[1], " iterations:", i[2])

            Q, V, pi, Q_track, pi_track = RL(env).q_learning(gamma=i[0], epsilon_decay_ratio=i[1], n_episodes=i[2])
            episode_rewards = TestEnv.test_env(env=env, n_iters=100, pi=pi)
            avg_reward = np.mean(episode_rewards)
            rewards_and_params_results.append({
                'avg_reward': avg_reward,
                'params': i
            })
            if avg_reward > highest_avg_reward:
                highest_avg_reward = avg_reward
                best_params = i

            if verbose:
                print("Avg. episode reward: ", avg_reward)
                print("###################")

        return rewards_and_params_results, highest_avg_reward, best_params

    @staticmethod
    def sarsa_grid_search(env, gamma, epsilon_decay, iters, verbose=True):
        highest_avg_reward = -np.inf
        best_params = None
        rewards_and_params_results = []

        for i in itertools.product(gamma, epsilon_decay, iters):
            if verbose:
                print("running sarsa with gamma:", i[0], "epsilon decay:", i[1], " iterations:", i[2])

            Q, V, pi, Q_track, pi_track = RL(env).sarsa(gamma=i[0], epsilon_decay_ratio=i[1], n_episodes=i[2])
            episode_rewards = TestEnv.test_env(env=env, n_iters=100, pi=pi)
            avg_reward = np.mean(episode_rewards)
            rewards_and_params_results.append({
                'avg_reward': avg_reward,
                'params': i
            })
            if avg_reward > highest_avg_reward:
                highest_avg_reward = avg_reward
                best_params = i

            if verbose:
                print("Avg. episode reward: ", avg_reward)
                print("###################")

        return rewards_and_params_results, highest_avg_reward, best_params

    @staticmethod
    def pi_grid_search(env, gamma, n_iters, theta, verbose=True):
        highest_avg_reward = -np.inf
        best_params = None
        rewards_and_params_results = []

        for i in itertools.product(gamma, n_iters, theta):
            if verbose:
                print("running PI with gamma:", i[0], " n_iters:", i[1], " theta:", i[2])

            V, V_track, pi = Planner(env.P).policy_iteration(gamma=i[0], n_iters=i[1], theta=i[2])
            episode_rewards = TestEnv.test_env(env=env, n_iters=100, pi=pi)
            avg_reward = np.mean(episode_rewards)
            rewards_and_params_results.append({
                'avg_reward': avg_reward,
                'params': i
            })
            if avg_reward > highest_avg_reward:
                highest_avg_reward = avg_reward
                best_params = i

            if verbose:
                print("Avg. episode reward: ", avg_reward)
                print("###################")

        return rewards_and_params_results, highest_avg_reward, best_params

    @staticmethod
    def vi_grid_search(env, gamma, n_iters, theta, verbose=True):
        highest_avg_reward = -np.inf
        best_params = None
        rewards_and_params_results = []

        for i in itertools.product(gamma, n_iters, theta):
            if verbose:
                print("running VI with gamma:", i[0], " n_iters:", i[1], " theta:", i[2])

            V, V_track, pi = Planner(env.P).value_iteration(gamma=i[0], n_iters=i[1], theta=i[2])
            episode_rewards = TestEnv.test_env(env=env, n_iters=100, pi=pi)
            avg_reward = np.mean(episode_rewards)
            rewards_and_params_results.append({
                'avg_reward': avg_reward,
                'params': i
            })
            if avg_reward > highest_avg_reward:
                highest_avg_reward = avg_reward
                best_params = i

            if verbose:
                print("Avg. episode reward: ", avg_reward)
                print("###################")

        return rewards_and_params_results, highest_avg_reward, best_params
class GridSearch:
@staticmethod
def q_learning_grid_search(env, gamma, epsilon_decay, iters, verbose=True):
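q_learning_grid_search takes iterables of candidate values, runs RL(env).q_learning for every (gamma, epsilon_decay, iters) combination (forwarded as gamma, epsilon_decay_ratio, and n_episodes), scores each learned policy with TestEnv.test_env over 100 episodes, and returns the per-combination results together with the highest average reward and its parameter tuple. Below is a minimal usage sketch; the FrozenLake environment and the candidate values are assumptions chosen for illustration, not part of this module.

import gymnasium as gym
from bettermdptools.utils.grid_search import GridSearch

# Assumed example environment: a small discrete Gymnasium env.
frozen_lake = gym.make("FrozenLake-v1")

# Each argument is an iterable of candidate values; itertools.product
# expands them into the full grid (2 x 2 x 1 = 4 runs here).
results, best_reward, best_params = GridSearch.q_learning_grid_search(
    env=frozen_lake,
    gamma=[0.9, 0.99],
    epsilon_decay=[0.7, 0.9],
    iters=[5000],
)

# results is a list of dicts: {'avg_reward': ..., 'params': (gamma, epsilon_decay, n_episodes)}
print("best (gamma, epsilon_decay, n_episodes):", best_params)
print("best average reward over 100 test episodes:", best_reward)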
@staticmethod
def sarsa_grid_search(env, gamma, epsilon_decay, iters, verbose=True):
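sarsa_grid_search follows the same pattern but trains each candidate with RL(env).sarsa. A sketch under the same assumptions (Gymnasium FrozenLake, illustrative candidate values), this time ranking all combinations by their test reward:

import gymnasium as gym
from bettermdptools.utils.grid_search import GridSearch

frozen_lake = gym.make("FrozenLake-v1")  # assumed example environment

results, best_reward, best_params = GridSearch.sarsa_grid_search(
    env=frozen_lake,
    gamma=[0.9, 0.99],
    epsilon_decay=[0.9],
    iters=[5000, 10000],
)

# Each entry pairs a (gamma, epsilon_decay, n_episodes) tuple with its mean test reward.
for entry in sorted(results, key=lambda r: r['avg_reward'], reverse=True):
    print(entry['params'], entry['avg_reward'])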
@staticmethod
def pi_grid_search(env, gamma, n_iters, theta, verbose=True):
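pi_grid_search sweeps gamma, n_iters, and theta and plans with Planner(env.P).policy_iteration, so the environment passed in must expose its transition matrix P as well as support rollouts for TestEnv.test_env. A minimal sketch, assuming a Gymnasium FrozenLake environment; depending on the gymnasium version, the wrapper returned by gym.make may not forward the P attribute, in which case pass env.unwrapped instead.

import gymnasium as gym
from bettermdptools.utils.grid_search import GridSearch

frozen_lake = gym.make("FrozenLake8x8-v1")  # assumed example environment

# policy_iteration reads env.P internally; if your gymnasium wrapper does not
# forward .P, pass frozen_lake.unwrapped here instead.
results, best_reward, best_params = GridSearch.pi_grid_search(
    env=frozen_lake,
    gamma=[0.8, 0.9, 0.99],
    n_iters=[50],
    theta=[1e-9],
)
print("best (gamma, n_iters, theta):", best_params, "->", best_reward)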
@staticmethod
def vi_grid_search(env, gamma, n_iters, theta, verbose=True):
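vi_grid_search is the value-iteration counterpart, calling Planner(env.P).value_iteration for each combination. Under the same assumptions as the policy-iteration sketch above:

import gymnasium as gym
from bettermdptools.utils.grid_search import GridSearch

frozen_lake = gym.make("FrozenLake8x8-v1")  # assumed example environment

results, best_reward, best_params = GridSearch.vi_grid_search(
    env=frozen_lake,
    gamma=[0.9, 0.99],
    n_iters=[100],
    theta=[1e-8, 1e-10],
)
print("best (gamma, n_iters, theta):", best_params, "->", best_reward)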