bettermdptools.utils.grid_search
# -*- coding: utf-8 -*-
"""Grid search utilities for bettermdptools planning and RL algorithms.

Each helper sweeps every combination of the supplied hyperparameter lists,
scores the resulting policy with TestEnv.test_env over 100 episodes, and
returns the full results list, the highest average reward, and the best
parameter combination.
"""

import itertools

import numpy as np

from bettermdptools.algorithms.planner import Planner
from bettermdptools.algorithms.rl import RL
from bettermdptools.utils.test_env import TestEnv


class GridSearch:
    @staticmethod
    def q_learning_grid_search(env, gamma, epsilon_decay, iters, verbose=True):
        """Grid search over (gamma, epsilon_decay, iters) for Q-learning."""
        highest_avg_reward = -np.inf
        best_params = None
        rewards_and_params_results = []

        for i in itertools.product(gamma, epsilon_decay, iters):
            if verbose:
                print(
                    "running q_learning with gamma:",
                    i[0],
                    "epsilon decay:",
                    i[1],
                    " iterations:",
                    i[2],
                )

            Q, V, pi, Q_track, pi_track = RL(env).q_learning(
                gamma=i[0], epsilon_decay_ratio=i[1], n_episodes=i[2]
            )
            episode_rewards = TestEnv.test_env(env=env, n_iters=100, pi=pi)
            avg_reward = np.mean(episode_rewards)
            rewards_and_params_results.append({"avg_reward": avg_reward, "params": i})
            if avg_reward > highest_avg_reward:
                highest_avg_reward = avg_reward
                best_params = i

            if verbose:
                print("Avg. episode reward: ", avg_reward)
                print("###################")

        return rewards_and_params_results, highest_avg_reward, best_params

    @staticmethod
    def sarsa_grid_search(env, gamma, epsilon_decay, iters, verbose=True):
        """Grid search over (gamma, epsilon_decay, iters) for SARSA."""
        highest_avg_reward = -np.inf
        best_params = None
        rewards_and_params_results = []

        for i in itertools.product(gamma, epsilon_decay, iters):
            if verbose:
                print(
                    "running sarsa with gamma:",
                    i[0],
                    "epsilon decay:",
                    i[1],
                    " iterations:",
                    i[2],
                )

            Q, V, pi, Q_track, pi_track = RL(env).sarsa(
                gamma=i[0], epsilon_decay_ratio=i[1], n_episodes=i[2]
            )
            episode_rewards = TestEnv.test_env(env=env, n_iters=100, pi=pi)
            avg_reward = np.mean(episode_rewards)
            rewards_and_params_results.append({"avg_reward": avg_reward, "params": i})
            if avg_reward > highest_avg_reward:
                highest_avg_reward = avg_reward
                best_params = i

            if verbose:
                print("Avg. episode reward: ", avg_reward)
                print("###################")

        return rewards_and_params_results, highest_avg_reward, best_params

    @staticmethod
    def pi_grid_search(env, gamma, n_iters, theta, verbose=True):
        """Grid search over (gamma, n_iters, theta) for policy iteration."""
        highest_avg_reward = -np.inf
        best_params = None
        rewards_and_params_results = []

        for i in itertools.product(gamma, n_iters, theta):
            if verbose:
                print(
                    "running PI with gamma:", i[0], " n_iters:", i[1], " theta:", i[2]
                )

            V, V_track, pi = Planner(env.P).policy_iteration(
                gamma=i[0], n_iters=i[1], theta=i[2]
            )
            episode_rewards = TestEnv.test_env(env=env, n_iters=100, pi=pi)
            avg_reward = np.mean(episode_rewards)
            rewards_and_params_results.append({"avg_reward": avg_reward, "params": i})
            if avg_reward > highest_avg_reward:
                highest_avg_reward = avg_reward
                best_params = i

            if verbose:
                print("Avg. episode reward: ", avg_reward)
                print("###################")

        return rewards_and_params_results, highest_avg_reward, best_params

    @staticmethod
    def vi_grid_search(env, gamma, n_iters, theta, verbose=True):
        """Grid search over (gamma, n_iters, theta) for value iteration."""
        highest_avg_reward = -np.inf
        best_params = None
        rewards_and_params_results = []

        for i in itertools.product(gamma, n_iters, theta):
            if verbose:
                print(
                    "running VI with gamma:", i[0], " n_iters:", i[1], " theta:", i[2]
                )

            V, V_track, pi = Planner(env.P).value_iteration(
                gamma=i[0], n_iters=i[1], theta=i[2]
            )
            episode_rewards = TestEnv.test_env(env=env, n_iters=100, pi=pi)
            avg_reward = np.mean(episode_rewards)
            rewards_and_params_results.append({"avg_reward": avg_reward, "params": i})
            if avg_reward > highest_avg_reward:
                highest_avg_reward = avg_reward
                best_params = i

            if verbose:
                print("Avg. episode reward: ", avg_reward)
                print("###################")

        return rewards_and_params_results, highest_avg_reward, best_params
class GridSearch:
@staticmethod
def q_learning_grid_search(env, gamma, epsilon_decay, iters, verbose=True):
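Every argument except env and verbose is an iterable of candidate values; each (gamma, epsilon_decay, iters) combination is trained with RL(env).q_learning and scored over 100 test episodes. A minimal sketch, reusing env from the setup above and with placeholder values:

results, best_reward, best_params = GridSearch.q_learning_grid_search(
    env,                        # env from the setup sketch above
    gamma=[0.9, 0.99],          # candidate discount factors
    epsilon_decay=[0.7, 0.9],   # candidate epsilon_decay_ratio values
    iters=[5000, 10000],        # candidate n_episodes values
    verbose=False,
)
# best_params is the (gamma, epsilon_decay, iters) tuple with the highest
# average test-episode reward.
print(best_reward, best_params)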
@staticmethod
def sarsa_grid_search(env, gamma, epsilon_decay, iters, verbose=True):
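The SARSA search takes the same hyperparameter grids as q_learning_grid_search and returns the same (results, highest_avg_reward, best_params) triple; only the learning algorithm differs. A sketch with placeholder values:

results, best_reward, best_params = GridSearch.sarsa_grid_search(
    env, gamma=[0.9, 0.99], epsilon_decay=[0.7, 0.9], iters=[5000, 10000], verbose=False
)
# results holds one dict per combination tried, in itertools.product order.
for entry in results:
    print(entry["params"], entry["avg_reward"])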
@staticmethod
def pi_grid_search(env, gamma, n_iters, theta, verbose=True):
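Policy iteration is run through Planner(env.P), so env must expose a transition model P; gamma, n_iters, and theta are each iterables of candidate values. A sketch with placeholder values:

results, best_reward, best_params = GridSearch.pi_grid_search(
    env,                # must expose env.P
    gamma=[0.9, 0.99],
    n_iters=[50],       # candidate iteration limits for policy iteration
    theta=[1e-10],      # candidate convergence thresholds
    verbose=False,
)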
@staticmethod
def vi_grid_search(env, gamma, n_iters, theta, verbose=True):
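Value iteration follows the same pattern as pi_grid_search, again via Planner(env.P). A sketch with placeholder values, unpacking the best combination:

results, best_reward, best_params = GridSearch.vi_grid_search(
    env, gamma=[0.9, 0.99], n_iters=[100], theta=[1e-10], verbose=False
)
# Tuple ordering matches the itertools.product argument order.
best_gamma, best_n_iters, best_theta = best_params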