Table Of Contents

N-Step Q-learning

In [1]:
import gym

from keras_gym.value_functions import LinearQ
from keras_gym.policies import ValuePolicy
from keras_gym.algorithms import NStepQLearning


# environment to solve
env = gym.make('CartPole-v0')


# value function, the policy it induces, and the n-step bootstrapped update rule
Q = LinearQ(env, interaction='elementwise_quadratic', lr=0.01, momentum=0.2)
policy = ValuePolicy(Q)
algo = NStepQLearning(Q, n=20, gamma=0.8)


# training budget
# NOTE(review): _max_episode_steps is a private attribute of gym's TimeLimit
# wrapper — may break across gym versions; confirm against the installed gym.
num_episodes = 200
max_episode_steps = env._max_episode_steps


# early-stopping counter: number of consecutive full-length episodes
num_consecutive_successes = 0


for episode in range(1, num_episodes + 1):
    # on the final episode we render instead of train
    last_episode = episode == num_episodes or num_consecutive_successes == 9

    state = env.reset()

    # exploration schedule: heavy early on, tiny afterwards, none when rendering
    if last_episode:
        epsilon = 0
        env.render()
    else:
        epsilon = 0.5 if episode < 10 else 0.01

    for t in range(1, max_episode_steps + 1):
        action = policy.epsilon_greedy(state, epsilon)
        next_state, reward, done, info = env.step(action)

        # train (or just render on the final episode)
        if last_episode:
            env.render()
        else:
            algo.update(state, action, reward, next_state, done)

        # an episode that survives to the step cap counts as a success
        if done:
            if t == max_episode_steps:
                num_consecutive_successes += 1
                print(f"num_consecutive_successes = {num_consecutive_successes}")
            else:
                num_consecutive_successes = 0
                print(f"failed after {t} steps")
            break

        # roll the transition forward
        state = next_state

    if last_episode:
        break


env.close()
failed after 16 steps
failed after 82 steps
failed after 30 steps
failed after 145 steps
num_consecutive_successes = 1
failed after 111 steps
failed after 11 steps
failed after 77 steps
failed after 37 steps
num_consecutive_successes = 1
failed after 159 steps
num_consecutive_successes = 1
num_consecutive_successes = 2
num_consecutive_successes = 3
num_consecutive_successes = 4
num_consecutive_successes = 5
num_consecutive_successes = 6
num_consecutive_successes = 7
failed after 180 steps
num_consecutive_successes = 1
num_consecutive_successes = 2
num_consecutive_successes = 3
num_consecutive_successes = 4
num_consecutive_successes = 5
num_consecutive_successes = 6
num_consecutive_successes = 7
num_consecutive_successes = 8
num_consecutive_successes = 9
num_consecutive_successes = 10

N-Step SARSA

In [2]:
import gym

# FIX: this cell constructs LinearQ below but previously imported only GenericQ,
# so it only ran because LinearQ leaked from the earlier cell's kernel state.
# Import LinearQ explicitly so the cell survives Restart Kernel -> Run All.
# GenericQ is kept in case later cells rely on it.
from keras_gym.value_functions import GenericQ, LinearQ
from keras_gym.policies import ValuePolicy
from keras_gym.algorithms import NStepSarsa


# the Gym environment
env = gym.make('CartPole-v0')


# define Q, its induced policy and the n-step SARSA update algorithm
Q = LinearQ(env, interaction='elementwise_quadratic', lr=0.05, momentum=0.1, decay=0.05)
policy = ValuePolicy(Q)
algo = NStepSarsa(Q, n=20, gamma=0.8)


# number of iterations
# NOTE(review): _max_episode_steps is a private attribute of gym's TimeLimit
# wrapper — may break across gym versions; confirm against the installed gym.
num_episodes = 200
max_episode_steps = env._max_episode_steps


# used for early stopping: counts consecutive full-length episodes
num_consecutive_successes = 0


for episode in range(1, num_episodes + 1):
    # the final episode is rendered instead of trained
    last_episode = episode == num_episodes or num_consecutive_successes == 9

    # init: SARSA needs an initial action as well as an initial state
    s = env.reset()
    a = policy.random()

    # amount of random exploration: heavy early on, tiny afterwards, none when rendering
    if last_episode:
        epsilon = 0
        env.render()
    elif episode < 10:
        epsilon = 0.5
    else:
        epsilon = 0.01

    for t in range(1, max_episode_steps + 1):
        s_next, r, done, info = env.step(a)
        a_next = policy.epsilon_greedy(s_next, epsilon)

        # update or render
        if not last_episode:
            algo.update(s, a, r, s_next, a_next, done)
        else:
            env.render()

        # keep track of consecutive successes (episode survived to the step cap)
        if done:
            if t == max_episode_steps:
                num_consecutive_successes += 1
                print(f"num_consecutive_successes = {num_consecutive_successes}")
            else:
                num_consecutive_successes = 0
                print(f"failed after {t} steps")
            break

        # prepare for next step
        s, a = s_next, a_next

    if last_episode:
        break


env.close()
failed after 11 steps
failed after 13 steps
failed after 37 steps
failed after 136 steps
failed after 86 steps
failed after 45 steps
failed after 156 steps
failed after 143 steps
failed after 56 steps
num_consecutive_successes = 1
num_consecutive_successes = 2
num_consecutive_successes = 3
num_consecutive_successes = 4
num_consecutive_successes = 5
num_consecutive_successes = 6
num_consecutive_successes = 7
num_consecutive_successes = 8
num_consecutive_successes = 9
num_consecutive_successes = 10