import gym

from keras_gym.value_functions import LinearQ
from keras_gym.policies import ValuePolicy
from keras_gym.algorithms import NStepQLearning


# the Gym environment
env = gym.make('CartPole-v0')

# define Q, its induced policy and update algorithm
Q = LinearQ(env, interaction='elementwise_quadratic', lr=0.01, momentum=0.2)
policy = ValuePolicy(Q)
algo = NStepQLearning(Q, n=20, gamma=0.8)
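# NStepQLearning bootstraps each update with an n-step return: roughly,
#   G_t = R_t + gamma * R_{t+1} + ... + gamma^(n-1) * R_{t+n-1}
#             + gamma^n * max_a Q(S_{t+n}, a)
# i.e. n=20 steps of observed reward, then the greedy (off-policy) value
# estimate of the state reached.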
# number of episodes to run
num_episodes = 200
max_episode_steps = env.spec.max_episode_steps  # step cap set by the TimeLimit wrapper

# used for early stopping
num_consecutive_successes = 0
for episode in range(1, num_episodes + 1):
    last_episode = episode == num_episodes or num_consecutive_successes == 9

    # init
    s = env.reset()

    # amount of random exploration
    if last_episode:
        epsilon = 0
        env.render()
    elif episode < 10:
        epsilon = 0.5
    else:
        epsilon = 0.01

    for t in range(1, max_episode_steps + 1):
        a = policy.epsilon_greedy(s, epsilon)
        s_next, r, done, info = env.step(a)

        # update or render
        if not last_episode:
            algo.update(s, a, r, s_next, done)
        else:
            env.render()

        # keep track of consecutive successes
        if done:
            if t == max_episode_steps:
                num_consecutive_successes += 1
                print(f"num_consecutive_successes = {num_consecutive_successes}")
            else:
                num_consecutive_successes = 0
                print(f"failed after {t} steps")
            break

        # prepare for next step
        s = s_next

    if last_episode:
        break
env.close()
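# Note: this example (and the SARSA variant below) targets the classic Gym
# API (gym < 0.26), where env.reset() returns only the observation and
# env.step() returns a 4-tuple (s_next, r, done, info); newer gym/gymnasium
# releases changed both signatures.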
# Same task, but learning with on-policy n-step SARSA instead of Q-learning.
import gym

from keras_gym.value_functions import LinearQ
from keras_gym.policies import ValuePolicy
from keras_gym.algorithms import NStepSarsa


# the Gym environment
env = gym.make('CartPole-v0')

# define Q, its induced policy and update algorithm
Q = LinearQ(env, interaction='elementwise_quadratic', lr=0.05, momentum=0.1, decay=0.05)
policy = ValuePolicy(Q)
algo = NStepSarsa(Q, n=20, gamma=0.8)
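# NStepSarsa differs from NStepQLearning only in the bootstrap term: being
# on-policy, it plugs in the action that will actually be taken,
#   G_t = R_t + ... + gamma^(n-1) * R_{t+n-1} + gamma^n * Q(S_{t+n}, A_{t+n})
# which is why the loop below tracks a_next and passes it to update().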
# number of episodes to run
num_episodes = 200
max_episode_steps = env.spec.max_episode_steps  # step cap set by the TimeLimit wrapper

# used for early stopping
num_consecutive_successes = 0
for episode in range(1, num_episodes + 1):
    last_episode = episode == num_episodes or num_consecutive_successes == 9

    # init (SARSA needs an action in hand before the first env.step)
    s = env.reset()
    a = policy.random()

    # amount of random exploration
    if last_episode:
        epsilon = 0
        env.render()
    elif episode < 10:
        epsilon = 0.5
    else:
        epsilon = 0.01

    for t in range(1, max_episode_steps + 1):
        s_next, r, done, info = env.step(a)
        a_next = policy.epsilon_greedy(s_next, epsilon)

        # update or render
        if not last_episode:
            algo.update(s, a, r, s_next, a_next, done)
        else:
            env.render()

        # keep track of consecutive successes
        if done:
            if t == max_episode_steps:
                num_consecutive_successes += 1
                print(f"num_consecutive_successes = {num_consecutive_successes}")
            else:
                num_consecutive_successes = 0
                print(f"failed after {t} steps")
            break

        # prepare for next step
        s, a = s_next, a_next

    if last_episode:
        break
env.close()
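# Sketch: replaying the learned policy by hand. This is what the
# `last_episode` branch above automates; with epsilon=0, epsilon_greedy is
# purely greedy. Only calls already used above are assumed; run this in
# place of (not after) the final env.close().
s = env.reset()
done = False
while not done:
    env.render()
    a = policy.epsilon_greedy(s, 0)
    s, r, done, info = env.step(a)
env.close()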