import gym
from tensorflow import keras
from tensorflow.keras import backend as K
from keras_gym.value_functions import LinearQ
from keras_gym.policies import ValuePolicy
from keras_gym.algorithms import Sarsa
# the Gym environment
env = gym.make('CartPole-v0')
# define Q, its induced policy and update algorithm
Q = LinearQ(env, lr=0.08, interaction='elementwise_quadratic')
policy = ValuePolicy(Q)
algo = Sarsa(Q, gamma=0.8)
# number of iterations
num_episodes = 200
max_episode_steps = env._max_episode_steps
# used for early stopping
num_consecutive_successes = 0
for episode in range(1, num_episodes + 1):
    last_episode = episode == num_episodes or num_consecutive_successes == 9

    # init
    s = env.reset()
    a = env.action_space.sample()

    # amount of random exploration
    if last_episode:
        epsilon = 0
        env.render()
    elif episode < 10:
        epsilon = 0.5
    else:
        epsilon = 0.01

    for t in range(1, max_episode_steps + 1):
        s_next, r, done, info = env.step(a)
        a_next = policy.epsilon_greedy(s_next, epsilon)  # SARSA: pick the next action from the next state

        # update or render
        if not last_episode:
            algo.update(s, a, r, s_next, a_next)
        else:
            env.render()

        # keep track of consecutive successes
        if done:
            if t == max_episode_steps:
                num_consecutive_successes += 1
                print(f"num_consecutive_successes = {num_consecutive_successes}")
            else:
                num_consecutive_successes = 0
                print(f"failed after {t} steps")
            break

        # prepare for next step
        s, a = s_next, a_next

    if last_episode:
        break
env.close()
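
# Second example: the same SARSA training loop on CartPole-v0, but with Q
# approximated by a scikit-learn SGDRegressor (wrapped so keras_gym can use it).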
import gym
import numpy as np  # used by the feature transformer below
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import FunctionTransformer
from keras_gym.value_functions import GenericQ
from keras_gym.policies import ValuePolicy
from keras_gym.algorithms import Sarsa
from keras_gym.wrappers import SklearnModelWrapper
# the Gym environment
env = gym.make('CartPole-v0')
# define sklearn model for approximating Q-function
model = SklearnModelWrapper(
    estimator=SGDRegressor(eta0=0.08, learning_rate='constant'),
    transformer=FunctionTransformer(
        lambda x: np.hstack([x, x ** 2]), validate=False),
)
# define Q, its induced policy and update algorithm
Q = GenericQ(env, model)
policy = ValuePolicy(Q)
algo = Sarsa(Q, gamma=0.8)
# number of iterations
num_episodes = 200
max_episode_steps = env._max_episode_steps
# used for early stopping
num_consecutive_successes = 0
for episode in range(1, num_episodes + 1):
    last_episode = episode == num_episodes or num_consecutive_successes == 9

    # init
    s = env.reset()
    a = env.action_space.sample()

    # amount of random exploration
    if last_episode:
        epsilon = 0
        env.render()
    elif episode < 10:
        epsilon = 0.5
    else:
        epsilon = 0.01

    for t in range(1, max_episode_steps + 1):
        s_next, r, done, info = env.step(a)
        a_next = policy.epsilon_greedy(s_next, epsilon)  # SARSA: pick the next action from the next state

        # update or render
        if not last_episode:
            algo.update(s, a, r, s_next, a_next)
        else:
            env.render()

        # keep track of consecutive successes
        if done:
            if t == max_episode_steps:
                num_consecutive_successes += 1
                print(f"num_consecutive_successes = {num_consecutive_successes}")
            else:
                num_consecutive_successes = 0
                print(f"failed after {t} steps")
            break

        # prepare for next step
        s, a = s_next, a_next

    if last_episode:
        break
env.close()
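
# Third example: Q-learning on CartPole-v0 with the elementwise-quadratic
# LinearQ function approximator.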
import gym
from tensorflow import keras
from tensorflow.keras import backend as K
from keras_gym.value_functions import LinearQ
from keras_gym.policies import ValuePolicy
from keras_gym.algorithms import QLearning
# the Gym environment
env = gym.make('CartPole-v0')
# define Q, its induced policy and update algorithm
Q = LinearQ(env, interaction='elementwise_quadratic', lr=0.8, momentum=0., decay=0.1)
policy = ValuePolicy(Q)
algo = QLearning(Q, gamma=0.8)
# number of iterations
num_episodes = 100
max_episode_steps = env._max_episode_steps
# used for early stopping
num_consecutive_successes = 0
for episode in range(1, num_episodes + 1):
    last_episode = episode == num_episodes or num_consecutive_successes == 9

    # init
    s = env.reset()

    # amount of random exploration
    if last_episode:
        epsilon = 0
        env.render()
    elif episode < 10:
        epsilon = 0.5
    else:
        epsilon = 0.01

    for t in range(1, max_episode_steps + 1):
        a = policy.epsilon_greedy(s, epsilon)
        s_next, r, done, info = env.step(a)

        # update or render
        if not last_episode:
            algo.update(s, a, r, s_next)
        else:
            env.render()

        # keep track of consecutive successes
        if done:
            if t == max_episode_steps:
                num_consecutive_successes += 1
                print(f"num_consecutive_successes = {num_consecutive_successes}")
            else:
                num_consecutive_successes = 0
                print(f"failed after {t} steps")
            break

        # prepare for next step
        s = s_next

    if last_episode:
        break
env.close()
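
# Fourth example: Q-learning on CartPole-v0 with a custom Keras model as the
# function approximator for Q.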
import gym
from tensorflow import keras
from tensorflow.keras import backend as K
from keras_gym.value_functions import GenericQ
from keras_gym.policies import ValuePolicy
from keras_gym.algorithms import QLearning
# the Gym environment
env = gym.make('CartPole-v0')
# custom function approximator: linear regression on elementwise-quadratic features
model = keras.Sequential(layers=[
    keras.layers.Lambda(lambda x: K.concatenate([x, x ** 2])),
    keras.layers.Dense(1),
])
model.compile(
    optimizer=keras.optimizers.SGD(lr=0.05, momentum=0.5),
    loss=keras.metrics.mean_squared_error)
# define Q, its induced policy and update algorithm
Q = GenericQ(env, model)
policy = ValuePolicy(Q)
algo = QLearning(Q, gamma=0.8)
# number of iterations
num_episodes = 100
max_episode_steps = env._max_episode_steps
# used for early stopping
num_consecutive_successes = 0
for episode in range(1, num_episodes + 1):
    last_episode = episode == num_episodes or num_consecutive_successes == 9

    # init
    s = env.reset()

    # amount of random exploration
    if last_episode:
        epsilon = 0
        env.render()
    elif episode < 10:
        epsilon = 0.5
    else:
        epsilon = 0.01

    for t in range(1, max_episode_steps + 1):
        a = policy.epsilon_greedy(s, epsilon)
        s_next, r, done, info = env.step(a)

        # update or render
        if not last_episode:
            algo.update(s, a, r, s_next)
        else:
            env.render()

        # keep track of consecutive successes
        if done:
            if t == max_episode_steps:
                num_consecutive_successes += 1
                print(f"num_consecutive_successes = {num_consecutive_successes}")
            else:
                num_consecutive_successes = 0
                print(f"failed after {t} steps")
            break

        # prepare for next step
        s = s_next

    if last_episode:
        break
env.close()
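
# Fifth example: Q-learning on CartPole-v0 with LinearQ configured with
# model_type=2 and momentum-SGD updates.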
import gym
from tensorflow import keras
from tensorflow.keras import backend as K
from keras_gym.value_functions import LinearQ
from keras_gym.policies import ValuePolicy
from keras_gym.algorithms import QLearning
# the Gym environment
env = gym.make('CartPole-v0')
# define Q, its induced policy and update algorithm
Q = LinearQ(env, model_type=2, lr=0.05, momentum=0.5, interaction='elementwise_quadratic')
policy = ValuePolicy(Q)
algo = QLearning(Q, gamma=0.8)
# number of iterations
num_episodes = 100
max_episode_steps = env._max_episode_steps
# used for early stopping
num_consecutive_successes = 0
for episode in range(1, num_episodes + 1):
    last_episode = episode == num_episodes or num_consecutive_successes == 9

    # init
    s = env.reset()

    # amount of random exploration
    if last_episode:
        epsilon = 0
        env.render()
    elif episode < 10:
        epsilon = 0.5
    else:
        epsilon = 0.01

    for t in range(1, max_episode_steps + 1):
        a = policy.epsilon_greedy(s, epsilon)
        s_next, r, done, info = env.step(a)

        # update or render
        if not last_episode:
            algo.update(s, a, r, s_next)
        else:
            env.render()

        # keep track of consecutive successes
        if done:
            if t == max_episode_steps:
                num_consecutive_successes += 1
                print(f"num_consecutive_successes = {num_consecutive_successes}")
            else:
                num_consecutive_successes = 0
                print(f"failed after {t} steps")
            break

        # prepare for next step
        s = s_next

    if last_episode:
        break
env.close()
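
# Sixth example: Expected-SARSA on CartPole-v0 with the elementwise-quadratic
# LinearQ function approximator.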
import gym
from tensorflow import keras
from tensorflow.keras import backend as K
from keras_gym.value_functions import LinearQ
from keras_gym.policies import ValuePolicy
from keras_gym.algorithms import ExpectedSarsa
# the Gym environment
env = gym.make('CartPole-v0')
# define Q, its induced policy and update algorithm
Q = LinearQ(env, interaction='elementwise_quadratic', lr=0.01)
policy = ValuePolicy(Q)
algo = ExpectedSarsa(Q, policy, gamma=0.8)
# number of iterations
num_episodes = 200
max_episode_steps = env._max_episode_steps
# used for early stopping
num_consecutive_successes = 0
for episode in range(1, num_episodes + 1):
    last_episode = episode == num_episodes or num_consecutive_successes == 9

    # init
    s = env.reset()

    # amount of random exploration
    if last_episode:
        epsilon = 0
        env.render()
    elif episode < 10:
        epsilon = 0.5
    else:
        epsilon = 0.01

    for t in range(1, max_episode_steps + 1):
        a = policy.epsilon_greedy(s, epsilon)
        s_next, r, done, info = env.step(a)

        # update or render
        if not last_episode:
            algo.update(s, a, r, s_next)
        else:
            env.render()

        # keep track of consecutive successes
        if done:
            if t == max_episode_steps:
                num_consecutive_successes += 1
                print(f"num_consecutive_successes = {num_consecutive_successes}")
            else:
                num_consecutive_successes = 0
                print(f"failed after {t} steps")
            break

        # prepare for next step
        s = s_next

    if last_episode:
        break
env.close()

# Optional (commented out): record a video of the final greedy policy with
# gym.wrappers.Monitor; note that this snippet also needs `import os`.
# env = gym.make('CartPole-v1')
# env = gym.wrappers.Monitor(env, os.path.join('data', 'video', 'cartpole-linear-model-sarsa'), force=True)
# s = env.reset()
# env.render()
# done = False
# while not done:
#     a = policy.greedy(s)
#     s, _, done, _ = env.step(a)
#     env.render()
# env.close()