%reload keras_gym.policies.base keras_gym.policies.value_based keras_gym.policies
import numpy as np
from gym.envs.toy_text.frozen_lake import FrozenLakeEnv, UP, DOWN, LEFT, RIGHT
from keras_gym.policies import ValuePolicy
from keras_gym.value_functions import LinearQ
from keras_gym.algorithms import MonteCarlo
env = FrozenLakeEnv(is_slippery=False)
q = LinearQ(env, eta0=0.01)
policy = ValuePolicy(q)
algo = MonteCarlo(q)
s = env.reset()
a = policy.random()
def display_proba(pol, s):
    """Print the policy's action probabilities for state s, marking the greedy action(s) with *."""
    actions = {UP: 'up', DOWN: 'down', LEFT: 'left', RIGHT: 'right'}
    proba = pol.proba(s).p
    pmax = np.max(proba)
    print('\n'.join(
        "{2} {1:.3f} - {0}".format(actions[a], p, '*' if p == pmax else ' ')
        for a, p in enumerate(proba)))
def run_episode(env, q, policy, algo, epsilon=0, update=False, render=False):
    """Roll out one episode; optionally update the value function and/or render each step."""
    s = env.reset()
    done = False
    while not done:
        if render:
            env.render()
            display_proba(policy, s)
        a = policy.epsilon_greedy(s, epsilon)
        s_next, r, done, info = env.step(a)
        if update:
            algo.update(s, a, r, s_next, done)
        s = s_next
    if render:
        env.render()  # show the terminal state
# train for 200 episodes with an epsilon-greedy behavior policy,
# then roll out (and render) one purely greedy episode
for _ in range(200):
    run_episode(env, q, policy, algo, epsilon=0.1, update=True)

run_episode(env, q, policy, algo, render=True)
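# Rough evaluation sketch (not part of keras_gym): roll out a few purely greedy
# episodes and count how many reach the goal (reward 1 in FrozenLake). Assumes
# policy.greedy(s) returns just the action when return_propensity is left at its
# default. With is_slippery=False the env is deterministic, so a single episode
# would do; the loop only matters for the slippery variant.
n_eval, n_success = 10, 0
for _ in range(n_eval):
    s_eval, r_eval, done_eval = env.reset(), 0, False
    for _ in range(100):  # step cap: the raw env has no TimeLimit wrapper
        s_eval, r_eval, done_eval, _info = env.step(policy.greedy(s_eval))
        if done_eval:
            break
    n_success += int(r_eval == 1)
print("greedy success rate: {:.0%}".format(n_success / n_eval))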
print(*policy.random(return_propensity=True))
print(*policy.greedy(s, return_propensity=True))
print(*policy.epsilon_greedy(s, epsilon=0.2, return_propensity=True))
print(*policy.thompson(s, return_propensity=True))
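# Each of the calls above returns an (action, propensity) pair because
# return_propensity=True is passed; the propensity is the probability with which
# the policy selected that action, i.e. the quantity you would need for
# importance-sampling corrections in off-policy updates.
# (Reading of the selection modes, inferred from their names: random ignores the
# state, greedy takes the argmax, epsilon_greedy mixes the two, and thompson
# samples an action stochastically rather than maximizing.)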
%reload keras_gym.policies.base keras_gym.policies.value_based keras_gym.policies.updateable keras_gym.policies
from gym.envs.toy_text.frozen_lake import FrozenLakeEnv, UP, DOWN, LEFT, RIGHT
from keras_gym.policies import SoftmaxPolicy
from keras_gym.value_functions import LinearQ
from keras_gym.algorithms import MonteCarlo
from sklearn.linear_model import SGDClassifier
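# loss='log' makes SGDClassifier a logistic-regression model (fit one-vs-rest for
# more than two actions), so it exposes predict_proba, which is what a softmax
# policy needs. In newer scikit-learn versions this loss is spelled 'log_loss'.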
clf = SGDClassifier(learning_rate='constant', eta0=0.01, loss='log')
env = FrozenLakeEnv(is_slippery=False)
q = LinearQ(env, eta0=0.01)
policy = SoftmaxPolicy(env, clf)
algo = MonteCarlo(q)
s = env.reset()
a = policy.random()
def display_proba(pol, s):
    """Print the policy's action probabilities for state s, marking the greedy action(s) with *."""
    actions = {UP: 'up', DOWN: 'down', LEFT: 'left', RIGHT: 'right'}
    proba = pol.proba(s).p
    pmax = np.max(proba)
    print('\n'.join(
        "{2} {1:.3f} - {0}".format(actions[a], p, '*' if p == pmax else ' ')
        for a, p in enumerate(proba)))
def run_episode(env, q, policy, algo, epsilon=0, update=False, render=False):
    """Roll out one episode; optionally update the value function and/or render each step."""
    s = env.reset()
    done = False
    while not done:
        if render:
            env.render()
            display_proba(policy, s)
        a = policy.epsilon_greedy(s, epsilon)
        s_next, r, done, info = env.step(a)
        if update:
            algo.update(s, a, r, s_next, done)
        s = s_next
    if render:
        env.render()  # show the terminal state
# Training loop left disabled: algo = MonteCarlo(q) updates the value function q
# only, not the SGDClassifier behind the SoftmaxPolicy, so it would not change
# the behaviour rendered below.
# for _ in range(200):
#     run_episode(env, q, policy, algo, epsilon=0.1, update=True)
run_episode(env, q, policy, algo, render=True)
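# Hypothetical sketch (not keras_gym API): the SoftmaxPolicy above wraps a plain
# SGDClassifier, so one way to actually train it is a REINFORCE-style weighted
# classification update: after each episode, partially fit the classifier on the
# visited states with the taken actions as labels, weighting every sample by the
# Monte Carlo return that followed. Assumptions: states are featurized as one-hot
# vectors over the grid cells (SoftmaxPolicy may featurize differently), returns
# are non-negative (true for FrozenLake), and actions are sampled here directly
# from policy.proba(s).p rather than through a keras_gym helper.
n_states = env.observation_space.n
n_actions = env.action_space.n

def reinforce_update(clf, transitions, gamma=0.99):
    """Weighted-classification policy update from one episode of (s, a, r) tuples."""
    G, returns = 0.0, []
    for _, _, r in reversed(transitions):
        G = r + gamma * G
        returns.append(G)
    returns.reverse()
    X = np.eye(n_states)[[s for s, _, _ in transitions]]  # one-hot state features
    y = np.array([a for _, a, _ in transitions])          # actions taken
    clf.partial_fit(X, y, classes=np.arange(n_actions), sample_weight=np.array(returns))

# usage: collect one episode with the current softmax policy, then update clf
s, done, transitions = env.reset(), False, []
while not done:
    a = np.random.choice(n_actions, p=policy.proba(s).p)
    s_next, r, done, info = env.step(a)
    transitions.append((s, a, r))
    s = s_next
reinforce_update(clf, transitions)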