In [ ]:
%reload keras_gym.policies.base keras_gym.policies.value_based keras_gym.policies
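# Note: `%reload` is assumed here to be a custom/dev magic for hot-reloading the
# keras_gym modules while they are being worked on; if it is not available, a
# plain importlib-based reload does the same job, e.g.:
#
#     import importlib, keras_gym.policies
#     importlib.reload(keras_gym.policies)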

import numpy as np

from gym.envs.toy_text.frozen_lake import FrozenLakeEnv, UP, DOWN, LEFT, RIGHT
from keras_gym.policies import ValuePolicy
from keras_gym.value_functions import LinearQ
from keras_gym.algorithms import MonteCarlo

env = FrozenLakeEnv(is_slippery=False)
q = LinearQ(env, eta0=0.01)
policy = ValuePolicy(q)
algo = MonteCarlo(q)

# sample an initial state (reused in the next cell) and a random action
s = env.reset()
a = policy.random()


def display_proba(p, s):
    """Print the policy's action distribution for state s, marking the most likely action(s)."""
    actions = {UP: 'up', DOWN: 'down', LEFT: 'left', RIGHT: 'right'}
    proba = p.proba(s).p
    pmax = np.max(proba)
    for a, prob in enumerate(proba):
        print("{} {:.3f} - {}".format('*' if prob == pmax else ' ', prob, actions[a]))


def run_episode(env, q, policy, algo, epsilon=0, update=False, render=False):
    """Roll out one episode; optionally update the value function and render each step."""
    s = env.reset()
    done = False
    while not done:
        if render:
            env.render()
            display_proba(policy, s)
        a = policy.epsilon_greedy(s, epsilon)
        s_next, r, done, info = env.step(a)
        if update:
            algo.update(s, a, r, s_next, done)
        s = s_next
    if render:
        env.render()


# train for 200 episodes with epsilon-greedy exploration
for _ in range(200):
    run_episode(env, q, policy, algo, epsilon=0.1, update=True)

# one greedy rollout, rendering the board and the action probabilities at each step
run_episode(env, q, policy, algo, render=True)
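To get a rough sense of whether 200 training episodes were enough, the greedy policy can also be scored over a batch of fresh episodes by how often it reaches the goal (FrozenLake only gives a reward of 1 on success). This is a minimal sketch, not part of keras_gym; the hypothetical `evaluate` helper simply reuses the same stepping logic as `run_episode` above with epsilon=0:

In [ ]:
def evaluate(env, policy, num_episodes=100):
    """Fraction of episodes in which the greedy policy reaches the goal."""
    successes = 0
    for _ in range(num_episodes):
        s, done = env.reset(), False
        while not done:
            a = policy.epsilon_greedy(s, 0)  # epsilon=0, i.e. act greedily
            s, r, done, info = env.step(a)
        successes += int(r > 0)  # reward is 1 only when the goal tile is reached
    return successes / num_episodes

print('success rate:', evaluate(env, policy))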
In [ ]:
# each policy method returns an (action, propensity) pair when return_propensity=True
print(*policy.random(return_propensity=True))
print(*policy.greedy(s, return_propensity=True))
print(*policy.epsilon_greedy(s, epsilon=0.2, return_propensity=True))
print(*policy.thompson(s, return_propensity=True))
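The propensities above come from the policy's full action distribution, which can also be inspected directly. A quick sanity check (assuming `proba(s).p` holds the per-action probabilities, as in `display_proba` above) is that the distribution sums to one and shows which action is currently favoured:

In [ ]:
dist = policy.proba(s).p
print(dist)
print('sums to one:', np.isclose(dist.sum(), 1.0))
print('most likely action:', np.argmax(dist))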
In [ ]:
%reload keras_gym.policies.base keras_gym.policies.value_based keras_gym.policies.updateable keras_gym.policies

import numpy as np

from gym.envs.toy_text.frozen_lake import FrozenLakeEnv, UP, DOWN, LEFT, RIGHT
from keras_gym.policies import SoftmaxPolicy
from keras_gym.value_functions import LinearQ
from keras_gym.algorithms import MonteCarlo

from sklearn.linear_model import SGDClassifier

# logistic regression fitted with SGD; the 'log' loss is what makes predict_proba available
clf = SGDClassifier(learning_rate='constant', eta0=0.01, loss='log')

env = FrozenLakeEnv(is_slippery=False)
q = LinearQ(env, eta0=0.01)
policy = SoftmaxPolicy(env, clf)
algo = MonteCarlo(q)

s = env.reset()
a = policy.random()


def display_proba(p, s):
    """Print the policy's action distribution for state s, marking the most likely action(s)."""
    actions = {UP: 'up', DOWN: 'down', LEFT: 'left', RIGHT: 'right'}
    proba = p.proba(s).p
    pmax = np.max(proba)
    for a, prob in enumerate(proba):
        print("{} {:.3f} - {}".format('*' if prob == pmax else ' ', prob, actions[a]))


def run_episode(env, q, policy, algo, epsilon=0, update=False, render=False):
    """Roll out one episode; optionally update the value function and render each step."""
    s = env.reset()
    done = False
    while not done:
        if render:
            env.render()
            display_proba(policy, s)
        a = policy.epsilon_greedy(s, epsilon)
        s_next, r, done, info = env.step(a)
        if update:
            algo.update(s, a, r, s_next, done)
        s = s_next
    if render:
        env.render()


# Training loop left commented out: MonteCarlo(q) only sees the value function,
# so running it would not change the classifier behind SoftmaxPolicy.
# for _ in range(200):
#     run_episode(env, q, policy, algo, epsilon=0.1, update=True)

# one rollout with the (as yet untrained) softmax policy, rendering board and probabilities
run_episode(env, q, policy, algo, render=True)