In [3]:
# %load ../../scripts/frozen_lake/reinforce.py
import numpy as np
from gym.envs.toy_text.frozen_lake import FrozenLakeEnv, UP, DOWN, LEFT, RIGHT

from keras_gym.preprocessing import DefaultPreprocessor
from keras_gym.policies import LinearSoftmaxPolicy
from keras_gym.caching import MonteCarloCache


# env with preprocessing
env = FrozenLakeEnv(is_slippery=False)
env = DefaultPreprocessor(env)
actions = {LEFT: 'L', RIGHT: 'R', UP: 'U', DOWN: 'D'}

# updateable policy and Monte Carlo cache for REINFORCE
policy = LinearSoftmaxPolicy(env, lr=0.1)
cache = MonteCarloCache(gamma=0.99)


# static parameters
num_episodes = 1000
num_steps = 30


# train
for ep in range(num_episodes):
    s = env.reset()
    cache.reset()

    for t in range(num_steps):
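        # sample an action from the current (stochastic) policy and step the env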
        a = policy(s)
        s_next, r, done, info = env.step(a)

        # small penalty for staying in place (bumping into a wall),
        # which gives the agent an incentive to keep moving
        if np.array_equal(s_next, s):
            r = -0.1

        cache.add(s, a, r, done)

        if done:
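            # end of episode: pop the cached transitions together with their
            # discounted Monte Carlo returns G and apply the REINFORCE update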
            while cache:
                S, A, G = cache.pop()
                policy.batch_update(S, A, G)
            break

        s = s_next


# run the greedy policy one more time and render each step
s = env.reset()
env.render()

for t in range(num_steps):

    # print individual action probabilities
    for i, p in enumerate(policy.proba(s)):
        print("  π({:s}|s) = {:.3f}".format(actions[i], p))

    # take the greedy (highest-probability) action
    a = policy.greedy(s)
    s, r, done, info = env.step(a)

    env.render()

    if done:
        break
SFFF
FHFH
FFFH
HFFG
  π(L|s) = 0.001
  π(D|s) = 0.993
  π(R|s) = 0.005
  π(U|s) = 0.001
  (Down)
SFFF
FHFH
FFFH
HFFG
  π(L|s) = 0.002
  π(D|s) = 0.993
  π(R|s) = 0.004
  π(U|s) = 0.001
  (Down)
SFFF
FHFH
FFFH
HFFG
  π(L|s) = 0.004
  π(D|s) = 0.012
  π(R|s) = 0.982
  π(U|s) = 0.002
  (Right)
SFFF
FHFH
FFFH
HFFG
  π(L|s) = 0.003
  π(D|s) = 0.974
  π(R|s) = 0.023
  π(U|s) = 0.001
  (Down)
SFFF
FHFH
FFFH
HFFG
  π(L|s) = 0.004
  π(D|s) = 0.078
  π(R|s) = 0.916
  π(U|s) = 0.003
  (Right)
SFFF
FHFH
FFFH
HFFG
  π(L|s) = 0.003
  π(D|s) = 0.044
  π(R|s) = 0.952
  π(U|s) = 0.001
  (Right)
SFFF
FHFH
FFFH
HFFG