In [2]:
# %load ../../scripts/frozen_lake/actor_critic.py
import numpy as np
from gym.envs.toy_text.frozen_lake import FrozenLakeEnv, LEFT, RIGHT, UP, DOWN

from keras_gym.preprocessing import DefaultPreprocessor
from keras_gym.policies import LinearSoftmaxPolicy, ActorCritic
from keras_gym.value_functions import LinearV


# env with preprocessing (one-hot encoding of the discrete states)
env = FrozenLakeEnv(is_slippery=False)
env = DefaultPreprocessor(env)
actions = {LEFT: 'L', RIGHT: 'R', UP: 'U', DOWN: 'D'}


# updateable policy, state value function and their actor-critic combination
policy = LinearSoftmaxPolicy(env, lr=0.1, update_strategy='vanilla')
V = LinearV(env, lr=0.1, gamma=0.9, bootstrap_n=1)
actor_critic = ActorCritic(policy, V)


# static parameters
num_episodes = 1000
num_steps = 30


# train
for ep in range(num_episodes):
    s = env.reset()

    for t in range(num_steps):
        a = policy(s)
        s_next, r, done, info = env.step(a)

        # small penalty for bumping into a wall (state unchanged),
        # which gives the agent an incentive to keep moving
        if np.array_equal(s_next, s):
            r = -0.1

        actor_critic.update(s, a, r, done)
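        # the call above is roughly equivalent to the manual updates below:
        # bootstrap a one-step target g, feed the advantage g - V(s) to the
        # policy as its score, and move V(s) towards the target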
        # g = r + (1 - done) * V.gamma * V(s_next)
        # policy.update(s, a, g - V(s))
        # V.update(s, r, done)

        if done:
            break

        s = s_next


# run the greedy policy one more time, rendering each step
s = env.reset()
env.render()

for t in range(num_steps):

    # print the state value and the individual action probabilities
    print("  V(s) = {:.3f}".format(V(s)))
    for i, p in enumerate(policy.proba(s)):
        print("  π({:s}|s) = {:.3f}".format(actions[i], p))

    a = policy.greedy(s)
    s, r, done, info = env.step(a)
    env.render()

    if done:
        break

SFFF
FHFH
FFFH
HFFG
  V(s) = 0.530
  π(L|s) = 0.002
  π(D|s) = 0.973
  π(R|s) = 0.024
  π(U|s) = 0.002
  (Down)
SFFF
FHFH
FFFH
HFFG
  V(s) = 0.617
  π(L|s) = 0.002
  π(D|s) = 0.986
  π(R|s) = 0.010
  π(U|s) = 0.002
  (Down)
SFFF
FHFH
FFFH
HFFG
  V(s) = 0.694
  π(L|s) = 0.004
  π(D|s) = 0.032
  π(R|s) = 0.960
  π(U|s) = 0.004
  (Right)
SFFF
FHFH
FFFH
HFFG
  V(s) = 0.779
  π(L|s) = 0.008
  π(D|s) = 0.764
  π(R|s) = 0.222
  π(U|s) = 0.005
  (Down)
SFFF
FHFH
FFFH
HFFG
  V(s) = 0.876
  π(L|s) = 0.007
  π(D|s) = 0.141
  π(R|s) = 0.846
  π(U|s) = 0.007
  (Right)
SFFF
FHFH
FFFH
HFFG
  V(s) = 0.994
  π(L|s) = 0.004
  π(D|s) = 0.071
  π(R|s) = 0.921
  π(U|s) = 0.004
  (Right)
SFFF
FHFH
FFFH
HFFG
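For reference, the 'vanilla' update strategy used above amounts to the standard one-step advantage actor-critic update. Below is a minimal NumPy sketch of that update for a linear softmax policy, mirroring the commented-out manual updates in the training loop. The parameter matrices theta and w and the feature vectors phi_s are hypothetical names chosen for illustration; this is a sketch of the technique, not keras-gym's actual implementation.

import numpy as np

def softmax(z):
    e = np.exp(z - z.max())
    return e / e.sum()

def vanilla_actor_critic_update(theta, w, phi_s, phi_s_next, a, r, done,
                                gamma=0.9, lr=0.1):
    # one-step bootstrapped target and advantage (the TD error)
    g = r + (1 - done) * gamma * (w @ phi_s_next)
    adv = g - w @ phi_s

    # critic: move V(s) = w @ phi_s towards the target g
    w = w + lr * adv * phi_s

    # actor: vanilla policy gradient with score grad log pi(a|s)
    pi = softmax(theta @ phi_s)            # action probabilities
    grad_log_pi = -np.outer(pi, phi_s)     # -pi(b|s) * phi(s) for every b
    grad_log_pi[a] += phi_s                # +phi(s) for the taken action
    theta = theta + lr * adv * grad_log_pi
    return theta, w

With one-hot state features, w @ phi_s simply selects the table entry for s, so this reduces to the familiar tabular one-step actor-critic update.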