---
title: Bandit Policies
keywords: fastai
sidebar: home_sidebar
summary: "Implementation of bandit policies including UCB and TS."
description: "Implementation of bandit policies including UCB and TS."
nb_path: "nbs/rl/policies/rl.policies.bandit_policies.ipynb"
---
{% raw %}
{% endraw %} {% raw %}
{% endraw %} {% raw %}

class ABPolicy[source]

ABPolicy(bandit, slate_size, scores_logging) :: ABC

Abstract base class for bandit policies. A policy wraps a bandit environment, recommends a slate of `slate_size` actions at each step via `get_recommendations`, updates its estimates from observed rewards via `update`, and records matched events in `history`; `scores_logging` enables per-step logging of action scores in `scores_log`.
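The skeleton below is a minimal sketch of the interface these policies share, inferred from how they are driven in the evaluation loop further down; `SketchPolicy` and its method bodies are illustrative assumptions, not the library implementation.

import abc
import pandas as pd

class SketchPolicy(abc.ABC):
    """Illustrative skeleton mirroring how ABPolicy subclasses are used below."""
    def __init__(self, bandit, slate_size=1, scores_logging=False):
        self.bandit = bandit                  # environment exposing the action set
        self.slate_size = slate_size          # number of actions recommended per step
        self.scores_logging = scores_logging  # whether to log per-step action scores
        self.history = pd.DataFrame(columns=['movieId', 'reward'])  # matched events so far

    @abc.abstractmethod
    def get_recommendations(self):
        """Return a slate of `slate_size` actions."""

    def update(self, rewards):
        """Record observed rewards; subclasses refresh their estimates here."""
        self.history = pd.concat([self.history, rewards], ignore_index=True)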

{% endraw %} {% raw %}
{% endraw %} {% raw %}

class EpsilonGreedy[source]

EpsilonGreedy(bandit, epsilon, slate_size=1, scores_logging=False) :: ABPolicy

Epsilon-greedy policy: with probability `epsilon` it explores by recommending random actions; otherwise it exploits by recommending the actions with the highest estimated mean reward.
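A hedged sketch of the selection rule (the `epsilon_greedy_slate` helper and its arguments are hypothetical, not the class's actual internals):

import numpy as np

def epsilon_greedy_slate(value_estimates, actions, epsilon=0.1, slate_size=1, seed=0):
    """value_estimates: estimated mean reward per action, aligned with `actions`."""
    rng = np.random.default_rng(seed)
    if rng.random() < epsilon:
        # explore: recommend a uniformly random slate
        return rng.choice(actions, size=slate_size, replace=False)
    # exploit: recommend the actions with the highest estimated rewards
    top = np.argsort(value_estimates)[-slate_size:][::-1]
    return np.asarray(actions)[top]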

{% endraw %} {% raw %}
{% endraw %} {% raw %}

class UCB[source]

UCB(bandit, slate_size=1, scores_logging=False) :: ABPolicy

Upper Confidence Bound (UCB) policy: scores each action by its estimated mean reward plus an exploration bonus that shrinks as the action is selected more often, and recommends the highest-scoring actions.
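A sketch of the classic UCB1 score, assuming the class follows the textbook rule (the exact bonus constant used by this implementation is not documented here):

import numpy as np

def ucb1_scores(means, counts, t):
    """means: estimated mean reward per action; counts: times tried (> 0); t: time step."""
    return means + np.sqrt(2 * np.log(t) / counts)

# the slate is the `slate_size` actions with the highest scores;
# untried actions are typically recommended first so every count is > 0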

{% endraw %} {% raw %}
{% endraw %} {% raw %}

class TS[source]

TS(bandit, slate_size=1, scores_logging=False) :: ABPolicy

Thompson Sampling (TS) policy: maintains a posterior distribution over each action's expected reward, draws one sample per action at every step, and recommends the actions with the highest samples.
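A sketch of Beta-Bernoulli Thompson sampling for illustration; ratings in this notebook are on a 1-5 scale, so the class may use a different reward model (the `thompson_slate` helper is hypothetical):

import numpy as np

def thompson_slate(successes, failures, actions, slate_size=1, seed=0):
    """Draw one sample per action from its Beta posterior and pick the top draws."""
    rng = np.random.default_rng(seed)
    samples = rng.beta(np.asarray(successes) + 1, np.asarray(failures) + 1)  # Beta(1, 1) prior
    top = np.argsort(samples)[-slate_size:][::-1]
    return np.asarray(actions)[top]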

{% endraw %} {% raw %}
{% endraw %}

Evaluating bandits offline with the replay method on the MovieLens dataset

In replay evaluation, the policy steps through a stream of logged events; a recommendation is credited with a reward only when it matches the movie actually recorded in the log, and non-matching events are discarded.
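As a toy illustration of the matching rule (hypothetical data, mirroring the logic of `ReplayBandit.get_rewards` below):

{% raw %}
import pandas as pd

# three logged events; suppose the policy recommended movies 1 and 3
logged = pd.DataFrame({'movieId': [1, 2, 3], 'reward': [4, 5, 3]})
recommendations = [1, 3]

# only events whose logged action appears in the slate produce feedback
matched = logged[logged['movieId'].isin(recommendations)]
print(matched)  # rows for movies 1 and 3; the event for movie 2 is discarded
{% endraw %}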

{% raw %}
!wget -q --show-progress http://files.grouplens.org/datasets/movielens/ml-100k.zip
!unzip -q ml-100k.zip

import numpy as np
import pandas as pd

# load only the columns we need and rename to match the rest of the notebook
rating_df = pd.read_csv('ml-100k/u.data', sep='\t', header=None,
                        names=['user_id', 'movie_id', 'rating', 'timestamp'],
                        usecols=['movie_id', 'rating'])
rating_df = rating_df.rename(columns={'movie_id': 'movieId'})
ml-100k.zip         100%[===================>]   4.70M  5.76MB/s    in 0.8s    
{% endraw %} {% raw %}
def get_data(data, num_ratings, num_movies):
    """ Make each movieId/action uniformly distributed """
    # filter out movies with fewer than `num_ratings` ratings
    movies = data.groupby('movieId').agg({'rating': 'count'})
    if num_movies is not None:
        movies_to_keep = movies[(movies['rating'] >= num_ratings)].sample(
            n=num_movies, random_state=12).index
    else:
        movies_to_keep = movies[(movies['rating'] >= num_ratings)].index
    data = data[data['movieId'].isin(movies_to_keep)]
    # take a random sample of size `num_ratings` for each movie
    data = data.groupby('movieId').sample(n=num_ratings, random_state=42)
    # shuffle rows to randomize data stream
    data = data.sample(frac=1, random_state=42)
    # reset index to create pseudo-timestamp index
    data = data.reset_index(drop=True)
    return data
{% endraw %} {% raw %}
NUM_RATINGS = 30        # with full dataset  -> 10000
                        # with small dataset -> 30
NUM_MOVIES = None
SLATE_SIZE = 5
BATCH_SIZE = 100        # with replay eval   -> 100
                        # with simulated env -> 1
STREAM_LENGTH = 150     # with full dataset  -> 50000
                        # with small dataset -> 150
MODE = 'replay'         # 'replay' or 'sim'
SCORES_LOG = False      # logging movie scores or not
{% endraw %} {% raw %}
logged_events = get_data(rating_df, NUM_RATINGS, NUM_MOVIES)
{% endraw %} {% raw %}
class ReplayBandit:
    """ Implementation of a bandit problem with replay evaluation """
    def __init__(self, logged_events, batch_size=1):
        self.events = logged_events.rename(columns={'rating': 'reward'})
        self.actions = np.sort(logged_events['movieId'].unique())
        self.batch_size = batch_size
        self.stream_length = len(self.events) // batch_size

    def get_rewards(self, recommendations, n_event):
        # slice the next batch of logged events
        idx = n_event * self.batch_size
        events = self.events.iloc[idx:idx + self.batch_size]
        # keep only events that match the recommendation slate
        rewards = events[events['movieId'].isin(recommendations)]
        return rewards
{% endraw %} {% raw %}
bandit = ReplayBandit(logged_events, BATCH_SIZE)
STREAM_LENGTH = bandit.stream_length

print("NUMBER OF MOVIES/ACTIONS: {}".format(len(bandit.actions)))

# instantiate policies
policies = [
    EpsilonGreedy(bandit, epsilon=0.1, slate_size=SLATE_SIZE, scores_logging=SCORES_LOG),
    UCB(bandit, slate_size=SLATE_SIZE, scores_logging=SCORES_LOG),
    TS(bandit, slate_size=SLATE_SIZE, scores_logging=SCORES_LOG),
    ]

# evaluate policies: each policy replays the same logged event stream
from tqdm.notebook import tqdm

for policy in policies:
    print("POLICY: {}".format(policy.name))
    for i in tqdm(range(STREAM_LENGTH), ascii=True):
        recs = policy.get_recommendations()
        rewards = bandit.get_rewards(recs, i)
        policy.update(rewards)
    print("HISTORY LENGTH: {}".format(len(policy.history)))
    print()
NUMBER OF MOVIES/ACTIONS: 806
POLICY: 0.1-Greedy
HISTORY LENGTH: 147

POLICY: UCB
HISTORY LENGTH: 157

POLICY: Thompson Sampling
HISTORY LENGTH: 150

{% endraw %} {% raw %}
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter
import seaborn as sns


def plot_rewards(*policies, title=None):
    fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(11,5))
    fig.suptitle(title)
    for policy in policies:
        # get cumulative rewards
        cumsum_rewards = policy.history.reward.cumsum()
        # get average rewards
        timesteps = np.arange(len(cumsum_rewards)) + 1
        avg_rewards = cumsum_rewards / timesteps
        # plots
        ax1.plot(timesteps, avg_rewards, label=policy.name)
        ax2.plot(timesteps, cumsum_rewards, label=policy.name)
    #
    ax1.yaxis.set_major_formatter(FormatStrFormatter('%.1f'))
    ax1.set_xlabel('time step')
    ax1.set_ylabel('average reward')
    ax1.legend(loc='lower right')
    #
    ax2.yaxis.set_major_formatter(FormatStrFormatter('%d'))
    ax2.set_xlabel('time step')
    ax2.set_ylabel('cumulative reward')
    ax2.legend(loc='lower right')
    #
    plt.tight_layout()
    plt.show()


def plot_action_values(*policies):
    fig, axs = plt.subplots(nrows=1, ncols=len(policies), figsize=(15,5), squeeze=False)
    fig.suptitle("Action scores")
    axs = axs.ravel()
    for i, policy in enumerate(policies):
        cbar = (i == len(axs) - 1)  # draw the colorbar only on the last panel
        sns.heatmap(policy.scores_log.T, ax=axs[i], vmin=2.5, vmax=5, cmap='hot',
                    cbar=cbar, xticklabels=1000, yticklabels=False)
        axs[i].set_xlabel('time step')
        axs[i].title.set_text(policy.name)
    axs[0].set_ylabel('movieId')
    plt.tight_layout()
    plt.show()
{% endraw %} {% raw %}
plot_rewards(*policies)
if SCORES_LOG:
    plot_action_values(*policies)
{% endraw %}