---
title: Bandit Policies
keywords: fastai
sidebar: home_sidebar
summary: "Implementation of bandit policies including UCB and TS."
description: "Implementation of bandit policies including UCB and TS."
nb_path: "nbs/rl/policies/rl.policies.bandit_policies.ipynb"
---
```python
!wget -q --show-progress http://files.grouplens.org/datasets/movielens/ml-100k.zip
!unzip -q ml-100k.zip
```

```python
import numpy as np
import pandas as pd

# load only the movie id and rating columns from the MovieLens 100k ratings file
rating_df = pd.read_csv('ml-100k/u.data', sep='\t', header=None,
                        names=['user_id', 'movie_id', 'rating', 'timestamp'],
                        usecols=['movie_id', 'rating'])
rating_df.columns = ['movieId', 'rating']
```
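MovieLens 100k contains exactly 100,000 ratings, so a quick shape check can confirm the load (an illustrative check, not part of the original notebook):

```python
print(rating_df.shape)  # expected: (100000, 2) -> 100k ratings, two kept columns
```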
```python
def get_data(data, num_ratings, num_movies):
    """Make each movieId/action uniformly distributed."""
    # filter out movies with fewer than `num_ratings` ratings
    movies = data.groupby('movieId').agg({'rating': 'count'})
    if num_movies is not None:
        movies_to_keep = movies[movies['rating'] >= num_ratings].sample(
            n=num_movies, random_state=12).index
    else:
        movies_to_keep = movies[movies['rating'] >= num_ratings].index
    data = data[data['movieId'].isin(movies_to_keep)]
    # take a random sample of size `num_ratings` for each movie
    data = data.groupby('movieId').sample(n=num_ratings, random_state=42)
    # shuffle rows to randomize the data stream
    data = data.sample(frac=1, random_state=42)
    # reset index to create a pseudo-timestamp index
    data = data.reset_index(drop=True)
    return data
```
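Since `get_data` samples exactly `num_ratings` events per surviving movie, every action ends up equally represented in the replayed stream. A quick illustrative check (using the `rating_df` loaded above):

```python
# every remaining movie should contribute exactly `num_ratings` events
sample = get_data(rating_df, num_ratings=30, num_movies=None)
assert (sample.groupby('movieId').size() == 30).all()
```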
```python
NUM_RATINGS = 30       # full dataset -> 10000, small dataset -> 30
NUM_MOVIES = None
SLATE_SIZE = 5
BATCH_SIZE = 100       # replay eval -> 100, simulated env -> 1
STREAM_LENGTH = 150    # full dataset -> 50000, small dataset -> 150 (recomputed from the bandit below)
MODE = 'replay'        # 'replay' or 'sim'
SCORES_LOG = False     # whether to log movie scores

logged_events = get_data(rating_df, NUM_RATINGS, NUM_MOVIES)
```
```python
class ReplayBandit:
    """Implementation of a bandit problem with replay evaluation."""

    def __init__(self, logged_events, batch_size=1):
        self.events = logged_events.rename(columns={'rating': 'reward'})
        self.actions = np.sort(logged_events['movieId'].unique())
        self.batch_size = batch_size
        self.stream_length = len(self.events) // batch_size

    def get_rewards(self, recommendations, n_event):
        # slice the next batch of logged events from the stream
        idx = n_event * self.batch_size
        events = self.events.iloc[idx:idx + self.batch_size]
        # keep only events whose movieId appears in the recommendation slate
        rewards = events[events['movieId'].isin(recommendations)]
        return rewards
```
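`ReplayBandit` follows the replay ("replayer") approach to offline bandit evaluation (cf. Li et al., 2011): at each step, only the logged events whose `movieId` happens to appear in the recommended slate count as observed rewards; the rest of the batch is discarded. A small illustrative sanity check (the `demo_*` names are ours, not part of the library):

```python
# with a batch of 100 logged events and a 5-movie slate, only the
# events matching the slate are "replayed" as observed rewards
demo_bandit = ReplayBandit(logged_events, batch_size=100)
demo_slate = demo_bandit.actions[:5]                 # pretend these were recommended
demo_rewards = demo_bandit.get_rewards(demo_slate, n_event=0)
assert len(demo_rewards) <= 100                      # matched events only
print(demo_rewards.head())
```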
```python
bandit = ReplayBandit(logged_events, BATCH_SIZE)
STREAM_LENGTH = bandit.stream_length
print("NUMBER OF MOVIES/ACTIONS: {}".format(len(bandit.actions)))
```
```python
# instantiate policies
policies = [
    EpsilonGreedy(bandit, epsilon=0.1, slate_size=SLATE_SIZE, scores_logging=SCORES_LOG),
    UCB(bandit, slate_size=SLATE_SIZE, scores_logging=SCORES_LOG),
    TS(bandit, slate_size=SLATE_SIZE, scores_logging=SCORES_LOG),
]
```
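`EpsilonGreedy`, `UCB`, and `TS` are the library's policy classes (defined elsewhere in `rl.policies.bandit_policies`); the evaluation loop below relies only on each policy exposing `get_recommendations()`, `update(rewards)`, a `name`, and a reward `history`. As a rough sketch of that interface, a minimal UCB1-style policy might look like the following (a simplified stand-in, not the library's actual `UCB` implementation):

```python
class SimpleUCB:
    """Minimal UCB1-style policy sketch matching the evaluation-loop interface."""

    def __init__(self, bandit, slate_size=5):
        self.name = 'SimpleUCB'
        self.slate_size = slate_size
        self.counts = pd.Series(0.0, index=bandit.actions)  # pulls per movie
        self.sums = pd.Series(0.0, index=bandit.actions)    # summed rewards per movie
        self.t = 0
        self.history = pd.DataFrame(columns=['movieId', 'reward'])

    def get_recommendations(self):
        self.t += 1
        n = self.counts.replace(0, np.nan)                  # unseen arms -> NaN
        # UCB1 score: empirical mean + exploration bonus; unseen arms score +inf
        scores = (self.sums / n
                  + np.sqrt(2 * np.log(max(self.t, 2)) / n)).fillna(np.inf)
        return scores.nlargest(self.slate_size).index.to_numpy()

    def update(self, rewards):
        # `rewards` is the DataFrame returned by ReplayBandit.get_rewards
        for _, row in rewards.iterrows():
            self.counts[row['movieId']] += 1
            self.sums[row['movieId']] += row['reward']
        self.history = pd.concat([self.history, rewards[['movieId', 'reward']]],
                                 ignore_index=True)
```

A policy shaped like this could be dropped into the `policies` list above unchanged.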
```python
from tqdm.notebook import tqdm

# evaluate policies
for policy in policies:
    print("POLICY: {}".format(policy.name))
    for i in tqdm(range(STREAM_LENGTH), ascii=True):
        recs = policy.get_recommendations()
        rewards = bandit.get_rewards(recs, i)
        policy.update(rewards)
    print("HISTORY LENGTH: {}".format(len(policy.history)))
    print()
```
```python
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import FormatStrFormatter

def plot_rewards(*policies, title=None):
    fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(11, 5))
    fig.suptitle(title)
    for policy in policies:
        # cumulative rewards
        cumsum_rewards = policy.history.reward.cumsum()
        # average rewards
        timesteps = np.arange(len(cumsum_rewards)) + 1
        avg_rewards = cumsum_rewards / timesteps
        # plots
        ax1.plot(timesteps, avg_rewards, label=policy.name)
        ax2.plot(timesteps, cumsum_rewards, label=policy.name)
    # left panel: average reward
    ax1.yaxis.set_major_formatter(FormatStrFormatter('%.1f'))
    ax1.set_xlabel('time step')
    ax1.set_ylabel('average reward')
    ax1.legend(loc='lower right')
    # right panel: cumulative reward
    ax2.yaxis.set_major_formatter(FormatStrFormatter('%d'))
    ax2.set_xlabel('time step')
    ax2.set_ylabel('cumulative reward')
    ax2.legend(loc='lower right')
    plt.tight_layout()
    plt.show()
```
```python
def plot_action_values(*policies):
    fig, axs = plt.subplots(nrows=1, ncols=len(policies), figsize=(15, 5), squeeze=False)
    fig.suptitle("Action scores")
    axs = axs.ravel()
    for i, policy in enumerate(policies):
        cbar = (i == len(axs) - 1)  # draw the colorbar only on the last panel
        sns.heatmap(policy.scores_log.T, ax=axs[i], vmin=2.5, vmax=5, cmap='hot',
                    cbar=cbar, xticklabels=1000, yticklabels=False)
        axs[i].set_xlabel('time step')
        axs[i].title.set_text(policy.name)
    axs[0].set_ylabel('movieId')
    plt.tight_layout()
    plt.show()
```
```python
plot_rewards(*policies)

if SCORES_LOG:
    plot_action_values(*policies)
```