# -*- coding: utf-8 -*-
"""
@author: Aghiles Salah
"""
import numpy as np
import scipy.sparse as sp
from ..utils.util_functions import which_
from .evaluation_strategy import EvaluationStrategy
import sys
class Split(EvaluationStrategy):
"""Evaluation Strategy Split.
Parameters
----------
data: scipy sparse matrix, required
The user-item preference matrix.
prop_test: float, optional, default: 0.2
The propotion of the test set, \
if > 1 then it is treated as the size of the test set.
prop_validation: float, optional, default: 0.0
The propotion of the validation set, \
if > 1 then it is treated as the size of the validation set.
good_rating: float, optional, default: 1
The minimum value that is considered to be a good rating, \
e.g, if the ratings are in {1, ..., 5}, then good_rating = 4.
data_train: ..., optional, default: None
The training data.
data_validation: ..., optional, default: None
The validation data.
data_test: ..., optional, default: None
The test data.
index_train: 1d array, optional, default: None
The indexes of training data (starting from 0).
index_validation: 1d array, optional, default: None
The indexes of validation data (starting from 0).
index_test: 1d array, optional, default: None
The indexes of test data (starting from 0).
data_train_bin: ..., default: None
The binary training data.
data_validation_bin: ..., default: None
The binary validation data.
data_test_bin: ..., default: None
The binary test data.
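
    Examples
    --------
    A minimal sketch (hypothetical values; assumes the raw triplet format
    described above)::

        import numpy as np

        # one (user index, item index, rating) triplet per row
        triplets = np.array([[0, 0, 4.], [0, 1, 3.],
                             [1, 0, 5.], [1, 2, 2.]])
        split = Split(triplets, prop_test=0.25, good_rating=4.)
        split.run()  # builds the train/test sparse matrices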
"""

    def __init__(self, data, prop_test=0.2, prop_validation=0.0, good_rating=1., data_train=None, data_validation=None,
data_test=None, index_train=None, index_validation=None, index_test=None):
EvaluationStrategy.__init__(self, data, good_rating=good_rating, data_train=data_train,
data_validation=data_validation, data_test=data_test)
self.prop_test = prop_test
self.prop_validation = prop_validation
        # maybe move these attributes to the parent class
self.index_train = index_train
self.index_validation = index_validation
self.index_test = index_test
        # Additional attributes
        self.split_ran = False  # tracks whether the data has already been split

    def _train_test_split(self):
        print("Splitting the data")
n = self.data_nnz
        if self.prop_test > 1:
            print("'prop_test' > 1, treating it as the size of the test set")
            if self.prop_test > n:
                sys.exit("'prop_test' is greater than the number of ratings")
            else:
                size_train = n - int(self.prop_test)
        else:
            # e.g., n = 10 ratings and prop_test = 0.2 give size_train = 8
            size_train = int(np.round((1 - self.prop_test) * n))
        index_train = np.random.choice(n, size=size_train, replace=False)  # sample without replacement
        # index_test contains the indices that are not in index_train
        index_test = np.where(np.invert(np.in1d(np.arange(n), index_train)))[0]
        return index_train, index_test

    def run(self):
        # Build the train and test sets
if self._data_train is None or self._data_test is None:
if self.index_train is None or self.index_test is None:
self.index_train, self.index_test = self._train_test_split()
            # Prepare the training set as a sparse user-item matrix
            print("Preparing training data")
            train_data = self.data[self.index_train, :]
            id_train_users = np.array(train_data[:, 0], dtype='int64').flatten()
            id_train_items = np.array(train_data[:, 1], dtype='int64').flatten()
            ratings_train = np.array(train_data[:, 2], dtype='float64').flatten()
            # duplicate (user, item) pairs are summed by the sparse constructor
            self._data_train = sp.csc_matrix((ratings_train, (id_train_users, id_train_items)),
                                             shape=(self.data_nrows, self.data_ncols))
            del id_train_users, id_train_items, ratings_train
            self._data_train.eliminate_zeros()
            # Prepare the test set as a sparse user-item matrix
            print("Preparing test data")
            test_data = self.data[self.index_test, :]
            id_test_users = np.array(test_data[:, 0], dtype='int64').flatten()
            id_test_items = np.array(test_data[:, 1], dtype='int64').flatten()
            ratings_test = np.array(test_data[:, 2], dtype='float64').flatten()
            self._data_test = sp.csc_matrix((ratings_test, (id_test_users, id_test_items)),
                                            shape=(self.data_nrows, self.data_ncols))
            self._data_test.eliminate_zeros()
            # Binary train data, useful for statistics such as the number of ratings per user
            self._data_train_bin = self._data_train.copy()  # always copy() sparse matrices; plain assignment only binds a new name to the same object
            self._data_train_bin.data = np.full(len(self._data_train_bin.data), 1)
            # TODO: revisit this binarization process
            # Binary test data, useful for ranking and top@M evaluation:
            # keep only the good held-out ratings (>= good_rating) and set them to 1
            self._data_test_bin = self._data_test.copy()
            self._data_test_bin.data[which_(self._data_test_bin.data, '<', self.good_rating)] = 0.
            self._data_test_bin.eliminate_zeros()
            self._data_test_bin.data = np.full(len(self._data_test_bin.data), 1)
self.split_ran = True

    # This method is called from the experiment class to run an experiment
    def evaluate(self, model, metrics):
        # Separate metrics into "ranking" and "rating" groups for efficiency
ranking_metrics = metrics['ranking']
rating_metrics = metrics['rating']
if not self.split_ran:
self.run()
model.fit(self.data_train)
print("Starting evaluation")
        # This matrix will hold the evaluation results for each user
        res_per_u = sp.csc_matrix((self.data_test.shape[0],
                                   len(ranking_metrics) + len(rating_metrics) + 1))
        # Evaluation is done user by user to avoid memory errors on large datasets.
        # Loops are inefficient in Python; this part should be re-implemented in Cython or C/C++.
nb_processed_users = 0
for u in range(self.data_test.shape[0]):
            if not np.sum(self.data_test_bin[u, :]):  # users with no good held-out items are not considered in the evaluation
nb_processed_users += 1
else:
                known_items = which_(self.data_train[u, :].todense().A1, ">", 0)
                if len(ranking_metrics):
                    u_rank_list = model.rank(user_index=u, known_items=known_items)
                if len(rating_metrics):
                    u_pred_scores = model.score(user_index=u, item_indexes=None)
                # Compute the different metrics
                idx = 0
                for mt in ranking_metrics:
                    res_per_u[u, idx] = mt.compute(data_test=self.data_test_bin[u, :].todense().A1, reclist=u_rank_list)
                    idx += 1
                for mt in rating_metrics:
                    res_per_u[u, idx] = mt.compute(data_test=self.data_test[u, :].todense().A1, prediction=u_pred_scores)
                    idx += 1
                res_per_u[u, len(ranking_metrics) + len(rating_metrics)] = 1  # this column indicates whether the user has been evaluated
nb_processed_users += 1
if nb_processed_users % 1000 == 0:
print(nb_processed_users, "processed users")
        # Compute the average results over the evaluated users
        evaluated = which_(res_per_u[:, len(ranking_metrics) + len(rating_metrics)].todense().A1, ">", 0)
        average_res = res_per_u[evaluated, :].mean(0).A1  # 1d numpy array
        return average_res[0:len(ranking_metrics) + len(rating_metrics)], res_per_u
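

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the class): shows the input shapes that
# `evaluate` expects. `DummyModel` and `DummyMetric` are hypothetical
# stand-ins, stubbing only the interface actually used above (`fit`, `rank`,
# `score`, `compute`); it also assumes EvaluationStrategy derives
# data_nnz/data_nrows/data_ncols from the raw triplets.
# ---------------------------------------------------------------------------
if __name__ == "__main__":

    class DummyModel:
        """Hypothetical model exposing the interface used by `evaluate`."""

        def fit(self, data_train):
            self.n_items = data_train.shape[1]

        def rank(self, user_index, known_items):
            # rank all unseen items, here in arbitrary (index) order
            return [i for i in range(self.n_items) if i not in known_items]

        def score(self, user_index, item_indexes):
            # constant predicted score for every item
            return np.full(self.n_items, 3.)

    class DummyMetric:
        """Hypothetical ranking metric with the `compute` interface used above."""

        def compute(self, data_test, reclist):
            # fraction of the good held-out items that appear in the list
            return float(np.sum(data_test[reclist])) / max(1, int(np.sum(data_test)))

    # one (user index, item index, rating) triplet per row
    triplets = np.array([[0, 0, 4.], [0, 1, 3.], [1, 0, 5.], [1, 2, 2.]])
    split = Split(triplets, prop_test=0.25, good_rating=4.)
    metrics = {'ranking': [DummyMetric()], 'rating': []}
    avg_res, res_per_user = split.evaluate(model=DummyModel(), metrics=metrics)
    print("Average results:", avg_res)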