Source code for cornac.evaluation_strategies.cross_validation

# -*- coding: utf-8 -*-

"""
@author: Aghiles Salah
"""

import numpy as np
from ..utils.util_functions import which_
from .evaluation_strategy import EvaluationStrategy
from .split import Split


class CrossValidation(EvaluationStrategy):
    """Evaluation Strategy Cross Validation.

    Parameters
    ----------
    data: scipy sparse matrix, required
        The user-item interaction matrix.

    n_folds: int, optional, default: 5
        The number of folds for cross validation.

    good_rating: float, optional, default: 1
        The minimum value that is considered to be a good rating, \
        e.g., if the ratings are in {1, ..., 5}, then good_rating = 4.

    partition: array-like, shape (n_observed_ratings,), optional, default: None
        The partition of ratings into n_folds (fold label of each rating). \
        If None, random partitioning is performed to assign each rating to a fold.
    """

    def __init__(self, data, n_folds=5, good_rating=1., partition=None,
                 data_train=None, data_validation=None, data_test=None):
        EvaluationStrategy.__init__(self, data, good_rating=good_rating, data_train=data_train,
                                    data_validation=data_validation, data_test=data_test)
        self.n_folds = n_folds
        self.partition = partition
        self.current_fold = 0
        self.current_split = None

    # Randomly assign each observed rating to one of the n_folds
    def _get_partition(self):
        n_fold_partition = np.random.choice(self.n_folds, size=self.data_nnz, replace=True, p=None)
        while len(set(n_fold_partition)) != self.n_folds:  # resample in case some fold is empty
            n_fold_partition = np.random.choice(self.n_folds, size=self.data_nnz, replace=True, p=None)
        return n_fold_partition

    # Build the train/test split for the current fold, then advance the fold counter
    def _get_next_train_test_split(self):
        index_test = np.where(self.partition == self.current_fold)[0]
        index_train = np.where(self.partition != self.current_fold)[0]
        self.current_split = Split(self.data, good_rating=self.good_rating,
                                   index_train=index_train, index_test=index_test)
        if self.current_fold < self.n_folds - 1:
            self.current_fold += 1
        else:
            self.current_fold = 0

    def evaluate(self, model, metrics):
        if self.partition is None:
            self.partition = self._get_partition()

        for fold in range(self.n_folds):
            print("fold:", self.current_fold)
            self._get_next_train_test_split()
            res_tot = self.current_split.evaluate(model=model, metrics=metrics)
            if fold == 0:
                resAvg = res_tot["ResAvg"]
                resPerU = res_tot["ResPerUser"]
            else:
                resAvg = np.vstack((resAvg, res_tot["ResAvg"]))
                resPerU = resPerU + res_tot["ResPerUser"]

        # Average the per-fold average results across the n_folds
        # (another possibility would be to average per-user instead)
        avg_resAvg = resAvg.mean(0)
        std_resAvg = resAvg.std(0, ddof=1)

        # Average the per-user results across the different folds. The last column
        # of resPerU counts, for each user, the number of folds in which that user
        # was evaluated; divide the accumulated metric values by this count for
        # every user evaluated at least once.
        evaluated_users = which_(resPerU[:, len(metrics)].todense().A1, ">", 0)
        n_processed_u = resPerU[evaluated_users, len(metrics)].shape[0]
        resPerU[evaluated_users, :] = resPerU[evaluated_users, :] / \
            resPerU[evaluated_users, len(metrics)].todense().reshape(n_processed_u, 1)

        # Temporary solution: return a single structure containing all the results
        # (consider returning an object of a Result class instead)
        res_tot = {"ResAvg": avg_resAvg[0:len(metrics)], "ResStd": std_resAvg[0:len(metrics)],
                   "ResPerUser": resPerU}
        return res_tot
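
A standalone sketch of the fold-assignment logic in _get_partition, using hypothetical sizes (10 observed ratings, 3 folds): each rating gets a fold label drawn uniformly at random, and the draw is repeated until no fold is empty.

import numpy as np

n_folds, n_ratings = 3, 10  # hypothetical sizes
partition = np.random.choice(n_folds, size=n_ratings, replace=True)
while len(set(partition)) != n_folds:  # redraw until every fold is non-empty
    partition = np.random.choice(n_folds, size=n_ratings, replace=True)
# partition[i] is the fold label of rating i, e.g. array([0, 2, 1, 0, ...])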
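
The per-user averaging at the end of evaluate can be illustrated with a small dense array. The assumption here (based on how resPerU is indexed above) is that columns 0..len(metrics)-1 accumulate metric values across folds, while the last column counts the folds in which each user was evaluated.

import numpy as np

res_per_user = np.array([[1.2, 0.9, 3.],   # evaluated in 3 folds
                         [0.0, 0.0, 0.],   # never evaluated
                         [0.5, 0.4, 2.]])  # evaluated in 2 folds
counts = res_per_user[:, -1]
seen = np.where(counts > 0)[0]
res_per_user[seen, :] = res_per_user[seen, :] / counts[seen].reshape(-1, 1)
# never-evaluated users keep their zero rows; the rest become per-fold averages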
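
A minimal usage sketch with a toy rating matrix; the model and metrics passed to evaluate are hypothetical placeholders, standing in for whatever model and metric objects Split.evaluate accepts.

import numpy as np
from scipy.sparse import csr_matrix
from cornac.evaluation_strategies.cross_validation import CrossValidation

# Toy 3-user x 4-item rating matrix with ratings in {1, ..., 5}
data = csr_matrix(np.array([[5, 0, 3, 1],
                            [0, 4, 0, 2],
                            [1, 0, 5, 4]]))

cv = CrossValidation(data, n_folds=3, good_rating=4.)
# res = cv.evaluate(model=model, metrics=metrics)  # hypothetical model/metric objects
# res["ResAvg"], res["ResStd"], res["ResPerUser"]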