"""hybparsimony for Python is a package for searching accurate parsimonious models by combining feature selection (FS), model
hyperparameter optimization (HO), and parsimonious model selection (PMS) based on a separate cost and complexity evaluation.
To improve the search for parsimony, the hybrid method combines GA mechanisms such as selection, crossover and mutation within a PSO-based optimization algorithm that includes a strategy in which the best position of each particle (thus also the best position of each neighborhood) is calculated taking into account not only the goodness-of-fit, but also the parsimony principle.
In hybparsimony, the percentage of variables to be replaced with GA at each iteration $t$ is selected by a decreasing exponential function:
$pcrossover = \max(0.80 \cdot e^{-\Gamma \cdot t}, 0.10)$, which is adjusted by a $\Gamma$ parameter (by default $\Gamma$ is set to $0.50$). Thus, in the first iterations parsimony is promoted by the GA mechanisms, i.e., a high percentage of particles is replaced by crossover at the beginning. Subsequently, PSO optimization becomes more relevant for improving model accuracy. This differs from other hybrid methods in which crossover is applied between the best individual positions of the particles, or from approaches in which the worst particles are also replaced by new particles, but at extreme positions.
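As a minimal illustration (not part of the public API; the variable names below are only for this sketch), the replacement schedule can be reproduced with NumPy:

.. code-block:: python

    import numpy as np

    gamma, maxiter = 0.50, 250   # default Gamma and a typical iteration budget
    t = np.arange(maxiter)
    # Fraction of worst particles replaced by GA crossover at iteration t
    pcrossover = np.maximum(0.80 * np.exp(-gamma * t), 0.10)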
Experiments show that, in general, and with a suitable $\Gamma$, hybparsimony obtains better, more parsimonious and more robust models than other methods. It also reduces the number of iterations and, consequently, the computational effort.
References
----------
Divasón, J., Pernia-Espinoza, A., Martinez-de-Pison, F.J. (2022).
New Hybrid Methodology Based on Particle Swarm Optimization with Genetic Algorithms to Improve
the Search of Parsimonious Models in High-Dimensional Databases.
In: García Bringas, P., et al.
Hybrid Artificial Intelligent Systems. HAIS 2022.
Lecture Notes in Computer Science, vol 13469. Springer, Cham.
https://doi.org/10.1007/978-3-031-15471-3_29
"""
import copy
import multiprocessing
import random
from multiprocessing import Pool
from functools import partial
from hybparsimony.util import Population, order, getFitness, parsimony_monitor, parsimony_summary, models
from hybparsimony.util.fitness import fitness_for_parallel
from hybparsimony.util.hyb_aux import _rerank, _crossover, _population
from hybparsimony.lhs import randomLHS
import math
import numpy as np
import pandas as pd
import time
from numpy.random import multinomial
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from hybparsimony.util.models import check_algorithm
class HYBparsimony(object):
    def __init__(self,
fitness = None,
features = None,
algorithm = None,
custom_eval_fun=None,
cv=None,
scoring=None,
type_ini_pop="improvedLHS",
npart = 15,
maxiter=250,
early_stop=None,
Lambda=1.0,
c1 = 1/2 + math.log(2),
c2 = 1/2 + math.log(2),
IW_max=0.9,
IW_min=0.4,
K=3,
pmutation = 0.1,
#pcrossover_elitists = None, # an array or a float (number between 0 and 1).
#pcrossover = None, # an array or a float (number between 0 and 1), % of worst individuals to substitute from crossover.
gamma_crossover = 0.5,
tol = 1e-4,
rerank_error=1e-09,
keep_history = False,
feat_thres = 0.90,
best_global_thres = 1,
particles_to_delete=None,
seed_ini = 1234,
not_muted = 3,
feat_mut_thres = 0.1,
n_jobs=1,
verbose=0):
r"""
        A class for searching parsimonious models by feature selection and parameter tuning with
        a hybrid method based on genetic algorithms and particle swarm optimization.
Parameters
----------
        fitness : function, optional
            The fitness function: any function that takes as input a chromosome combining the model parameters to tune
            and the features to be selected, and returns a numerical vector with the values validation_cost,
            testing_cost and model_complexity, together with the trained model.
features : list of str, default=None
The name of features/columns in the dataset. If None, it extracts the names if X is a dataframe, otherwise it generates a list of the positions according to the value of X.shape[1].
        algorithm : string or dict, default=None
            A string with the name of the algorithm to optimize (as defined in 'hybparsimony.util.models.py'), or a dictionary
            with the following properties: {'estimator': any machine learning algorithm compatible with scikit-learn,
            'complexity': the function that measures the complexity of the model, and the hyperparameters of the algorithm,
            which can be fixed values (defined by Population.CONSTANT) or a search range $[min, max]$
            defined by {"range":(min, max), "type": Population.X}, where the type can take three values:
            integer (Population.INTEGER), float (Population.FLOAT) or powers of 10 (Population.POWER),
            i.e. $10^{[min, max]}$}. If algorithm is None, hybparsimony uses 'LogisticRegression()' for
            classification problems and 'Ridge' for regression problems. A sketch of a custom dictionary is shown at the
            end of the Examples section below.
        custom_eval_fun : function, default=None
            An evaluation function similar to scikit-learn's 'cross_val_score()'. If None, hybparsimony uses
            'cross_val_score(cv=5)'.
cv: int, cross-validation generator or an iterable, default=None
Determines the cross-validation splitting strategy (see scikit-learn's 'cross_val_score()' function)
        scoring : str, callable, list, tuple, or dict, default=None
            Strategy to evaluate the performance of the cross-validated model on the test set. If None, a 5-fold
            cross-validation is used with 'neg_mean_squared_error' for regression problems, 'neg_log_loss' for binary
            classification problems, and 'f1_macro' for multiclass problems (see scikit-learn's 'cross_val_score()' function).
        type_ini_pop : str, {'randomLHS', 'geneticLHS', 'improvedLHS', 'maximinLHS', 'optimumLHS', 'random'}, optional
            Method used to create the initial population with the `_population` function. Possible values: `randomLHS`, `geneticLHS`,
            `improvedLHS`, `maximinLHS`, `optimumLHS`, `random`. The first five methods correspond to different Latin hypercube
            sampling strategies for the initial sampling. By default it is set to `improvedLHS`.
        npart : int, default=15
            Number of particles in the swarm (population size).
        maxiter : int, default=250
            The maximum number of iterations to run before the HYB process is halted.
        early_stop : int, optional
            The number of consecutive generations without an improvement of at least 'tol'
            in the 'best_fitness' value before the search process is stopped.
        tol : float, default=1e-4
            Value defining a significant difference between 'best_fitness' values across iterations for early stopping.
        rerank_error : float, default=1e-09
            When a value is provided, a second reranking process according to the model complexities is called by the `parsimony_rerank` function.
            Its primary objective is to select individuals with high validation cost while maintaining the robustness of a parsimonious model.
            This function switches the position of two models if the first one is more complex than the second and no significant difference
            is found between their fitness values in terms of cost. Thus, if the absolute difference between the validation costs is
            lower than `rerank_error`, they are considered similar.
gamma_crossover : float, default=0.50
In hybparsimony, the percentage of variables to be replaced with GA at each iteration $t$ is selected by a decreasing exponential function
that is adjusted by a 'gamma_crossover' parameter (see references for more info).
Lambda : float, default=1.0
PSO parameter (see References)
c1 : float, default=1/2 + math.log(2)
PSO parameter (see References)
c2 : float, default=1/2 + math.log(2)
PSO parameter (see References)
IW_max : float, default=0.9
PSO parameter (see References)
IW_min : float, default=0.4
PSO parameter (see References)
        K : int, default=3
PSO parameter (see References)
best_global_thres : float, default=1.0
Percentage of particles that will be influenced by the best global of their neighbourhoods
(otherwise, they will be influenced by the best of the iteration in each neighbourhood)
        particles_to_delete : list of int, default=None
            Number of particles to delete in each iteration. If the length of 'particles_to_delete' is lower than the number
            of iterations, the array is completed with zeros up to the number of iterations.
        pmutation : float, default=0.1
            The probability of mutation in a parent chromosome. Usually mutation occurs with a small probability. By default it is set to `0.10`.
        feat_mut_thres : float, default=0.1
            Probability of a muted `feature-chromosome` being set to one. By default it is set to `0.10`.
        feat_thres : float, default=0.90
            Proportion of selected features in the initial population. A high percentage of selected features is recommended for
            the first generations.
        keep_history : bool, default=False
            If True, the results of all particles in each iteration are kept in the 'history' attribute.
seed_ini : int, optional
An integer value containing the random number generator state.
        n_jobs : int, default=1
            Number of cores used to parallelize the evaluation of the swarm. It should be used with caution because the
            algorithms themselves, or the 'cross_val_score()' function used by default to evaluate individuals, may also
            parallelize their internal processes.
        verbose : int, default=0
            The level of messages to show. Possible values: 0=silent mode, 1=monitor level, 2=debug level.
Attributes
----------
minutes_total : float
Total elapsed time (in minutes).
        history : list
A list with the results of the population of all iterations.'history[iter]' returns a DataFrame
with the results of iteration 'iter'.
best_model
The best model in the whole optimization process.
best_score : float
The validation score of the best model.
best_complexity : float
The complexity of the best model.
        selected_features : list
            The names of the selected features for the best model.
        selected_features_boolean : list
            The selected features for the best model in Boolean form.
best_model_conf : Chromosome
The parameters and features of the best model in the whole optimization process.
Examples
--------
Usage example for a regression model using the sklearn 'diabetes' dataset
.. highlight:: python
.. code-block:: python
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.datasets import load_diabetes
from sklearn.preprocessing import StandardScaler
            from hybparsimony import HYBparsimony
# Load 'diabetes' dataset
diabetes = load_diabetes()
X, y = diabetes.data, diabetes.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=1234)
            # Standardize X and y
scaler_X = StandardScaler()
X_train = scaler_X.fit_transform(X_train)
X_test = scaler_X.transform(X_test)
scaler_y = StandardScaler()
y_train = scaler_y.fit_transform(y_train.reshape(-1,1)).flatten()
y_test = scaler_y.transform(y_test.reshape(-1,1)).flatten()
algo = 'KernelRidge'
            HYBparsimony_model = HYBparsimony(algorithm=algo,
features=diabetes.feature_names,
rerank_error=0.001,
verbose=1)
# Search the best hyperparameters and features
# (increasing 'time_limit' to improve RMSE with high consuming algorithms)
HYBparsimony_model.fit(X_train, y_train, time_limit=0.20)
.. code-block:: text
Running iteration 0
Best model -> Score = -0.510786 Complexity = 9,017,405,352.5
Iter = 0 -> MeanVal = -0.88274 ValBest = -0.510786 ComplexBest = 9,017,405,352.5 Time(min) = 0.005858
Running iteration 1
Best model -> Score = -0.499005 Complexity = 8,000,032,783.88
Iter = 1 -> MeanVal = -0.659969 ValBest = -0.499005 ComplexBest = 8,000,032,783.88 Time(min) = 0.004452
...
...
...
Running iteration 34
Best model -> Score = -0.489468 Complexity = 8,000,002,255.68
Iter = 34 -> MeanVal = -0.527314 ValBest = -0.489468 ComplexBest = 8,000,002,255.68 Time(min) = 0.007533
Running iteration 35
Best model -> Score = -0.489457 Complexity = 8,000,002,199.12
Iter = 35 -> MeanVal = -0.526294 ValBest = -0.489457 ComplexBest = 8,000,002,199.12 Time(min) = 0.006522
Time limit reached. Stopped.
Usage example for a classification model using the 'breast_cancer' dataset
.. highlight:: python
.. code-block:: python
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import log_loss
            from hybparsimony import HYBparsimony
# load 'breast_cancer' dataset
breast_cancer = load_breast_cancer()
X, y = breast_cancer.data, breast_cancer.target
print(X.shape)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=1)
            # Standardize X and y (some algorithms require it)
scaler_X = StandardScaler()
X_train = scaler_X.fit_transform(X_train)
X_test = scaler_X.transform(X_test)
            HYBparsimony_model = HYBparsimony(features=breast_cancer.feature_names,
rerank_error=0.005,
verbose=1)
HYBparsimony_model.fit(X_train, y_train, time_limit=0.50)
# Extract probs of class==1
preds = HYBparsimony_model.predict_proba(X_test)[:,1]
print(f'\n\nBest Model = {HYBparsimony_model.best_model}')
print(f'Selected features:{HYBparsimony_model.selected_features}')
print(f'Complexity = {round(HYBparsimony_model.best_complexity, 2):,}')
print(f'5-CV logloss = {-round(HYBparsimony_model.best_score,6)}')
print(f'logloss test = {round(log_loss(y_test, preds),6)}')
.. code-block:: text
(569, 30)
Detected a binary-class problem. Using 'neg_log_loss' as default scoring function.
Running iteration 0
Best model -> Score = -0.091519 Complexity = 29,000,000,005.11
Iter = 0 -> MeanVal = -0.297448 ValBest = -0.091519 ComplexBest = 29,000,000,005.11 Time(min) = 0.006501
Running iteration 1
Best model -> Score = -0.085673 Complexity = 27,000,000,009.97
Iter = 1 -> MeanVal = -0.117216 ValBest = -0.085673 ComplexBest = 27,000,000,009.97 Time(min) = 0.004273
...
...
Running iteration 102
Best model -> Score = -0.064557 Complexity = 11,000,000,039.47
Iter = 102 -> MeanVal = -0.076314 ValBest = -0.066261 ComplexBest = 9,000,000,047.25 Time(min) = 0.004769
Running iteration 103
Best model -> Score = -0.064557 Complexity = 11,000,000,039.47
Iter = 103 -> MeanVal = -0.086243 ValBest = -0.064995 ComplexBest = 11,000,000,031.2 Time(min) = 0.004591
Time limit reached. Stopped.
Best Model = LogisticRegression(C=5.92705799354935)
Selected features:['mean texture' 'mean concave points' 'radius error' 'area error'
'compactness error' 'worst radius' 'worst perimeter' 'worst area'
'worst smoothness' 'worst concavity' 'worst symmetry']
Complexity = 11,000,000,039.47
5-CV logloss = 0.064557
logloss test = 0.076254
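        Sketch of a custom 'algorithm' dictionary (illustrative only: the complexity function, its signature and the
        hyperparameter range below are assumptions for this example, not values provided by the package)
        .. highlight:: python
        .. code-block:: python

            from sklearn.neighbors import KNeighborsRegressor
            from hybparsimony import HYBparsimony
            from hybparsimony.util import Population

            # Hypothetical complexity measure: first minimize the number of selected
            # features, then the number of neighbours.
            def knn_complexity(model, nFeatures, **kwargs):
                return nFeatures * 1E9 + model.n_neighbors

            KNN_dict = {"estimator": KNeighborsRegressor,
                        "complexity": knn_complexity,
                        "n_neighbors": {"range": (1, 15), "type": Population.INTEGER}}

            # A custom evaluation function could be passed in a similar way, e.g.
            # custom_eval_fun=partial(cross_val_score, cv=10, scoring='r2')
            HYBparsimony_model = HYBparsimony(algorithm=KNN_dict, rerank_error=0.001)
            # Reusing X_train and y_train from the regression example above
            HYBparsimony_model.fit(X_train, y_train, time_limit=0.20)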
"""
self.type_ini_pop = type_ini_pop
self.fitness = fitness
self.features = features
self.npart = npart
self.maxiter = maxiter
self.early_stop = maxiter if not early_stop else early_stop
self.Lambda = Lambda
self.c1 = c1
self.c2 = c2
self.IW_max = IW_max
self.IW_min = IW_min
self.K = K
self.tol = tol
self.rerank_error = rerank_error
self.verbose = verbose
self.seed_ini = seed_ini
if pmutation is None:
self.pmutation = 0.0
else:
self.pmutation = pmutation
self.not_muted = not_muted
self.feat_mut_thres = feat_mut_thres
self.feat_thres = feat_thres
self.minutes_total = 0
self.history = list()
self.keep_history = keep_history
# Percentage of particles that will be influenced by the best global of their neighbourhoods
# (otherwise, they will be influenced by the best of the iteration in each neighbourhood)
self.best_global_thres = best_global_thres
# if pcrossover is not None:
# if isinstance(pcrossover,(list,np.ndarray)): #If it is a list or an np array
# if len(pcrossover) < maxiter:
# # If the length of the pcrossover array is lower than the iterations, the array is completed with zeros
# # up to the number of iterations.
# self.pcrossover = np.zeros(maxiter).astype(float)
# self.pcrossover[:len(pcrossover)] = pcrossover[:]
# else:
# self.pcrossover = pcrossover
# else:
# # If the parameter was a float, then an array is built in which each position contains that float.
# self.pcrossover = np.full(maxiter, pcrossover, dtype=float)
# # Ensure all numbers are in the range [0,1]
# self.pcrossover[self.pcrossover < 0] = 0
# self.pcrossover[self.pcrossover > 1] = 1
# else:
# self.pcrossover = None
        # Crossover gamma (self.pcrossover is built from this gamma).
self.pcrossover = None
if gamma_crossover != 0.0:
perc_malos = 0.80 * np.exp(-gamma_crossover * np.arange(self.maxiter))
perc_malos[perc_malos < 0.10] = 0.10
self.pcrossover = perc_malos
self.n_jobs=n_jobs
if self.n_jobs < 1:
            self.n_jobs = multiprocessing.cpu_count() # With -1 (or any value below 1), all cores are used (note that cross-validation may parallelize even further).
if particles_to_delete is not None and len(particles_to_delete) < maxiter:
# If the length of the particles to delete is lower than the iterations, the array is completed with zeros
# up to the number of iterations.
self.particles_to_delete = np.zeros(maxiter).astype(int)
self.particles_to_delete[:len(particles_to_delete)] = particles_to_delete[:]
else:
self.particles_to_delete = particles_to_delete
if self.seed_ini:
np.random.seed(self.seed_ini)
# Custom cross val score
self.custom_eval_fun = custom_eval_fun
self._cv=cv
self._scoring=scoring
self.algorithm = algorithm
    def fit(self, X, y, time_limit=None):
r"""
        Performs the search of accurate parsimonious models by combining feature selection, hyperparameter optimization,
        and parsimonious model selection (PMS) with data matrix (X) and targets (y).
Parameters
----------
X : pandas.DataFrame or numpy.array
Training vector.
y : pandas.DataFrame or numpy.array
Target vector relative to X.
time_limit : float, default=None
Maximum time to perform the optimization process in minutes.
"""
#############################################
# SOME LOGIC ON PARAMETERS' INITIALIZATION
#############################################
# Detect type of problem and define default scoring function.
def check_classification(y):
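            # Heuristic: integer target dtypes are treated as a classification problem; anything else as regression.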
return np.issubdtype(y.dtype, np.integer)
if self._scoring is not None:
default_scoring = self._scoring
if self.verbose > 0:
print(f"Using '{default_scoring}' as scoring function.")
elif check_classification(y):
if len(np.unique(y))==2:
default_scoring = 'neg_log_loss'
if self.verbose > 0:
print("Detected a binary-class problem. Using 'neg_log_loss' as default scoring function.")
else:
default_scoring = 'f1_macro'
if self.verbose > 0:
print("Detected a multi-class problem. Using 'f1_macro' as default scoring function.")
else:
default_scoring = 'neg_mean_squared_error'
if self.verbose > 0:
print("Detected a regression problem. Using 'neg_mean_squared_error' as default scoring function.")
def default_cv_score(estimator, X, y):
return cross_val_score(estimator, X, y, cv=5, scoring=default_scoring)
# Create custom_eval_fun
if self._cv is not None and self.custom_eval_fun is None:
if self._scoring is not None:
self.custom_eval_fun = partial(cross_val_score, cv=self._cv, scoring=self._scoring)
            else: # By default:
self.custom_eval_fun = partial(cross_val_score, cv=self._cv, scoring=default_scoring)
elif self.custom_eval_fun is None:
if self._scoring is not None:
self.custom_eval_fun = partial(cross_val_score, scoring=self._scoring)
else:
self.custom_eval_fun = default_cv_score
# Select and check algorithm from dictionary
self.algorithm = check_algorithm(self.algorithm, check_classification(y))
self.params = {k: self.algorithm[k] for k in self.algorithm.keys() if k not in ["estimator", "complexity"]}
# Fitness function
if self.n_jobs == 1:
self.fitness = getFitness(self.algorithm['estimator'], self.algorithm['complexity'],
self.custom_eval_fun)
else: # Parallelization
self.fitness = partial(fitness_for_parallel, self.algorithm['estimator'],
self.algorithm['complexity'], self.custom_eval_fun)
if self.n_jobs > 1:
pool = Pool(self.n_jobs)
        if self.features is None: # If no features (names of the columns to optimize) are given, use all of them
if "pandas" in str(type(X)):
                self.features = X.columns # If X is a DataFrame, take the column names.
            else: # Otherwise it is a numpy array, so use numbers from 0 to the number of columns
num_rows, num_cols = X.shape
self.features = list(range(num_cols))
#############################################
# THE HYBRID METHOD
#############################################
start_time = time.time()
if self.seed_ini:
np.random.seed(self.seed_ini)
population = Population(self.params, columns=self.features)
population.population = _population(population, seed_ini=self.seed_ini, popSize=self.npart,
type_ini_pop=self.type_ini_pop) # To create the initial population
# Update population to satisfy the feat_thres
population.update_to_feat_thres(self.npart, self.feat_thres)
nfs = len(population.colsnames)
nparams = len(population._params)
self._summary = np.empty((self.maxiter, 6 * 2,))
self._summary[:] = np.nan
self.best_score = np.NINF
self.best_complexity = np.Inf
maxFitness = np.Inf
best_fit_particle = np.empty(self.npart)
best_fit_particle[:] = np.NINF
best_pos_particle = np.empty(shape=(self.npart, nparams + nfs))
best_complexity_particle = np.empty(self.npart) # Complexities
best_complexity_particle[:] = np.Inf
range_numbers = population._max - population._min
vmax = self.Lambda * range_numbers
range_as_pd = pd.Series(range_numbers)
lower_as_pd = pd.Series(population._min)
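        # Initial velocities: half the difference between a random LHS sample scaled to the search ranges
        # and the initial positions of the particles.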
v_norm = randomLHS(self.npart, nparams + nfs)
v_norm = pd.DataFrame(v_norm)
v_norm = v_norm.apply(lambda row: row * range_as_pd, axis=1)
v_norm = v_norm.apply(lambda row: row + lower_as_pd, axis=1)
velocity = (v_norm - population._pop) / 2
velocity = velocity.to_numpy()
self.bestSolList = list()
self.best_models_list = list()
self.best_models_conf_list = list()
# Variables to store the best global positions, fitnessval and complexity of each particle
bestGlobalPopulation = copy.deepcopy(population._pop)
bestGlobalFitnessVal = np.empty(self.npart)
bestGlobalFitnessVal[:] = np.NINF
bestGlobalComplexity = np.empty(self.npart)
bestGlobalComplexity[:] = np.inf
#Variable that tracks the deleted particles (their number in the table)
deleted_particles = []
valid_particles = [x for x in range(self.npart) if
x not in deleted_particles] # valid particles (numbers in the table)
fitnessval = np.empty(self.npart)
fitnessval[:] = np.nan
fitnesstst = np.empty(self.npart)
fitnesstst[:] = np.nan
complexity = np.empty(self.npart)
complexity[:] = np.nan
_models = np.empty(self.npart).astype(object)
_models[:] = None
update_neighbourhoods = False
crossover_applied = False
for iter in range(self.maxiter):
if self.verbose > 0:
print("Running iteration", iter)
tic = time.time()
#####################################################
# Compute solutions
#####################################################
            if self.n_jobs == 1: # If there is NO parallelism (default behaviour)
for t in valid_particles:
c = population.getChromosome(t)
if np.sum(c.columns) > 0:
fit = self.fitness(c, X=X, y=y)
fitnessval[t] = fit[0][0]
# fitnesstst[t] = fit[0][1]
complexity[t] = fit[0][1]
_models[t] = fit[1]
else:
list_params = []
                for t in valid_particles: # All particles are always trained (except those removed from the process)
c = population.getChromosome(t)
if np.sum(c.columns) > 0:
list_params.append([c,X,y])
                results = pool.starmap(self.fitness, list_params) # Parallel evaluation happens here.
                # Iterate over the results
for fit, t in zip(results, valid_particles):
fitnessval[t] = fit[0][0]
#fitnesstst[t] = fit[0][1]
complexity[t] = fit[0][1]
_models[t] = fit[1]
if self.seed_ini:
np.random.seed(self.seed_ini * iter)
# Sort by the Fitness Value
# ----------------------------
sort = order(fitnessval, kind='heapsort', decreasing=True, na_last=True)
PopSorted = population[sort, :].copy()
FitnessValSorted = fitnessval[sort]
#FitnessTstSorted = fitnesstst[sort]
ComplexitySorted = complexity[sort]
_modelsSorted = _models[sort]
if self.verbose == 2:
print("\nStep 1. Fitness sorted")
print(np.c_[FitnessValSorted, ComplexitySorted, population.population][:10, :])
# input("Press [enter] to continue")
if self.rerank_error != 0.0:
ord_rerank = _rerank(FitnessValSorted, ComplexitySorted, self.npart, self.rerank_error)
PopSorted = PopSorted[ord_rerank]
FitnessValSorted = FitnessValSorted[ord_rerank]
# FitnessTstSorted = FitnessTstSorted[ord_rerank]
ComplexitySorted = ComplexitySorted[ord_rerank]
_modelsSorted = _modelsSorted[ord_rerank]
if self.verbose == 2:
print("\nStep 2. Fitness reranked")
print(np.c_[FitnessValSorted, ComplexitySorted, population.population][:10, :])
# input("Press [enter] to continue")
# Keep results
# ---------------
self._summary[iter, :] = parsimony_summary(FitnessValSorted, ComplexitySorted)
# Keep Best Solution of this iteration
# ------------------
bestfitnessVal = FitnessValSorted[0]
#bestfitnessTst = FitnessTstSorted[0]
bestcomplexity = ComplexitySorted[0]
bestIterSolution = np.concatenate([[bestfitnessVal, bestcomplexity], PopSorted[0]])
self.bestSolList.append(bestIterSolution)
self.best_models_list.append(_modelsSorted[0])
self.best_models_conf_list.append(PopSorted[0])
# Keep Global Best Model
# ------------------
            # The best_score of the whole process. It is updated if we find a better score, or an equal one with lower complexity.
if bestfitnessVal > self.best_score or (bestfitnessVal == self.best_score and bestcomplexity < self.best_complexity):
self.best_score = bestfitnessVal
self.best_complexity = bestcomplexity
self.bestsolution = bestIterSolution
self.solution_best_score = np.r_[self.best_score, bestfitnessVal, bestcomplexity]
self.best_model = _modelsSorted[0]
self.best_model_conf = PopSorted[0].copy()
# print("ACTUALIZO", self.best_model.C, self.best_model_conf)
# if self.best_model_conf[0] != self.best_model.C:
# print("problemas")
# print("MODELS", _modelsSorted)
# print("POPSORTED", PopSorted)
# print("fitnessvalsorted", FitnessValSorted)
# if self.verbose > 0:
# print("Current best score:", self.best_score)
# Update global best positions, fitness and complexity of each particle (with NO rerank)
for i in range(self.npart):
if fitnessval[i] > bestGlobalFitnessVal[i] or (fitnessval[i] == bestGlobalFitnessVal[i] and complexity[i] < bestGlobalComplexity[i]):
bestGlobalPopulation[i,:] = population._pop[i,:]
bestGlobalFitnessVal[i] = fitnessval[i]
bestGlobalComplexity[i] = complexity[i]
# Keep elapsed time in minutes
# ----------------------------
tac = time.time()
elapsed_gen = (tac - tic) / 60.0
            self.minutes_total += elapsed_gen
# Keep this generation into the History list (with no order)
# ------------------------------------------
if self.keep_history:
self.history.append(
pd.DataFrame(np.c_[population.population, fitnessval, fitnesstst, complexity],
columns=list(population._params.keys()) + population.colsnames + ["fitnessval", "fitnesstst",
"complexity"]))
# Call to 'monitor' function
# --------------------------
if self.verbose > 0:
parsimony_monitor(iter, self.best_score, self.best_complexity,
fitnessval, bestfitnessVal, bestcomplexity, elapsed_gen)
if self.verbose == 2:
print("\nStep 3. Fitness results")
print(np.c_[FitnessValSorted, ComplexitySorted, population.population][:10, :])
# input("Press [enter] to continue")
#print((population._pop))
#print((population._pop[sort])[ord_rerank])
#print((fitnessval[sort])[ord_rerank])
# Exit?
# -----
best_val_cost = self._summary[:, 0][~np.isnan(self._summary[:, 0])]
if bestfitnessVal >= maxFitness:
break
if iter == self.maxiter:
break
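            # Early stopping check: count the iterations elapsed since the earliest iteration whose best value
            # is within 'tol' of the overall best; if this count reaches 'early_stop', stop the search.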
if (len(best_val_cost) - (np.min(np.arange(len(best_val_cost))[best_val_cost >= (np.max(best_val_cost) - self.tol)]))) >= self.early_stop:
if self.verbose > 0:
print("Early stopping reached. Stopped.")
break
if time_limit is not None and time_limit < (time.time() - start_time)/60:
if self.verbose > 0:
print("Time limit reached. Stopped.")
break
####################################################
# Deletion step (disabled by default)
####################################################
if self.particles_to_delete is not None and self.particles_to_delete[iter]>0:
# particles_to_delete[iter] contains the number of particles to be deleted in that iteration
                # We delete the worst particles found so far (globally, not only in this iteration).
sort1 = order(bestGlobalFitnessVal, kind='heapsort', decreasing=True, na_last=True)
sort_not_deleted = [x for x in sort1 if x not in deleted_particles]
deleted_particles = deleted_particles + sort_not_deleted[-self.particles_to_delete[iter]:]
valid_particles = [x for x in range(self.npart) if x not in deleted_particles]
update_neighbourhoods = True
#####################################################
# Generation of the Neighbourhoods
#####################################################
# If there is no improvement in the current iteration, the neighbourhood is changed. It also changes if particles have been deleted.
if FitnessValSorted[0] <= self.best_score or update_neighbourhoods:
update_neighbourhoods = False
nb = list()
for i in range(self.npart):
# Each particle informs at random K particles (the same particle may be chosen several times), and informs itself.
                    # The parameter K is usually set to 3. It means that each particle informs at least one particle (itself), and at most K+1 particles (including itself)
# Thus, a random integer vector of K elements between 0 and npart-1 is created and we append the particle.
# Duplicates are removed and this represents the neighbourhood.
if i not in deleted_particles:
#nb.append(np.unique(np.append(np.random.randint(low=0, high=self.npart - 1, size=self.K), i)))
indices = np.random.randint(low=0, high=len(valid_particles), size=self.K) # High is not included
random_particles = [valid_particles[index] for index in indices]
nb.append(np.unique(np.append(random_particles, i)))
else:
nb.append(np.nan)
# Create an array to decide if a particle must be influenced by the best global of the neighbourhoods or the best of the iteration
nb_global = np.random.choice(a=[True, False], size=(self.npart,), p=[self.best_global_thres, 1-self.best_global_thres])
###########################################
# Update particular global bests (best position of the particle in the whole process, wrt to rerank)
###########################################
# We have to take care to avoid problems with rerank:
# EXAMPLE (rerank = 0.08):
# SCORE 0.80 0.85 0.90
# COST 10 100 200
# The best score wrt to rerank should be 0.85. But if we get 0.80 with cost 10 in the next
# iteration, that would be chosen. This is wrong, since we would be moving to worse scores. The
# rerank must be applied wrt the best global score of each particle.
            for t in [p for p in range(self.npart) if np.isfinite(fitnessval[p])]: # Only take particles with a finite fitnessval (neither NaN nor inf)
# Three cases:
# (1) If the best improves much, then update.
# (2) If the best does not improve much, but the complexity is lower, then update.
# (3) Otherwise, rerank criterion, but "consuming the rerank" wrt to the global best.
if (fitnessval[t] > best_fit_particle[t] + self.rerank_error) \
or (fitnessval[t] >= best_fit_particle[t] and complexity[t] < best_complexity_particle[t]) \
or (best_fit_particle[t] - fitnessval[t]) <= self.rerank_error - (bestGlobalFitnessVal[t] - best_fit_particle[t]) and complexity[t] < best_complexity_particle[t]:
best_fit_particle[t] = fitnessval[t] # Update the particular best fit of that particle.
best_pos_particle[t, :] = population._pop[t, :] # Update the particular best pos of that particle.
best_complexity_particle[t] = complexity[t] # Update the complexity (could be more complex if the fitnessval[t] is better)
###########################################
# Compute Local bests in the Neighbourhoods
###########################################
best_pos_neighbourhood = np.empty(shape=(self.npart, nparams + nfs)) # Matrix in which i-th row contains the best particle of the i-th neighbourhood.
best_fit_neighbourhood = np.empty(self.npart) # Array that contains in position i the score of the best particle of the i-th neighbourhood.
best_fit_neighbourhood[:] = np.Inf
for i in valid_particles:
if nb_global[i]: # If the global best of the neighbourhood must be selected
particles_positions = nb[i] # Positions of the neighbourhood particles (number within population)
local_fits = best_fit_particle[particles_positions]
local_complexity = best_complexity_particle[particles_positions]
local_sort = order(local_fits, kind='heapsort', decreasing=True, na_last=True)
local_fits_sorted = local_fits[local_sort]
local_complexity_sorted = local_complexity[local_sort]
local_sort_rerank = _rerank(local_fits_sorted, local_complexity_sorted, len(local_fits),
self.rerank_error, preserve_best=True)
max_local_fit_pos = particles_positions[local_sort[local_sort_rerank[0]]]
best_pos_neighbourhood[i, :] = best_pos_particle[max_local_fit_pos, :]
#best_fit_neighbourhood[i] = best_fit_particle[max_local_fit_pos]
else: # The best of the neighbourhood in the current iteration
particles_positions = nb[i] # Positions of the neighbourhood particles (number within population)
local_fits = fitnessval[particles_positions]
local_complexity = complexity[particles_positions]
local_sort = order(local_fits, kind='heapsort', decreasing=True, na_last=True)
local_fits_sorted = local_fits[local_sort]
local_complexity_sorted = local_complexity[local_sort]
local_sort_rerank = _rerank(local_fits_sorted,local_complexity_sorted, len(local_fits), self.rerank_error, preserve_best=False)
max_local_fit_pos = particles_positions[local_sort[local_sort_rerank[0]]]
best_pos_neighbourhood[i, :] = population._pop[max_local_fit_pos, :]
#best_fit_neighbourhood[i] = fitnessval[max_local_fit_pos]
######################
# Crossover step
######################
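            # GA step: nonlinear-rank selection, crossover of the selected pairs, and replacement of a
            # pcrossover[iter] fraction of the worst particles with the resulting offspring.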
indexes_worst_particles = []
if self.pcrossover is not None and self.pcrossover[iter] > 0:
######################
# Selection substep
######################
# Nonlinear-rank selection
# Michalewicz (1996) Genetic Algorithms + Data Structures = Evolution Programs. p. 60
q = 0.25
rank = list(range(self.npart))
prob = np.array(list(map(lambda x: q * (1 - q) ** (x), rank)))
                prob = prob / prob.sum() # prob holds the selection probabilities: the first element has the highest probability, and so on.
                # These probabilities are now used in sel to select individuals, taking into account that the indexes of the best ones are in sort[ord_rerank]
                # (the population is not sorted, so rank cannot be used directly as in GA).
sel = np.random.choice(sort[ord_rerank], size=self.npart, replace=True, p=list(map(lambda x: np.min(
np.ma.masked_array(np.array([max(0, x), 1]), np.isnan(np.array([max(0, x), 1])))), prob)))
                # Change the population to select the individuals that will reproduce. There may be repeated rows in population,
                # so an individual can be crossed several times.
                population_selection = copy.deepcopy(population) # deepcopy because it is an array of arrays.
population_selection._pop = population_selection._pop[sel]
fitnessval_selection = fitnessval[sel].copy()
#fitnesstst_selection = fitnesstst[sel].copy()
complexity_selection = complexity[sel].copy()
velocity_selection = velocity[sel].copy()
######################
# Crossover substep
######################
nmating = int(np.floor(self.npart / 2))
mating = np.random.choice(list(range(2 * nmating)), size=(2 * nmating), replace=False).reshape((nmating, 2))
                # Crossover of the selected population
population_crossover = copy.deepcopy(population_selection)
fitnessval_crossover = fitnessval_selection.copy()
#fitnesstst_crossover = fitnesstst_selection.copy()
complexity_crossover = complexity_selection.copy()
velocity_crossover = velocity_selection.copy()
for i in range(nmating):
parents_indexes = mating[i,]
                    # Perform the crossover on the new population
_crossover(population_crossover, velocity_crossover, fitnessval_crossover, complexity_crossover,
parents_indexes, children_indexes=parents_indexes)
                # Now take the original population and replace the fraction of worst individuals with random individuals from the crossover population.
npart_worst = max(1, int(np.floor(self.npart * self.pcrossover[iter])))
indexes_worst_particles = sort[ord_rerank[-npart_worst:]]
                # Random array of size npart with numbers between 0 and npart - 1 (a shuffle would also work).
                # No repetition here (but there may be repeated parents, because population_crossover may contain repeated rows):
random_array = np.random.choice(range(self.npart), self.npart, replace=False)
                for i in indexes_worst_particles: # This ensures that the elitists are not touched; only the worst particles are replaced.
population._pop[i] = population_crossover._pop[random_array[i]]
fitnessval[i] = fitnessval_crossover[random_array[i]]
#fitnesstst[i] = fitnesstst_crossover[random_array[i]]
complexity[i] = complexity_crossover[random_array[i]]
                    velocity[i] = velocity_crossover[random_array[i]]
#####################################################
# Update positions and velocities following SPSO 2007
#####################################################
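            # SPSO 2007 velocity rule applied below: v = IW*v + c1*U1*(best_pos_particle - x) + c2*U2*(best_pos_neighbourhood - x),
            # followed by the position update x = x + v (only for particles not substituted in the crossover step).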
            # Only the particles that have not been substituted need to be updated.
indexes_except_substituted_particles = [i for i in range(self.npart) if i not in indexes_worst_particles]
            U1 = np.random.uniform(low=0, high=1,
                                   size=(self.npart, nparams + nfs)) # Called r1 and r2 in the paper
            U2 = np.random.uniform(low=0, high=1,
                                   size=(self.npart, nparams + nfs)) # Called r1 and r2 in the paper
IW = self.IW_max - (self.IW_max - self.IW_min) * iter / self.maxiter
# Two first terms of the velocity
velocity[indexes_except_substituted_particles,:] = IW * velocity[indexes_except_substituted_particles,:] \
+ U1[indexes_except_substituted_particles,:] * self.c1 * (best_pos_particle[indexes_except_substituted_particles,:] - population._pop[indexes_except_substituted_particles,:])
velocity[indexes_except_substituted_particles,:] = velocity[indexes_except_substituted_particles,:] + self.c2 * U2[indexes_except_substituted_particles,:] * (
best_pos_neighbourhood[indexes_except_substituted_particles,:] - population._pop[indexes_except_substituted_particles,:])
# Limit velocity to vmax to avoid explosion
for j in range(nparams + nfs):
vmax_pos = np.where(abs(velocity[:,j]) > vmax[j])[0]
for i in vmax_pos:
velocity[i, j] = math.copysign(1, velocity[i, j]) * abs(vmax[j])
##############################
# Update positions of FEATURES
##############################
for nf in range(nparams,nparams + nfs): # We must move to the features (the particles contain first hyper-parameters and then features)
for p in indexes_except_substituted_particles:
population._pop[p,nf] = population._pop[p,nf] + velocity[p,nf] # Update positions for the model positions (x = x + v)
# To ensure that the interval [0,1] is preserved
if population._pop[p, nf] > 1.0:
population._pop[p, nf] = 1.0
if population._pop[p,nf] < 0.0:
population._pop[p, nf] = 0.0
######################
# Mutation of FEATURES
# ####################
if self.pmutation > 0:
                # Uniform random mutation (except the 'not_muted' best individuals)
nfts_to_mute = round(self.pmutation * nfs * self.npart)
if nfts_to_mute < 1:
nfts_to_mute = 1
indexes_to_mute = sort[ord_rerank[self.not_muted:]]
for _ in range(nfts_to_mute):
i = np.random.choice(indexes_to_mute)
j = np.random.randint(0, nfs - 1)
population._pop[i, nparams + j] = population.random_gen[j](j, feat_mut_thres=self.feat_mut_thres)
fitnessval[i] = np.nan
fitnesstst[i] = np.nan
complexity[i] = np.nan
# if self.pmutation > 0:
# rnd_mut = np.random.uniform(size = (self.npart, nfs))
# for p in range(self.npart):
# for nf in range(nparams,nparams + nfs):
# if rnd_mut[p, nf - nparams] < self.pmutation:
# if population._pop[p, nf] < 0.5:
# population._pop[p, nf] = np.random.uniform(low=0.5, high=1.0)
# else:
# population._pop[p, nf] = np.random.uniform(low=0.0, high=0.5)
#######################################################
# Update positions of model HYPERPARAMETERS (x = x + v)
#######################################################
for j in range(nparams):
population._pop[indexes_except_substituted_particles, j] = \
population._pop[indexes_except_substituted_particles, j] + velocity[indexes_except_substituted_particles, j]
################################################################################################
# Confinement Method for SPSO 2007 - absorbing2007 (hydroPSO) - Deterministic Back (Clerc, 2007)
################################################################################################
for j in range(nparams):
out_max = (population._pop[:, j] > population._max[j])
out_min = (population._pop[:, j] < population._min[j])
population._pop[out_max, j] = population._max[j]
population._pop[out_min, j] = population._min[j]
velocity[out_max, j] = 0
velocity[out_min, j] = 0
            # MAKE SURE THAT AT LEAST ONE FEATURE IS SELECTED IN EACH PARTICLE
            # TODO: This should be done elsewhere!
            for i in range(self.npart): # the particles contain first the hyper-parameters and then the features
aux = population._pop[i, nparams:]
if (aux<0.5).all():
feature_to_change = random.randint(nparams, nparams + nfs - 1)
new_value = random.uniform(0.5, 1)
population._pop[i, feature_to_change] = new_value
if self.n_jobs>1:
pool.close()
        # Store the selected features
        aux = self.best_model_conf[nparams:nparams + nfs]
        self.selected_features_boolean = (aux >= 0.5) # Stored as a list of booleans indicating whether each feature is selected
        self.selected_features = np.array(self.features)[self.selected_features_boolean] # Store the feature names
if self.verbose == 2:
print("Selected features:", self.selected_features)
return self.best_model
    def predict(self, X):
r"""
Predict result for samples in X.
Parameters
----------
X : numpy.array or pandas.DataFrame
Samples.
Returns
-------
numpy.array
A `numpy.array` with predictions.
"""
num_rows, num_cols = X.shape
        if num_cols == len(self.selected_features): # X already contains only the columns that should be used
            preds = self.best_model.predict(X)
        else: # Otherwise, the full X was passed and only the selected columns must be kept.
            if isinstance(X, pd.DataFrame): # If it is a DataFrame, columns can be accessed by name/boolean mask
                X_selected_features = X[self.selected_features].values
            else: # If it is a numpy array, keep only the appropriate columns
                X_selected_features = X[:,self.selected_features_boolean] # All rows but only the appropriate columns.
preds = self.best_model.predict(X_selected_features)
return preds
    def predict_proba(self, X):
r"""
Predict probabilities for each class and sample in X (only for classification models).
Parameters
----------
X : numpy.array or pandas.DataFrame
Samples.
Returns
-------
numpy.array
A `numpy.array` with predictions. Returns the probability of the sample for each class in the model.
"""
num_rows, num_cols = X.shape
        if num_cols == len(self.selected_features): # X already contains only the columns that should be used
            preds = self.best_model.predict_proba(X)
        else: # Otherwise, the full X was passed and only the selected columns must be kept.
            if isinstance(X, pd.DataFrame): # If it is a DataFrame, columns can be accessed by name/boolean mask
                X_selected_features = X[self.selected_features].values
            else: # If it is a numpy array, keep only the appropriate columns
                X_selected_features = X[:,self.selected_features_boolean] # All rows but only the appropriate columns.
preds = self.best_model.predict_proba(X_selected_features)
return preds