Module src.TokenLab.analytics.analyticsfunctions
Created on Mon Dec 5 19:52:42 2022
@author: stylianoskampakis
Expand source code
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 5 19:52:42 2022
@author: stylianoskampakis
"""
from typing import List, Optional

import numpy as np
import pandas as pd
import statsmodels.api as sm
def stepwise_selection(X: pd.DataFrame, y,
                       initial_list: Optional[List[str]] = None,
                       threshold_in: float = 0.001,
                       threshold_out: float = 0.05,
                       verbose: bool = True, square: bool = False):
    """Perform forward-backward feature selection based on OLS p-values.

    Each iteration runs one forward step (add the excluded feature with the
    smallest p-value, if it is below ``threshold_in``) followed by one
    backward step (drop the included feature with the largest p-value, if it
    is above ``threshold_out``). Iteration stops when neither step changes
    the selection. All candidate models are fitted with
    ``statsmodels.api.OLS`` on ``sm.add_constant(...)`` of the chosen columns.

    Arguments:
        X: pandas.DataFrame with candidate features.
        y: list-like with the target.
        initial_list: list of features to start with (column names of X).
            ``None`` (the default) starts from an empty selection. The
            argument is copied and never mutated.
        threshold_in: include a feature if its p-value < threshold_in.
        threshold_out: exclude a feature if its p-value > threshold_out.
        verbose: whether to print the sequence of inclusions and exclusions.
        square: if True, the squared value of every column of X is added as
            an extra candidate feature, named ``<column>^2``.

    Returns:
        A tuple ``(model, included)``: ``model`` is the fitted OLS results
        object for the selected columns and ``included`` is the list of
        selected feature names, in the order they were added.

    Notes:
        Always set threshold_in < threshold_out to avoid infinite looping.
        The returned model is fitted on the selected columns only, i.e.
        without the intercept that is used during selection.
        See https://en.wikipedia.org/wiki/Stepwise_regression for details.
    """
    included = [] if initial_list is None else list(initial_list)

    if square:
        # add_suffix stringifies the labels, so non-string column names work.
        X = pd.concat([X, (X ** 2).add_suffix('^2')], axis=1)

    while True:
        changed = False

        # forward step
        # Keep X's column order so ties between p-values are resolved the
        # same way on every run (set difference order is hash dependent).
        already_in = set(included)
        excluded = [col for col in X.columns if col not in already_in]
        # Explicit float dtype: an empty Series without dtype is object-typed
        # in recent pandas, on which idxmin() is not allowed.
        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            candidate = X[included + [new_column]]
            model = sm.OLS(y, sm.add_constant(candidate)).fit()
            new_pval.loc[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()  # NaN when nothing is left to add
        if best_pval < threshold_in:
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed = True
            if verbose:
                print('Add {:30} with p-value {:.6}'.format(best_feature, best_pval))

        # backward step (nothing to drop while the selection is empty)
        if included:
            model = sm.OLS(y, sm.add_constant(X[included])).fit()
            # Select the feature p-values by label rather than by position:
            # add_constant does not add an intercept when the data already
            # contains a constant column, so "skip the first row" could
            # silently exclude a real feature from the test.
            pvalues = model.pvalues.loc[included]
            worst_pval = pvalues.max()
            if worst_pval > threshold_out:
                changed = True
                worst_feature = pvalues.idxmax()
                included.remove(worst_feature)
                if verbose:
                    print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))

        if not changed:
            break

    # NOTE(review): the final model has no intercept although selection used
    # one; kept as-is because callers may rely on the returned parameters.
    model = sm.OLS(y, X.loc[:, included]).fit()
    return model, included
Functions
def stepwise_selection(X: [typing.List], y: [typing.List], initial_list: [typing.List[str]] = [], threshold_in: [float] = 0.001,
threshold_out: [float] = 0.05, verbose: [bool] = True, square: [bool] = False)
Perform a forward-backward feature selection based on p-value from statsmodels.api.OLS
Arguments
X: pandas.DataFrame with candidate features
y: list-like with the target
initial_list: list of features to start with (column names of X)
threshold_in: include a feature if its p-value < threshold_in
threshold_out: exclude a feature if its p-value > threshold_out
verbose: whether to print the sequence of inclusions and exclusions
square: if True, the squared columns of X (suffixed with '^2') are added as extra candidate features
Returns: a tuple (model, included) – the OLS model fitted on the selected features and the list of selected features
Always set threshold_in < threshold_out to avoid infinite looping.
See https://en.wikipedia.org/wiki/Stepwise_regression for the details
Expand source code
def stepwise_selection(X: pd.DataFrame, y,
                       initial_list: Optional[List[str]] = None,
                       threshold_in: float = 0.001,
                       threshold_out: float = 0.05,
                       verbose: bool = True, square: bool = False):
    """Perform forward-backward feature selection based on OLS p-values.

    Each iteration runs one forward step (add the excluded feature with the
    smallest p-value, if it is below ``threshold_in``) followed by one
    backward step (drop the included feature with the largest p-value, if it
    is above ``threshold_out``). Iteration stops when neither step changes
    the selection. All candidate models are fitted with
    ``statsmodels.api.OLS`` on ``sm.add_constant(...)`` of the chosen columns.

    Arguments:
        X: pandas.DataFrame with candidate features.
        y: list-like with the target.
        initial_list: list of features to start with (column names of X).
            ``None`` (the default) starts from an empty selection. The
            argument is copied and never mutated.
        threshold_in: include a feature if its p-value < threshold_in.
        threshold_out: exclude a feature if its p-value > threshold_out.
        verbose: whether to print the sequence of inclusions and exclusions.
        square: if True, the squared value of every column of X is added as
            an extra candidate feature, named ``<column>^2``.

    Returns:
        A tuple ``(model, included)``: ``model`` is the fitted OLS results
        object for the selected columns and ``included`` is the list of
        selected feature names, in the order they were added.

    Notes:
        Always set threshold_in < threshold_out to avoid infinite looping.
        The returned model is fitted on the selected columns only, i.e.
        without the intercept that is used during selection.
        See https://en.wikipedia.org/wiki/Stepwise_regression for details.
    """
    included = [] if initial_list is None else list(initial_list)

    if square:
        # add_suffix stringifies the labels, so non-string column names work.
        X = pd.concat([X, (X ** 2).add_suffix('^2')], axis=1)

    while True:
        changed = False

        # forward step
        # Keep X's column order so ties between p-values are resolved the
        # same way on every run (set difference order is hash dependent).
        already_in = set(included)
        excluded = [col for col in X.columns if col not in already_in]
        # Explicit float dtype: an empty Series without dtype is object-typed
        # in recent pandas, on which idxmin() is not allowed.
        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            candidate = X[included + [new_column]]
            model = sm.OLS(y, sm.add_constant(candidate)).fit()
            new_pval.loc[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()  # NaN when nothing is left to add
        if best_pval < threshold_in:
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed = True
            if verbose:
                print('Add {:30} with p-value {:.6}'.format(best_feature, best_pval))

        # backward step (nothing to drop while the selection is empty)
        if included:
            model = sm.OLS(y, sm.add_constant(X[included])).fit()
            # Select the feature p-values by label rather than by position:
            # add_constant does not add an intercept when the data already
            # contains a constant column, so "skip the first row" could
            # silently exclude a real feature from the test.
            pvalues = model.pvalues.loc[included]
            worst_pval = pvalues.max()
            if worst_pval > threshold_out:
                changed = True
                worst_feature = pvalues.idxmax()
                included.remove(worst_feature)
                if verbose:
                    print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))

        if not changed:
            break

    # NOTE(review): the final model has no intercept although selection used
    # one; kept as-is because callers may rely on the returned parameters.
    model = sm.OLS(y, X.loc[:, included]).fit()
    return model, included