Source code for dodoml.impact

from sklearn import metrics
from scipy.stats.mstats import mquantiles
import numpy as np
import pandas as pd


def _is_classication(model, Xval):
    """Return True when ``model`` can produce class probabilities.

    The check is behavioural: ``model.predict_proba`` is called on the
    first three rows of ``Xval`` and the model is treated as a classifier
    when that call succeeds.  Any ordinary exception (missing method,
    failure inside the call, ...) makes the function return False.

    Only ``Exception`` subclasses are swallowed, so ``KeyboardInterrupt``
    and ``SystemExit`` still propagate instead of being misread as
    "this is a regressor".
    """
    try:
        model.predict_proba(Xval[:3])
    except Exception:
        return False
    return True


def compute_features_impact(model, Xval, Yval, row_sample=50000, features=None):
    """Compute the feature impact of each feature.

    Feature Impact for a given column measures how much worse a model's
    score would be if we made predictions after randomly shuffling that
    column (while leaving other columns unchanged).  This technique is
    sometimes called Permutation Importance.

    The score is ``roc_auc_score`` on ``predict_proba(X)[:, 1]`` when the
    model exposes a working ``predict_proba`` (this presumably assumes a
    binary classifier), and ``r2_score`` on ``predict(X)`` otherwise.
    The impact reported for a column is ``1 - perf / perf0`` where
    ``perf0`` is the score on the unshuffled data and ``perf`` the score
    after shuffling that column.

    The calculation may take time.  To speed it up you can either:

    - Use sampling with the ``row_sample`` parameter (in number of rows,
      0 for all rows)
    - Specify the columns of interest with the ``features`` parameter
      (list of columns, None or empty for all)

    Returns a ``pandas.Series`` indexed by feature name, sorted from the
    largest impact to the smallest.
    """
    # Materialise first so that any iterable (list, tuple, pandas Index,
    # NumPy array) can be emptiness-tested; ``not index`` raises
    # ValueError for array-likes.
    features = [] if features is None else list(features)
    if not features:
        features = Xval.columns

    Xval, Yval = _row_sample(row_sample, Xval, Yval)

    if _is_classication(model, Xval):
        metric_func = metrics.roc_auc_score

        def predict_fn(X):
            return model.predict_proba(X)[:, 1]
    else:
        metric_func = metrics.r2_score
        predict_fn = model.predict

    # Baseline score on the untouched data.
    perf0 = metric_func(Yval, predict_fn(Xval))

    impacts = []
    for col in features:
        # Shuffle a single column on a copy so the other columns and the
        # caller's frame are left intact.
        xeval = Xval.copy()
        xeval[col] = np.random.permutation(xeval[col])
        perf = metric_func(Yval, predict_fn(xeval))
        impacts.append(1 - perf / perf0)

    return pd.Series(impacts, features).sort_values(ascending=False)
def compute_partial_dependence(model, Xval, features=None, row_sample=10000,
                               percentiles=(0.05, 0.95), grid_resolution=20):
    """Compute the partial dependence of each feature on the predictions.

    For a linear model, we can look at the regression coefficients to tell
    whether a feature impacts the predictions positively or negatively.
    For a more complex model, we use *partial dependence* to visualize
    this relationship.

    The calculation may take time.  To speed it up you can either:

    - Use sampling with the ``row_sample`` parameter (in number of rows,
      0 for all rows)
    - Specify the columns of interest with the ``features`` parameter
      (list of columns, None for all)

    ``percentiles`` and ``grid_resolution`` are forwarded to
    ``_partial_dependence`` and control the grid of values evaluated for
    each feature.

    This is a generator: nothing is computed until it is iterated.  It
    yields ``(feature, series)`` pairs where ``series`` is a
    ``pandas.Series`` of mean predictions indexed by the grid values.
    """
    if features is None:
        features = Xval.columns
    Xval, = _row_sample(row_sample, Xval)
    for feat in features:
        # Forward the grid settings; without this the caller's values are
        # silently replaced by the helper's defaults.
        grid, pdp = _partial_dependence(
            model, Xval, feat,
            percentiles=percentiles,
            grid_resolution=grid_resolution,
        )
        yield feat, pd.Series(pdp, grid)
def _partial_dependence(model, Xval, feature, percentiles=(0.05, 0.95),
                        grid_resolution=20):
    """Compute the partial dependence curve of one feature.

    For every value of a grid, the whole ``feature`` column of a copy of
    ``Xval`` is overwritten with that value and the mean prediction is
    recorded.  Object-dtype columns use their most frequent values as the
    grid (at most ``grid_resolution`` of them); other columns use
    ``grid_from_X``.

    Returns ``(grid_values, mean_predictions)`` as two lists of equal
    length.
    """
    X = Xval.copy()
    if X[feature].dtype == 'O':
        # TODO: deal with NA
        grid = most_freq(X[feature], grid_resolution)
    else:
        grid = grid_from_X(X[feature], percentiles, grid_resolution)

    pdp = []
    for value in grid:
        # Force every row to the same value for this feature.
        X[feature] = value
        pred = _predict(model, X)
        pdp.append(pred.mean())
    return list(grid), pdp


def _predict(model, X):
    """Return column 1 of ``predict_proba`` for classifiers, else ``predict``."""
    if _is_classication(model, X):
        return model.predict_proba(X)[:, 1]
    else:
        return model.predict(X)


def grid_from_X(x, percentiles=(0.05, 0.95), grid_resolution=100):
    """Generate a grid of points based on the ``percentiles`` of ``x``.

    Null entries of ``x`` are dropped first.  When ``x`` has fewer than
    ``grid_resolution`` distinct values, those distinct values are
    returned as the grid.  Otherwise the grid is ``grid_resolution``
    evenly spaced points between the two empirical quantiles given by
    ``percentiles`` (both end points included).

    Raises ``ValueError`` when ``percentiles`` does not hold exactly two
    values or when a value lies outside ``[0, 1]``.
    """
    x = x[~x.isnull()]
    if len(percentiles) != 2:
        raise ValueError('percentile must be tuple of len 2')
    if not all(0. <= p <= 1. for p in percentiles):
        raise ValueError('percentile values must be in [0, 1]')

    uniques = np.unique(x)
    if uniques.shape[0] < grid_resolution:
        # feature has low resolution use unique vals
        return uniques

    emp_percentiles = mquantiles(x, prob=percentiles)
    # create axis based on percentiles and grid resolution
    return np.linspace(emp_percentiles[0], emp_percentiles[1],
                       num=grid_resolution, endpoint=True)


def most_freq(x, k=10, min_freq=10):
    """Return up to ``k`` most frequent values of ``x``.

    Only values occurring strictly more than ``min_freq`` times are kept;
    the result is the index of the value counts, most frequent first.
    """
    freq = x.value_counts()
    frequent = freq[freq > min_freq].sort_values(ascending=False)
    return frequent.iloc[:k].index


def _row_sample(row_sample, *dfs):
    """Draw the same random subset of rows from every object in ``dfs``.

    When ``row_sample`` is positive and smaller than the number of rows,
    ``row_sample`` distinct row positions are drawn (without replacement)
    and applied to every input with ``.iloc``, so the outputs stay
    aligned with each other.  Otherwise the inputs are returned
    unchanged.

    Always returns a tuple with one entry per input.  Raises
    ``AssertionError`` when the inputs do not all have the same number of
    rows.
    """
    nrows = dfs[0].shape[0]
    if not all(d.shape[0] == nrows for d in dfs):
        # Raised explicitly (instead of an ``assert`` statement) so the
        # check still runs under ``python -O``; the exception type is
        # kept for backward compatibility.
        raise AssertionError('all inputs must have the same number of rows')

    if row_sample > 0 and nrows > row_sample:
        # Sample without replacement: duplicated rows would only shrink
        # the effective sample size.
        idx = np.random.choice(nrows, row_sample, replace=False)
        return tuple(d.iloc[idx] for d in dfs)
    return dfs