Module imodels.tests.notebooks.imodels_demo
# ---
# jupyter:
#   jupytext:
#     formats: ipynb,../imodels/tests/notebooks//py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.11.1
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
# ---
# %% pycharm={"is_executing": false}
# %load_ext autoreload
# %autoreload 2
import os
import matplotlib.pyplot as plt
import numpy as np
np.random.seed(13)
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree, DecisionTreeClassifier
from sklearn import metrics
from scipy.io.arff import loadarff
# installable with: `pip install imodels`
from imodels import SLIMRegressor, BayesianRuleListClassifier, RuleFitRegressor, GreedyRuleListClassifier
from imodels import SLIMClassifier, OneRClassifier, BoostedRulesClassifier
# change working directory to project root
if os.getcwd().split('/')[-1] != 'imodels':
    os.chdir('..')
def get_ames_data():
    housing = fetch_openml(name="house_prices", as_frame=True)
    housing_target = housing['target'].values
    housing_data_numeric = housing['data'].select_dtypes('number').drop(columns=['Id']).dropna(axis=1)
    feature_names = housing_data_numeric.columns.values
    X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
        housing_data_numeric.values, housing_target, test_size=0.75)
    return X_train_reg, X_test_reg, y_train_reg, y_test_reg, feature_names
def get_diabetes_data():
    '''load (classification) data on diabetes
    '''
    data = loadarff("imodels/tests/test_data/diabetes.arff")
    data_np = np.array(list(map(lambda x: np.array(list(x)), data[0])))
    X = data_np[:, :-1].astype('float32')
    y_text = data_np[:, -1].astype('str')
    y = (y_text == 'tested_positive').astype(int)  # labels 0-1
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.75)  # split
    feature_names = ["#Pregnant", "Glucose concentration test", "Blood pressure(mmHg)",
                     "Triceps skin fold thickness(mm)",
                     "2-Hour serum insulin (mu U/ml)", "Body mass index",
                     "Diabetes pedigree function", "Age (years)"]
    return X_train, X_test, y_train, y_test, feature_names
def viz_classification_preds(probs, y_test):
    '''look at prediction breakdown
    '''
    plt.subplot(121)
    plt.hist(probs[:, 1][y_test == 0], label='Class 0')
    plt.hist(probs[:, 1][y_test == 1], label='Class 1', alpha=0.8)
    plt.ylabel('Count')
    plt.xlabel('Predicted probability of class 1')
    plt.legend()

    plt.subplot(122)
    plt.title('ROC curve')
    # use the predicted probability of class 1 (not hard labels) so the ROC curve has full resolution
    fpr, tpr, thresholds = metrics.roc_curve(y_test, probs[:, 1])
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.plot(fpr, tpr)
    plt.tight_layout()
    plt.show()
if __name__ == '__main__':
    # load some data
    X_train_reg, X_test_reg, y_train_reg, y_test_reg, feat_names_reg = get_ames_data()
    X_train, X_test, y_train, y_test, feat_names = get_diabetes_data()
    print('regression data', X_train_reg.shape, 'classification data', X_train.shape)
# %% [markdown]
# # rule sets
# Rule sets are models that create a set of (potentially overlapping) rules.
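# %% [markdown]
# As a toy illustration of overlapping rules (hypothetical thresholds below, not learned by
# any model in this demo): a rule set combines the votes of rules that may cover overlapping
# subsets of the data, e.g. by averaging them.
# %%
# two hand-written rules on the diabetes features loaded above
rule_glucose = X_train[:, 1] > 140   # high glucose concentration (hypothetical cutoff)
rule_bmi = X_train[:, 5] > 30        # high body mass index (hypothetical cutoff)
rule_set_score = (rule_glucose.astype(int) + rule_bmi.astype(int)) / 2  # averaged rule votes
print('fraction of training points covered by both rules:', np.mean(rule_glucose & rule_bmi).round(2))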
# %% [markdown]
# ### rulefit
# %% pycharm={"is_executing": false}
# fit a rulefit model
rulefit = RuleFitRegressor(max_rules=10)
rulefit.fit(X_train_reg, y_train_reg, feature_names=feat_names_reg)
# get test performance
preds = rulefit.predict(X_test_reg)
print(f'test r2: {metrics.r2_score(y_test_reg, preds):0.2f}')
# inspect and print the rules
rules = rulefit.get_rules()
rules = rules[rules.coef != 0].sort_values("support", ascending=False)
# 'rule' is how the feature is constructed
# 'coef' is its weight in the final linear model
# 'support' is the fraction of points it applies to
rules[['rule', 'coef', 'support']].style.background_gradient(cmap='viridis')
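# %% [markdown]
# For intuition, RuleFit-style models build candidate rules from a tree ensemble and then fit a
# sparse linear model over those rules. The cell below is a rough sketch of that idea using plain
# sklearn pieces (tree and penalty settings are arbitrary choices, not the imodels implementation).
# %%
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import Lasso
gb = GradientBoostingRegressor(n_estimators=20, max_depth=3, random_state=0)
gb.fit(X_train_reg, y_train_reg)
leaves = gb.apply(X_train_reg)  # leaf index reached by each sample in each tree (one "rule" per leaf)
rule_indicators = OneHotEncoder(handle_unknown='ignore').fit_transform(leaves)
sparse_lm = Lasso(alpha=100.0, max_iter=10000).fit(rule_indicators, y_train_reg)  # L1 keeps few rules
print('nonzero rule coefficients:', int(np.sum(sparse_lm.coef_ != 0)))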
# %% [markdown]
# ### boosted stumps
# %%
# fit boosted stumps
brc = BoostedRulesClassifier(n_estimators=10)
brc.fit(X_train, y_train, feature_names=feat_names)
print(brc)
# look at performance
probs = brc.predict_proba(X_test)
viz_classification_preds(probs, y_test)
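# %% [markdown]
# For reference, boosted stumps are essentially AdaBoost run over depth-1 trees. sklearn's
# AdaBoostClassifier boosts depth-1 decision trees by default, so the rough analogue below can
# serve as a sanity check (a sketch for comparison, not the imodels model).
# %%
from sklearn.ensemble import AdaBoostClassifier
ada = AdaBoostClassifier(n_estimators=10)  # default base learner is a depth-1 tree (a stump)
ada.fit(X_train, y_train)
print(f'sklearn AdaBoost test accuracy: {ada.score(X_test, y_test):0.2f}')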
# %% [markdown]
# # rule lists
# %% [markdown]
# ### greedy rule lists
# **like a decision tree that only ever splits going left**
# %% pycharm={"is_executing": false}
# fit a greedy rule list
m = GreedyRuleListClassifier()
m.fit(X_train, y=y_train, feature_names=feat_names) # stores into m.rules_
probs = m.predict_proba(X_test)
# print the list
print(m)
# look at prediction breakdown
viz_classification_preds(probs, y_test)
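# %% [markdown]
# A rule list reads as an if / elif cascade: each rule either fires and predicts, or passes the
# sample down to the next rule. The cell below applies a made-up two-rule list by hand
# (thresholds are illustrative only, not taken from the fitted model m).
# %%
def apply_toy_rule_list(x):
    '''hypothetical two-rule list on the diabetes features'''
    if x[1] > 150:       # very high glucose -> predict positive
        return 1
    elif x[5] < 25:      # otherwise, low BMI -> predict negative
        return 0
    return 1             # default prediction for samples that fall through
toy_preds = np.array([apply_toy_rule_list(x) for x in X_test])
print('toy rule-list test accuracy:', np.mean(toy_preds == y_test).round(2))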
# %% [markdown]
# ### oneR
# **fits a rule list restricted to use only one feature**
# %%
# fit a oneR model
m = OneRClassifier()
m.fit(X_train, y=y_train, feature_names=feat_names) # stores into m.rules_
probs = m.predict_proba(X_test)
# print the rule list
print(m)
# look at prediction breakdown
viz_classification_preds(probs, y_test)
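# %% [markdown]
# The OneR idea can be mimicked with plain sklearn by fitting one shallow tree per individual
# feature and keeping the most accurate one (a sketch of the concept, not imodels'
# implementation; the depth of 2 is an arbitrary choice).
# %%
single_feature_accs = []
for j in range(X_train.shape[1]):
    stump = DecisionTreeClassifier(max_depth=2)  # DecisionTreeClassifier imported from sklearn.tree above
    stump.fit(X_train[:, [j]], y_train)
    single_feature_accs.append(stump.score(X_train[:, [j]], y_train))
best = int(np.argmax(single_feature_accs))
print('best single feature:', feat_names[best], f'(training accuracy {single_feature_accs[best]:0.2f})')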
# %% [markdown]
# ### scalable bayesian rule lists
# %%
# train classifier (allow more iterations for better accuracy; use BigDataRuleListClassifier for large datasets)
print('training...')
m = BayesianRuleListClassifier(max_iter=3000, class1label="diabetes", verbose=False)
m.fit(X_train, y_train)
probs = m.predict_proba(X_test)
print("learned model:\n", m)
viz_classification_preds(probs, y_test)
# %% [markdown]
# # rule trees
# %% [markdown]
# ### short decision tree
# %% pycharm={"is_executing": false}
# specify a decision tree with a maximum depth
dt = DecisionTreeClassifier(max_depth=3)
dt.fit(X_train, y_train)
# predict probabilities on the test data
probs = dt.predict_proba(X_test)
plot_tree(dt)
# plt.savefig('tree.pdf')
plt.show()
viz_classification_preds(probs, y_test)
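# %% [markdown]
# A quick numeric summary to go with the plots, using the sklearn metrics imported above.
# %%
preds_dt = dt.predict(X_test)
print(f'test accuracy: {metrics.accuracy_score(y_test, preds_dt):0.2f}')
print(f'test AUC: {metrics.roc_auc_score(y_test, probs[:, 1]):0.2f}')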
# %% [markdown]
# ### optimal classification tree
# - docs [here](https://github.com/csinva/interpretability-workshop/tree/master/imodels/optimal_classification_tree)
# - note: this implementation is still somewhat unstable, and can be made faster by installing either `cplex` or `gurobi`
# %%
# sys.path.append('../imodels/optimal_classification_tree/pyoptree')
# sys.path.append('../imodels/optimal_classification_tree/')
# %%
# from optree import OptimalTreeModel
# feature_names = np.array(["x1", "x2"])
# X = np.array([[1, 2, 2, 2, 3], [1, 2, 1, 0, 1]]).T
# y = np.array([1, 1, 0, 0, 0]).reshape(-1, 1)
# X_test = np.array([[1, 1, 2, 2, 2, 3, 3], [1, 2, 2, 1, 0, 1, 0]]).T
# y_test = np.array([1, 1, 1, 0, 0, 0, 0])
# np.random.seed(13)
# model = OptimalTreeModel(tree_depth=3, N_min=1, alpha=0.1) #, solver_name='baron'
# model.fit(X_test, y_test) # this method is currently using the fast, but not optimal solver
# preds = model.predict(X_test)
# # fit on the bigger diabetes dset from above
# # model.fit(Xtrain, ytrain) # this method is currently using the fast, but not optimal solver
# # preds = model.predict(Xtest)
# print('acc', np.mean(preds == y_test))
# %%
# model.print_tree(feature_names)
# %% [markdown]
# # algebraic models
# %% [markdown]
# ### integer linear models
# %% pycharm={"is_executing": false}
np.random.seed(123)
# generate X and y
n, p = 500, 10
X_sim = np.random.randn(n, p)
y_sim = 1 * X_sim[:, 0] + 2 * X_sim[:, 1] - 1 * X_sim[:, 2] + np.random.randn(n)
# fit linear models with different regularization parameters
print('groundtruth weights should be 1, 2, -1...')
model = SLIMRegressor()
for lambda_reg in [1e-3, 1e-2, 5e-2, 1e-1, 1, 2, 5, 10]:
    model.fit(X_sim, y_sim, lambda_reg)
    mse = np.mean(np.square(y_sim - model.predict(X_sim)))
    print(f'lambda: {lambda_reg}\tmse: {mse: 0.2f}\tweights: {model.model_.coef_}')
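# %% [markdown]
# For intuition, a naive baseline for an integer linear model is to fit ordinary least squares and
# round the coefficients; SLIM instead searches for the integer weights directly. The cell below is
# a sketch of that baseline (not part of the imodels API).
# %%
from sklearn.linear_model import LinearRegression
ols = LinearRegression().fit(X_sim, y_sim)
w_rounded = np.round(ols.coef_)  # crude integer weights
mse_rounded = np.mean(np.square(y_sim - (X_sim @ w_rounded + ols.intercept_)))
print(f'rounded-OLS weights: {w_rounded}\tmse: {mse_rounded:0.2f}')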
# %%
y_sim = 1 / (1 + np.exp(-y_sim))
y_sim = np.round(y_sim)
# fit linear models with different regularization parameters
print('groundtruth weights should be 1, 2, -1...')
model = SLIMClassifier()
for lambda_reg in [1e-3, 1e-2, 5e-2, 1e-1, 1, 2, 5, 10]:
    model.fit(X_sim, y_sim, lambda_reg)
    mll = np.mean(metrics.log_loss(y_sim, model.predict(X_sim)))
    print(f'lambda: {lambda_reg}\tmlogloss: {mll: 0.2f}\tweights: {model.model_.coef_}')
# %%
Functions
def get_ames_data()
def get_diabetes_data()
load (classification) data on diabetes
def viz_classification_preds(probs, y_test)
look at prediction breakdown