import pandas as pd
from dodoml.pipeline import (
ColumnsSelector, UniqueCountColumnSelector, TolerantLabelEncoder, FillNaN,
ColumnApplier, OrdinalEncoder, CountFrequencyEncoder, Logify, BoxCoxTransformer,
YToLog)
from dodoml import (compute_features_impact, compute_partial_dependence, lift_curve)
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import OneHotEncoder, Imputer
from sklearn.model_selection import train_test_split
import matplotlib.pylab as plt
%matplotlib inline
import numpy as np
# Load the Titanic dataset and declare which columns are treated as
# categorical vs numeric by the feature pipelines below.
df = pd.read_csv('./data/titanic.csv')
CAT = ['Sex', 'Embarked']  # categorical feature columns
NUM = ['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']  # numeric feature columns
# --- Quick demos of the dodoml transformers (each bare expression is a
# --- notebook cell output; only the first few rows are shown).
# Keep only the categorical columns.
ColumnsSelector(CAT).fit_transform(df)[:3]
# Keep columns whose number of distinct values falls in [1, 5].
UniqueCountColumnSelector(1, 5).fit_transform(df)[:3]
# Integer-encode a single series (tolerant of unseen labels at transform time
# — presumably; confirm against dodoml docs).
TolerantLabelEncoder().fit_transform(df.Sex)[:3]
# Apply the same encoder column-by-column; NaNs filled first so every value is encodable.
ColumnApplier(TolerantLabelEncoder()).fit_transform(df[CAT].fillna(''))[:3]
# Ordinal encoding; the meaning of the 200 argument is not visible here — check dodoml.
OrdinalEncoder(200).fit_transform(df[CAT])[:4]
# Replace each category by its frequency/count in the fitted data.
CountFrequencyEncoder().fit_transform(df.Sex)[:4]
ColumnApplier(CountFrequencyEncoder()).fit_transform(df[CAT])[:4]
# Log-transform (stateless: no fit needed).
Logify().transform(df.Age)[:4]
# Box-Cox requires strictly valid input, hence the fillna(-1) placeholder.
BoxCoxTransformer().fit_transform(df.Age.fillna(-1))[:10]
# YToLog wraps a regressor so the target is log-transformed before fitting
# and predictions are mapped back — presumably; the 0.0001 offset looks like
# a guard against log(0) for zero fares (TODO confirm against dodoml docs).
model = YToLog(LinearRegression(), 0.0001)
X, y = df[['Age', 'Pclass']].fillna(0), df.Fare
model.fit(X, y)
model.predict(X)[:10]
# Feature pipeline: one-hot encode the categorical columns and mean-impute
# the numeric ones, then concatenate the two feature matrices side by side.
# NOTE: sklearn.preprocessing.Imputer was deprecated in 0.20 and removed in
# 0.22; SimpleImputer() with its default strategy='mean' is the drop-in
# replacement and produces the same output.
from sklearn.impute import SimpleImputer

feature_pipeline = make_union(
    make_pipeline(
        ColumnsSelector(CAT),
        FillNaN('nan'),                         # missing categories become the literal string 'nan'
        ColumnApplier(TolerantLabelEncoder()),  # per-column integer codes
        OneHotEncoder()                         # integer codes -> sparse indicator columns
    ),
    make_pipeline(
        ColumnsSelector(NUM),
        SimpleImputer()                         # mean imputation, same default as the old Imputer()
    )
)
# Full model: the engineered features feed a logistic-regression classifier.
model = make_pipeline(feature_pipeline, LogisticRegression())
# Hold out a test set (train_test_split default proportions) and fit the pipeline.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(df, df.Survived)
model.fit(Xtrain, Ytrain)
# Feature impact estimated on a 100-row sample of the test set, shown as a
# horizontal bar chart.
compute_features_impact(model, Xtest, Ytest, row_sample=100).plot(kind='barh')
# Plot one partial-dependence curve per feature (100-row sample): bar charts
# for categorical features, line charts for numeric ones. Empty curves are skipped.
for feat, dep in compute_partial_dependence(model, Xtest, row_sample=100):
    if dep.shape[0] == 0:
        continue
    dep.plot(kind='bar' if feat in CAT else 'line')
    plt.title(feat)
    plt.show()
# Lift curve from the predicted probability of the positive (survived) class.
lift_curve(Ytest, model.predict_proba(Xtest)[:,1])
from dodoml.lime import PredictorLime
# Build a LIME explainer over the raw feature columns; class 0 = Dead, 1 = Alive.
explainer = PredictorLime(Xtrain[NUM + CAT], Ytrain, NUM, CAT, class_names=['Dead', 'Alive'])
# Explain a single training row with the top 4 features.
instance = Xtrain.iloc[100]
exp = explainer.explain_instance(instance, model.predict_proba, n_features=4)
exp.show_in_notebook()
# To get all the values
# Flatten the (reason, weight) pairs from the explanation into one list.
reason = [b for a in exp.as_list() for b in a]
# local_exp[1]: per-feature (index, weight) pairs for class 1 ('Alive').
values = [exp.domain_mapper.feature_values[i[0]] for i in exp.local_exp[1]]
variables = [exp.domain_mapper.feature_names[i[0]] for i in exp.local_exp[1]]
# Predicted P(Alive) followed by the feature names, reasons and raw values.
[exp.predict_proba[1]] + variables + reason + values
from dodoml.ml import xgboost_hyperband_classifier, xgboost_hyperband_regressor
# Convenience constructor: an XGBoost classifier tuned with Hyperband, built
# from the same numeric/categorical column lists used above.
model = xgboost_hyperband_classifier(NUM, CAT)
model.fit(Xtrain, Ytrain)
model.predict_proba(Xtest)[:5]
from dodoml.ml import Hyperband
from xgboost import XGBClassifier
from scipy.stats.distributions import uniform, randint
# Random-search distributions for Hyperband; randint upper bounds are
# exclusive, uniform(loc, scale) samples from [loc, loc + scale].
param_space = {
'max_depth': randint(2, 11),
'min_child_weight': randint(1, 11),
'subsample': uniform(0.5, 0.5),
}
# Same feature engineering as before, but the estimator is an XGBoost
# classifier whose hyper-parameters are searched by Hyperband.
model = make_pipeline(
feature_pipeline,
Hyperband(
XGBClassifier(learning_rate=0.1),
feat_space=param_space,
task='classification'
)
)
model.fit(Xtrain, Ytrain)
model.predict_proba(Xtest)[:5]