In [1]:
import pandas as pd
from dodoml.pipeline import (
    ColumnsSelector, UniqueCountColumnSelector, TolerantLabelEncoder, FillNaN,
    ColumnApplier, OrdinalEncoder, CountFrequencyEncoder, Logify, BoxCoxTransformer,
    YToLog)
from dodoml import (compute_features_impact, compute_partial_dependence, lift_curve)
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import OneHotEncoder, Imputer
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
In [2]:
df = pd.read_csv('./data/titanic.csv')
CAT = ['Sex', 'Embarked']
NUM = ['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

Pipeline Operations

Columns Selector

In [3]:
ColumnsSelector(CAT).fit_transform(df)[:3]
Out[3]:
      Sex Embarked
0    male        S
1  female        C
2  female        S
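
ColumnsSelector simply restricts a DataFrame to the given columns so it can sit at the head of an sklearn pipeline. A minimal equivalent, sketched here as a hypothetical re-implementation rather than dodoml's actual code:

from sklearn.base import BaseEstimator, TransformerMixin

class MyColumnsSelector(BaseEstimator, TransformerMixin):
    """Keep only the listed columns of a DataFrame (sketch)."""
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self  # stateless: nothing to learn

    def transform(self, X):
        return X[self.columns]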

Columns selector based on unique count

In [4]:
UniqueCountColumnSelector(1, 5).fit_transform(df)[:3]
Out[4]:
   Survived  Pclass     Sex Embarked
0         0       3    male        S
1         1       1  female        C
2         1       3  female        S
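
UniqueCountColumnSelector keeps the columns whose number of distinct values falls within the given bounds; here Survived, Pclass, Sex and Embarked each have only 2 or 3 unique values. A minimal sketch, assuming inclusive bounds (dodoml's exact convention may differ):

from sklearn.base import BaseEstimator, TransformerMixin

class MyUniqueCountSelector(BaseEstimator, TransformerMixin):
    """Keep columns whose distinct-value count lies in [lower, upper] (sketch)."""
    def __init__(self, lower, upper):
        self.lower, self.upper = lower, upper

    def fit(self, X, y=None):
        counts = X.nunique()
        self.columns_ = counts[(counts >= self.lower) & (counts <= self.upper)].index
        return self

    def transform(self, X):
        return X[self.columns_]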

Label Encoder

In [5]:
TolerantLabelEncoder().fit_transform(df.Sex)[:3]
Out[5]:
array([1, 0, 0])
In [6]:
ColumnApplier(TolerantLabelEncoder()).fit_transform(df[CAT].fillna(''))[:3]
Out[6]:
   Sex  Embarked
0    1         3
1    0         1
2    0         3
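
The "tolerant" part matters at prediction time: labels unseen during fit should encode to a fallback value instead of raising, as sklearn's LabelEncoder would. A sketch of the idea (the -1 fallback is an assumption, not necessarily dodoml's choice):

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class MyTolerantLabelEncoder(BaseEstimator, TransformerMixin):
    """LabelEncoder variant that maps labels unseen at fit time to -1 (sketch)."""
    def fit(self, X, y=None):
        self.mapping_ = {label: code for code, label in enumerate(sorted(set(X)))}
        return self

    def transform(self, X):
        return np.array([self.mapping_.get(label, -1) for label in X])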

Ordinal encoder

In [7]:
OrdinalEncoder(200).fit_transform(df[CAT])[:4]
Out[7]:
array([[ 0,  0],
       [ 1, -1],
       [ 1,  0],
       [ 1,  0]])
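
The -1 for Embarked='C' suggests the constructor argument is a minimum category count: 'C' occurs 168 times, under the 200 threshold, while 'S' (644) and both Sex values pass it, and the surviving categories look coded by descending frequency. A sketch under that reading (the meaning of the argument is an assumption):

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class MyOrdinalEncoder(BaseEstimator, TransformerMixin):
    """Code categories by descending frequency; rare ones become -1 (sketch)."""
    def __init__(self, min_count):
        self.min_count = min_count

    def fit(self, X, y=None):
        self.mappings_ = {}
        for col in X.columns:
            counts = X[col].value_counts()           # sorted by frequency
            kept = counts[counts >= self.min_count].index
            self.mappings_[col] = {cat: code for code, cat in enumerate(kept)}
        return self

    def transform(self, X):
        return np.column_stack([
            X[col].map(self.mappings_[col]).fillna(-1).astype(int)
            for col in X.columns
        ])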

Frequency encoder

In [8]:
CountFrequencyEncoder().fit_transform(df.Sex)[:4]
Out[8]:
array([577, 314, 314, 314])
In [9]:
ColumnApplier(CountFrequencyEncoder()).fit_transform(df[CAT])[:4]
Out[9]:
   Sex  Embarked
0  577       644
1  314       168
2  314       644
3  314       644
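
The encoding is simply the training-set count of each category (577 males vs 314 females; 644 'S' vs 168 'C'). A minimal single-column sketch:

import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

class MyCountEncoder(BaseEstimator, TransformerMixin):
    """Replace each category by its count in the training data (sketch)."""
    def fit(self, X, y=None):
        self.counts_ = pd.Series(X).value_counts()
        return self

    def transform(self, X):
        # categories unseen at fit time get count 0
        return pd.Series(X).map(self.counts_).fillna(0).astype(int).values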

Log transformation

In [10]:
Logify().transform(df.Age)[:4]
Out[10]:
0    1.372544
1    1.597476
2    1.440594
3    1.563244
Name: Age, dtype: float64
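
Log transforms compress right-skewed features such as fares or incomes. The exact offset Logify applies is not shown here, so the log10(1 + x) below is an assumption:

import numpy as np

def logify(x):
    """Compress a skewed non-negative feature with a log (sketch; dodoml's offset may differ)."""
    return np.log10(1 + np.asarray(x, dtype=float))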

Box-Cox transformation

In [11]:
BoxCoxTransformer().fit_transform(df.Age.fillna(-1))[:10]
Out[11]:
array([-0.98256081, -0.6311409 , -0.88380355, -0.68943678, -0.68943678,
       -2.377156  , -0.35775568, -1.80421622, -0.86048347, -1.21610827])
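
Box-Cox requires strictly positive input, and the ages were filled with -1 above, so the transformer presumably shifts the data first. A sketch of that shift-then-fit pattern via scipy (the output above also looks re-scaled afterwards, which this sketch skips):

import numpy as np
from scipy import stats
from sklearn.base import BaseEstimator, TransformerMixin

class MyBoxCox(BaseEstimator, TransformerMixin):
    """Shift to strictly positive values, then fit a Box-Cox lambda (sketch)."""
    def fit(self, X, y=None):
        x = np.asarray(X, dtype=float)
        self.shift_ = 1.0 - x.min()                  # so min(x + shift) == 1
        _, self.lambda_ = stats.boxcox(x + self.shift_)
        return self

    def transform(self, X):
        x = np.asarray(X, dtype=float)
        return stats.boxcox(x + self.shift_, lmbda=self.lambda_)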

Y transformation

In [12]:
model = YToLog(LinearRegression(), 0.0001)
X, y = df[['Age', 'Pclass']].fillna(0), df.Fare
model.fit(X, y)
model.predict(X)[:10]
Out[12]:
array([  9.47391473,  40.13598763,   9.56391479,  39.85238423,
         9.7695533 ,   8.99384124,  41.68297699,   9.03646002,
         9.58654808,  18.77619118])
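
YToLog reads as a wrapper that fits the regressor on log(y + offset) and exponentiates predictions back, with 0.0001 as the offset; this helps with strictly positive, skewed targets like Fare. A sketch of that pattern (assumed, not dodoml's exact code):

import numpy as np
from sklearn.base import BaseEstimator, RegressorMixin

class MyYToLog(BaseEstimator, RegressorMixin):
    """Fit the wrapped regressor on log(y + offset); invert at predict time (sketch)."""
    def __init__(self, regressor, offset):
        self.regressor = regressor
        self.offset = offset

    def fit(self, X, y):
        self.regressor.fit(X, np.log(y + self.offset))
        return self

    def predict(self, X):
        return np.exp(self.regressor.predict(X)) - self.offset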

Model exploration

Feature Impact

In [27]:
feature_pipeline = make_union(
    make_pipeline(
        ColumnsSelector(CAT),
        FillNaN('nan'),
        ColumnApplier(TolerantLabelEncoder()),
        OneHotEncoder()
    ),
    make_pipeline(
        ColumnsSelector(NUM),
        Imputer()
    )
)
model = make_pipeline(
    feature_pipeline,
    LogisticRegression()
)

Xtrain, Xtest, Ytrain, Ytest = train_test_split(df, df.Survived)
model.fit(Xtrain, Ytrain)
compute_features_impact(model, Xtest, Ytest, row_sample=100).plot(kind='barh')
Out[27]:
<matplotlib.axes._subplots.AxesSubplot at 0x10962f5f8>
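
A common way to compute such per-feature impacts is permutation importance: shuffle one column at a time and record the score drop. A sketch of that idea, which may differ from what compute_features_impact does internally:

import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score

def permutation_impact(model, X, y, columns, n_rounds=3, seed=0):
    """Mean AUC drop when each column is shuffled (sketch of permutation importance)."""
    rng = np.random.RandomState(seed)
    base = roc_auc_score(y, model.predict_proba(X)[:, 1])
    impact = {}
    for col in columns:
        drops = []
        for _ in range(n_rounds):
            X_shuffled = X.copy()
            X_shuffled[col] = rng.permutation(X_shuffled[col].values)
            drops.append(base - roc_auc_score(y, model.predict_proba(X_shuffled)[:, 1]))
        impact[col] = np.mean(drops)
    return pd.Series(impact).sort_values()

# e.g. permutation_impact(model, Xtest, Ytest, NUM + CAT).plot(kind='barh')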

Partial dependence

In [28]:
for feature, series in compute_partial_dependence(model, Xtest, row_sample=100):
    if series.shape[0] > 0:
        if feature in CAT:
            series.plot(kind='bar')
        else:
            series.plot(kind='line')
        plt.title(feature)
        plt.show()
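
Partial dependence itself is easy to approximate by hand: force the feature to each candidate value across the whole sample and average the predictions. A sketch (the value grid is up to you):

import pandas as pd

def partial_dependence(model, X, feature, values):
    """Mean predicted probability when `feature` is forced to each value (sketch)."""
    means = []
    for v in values:
        X_mod = X.copy()
        X_mod[feature] = v
        means.append(model.predict_proba(X_mod)[:, 1].mean())
    return pd.Series(means, index=values)

# e.g. partial_dependence(model, Xtest, 'Age', range(0, 81, 5)).plot(kind='line')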

Lift curve

In [15]:
lift_curve(Ytest, model.predict_proba(Xtest)[:,1])
Out[15]:
            gain      lift
decile
0.1     0.215054  2.179863
0.2     0.408602  1.961877
0.3     0.591398  1.852884
0.4     0.688172  0.980938
0.5     0.817204  1.251052
0.6     0.849462  0.326979
0.7     0.892473  0.435973
0.8     0.924731  0.326979
0.9     0.978495  0.544966
1.0     1.000000  0.208509
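
Here gain at decile d is the cumulative share of positives captured by the top d of predictions, and lift compares each decile's positive rate to the overall rate. A sketch of one common definition (dodoml's binning may differ slightly):

import numpy as np
import pandas as pd

def my_lift_curve(y_true, y_score, n_bins=10):
    """Cumulative gain and per-decile lift of a scored sample (sketch)."""
    df = pd.DataFrame({'y': np.asarray(y_true), 'score': np.asarray(y_score)})
    df = df.sort_values('score', ascending=False).reset_index(drop=True)
    df['decile'] = np.ceil((df.index + 1) / len(df) * n_bins) / n_bins
    grouped = df.groupby('decile')['y'].agg(['sum', 'count'])
    gain = grouped['sum'].cumsum() / df['y'].sum()
    lift = (grouped['sum'] / grouped['count']) / df['y'].mean()
    return pd.DataFrame({'gain': gain, 'lift': lift})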

Explaining the prediction

In [16]:
from dodoml.lime import PredictorLime
explainer = PredictorLime(Xtrain[NUM + CAT], Ytrain, NUM, CAT, class_names=['Dead', 'Alive'])
In [17]:
instance = Xtrain.iloc[100]
exp = explainer.explain_instance(instance, model.predict_proba, n_features=4)
exp.show_in_notebook()
In [18]:
# Pull the explanation out as flat lists
reason = [b for a in exp.as_list() for b in a]      # alternating rule / weight pairs
values = [exp.domain_mapper.feature_values[i[0]] for i in exp.local_exp[1]]    # raw feature values
variables = [exp.domain_mapper.feature_names[i[0]] for i in exp.local_exp[1]]  # feature names

[exp.predict_proba[1]] + variables + reason + values
Out[18]:
[0.079341869937077675,
 'Sex',
 'Pclass',
 'SibSp',
 'Embarked',
 'Sex=male',
 -0.48773466644766589,
 '2.00 < Pclass <= 3.00',
 -0.24690837598666437,
 'SibSp <= 0.00',
 0.085520931773833825,
 'Embarked=S',
 -0.072179629030610176,
 'male',
 '3',
 '0',
 'S']

Hyperband

Simple XGBoost Hyperband

In [23]:
from dodoml.ml import xgboost_hyperband_classifier, xgboost_hyperband_regressor

model = xgboost_hyperband_classifier(NUM, CAT)
model.fit(Xtrain, Ytrain)
model.predict_proba(Xtest)[:5]
Out[23]:
array([[ 0.32724768,  0.67275232],
       [ 0.01142949,  0.98857051],
       [ 0.05073935,  0.94926065],
       [ 0.63751793,  0.36248204],
       [ 0.83082908,  0.16917093]], dtype=float32)

Hyperband

In [33]:
from dodoml.ml import Hyperband
from xgboost import XGBClassifier
from scipy.stats.distributions import uniform, randint

param_space = {
    'max_depth': randint(2, 11),
    'min_child_weight': randint(1, 11),
    'subsample': uniform(0.5, 0.5),
}

model = make_pipeline(
    feature_pipeline,
    Hyperband(
        XGBClassifier(learning_rate=0.1),
        feat_space=param_space,
        task='classification'
    )
)

model.fit(Xtrain, Ytrain)
model.predict_proba(Xtest)[:5]
Out[33]:
array([[ 0.93623704,  0.06376296],
       [ 0.49969685,  0.50030315],
       [ 0.9584018 ,  0.04159822],
       [ 0.70166653,  0.29833347],
       [ 0.06679052,  0.93320948]], dtype=float32)
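
Hyperband spreads a fixed budget across brackets that trade many cheap configurations against a few expensive ones, running successive halving inside each bracket. A compact sketch of the algorithm (Li et al., 2016); sample_config and run_config are placeholders you supply:

import numpy as np
from math import ceil, log

def hyperband(sample_config, run_config, max_resource=81, eta=3):
    """Bracketed successive halving (sketch).

    sample_config() -> a random hyperparameter dict
    run_config(config, resource) -> validation loss (lower is better)
    """
    s_max = int(log(max_resource) / log(eta))
    budget = (s_max + 1) * max_resource
    best_loss, best_config = float('inf'), None
    for s in range(s_max, -1, -1):                    # one bracket per s
        n = int(ceil(budget / max_resource * eta ** s / (s + 1)))
        r = max_resource * eta ** (-s)
        configs = [sample_config() for _ in range(n)]
        for i in range(s + 1):                        # successive halving rounds
            resource = r * eta ** i
            losses = [run_config(c, resource) for c in configs]
            j = int(np.argmin(losses))
            if losses[j] < best_loss:
                best_loss, best_config = losses[j], configs[j]
            order = np.argsort(losses)                # keep the best 1/eta
            configs = [configs[k] for k in order[:max(1, len(configs) // eta)]]
    return best_config, best_loss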