# Source code for dodoml.ace

import readline  # noqa: F401,E731 # needed, otherwise rpy2 won't work.
# See https://github.com/ContinuumIO/anaconda-issues/issues/152
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
from rpy2 import robjects as robj

from .pipeline import TolerantLabelEncoder, ColumnsSelector, ColumnApplier, FillNaN
from sklearn.pipeline import make_union, make_pipeline
import pandas as pd
import numpy as np
import os


def compute_ace(df, numeric_features, categoric_features, target,
                target_as_cat):
    """
    Compute the ACE correlation
    (http://www.stat.cmu.edu/~ryantibs/datamining/lectures/11-cor2-marked.pdf)
    between each of the numeric/categoric features and the target.

    :param df: input frame holding the feature columns and `target`.
    :param numeric_features: list of numeric column names.
    :param categoric_features: list of categoric column names.
    :param target: name of the target column.
    :param target_as_cat: if true, the target is declared categorical to
        ``acepack::ace``; otherwise it is treated as numeric.
    :return: DataFrame with columns ``Feature`` and ``Ace`` (absolute
        correlation of the transformed feature with the transformed
        target), sorted by ``Ace`` in descending order. Features are
        listed categoric first, then numeric.
    """
    # The pandas <-> R conversion is process-global state in rpy2, so make
    # sure it is switched off again even when the R call fails (this also
    # mirrors what compute_pairwise_ace does).
    pandas2ri.activate()
    try:
        acepack = importr('acepack')
        rstats = importr('stats')

        df = _prepare(df, numeric_features, categoric_features)

        # Ordinal encoded categorics + numerics (categorics come first).
        transformer = _ordinal_transformer(numeric_features,
                                           categoric_features)
        orddf = transformer.fit_transform(df)

        # `cat` uses R's 1-based column positions of the predictor matrix;
        # the categoric columns are the leading ones. Position 0 is added
        # for the target (acepack documents 0 as the response variable).
        cat_indexes = list(range(1, 1 + len(categoric_features)))
        if target_as_cat:
            cat_indexes.append(0)

        a = acepack.ace(orddf, df[target], cat=robj.IntVector(cat_indexes))
        acescores = rstats.cor(a.rx('tx')[0], a.rx('ty')[0])

        # cor() of a matrix against a single vector yields a one-column
        # matrix; flatten it so it is a valid 1-D DataFrame column
        # regardless of how rpy2 converted the result.
        scores = np.asarray(acescores, dtype=float).ravel()
    finally:
        pandas2ri.deactivate()

    ace = pd.DataFrame({
        "Feature": categoric_features + numeric_features,
        "Ace": np.abs(scores)
    })
    return ace.sort_values(by="Ace", ascending=False)
def compute_pairwise_ace(df, numeric_features, categoric_features):
    """
    Compute the pairwise ACE correlation between all given features.

    The computation itself is delegated to the R function
    ``compute_pairwise_ace`` defined in ``ace.r``, which is sourced from
    the directory containing this module.

    :param df: input frame holding the feature columns.
    :param numeric_features: list of numeric column names.
    :param categoric_features: list of categoric column names.
    :return: DataFrame built from the matrix returned by R, with columns
        labelled categoric features first, then numeric features.
    """
    # The pandas <-> R conversion is process-global state in rpy2; the
    # finally clause guarantees it is switched off even if R raises.
    pandas2ri.activate()
    try:
        path = os.path.dirname(os.path.abspath(__file__))
        robj.r('source')(os.path.join(path, 'ace.r'))

        df = _prepare(df, numeric_features, categoric_features)
        df, r_num_names, r_cat_names = \
            _rename(df, numeric_features, categoric_features)

        corr = robj.globalenv['compute_pairwise_ace'](
            df, robj.StrVector(r_num_names), robj.StrVector(r_cat_names))
        corr = pd.DataFrame(np.array(corr))
    finally:
        pandas2ri.deactivate()

    # _rename orders the columns categoric first, so label them the same way.
    corr.columns = (categoric_features + numeric_features)
    return corr


def _ordinal_transformer(numeric_features, categoric_features):
    """
    Build the transformer that turns the prepared frame into the numeric
    matrix handed to R.

    Categoric columns are selected, NaN-filled with ``'missing'`` and label
    encoded; numeric columns are selected and NaN-filled with ``-999``.
    When both kinds are present the two pipelines are combined with
    ``make_union`` in the order (categoric, numeric); otherwise only the
    relevant pipeline is returned (the numeric one when both are empty).
    """
    cat_pipe = make_pipeline(
        ColumnsSelector(categoric_features),
        FillNaN('missing'),
        ColumnApplier(TolerantLabelEncoder())
    )
    num_pipe = make_pipeline(
        ColumnsSelector(numeric_features),
        FillNaN(-999),
    )

    if categoric_features and numeric_features:
        return make_union(cat_pipe, num_pipe)
    if categoric_features:
        return cat_pipe
    return num_pipe


def _prepare(df, numeric_features, categoric_features):
    """
    Return a cleaned copy of `df`; the input frame is left untouched.

    - numeric feature columns are cast to float (so boolean columns become
      0.0/1.0, as R does not like booleans);
    - every missing value in the whole frame is replaced by ``-777``;
    - categoric feature columns are cast to string (LabelEncoder does not
      like booleans in mixed-type columns).
    """
    df = df.copy()
    df[numeric_features] = df[numeric_features].astype(float)
    # Fill before the string cast so missing categorics become the string
    # form of -777 rather than 'nan'/'None'.
    df = df.fillna(-777)
    for c in categoric_features:
        df[c] = df[c].astype(str)
    return df


def _rename(df, numeric_features, categoric_features):
    """
    Standardize the column names, as R does not like spaces, quotes, etc.
    in names either.

    Only the listed feature columns are kept, ordered categoric first and
    then numeric. They are renamed positionally to ``cat0, cat1, ...`` and
    ``cont0, cont1, ...``.

    :return: tuple ``(renamed_df, r_num_names, r_cat_names)``.
    """
    df = df[categoric_features + numeric_features]
    r_num_names = ['cont%s' % i for i in range(len(numeric_features))]
    r_cat_names = ['cat%s' % i for i in range(len(categoric_features))]
    df.columns = r_cat_names + r_num_names
    return df, r_num_names, r_cat_names