Module imodels.tree.saps
from copy import deepcopy
import numpy as np
from sklearn import datasets
from sklearn import tree
from sklearn.base import BaseEstimator
from sklearn.linear_model import RidgeCV, RidgeClassifierCV
from sklearn.model_selection import train_test_split
class Node:
    def __init__(self, feature: int = None, threshold: float = None,
                 value=None, idxs=None, is_root: bool = False, left=None,
                 impurity_reduction: float = None, tree_num: int = None,
                 right=None, split_or_linear='split'):
        """Node class for splitting
        """
        # split or linear
        self.is_root = is_root
        self.idxs = idxs
        self.tree_num = tree_num
        self.split_or_linear = split_or_linear
        self.feature = feature
        self.impurity_reduction = impurity_reduction

        # different meanings
        self.value = value  # for split this is the node mean, for linear this is the weight

        # split-specific (for linear these should all be None)
        self.threshold = threshold
        self.left = left
        self.right = right
        self.left_temp = None
        self.right_temp = None

    def setattrs(self, **kwargs):
        for k, v in kwargs.items():
            setattr(self, k, v)

    def __str__(self):
        if self.split_or_linear == 'linear':
            if self.is_root:
                return f'X_{self.feature} * {self.value:0.3f} (Tree #{self.tree_num} linear root)'
            else:
                return f'X_{self.feature} * {self.value:0.3f} (linear)'
        else:
            if self.is_root:
                return f'X_{self.feature} <= {self.threshold:0.3f} (Tree #{self.tree_num} root)'
            elif self.left is None and self.right is None:
                return f'Val: {self.value[0][0]:0.3f} (leaf)'
            else:
                return f'X_{self.feature} <= {self.threshold:0.3f} (split)'

    def __repr__(self):
        return self.__str__()
class SAPS(BaseEstimator):
    """Experimental SAPS (sum of saplings) estimator
    """

    def __init__(self, max_rules: int = None, posthoc_ridge: bool = False, include_linear: bool = False):
        super().__init__()
        self.max_rules = max_rules
        self.posthoc_ridge = posthoc_ridge
        self.include_linear = include_linear
        self.weighted_model_ = None  # set if using posthoc_ridge
        self._init_prediction_task()  # decides between regressor and classifier

    def _init_prediction_task(self):
        """
        SaplingSumRegressor and SaplingSumClassifier override this method
        to alter the prediction task. When using this class directly,
        it is equivalent to SaplingSumRegressor
        """
        self.prediction_task = 'regression'
    def construct_node_linear(self, X, y, idxs, tree_num=0, sample_weight=None):
        """This can be made a lot faster.
        Assumes there are at least 5 points in the node.
        Doesn't currently support sample_weight!
        """
        y_target = y[idxs]
        # print(np.unique(y_target))
        impurity_orig = np.mean(np.square(y_target)) * idxs.sum()

        # find best linear split
        best_impurity = impurity_orig
        best_linear_coef = None
        best_feature = None
        for feature_num in range(X.shape[1]):
            x = X[idxs, feature_num].reshape(-1, 1)
            m = RidgeCV(fit_intercept=False)
            m.fit(x, y_target)
            impurity = np.min(-m.best_score_) * idxs.sum()
            assert impurity >= 0, 'impurity should not be negative'
            if impurity < best_impurity:
                best_impurity = impurity
                best_linear_coef = m.coef_[0]
                best_feature = feature_num
        impurity_reduction = impurity_orig - best_impurity

        # no good linear fit found
        if impurity_reduction == 0:
            return Node(idxs=idxs, value=np.mean(y_target), tree_num=tree_num,
                        feature=None, threshold=None,
                        impurity_reduction=None, split_or_linear='split')  # leaf node that just returns its value
        else:
            assert isinstance(best_linear_coef, float), 'coef should be a float'
            return Node(idxs=idxs, value=best_linear_coef, tree_num=tree_num,
                        feature=best_feature, threshold=None,
                        impurity_reduction=impurity_reduction, split_or_linear='linear')
    def construct_node_with_stump(self, X, y, idxs, tree_num, sample_weight=None):
        # indices into the arrays returned by the fitted stump
        SPLIT = 0
        LEFT = 1
        RIGHT = 2

        # fit stump
        stump = tree.DecisionTreeRegressor(max_depth=1)
        if sample_weight is not None:
            sample_weight = sample_weight[idxs]
        stump.fit(X[idxs], y[idxs], sample_weight=sample_weight)

        # these are all arrays; arr[0] is the split node
        # note: -2 is a dummy value marking leaves
        feature = stump.tree_.feature
        threshold = stump.tree_.threshold
        impurity = stump.tree_.impurity
        n_node_samples = stump.tree_.n_node_samples
        value = stump.tree_.value

        # no split found
        if len(feature) == 1:
            # print('no split found!', idxs.sum(), impurity, feature)
            return Node(idxs=idxs, value=value[SPLIT], tree_num=tree_num,
                        feature=feature[SPLIT], threshold=threshold[SPLIT],
                        impurity_reduction=None)

        # split node
        impurity_reduction = (
            impurity[SPLIT] -
            impurity[LEFT] * n_node_samples[LEFT] / n_node_samples[SPLIT] -
            impurity[RIGHT] * n_node_samples[RIGHT] / n_node_samples[SPLIT]
        ) * idxs.sum()
        node_split = Node(idxs=idxs, value=value[SPLIT], tree_num=tree_num,
                          feature=feature[SPLIT], threshold=threshold[SPLIT],
                          impurity_reduction=impurity_reduction)
        # print('\t>>>', node_split, 'impurity', impurity, 'num_pts', idxs.sum(), 'imp_reduc', impurity_reduction)

        # manage children
        idxs_split = X[:, feature[SPLIT]] <= threshold[SPLIT]
        idxs_left = idxs_split & idxs
        idxs_right = ~idxs_split & idxs
        node_left = Node(idxs=idxs_left, value=value[LEFT], tree_num=tree_num)
        node_right = Node(idxs=idxs_right, value=value[RIGHT], tree_num=tree_num)
        node_split.setattrs(left_temp=node_left, right_temp=node_right)
        return node_split
    def fit(self, X, y=None, feature_names=None, min_impurity_decrease=0.0, verbose=False, sample_weight=None):
        """
        Params
        ------
        sample_weight: array-like of shape (n_samples,), default=None
            Sample weights. If None, then samples are equally weighted.
            Splits that would create child nodes with net zero or negative weight
            are ignored while searching for a split in each node.
        """
        y = y.astype(float)
        if feature_names is not None:
            self.feature_names_ = feature_names

        self.trees_ = []  # list of the root nodes of added trees
        self.complexity_ = 0  # tracks the number of rules in the model
        y_predictions_per_tree = {}  # predictions for each tree
        y_residuals_per_tree = {}  # based on predictions above

        # set up initial potential_splits
        # everything in potential_splits either is_root (so it can be added directly to self.trees_)
        # or is a child of a root node that has already been added
        idxs = np.ones(X.shape[0], dtype=bool)
        node_init = self.construct_node_with_stump(X=X, y=y, idxs=idxs, tree_num=-1, sample_weight=sample_weight)
        potential_splits = [node_init]
        if self.include_linear and idxs.sum() >= 5:
            node_init_linear = self.construct_node_linear(X=X, y=y, idxs=idxs, tree_num=-1,
                                                          sample_weight=sample_weight)
            potential_splits.append(node_init_linear)
        for node in potential_splits:
            node.setattrs(is_root=True)
        potential_splits = sorted(potential_splits, key=lambda x: x.impurity_reduction)
        # start the greedy fitting algorithm
        finished = False
        while len(potential_splits) > 0 and not finished:
            # print('potential_splits', [str(s) for s in potential_splits])
            split_node = potential_splits.pop()  # get node with max impurity_reduction (since it's sorted)

            # don't split on node
            if split_node.impurity_reduction < min_impurity_decrease:
                finished = True
                break

            # split on node
            if verbose:
                print('\nadding ' + str(split_node))
            self.complexity_ += 1

            # if we added a tree root
            if split_node.is_root:
                # start a new tree
                self.trees_.append(split_node)

                # update tree_num
                for node_ in [split_node, split_node.left_temp, split_node.right_temp]:
                    if node_ is not None:
                        node_.tree_num = len(self.trees_) - 1

                # add new root potential node
                node_new_root = Node(is_root=True, idxs=np.ones(X.shape[0], dtype=bool),
                                     tree_num=-1, split_or_linear=split_node.split_or_linear)
                potential_splits.append(node_new_root)

            # add children to potential splits (note: this doesn't currently add linear potential splits)
            if split_node.split_or_linear == 'split':
                # assign left_temp, right_temp to be proper children
                # (basically adds them to the tree used by the predict method)
                split_node.setattrs(left=split_node.left_temp, right=split_node.right_temp)
                # add children to potential_splits
                potential_splits.append(split_node.left)
                potential_splits.append(split_node.right)

            # update predictions for the altered tree
            for tree_num_ in range(len(self.trees_)):
                y_predictions_per_tree[tree_num_] = self.predict_tree(self.trees_[tree_num_], X)
            y_predictions_per_tree[-1] = np.zeros(X.shape[0])  # dummy 0 preds for possible new trees

            # update residuals for each tree
            # -1 is the key for a potential new tree
            for tree_num_ in list(range(len(self.trees_))) + [-1]:
                y_residuals_per_tree[tree_num_] = deepcopy(y)
                # subtract predictions of all other trees
                for tree_num_2_ in range(len(self.trees_)):
                    if not tree_num_2_ == tree_num_:
                        y_residuals_per_tree[tree_num_] -= y_predictions_per_tree[tree_num_2_]

            # recompute all impurities + update potential_split children
            potential_splits_new = []
            for potential_split in potential_splits:
                y_target = y_residuals_per_tree[potential_split.tree_num]
                if potential_split.split_or_linear == 'split':
                    # re-calculate the best split
                    potential_split_updated = self.construct_node_with_stump(X=X,
                                                                             y=y_target,
                                                                             idxs=potential_split.idxs,
                                                                             tree_num=potential_split.tree_num,
                                                                             sample_weight=sample_weight)
                    # need to preserve certain attributes from before (value at this split + is_root);
                    # value may change because residuals may have changed, but we want it to store the value from before
                    potential_split.setattrs(
                        feature=potential_split_updated.feature,
                        threshold=potential_split_updated.threshold,
                        impurity_reduction=potential_split_updated.impurity_reduction,
                        left_temp=potential_split_updated.left_temp,
                        right_temp=potential_split_updated.right_temp,
                    )
                elif potential_split.split_or_linear == 'linear':
                    assert potential_split.is_root, 'Currently, linear node only supported as root'
                    assert potential_split.idxs.sum() == X.shape[0], 'Currently, linear node only supported as root'
                    potential_split_updated = self.construct_node_linear(idxs=potential_split.idxs,
                                                                         X=X,
                                                                         y=y_target,
                                                                         tree_num=potential_split.tree_num,
                                                                         sample_weight=sample_weight)
                    # don't need to retain anything from before (besides maybe is_root)
                    potential_split.setattrs(
                        feature=potential_split_updated.feature,
                        impurity_reduction=potential_split_updated.impurity_reduction,
                        value=potential_split_updated.value,
                    )

                # this is a valid split
                if potential_split.impurity_reduction is not None:
                    potential_splits_new.append(potential_split)

            # sort so the largest impurity reduction comes last (should probably become a heap later)
            potential_splits = sorted(potential_splits_new, key=lambda x: x.impurity_reduction)
            if verbose:
                print(self)
            if self.max_rules is not None and self.complexity_ >= self.max_rules:
                finished = True
                break
        # potentially fit a linear model on the tree predictions
        if self.posthoc_ridge:
            if self.prediction_task == 'regression':
                self.weighted_model_ = RidgeCV(alphas=(0.01, 0.1, 0.5, 1.0, 5, 10))
            elif self.prediction_task == 'classification':
                self.weighted_model_ = RidgeClassifierCV(alphas=(0.01, 0.1, 0.5, 1.0, 5, 10))
            X_feats = self.extract_tree_predictions(X)
            self.weighted_model_.fit(X_feats, y)
        return self
    def tree_to_str(self, root: Node, prefix=''):
        if root is None:
            return ''
        elif root.split_or_linear == 'linear':
            return prefix + str(root)
        elif root.threshold is None:
            return ''
        pprefix = prefix + '\t'
        return prefix + str(root) + '\n' + self.tree_to_str(root.left, pprefix) + self.tree_to_str(root.right, pprefix)

    def __str__(self):
        s = '------------\n' + '\n\t+\n'.join([self.tree_to_str(t) for t in self.trees_])
        if hasattr(self, 'feature_names_') and self.feature_names_ is not None:
            for i in range(len(self.feature_names_))[::-1]:
                s = s.replace(f'X_{i}', self.feature_names_[i])
        return s
    def predict(self, X):
        if self.posthoc_ridge and self.weighted_model_:  # note: during fitting, don't use the weighted model
            X_feats = self.extract_tree_predictions(X)
            return self.weighted_model_.predict(X_feats)
        preds = np.zeros(X.shape[0])
        for tree in self.trees_:
            preds += self.predict_tree(tree, X)
        if self.prediction_task == 'regression':
            return preds
        elif self.prediction_task == 'classification':
            return (preds > 0.5).astype(int)

    def predict_proba(self, X):
        if self.prediction_task == 'regression':
            return NotImplemented
        elif self.posthoc_ridge and self.weighted_model_:  # note: during fitting, don't use the weighted model
            X_feats = self.extract_tree_predictions(X)
            d = self.weighted_model_.decision_function(X_feats)  # for 2 classes, this has shape (n_samples,)
            probs = np.exp(d) / (1 + np.exp(d))
            return np.vstack((1 - probs, probs)).transpose()
        else:
            preds = np.zeros(X.shape[0])
            for tree in self.trees_:
                preds += self.predict_tree(tree, X)
            preds = np.clip(preds, a_min=0., a_max=1.)  # constrain to the range of probabilities
            return np.vstack((1 - preds, preds)).transpose()
    def extract_tree_predictions(self, X):
        """Extract predictions for all trees
        """
        X_feats = np.zeros((X.shape[0], len(self.trees_)))
        for tree_num_ in range(len(self.trees_)):
            preds_tree = self.predict_tree(self.trees_[tree_num_], X)
            X_feats[:, tree_num_] = preds_tree
        return X_feats

    def predict_tree(self, root: Node, X):
        """Predict for a single tree.
        This can be made way faster
        """

        def predict_tree_single_point(root: Node, x):
            if root.split_or_linear == 'linear':
                return x[root.feature] * root.value
            elif root.left is None and root.right is None:
                return root.value
            left = x[root.feature] <= root.threshold
            if left:
                if root.left is None:  # we don't actually have to worry about this case
                    return root.value
                else:
                    return predict_tree_single_point(root.left, x)
            else:
                if root.right is None:  # we don't actually have to worry about this case
                    return root.value
                else:
                    return predict_tree_single_point(root.right, x)

        preds = np.zeros(X.shape[0])
        for i in range(X.shape[0]):
            preds[i] = predict_tree_single_point(root, X[i])
        return preds
class SaplingSumRegressor(SAPS):
    def _init_prediction_task(self):
        self.prediction_task = 'regression'


class SaplingSumClassifier(SAPS):
    def _init_prediction_task(self):
        self.prediction_task = 'classification'
if __name__ == '__main__':
    np.random.seed(13)
    X, y = datasets.load_breast_cancer(return_X_y=True)  # binary classification
    # X, y = datasets.load_diabetes(return_X_y=True)  # regression
    # X = np.random.randn(500, 10)
    # y = (X[:, 0] > 0).astype(float) + (X[:, 1] > 1).astype(float)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=42
    )
    print('X.shape', X.shape)
    print('ys', np.unique(y_train), '\n\n')
    m = SaplingSumClassifier(max_rules=5)
    m.fit(X_train, y_train)
    print(m.predict_proba(X_train))
Classes
class Node (feature: int = None, threshold: float = None, value=None, idxs=None, is_root: bool = False, left=None, impurity_reduction: float = None, tree_num: int = None, right=None, split_or_linear='split')
-
Node class for splitting
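A small sketch of how a Node renders (the values here are made up for illustration; assumes the module's numpy import). A node without children prints as a leaf; attaching children makes it print as a split:

leaf = Node(value=np.array([[1.0]]))
print(leaf)    # Val: 1.000 (leaf)
split = Node(feature=2, threshold=0.5, value=np.array([[0.7]]), left=leaf, right=leaf)
print(split)   # X_2 <= 0.500 (split)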
Methods
def setattrs(self, **kwargs)
-
Set multiple attributes at once from keyword arguments.
Expand source code
def setattrs(self, **kwargs): for k, v in kwargs.items(): setattr(self, k, v)
class SAPS (max_rules: int = None, posthoc_ridge: bool = False, include_linear: bool = False)
-
Experimental SAPS (sum of saplings) estimator
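A minimal usage sketch (assuming X_train and y_train as in the demo at the bottom of the module). SAPS greedily grows a sum of stump-based trees; printing the fitted model shows one tree per block, joined by '+':

m = SAPS(max_rules=3)    # used directly, SAPS behaves as a regressor
m.fit(X_train, y_train)
print(m.complexity_)     # number of rules added (at most max_rules)
print(m)                 # the fitted sum of saplings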
Ancestors
- sklearn.base.BaseEstimator
Subclasses
- SaplingSumRegressor
- SaplingSumClassifier
Methods
def construct_node_linear(self, X, y, idxs, tree_num=0, sample_weight=None)
-
This can be made a lot faster. Assumes there are at least 5 points in the node. Doesn't currently support sample_weight!
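A sketch of what the per-feature search computes, on synthetic data (names here are hypothetical). RidgeCV's best_score_ is a negative error, so -best_score_ serves as the impurity of the single-feature linear fit, scaled by the node size:

import numpy as np
from sklearn.linear_model import RidgeCV
rng = np.random.RandomState(0)
x = rng.randn(100, 1)
y_target = 2 * x[:, 0] + 0.1 * rng.randn(100)
m = RidgeCV(fit_intercept=False).fit(x, y_target)
impurity = np.min(-m.best_score_) * 100  # mirrors the line in the source
print(m.coef_[0], impurity)              # coefficient near 2, small impurity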
def construct_node_with_stump(self, X, y, idxs, tree_num, sample_weight=None)
-
Fit a depth-1 DecisionTreeRegressor (a stump) on the residuals at the node and wrap the result in a Node, with its children stored as left_temp/right_temp until the split is accepted.
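The stump's split is read straight off sklearn's tree_ arrays; a quick way to see their layout (data here is made up):

from sklearn import tree
import numpy as np
X_demo = np.random.randn(50, 3)
y_demo = (X_demo[:, 0] > 0).astype(float)
stump = tree.DecisionTreeRegressor(max_depth=1).fit(X_demo, y_demo)
print(stump.tree_.feature)    # e.g. [0 -2 -2]; index 0 is the split, -2 marks leaves
print(stump.tree_.threshold)  # threshold at index 0; dummy values at the leaves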
def extract_tree_predictions(self, X)
-
Extract predictions for all trees
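These per-tree predictions are the features the optional posthoc ridge model is fit on. For a fitted model m (as in the sketches above):

X_feats = m.extract_tree_predictions(X_train)
print(X_feats.shape)   # (n_samples, n_trees), one column per sapling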
def fit(self, X, y=None, feature_names=None, min_impurity_decrease=0.0, verbose=False, sample_weight=None)
-
Params
sample_weight: array-like of shape (n_samples,), default=None
Sample weights. If None, then samples are equally weighted. Splits that would create child nodes with net zero or negative weight are ignored while searching for a split in each node.
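A hedged sketch of fitting with sample weights (the weight vector here is invented for illustration; assumes X_train and y_train from the module demo):

import numpy as np
w = np.ones(len(y_train))
w[y_train == 1] *= 2.0                    # hypothetical: upweight the positive class
m = SaplingSumClassifier(max_rules=5)
m.fit(X_train, y_train, sample_weight=w)  # fit returns self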
def predict(self, X)
-
Sum the predictions of all trees (or apply the posthoc ridge model if one was fitted); for classification, threshold the sum at 0.5.
def predict_proba(self, X)
-
Return class probabilities for classification; returns NotImplemented for regression.
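For a fitted classifier m, the output stacks P(class 0) and P(class 1) column-wise, so each row sums to 1:

probs = m.predict_proba(X_test)   # shape (n_samples, 2)
preds = (probs[:, 1] > 0.5).astype(int)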
def predict_tree(self, root: Node, X)
-
Predict for a single tree. This can be made way faster.
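Each sapling can also be evaluated on its own, e.g. to inspect the contribution of a single tree of a fitted model m:

contribution = m.predict_tree(m.trees_[0], X_test)   # predictions of the first sapling only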
def tree_to_str(self, root: Node, prefix='')
-
Recursively render a tree as an indented string; linear nodes render on a single line.
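Printing the whole model joins each tree's rendering with '+', and replaces feature indices with names when feature_names_ is set:

print(m.tree_to_str(m.trees_[0]))   # a single sapling
print(m)                            # all saplings, joined with '+'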
class SaplingSumClassifier (max_rules: int = None, posthoc_ridge: bool = False, include_linear: bool = False)
-
Experimental SAPS (sum of saplings) estimator
Ancestors
- SAPS
- sklearn.base.BaseEstimator
Inherited members
- SAPS: construct_node_linear, construct_node_with_stump, extract_tree_predictions, fit, predict, predict_proba, predict_tree, tree_to_str
class SaplingSumRegressor (max_rules: int = None, posthoc_ridge: bool = False, include_linear: bool = False)
-
Experimental SAPS (sum of saplings) estimator
Ancestors
- SAPS
- sklearn.base.BaseEstimator
Inherited members
- SAPS: construct_node_linear, construct_node_with_stump, extract_tree_predictions, fit, predict, predict_proba, predict_tree, tree_to_str