Module imodels.tree.c45_tree.c45_tree
Modified from https://github.com/RaczeQ/scikit-learn-C4.5-tree-classifier

References
[1] https://en.wikipedia.org/wiki/Decision_tree_learning
[2] https://en.wikipedia.org/wiki/C4.5_algorithm
Source code
"""Modified from https://github.com/RaczeQ/scikit-learn-C4.5-tree-classifier
References
----------
.. [1] https://en.wikipedia.org/wiki/Decision_tree_learning
.. [2] https://en.wikipedia.org/wiki/C4.5_algorithm
"""
from copy import deepcopy
from typing import List
from xml.dom import minidom
from xml.etree import ElementTree as ET
import numpy as np
import pandas as pd
from imodels.tree.c45_tree.c45_utils import decision, is_numeric_feature, gain, gain_ratio, get_best_split, \
set_as_leaf_node
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_array, check_is_fitted, check_X_y
class C45TreeClassifier(BaseEstimator, ClassifierMixin):
"""A C4.5 tree classifier.
Parameters
----------
max_rules : int, optional (default=None)
Maximum number of split nodes allowed in the tree
"""
def __init__(self, max_rules: int = None):
super().__init__()
self.max_rules = max_rules
def fit(self, X, y, feature_names: str = None):
self.complexity_ = 0
X, y = check_X_y(X, y)
self.resultType = type(y[0])
if feature_names is None:
self.feature_names = [f'X_{x}' for x in range(X.shape[1])]
else:
# only include alphanumeric chars / replace spaces with underscores
self.feature_names = [''.join([i for i in x if i.isalnum()]).replace(' ', '_')
for x in feature_names]
self.feature_names = ['X_' + x if x[0].isdigit()
else x
for x in self.feature_names]
assert len(self.feature_names) == X.shape[1]
data = [[] for i in range(len(self.feature_names))]
categories = []
for i in range(len(X)):
categories.append(str(y[i]))
for j in range(len(self.feature_names)):
data[j].append(X[i][j])
root = ET.Element('GreedyTree')
self.grow_tree(data, categories, root, self.feature_names) # adds to root
self.tree_ = ET.tostring(root, encoding="unicode")
# print('self.tree_', self.tree_)
self.dom_ = minidom.parseString(self.tree_)
return self
def raw_preds(self, X):
check_is_fitted(self, ['tree_', 'resultType', 'feature_names'])
X = check_array(X)
if isinstance(X, pd.DataFrame):
X = deepcopy(X)
X.columns = self.feature_names
root = self.dom_.childNodes[0]
prediction = []
for i in range(X.shape[0]):
answerlist = decision(root, X[i], self.feature_names, 1)
answerlist = sorted(answerlist.items(), key=lambda x: x[1], reverse=True)
answer = answerlist[0][0]
prediction.append(self.resultType(answer))
return np.array(prediction)
def predict(self, X):
return (self.raw_preds(X) > 0.5).astype(int)
def predict_proba(self, X):
raw_preds = self.raw_preds(X)
return np.vstack((1 - raw_preds, raw_preds)).transpose()
def __str__(self):
check_is_fitted(self, ['tree_'])
return self.dom_.toprettyxml(newl="\r\n")
def grow_tree(self, X_t: List[list], y_str: List[str], parent, attrs_names):
"""
Parameters
----------
X_t: List[list]
input data transposed (num_features x num_observations)
y_str: List[str]
outcome represented as strings
parent
attrs_names
"""
# check that y contains more than 1 distinct value
if len(set(y_str)) > 1:
split = []
# loop over features and build up potential splits
for i in range(len(X_t)):
if set(X_t[i]) == set("?"):
split.append(0)
else:
if is_numeric_feature(X_t[i]):
split.append(gain(y_str, X_t[i]))
else:
split.append(gain_ratio(y_str, X_t[i]))
# no good split, return child node
if max(split) == 0:
set_as_leaf_node(parent, y_str)
# there is a good split
else:
index_selected = split.index(max(split))
name_selected = str(attrs_names[index_selected])
self.complexity_ += 1
if is_numeric_feature(X_t[index_selected]):
# split on this point
split_point = get_best_split(y_str, X_t[index_selected])
# build up children nodes
r_child_X = [[] for i in range(len(X_t))]
r_child_y = []
l_child_X = [[] for i in range(len(X_t))]
l_child_y = []
for i in range(len(y_str)):
if not X_t[index_selected][i] == "?":
if float(X_t[index_selected][i]) < float(split_point):
l_child_y.append(y_str[i])
for j in range(len(X_t)):
l_child_X[j].append(X_t[j][i])
else:
r_child_y.append(y_str[i])
for j in range(len(X_t)):
r_child_X[j].append(X_t[j][i])
# grow child nodes as well
if len(l_child_y) > 0 and len(r_child_y) > 0 and (
self.max_rules is None or
self.complexity_ <= self.max_rules
):
p_l = float(len(l_child_y)) / (len(X_t[index_selected]) - X_t[index_selected].count("?"))
son = ET.SubElement(parent, name_selected,
{'feature': str(split_point), "flag": "l", "p": str(round(p_l, 3))})
self.grow_tree(l_child_X, l_child_y, son, attrs_names)
son = ET.SubElement(parent, name_selected,
{'feature': str(split_point), "flag": "r", "p": str(round(1 - p_l, 3))})
self.grow_tree(r_child_X, r_child_y, son, attrs_names)
else:
num_max = 0
for cat in set(y_str):
num_cat = y_str.count(cat)
if num_cat > num_max:
num_max = num_cat
most_cat = cat
parent.text = most_cat
else:
# split on non-numeric variable (e.g. categorical)
# create a leaf for each unique value
for k in set(X_t[index_selected]):
if not k == "?" and (
self.max_rules is None or
self.complexity_ <= self.max_rules
):
child_X = [[] for i in range(len(X_t))]
child_y = []
for i in range(len(y_str)):
if X_t[index_selected][i] == k:
child_y.append(y_str[i])
for j in range(len(X_t)):
child_X[j].append(X_t[j][i])
son = ET.SubElement(parent, name_selected, {
'feature': k, "flag": "m",
'p': str(round(
float(len(child_y)) / (
len(X_t[index_selected]) - X_t[index_selected].count("?")),
3))})
self.grow_tree(child_X, child_y, son, attrs_names)
else:
parent.text = y_str[0]
if __name__ == '__main__':
from imodels.util.data_util import get_clean_dataset
X, y, feature_names = get_clean_dataset('ionosphere', data_source='pmlb')
m = C45TreeClassifier(max_rules=3)
m.fit(X, y)
print('mse', np.mean(np.square(m.predict(X) - y)))
print(m)
m.predict(X)
Classes
class C45TreeClassifier (max_rules: int = None)
A C4.5 tree classifier.

Parameters
max_rules : int, optional (default=None)
    Maximum number of split nodes allowed in the tree
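A minimal usage sketch on a tiny made-up dataset (the __main__ block in the source above shows the same pattern on the PMLB ionosphere data):

import numpy as np
from imodels.tree.c45_tree.c45_tree import C45TreeClassifier

# tiny synthetic binary-classification problem, for illustration only
X = np.array([[0.1, 1.0], [0.2, 0.9], [0.9, 0.1], [0.8, 0.2]])
y = np.array([0, 0, 1, 1])

m = C45TreeClassifier(max_rules=3)
m.fit(X, y, feature_names=['feat one', 'feat 2'])  # names are sanitized to valid XML tag names
print(m.predict(X))        # hard 0/1 labels
print(m.predict_proba(X))  # array of shape (n_samples, 2)
print(m)                   # pretty-printed XML representation of the tree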
Ancestors
- sklearn.base.BaseEstimator
- sklearn.base.ClassifierMixin
Methods
def fit(self, X, y, feature_names: str = None)
Fit the C4.5 tree to X and y, optionally using the provided feature names; stores the fitted tree as an XML string in tree_ and returns self.
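fit sanitizes the provided feature names before growing the tree; a stand-alone sketch of that step (the example names below are made up):

names = ['petal width (cm)', '1st feature']
clean = [''.join(ch for ch in n if ch.isalnum()) for n in names]  # keep alphanumeric characters only
clean = ['X_' + n if n[0].isdigit() else n for n in clean]        # prefix names that start with a digit
print(clean)  # ['petalwidthcm', 'X_1stfeature']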
def grow_tree(self, X_t: List[list], y_str: List[str], parent, attrs_names)
Recursively grow the tree, appending split and leaf nodes to parent.

Parameters
X_t : List[list]
    input data transposed (num_features x num_observations)
y_str : List[str]
    outcome represented as strings
parent
    XML element to which child split/leaf nodes are appended
attrs_names
    feature names corresponding to the rows of X_t
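grow_tree records the fitted tree as nested XML elements: each split adds children named after the chosen feature, with the split value in 'feature', a 'flag' of 'l'/'r' (numeric splits) or 'm' (categorical values), and the fraction of non-missing samples in 'p'; leaves store the majority class as element text. A hand-built sketch of that structure, inferred from the ET.SubElement calls in the source above (tag names and values are illustrative only):

from xml.etree import ElementTree as ET

root = ET.Element('GreedyTree')
left = ET.SubElement(root, 'X_0', {'feature': '0.5', 'flag': 'l', 'p': '0.6'})
left.text = '0'   # leaf: majority class stored as the element's text
right = ET.SubElement(root, 'X_0', {'feature': '0.5', 'flag': 'r', 'p': '0.4'})
right.text = '1'
print(ET.tostring(root, encoding='unicode'))
# <GreedyTree><X_0 feature="0.5" flag="l" p="0.6">0</X_0><X_0 feature="0.5" flag="r" p="0.4">1</X_0></GreedyTree>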
def predict(self, X)
Predict class labels by thresholding the raw predictions at 0.5.
def predict_proba(self, X)
Return class probabilities as an array of shape (n_samples, 2): column 0 is the probability of class 0, column 1 of class 1.
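A stand-alone sketch of how predict_proba assembles its two-column output from the raw predictions:

import numpy as np

raw = np.array([0, 1, 1])                  # e.g. raw_preds(X) for three samples
proba = np.vstack((1 - raw, raw)).transpose()
print(proba)                               # [[1 0] [0 1] [0 1]] -> shape (3, 2)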
def raw_preds(self, X)
Traverse the fitted XML tree for each row of X and return the raw predictions as an array.
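The label-selection step inside raw_preds, isolated for clarity (decision, from c45_utils, is assumed here to return a mapping from class label to a score; the values below are hypothetical):

answerlist = {'0': 0.25, '1': 0.75}        # hypothetical output of decision(...)
answerlist = sorted(answerlist.items(), key=lambda x: x[1], reverse=True)
print(answerlist[0][0])                    # '1' -- the label with the largest score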