Module mixed_naive_bayes.mixed_naive_bayes
The mixed_naive_bayes module implements Categorical and Gaussian Naive Bayes
algorithms. These are supervised learning methods based on applying Bayes'
theorem with strong (naive) feature independence assumptions. The API's design
is similar to scikit-learn's. Look at the example in MixedNB.
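A minimal end-to-end sketch of the intended workflow. This is illustrative only
and assumes MixedNB is re-exported at the package level (otherwise import it
from mixed_naive_bayes.mixed_naive_bayes); column 0 is treated as categorical
and column 1 as Gaussian.

>>> from mixed_naive_bayes import MixedNB  # assumed package-level import
>>> X = [[0, 180.0], [1, 165.0], [1, 167.0], [0, 178.0],
...      [1, 174.0], [2, 166.0], [0, 167.0], [2, 165.0]]
>>> y = [0, 0, 1, 0, 0, 0, 1, 1]
>>> clf = MixedNB(categorical_features=[0])  # column 0 categorical, column 1 Gaussian
>>> clf.fit(X, y)
>>> clf.predict([[1, 170.0]])        # predicted class label for one new sample
>>> clf.predict_proba([[1, 170.0]])  # class probabilities for the same sample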
Expand source code
# -*- coding: utf-8 -*-
"""
The `mixed_naive_bayes` module implements Categorical and Gaussian
Naive Bayes algorithms. These are supervised learning methods based on
applying Bayes' theorem with strong (naive) feature independence assumptions.
The API's design is similar to scikit-learn's.
Look at the example in `mixed_naive_bayes.mixed_naive_bayes.MixedNB`.
"""
import warnings
import numpy as np
_ALPHA_MIN = 1e-10
class MixedNB():
"""
Naive Bayes classifier for Categorical and Gaussian models.
Note: When using categorical_features, MixedNB expects that
for each feature, all possible classes are captured in the
    training data X in the `mixed_naive_bayes.mixed_naive_bayes.MixedNB.fit` method.
This is to ensure numerical stability.
    Parameters
    ----------
    categorical_features : array-like of ints, optional (default=None)
        Indices of the columns in X which hold categorical features. All
        remaining columns are treated as Gaussian (continuous) features.
    max_categories : array-like of ints, optional (default=None)
        Number of categories for each categorical feature. If None, this is
        inferred from the training data.
    alpha : non-negative float, optional (default=0.5)
        Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing).
        This is for features with categorical distribution.
    priors : array-like, size (num_classes,), optional (default=None)
        Prior probabilities of the classes. If specified, the priors are not
        adjusted according to the data.
    var_smoothing : float, optional (default=1e-9)
        Portion of the largest variance of all features that is added to
        variances for calculation stability.
Attributes
----------
    priors : array, shape (num_classes,)
        probability of each class
epsilon : float
absolute additive value to variances
num_samples : int
number of training samples
num_features : int
number of features of X
num_classes : int
        number of classes (number of unique labels in y)
models : array, shape (num_classes,)
the distribution for every feature and class
References
----------
https://scikit-learn.org/stable/modules/classes.html#module-sklearn.naive_bayes
Example
-------
>>> import numpy as np
    >>> X = [[1, 0], [1, 0], [0, 0], [0, 1], [1, 1], [1, 1],
    ...      [0, 1], [0, 1], [0, 1], [1, 1], [1, 1], [0, 0]]
>>> y = [1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0]
>>> X = np.array(X)
>>> y = np.array(y)
    >>> clf = MixedNB(categorical_features=[0, 1])
    >>> clf.fit(X, y)
>>> print(clf.predict([[0, 0]]))
"""
def __init__(self, categorical_features=None, max_categories=None,
alpha=0.5, priors=None, var_smoothing=1e-9):
self.alpha = alpha
self.var_smoothing = var_smoothing
self.num_features = 0
self.epsilon = 1e-9
self._is_fitted = False
self.max_categories = max_categories
self.categorical_features = categorical_features
self.initial_priors = priors
self.gaussian_features = []
self.priors = self.initial_priors
self.theta = []
self.sigma = []
self.categorical_posteriors = []
def __repr__(self):
return str(f"{self.__class__.__name__}(alpha={self.alpha}, " +
f"var_smoothing={self.var_smoothing})")
def fit(self, X, y):
"""Fit Mixed Naive Bayes according to X, y
This method also prepares a `self.models` object. Note that the reason
why some variables are cast to list() is to make the models object
JSON serializable.
Parameters
----------
X : array-like, shape (num_samples, n_features)
Training vectors, where num_samples is the number of samples
and n_features is the number of features.
        y : array-like, shape (num_samples,)
            Target values. Categorical features are declared through the
            categorical_features argument of the constructor, not here.
Returns
-------
self : object
"""
if self._is_fitted:
self.gaussian_features = []
self.priors = self.initial_priors
self.theta = []
self.sigma = []
self.categorical_posteriors = []
# Validate inputs
self.alpha = _validate_inits(self.alpha)
X, y = _validate_training_data(
X, y, self.categorical_features)
# From https://github.com/scikit-learn/scikit-learn/blob/1495f6924/sklearn/naive_bayes.py#L344
# If the ratio of data variance between dimensions is too small, it
# will cause numerical errors. To address this, we artificially
# boost the variance by epsilon, a small fraction of the standard
# deviation of the largest dimension.
self.epsilon = self.var_smoothing * np.var(X, ddof=1, axis=0).max()
# Get whatever that is needed
uniques = np.unique(y)
num_classes = uniques.size
(num_samples, self.num_features) = X.shape
# print(self.priors)
# Correct the inputs
if self.priors is None:
self.priors = np.bincount(y)/num_samples
else:
self.priors = np.asarray(self.priors)
if len(self.priors) != num_classes:
raise ValueError('Number of priors must match number of classes.')
            if not np.isclose(self.priors.sum(), 1.0):
                raise ValueError("The sum of priors should be 1.")
if (self.priors < 0).any():
raise ValueError('Priors must be non-negative.')
if self.categorical_features is None:
self.categorical_features = []
# Get the index columns of the discrete data and continuous data
self.categorical_features = np.array(self.categorical_features).astype(int)
self.gaussian_features = np.delete(
np.arange(self.num_features), self.categorical_features)
# How many categories are there in each categorical_feature
# Add 1 due to zero-indexing
if self.max_categories is None:
self.max_categories = np.max(X[:, self.categorical_features], axis=0) + 1
self.max_categories = self.max_categories.astype(int)
else:
self.max_categories = np.array(self.max_categories).astype(int)
# Prepare empty arrays
if self.gaussian_features.size != 0:
self.theta = np.zeros((num_classes, len(self.gaussian_features)))
self.sigma = np.zeros((num_classes, len(self.gaussian_features)))
if self.categorical_features.size != 0:
self.categorical_posteriors = [
np.zeros((num_classes, num_categories))
for num_categories in self.max_categories]
# TODO optimise below!
for y_i in uniques:
if self.gaussian_features.size != 0:
x = X[y==y_i, :][:, self.gaussian_features]
self.theta[y_i, :] = np.mean(x, axis=0)
self.sigma[y_i, :] = np.var(x, axis=0) # note: it's really sigma squared
if self.categorical_features.size != 0:
for i, categorical_feature in enumerate(self.categorical_features):
dist = np.bincount(X[y == y_i, :][:, categorical_feature].astype(int),
minlength=self.max_categories[i]) + self.alpha
self.categorical_posteriors[i][y_i,:] = dist/np.sum(dist)
self._is_fitted = True
return self
def predict_proba(self, X_test, verbose=False):
"""
Return probability estimates for the test vector X_test.
Parameters
----------
X_test : array-like, shape = [num_samples, num_features]
Returns
-------
C : array-like, shape = [num_samples, num_classes]
Returns the probability of the samples for each class in
the model. The columns correspond to the classes in sorted
order, as they appear in the attribute `classes_`.
"""
if not self._is_fitted:
raise NotFittedError
_validate_test_data(X_test, self.num_features)
X_test = np.array(X_test)
if self.gaussian_features.size != 0:
# TODO optimisation: Below is a copy. Can consider masking
x_gaussian = X_test[:, self.gaussian_features]
mu = self.theta[:, np.newaxis]
s = self.sigma[:, np.newaxis]
s = s + self.epsilon
# For every y_class and feature,
# take values of x's from the samples
# to get its likelihood
# (num_classes, num_samples, num_features)
something = 1./np.sqrt(2.*np.pi*s) * \
np.exp(-((x_gaussian-mu)**2.)/(2.*s))
# For every y_class and sample,
# multiply all the features
# (num_samples, num_classes)
t = np.prod(something, axis=2)[:, :, np.newaxis]
t = np.squeeze(t.T)
if self.categorical_features.size != 0:
# Cast tensor to int
X = X_test[:, self.categorical_features].astype(int)
# A list of length=num_features.
# Each item in the list contains the distributions for the y_classes
# Shape of each item is (num_classes,1,num_samples)
probas = [categorical_posterior[:, X[:, i][:,np.newaxis]]
for i, categorical_posterior
in enumerate(self.categorical_posteriors)]
r = np.concatenate([probas], axis=0)
r = np.squeeze(r, axis=-1)
r = np.moveaxis(r, [0,1,2], [2,0,1])
# (num_samples, num_classes)
p = np.prod(r, axis=2).T
if self.gaussian_features.size != 0 and self.categorical_features.size != 0:
finals = t * p * self.priors
elif self.gaussian_features.size != 0:
finals = t * self.priors
elif self.categorical_features.size != 0:
finals = p * self.priors
normalised = finals.T/(np.sum(finals, axis=1) + 1e-6)
normalised = np.moveaxis(normalised, [0,1], [1,0])
return normalised
def predict(self, X, verbose=False):
"""
Perform classification on an array of test vectors X.
Parameters
----------
X : array-like, shape = [num_samples, n_features]
Returns
-------
C : array, shape = [num_samples]
Predicted target values for X
"""
probs = self.predict_proba(X, verbose)
return np.argmax(probs, axis=1)
def get_params(self, deep=False):
"""Get parameters for this model.
Returns
-------
params : mapping of string to any
Parameter names mapped to their values.
"""
return {
'categorical_features': self.categorical_features,
'max_categories': self.max_categories,
'alpha': self.alpha,
'priors': self.priors,
'var_smoothing': self.var_smoothing
}
def score(self, X, y):
"""Returns the mean accuracy on the given test data and labels.
Parameters
----------
X : array-like, shape = (num_samples, n_features)
Test samples.
y : array-like, shape = (num_samples)
True labels for X.
Returns
-------
score : float
Mean accuracy of self.predict(X) wrt. y.
"""
y_true = np.array(y)
y_predicted = np.array(self.predict(X))
bool_comparison = y_true == y_predicted
return np.sum(bool_comparison) / bool_comparison.size
class NotFittedError(Exception):
"""
Exception class for cases when the predict API is called before
model is fitted.
"""
    def __str__(self):
        return ("This MixedNB instance is not fitted yet. Call 'fit' "
                "with appropriate arguments before using this method.")
def _validate_test_data(X, num_features):
X = np.array(X)
    if X.ndim != 2:
raise ValueError("Bad input shape of X_test. " +
f"Expected an array of dim 2 but got dim {X.ndim} instead.")
if X.shape[1] != num_features:
raise ValueError("Bad input shape of X_test. " +
f"Expected (,{num_features}) but got (,{X.shape[1]}) instead")
def _validate_inits(alpha):
if not isinstance(alpha, (int, float)):
raise TypeError('Expected smoothing parameter alpha to be int or float.')
    if alpha < 0:
        raise ValueError('Expected smoothing parameter alpha >= 0. '
                         f'Got {alpha}.')
if alpha < _ALPHA_MIN:
warnings.warn('alpha too small will result in numeric errors, '
f'setting alpha = {_ALPHA_MIN}')
alpha = _ALPHA_MIN
return alpha
def _validate_training_data(X_raw, y_raw, categorical_features):
"""Verifying user inputs
The following will be checked:
- dimensions
- number of samples
- data type (numbers only)
- data type for categorical distributions (integers only, starting from 0 onwards)
"""
ACCEPTABLE_TYPES = ['float64', 'int64', 'float32', 'int32']
X = np.array(X_raw)
y = np.squeeze(np.array(y_raw).astype(int))
    if X.ndim != 2:
raise ValueError("Bad input shape of X. " +
f"Expected 2D array, but got {X.ndim}D instead. " +
"Reshape your data accordingly.")
    if y.ndim != 1:
raise ValueError("Bad input shape of y. " +
f"Expected 1D/2D array, but got {y.ndim}D instead. " +
"Reshape your data accordingly.")
if X.shape[0] != y.shape[0]:
raise ValueError(
"No. of samples in X does not match no. of samples in y")
    if X.dtype not in ACCEPTABLE_TYPES:
        raise ValueError("Expected X to contain only numerics, " +
                         f"but got type {X.dtype} instead. For categorical variables, " +
                         "encode your data using sklearn's LabelEncoder.")
    if y.dtype not in ACCEPTABLE_TYPES:
        raise ValueError("Expected y to contain only numerics, " +
                         f"but got type {y.dtype} instead. For categorical variables, " +
                         "encode your data using sklearn's LabelEncoder.")
return X, y
# if categorical_features is not None:
# for feature_no in categorical_features:
# uniques = np.unique(X[:, feature_no]).astype(int)
# if not np.array_equal(uniques, list(range(np.max(uniques)+1))):
# raise ValueError(f"Expected feature no. {feature_no} to have " +
# f"{list(range(np.max(uniques)))} " +
# f"unique values, but got {uniques} instead.")
def load_example():
"""Load an example dataset"""
X = [[0, 0, 180, 75],
[1, 1, 165, 61],
[1, 0, 167, 62],
[0, 1, 178, 63],
[1, 1, 174, 69],
[2, 1, 166, 60],
[0, 2, 167, 59],
[2, 2, 165, 60],
[1, 1, 173, 68],
[0, 2, 178, 71]]
y = [0, 0, 1, 0, 0, 0, 1, 1, 0, 0]
return X, y
Functions
def load_example()
Load an example dataset
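The bundled example can be fed straight into the classifier. A hedged sketch
(the assignment of the first two columns as categorical and the last two as
continuous is an assumption based on the data's integer codes):

>>> X, y = load_example()
>>> clf = MixedNB(categorical_features=[0, 1])
>>> clf.fit(X, y)
>>> clf.score(X, y)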
Classes
class MixedNB (categorical_features=None, max_categories=None, alpha=0.5, priors=None, var_smoothing=1e-09)
Naive Bayes classifier for Categorical and Gaussian models.

Note: When using categorical_features, MixedNB expects that for each such
feature, all possible categories are captured in the training data X passed to
the MixedNB.fit() method. This is to ensure numerical stability.

Parameters

categorical_features : array-like of ints, optional (default=None)
    Indices of the columns in X which hold categorical features. All remaining
    columns are treated as Gaussian (continuous) features.
max_categories : array-like of ints, optional (default=None)
    Number of categories for each categorical feature. If None, this is
    inferred from the training data.
alpha : non-negative float, optional (default=0.5)
    Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing).
    This is for features with categorical distribution.
priors : array-like, size (num_classes,), optional (default=None)
    Prior probabilities of the classes. If specified, the priors are not
    adjusted according to the data.
var_smoothing : float, optional (default=1e-9)
    Portion of the largest variance of all features that is added to
    variances for calculation stability.

Attributes

priors : array, shape (num_classes,)
    Probability of each class.
epsilon : float
    Absolute additive value to variances.
num_samples : int
    Number of training samples.
num_features : int
    Number of features of X.
num_classes : int
    Number of classes (number of unique labels in y).
models : array, shape (num_classes,)
    The distribution for every feature and class.

References

https://scikit-learn.org/stable/modules/classes.html#module-sklearn.naive_bayes

Example

>>> import numpy as np
>>> X = [[1, 0], [1, 0], [0, 0], [0, 1], [1, 1], [1, 1],
...      [0, 1], [0, 1], [0, 1], [1, 1], [1, 1], [0, 0]]
>>> y = [1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0]
>>> X = np.array(X)
>>> y = np.array(y)
>>> clf = MixedNB(categorical_features=[0, 1])
>>> clf.fit(X, y)
>>> print(clf.predict([[0, 0]]))
Methods
def fit(self, X, y)
Fit Mixed Naive Bayes according to X, y.

This method also prepares a self.models object. Note that the reason why some
variables are cast to list() is to make the models object JSON serializable.

Parameters

X : array-like, shape (num_samples, n_features)
    Training vectors, where num_samples is the number of samples and
    n_features is the number of features.
y : array-like, shape (num_samples,)
    Target values. Categorical features are declared through the
    categorical_features argument of the constructor, not here.

Returns

self : object
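A small, hedged sketch of fitting with explicit class priors (the values are
illustrative only, and it assumes MixedNB and load_example have been imported
from this module). When priors is given, it is used as-is instead of being
estimated from y.

>>> X, y = load_example()
>>> clf = MixedNB(categorical_features=[0, 1], priors=[0.5, 0.5])
>>> clf.fit(X, y)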
def get_params(self, deep=False)
Get parameters for this model.

Returns

params : mapping of string to any
    Parameter names mapped to their values.
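For example (a hedged sketch; the returned values simply reflect whatever was
passed to the constructor or learned during fit):

>>> clf = MixedNB(categorical_features=[0, 1])
>>> params = clf.get_params()
>>> sorted(params.keys())
['alpha', 'categorical_features', 'max_categories', 'priors', 'var_smoothing']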
def predict(self, X, verbose=False)
Perform classification on an array of test vectors X.

Parameters

X : array-like, shape = [num_samples, n_features]

Returns

C : array, shape = [num_samples]
    Predicted target values for X
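A hedged sketch using the bundled example data (assuming MixedNB and
load_example have been imported from this module); predict returns one label
per test row.

>>> X, y = load_example()
>>> clf = MixedNB(categorical_features=[0, 1]).fit(X, y)
>>> clf.predict([[0, 1, 170, 65], [2, 2, 166, 60]])  # one predicted label per row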
def predict_proba(self, X_test, verbose=False)
Return probability estimates for the test vector X_test.

Parameters

X_test : array-like, shape = [num_samples, num_features]

Returns

C : array-like, shape = [num_samples, num_classes]
    Returns the probability of the samples for each class in the model. The
    columns correspond to the classes in sorted order, as they appear in the
    attribute classes_.
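A hedged sketch showing the shape of the output: one row per test sample, one
column per class, with each row summing to approximately 1 (assuming MixedNB
and load_example have been imported from this module).

>>> X, y = load_example()
>>> clf = MixedNB(categorical_features=[0, 1]).fit(X, y)
>>> probs = clf.predict_proba([[0, 1, 170, 65]])
>>> probs.shape  # (num_samples, num_classes)
(1, 2)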
def score(self, X, y)
Returns the mean accuracy on the given test data and labels.

Parameters

X : array-like, shape = (num_samples, n_features)
    Test samples.
y : array-like, shape = (num_samples,)
    True labels for X.

Returns

score : float
    Mean accuracy of self.predict(X) wrt. y.
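A hedged sketch of evaluating training-set accuracy on the bundled example
data (assuming MixedNB and load_example have been imported from this module);
the returned value is a float between 0 and 1.

>>> X, y = load_example()
>>> clf = MixedNB(categorical_features=[0, 1]).fit(X, y)
>>> acc = clf.score(X, y)  # fraction of samples where predict(X) matches y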
class NotFittedError (*args, **kwargs)
Exception class for cases when the predict API is called before model is fitted.
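For example, calling predict (or predict_proba) before fit raises this
exception; a brief illustrative sketch:

>>> clf = MixedNB()
>>> clf.predict([[0, 0]])  # raises NotFittedError because fit() was never called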
Ancestors
- builtins.Exception
- builtins.BaseException