Module mixed_naive_bayes
The `mixed_naive_bayes` module implements Categorical and Gaussian Naive
Bayes algorithms. These are supervised learning methods based on applying
Bayes' theorem with strong (naive) feature-independence assumptions. The
API design is similar to scikit-learn's. See the example in `MixedNB`.
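Under the naive independence assumption, the class posterior factorises as
P(y | x_1, ..., x_n) ∝ P(y) * Π_i P(x_i | y), where each P(x_i | y) is an
empirical category frequency for categorical features and a Gaussian density
for continuous features. A minimal end-to-end sketch (assuming the package is
importable as `mixed_naive_bayes`):

    from mixed_naive_bayes import MixedNB, load_example

    X, y = load_example()  # columns 0 and 1 categorical, columns 2 and 3 continuous
    clf = MixedNB()
    clf.fit(X, y, categorical_features=[0, 1])
    print(clf.predict(X))   # one predicted class label per sample
    print(clf.score(X, y))  # mean accuracy on the training data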
Source code
# -*- coding: utf-8 -*-
"""
The `mixed_naive_bayes` module implements Categorical and Gaussian
Naive Bayes algorithms. These are supervised learning methods based on
applying Bayes' theorem with strong (naive) feature independence assumptions.
The API's design is similar to scikit-learn's.
Look at the example in `mixed_naive_bayes.MixedNB`.
"""
import numpy as np
class MixedNB:
"""
Naive Bayes classifier for categorical and Gaussian models.
    Note: MixedNB expects that for each categorical feature, all possible
    categories are present in the dataset or encoded.
Parameters
----------
    alpha : non-negative float, optional (default=0)
        Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing).
        This applies to features with a categorical distribution.
    class_prior : array-like, size (num_classes,), optional (default=None)
        Prior probabilities of the classes. If specified, the priors are not
        adjusted according to the data.
var_smoothing : float, optional (default=1e-9)
Portion of the largest variance of all features that is added to
variances for calculation stability.
    Attributes
    ----------
    prior : array, shape (num_classes,)
        Probability of each class.
    epsilon : float
        Absolute additive value to variances.
    num_features : int
        Number of features of X.
    theta : array, shape (num_classes, num_gaussian_features)
        Mean of each Gaussian feature per class.
    sigma : array, shape (num_classes, num_gaussian_features)
        Standard deviation of each Gaussian feature per class.
    categorical_posteriors : list of arrays, shape (num_classes, num_categories)
        Category distribution of each categorical feature per class.
References
----------
https://scikit-learn.org/stable/modules/classes.html#module-sklearn.naive_bayes
Example
-------
    >>> import numpy as np
    >>> X = np.array([[1, 0], [1, 0], [0, 0], [0, 1], [1, 1], [1, 1],
    ...               [0, 1], [0, 1], [0, 1], [1, 1], [1, 1], [0, 0]])
    >>> y = np.array([1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0])
    >>> clf = MixedNB()
    >>> clf = clf.fit(X, y, categorical_features=[0, 1])
    >>> print(clf.predict([[0, 0]]))
    [0]
"""
def __init__(self, alpha=0.0, class_prior=None, var_smoothing=1e-9):
self.alpha = alpha
self.var_smoothing = var_smoothing
self.num_features = 0
self.epsilon = 1e-9
self._is_fitted = False
self.prior = class_prior
self.theta = []
self.sigma = []
self.categorical_posteriors = []
self.gaussian_features = []
self.categorical_features = []
def fit(self, X, y, categorical_features=None):
"""Fit Mixed Naive Bayes according to X, y
This method also prepares a `self.models` object. Note that the reason
why some variables are cast to list() is to make the models object
JSON serializable.
Parameters
----------
        X : array-like, shape (num_samples, num_features)
            Training vectors, where num_samples is the number of samples
            and num_features is the number of features.
y : array-like, shape (num_samples,)
Target values.
        categorical_features : array-like of ints, optional
            Indices of the columns that hold categorical features. All
            remaining columns are treated as Gaussian.
Returns
-------
self : object
"""
self.categorical_features = categorical_features
_validate_inits(self.alpha, self.prior)
        _validate_training_data(X, y, self.categorical_features)
        X = np.array(X)
        y = np.array(y).astype(int)
        num_classes = np.unique(y).size
        num_samples, self.num_features = X.shape
if self.prior is None:
self.prior = np.bincount(y)/num_samples
else:
self.prior = np.array(self.prior)
if self.categorical_features is None:
self.categorical_features = []
self.categorical_features = np.array(self.categorical_features).astype(int)
# From https://github.com/scikit-learn/scikit-learn/blob/1495f6924/sklearn/naive_bayes.py#L344
# If the ratio of data variance between dimensions is too small, it
# will cause numerical errors. To address this, we artificially
# boost the variance by epsilon, a small fraction of the standard
# deviation of the largest dimension.
self.epsilon = np.sqrt(self.var_smoothing) * np.std(X, ddof=0, axis=0).max()
self.gaussian_features = np.delete(
np.arange(self.num_features), self.categorical_features)
# How many categories are there in each categorical_feature
# Add 1 due to zero-indexing
max_categories = np.max(X[:, self.categorical_features], axis=0) + 1
max_categories = max_categories.astype(int)
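        # For example, a column whose observed values are {0, 1, 2} has
        # max 2 and is therefore modelled with 2 + 1 = 3 categories.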
print(f"Max categories: {max_categories}")
# Prepare empty arrays
self.theta = np.zeros((num_classes, len(self.gaussian_features)))
self.sigma = np.zeros((num_classes, len(self.gaussian_features)))
if self.categorical_features.size != 0:
self.categorical_posteriors = [
np.zeros((num_classes, num_categories))
for num_categories in max_categories]
for y_i in np.unique(y):
if self.gaussian_features.size != 0:
x = X[y == y_i, :][:, self.gaussian_features]
self.theta[y_i, :] = np.mean(x, axis=0)
# Bessel's correction; n-1
self.sigma[y_i, :] = np.std(x, ddof=1, axis=0)
            if self.categorical_features.size != 0:
                for i, categorical_feature in enumerate(self.categorical_features):
                    counts = np.bincount(X[y == y_i, :][:, categorical_feature].astype(int),
                                         minlength=max_categories[i])
                    # Lidstone/Laplace smoothing: add alpha to every category
                    # count so unseen categories keep a non-zero probability.
                    smoothed = counts + self.alpha
                    self.categorical_posteriors[i][y_i, :] = smoothed / np.sum(smoothed)
        self._is_fitted = True
        return self
def predict_proba(self, X_test, verbose=False):
"""
Return probability estimates for the test vector X_test.
Parameters
----------
X_test : array-like, shape = [num_samples, num_features]
Returns
-------
        C : array-like, shape = [num_samples, num_classes]
            The probability of each sample for each class in the model.
            The columns correspond to the classes in sorted order.
"""
if not self._is_fitted:
raise NotFittedError
_validate_test_data(X_test, self.num_features)
X_test = np.array(X_test)
        if self.gaussian_features.size != 0:
            x_gaussian = X_test[:, self.gaussian_features]
            mu = self.theta[:, np.newaxis]
            s = self.sigma[:, np.newaxis] + self.epsilon
            # Gaussian likelihood of every sample's value, for every class
            # and feature; shape (num_classes, num_samples, num_features)
            likelihood = 1. / np.sqrt(2. * np.pi * (s ** 2.)) * \
                np.exp(-((x_gaussian - mu) ** 2.) / (2. * (s ** 2.)))
            # Multiply the per-feature likelihoods for every class and
            # sample; shape (num_samples, num_classes)
            t = np.prod(likelihood, axis=2).T
        if self.categorical_features.size != 0:
            X = X_test[:, self.categorical_features].astype(int)
            # One array per categorical feature, each of shape
            # (num_classes, num_samples, 1): the class-conditional
            # probability of every sample's observed category.
            probas = [categorical_posterior[:, X[:, i][:, np.newaxis]]
                      for i, categorical_posterior in enumerate(self.categorical_posteriors)]
            r = np.squeeze(np.stack(probas), axis=-1)  # (num_features, num_classes, num_samples)
            r = np.moveaxis(r, [0, 1, 2], [2, 0, 1])   # (num_classes, num_samples, num_features)
            # Multiply the per-feature probabilities for every class and
            # sample; shape (num_samples, num_classes)
            p = np.prod(r, axis=2).T
        # Combine the feature likelihoods with the class priors, then
        # normalise so that every row sums to one.
        if self.gaussian_features.size != 0 and self.categorical_features.size != 0:
            finals = t * p * self.prior
        elif self.gaussian_features.size != 0:
            finals = t * self.prior
        else:
            finals = p * self.prior
        return finals / np.sum(finals, axis=1, keepdims=True)
def predict(self, X, verbose=False):
"""
Perform classification on an array of test vectors X.
Parameters
----------
        X : array-like, shape = [num_samples, num_features]
Returns
-------
C : array, shape = [num_samples]
Predicted target values for X
"""
probs = self.predict_proba(X, verbose)
return np.argmax(probs, axis=1)
def get_params(self):
"""Get parameters for this model.
Returns
-------
params : mapping of string to any
Parameter names mapped to their values.
"""
        return {
            'alpha': self.alpha,
            'class_prior': self.prior,
            'var_smoothing': self.var_smoothing
        }
def score(self, X, y):
"""Returns the mean accuracy on the given test data and labels.
Parameters
----------
        X : array-like, shape = (num_samples, num_features)
            Test samples.
        y : array-like, shape = (num_samples,)
            True labels for X.
Returns
-------
score : float
            Mean accuracy of self.predict(X) w.r.t. y.
"""
y_true = np.array(y)
y_predicted = np.array(self.predict(X))
bool_comparison = y_true == y_predicted
return np.sum(bool_comparison) / bool_comparison.size
class NotFittedError(Exception):
"""
    Exception raised when the predict API is called before the
    model is fitted.
"""
    def __str__(self):
        return ("This MixedNB instance is not fitted yet. Call 'fit' "
                "with appropriate arguments before using this method.")
def _validate_test_data(X, num_features):
X = np.array(X)
    if X.ndim != 2:
raise ValueError("Bad input shape of X_test. " +
f"Expected an array of dim 2 but got dim {X.ndim} instead.")
if X.shape[1] != num_features:
raise ValueError("Bad input shape of X_test. " +
f"Expected (,{num_features}) but got (,{X.shape[1]}) instead")
def _validate_inits(alpha, priors):
if alpha < 0:
raise ValueError("alpha must be nonnegative.")
    if priors is not None and not np.isclose(np.sum(priors), 1.0):
raise ValueError("The sum of the priors should be 1.")
def _validate_training_data(X_raw, y_raw, categorical_features):
"""Verifying user inputs
The following will be checked:
- dimensions
- number of samples
- data type (numbers only)
- data type for categorical distributions (integers only, starting from 0 onwards)
"""
ACCEPTABLE_TYPES = ['float64', 'int64', 'float32', 'int32']
X = np.array(X_raw)
y = np.array(y_raw)
    if X.ndim != 2:
raise ValueError("Bad input shape of X. " +
f"Expected 2D array, but got dim {X.ndim} instead. " +
"Reshape your data accordingly.")
    if y.ndim != 1:
        raise ValueError("Bad input shape of y. " +
                         f"Expected 1D array, but got dim {y.ndim} instead. " +
                         "Reshape your data accordingly.")
if X.shape[0] != y.shape[0]:
raise ValueError(
"No. of samples in X does not match no. of samples in y")
    if X.dtype not in ACCEPTABLE_TYPES:
        raise ValueError("Expected X to contain only numerics, " +
                         f"but got type {X.dtype} instead. For categorical variables, " +
                         "encode your data using sklearn's LabelEncoder.")
    if y.dtype not in ACCEPTABLE_TYPES:
        raise ValueError("Expected y to contain only numerics, " +
                         f"but got type {y.dtype} instead. For categorical variables, " +
                         "encode your data using sklearn's LabelEncoder.")
def load_example():
    """Load a small example dataset.

    Columns 0 and 1 of X are categorical; columns 2 and 3 are continuous.
    """
    X = [[0, 0, 180, 75],
         [1, 1, 165, 61],
         [1, 0, 167, 62],
         [0, 1, 178, 63],
         [1, 1, 174, 69],
         [2, 1, 166, 60],
         [0, 2, 167, 59],
         [2, 2, 165, 60],
         [1, 1, 173, 68],
         [0, 2, 178, 71]]
    y = [0, 0, 1, 0, 0, 0, 1, 1, 0, 0]
    return np.array(X), np.array(y)
Functions
def load_example()
    Load a small example dataset. Columns 0 and 1 of X are categorical;
    columns 2 and 3 are continuous.
Classes
class MixedNB (alpha=0.0, class_prior=None, var_smoothing=1e-09)
Naive Bayes classifier for categorical and Gaussian models.

Note: MixedNB expects that for each categorical feature, all possible
categories are present in the dataset or encoded.

Parameters

alpha : non-negative float, optional (default=0)
    Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing).
    This applies to features with a categorical distribution.
class_prior : array-like, size (num_classes,), optional (default=None)
    Prior probabilities of the classes. If specified, the priors are not
    adjusted according to the data.
var_smoothing : float, optional (default=1e-9)
    Portion of the largest variance of all features that is added to
    variances for calculation stability.

Attributes

prior : array, shape (num_classes,)
    Probability of each class.
epsilon : float
    Absolute additive value to variances.
num_features : int
    Number of features of X.
theta : array, shape (num_classes, num_gaussian_features)
    Mean of each Gaussian feature per class.
sigma : array, shape (num_classes, num_gaussian_features)
    Standard deviation of each Gaussian feature per class.
categorical_posteriors : list of arrays, shape (num_classes, num_categories)
    Category distribution of each categorical feature per class.

References

https://scikit-learn.org/stable/modules/classes.html#module-sklearn.naive_bayes

Example

>>> import numpy as np
>>> X = np.array([[1, 0], [1, 0], [0, 0], [0, 1], [1, 1], [1, 1],
...               [0, 1], [0, 1], [0, 1], [1, 1], [1, 1], [0, 0]])
>>> y = np.array([1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0])
>>> clf = MixedNB()
>>> clf = clf.fit(X, y, categorical_features=[0, 1])
>>> print(clf.predict([[0, 0]]))
[0]
Methods
def fit(self, X, y, categorical_features=None)
Fit Mixed Naive Bayes according to X, y.

Parameters

X : array-like, shape (num_samples, num_features)
    Training vectors, where num_samples is the number of samples
    and num_features is the number of features.
y : array-like, shape (num_samples,)
    Target values.
categorical_features : array-like of ints, optional
    Indices of the columns that hold categorical features. All
    remaining columns are treated as Gaussian.

Returns

self : object
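Categorical columns must already be encoded as consecutive integers starting
from 0. A small sketch of one way to do that (assuming scikit-learn is
available; `colours` is a made-up column):

    from sklearn.preprocessing import LabelEncoder
    import numpy as np

    colours = np.array(["red", "green", "red", "blue"])  # hypothetical raw data
    encoded = LabelEncoder().fit_transform(colours)      # array([2, 1, 2, 0])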
def get_params(self)
Get parameters for this model.

Returns

params : mapping of string to any
    Parameter names mapped to their values.
def predict(self, X, verbose=False)
Perform classification on an array of test vectors X.

Parameters

X : array-like, shape = [num_samples, num_features]

Returns

C : array, shape = [num_samples]
    Predicted target values for X.
def predict_proba(self, X_test, verbose=False)
Return probability estimates for the test vectors X_test.

Parameters

X_test : array-like, shape = [num_samples, num_features]

Returns

C : array-like, shape = [num_samples, num_classes]
    The probability of each sample for each class in the model.
    The columns correspond to the classes in sorted order.
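Each row of the returned matrix is a probability distribution over the
classes. A usage sketch (reusing `clf` and `X` from the quick-start at the
top of this page, with numpy imported as `np`):

    probs = clf.predict_proba(X)                # shape (num_samples, num_classes)
    assert np.allclose(probs.sum(axis=1), 1.0)  # every row is normalised
    labels = probs.argmax(axis=1)               # identical to clf.predict(X)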
def score(self, X, y)
Returns the mean accuracy on the given test data and labels.

Parameters

X : array-like, shape = (num_samples, num_features)
    Test samples.
y : array-like, shape = (num_samples,)
    True labels for X.

Returns

score : float
    Mean accuracy of self.predict(X) w.r.t. y.
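`score` is plain accuracy; the same value can be computed by hand (a sketch,
reusing `clf`, `X` and `y` from above, with numpy imported as `np`):

    y_pred = clf.predict(X)
    accuracy = np.mean(np.array(y) == y_pred)  # equals clf.score(X, y)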
class NotFittedError (*args, **kwargs)
Exception raised when the predict API is called before the model is fitted.
Ancestors
- builtins.Exception
- builtins.BaseException