Source code for imblearn.datasets.imbalance

"""Transform a dataset into an imbalanced dataset."""


# Authors: Dayvid Oliveira
#          Guillaume Lemaitre <g.lemaitre58@gmail.com>
#          Christos Aridas
# License: MIT

import logging
import warnings
from collections import Counter
from numbers import Real

from sklearn.utils import check_X_y

from ..under_sampling.prototype_selection import RandomUnderSampler
from ..utils import check_ratio

LOGGER = logging.getLogger(__name__)


def make_imbalance(X, y, ratio, min_c_=None, random_state=None, **kwargs):
    """Turn a dataset into an imbalanced dataset by under-sampling classes.

    The requested per-class sample counts are validated with
    ``check_ratio`` (in ``'under-sampling'`` mode) and the data are then
    randomly under-sampled, without replacement, with
    ``RandomUnderSampler``.

    Read more in the :ref:`User Guide <make_imbalanced>`.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Matrix containing the data to be imbalanced.

    y : ndarray, shape (n_samples, )
        Corresponding label for each sample in X.

    ratio : dict, callable or float
        Ratio to use for resampling the data set. This parameter is
        required.

        - If ``dict``, the keys correspond to the targeted classes. The
          values correspond to the desired number of samples. All samples
          will be passed through if the class is not specified.
        - If callable, function taking ``y`` and returns a ``dict``. The
          keys correspond to the targeted classes. The values correspond
          to the desired number of samples.
        - If a real number (deprecated), the class ``min_c_`` is reduced
          to ``int(n_majority * ratio)`` samples, where ``n_majority`` is
          the sample count of the largest class; every other class keeps
          its current count.

        Any other type (including ``str``) raises ``ValueError``.

        .. deprecated:: 0.2
           ``ratio`` as a float is deprecated in 0.2 and will not be
           supported in 0.4. Use a ``dict`` instead.

    min_c_ : str or int, optional (default=None)
        The identifier of the class to be the minority class. If ``None``,
        ``min_c_`` is set to be the current minority class. Only used when
        ``ratio`` is a float for back-compatibility. A value that matches
        no class in ``y`` leaves every class count unchanged.

        .. deprecated:: 0.2
           ``min_c_`` is deprecated in 0.2 and will be removed in 0.4. Use
           ``ratio`` by passing a ``dict`` instead.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number
        generator; If RandomState instance, random_state is the random
        number generator; If None, the random number generator is the
        RandomState instance used by np.random.

    kwargs : dict, optional
        Additional keyword arguments forwarded, together with ``ratio``,
        to ``check_ratio`` when ``ratio`` is a ``dict`` or a callable
        (presumably consumed by a callable ``ratio``). Ignored when
        ``ratio`` is a float.

    Returns
    -------
    X_resampled : ndarray, shape (n_samples_new, n_features)
        The array containing the imbalanced data.

    y_resampled : ndarray, shape (n_samples_new)
        The corresponding label of `X_resampled`

    Raises
    ------
    ValueError
        If ``ratio`` is neither a ``dict``, a callable, nor a real number.

    Warns
    -----
    DeprecationWarning
        When ``ratio`` is a float, and additionally when ``min_c_`` is
        given alongside it.

    Notes
    -----
    See
    :ref:`sphx_glr_auto_examples_applications_plot_multi_class_under_sampling.py`,
    :ref:`sphx_glr_auto_examples_datasets_plot_make_imbalance.py`, and
    :ref:`sphx_glr_auto_examples_plot_ratio_usage.py`.

    Examples
    --------
    >>> from collections import Counter
    >>> from sklearn.datasets import load_iris
    >>> from imblearn.datasets import make_imbalance

    >>> data = load_iris()
    >>> X, y = data.data, data.target
    >>> print('Distribution before imbalancing: {}'.format(Counter(y)))
    Distribution before imbalancing: Counter({0: 50, 1: 50, 2: 50})
    >>> X_res, y_res = make_imbalance(X, y, ratio={0: 10, 1: 20, 2: 30},
    ...                               random_state=42)
    >>> print('Distribution after imbalancing: {}'.format(Counter(y_res)))
    Distribution after imbalancing: Counter({2: 30, 1: 20, 0: 10})

    """
    X, y = check_X_y(X, y)
    target_stats = Counter(y)

    # restrict ratio to be a dict or a callable
    if isinstance(ratio, dict) or callable(ratio):
        ratio_ = check_ratio(ratio, y, 'under-sampling', **kwargs)
    # FIXME: deprecated in 0.2 to be removed in 0.4
    elif isinstance(ratio, Real):
        if min_c_ is None:
            # Default to the class that currently has the fewest samples.
            min_c_ = min(target_stats, key=target_stats.get)
        else:
            # stacklevel=2 attributes the warning to the caller's line so
            # that it is not hidden as a library-internal warning.
            warnings.warn("'min_c_' is deprecated in 0.2 and will be removed"
                          " in 0.4. Use 'ratio' as dictionary instead.",
                          DeprecationWarning, stacklevel=2)
        warnings.warn("'ratio' being a float is deprecated in 0.2 and will not"
                      " be supported in 0.4. Use a dictionary instead.",
                      DeprecationWarning, stacklevel=2)

        n_majority = max(target_stats.values())
        # Only the designated minority class is shrunk; every other class
        # keeps its current number of samples.
        ratio_ = {
            label: int(n_majority * ratio) if label == min_c_ else count
            for label, count in target_stats.items()
        }
        ratio_ = check_ratio(ratio_, y, 'under-sampling')
    else:
        raise ValueError("'ratio' has to be a dictionary or a function"
                         " returning a dictionary. Got {} instead.".format(
                             type(ratio)))

    LOGGER.info('The original target distribution in the dataset is: %s',
                target_stats)

    rus = RandomUnderSampler(ratio=ratio_, replacement=False,
                             random_state=random_state)
    X_resampled, y_resampled = rus.fit_sample(X, y)

    LOGGER.info('Make the dataset imbalanced: %s', Counter(y_resampled))

    return X_resampled, y_resampled