Source code for bcselector.datasets

import json
from os.path import dirname, join

import numpy as np
import pandas as pd
from sklearn.preprocessing import KBinsDiscretizer


def _discretize(vector, **kwargs):
    """Bin a vector into ordinal codes with sklearn.preprocessing.KBinsDiscretizer.

    Parameters
    ----------
    vector : np.array
        Values to discretize. They are reshaped into a single column before
        the discretizer is fitted.
    kwargs
        Keyword arguments forwarded to the
        sklearn.preprocessing.KBinsDiscretizer constructor (``encode`` is
        always set to ``'ordinal'`` here).

    Returns
    -------
    discretized_vector: np.array
        Flattened output of the fitted discretizer for the given vector.
    """
    # The discretizer is fed a single-column 2-D view of the input.
    column = vector.reshape(-1, 1)
    binned = KBinsDiscretizer(encode='ordinal', **kwargs).fit_transform(column)
    # Collapse the single-column result back to one dimension.
    return binned.reshape(-1)


def load_sample(as_frame=True):
    """Load and return the sample artificial dataset.

    =================   ==============
    Samples total                10000
    Dimensionality                  35
    Target variables                 1
    =================   ==============

    Parameters
    ----------
    as_frame : bool, default=True
        If True, the data is returned as a pandas DataFrame with named
        columns, the target as the ``'Class'`` pandas Series and the costs
        as a dict. If False, plain numpy arrays and a list are returned.

    Returns
    -------
    data : {np.ndarray, pd.DataFrame} of shape (10000, 35)
        The data matrix. If `as_frame=True`, `data` will be a pd.DataFrame.
    target : {np.ndarray, pd.Series}
        The binary classification target variable. If `as_frame=True`,
        `target` is the ``'Class'`` column as a pd.Series; otherwise it is
        the whole target table as a 2-D np.ndarray (one column per column
        of the target file).
    costs : {dict, list}
        Cost of every feature in data. If `as_frame=True`, `costs` will be
        a dict as stored in the costs file; otherwise a list of its values.

    Examples
    --------
    >>> from bcselector.datasets import load_sample
    >>> data, target, costs = load_sample()
    """
    # All files of this dataset live next to this module.
    sample_dir = join(dirname(__file__), 'data', 'sample_data')

    data = pd.read_csv(join(sample_dir, 'sample_data.csv'))
    targets = pd.read_csv(join(sample_dir, 'sample_target.csv'))
    # Explicit encoding so that decoding does not depend on the platform locale.
    with open(join(sample_dir, 'sample_costs.json'), 'r', encoding='utf-8') as costs_file:
        costs = json.load(costs_file)

    if as_frame:
        return data, targets['Class'], costs
    return data.values, targets.values, list(costs.values())
def load_hepatitis(as_frame=True, discretize_data=True, **kwargs):
    """Load and return the hepatitis dataset.

    The hepatitis dataset is a small medical dataset with a single target
    variable. Dataset is collected from UCI repository [3]_.

    =================   ==============
    Samples total                  155
    Dimensionality                  19
    Target variables                 1
    =================   ==============

    Parameters
    ----------
    as_frame : bool, default=True
        If True, the data is returned as a pandas DataFrame with named
        columns, the target as the ``'Class'`` pandas Series and the costs
        as a dict. If False, plain numpy arrays and a list are returned.
    discretize_data : bool, default=True
        If True, the returned data is discretized with
        sklearn.preprocessing.KBinsDiscretizer. Only columns with more than
        ``n_bins`` distinct values are discretized; the remaining columns
        are returned unchanged.
    kwargs
        Arguments passed to the sklearn.preprocessing.KBinsDiscretizer
        constructor. ``n_bins`` is additionally used as the distinct-value
        threshold described above; when it is not given the threshold is 10
        and the discretizer falls back to its own default number of bins.

    Returns
    -------
    data : {np.ndarray, pd.DataFrame} of shape (155, 19)
        The data matrix. If `as_frame=True`, `data` will be a pd.DataFrame.
    target : {np.ndarray, pd.Series}
        The binary classification target variable. If `as_frame=True`,
        `target` is the ``'Class'`` column as a pd.Series; otherwise it is
        the whole target table as a 2-D np.ndarray (one column per column
        of the target file).
    costs : {dict, list}
        Cost of every feature in data. If `as_frame=True`, `costs` will be
        a dict as stored in the costs file; otherwise a list of its values.

    References
    ----------
    .. [3] Dua, D. and Graff, C. (2019). UCI Machine Learning Repository
       [http://archive.ics.uci.edu/ml]. Irvine, CA: University of
       California, School of Information and Computer Science.

    Examples
    --------
    >>> from bcselector.datasets import load_hepatitis
    >>> data, target, costs = load_hepatitis()
    """
    # All files of this dataset live next to this module.
    hepatitis_dir = join(dirname(__file__), 'data', 'hepatitis')

    data = pd.read_csv(join(hepatitis_dir, 'hepatitis.csv'))
    targets = pd.read_csv(join(hepatitis_dir, 'hepatitis_target.csv'))
    # Explicit encoding so that decoding does not depend on the platform locale.
    with open(join(hepatitis_dir, 'hepatitis_costs.json'), 'r', encoding='utf-8') as costs_file:
        costs = json.load(costs_file)

    if discretize_data:
        data_colnames = data.columns
        n_bins = kwargs.get('n_bins', 10)
        # Count distinct values once and reuse the result for both selections.
        unique_counts = data.nunique()
        col_to_discretize = unique_counts[unique_counts > n_bins].index

        # np.apply_along_axis raises ValueError on an array with zero
        # columns, so leave the data untouched when no column qualifies.
        if len(col_to_discretize) > 0:
            col_not_changing = unique_counts[unique_counts <= n_bins].index
            data_discretized = np.apply_along_axis(
                func1d=_discretize,
                axis=0,
                arr=data[col_to_discretize].values,
                **kwargs
            )
            data = pd.concat(
                [
                    pd.DataFrame(data_discretized, columns=col_to_discretize),
                    data[col_not_changing],
                ],
                axis=1,
            )
            # Restore the column order of the file on disk.
            data = data[data_colnames]

    if as_frame:
        return data, targets['Class'], costs
    return data.values, targets.values, list(costs.values())