Source code for bcselector.data_generation

import numpy as np
import pandas as pd

from sklearn.preprocessing import KBinsDiscretizer


class _BasicDataGenerator():
    def __init__(self):
        self.n_rows = None
        self.n_cols = None
        self.seed = None

    def generate(self, n_rows=100, n_cols=10, seed=None):
        self.n_rows = n_rows
        self.n_cols = n_cols
        if seed is None:
            self.seed = np.random.randint(0, 420000)
        else:
            self.seed = seed
        np.random.seed(self.seed)


[docs]class MatrixGenerator(_BasicDataGenerator): def __init__(self): super().__init__() def _generate_basic_dataset(self, loc=0, scale=1): X = np.random.normal(loc=loc, scale=scale, size=(self.n_rows, self.n_cols)) y = np.random.binomial(1, np.exp(X.sum(axis=1))/(1+np.exp(X.sum(axis=1)))) return X, y def _generate_noise(self, sigma, loc=0): noise_sigmas = np.repeat(sigma, self.n_cols) noise = np.random.normal(loc=loc, scale=noise_sigmas, size=(self.n_rows, self.n_cols)) return noise
[docs] def generate(self, n_rows=100, n_basic_cols=10, loc=0, noise_sigmas=None, basic_cost=1, seed=None, discretize_method='uniform', discretize_bins=10): assert isinstance(n_rows, int), "Argument `n_rows` must be int." assert isinstance(n_basic_cols, int), "Argument `n_cols` must be int." assert isinstance(loc, int), "Argument `loc` must be int or float." assert discretize_method in ['uniform', 'quantile', 'kmeans'], "Argument `discretize_method` must be in ['uniform', 'quantile', 'kmeans']" assert discretize_bins > 1 and isinstance(discretize_bins, int), "Argument `discretize_bins` must be int greater then 1." super().generate(n_rows, n_basic_cols, seed) self.loc = loc self.noise_sigmas = noise_sigmas # Generate basic features X_basic, y = self._generate_basic_dataset(loc=0, scale=1) costs = [basic_cost for i in range(self.n_cols)] if self.noise_sigmas is None: return X_basic, y, costs # Generate perturbed features X = X_basic.copy() for noise_sigma in self.noise_sigmas: noise = self._generate_noise(sigma=noise_sigma/(len(noise_sigmas)), loc=0) X_transformed = X_basic + noise X = np.concatenate((X, X_transformed), axis=1) costs = costs + [1/(noise_sigma + basic_cost) for i in range(self.n_cols)] # Discretize data def discretize(vector, **kwargs): discretizer = KBinsDiscretizer(encode='ordinal', **kwargs) vector = discretizer.fit_transform(vector.reshape(-1, 1)).reshape(-1) return vector X = np.apply_along_axis(func1d=discretize, axis=0, arr=X, n_bins=discretize_bins, strategy=discretize_method) return X, y, costs
[docs]class DataFrameGenerator(MatrixGenerator): def __init__(self): super().__init__() def _generate_colnames(self, n): new_cols = ['var_' + str(i) for i in np.arange(1, n+1)] return new_cols
[docs] def generate(self, n_rows=100, n_basic_cols=10, loc=0, noise_sigmas=None, basic_cost=1, seed=None, discretize_method='uniform', discretize_bins=10): X, y, costs = super().generate(n_rows=n_rows, n_basic_cols=n_basic_cols, loc=loc, noise_sigmas=noise_sigmas, seed=seed, discretize_method=discretize_method, discretize_bins=discretize_bins) # Generate colnames if noise_sigmas is None: noise_sigmas_len = 0 else: noise_sigmas_len = len(noise_sigmas) cols = self._generate_colnames(self.n_cols + self.n_cols*noise_sigmas_len) # Zip costs costs_dict = dict(zip(cols, costs)) # Create final data frame X_df = pd.DataFrame(X, columns=cols) y_series = pd.Series(y) return X_df, y_series, costs_dict