Source code for vbvarsel.data_sim_crook

import numpy as np
from .experiment_data import ExperimentValues

[docs] class SimulateCrookData: """ A class to represent simulated data as described by Crook et al. Data will be generated in accordance to the parameters passed through :func:`vbvarsel.global_parameters.SimulationParameters`. This only generates synthetic data, and as such is not used if a user supplies their own data source. `Reference paper <https://www.degruyter.com/document/doi/10.1515/sagmb-2018-0065/html>`_ """ # Attributes # observation : int # Number of observations to simulate. # n_variables : int # Number of variables to simulate. # n_relevant : int # Number of variables that are relevant. # mixture_proportions : list[float] # Proportion of observations in each cluster, length of the array defines # number of simulated clusters. # means : list[int] # Mean of the Gaussian distribution for each cluster. # variance_covariance_matrix : np.ndarray # Matrix of variance and covariance for simulation. def __init__( self, observation:int, n_variables: int, n_relevant: int, mixture_proportions: list, means: list, variance_covariance_matrix: np.ndarray, ):
[docs] self.observation = observation
[docs] self.n_variables = n_variables
[docs] self.n_relevant = n_relevant
[docs] self.mixture_proportions = mixture_proportions
[docs] self.means = means
[docs] self.variance_covariance_matrix = variance_covariance_matrix
[docs] self.ExperimentValues = ExperimentValues()
[docs] def relevant_vars(self) -> np.ndarray: """Returns array of relevant variables for use in simulation.""" samples = [] true_labels = [] # Store the true labels for _ in range(self.observation): # Select mixture component based on proportions component = np.random.choice(list(range(len(self.mixture_proportions))), p=self.mixture_proportions) true_labels.append(component) # Store the true label mean_vector = np.full(self.n_relevant, self.means[component]) sample = np.random.multivariate_normal( mean_vector, self.variance_covariance_matrix ) samples.append(sample) # Convert list of samples to numpy array self.ExperimentValues.true_labels = true_labels return np.array(samples)
[docs] def irrelevant_vars(self) -> np.ndarray: """Returns array of irrelevant variables in simulation.""" n_irrelevant = self.n_variables - self.n_relevant return np.random.randn(self.observation, n_irrelevant)
[docs] def data_sim(self) -> np.ndarray: """Returns simulated data array.""" # Combine relevant and irrelevant variables relevant_variables = self.relevant_vars() irrelevant_variables = self.irrelevant_vars() data = np.hstack((relevant_variables, irrelevant_variables)) self.ExperimentValues.data = data return data
[docs] def permutation(self) -> np.ndarray: """Returns permutations for simulation.""" permutations = np.random.permutation(self.n_variables) self.ExperimentValues.permutations = permutations return permutations
[docs] def shuffle_sim_data(self, data, permutation) -> np.ndarray: """Shuffles randomised data for simulation. Params data: np.ndarray Array of data generated from `self.data_sim()` permutation: np.ndarray Array of permutations generated from `self.permutations()` """ shuffled_data = data[:, permutation] self.ExperimentValues.shuffled_data = shuffled_data