import numpy as np
import pandas as pd
import typing
import os
from sklearn.preprocessing import LabelEncoder
from .experiment_data import ExperimentValues
[docs]
class UserDataHandler:
'''Class object to represent user supplied data.'''
def __init__(self):
'''Initalizer to create the ExperimentValues object.'''
[docs]
self.ExperimentValues = ExperimentValues()
[docs]
def normalise_data(self, data: pd.DataFrame) -> np.ndarray:
'''Function that returns a normalised dataframe from a non-normalised input.
Params
data: pd.DataFrame
DataFrame of unsorted, unshuffled and non-normalised data to be used.
Returns
array_normalised: np.ndarray
2-D array of normalised data
'''
# Compute the mean and standard deviation of each column
mean = np.mean(data, axis=0)
std = np.std(data, axis=0)
# Subtract the mean and divide by the standard deviation
array_normalized = (data - mean) / std
array_normalized = array_normalized.to_numpy()
self.ExperimentValues.data = array_normalized
return array_normalized
[docs]
def shuffle_normalised_data(self,
normalised_data: np.ndarray
) -> typing.Union[pd.Series, np.ndarray]:
'''Shuffles a DataFrame of normalised data.
Params
normalised_data: pd.DataFrame
A normalisd dataframe of numerical data.
Returns
Tuple:
shuffled_data: pd.Series, shuffled_indices: np.ndarray
A tuple of shuffled data and their corresponding indices.
'''
# Number of columns
num_columns = np.shape(normalised_data)[1]
# Generate a permutation of column indices
shuffled_indices = np.random.permutation(num_columns)
# Concatenate with the indices of the last 10 columns
shuffled_indices = np.concatenate(
(shuffled_indices, np.arange(num_columns, np.shape(normalised_data)[1]))
)
# Shuffle the columns of the matrix
shuffled_data = normalised_data[:, shuffled_indices]
self.ExperimentValues.shuffled_data = shuffled_data
self.ExperimentValues.permutations = shuffled_indices
return shuffled_data, shuffled_indices
[docs]
def load_data(
self,
data_source: str | os.PathLike,
cols_to_ignore: list[str] = None,
labels: str | list[str] = None,
header: int = 0,
index_col: bool = False,
) -> None:
"""Loads data to be be used in simulations with option to clean data.
Params
data_loc: str | os.Pathlike
The file location of the spreadsheet, must be in CSV format.
IMPORTANT Format note: Columns should be variables, and rows should be observation.
Header rows and index column will not be loaded. CSV must only have numerical values.
Non-numerical values can be passed, via the `cols_to_ignore` parameter.
cols_to_ignore: list[str] (Optional) (Default: None)
Any columns which are irrelevant or non-numerical to be excluded from analysis.
labels: str | list[str] (Optional) (Default: None)
Labels to be used to calculate the ARI to check clustering accuracy.
This parameter is optional but strongly encouraged. If a string value
is passed, logic assumes that is the name of a column to be used as
labels. Alternatively, a list of strings may be passed separately.
Label column can be included in the `cols_to_drop` parameter, labels
are extracted before the ignored columns are dropped.
header: int (Optional) (Default: 0)
Parameter to determine the header of the incoming dataframe.
index_col: bool (Optional) (Default: False)
Parameter to determine whether to include an index col.
Returns
None
"""
raw_data = pd.read_csv(data_source, header=header, index_col=index_col)
if isinstance(labels, str):
#If the labels param is a string used to indicate a column target of the df
self.ExperimentValues.true_labels = raw_data[labels].to_numpy()
else:
self.ExperimentValues.true_labels = np.array(labels)
raw_data = raw_data.drop(cols_to_ignore, axis=1)
normalised_data = self.normalise_data(raw_data)
self.shuffle_normalised_data(normalised_data)