Module cognet.dataFormatter

Expand source code
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from cognet.util import assert_None, assert_array_dimension
class dataFormatter:
    """Load samples from a CSV file, split them into train/test sets and
    format each split as feature names plus a string matrix for Qnet.
    """

    def __init__(self,
                 samples,
                 test_size,
                 train_size=None,
                 random_state=None):
        """Read the samples and split them into train and test sets.

        Args:
            samples: CSV path or file-like object passed to ``pd.read_csv``.
            test_size: forwarded to ``train_test_split`` as ``test_size``.
            train_size (optional): forwarded to ``train_test_split`` as
                ``train_size``. Defaults to None.
            random_state (optional): forwarded to ``train_test_split`` as
                ``random_state``. Defaults to None.
        """
        self.samples = pd.read_csv(samples)
        self.test_size = test_size
        self.train_size = train_size
        # Store the seed that is actually used for the split; it used to be
        # hard-coded to None regardless of the argument.
        self.random_state = random_state
        self.train_data, self.test_data = train_test_split(
            self.samples,
            test_size=test_size,
            train_size=train_size,
            random_state=random_state)
        # Kept feature names per split, filled in by train() / test().
        self.features = {}
        # Boolean mask of all-missing columns from the latest formatting call.
        self.nan_cols = []
        # Both are set by mutable_variables().
        self.immutable_vars = None
        self.mutable_vars = None

    def __Qnet_formatter(self,
                         key,
                         samples):
        """Convert a DataFrame into feature names and a string matrix.

        Missing cells become empty strings, and columns whose cells are all
        empty are dropped from both the feature names and the matrix.

        Args:
            key (str): name under which the kept feature names are stored
                in ``self.features``.
            samples (pd.DataFrame): data to format.

        Returns:
            tuple: ``(features, samples)`` where ``features`` is the list of
                kept column names and ``samples`` is a 2-D numpy array of
                strings with one column per kept feature.
        """
        features = np.array(samples.columns.astype(str))
        missing = samples.isna().values
        values = samples.values.astype(str)
        # astype(str) renders NaN as the literal 'nan', which never equals
        # ''; blank those cells so all-missing columns are really detected.
        values[missing] = ''

        all_missing = np.all(values == '', axis=0)
        self.nan_cols = all_missing
        keep = ~all_missing

        values = values[:, keep]
        features = list(features[keep])
        self.features[key] = features
        return features, values

    def train(self):
        """Return ``(features, samples)`` for the train split."""
        return self.__Qnet_formatter('train', self.train_data)

    def test(self):
        """Return ``(features, samples)`` for the test split."""
        return self.__Qnet_formatter('test', self.test_data)

    def __set_varcase(self,
                      vars,
                      lower):
        """Convert the train feature names and ``vars`` to a common case.

        Args:
            vars (iterable): variable names to normalise.
            lower (bool): lower-case the names when true, upper-case them
                otherwise.

        Returns:
            tuple: ``(features, vars)``, two lists of case-normalised strings.
        """
        if 'train' not in self.features:
            # Feature names are only recorded once the train split has been
            # formatted; do it on demand instead of failing with a KeyError.
            self.train()
        convert = str.lower if lower else str.upper
        # str() lets non-string labels (e.g. numeric names) be normalised too.
        features = [convert(str(x)) for x in self.features['train']]
        vars = [convert(str(x)) for x in vars]
        return features, vars

    def __interpretvars_fromfile(self,
                                 lower,
                                 IMMUTABLE,
                                 FILE=None,
                                 LIST=None):
        """Split the train features into mutable and immutable variables.

        Requested names that are not train features are reported on stdout
        and left out of the result.

        Args:
            lower (bool): compare names in lower case when true, in upper
                case otherwise.
            IMMUTABLE (bool): when true the supplied names are the immutable
                variables, otherwise they are the mutable ones.
            FILE (optional): CSV path or file-like object; the names are the
                values of its first column (read as the index). Takes
                precedence over ``LIST`` when given.
            LIST (optional): iterable of variable names.

        Returns:
            tuple: ``(mutable_vars, immutable_vars)``, two lists of
                case-normalised names.
        """
        requested = LIST
        if FILE is not None:
            requested = list(pd.read_csv(FILE, index_col=0).index)
        features, requested = self.__set_varcase(requested, lower)

        known = set(features)
        asked = set(requested)
        # Collect unknown names BEFORE filtering; computing this afterwards
        # always produced an empty list, so nothing was ever reported.
        invalid_vars = [x for x in requested if x not in known]
        selected = [x for x in requested if x in known]
        remaining = [x for x in features if x not in asked]

        if len(invalid_vars) != 0:
            print("{} vars not found".format(len(invalid_vars)))
            print("vars not found:{}".format(invalid_vars))

        if IMMUTABLE:
            mutable_vars, immutable_vars = remaining, selected
        else:
            mutable_vars, immutable_vars = selected, remaining
        return mutable_vars, immutable_vars

    def mutable_variables(self,
                immutable_list=None,
                IMMUTABLE_FILE=None,
                mutable_list=None,
                MUTABLE_FILE=None,
                lower=False):
        """Set and return the mutable and immutable variables.

        Exactly one of the four inputs is expected. The result is also
        stored in ``self.mutable_vars`` and ``self.immutable_vars``.

        Args:
            immutable_list (optional): names of the immutable variables.
                Defaults to None.
            IMMUTABLE_FILE (optional): CSV whose first column lists the
                immutable variables. Defaults to None.
            mutable_list (optional): names of the mutable variables.
                Defaults to None.
            MUTABLE_FILE (optional): CSV whose first column lists the
                mutable variables. Defaults to None.
            lower (bool, optional): compare names in lower case when true,
                in upper case otherwise. Defaults to False.

        Returns:
            tuple: ``(mutable_vars, immutable_vars)``.

        Raises:
            ValueError: if the inputs fail validation (both lists or both
                files given, no input, or more than one input).
        """
        # NOTE(review): assert_None(..., raise_error=False) appears to return
        # the number of None entries - confirm against cognet.util.
        list_None = assert_None([immutable_list, mutable_list], raise_error=False)
        file_None = assert_None([IMMUTABLE_FILE, MUTABLE_FILE], raise_error=False)
        num_None = assert_None([immutable_list, mutable_list,
                                IMMUTABLE_FILE, MUTABLE_FILE], raise_error=False)
        if list_None == 0 or file_None == 0:
            raise ValueError("Only input either IMMUTABLE or MUTABLE vars, not both!")
        if num_None == 4:
            raise ValueError("Too few inputs! One argument needed")
        if num_None != 3:
            raise ValueError("Too many inputs! Only one argument needed")

        if IMMUTABLE_FILE is not None:
            mutable_vars, immutable_vars = self.__interpretvars_fromfile(
                IMMUTABLE=True, FILE=IMMUTABLE_FILE, lower=lower)
        elif MUTABLE_FILE is not None:
            mutable_vars, immutable_vars = self.__interpretvars_fromfile(
                IMMUTABLE=False, FILE=MUTABLE_FILE, lower=lower)
        elif immutable_list is not None:
            mutable_vars, immutable_vars = self.__interpretvars_fromfile(
                IMMUTABLE=True, LIST=immutable_list, lower=lower)
        elif mutable_list is not None:
            mutable_vars, immutable_vars = self.__interpretvars_fromfile(
                IMMUTABLE=False, LIST=mutable_list, lower=lower)
        self.mutable_vars, self.immutable_vars = mutable_vars, immutable_vars
        return mutable_vars, immutable_vars

Classes

class dataFormatter (samples, test_size, train_size=None, random_state=None)

Aggregate related Qnet functions

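Read the samples from a CSV file and split them into train and test sets.

Args

samples : path or file-like object
CSV source passed to pandas.read_csv.
test_size : float, int or None
Forwarded to sklearn's train_test_split as test_size.
train_size : float, int or None, optional
Forwarded to sklearn's train_test_split as train_size. Defaults to None.
random_state : int or None, optional
Forwarded to sklearn's train_test_split as random_state. Defaults to None.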
Expand source code
class dataFormatter:
    """Load samples from a CSV file, split them into train/test sets and
    format each split as feature names plus a string matrix for Qnet.
    """

    def __init__(self,
                 samples,
                 test_size,
                 train_size=None,
                 random_state=None):
        """Read the samples and split them into train and test sets.

        Args:
            samples: CSV path or file-like object passed to ``pd.read_csv``.
            test_size: forwarded to ``train_test_split`` as ``test_size``.
            train_size (optional): forwarded to ``train_test_split`` as
                ``train_size``. Defaults to None.
            random_state (optional): forwarded to ``train_test_split`` as
                ``random_state``. Defaults to None.
        """
        self.samples = pd.read_csv(samples)
        self.test_size = test_size
        self.train_size = train_size
        # Store the seed that is actually used for the split; it used to be
        # hard-coded to None regardless of the argument.
        self.random_state = random_state
        self.train_data, self.test_data = train_test_split(
            self.samples,
            test_size=test_size,
            train_size=train_size,
            random_state=random_state)
        # Kept feature names per split, filled in by train() / test().
        self.features = {}
        # Boolean mask of all-missing columns from the latest formatting call.
        self.nan_cols = []
        # Both are set by mutable_variables().
        self.immutable_vars = None
        self.mutable_vars = None

    def __Qnet_formatter(self,
                         key,
                         samples):
        """Convert a DataFrame into feature names and a string matrix.

        Missing cells become empty strings, and columns whose cells are all
        empty are dropped from both the feature names and the matrix.

        Args:
            key (str): name under which the kept feature names are stored
                in ``self.features``.
            samples (pd.DataFrame): data to format.

        Returns:
            tuple: ``(features, samples)`` where ``features`` is the list of
                kept column names and ``samples`` is a 2-D numpy array of
                strings with one column per kept feature.
        """
        features = np.array(samples.columns.astype(str))
        missing = samples.isna().values
        values = samples.values.astype(str)
        # astype(str) renders NaN as the literal 'nan', which never equals
        # ''; blank those cells so all-missing columns are really detected.
        values[missing] = ''

        all_missing = np.all(values == '', axis=0)
        self.nan_cols = all_missing
        keep = ~all_missing

        values = values[:, keep]
        features = list(features[keep])
        self.features[key] = features
        return features, values

    def train(self):
        """Return ``(features, samples)`` for the train split."""
        return self.__Qnet_formatter('train', self.train_data)

    def test(self):
        """Return ``(features, samples)`` for the test split."""
        return self.__Qnet_formatter('test', self.test_data)

    def __set_varcase(self,
                      vars,
                      lower):
        """Convert the train feature names and ``vars`` to a common case.

        Args:
            vars (iterable): variable names to normalise.
            lower (bool): lower-case the names when true, upper-case them
                otherwise.

        Returns:
            tuple: ``(features, vars)``, two lists of case-normalised strings.
        """
        if 'train' not in self.features:
            # Feature names are only recorded once the train split has been
            # formatted; do it on demand instead of failing with a KeyError.
            self.train()
        convert = str.lower if lower else str.upper
        # str() lets non-string labels (e.g. numeric names) be normalised too.
        features = [convert(str(x)) for x in self.features['train']]
        vars = [convert(str(x)) for x in vars]
        return features, vars

    def __interpretvars_fromfile(self,
                                 lower,
                                 IMMUTABLE,
                                 FILE=None,
                                 LIST=None):
        """Split the train features into mutable and immutable variables.

        Requested names that are not train features are reported on stdout
        and left out of the result.

        Args:
            lower (bool): compare names in lower case when true, in upper
                case otherwise.
            IMMUTABLE (bool): when true the supplied names are the immutable
                variables, otherwise they are the mutable ones.
            FILE (optional): CSV path or file-like object; the names are the
                values of its first column (read as the index). Takes
                precedence over ``LIST`` when given.
            LIST (optional): iterable of variable names.

        Returns:
            tuple: ``(mutable_vars, immutable_vars)``, two lists of
                case-normalised names.
        """
        requested = LIST
        if FILE is not None:
            requested = list(pd.read_csv(FILE, index_col=0).index)
        features, requested = self.__set_varcase(requested, lower)

        known = set(features)
        asked = set(requested)
        # Collect unknown names BEFORE filtering; computing this afterwards
        # always produced an empty list, so nothing was ever reported.
        invalid_vars = [x for x in requested if x not in known]
        selected = [x for x in requested if x in known]
        remaining = [x for x in features if x not in asked]

        if len(invalid_vars) != 0:
            print("{} vars not found".format(len(invalid_vars)))
            print("vars not found:{}".format(invalid_vars))

        if IMMUTABLE:
            mutable_vars, immutable_vars = remaining, selected
        else:
            mutable_vars, immutable_vars = selected, remaining
        return mutable_vars, immutable_vars

    def mutable_variables(self,
                immutable_list=None,
                IMMUTABLE_FILE=None,
                mutable_list=None,
                MUTABLE_FILE=None,
                lower=False):
        """Set and return the mutable and immutable variables.

        Exactly one of the four inputs is expected. The result is also
        stored in ``self.mutable_vars`` and ``self.immutable_vars``.

        Args:
            immutable_list (optional): names of the immutable variables.
                Defaults to None.
            IMMUTABLE_FILE (optional): CSV whose first column lists the
                immutable variables. Defaults to None.
            mutable_list (optional): names of the mutable variables.
                Defaults to None.
            MUTABLE_FILE (optional): CSV whose first column lists the
                mutable variables. Defaults to None.
            lower (bool, optional): compare names in lower case when true,
                in upper case otherwise. Defaults to False.

        Returns:
            tuple: ``(mutable_vars, immutable_vars)``.

        Raises:
            ValueError: if the inputs fail validation (both lists or both
                files given, no input, or more than one input).
        """
        # NOTE(review): assert_None(..., raise_error=False) appears to return
        # the number of None entries - confirm against cognet.util.
        list_None = assert_None([immutable_list, mutable_list], raise_error=False)
        file_None = assert_None([IMMUTABLE_FILE, MUTABLE_FILE], raise_error=False)
        num_None = assert_None([immutable_list, mutable_list,
                                IMMUTABLE_FILE, MUTABLE_FILE], raise_error=False)
        if list_None == 0 or file_None == 0:
            raise ValueError("Only input either IMMUTABLE or MUTABLE vars, not both!")
        if num_None == 4:
            raise ValueError("Too few inputs! One argument needed")
        if num_None != 3:
            raise ValueError("Too many inputs! Only one argument needed")

        if IMMUTABLE_FILE is not None:
            mutable_vars, immutable_vars = self.__interpretvars_fromfile(
                IMMUTABLE=True, FILE=IMMUTABLE_FILE, lower=lower)
        elif MUTABLE_FILE is not None:
            mutable_vars, immutable_vars = self.__interpretvars_fromfile(
                IMMUTABLE=False, FILE=MUTABLE_FILE, lower=lower)
        elif immutable_list is not None:
            mutable_vars, immutable_vars = self.__interpretvars_fromfile(
                IMMUTABLE=True, LIST=immutable_list, lower=lower)
        elif mutable_list is not None:
            mutable_vars, immutable_vars = self.__interpretvars_fromfile(
                IMMUTABLE=False, LIST=mutable_list, lower=lower)
        self.mutable_vars, self.immutable_vars = mutable_vars, immutable_vars
        return mutable_vars, immutable_vars

Methods

def mutable_variables(self, immutable_list=None, IMMUTABLE_FILE=None, mutable_list=None, MUTABLE_FILE=None, lower=False)

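Set and return the mutable and immutable variables from exactly one of the four inputs. The result is also stored in self.mutable_vars and self.immutable_vars.

Args

immutable_list : list, optional
Names of the immutable variables. Defaults to None.
IMMUTABLE_FILE : str, optional
CSV file whose first column lists the immutable variables. Defaults to None.
mutable_list : list, optional
Names of the mutable variables. Defaults to None.
MUTABLE_FILE : str, optional
CSV file whose first column lists the mutable variables. Defaults to None.
lower : bool, optional
Compare names in lower case when true, in upper case otherwise. Defaults to False.

Raises

ValueError
If both lists or both files are given ("Only input either IMMUTABLE or MUTABLE vars, not both!").
ValueError
If no input is given ("Too few inputs! One argument needed") or more than one input is given ("Too many inputs! Only one argument needed").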
Expand source code
def mutable_variables(self,
            immutable_list=None,
            IMMUTABLE_FILE=None,
            mutable_list=None,
            MUTABLE_FILE=None,
            lower=False):
    """Set and return the mutable and immutable variables.

    The inputs are validated with ``assert_None`` and the first one that is
    not None (files before lists, immutable before mutable) is interpreted.
    The result is stored on ``self.mutable_vars`` / ``self.immutable_vars``.

    Args:
        immutable_list (optional): names of the immutable variables.
            Defaults to None.
        IMMUTABLE_FILE (optional): file listing the immutable variables.
            Defaults to None.
        mutable_list (optional): names of the mutable variables.
            Defaults to None.
        MUTABLE_FILE (optional): file listing the mutable variables.
            Defaults to None.
        lower (bool, optional): forwarded to the interpreter helper to pick
            the case used for name comparison. Defaults to False.

    Returns:
        tuple: ``(mutable_vars, immutable_vars)``.

    Raises:
        ValueError: when the ``assert_None`` counts fail the checks below.
    """
    # NOTE: the arguments could be made to accept any type, with extra
    # parameters stating whether it is a list or FILE, immutable or mutable.
    list_inputs = [immutable_list, mutable_list]
    file_inputs = [IMMUTABLE_FILE, MUTABLE_FILE]
    # NOTE(review): assert_None(..., raise_error=False) presumably counts the
    # None entries - confirm against cognet.util.
    list_None = assert_None(list_inputs, raise_error=False)
    file_None = assert_None(file_inputs, raise_error=False)
    num_None = assert_None(list_inputs + file_inputs, raise_error=False)

    # Guard clauses: every failed check raises, so no else-branch is needed.
    if list_None == 0 or file_None == 0:
        raise ValueError("Only input either IMMUTABLE or MUTABLE vars, not both!")
    if num_None == 4:
        raise ValueError("Too few inputs! One argument needed")
    if num_None != 3:
        raise ValueError("Too many inputs! Only one argument needed")

    # (value, IMMUTABLE flag, keyword carrying the value), in priority order.
    candidates = (
        (IMMUTABLE_FILE, True, "FILE"),
        (MUTABLE_FILE, False, "FILE"),
        (immutable_list, True, "LIST"),
        (mutable_list, False, "LIST"),
    )
    for value, is_immutable, keyword in candidates:
        if value is not None:
            mutable_vars, immutable_vars = self.__interpretvars_fromfile(
                IMMUTABLE=is_immutable, lower=lower, **{keyword: value})
            break

    self.mutable_vars = mutable_vars
    self.immutable_vars = immutable_vars
    return mutable_vars, immutable_vars
def test(self)

Return the feature names and formatted samples of the test data.

Expand source code
def test(self):
    """Format the test split and return the formatter's result.

    Delegates to ``__Qnet_formatter`` with the key ``'test'`` and
    ``self.test_data``.
    """
    formatted = self.__Qnet_formatter('test', self.test_data)
    return formatted
def train(self)

Return the feature names and formatted samples of the train data.

Expand source code
def train(self):
    """Format the train split and return the formatter's result.

    Delegates to ``__Qnet_formatter`` with the key ``'train'`` and
    ``self.train_data``.
    """
    formatted = self.__Qnet_formatter('train', self.train_data)
    return formatted