# Source code for akerbp.mlpet.transformer

from typing import Any, Dict

import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

from akerbp.mlpet import Dataset
from akerbp.mlpet.utilities import feature_target_split


class FeatureSelector(BaseEstimator, TransformerMixin):
    """
    Transformer that indexes an incoming dataframe with a fixed column
    selection.

    Args:
        columns: the key used to index the dataframe in ``transform``
            (``X[columns]``). Stored unmodified on the instance.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X: pd.DataFrame, y: pd.DataFrame = None) -> "FeatureSelector":
        """
        No-op fit; nothing is learned from the data.

        Args:
            X (pd.DataFrame): feature set (unused)
            y (pd.DataFrame - optional): label set (unused). Defaults to None.

        Returns:
            FeatureSelector: this same instance
        """
        return self

    def transform(self, X: pd.DataFrame, y: pd.DataFrame = None) -> pd.DataFrame:
        """
        Return the part of X selected by ``self.columns``.

        Args:
            X (pd.DataFrame): dataframe to select from
            y (pd.DataFrame - optional): unused. Defaults to None.

        Returns:
            pd.DataFrame: the result of ``X[self.columns]``
        """
        selected = X[self.columns]
        return selected


class MLPetTransformer(BaseEstimator, TransformerMixin):
    """
    Transformer that delegates preprocessing to the ``preprocess`` method of
    a connected Dataset, so it can be used through the sklearn
    fit/transform API.

    Args:
        ds (Dataset): the dataset object whose ``preprocess`` method is called
        train_kwargs (dict - optional): keyword arguments forwarded to
            ``ds.preprocess`` by ``fit_transform``. Defaults to None.
        test_kwargs (dict - optional): keyword arguments forwarded to
            ``ds.preprocess`` by ``transform``. Defaults to None.
        verbose (bool - optional): forwarded to ``ds.preprocess`` as
            ``verbose``. Defaults to False.
    """

    def __init__(
        self,
        ds: Dataset,
        train_kwargs: Dict[str, Dict[str, Any]] = None,
        test_kwargs: Dict[str, Dict[str, Any]] = None,
        verbose=False,
    ):
        self.ds = ds
        self.train_kwargs = train_kwargs
        self.test_kwargs = test_kwargs
        self.verbose = verbose

    def _run_preprocessing(
        self, df: pd.DataFrame, kwargs: Dict[str, Dict[str, Any]] = None
    ) -> pd.DataFrame:
        """
        Run ``ds.preprocess`` on df, using explicit kwargs when given and
        otherwise relying on the dataset's own ``preprocessing_pipeline``.

        Args:
            df (pd.DataFrame): the data to preprocess
            kwargs (dict - optional): keyword arguments for ``ds.preprocess``.
                Defaults to None.

        Returns:
            pd.DataFrame: whatever ``ds.preprocess`` returns

        Raises:
            ValueError: if no kwargs were given and the dataset has no
                ``preprocessing_pipeline`` attribute
        """
        if kwargs is not None:
            return self.ds.preprocess(df, verbose=self.verbose, **kwargs)
        if hasattr(self.ds, "preprocessing_pipeline"):
            return self.ds.preprocess(df, verbose=self.verbose)
        # The exception must actually be raised; otherwise the caller would
        # silently receive data that was never preprocessed.
        raise ValueError("No preprocessing kwargs were provided to the transformer!")

    def fit(self, X: pd.DataFrame, y: pd.DataFrame = None) -> "MLPetTransformer":
        """
        Purely an implementational function to adhere to the sklearn API. See
        docstring for fit_transform.

        Args:
            X (pd.DataFrame): feature set (unused)
            y (pd.DataFrame - optional): label set (unused). Defaults to None.

        Returns:
            MLPetTransformer: this same instance
        """
        return self

    def fit_transform(
        self, X: pd.DataFrame = None, y: pd.DataFrame = None
    ) -> pd.DataFrame:
        """
        Performs the requested train preprocessing pipeline either via the
        kwargs passed at class instantiation or via the pipeline defined in
        the class connected Dataset's settings file.

        Args:
            X (pd.DataFrame - optional): feature set to be preprocessed.
                Defaults to None. If X=None, the transformer attempts to
                retrieve X from the df_original saved to the dataset class
            y (pd.DataFrame - optional): Preprocessing of the label column is
                **NOT** supported. This is by default set to None and is
                ignored.

        Returns:
            pd.DataFrame: Preprocessed feature set. If the dataset's label
                column is present in the preprocessed data it is split off
                and not returned.

        Raises:
            ValueError: if no dataframe is available (X is None and the
                dataset has no ``df_original``), or if neither train_kwargs
                nor a dataset ``preprocessing_pipeline`` is available
        """
        # Resolve the dataframe to preprocess
        if X is not None:
            df = X
        elif hasattr(self.ds, "df_original"):
            # Work on a copy so the dataset's stored original is not handed
            # to the preprocessing step directly
            df = self.ds.df_original.copy()
        else:
            raise ValueError("No dataframe was provided to the transformer!")

        # Perform preprocessing
        df = self._run_preprocessing(df, self.train_kwargs)

        # Retrieve X (drop the label column when it is present)
        if self.ds.label_column in df:
            X, _ = feature_target_split(df, self.ds.label_column)
        else:
            X = df
        return X

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Perform the requested test preprocessing pipeline either via the
        kwargs passed at class instantiation or via the pipeline defined in
        the class connected Dataset's settings file.

        Args:
            X (pd.DataFrame): The test set to be preprocessed

        Returns:
            pd.DataFrame: The preprocessed test set

        Raises:
            ValueError: if neither test_kwargs nor a dataset
                ``preprocessing_pipeline`` is available
        """
        return self._run_preprocessing(X, self.test_kwargs)