Source code for akerbp.mlpet.dataset

import os
import warnings
from copy import deepcopy
from pathlib import Path
from typing import Any, Dict, List, Optional, Set, Union

import numpy as np
import pandas as pd
import yaml
from pandas.core.frame import DataFrame
from tqdm.auto import tqdm

from akerbp.mlpet import feature_engineering, imputers, preprocessors, utilities
from akerbp.mlpet.dataloader import DataLoader


class Dataset(DataLoader):
    """
    The main class representing a dataset

    Note:
        **All settings on the first level of the settings dictionary/YAML
        passed to the class instance are set as class attributes**

    Warning:
        **ALL** filepaths (regardless of whether they are passed directly to
        the class at instantiation or in the settings.yaml file) **MUST** be
        specified in absolute form!

    Note:
        The id_column is always considered a categorical variable!

    Args:
        mappings: dict or path to a yaml file. If a path is provided it must
            be provided as an absolute path
        settings: dict or path to a yaml file. If a path is provided it must
            be provided as an absolute path. The possible keys for the settings:

            - id_column (required): name of the id column, e.g. well_name
            - depth_column (optional): name of the measured depth column, e.g. "DEPTH"
            - label_column (optional): name of the column containing the labels
            - num_filler (optional - default 0): filler value for numerical
              curves (an existing or desired value for replacing missing values)
            - cat_filler (optional - default 'MISSING'): filler value for
              categorical curves (an existing or desired value for replacing
              missing values)
            - categorical_curves (optional - default [id_column]): the curves
              to be considered as categorical when identifying which columns
              are numerical (this setting is used in several places throughout
              the library and can be useful to have defined in advance)
            - keep_columns (optional - default []): if you would like to keep
              some of the columns passed in your dataframe that will not be
              part of the preprocessing_pipeline you define but should still
              be part of the preprocessed dataframe, this setting enables that.
            - preprocessing_pipeline (optional - default None): the list of
              preprocessing functions to be run when the class's preprocess
              function is called. If this is not provided, the pipeline
              **MUST** be provided in the preprocess call. Each key in the
              preprocessing_pipeline can have the relevant kwargs for that
              particular preprocessor as its value. All passed kwargs are
              parsed and saved to the class instance where relevant for use as
              defaults in the preprocessing functions
        folder_path: The path to where preprocessing artifacts are stored/shall
            be saved to. Similar to the other two arguments, this path must be
            provided as an absolute path.
    """

    # Setting type annotations for class attributes that can be set when an
    # instance of the Dataset class is created
    settings: Dict[str, Any]
    settings_path: str
    all_curves: Set[str]
    id_column: str
    label_column: str
    num_filler: float
    cat_filler: str
    mappings: Dict[str, Any]
    categorical_curves: List[str]
    petrophysical_features: List[str]
    keep_columns: List[str]
    preprocessing_pipeline: Dict[str, Dict[str, Any]]

    def __set_defaults(self) -> None:
        """
        Set necessary defaults for proper class use
        """
        if not hasattr(self, "num_filler"):
            self.num_filler = 0
        if not hasattr(self, "cat_filler"):
            self.cat_filler = "MISSING"
        if not hasattr(self, "keep_columns"):
            self.keep_columns = []

    def __handle_paths(self, path: Union[Path, str]) -> Union[Path, str]:
        """
        A helper function to handle paths passed either directly to the class
        or via the settings file

        Args:
            path (Union[Path, str]): A filepath to be handled

        Raises:
            ValueError: Raised if the path provided is not absolute.

        Returns:
            Union[Path, str]: Returns the handled path.
        """
        if not os.path.isabs(path):
            raise ValueError(
                "All paths must be passed as absolute paths. This is done for "
                "consistency!\n"
                "(HINT: You can import os and simply wrap an "
                "os.path.abspath() call around your path.)"
            )
        return path

    def __ingest_pipeline(
        self, preprocessing_pipeline: Dict[str, Dict[str, Any]]
    ) -> None:
        """
        A helper function to ingest preprocessing pipelines

        Args:
            preprocessing_pipeline (Dict[str, Dict[str, Any]]): The
                preprocessing pipeline to ingest
        """
        for func_name, kwargs in preprocessing_pipeline.items():
            try:
                for setting_name, setting in kwargs.items():
                    local = getattr(self, setting_name, None)
                    if local is not None:
                        warnings.warn(
                            "This class instance already has a value set for "
                            f"{setting_name}. You are overwriting "
                            f"its value {local} with {setting}!"
                        )
                    setattr(self, setting_name, setting)
            except Exception as e:
                raise Exception(
                    f"Something is wrong in your specification for the {func_name} "
                    "function in your preprocessing_pipeline!"
                ) from e

    def __standardize_curves(self) -> None:
        """
        A helper function to standardize curve names.
        """
        # First need to compile a single list of all curves across all methods
        # MAKE SURE TO KEEP THIS LIST UPDATED!
        curve_sets = [
            "curves_to_scale",
            "curves_to_normalize",
            "curves_to_select",
            "curves_to_drop",
            "curves_to_impute",
            "columns_to_encode",
            "rolling_features",
            "gradient_features",
            "log_features",
            "sequential_features",
            "petrophysical_features",
            "noisy_curves",
            "outlier_curves",
            "numerical_curves",
            "categorical_curves",
            "keep_columns",
        ]
        all_curves = {}
        for curve_set in curve_sets:
            if hasattr(self, curve_set):
                all_curves[curve_set] = getattr(self, curve_set)

        # Standardize passed curves if mappings exist
        if hasattr(self, "curve_mappings"):
            for curve_set, curves in all_curves.items():
                setattr(self, f"{curve_set}_original", curves)
                new_names, _ = utilities.standardize_names(
                    names=curves, mapper=self.curve_mappings
                )
                setattr(self, curve_set, new_names)
                all_curves[curve_set] = new_names

        # Clean up all curves to be on one level and unique
        self.all_curves = set(np.concatenate(list(all_curves.values())))

        # Standardize single curves if mappings exist
        curves = {"id_column": self.id_column}
        if hasattr(self, "label_column"):
            curves["label_column"] = self.label_column
        if hasattr(self, "depth_column"):
            curves["depth_column"] = self.depth_column
        if hasattr(self, "curve_mappings"):
            for curve_label, curve_name in curves.items():
                new_name, _ = utilities.standardize_names(
                    [curve_name], mapper=self.curve_mappings
                )
                setattr(self, curve_label, new_name[0])
                setattr(self, f"{curve_name}_original", curve_name)
                curves[curve_label] = new_name[0]

        # Add all single curves to all_curves
        self.all_curves.update(list(curves.values()))

        # If a preprocessing pipeline exists, make sure to update it with all
        # the new curve names
        if hasattr(self, "preprocessing_pipeline"):
            for func_name, kwargs in self.preprocessing_pipeline.items():
                for setting_name, _ in kwargs.items():
                    # No default for getattr. At this point if the attribute
                    # doesn't exist an error should be raised
                    new_setting = getattr(self, setting_name)
                    self.preprocessing_pipeline[func_name][setting_name] = new_setting

    def __ingest_init_input(
        self, att_name: str, att_val: Union[str, Dict[str, Any], Path]
    ) -> None:
        if isinstance(att_val, dict):
            setattr(self, att_name, att_val)
        elif isinstance(att_val, str):
            att_val = self.__handle_paths(att_val)
            if os.path.isfile(att_val):
                att_path = f"{att_name}_path"
                setattr(self, att_path, att_val)
                with open(getattr(self, att_path)) as file:
                    setattr(self, att_name, yaml.load(file, Loader=yaml.SafeLoader))
            else:
                raise FileNotFoundError(
                    f"The provided filepath {att_val} is not a valid path!\n"
" f"The Dataset cannot be initialised without a {att_name}.yaml!" " Please refer to the classes' docstring to ensure you have" " specified your filepath in the correct form." ) def __init__( self, mappings: Union[str, Dict[str, str]], settings: Union[str, Dict[str, Any]], folder_path: Union[str, Path], ) -> None: # Define supported preprocessing functions self.supported_preprocessing_functions = { f.__name__: f for f in [ feature_engineering.add_log_features, feature_engineering.add_gradient_features, feature_engineering.add_rolling_features, feature_engineering.add_sequential_features, feature_engineering.add_formations_and_groups, feature_engineering.add_vertical_depths, feature_engineering.add_petrophysical_features, imputers.impute_depth_trend, preprocessors.set_as_nan, preprocessors.remove_outliers, preprocessors.remove_small_negative_values, preprocessors.fill_zloc_from_depth, preprocessors.fillna_with_fillers, preprocessors.encode_columns, preprocessors.select_columns, preprocessors.normalize_curves, preprocessors.scale_curves, preprocessors.process_wells, preprocessors.remove_noise, preprocessors.drop_columns, ] } # <--------------------- INGEST INIT INPUTS -------------------------> # self.__ingest_init_input(att_name="settings", att_val=settings) for key, val in self.settings.items(): setattr(self, key, val) self.__ingest_init_input(att_name="mappings", att_val=mappings) if "curve_mappings" in self.mappings: self.curve_mappings = self.mappings["curve_mappings"] if "formations_map" in self.mappings: self.formations_map = self.mappings["formations_map"] if "groups_map" in self.mappings: self.groups_map = self.mappings["groups_map"] # Ensure required settings were provided to prevent problems later down the line required = ["id_column"] for r in required: if not hasattr(self, r): raise AttributeError( f"{r} was not set in your settings file! This setting is " "required. Please refer to the docstring." ) self.folder_path = self.__handle_paths(folder_path) if not os.path.isdir(self.folder_path): os.makedirs(self.folder_path) # Ingest the preprocessing kwargs if a preprocessing_pipeline was passed if hasattr(self, "preprocessing_pipeline"): self.__ingest_pipeline(self.preprocessing_pipeline) # Ensure all functions are supported for func_name in self.preprocessing_pipeline: if func_name not in self.supported_preprocessing_functions: raise ValueError( f"The function {func_name} is not a supported " "preprocessing function. All function specifications " "passed in the preprocessing_pipeline must be a subset " "of the supported preprocessing functions: " f"{list(self.supported_preprocessing_functions)}" ) # Fill missing gaps for parameters that are required for proper operation # of this class self.__set_defaults() # <------------------ PERFORM INPUT CHECKS---------------------------> # # Standardize curve names and create all_curves attribute, update settings with new curve names self.__standardize_curves() # Check that categorical curves includes the id_column (to prevent # unnesscary warnings later on) if hasattr(self, "categorical_curves"): self.categorical_curves = list( set(self.categorical_curves + [self.id_column]) ) else: self.categorical_curves = [self.id_column]
    def preprocess(
        self, df: Optional[DataFrame] = None, verbose: bool = True, **kwargs
    ) -> DataFrame:
        """
        Main preprocessing function. Pass the dataframe to be preprocessed
        along with any kwargs for running any desired order (within reason) of
        the various supported preprocessing functions. To see which functions
        are supported for preprocessing, you can access the class attribute
        'supported_preprocessing_functions'. To see what the default settings
        are for all the supported preprocessing functions, run the class's
        'get_preprocess_defaults' method without any arguments. To see what
        kwargs are being used for the default workflow, run the class's
        'get_preprocess_defaults' with the class attribute
        'default_preprocessing_workflow' as the main arg.

        Warning:
            The preprocess function will run through the provided kwargs in
            the order provided by the kwargs dictionary. In Python 3.7+,
            dictionaries are insertion ordered and it is this implementation
            detail this function builds upon. As such, do not use any Python
            version below 3.7, or ensure you pass an OrderedDict instance as
            your kwargs, to have complete control over the order in which the
            preprocessing functions are run!

        Args:
            df (pd.DataFrame, optional): dataframe to which to apply the
                preprocessing. If none is provided, the class's original df is
                used if it exists.
            verbose (bool, optional): Whether to display some logs on the
                progression of the preprocessing pipeline being run.
                Defaults to True.

        Keyword Args:
            See above in the docstring on all potential kwargs and their
            relevant structures.

        Returns:
            pd.DataFrame: preprocessed dataframe
        """
        # <---------------- Perform admin/prep work -------------------------> #
        # If no dataframe is provided, use the class df_original
        if df is None:
            if hasattr(self, "df_original"):
                df = self.df_original
                if df.empty:
                    raise ValueError(
                        "The class connected pd.DataFrame ('df_original') has "
                        "no data so there is nothing to preprocess!"
                    )
            else:
                raise ValueError(
                    "This Dataset class instance does not have a pd.DataFrame "
                    "attached to it so there is no data to preprocess!"
                )

        # Ingest the kwargs into the class instance; if the pipeline was defined
        # in the settings file it will have already been ingested when the class
        # was instantiated, so no need to do it here
        if kwargs:
            self.__ingest_pipeline(kwargs)

        # Standardize settings curve names and create the all_curves attribute
        self.__standardize_curves()

        # Map curve names in the provided dataframe
        df = utilities.standardize_curve_names(df=df, mapper=self.curve_mappings)

        # Keep track of original column names
        original_columns = set(df.columns)

        # Validate the data once the kwargs have been ingested and standardized,
        # and the columns of the provided df have been standardized
        df = self.__validate_data(df)

        # Retain only the curves required for preprocessing - the all_curves
        # attribute will have been defined by this point, either at instantiation
        # or from the call above to standardize_curves
        diff = original_columns - self.all_curves
        if diff:
            warnings.warn(
                "The following columns were passed in the preprocessing "
                "dataframe but are not used in any of the functions defined in "
                "the preprocessing pipeline. As such they will be "
                f"dropped! {list(diff)}"
            )
            df = df.drop(columns=diff)

        # Define kwargs to be used in preprocess method calls
        if not kwargs:
            # User did not provide any kwargs, so check whether they were
            # provided at instantiation via the settings file.
            # Taking a deepcopy because we don't want to mutate the original
            # pipeline with general defaults in case it is to be used again later
            if self.preprocessing_pipeline is not None:
                kwargs = deepcopy(self.preprocessing_pipeline)
            else:
                raise ValueError(
                    "No preprocessing kwargs were passed (either at runtime or "
                    "via the settings file at instantiation). There's nothing "
                    "to preprocess!"
                )

        # Fill in the blanks where necessary
        kwargs = self.get_preprocess_defaults(kwargs)

        # <---------------- Perform preprocessing pipeline ------------------> #
        pbar = tqdm(
            kwargs.items(), desc="Preprocessing", disable=(not verbose), unit="function"
        )
        artifacts = {}
        for function, settings in pbar:
            if verbose:
                tqdm.write(f"Running {function}")
            try:
                res = self.supported_preprocessing_functions[function](df, **settings)
            except Exception as e:
                raise Exception(
                    f"Running {function} failed! Please see the traceback to "
                    "understand what could have caused the issue:"
                ) from e
            if isinstance(res, tuple):
                # There are artifacts to be saved back to the class. Save them
                df, artifact = res
                # Artifacts must be passed back in dict form where the key is
                # the name the artifact should be saved to this class as
                # and the value is the artifact itself
                if isinstance(artifact, dict):
                    # Safe to proceed with saving to the class
                    for k, v in artifact.items():
                        setattr(self, k, v)
                    artifacts.update(artifact)
                else:
                    raise ValueError(
                        "A preprocessing function that doesn't return only a "
                        "pd.DataFrame MUST return a tuple where the first item "
                        "is the manipulated pd.DataFrame and the second item is "
                        "a dict of artifacts to be saved back to the class "
                        "instance. The dictionary's keys should be the "
                        "attribute name under which the artifact shall be saved "
                        "and the values should be the artifacts themselves."
                    )
            elif isinstance(res, pd.DataFrame):
                df = res
            else:
                raise ValueError(
                    f"The preprocessing function {function} returned an illegal "
                    "return type!"
                )

        # Perform admin work on detecting features created and removed and
        # artifacts created
        self.features_added = list(
            set([x for x in df.columns if x not in original_columns])
        )
        self.original_columns_removed = list(
            set([x for x in original_columns if x not in df.columns])
        )
        if artifacts:
            self.artifacts = artifacts

        return df
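    # An illustrative call sketch (not part of the module): because the kwargs
    # dictionary drives the execution order, a runtime override of the pipeline
    # could look like the following, with the preprocessors running top to
    # bottom. The curve names and kwarg values used here are hypothetical.
    #
    #   processed = dataset.preprocess(
    #       df,
    #       set_as_nan={"numerical_value": -9999.0, "numerical_curves": ["GR"]},
    #       fillna_with_fillers={},
    #       scale_curves={"curves_to_scale": ["GR", "RHOB"]},
    #   )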
    def get_preprocess_defaults(
        self, kwargs: Optional[Dict[str, Dict[str, Any]]] = None
    ) -> Dict[str, Any]:
        """
        Wrapper function to define and provide the default kwargs to use for
        preprocessing. This function allows the user to only tweak certain
        function kwargs rather than having to define a setting for every single
        function kwarg.

        If a kwargs dictionary is passed to the function, only the defaults for
        the provided function names found in the kwargs will be returned. In
        other words, to generate a full default kwargs example, run this method
        without any arguments.

        Args:
            kwargs (Dict[str, Any], optional): Any user defined kwargs that
                should override the defaults. Defaults to None.

        Returns:
            Dict[str, Any]: A populated kwargs dictionary to be passed to all
                supported methods in preprocessing.
        """
        # Define per method defaults
        defaults: Dict[str, Dict[str, Any]] = {
            "add_log_features": {"log_features": getattr(self, "log_features", None)},
            "add_gradient_features": {
                "gradient_features": getattr(self, "gradient_features", None)
            },
            "add_rolling_features": {
                "rolling_features": getattr(self, "rolling_features", None),
                "window": getattr(self, "window", None),
            },
            "add_sequential_features": {
                "sequential_features": getattr(self, "sequential_features", None),
                "shift_size": getattr(self, "shift_size", 5),
            },
            "add_formations_and_groups": {
                "id_column": self.id_column,
                "depth_column": getattr(self, "depth_column", None),
            },
            "add_vertical_depths": {
                "id_column": self.id_column,
                "md_column": getattr(self, "depth_column", None),
            },
            "add_petrophysical_features": {
                "petrophysical_features": getattr(self, "petrophysical_features", None),
                "id_column": self.id_column,
            },
            "simple_impute": {
                "categorical_curves": getattr(self, "categorical_curves", None),
                "depth_column": getattr(self, "depth_column", None),
            },
            "iterative_impute": {
                "imputer": getattr(self, "imputer", None),
            },
            "impute_depth_trend": {
                "curves_to_impute": getattr(self, "curves_to_impute", None),
                "imputation_models": getattr(self, "imputation_models", None),
                "save_imputation_models": getattr(
                    self, "save_imputation_models", False
                ),
                "allow_individual_models": getattr(
                    self, "allow_individual_models", True
                ),
                "folder_path": self.folder_path,
                "curves_mapping": getattr(self, "curve_mappings", None),
            },
            "set_as_nan": {
                "categorical_value": getattr(self, "categorical_value", None),
                "categorical_curves": getattr(self, "categorical_curves", None),
                "numerical_value": getattr(self, "numerical_value", None),
                "numerical_curves": getattr(self, "numerical_curves", None),
            },
            "remove_outliers": {
                "outlier_curves": getattr(self, "outlier_curves", None),
                "threshold": getattr(self, "threshold", 0.05),
            },
            "remove_small_negative_values": {
                "numerical_curves": getattr(self, "numerical_curves", None),
                "nan_threshold": getattr(self, "nan_threshold", None),
            },
            "fill_zloc_from_depth": {},
            "fillna_with_fillers": {
                "num_filler": getattr(self, "num_filler", 0),
                "numerical_curves": getattr(self, "numerical_curves", None),
                "cat_filler": getattr(self, "cat_filler", "MISSING"),
                "categorical_curves": getattr(self, "categorical_curves", None),
            },
            "encode_columns": {
                "columns_to_encode": getattr(
                    self, "columns_to_encode", getattr(self, "categorical_curves", None)
                ),
                "formations_map": getattr(self, "formations_map", None),
                "groups_map": getattr(self, "groups_map", None),
                "missing_encoding_value": getattr(self, "missing_encoding_value", -1),
            },
            "select_columns": {
                "curves_to_select": getattr(self, "curves_to_select", None),
                "label_column": self.label_column,
                "id_column": self.id_column,
            },
"drop_columns": { "curves_to_drop": getattr(self, "curves_to_drop", None), }, "normalize_curves": { "low_perc": getattr(self, "low_perc", 0.05), "high_perc": getattr(self, "high_perc", 0.95), "save_key_wells": getattr(self, "save_key_wells", False), "curves_to_normalize": getattr(self, "curves_to_normalize", None), "id_column": self.id_column, "user_key_wells": getattr(self, "user_key_wells", None), "folder_path": self.folder_path, }, "scale_curves": { "scaler_method": getattr(self, "scaler_method", "RobustScaler"), "scaler": getattr(self, "scaler", None), "save_scaler": getattr(self, "save_scaler", False), "folder_path": self.folder_path, "curves_to_scale": getattr(self, "curves_to_scale", None), "scaler_kwargs": getattr(self, "scaler_kwargs", {}), }, "process_wells": { "id_column": self.id_column, "imputation_type": getattr(self, "imputer", None), }, "remove_noise": { # Default behaviour is to apply to all numeric cols "noisy_curves": getattr(self, "noisy_curves", None), "noise_removal_window": getattr(self, "noise_removal_window", None), }, } # Process wells uses a bunch of lower level functions so we need to # enrich it's kwargs with the relevant kwargs methods_used_by_process_wells = [ "simple_impute", "iterative_impute", "add_rolling_features", "add_gradient_features", "add_sequential_features", ] for method in methods_used_by_process_wells: defaults["process_wells"].update(defaults[method]) # Ingest defaults into kwargs if they exist if kwargs is not None: for function_name in kwargs: # retrieve default settings for function default_function_settings = defaults[function_name] # Populate kwargs with all non provided defaults for setting_name, default_setting in default_function_settings.items(): set_result = kwargs[function_name].setdefault( setting_name, default_setting ) # Need to perform some more advanced operations for specifically mapping # dictionaries # First, if the setting is of type dict (e.g. a mapping dict) # need to ensure that we preserve the users mapping and combine # them with any existing mappings created for example upon # class initialisation. if isinstance(set_result, dict) and set_result != default_setting: if setting_name in [ "formations_map", "groups_map", "curves_mapping", ]: # Append/Overwrite user provided mappings to existing mappings kwargs[function_name][setting_name] = { **default_setting, **set_result, } return kwargs return defaults
    def __validate_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Checks that the data loaded into the Dataset includes the expected
        curves and returns the validated dataframe

        Note:
            This is an internal class method only supposed to be used once the
            all_curves attribute of the class has been created.

        Args:
            df (pd.DataFrame): The dataframe to be validated

        Returns:
            pd.DataFrame: Returns the validated dataframe
        """
        # Check that all expected curves are present in the data
        expected_but_missing_curves = self.all_curves - set(df.columns.tolist())
        # Remove curves to be generated (petrophysical features)
        if hasattr(self, "petrophysical_features"):
            expected_but_missing_curves -= set(self.petrophysical_features)
        # Remove the label column if this is a prediction call and the label
        # column is therefore intentionally not in the dataframe:
        if hasattr(self, "label_column"):
            expected_but_missing_curves -= set([self.label_column])
        if expected_but_missing_curves:
            expected_but_missing_cat_curves = expected_but_missing_curves & set(
                self.categorical_curves
            )
            expected_but_missing_num_curves = (
                expected_but_missing_curves - expected_but_missing_cat_curves
            )
            warning_msg = (
                "There are curves that are expected but missing from"
                " the provided dataframe. "
            )
            if expected_but_missing_cat_curves:
                warning_msg += (
                    "These curves are being filled with cat_filler: "
                    f"{expected_but_missing_cat_curves}. "
                )
            if expected_but_missing_num_curves:
                warning_msg += (
                    "These curves are being filled with num_filler: "
                    f"{expected_but_missing_num_curves}"
                )
            warnings.warn(warning_msg)
            df[list(expected_but_missing_cat_curves)] = self.cat_filler
            df[list(expected_but_missing_num_curves)] = self.num_filler
        return df
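# A minimal end-to-end sketch of how this class is typically driven. It is not
# part of the library itself: the curve names, mappings, settings and pipeline
# below are illustrative assumptions, and folder_path can be any absolute path.
if __name__ == "__main__":
    example_settings = {
        "id_column": "well_name",
        "depth_column": "DEPTH",
        "label_column": "LITHOLOGY",  # hypothetical label curve
        "preprocessing_pipeline": {
            "fillna_with_fillers": {},
            "scale_curves": {"curves_to_scale": ["GR"]},
        },
    }
    example_mappings = {"curve_mappings": {"GAMMA_RAY": "GR"}}

    dataset = Dataset(
        mappings=example_mappings,
        settings=example_settings,
        folder_path=os.path.abspath("./mlpet_artifacts"),
    )

    # Inspect the full set of defaults that preprocess would fall back on
    print(dataset.get_preprocess_defaults())

    # Run the pipeline defined in the settings on a toy dataframe
    toy_df = pd.DataFrame(
        {
            "well_name": ["A", "A", "B"],
            "DEPTH": [1000.0, 1001.0, 2000.0],
            "GAMMA_RAY": [55.0, 60.0, 70.0],
        }
    )
    processed = dataset.preprocess(toy_df)
    print(processed.head())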