Source code for akerbp.mlpet.preprocessors

"""
This module contains all the preprocessors available to the Dataset class of the
mlpet repo (besides the preprocessing functions found in feature_engineering and
imputers). All preprocessing functions in mlpet **MUST** follow a strict API
in order to be used in conjunction with the preprocess method of the Dataset
class.

The preprocessing API looks like this::

    def some_preprocessing_function(df: pd.DataFrame, **kwargs) -> pd.DataFrame:
        ...
        do_something
        ...
        return df

This API allows for defining a preprocessing pipeline at runtime and passing it
to the preprocess method instead of defining it prior to the initialisation of
the Dataset class.
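
For example, functions conforming to this API can be chained directly. The
snippet below is an illustrative sketch using keyword arguments documented on
the functions in this module (the filler value -9999.0 is just an example)::

    df = set_as_nan(df, numerical_value=-9999.0)
    df = remove_outliers(df, outlier_curves=["GR"], threshold=0.05)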

"""
import os
import warnings
from typing import Any, Dict, List, Tuple, Union

import joblib
import numpy as np
import pandas as pd
import sklearn.preprocessing
from sklearn.base import BaseEstimator

from akerbp.mlpet import feature_engineering, imputers, utilities


def set_as_nan(df: pd.DataFrame, **kwargs) -> pd.DataFrame:
    """
    Replaces the provided numerical and categorical values with np.nan in the
    respective numerical and categorical columns. If numerical or categorical
    column names are not provided, they will be inferred using the get_col_types
    utility function.

    Args:
        df (pd.DataFrame): dataframe in which to replace the values

    Keyword Args:
        numerical_curves (List[str], optional): The numerical columns in which
            the numerical value should be replaced with np.nan.
        categorical_curves (List[str], optional): The categorical columns in
            which the categorical value should be replaced with np.nan.
        numerical_value (float/int, optional): The numerical value that should
            be replaced with np.nan.
        categorical_value (str, optional): The categorical value that should be
            replaced with np.nan.

    Returns:
        df (pd.DataFrame): The original dataframe filled with np.nan where requested
    """
    # Processing inputs
    numerical_curves: List[str] = kwargs.get("numerical_curves", None)
    categorical_curves: List[str] = kwargs.get("categorical_curves", None)
    numerical_value = kwargs.get("numerical_value")
    categorical_value = kwargs.get("categorical_value")
    inferred_numeric, inferred_categorical = utilities.get_col_types(
        df, categorical_curves
    )
    # Ensure user is using method the correct way
    if numerical_value is not None:
        if numerical_curves is None:
            numerical_curves = inferred_numeric
        df.loc[:, numerical_curves] = df[numerical_curves].replace(
            to_replace=numerical_value, value=np.nan
        )
    if categorical_value:
        if categorical_curves is None:
            categorical_curves = inferred_categorical
        df.loc[:, categorical_curves] = df[categorical_curves].replace(
            to_replace=categorical_value, value=np.nan
        )
    return df

def remove_outliers(df: pd.DataFrame, **kwargs) -> pd.DataFrame:
    """
    Returns the dataframe after applying the curve specific cutoff values if the
    threshold (th) of the number of outliers is passed. The following curves and
    corresponding cutoff values are used (if they exist in the provided curves
    list):

        - GR: low cutoff: 0, high cutoff: 250
        - RMED: high cutoff: 100
        - RDEP: high cutoff: 100
        - RSHA: high cutoff: 100
        - NEU: low cutoff: -0.5, high cutoff: 1 (replaced with np.nan)
        - PEF: high cutoff: 10 (replaced with np.nan)

    If not otherwise specified above, values above/below the cutoffs are replaced
    with the corresponding cutoff value.

    Args:
        df (pd.DataFrame): dataframe from which to remove outliers

    Keyword Args:
        outlier_curves (list): The curves to remove outliers for using the above
            rules
        threshold (float, optional): threshold for the fraction of samples that
            are outliers. Used for displaying warnings when too many samples are
            affected. Defaults to 0.05.

    Returns:
        pd.DataFrame: dataframe without outliers
    """
    th = kwargs.get("threshold", 0.05)
    outlier_curves = kwargs.get("outlier_curves", None)
    if outlier_curves is not None:
        len_df = np.array([len(df)])
        with np.errstate(divide="ignore", invalid="ignore"):
            if "GR" in outlier_curves:
                outliers = df[(df.GR < 0) | (df.GR > 250)]
                if len(outliers) / len_df > th:
                    warnings.warn(
                        f"GR has more than {th*100}% of its values lower"
                        " than 0 or higher than 250. Replacing them with either 0"
                        " or 250. Note: This column name is the name after it has"
                        " been mapped using the provided mappings.yaml! So it could"
                        " be another column from your original data that triggered"
                        " this warning and instead was mapped to the name printed above."
                    )
                df.GR = df.GR.clip(lower=0, upper=250)
            for resistivity in ["RSHA", "RMED", "RDEP"]:
                if resistivity in outlier_curves:
                    outliers = df[df[resistivity] > 100]
                    if len(outliers) / len_df > th:
                        warnings.warn(
                            f"{resistivity} has more than {th*100}% of its values higher"
                            " than 100. Note: This column name is the name after it has"
                            " been mapped using the provided mappings.yaml! So it could"
                            " be another column from your original data that triggered"
                            " this warning and instead was mapped to the name printed above."
                        )
                    df.loc[outliers.index, resistivity] = 100
            if "NEU" in outlier_curves:
                outliers = df[(df.NEU < -0.5) | (df.NEU > 1)]
                if len(outliers) / len_df > th:
                    warnings.warn(
                        f"NEU has more than {th*100}% of its values higher than 1"
                        " or lower than -0.5"
                    )
                df.loc[df.NEU > 1, "NEU"] = np.nan
                df.loc[df.NEU < -0.5, "NEU"] = np.nan
            if "PEF" in outlier_curves:
                outliers = df[df.PEF > 10]
                if len(outliers) / len_df > th:
                    warnings.warn(
                        f"PEF has more than {th*100}% of its values higher than 10"
                    )
                df.loc[df.PEF > 10, "PEF"] = np.nan
    return df

def remove_small_negative_values(df: pd.DataFrame, **kwargs) -> pd.DataFrame:
    """
    Replaces small negative values with np.nan in all the numeric columns. The
    small negative values are determined by defining a nan_threshold. If a
    negative value is smaller than the threshold it is set to NaN. Naturally,
    this operation is only done on numeric columns.

    Args:
        df (pd.DataFrame): dataframe to be preprocessed

    Keyword Args:
        numerical_curves (List[str]): The column names for which small negative
            values should be replaced with NaNs. If not provided, this list is
            generated using the get_col_types utility function
        nan_threshold (float, optional): The threshold determining the smallest
            acceptable negative value. Defaults to None

    Returns:
        pd.DataFrame: preprocessed dataframe
    """
    nan_threshold = kwargs.get("nan_threshold")
    numerical_curves = kwargs.get("numerical_curves", None)
    if numerical_curves is None:
        numerical_curves, _ = utilities.get_col_types(df)
    if nan_threshold is not None:
        # remove small negative values
        for col in numerical_curves:
            df.loc[df[col] <= nan_threshold, col] = np.nan
    return df

def fill_zloc_from_depth(df: pd.DataFrame, **kwargs) -> pd.DataFrame:
    """
    Fills missing values in the Z_LOC column with values derived from the
    DEPTH_MD column.

    Args:
        df (pd.DataFrame): The dataframe containing both Z_LOC and DEPTH_MD
            columns

    Returns:
        pd.DataFrame: The original dataframe with the Z_LOC column filled where
            possible.
    """
    # fill the missing Z_LOC values from DEPTH_MD (always present)
    if ("Z_LOC" in df.columns) and ("DEPTH_MD" in df.columns):
        df.loc[:, "Z_LOC"] = df["Z_LOC"].fillna(-(df["DEPTH_MD"] - 20))
    return df

def fillna_with_fillers(df: pd.DataFrame, **kwargs) -> pd.DataFrame:
    """
    Fills all NaNs in numeric columns with a num_filler and all NaNs in
    categorical columns with a cat_filler. All four of these variables are
    passed as kwargs. If a num_filler and/or cat_filler is passed without the
    corresponding column names, column types are inferred using the
    get_col_types utility function.

    Args:
        df (pd.DataFrame): The dataframe to be preprocessed

    Keyword Args:
        num_filler (float): The numeric value to fill NaNs with in the numeric
            columns
        numerical_curves (List[str]): The column names for all numeric columns
            where the NaNs will be filled with the num_filler
        cat_filler (str): The value to fill NaNs with in the categorical columns
        categorical_curves (List[str]): The column names for all categorical
            columns where the NaNs will be filled with the cat_filler

    Returns:
        pd.DataFrame: Preprocessed dataframe
    """
    # Process kwargs
    num_filler = kwargs.get("num_filler")
    cat_filler = kwargs.get("cat_filler")
    numerical_curves = kwargs.get("numerical_curves", None)
    categorical_curves = kwargs.get("categorical_curves", None)
    inferred_numeric, inferred_categorical = utilities.get_col_types(
        df, categorical_curves
    )
    if numerical_curves is None:
        numerical_curves = inferred_numeric
    if categorical_curves is None:
        categorical_curves = inferred_categorical
    # Fill missing rows with num and cat filler
    if num_filler is not None:
        df.loc[:, numerical_curves] = df[numerical_curves].fillna(num_filler)
    if cat_filler is not None:
        df.loc[:, categorical_curves] = df[categorical_curves].fillna(cat_filler)
    return df

def encode_columns(df: pd.DataFrame, **kwargs) -> pd.DataFrame:
    """
    Encodes categorical columns. Only available for:

        - FORMATION column - categories are encoded using the formations_map
          provided in the kwargs.
        - GROUP column - categories are encoded using the groups_map provided
          in the kwargs
        - lsuName column - categories are encoded using the groups_map provided
          in the kwargs

    Note: All names are standardized prior to mapping using the utility function
    standardize_group_formation_name and all categories that weren't mapped are
    encoded with -1.

    Args:
        df (pd.DataFrame): dataframe to which to apply encoding of categorical
            variables

    Keyword Args:
        columns_to_encode (list): which columns to encode. Defaults to no columns
            being encoded. If no columns are passed, the get_col_types utility
            function is used to determine the categorical columns
        formations_map (dict): A mapping dictionary mapping formation names to
            corresponding integers. Defaults to an empty dictionary (ie no
            encoding).
        groups_map (dict): A mapping dictionary mapping group names to
            corresponding integers. Defaults to an empty dictionary (ie no
            encoding).
        missing_encoding_value (int): The value used to encode categories for
            which no match was found in the provided mappings. Defaults to -1.

    Returns:
        pd.DataFrame: dataframe with categorical columns encoded
    """
    columns_to_encode: List[str] = kwargs.get("columns_to_encode", None)
    formations_map: Dict[str, int] = kwargs.get("formations_map", None)
    groups_map: Dict[str, int] = kwargs.get("groups_map", None)
    missing_encoding_value: int = kwargs.get("missing_encoding_value", -1)
    if columns_to_encode is None:
        _, columns_to_encode = utilities.get_col_types(df, columns_to_encode)
    if "FORMATION" in columns_to_encode and formations_map is not None:
        df["FORMATION"] = df["FORMATION"].apply(
            utilities.standardize_group_formation_name
        )
        df["FORMATION"] = df["FORMATION"].map(formations_map)
        df["FORMATION"] = df["FORMATION"].fillna(missing_encoding_value)
    if "GROUP" in columns_to_encode and groups_map is not None:
        df["GROUP"] = df["GROUP"].apply(utilities.standardize_group_formation_name)
        df["GROUP"] = df["GROUP"].map(groups_map)
        df["GROUP"] = df["GROUP"].fillna(missing_encoding_value)
    if "lsuName" in columns_to_encode and groups_map is not None:
        df["lsuName"] = df["lsuName"].apply(utilities.standardize_group_formation_name)
        df["lsuName"] = df["lsuName"].map(groups_map)
        df["lsuName"] = df["lsuName"].fillna(missing_encoding_value)
    return df

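# Illustrative usage sketch for encode_columns. The mapping dictionary below is
# hypothetical; in practice its keys must match the output of
# utilities.standardize_group_formation_name for your data.
#
#   df = encode_columns(
#       df,
#       columns_to_encode=["FORMATION"],
#       formations_map={"UTSIRA": 0, "SKADE": 1},
#       missing_encoding_value=-1,
#   )
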
def select_columns(df: pd.DataFrame, **kwargs) -> pd.DataFrame:
    """
    Returns a dataframe containing only the curves chosen by the user, filtered
    from the original dataframe.

    Args:
        df (pd.DataFrame): dataframe to filter

    Keyword Args:
        curves_to_select (list): which curves should be kept. Defaults to None.
        label_column (str): The name of the label column to keep if also desired.
            Defaults to None
        id_column (str): The name of the id column to keep if also desired.
            Defaults to None

    Returns:
        pd.DataFrame: dataframe with relevant curves
    """
    curves_to_select: List[str] = kwargs.get("curves_to_select", None)
    label_column: str = kwargs.get("label_column", None)
    id_column: str = kwargs.get("id_column", None)
    if curves_to_select is not None:
        curves_to_keep = list(set(curves_to_select))
        if label_column is not None and label_column in df.columns:
            curves_to_keep += [label_column]
        if id_column is not None and id_column in df.columns:
            curves_to_keep += [id_column]
        df = df.loc[:, curves_to_keep]
    return df

def drop_columns(df: pd.DataFrame, **kwargs) -> pd.DataFrame:
    """
    Returns a dataframe with the requested curves dropped.

    Args:
        df (pd.DataFrame): dataframe to filter

    Keyword Args:
        curves_to_drop (list): The curves to be dropped. Defaults to None.

    Returns:
        pd.DataFrame: dataframe with requested curves dropped
    """
    curves_to_drop: List[str] = kwargs.get("curves_to_drop", None)
    if curves_to_drop is not None:
        curves_to_drop = list(set(curves_to_drop))
        df = df.drop(columns=curves_to_drop)
    return df

def normalize_curves(
    df: pd.DataFrame, **kwargs
) -> Union[pd.DataFrame, Tuple[pd.DataFrame, Dict[str, Any]]]:
    """
    Normalizes dataframe columns. We choose one well to be a "key well" and
    normalize all other wells to its low and high values. This process requires
    the kwarg 'id_column' to be passed so that wells can be grouped by their ID.

    For each curve to be normalized, high and low quantiles are calculated per
    well (the high and low percentage keyword arguments dictate this). If the
    user provides key wells, the key wells calculation is not performed.

    Args:
        df (pd.DataFrame): dataframe with columns to normalize

    Keyword Args:
        curves_to_normalize (list): List of curves (column names) to normalize.
            Defaults to None (i.e. no curves being normalized).
        id_column (str): The name of the well ID column. This keyword argument
            **MUST** be provided to use this method.
        low_perc (float): low quantile to use as min value. Defaults to 0.05
        high_perc (float): high quantile to use as max value. Defaults to 0.95
        user_key_wells (dict): dictionary with curves as keys and min/max values
            and key well as values
        save_key_wells (bool): whether to save the key wells dictionary in
            folder_path. Defaults to False
        folder_path (str): The folder to save the key wells dictionary in.
            Defaults to "" so an error will be raised if saving is set to True
            but no folder_path is provided.

    Returns:
        tuple(pd.DataFrame, dict): pd.DataFrame with normalized values and a
            dictionary with the key wells that were used to normalize the
            curves_to_normalize
    """
    curves_to_normalize: List[str] = kwargs.get("curves_to_normalize", None)
    id_column: str = kwargs.get("id_column", None)
    low_perc: float = kwargs.get("low_perc", 0.05)
    high_perc: float = kwargs.get("high_perc", 0.95)
    user_key_wells: Dict[str, Dict[str, Union[str, float]]] = kwargs.get(
        "user_key_wells", None
    )
    save_key_wells: bool = kwargs.get("save_key_wells", False)
    folder_path: str = kwargs.get("folder_path", "")

    if curves_to_normalize is not None:
        if id_column is None:
            id_column = "DUMMY_WELL_ID_COLUMN"
            df[id_column] = "UNKNOWN WELL"
            if user_key_wells is None:
                raise ValueError(
                    "Unable to normalize curves because no well ID column name was "
                    "provided and no user_key_wells were provided!"
                )
            else:
                warnings.warn(
                    "No id column was provided. Performing normalization "
                    "by assuming all the data is one well!"
                )

        # Calculate necessary quantiles for determining key wells and performing
        # normalization later on
        wells_data = df.loc[:, curves_to_normalize + [id_column]].groupby(id_column)
        high_quantiles = wells_data.quantile(high_perc)
        low_quantiles = wells_data.quantile(low_perc)

        if user_key_wells is None:
            # Need to determine key wells
            key_wells = (high_quantiles - low_quantiles).idxmax()
            # Convert key_wells into save format
            key_wells = {
                k: {
                    "curve": k,
                    "well_name": v,
                    "ref_low": low_quantiles[k][v],
                    "ref_high": high_quantiles[k][v],
                }
                for k, v in key_wells.to_dict().items()
            }
        else:
            # Check if key wells is provided as a dict with the same format
            if not isinstance(user_key_wells, dict):
                raise ValueError(
                    "Other methods to provide key wells are not implemented yet!"
                )
            if user_key_wells.keys() != set(curves_to_normalize):
                raise ValueError(
                    "Curves included in the key wells dictionary are inconsistent "
                    "with curves_to_normalize",
                    user_key_wells.keys(),
                    curves_to_normalize,
                )
            key_wells = user_key_wells

        # Normalize all wells
        for c in curves_to_normalize:
            key_well = key_wells[c]
            df.loc[:, "low_p"] = df[id_column].map(low_quantiles[c])
            df.loc[:, "high_p"] = df[id_column].map(high_quantiles[c])

            # normalize all other wells using key well as reference
            df.loc[:, c] = df.apply(
                lambda x: utilities.normalize(
                    x[c],
                    key_well["ref_low"],
                    key_well["ref_high"],
                    x["low_p"],
                    x["high_p"],
                ),
                axis=1,
            )

        # Perform post normalization cleanup
        df = df.drop(columns=["low_p", "high_p"])
        if id_column == "DUMMY_WELL_ID_COLUMN":
            df = df.drop(columns=[id_column])

        if save_key_wells:
            if folder_path:
                # save key wells to where model is
                joblib.dump(
                    key_wells,
                    os.path.join(folder_path, "key_wells.joblib"),
                )
            else:
                raise ValueError(
                    "Save key wells was set to true but no folder_path kwarg was "
                    "passed to the method!"
                )
        return (df, {"user_key_wells": key_wells})
    else:
        warnings.warn(
            "No curves were passed to curves_to_normalize so no normalization "
            "was performed!"
        )
        return df

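# Illustrative usage sketch for normalize_curves (the column names are
# hypothetical). When curves_to_normalize is provided, the function returns a
# (dataframe, dict) tuple whose dict holds the key wells used for normalization:
#
#   df, artifacts = normalize_curves(
#       df, curves_to_normalize=["GR"], id_column="well_name"
#   )
#   key_wells = artifacts["user_key_wells"]
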
def scale_curves(
    df: pd.DataFrame, **kwargs
) -> Union[pd.DataFrame, Tuple[pd.DataFrame, Dict[str, BaseEstimator]]]:
    """
    Scales the specified columns.

    Args:
        df (pd.DataFrame): dataframe containing columns to scale

    Keyword Args:
        curves_to_scale (list): list of curves (column names) to scale
        scaler_method (str): name of any sklearn scaler. Defaults to RobustScaler
        scaler_kwargs (dict): dictionary of any kwargs to pass to the sklearn
            scaler
        scaler (BaseEstimator): a pre-fitted sklearn scaler object to apply
            directly to the curves_to_scale. If this kwarg is provided, none of
            the other kwargs **BESIDES** curves_to_scale are needed.
        save_scaler (bool): whether to save the scaler in folder_path or not.
            Defaults to False.
        folder_path (str): Which folder to save the scaler in. Defaults to no
            path, so a path needs to be provided if the save_scaler kwarg is set
            to True.

    Returns:
        tuple(pd.DataFrame, dict): the dataframe with scaled columns and the
            scaler object that was used to scale them, stored in a dict.
    """
    curves_to_scale: List[str] = kwargs.get("curves_to_scale", None)
    scaler_method: str = kwargs.get("scaler_method", "RobustScaler")
    scaler_kwargs: Dict[str, Any] = kwargs.get("scaler_kwargs", {})
    scaler: BaseEstimator = kwargs.get("scaler", None)
    save_scaler: bool = kwargs.get("save_scaler", False)
    folder_path: str = kwargs.get("folder_path", "")

    if curves_to_scale is not None:
        if scaler is None:
            try:
                scaler = getattr(sklearn.preprocessing, scaler_method)
            except AttributeError as ae:
                raise ValueError(
                    "The requested scaler_method could not be found in the "
                    "sklearn.preprocessing library!"
                ) from ae
            scaler = scaler(**scaler_kwargs)
            scaler.fit(df[curves_to_scale])
            # save scaler to same path as model
            if save_scaler:
                if folder_path:
                    joblib.dump(
                        scaler,
                        os.path.join(folder_path, "scaler.joblib"),
                    )
                else:
                    raise ValueError(
                        "save_scaler was set to true but no folder_path kwarg was "
                        "passed to the method!"
                    )
        df.loc[:, curves_to_scale] = scaler.transform(df[curves_to_scale])
        return (df, {"scaler": scaler})
    return df

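# Illustrative usage sketch for scale_curves (the column names are
# hypothetical). scaler_method can be the name of any scaler class found in
# sklearn.preprocessing:
#
#   df, artifacts = scale_curves(
#       df,
#       curves_to_scale=["GR", "RDEP"],
#       scaler_method="StandardScaler",
#   )
#   fitted_scaler = artifacts["scaler"]
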
def process_wells(df: pd.DataFrame, **kwargs) -> pd.DataFrame:
    """
    Performs preprocessing per well.

    This is a convenience function that will perform several preprocessing steps
    per well if an id_column is provided in the kwargs. Otherwise it will treat
    the entire df as one well and preprocess it according to the same pipeline
    as the per well treatment.

    The preprocessing pipeline performed is as follows:

        1. imputation (if the 'imputation_type' kwarg is set)
        2. feature engineering:

            - Rolling features created using the add_rolling_features function
              (if the 'rolling_features' kwarg is set)
            - Gradient features created using the add_gradient_features function
              (if the 'gradient_features' kwarg is set)
            - Sequential features created using the add_sequential_features
              function (if the 'sequential_features' kwarg is set)

    The kwargs for each method discussed above must also be provided to this
    method. Please refer to the specific methods to determine which kwargs to
    provide.

    Args:
        df (pd.DataFrame): dataframe of data to be preprocessed

    Keyword Args:
        id_column (str): The well ID column name to use to group by well ID
        imputation_type (str): Which imputer to use. Can be one of the following
            two options:

                1. 'iterative' - runs the iterative_impute method from the
                   imputers module. Please refer to that method to read up on
                   all necessary kwargs to use that method properly
                2. 'simple' - runs the simple_impute method from the imputers
                   module. Please refer to that method to read up on all
                   necessary kwargs to use that method properly

    Returns:
        pd.DataFrame: dataframe of preprocessed data
    """

    def _preprocessing_pipeline(df: pd.DataFrame, **kwargs) -> pd.DataFrame:
        # Process kwargs to determine what to do
        imputation_type: str = kwargs.get("imputation_type", "")
        # impute features
        if imputation_type == "iterative":
            df, _ = imputers.iterative_impute(df, **kwargs)
        if imputation_type == "simple":
            df, _ = imputers.simple_impute(df, **kwargs)
        # add rolling features
        if "rolling_features" in kwargs:
            df = feature_engineering.add_rolling_features(df, **kwargs)
        # add gradient features
        if "gradient_features" in kwargs:
            df = feature_engineering.add_gradient_features(df, **kwargs)
        # add sequential features
        if "sequential_features" in kwargs:
            df = feature_engineering.add_sequential_features(df, **kwargs)
        return df

    id_column: str = kwargs.get("id_column", None)
    # Process per well if id_column exists otherwise process as one big set
    if id_column in df.columns:
        well_names = df[id_column].unique()
        res_df = pd.DataFrame()
        for well in well_names:
            well_df = df.loc[df[id_column] == well, :].copy()
            well_df = _preprocessing_pipeline(well_df, **kwargs)
            res_df = pd.concat([res_df, well_df])
        df = res_df.copy()
    else:
        warnings.warn(
            "Not possible to process per well as well ID is not in dataset. "
            "Preprocessing was done considering all data is from the same well."
        )
        df = _preprocessing_pipeline(df, **kwargs)
    return df

def remove_noise(df: pd.DataFrame, **kwargs) -> pd.DataFrame:
    """
    Removes noise by applying a rolling median window to each curve.

    Warning:
        The noise_removal_window kwarg is required for this function. If it is
        not provided, no noise filtering is performed and the df is returned
        untouched.

    Args:
        df (pd.DataFrame): dataframe to which to apply median filtering

    Keyword Args:
        noisy_curves (list): list of curves (columns) to apply noise removal to
            with a median filter. If none are provided, median filtering will be
            applied to all numerical columns. Numerical columns are identified
            using the get_col_types utility function
        noise_removal_window (int): the window size to use when applying median
            filtering

    Returns:
        pd.DataFrame: dataframe after removing noise
    """
    # Processing inputs
    noisy_curves: List[str] = kwargs.get("noisy_curves", None)
    noise_removal_window = kwargs.get("noise_removal_window")
    if noisy_curves is None:
        # Only interested in numerical columns so no need to flood console with
        # warnings related to categorical curves
        noisy_curves, _ = utilities.get_col_types(df, warn=False)
        warnings.warn(
            "No noisy_curves were defined for the remove_noise function! "
            "Proceeding with removing noise in all detected numeric columns."
        )
    if noise_removal_window is not None:
        df.loc[:, noisy_curves] = (
            df[noisy_curves]
            .rolling(noise_removal_window, center=True, min_periods=1)
            .median()
        )
    else:
        warnings.warn(
            "No noise filtering was applied because one of the keyword arguments"
            " was not passed!"
        )
    return df

def apply_calibration(
    df_measured: pd.DataFrame,
    df_predicted: pd.DataFrame,
    curves: List[str],
    location_curves: List[str],
    level: str,
    mode: str,
    id_column: str,
    distance_thres: float = 9999.0,
    calib_map: pd.DataFrame = None,
    standardize_level_names: bool = True,
) -> pd.DataFrame:
    """
    Applies calibration to predicted curves, removing biases with the help of
    measured curves, either in the same well or the closest wells.

    Args:
        df_measured (pd.DataFrame): original dataframe with measured values
        df_predicted (pd.DataFrame): dataframe with predicted values, same
            column names
        curves (List[str]): curves to which to apply calibration
        location_curves (List[str]): which curves to use for distance to get the
            closest wells
        level (str): which grouping type to apply calibration at (per group, per
            formation)
        mode (str): type of value aggregation (mean, median, mode)
        id_column (str): name of the well ID column
        distance_thres (float, optional): threshold for a well to be considered
            close enough. Defaults to 9999.0.
        calib_map (pd.DataFrame, optional): calibration map for the level.
            Defaults to None.
        standardize_level_names (bool, optional): whether to standardize
            formation or group names. Defaults to True.

    Returns:
        pd.DataFrame: dataframe with calibrated values
    """
    # either get calibration from cdf if None or work on given map
    if calib_map is None:
        # TODO get calibration map from CDF
        raise ValueError("Getting calibration map from CDF is not yet implemented!")

    # Make sure all necessary columns are in given dfs
    for df, df_name in zip(
        [df_measured, df_predicted, calib_map],
        ["df_measured", "df_predicted", "calibration map"],
    ):
        missing_curves = [
            c
            for c in curves + location_curves + [level, id_column]
            if c not in df.columns
        ]
        if len(missing_curves) > 0:
            raise ValueError(
                f"Missing necessary curves in dataframe '{df_name}': "
                f"{missing_curves}"
            )

    # standardize level names if either group or formation
    if standardize_level_names and level in ["FORMATION", "GROUP"]:
        df_measured[level] = df_measured[level].apply(
            utilities.standardize_group_formation_name
        )
        df_predicted[level] = df_predicted[level].apply(
            utilities.standardize_group_formation_name
        )
        calib_map[level] = calib_map[level].apply(
            utilities.standardize_group_formation_name
        )

    calib_meas = utilities.get_calibration_values(
        df_measured,
        curves=curves,
        location_curves=location_curves,
        level=level,
        mode=mode,
        id_column=id_column,
        distance_thres=distance_thres,
        calibration_map=calib_map,
    )
    calib_pred = utilities.get_calibration_values(
        df_predicted,
        curves=curves,
        location_curves=location_curves,
        level=level,
        mode=mode,
        id_column=id_column,
        distance_thres=distance_thres,
        calibration_map=calib_map,
    )

    df_calibrated = df_predicted.copy()
    for well in df_predicted[id_column].unique():
        print(well)
        df_m = df_predicted[df_predicted[id_column] == well]
        for c in curves:
            # Per-level bias between predicted and measured calibration values
            diffs = {
                i: calib_pred[well][c][i] - calib_meas[well][c][i]
                for i in df_m[level].unique()
            }
            for g, v in diffs.items():
                if np.isnan(v):
                    continue
                # Subtract the bias from the predicted curve for this well/level
                mask = (df_calibrated[level] == g) & (df_calibrated[id_column] == well)
                df_calibrated.loc[mask, c] = df_calibrated.loc[mask, c] - v
    return df_calibrated