Source code for akerbp.mlpet.utilities

import importlib.resources
import re
import warnings
from functools import lru_cache
from typing import Any, Dict, List, Optional, Set, Tuple, Union

import numpy as np
import pandas as pd
from cognite.client import CogniteClient
from cognite.client.data_classes import Asset, AssetList
from numpy import float64

import akerbp.mlpet.data


def drop_rows_wo_label(df: pd.DataFrame, label_column: str, **kwargs) -> pd.DataFrame:
    """
    Removes rows with missing labels. Since imputation is currently done via
    pd.DataFrame.fillna(), all that is needed is the constant filler value. If
    imputation is ever done using one of the sklearn.impute methods or a
    similar API, the indicator column (add_indicator=True) could be used
    instead.

    Args:
        df (pd.DataFrame): dataframe to process
        label_column (str): Name of the label column containing rows without labels

    Keyword Args:
        missing_label_value (str, optional): If nans are denoted differently than
            np.nans, a missing_label_value can be passed as a kwarg and all rows
            containing this missing_label_value in the label column will be dropped

    Returns:
        pd.DataFrame: processed dataframe
    """
    missing_label_value = kwargs.get("missing_label_value")
    if missing_label_value is not None:
        return df.loc[df[label_column] != missing_label_value, :]
    return df.loc[~df[label_column].isna(), :]
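# Usage sketch (illustrative only, not part of the module; the dataframe `df` and
# the "LITHOLOGY" label column are assumptions):
#
#   >>> labelled = drop_rows_wo_label(df, "LITHOLOGY", missing_label_value=-9999)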
@lru_cache(maxsize=None)
def readPickle(path):
    """
    A cached helper function for loading pickle files. Loading pickle files
    multiple times can really slow down execution.

    Args:
        path (str): Path to the pickled object to be loaded

    Returns:
        data: Return the loaded pickled data
    """
    import pickle

    with open(path, "rb") as infile:
        data = pickle.load(infile, encoding="bytes")
    return data
def map_formation_and_group(
    form_or_group: pd.Series, MissingValue: Union[float, str] = np.nan
) -> Tuple[Union[float, str], Union[float, str]]:
    """
    A helper function for retrieving the formation and group of a standardised
    formation/group based on mlpet's NPD pickle mapper.

    Args:
        form_or_group (pd.Series): A pandas series containing AkerBP legal
            formation/group names to be mapped
        MissingValue (Any): If no mapping is found, return this missing value

    Returns:
        tuple: Returns a formation and a group sequence respectively,
            corresponding element-wise to the input string series
    """
    with importlib.resources.path(akerbp.mlpet.data, "npd_fm_gp_key_dic.pcl") as path:
        dic_names = readPickle(path)
    mapping = {}
    for item in form_or_group.unique():
        form, group = MissingValue, MissingValue
        try:
            dic = dic_names[item]
            if dic["LEVEL"] == "FORMATION":
                form = dic["NAME"]
                if " GP" in dic["PARENT"]:
                    group = dic["PARENT"]
            elif dic["LEVEL"] == "GROUP":
                group = dic["NAME"]
        except KeyError:
            pass
        mapping[item] = (form, group)
    form, group = zip(*form_or_group.map(mapping))
    return form, group
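# Usage sketch (illustrative only; assumes a dataframe `df` with a column of
# standardised formation/group names called "GROUP_RAW"):
#
#   >>> formation, group = map_formation_and_group(df["GROUP_RAW"])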
def standardize_group_formation_name(name: Union[str, Any]) -> Union[str, Any]:
    """
    Performs several string operations to standardize group formation names for
    later categorisation.

    Args:
        name (str): A group formation name

    Returns:
        float or str: Returns the standardized group formation name or np.nan
            if the name == "NAN".
    """

    def __split(string: str) -> str:
        string = string.split(" ")[0]
        string = string.split("_")[0]
        return string

    def __format(string: str) -> str:
        string = string.replace("AA", "A")
        string = string.replace("Å", "A")
        string = string.replace("AE", "A")
        string = string.replace("Æ", "A")
        string = string.replace("OE", "O")
        string = string.replace("Ø", "O")
        return string

    # First perform some formatting to ensure consistencies in the checks
    name = str(name).upper().strip()
    # Replace NAN string with actual nan
    if name == "NAN":
        return np.nan
    # GPs & FMs with no formal definition are all mapped to UNKNOWN
    if name in [
        "NO FORMAL NAME",
        "NO GROUP DEFINED",
        "UNDEFINED",
        "UNDIFFERENTIATED",
        "UNKNOWN",
    ]:
        return "UNKNOWN"
    # Then perform standardization
    if "INTRA" in name:
        name = " ".join(name.split(" ")[:2])
        name = " ".join(name.split("_")[:2])
    elif "(" in name and ")" in name:
        # Remove text between parentheses, including the parentheses
        name = re.sub(r"[\(].*?[\)]", "", name).strip()
        name = __split(name)
    elif name == "TD":
        name = "TOTAL DEPTH"
    else:
        name = __split(name)
    # Format
    name = __format(name)
    return name
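# Usage sketch (illustrative only; the dataframe and the "FORMATION" column name
# are assumptions):
#
#   >>> df["FORMATION"] = df["FORMATION"].apply(standardize_group_formation_name)
#   >>> standardize_group_formation_name("Hordaland GP (informal)")
#   'HORDALAND'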
def standardize_names(
    names: List[str], mapper: Dict[str, str]
) -> Tuple[List[str], Dict[str, str]]:
    """
    Standardizes curve names in a list based on the provided mapper dictionary.
    Any names not found in the dictionary are kept unchanged.

    Args:
        names (list): list with curve names
        mapper (dictionary): dictionary with mappings (e.g. curve_mappings)

    Returns:
        tuple: list of standardized curve names and a dictionary mapping the
            new names back to the original ones
    """
    standardized_names = []
    for name in names:
        mapped_name = mapper.get(name)
        if mapped_name:
            standardized_names.append(mapped_name)
        else:
            standardized_names.append(name)
    old_new_cols = {n: o for o, n in zip(names, standardized_names)}
    return standardized_names, old_new_cols
def standardize_curve_names(df: pd.DataFrame, mapper: Dict[str, str]) -> pd.DataFrame:
    """
    Standardizes curve names in a dataframe based on the provided mapper
    dictionary. Any columns not in the dictionary are left unchanged.

    Args:
        df (pd.DataFrame): dataframe whose column names should be standardized
        mapper (dictionary): dictionary with mappings (e.g. curve_mappings).
            The keys should be the old curve names and the values the desired
            curve names.

    Returns:
        pd.DataFrame: dataframe with standardized column names
    """
    return df.rename(columns=mapper)
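# Usage sketch (illustrative only; the mapping shown is an assumption, the real
# mappings typically come from the package's mappings.yaml):
#
#   >>> df = standardize_curve_names(df, mapper={"GR_1": "GR", "DEN": "RHOB"})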
def get_col_types(
    df: pd.DataFrame, categorical_curves: Optional[List[str]] = None, warn: bool = True
) -> Tuple[List[str], List[str]]:
    """
    Returns lists of numerical and categorical columns

    Args:
        df (pd.DataFrame): dataframe with columns to classify
        categorical_curves (list): List of column names that should be
            considered as categorical. Defaults to an empty list.
        warn (bool): Whether to warn the user if categorical curves were
            detected which were not in the provided categorical curves list.

    Returns:
        tuple: lists of numerical and categorical columns
    """
    if categorical_curves is None:
        categorical_curves = []
    cat_original: Set[str] = set(categorical_curves)
    # Make sure we are comparing apples with apples. Sometimes cat_original
    # will contain column names that are no longer in the passed df and this
    # will cause a false positive and trigger the first if check below. So
    # ensure that all cols in cat_original are in the df before proceeding.
    cat_original = set([c for c in cat_original if c in df.columns])
    num_cols = set(df.select_dtypes(include="number").columns)
    cat_cols = set(df.columns) - num_cols
    if warn:
        if cat_cols != cat_original:
            extra = cat_original - cat_cols
            if extra:
                warnings.warn(
                    f"Cols {extra} were specified as categorical by user even though"
                    " they are numerical. Note: These column names are the names"
                    " after they have been mapped using the provided mappings.yaml!"
                    " So it could be another column from your original data that"
                    " triggered this warning and instead was mapped to one of the"
                    " names printed above."
                )
            extra = cat_cols - cat_original
            if extra:
                warnings.warn(
                    f"Cols {extra} were identified as categorical and are being"
                    " treated as such. Note: These column names"
                    " are the names after they have been mapped using the provided"
                    " mappings.yaml! So it could be another column from your"
                    " original data that triggered this warning and instead was"
                    " mapped to one of the names printed above."
                )
    cat_cols = cat_original.union(cat_cols)
    # make sure nothing from categorical is in num cols
    num_cols = num_cols - cat_cols
    return list(num_cols), list(cat_cols)
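# Usage sketch (illustrative only; the dataframe and column names are assumptions):
#
#   >>> num_cols, cat_cols = get_col_types(df, categorical_curves=["FORMATION"])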
def wells_split_train_test(
    df: pd.DataFrame, id_column: str, test_size: float, **kwargs
) -> Tuple[List[str], List[str], List[str]]:
    """
    Splits wells into two groups (train and val/test)

    NOTE: Set operations are used to perform the splits so ordering is not
    preserved! The well IDs will be randomly ordered.

    Args:
        df (pd.DataFrame): dataframe with data of wells and well ID
        id_column (str): The name of the column containing well names which
            will be used to perform the split.
        test_size (float): percentage (0-1) of wells to be in val/test data

    Returns:
        wells (list): well IDs
        test_wells (list): well IDs of val/test data
        training_wells (list): well IDs of training data
    """
    wells = set(df[id_column].unique())
    rng: np.random.Generator = np.random.default_rng()
    # Sample without replacement so the requested fraction of wells actually
    # ends up in the test set
    test_wells = set(
        rng.choice(list(wells), int(len(wells) * test_size), replace=False)
    )
    training_wells = wells - test_wells
    return list(wells), list(test_wells), list(training_wells)
def df_split_train_test(
    df: pd.DataFrame,
    id_column: str,
    test_size: float = 0.2,
    test_wells: Optional[List[str]] = None,
    **kwargs,
) -> Tuple[pd.DataFrame, pd.DataFrame, List[str]]:
    """
    Splits dataframe into two groups: train and val/test set.

    Args:
        df (pd.Dataframe): dataframe to split
        id_column (str): The name of the column containing well names which
            will be used to perform the split.
        test_size (float, optional): size of val/test data. Defaults to 0.2.
        test_wells (list, optional): list of wells to be in val/test data.
            Defaults to None.

    Returns:
        tuple: dataframes for train and test sets, and list of test well IDs
    """
    if test_wells is None:
        test_wells = wells_split_train_test(df, id_column, test_size, **kwargs)[1]
        if not test_wells:
            raise ValueError(
                "Not enough wells in your dataset to perform the requested train "
                "test split!"
            )
    df_test = df.loc[df[id_column].isin(test_wells)]
    df_train = df.loc[~df[id_column].isin(test_wells)]
    return df_train, df_test, test_wells
def train_test_split(
    df: pd.DataFrame, target_column: str, id_column: str, **kwargs
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Splits a dataset into training and val/test sets by well (i.e. for an 80-20
    split, the provided dataset would need data from at least 5 wells). This
    function makes use of several other utility functions. The workflow it
    executes is:

        1. Drops rows without labels
        2. Splits into train and test sets using df_split_train_test which in
           turn performs the split via wells_split_train_test

    Args:
        df (pd.DataFrame): dataframe with data
        target_column (str): Name of the target column (y)
        id_column (str): Name of the well ID column. This is used to perform
            the split based on well ID.

    Keyword Args:
        test_size (float, optional): size of val/test data. Defaults to 0.2.
        test_wells (list, optional): list of wells to be in val/test data.
            Defaults to None.
        missing_label_value (str, optional): If nans are denoted differently than
            np.nans, a missing_label_value can be passed as a kwarg and all rows
            containing this missing_label_value in the label column will be dropped

    Returns:
        tuple: dataframes for train and test sets
    """
    df = drop_rows_wo_label(df, target_column, **kwargs)
    df_train, df_test, _ = df_split_train_test(df, id_column, **kwargs)
    return df_train, df_test
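# Usage sketch (illustrative only; the dataframe and column names are assumptions):
#
#   >>> df_train, df_test = train_test_split(
#   ...     df, target_column="LITHOLOGY", id_column="well_name", test_size=0.2
#   ... )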
def feature_target_split(
    df: pd.DataFrame, target_column: str
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Splits set into features and target

    Args:
        df (pd.DataFrame): dataframe to be split
        target_column (str): target column name

    Returns:
        tuple: input (features) and output (target) dataframes
    """
    X = df.loc[:, ~df.columns.isin([target_column])]
    y = df[target_column]
    return X, y
def normalize(
    col: pd.Series, ref_min: float64, ref_max: float64, col_min: float, col_max: float
) -> pd.Series:
    """
    Helper function that applies min-max normalization to a pandas series and
    rescales it to a reference range according to the following formula:

        ref_min + (col - col_min) * (ref_max - ref_min) / (col_max - col_min)

    Args:
        col (pd.Series): column from dataframe to normalize
        ref_min (float): min value of the column of the reference well
        ref_max (float): max value of the column of the reference well
        col_min (float): min value of the column of the well to normalize
        col_max (float): max value of the column of the well to normalize

    Returns:
        pd.Series: normalized series
    """
    diff_ref = ref_max - ref_min
    diff_well = col_max - col_min
    with np.errstate(divide="ignore", invalid="ignore"):
        norm = ref_min + diff_ref * (col - col_min) / diff_well
    return norm
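# Usage sketch (illustrative only): rescaling one well's GR curve to the value
# range of a reference well. The curve name and the ref_min/ref_max variables
# are assumptions:
#
#   >>> gr_norm = normalize(
#   ...     df["GR"], ref_min, ref_max, df["GR"].min(), df["GR"].max()
#   ... )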
# Specifically ignoring complexity for this function because it would not
# make sense to split out the sub components into the utilities module
def get_well_metadata(  # noqa: C901
    client: CogniteClient, well_names: List[str]
) -> Dict[str, Dict[str, Any]]:
    """
    Retrieve relevant well metadata for the provided well_names

    Warning:
        If a well is not found in the asset database, it is not included in the
        returned dictionary. Instead a warning is printed to the console with
        the corresponding well name.

    Metadata retrieved:

        - COMPLETION_DATE
        - COORD_SYSTEM_NAME
        - KB_ELEV
        - KB_ELEV_OUOM
        - PUBLIC
        - SPUD_DATE
        - WATER_DEPTH
        - CDF_wellName
        - WATER_DEPTH_DSDSUNIT
        - X_COORDINATE
        - Y_COORDINATE
        - DATUM_ELEVATION
        - DATUM_ELEVATION_UNIT
        - LATITUDE
        - LONGITUDE

    Args:
        client (CogniteClient): A connected cognite client instance
        well_names (List): The list of well names to retrieve metadata for

    Returns:
        dict: Returns a dictionary where the keys are the well names and the
            values are dictionaries with metadata keys and values.

    Example:
        Example return dictionary::

            {
                '25/10-10': {
                    'COMPLETION_DATE': '2010-04-02T00:00:00',
                    'COORD_SYSTEM_NAME': 'ED50 / UTM zone 31N',
                    'DATUM_ELEVATION': '0.0',
                    ...},
                '25/10-12 ST2': {
                    'COMPLETION_DATE': '2015-01-18T00:00:00',
                    'COORD_SYSTEM_NAME': 'ED50 / UTM zone 31N',
                    'DATUM_ELEVATION': nan,
                    ...},
            }
    """
    relevant_metadata_keys = [
        "WATER_DEPTH",
        "WATER_DEPTH_DSDSUNIT",
        "KB_ELEV",
        "KB_ELEV_OUOM",
        "PUBLIC",
        "Latitude",
        "Longitude",
        "SURFACE_NODE_LATITUDE",
        "SURFACE_NODE_LONGITUDE",
        "COORD_SYSTEM_NAME",
        "X_COORD",
        "X_COORDINATE",
        "Y_COORD",
        "Y_COORDINATE",
        "loc-x",
        "loc-y",
        "x-loc",
        "y-loc",
        "x",
        "y",
        "DATUM_ELEVATION",
        "DATUM_ELEVATION_DSDSUNIT",
        "DATUM_TYPE",
        "datum-elevation",
        "datum-unit",
        "SPUD_DATE",
        "COMPLETION_DATE",
        "WELLBORE_LOCATION_SPATIAL",
    ]
    # The order in which the similar keys are defined will determine which
    # key to choose if there are multiple unique values for similar keys!
    map_similar_keys = {
        "X_COORDINATE": [
            "X_COORDINATE",
            "X_COORD",
            "loc-x",
            "x-loc",
        ],
        "Y_COORDINATE": [
            "Y_COORDINATE",
            "Y_COORD",
            "loc-y",
            "y-loc",
        ],
        "DATUM_ELEVATION": [
            "DATUM_ELEVATION",
            "datum-elevation",
        ],
        "DATUM_ELEVATION_UNIT": [
            "DATUM_ELEVATION_DSDS_UNIT",
            "datum-unit",
        ],
        "LATITUDE": [
            "Latitude",
            "SURFACE_NODE_LATITUDE",
            "y",
        ],
        "LONGITUDE": [
            "Longitude",
            "SURFACE_NODE_LONGITUDE",
            "x",
        ],
    }

    # Helper function to find best match from fuzzy search results
    def _find_best_match(assetlist: AssetList, wellName: str) -> str:
        # Compares only the alphanumerics of the wellName (ie. punctuation removed)
        # If no match is found it returns an empty string
        pat = re.compile(r"[\W_]+")
        for asset in assetlist:
            name: str = asset.name
            if pat.sub("", name) == pat.sub("", wellName):
                return name
        return ""

    # Helper function to retrieve asset with most relevant metadata in the case
    # of multiple matches
    def _merge_assets(assetlist: List[Asset]) -> pd.Series:
        metadata = {}
        for asset in assetlist:
            metadata.update(asset.to_pandas().squeeze().to_dict())
        merged = pd.Series(metadata, name=metadata["name"])
        return merged

    # First retrieve metadata from the Cognite asset API
    meta = []
    for well in well_names:
        try:
            # First try list search
            asset: Union[AssetList, List[Asset], Asset] = client.assets.list(name=well)
            if len(asset) == 0:
                # If the first attempt failed, use fuzzy search to retrieve the
                # proper well name. Find best match based on alphanumeric equality
                wellName = _find_best_match(
                    client.assets.search(name=well, limit=10), well
                )
                if not wellName:
                    raise IndexError
                warnings.warn(
                    f"Could not find a direct match for '{well}' in the CDF Assets"
                    f" database. Closest match found is '{wellName}'. Using the "
                    "metadata from that asset!"
                )
                # Then retrieve asset using list API
                asset = client.assets.list(
                    name=wellName, metadata={"type": "Wellbore"}
                )
            if len(asset) > 1:
                # Sort by time with first element being most recent
                asset = sorted(asset, key=lambda x: x.last_updated_time)
                # Some wells are stored several times as assets. In this case
                # merge them all together to retrieve as much metadata as
                # possible. Where a metadata key occurs in several assets, the
                # value from the most recently updated asset wins.
                series_meta = _merge_assets(asset)
            else:
                asset = asset[0]
                series_meta = asset.to_pandas().squeeze()
                series_meta.name = asset.name
        except IndexError:
            # No match found for the well in the asset database.
            warnings.warn(f"Could not find any metadata for well: {well}")
            continue
        # Filter retrieved series to only relevant keys and save CDF well name
        series_meta = series_meta.loc[
            series_meta.index.intersection(relevant_metadata_keys)
        ].copy()
        series_meta.loc["CDF_wellName"] = series_meta.name
        series_meta.name = well
        meta.append(series_meta)
    cdf_meta = pd.concat(meta, axis=1)

    # Need to handle WELLBORE_LOCATION_SPATIAL specially
    if "WELLBORE_LOCATION_SPATIAL" in cdf_meta.index:
        sub = cdf_meta.loc["WELLBORE_LOCATION_SPATIAL"].dropna().apply(eval).copy()
        restructured = pd.json_normalize(sub)[["x", "y"]]
        restructured.index = sub.index
        restructured = restructured.explode("x").explode("y").T
        cdf_meta = pd.concat([cdf_meta, restructured], axis=0)
        cdf_meta = cdf_meta.loc[~cdf_meta.index.isin(["WELLBORE_LOCATION_SPATIAL"])]

    # Then group mapped keys
    # Helper function for apply operation
    def _apply_function(x: pd.Series, highest_rank_key: List[str]) -> Any:
        unique = x.dropna().unique()
        # Check for multiple unique values per well (float & string)
        if len(unique) > 1:
            # Return the key of highest rank
            return x.loc[highest_rank_key]
        elif len(unique) == 0:
            return np.nan
        else:
            return unique[0]

    for mapping_name, mapping in map_similar_keys.items():
        # filter to relevant mapping
        idx = cdf_meta.index.intersection(mapping)
        if len(idx) == 0:
            # No metadata matching this mapping
            continue
        elif len(idx) == 1:
            # One key matching this mapping so just use its values
            values = cdf_meta.loc[idx].squeeze()
        else:
            # Respect order of similar key mapping
            highest_rank_key = idx[np.argmin([mapping.index(x) for x in idx])]
            values = cdf_meta.loc[idx].apply(
                lambda x: _apply_function(x, highest_rank_key), axis=0
            )
        values.name = mapping_name
        cdf_meta = cdf_meta.loc[~cdf_meta.index.isin(idx)]
        cdf_meta.loc[mapping_name] = values

    metadata_dict: Dict[str, Dict[str, Any]] = cdf_meta.to_dict()
    return metadata_dict
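# Usage sketch (illustrative only; requires a configured CogniteClient with access
# to the well assets, and the well name is taken from the docstring example):
#
#   >>> client = CogniteClient()
#   >>> metadata = get_well_metadata(client, well_names=["25/10-10"])
#   >>> metadata["25/10-10"]["COORD_SYSTEM_NAME"]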
def get_formation_tops(
    well_names: List[str],
    client: CogniteClient,
    **kwargs,
) -> Dict[str, Dict[str, Any]]:
    """
    Retrieves formation tops metadata for a provided list of well names (IDs)
    from CDF and returns them in a dictionary of depth levels and labels per
    well.

    Args:
        well_names (List[str]): A list of well names (IDs)
        client (CogniteClient): A connected instance of the Cognite Client.

    Keyword Args:
        undefined_name (str): Name for undefined formation/group tops.
            Defaults to 'UNKNOWN'

    NOTE: A formation/group is skipped if its top and base depths are equal
    (i.e. it has zero thickness). NPD does not register technical sidetracks,
    so the formation tops information provided by NPD is missing T-labels.

    Returns:
        Dict: Returns a dictionary of formation tops metadata per well in this
            format::

                formation_tops_mapper = {
                    "31/6-6": {
                        "group_labels": ['Nordland Group', 'Hordaland Group', ...],
                        "group_labels_chronostrat": ['Cenozoic', 'Paleogene', ...],
                        "group_levels": [336.0, 531.0, 650.0, ...],
                        "formation_labels": ['Balder Formation', 'Sele Formation', ...],
                        "formation_labels_chronostrat": ['Eocene', 'Paleocene', ...],
                        "formation_levels": [650.0, 798.0, 949.0, ...]
                    }
                    ...
                }

    NOTE: The length of each levels entry equals the length of the corresponding
    labels entry + 1, such that the first label lies between the first and the
    second entries of the corresponding levels list.
    """
    undefined_name: str = kwargs.get("undefined_name", "UNKNOWN")
    formation_tops_mapper = {}
    for well in well_names:
        well_name = well.split("T")[0].strip()
        tops = client.sequences.list(
            metadata={
                "wellbore_name": well_name,
                "type": "FormationTops",
                "source": "NPD",
            }
        )
        if tops is None or len(tops) == 0:
            warnings.warn(
                f"No formation tops information was found for {well}. Skipping it!"
            )
            continue
        rows = tops[0].rows(start=None, end=None).to_pandas()
        rows_groups = rows[rows.Level == "GROUP"].sort_values(["Top_MD", "Base_MD"])
        rows_formations = rows[rows.Level == "FORMATION"].sort_values(
            ["Top_MD", "Base_MD"]
        )
        group_labels: List[str] = []
        chrono_group_labels: List[str] = []
        group_levels: List[float] = []
        formation_labels: List[str] = []
        chrono_formation_labels: List[str] = []
        formation_levels: List[float] = []

        label = undefined_name
        ### Groups ###
        for _, row in rows_groups.iterrows():
            # Skip the group if it has zero thickness
            if row.Top_MD == row.Base_MD:
                continue
            new_label = row.Lithostrat
            new_chrono_label = row.Chronostrat
            if label == new_label or new_label.lower().startswith("undefined"):
                # merge levels
                group_levels = group_levels[:-1]
                group_levels.append(row.Base_MD)
            else:
                try:
                    if row.Top_MD != group_levels[-1]:
                        # groups not continuous
                        group_labels.append(undefined_name)
                        chrono_group_labels.append(undefined_name)
                        group_levels.extend([group_levels[-1], row.Top_MD])
                except Exception:
                    pass
                label = new_label
                chrono_label = new_chrono_label
                group_labels.append(label)
                chrono_group_labels.append(chrono_label)
                group_levels.extend([row.Top_MD, row.Base_MD])
        group_levels = list(dict.fromkeys(group_levels))
        assert len(chrono_group_labels) == len(
            group_labels
        ), "Chronostrat labels not consistent with groups"

        ### Formations ###
        label = undefined_name
        for _, row in rows_formations.iterrows():
            # Skip the formation if it has zero thickness
            if row.Top_MD == row.Base_MD:
                continue
            new_label = row.Lithostrat
            new_chrono_label = row.Chronostrat
            if label == new_label or new_label.lower().startswith("undefined"):
                # merge levels
                formation_levels = formation_levels[:-1]
                formation_levels.append(row.Base_MD)
            else:
                try:
                    if row.Top_MD != formation_levels[-1]:
                        # formations not continuous
                        formation_labels.append(undefined_name)
                        chrono_formation_labels.append(undefined_name)
                        formation_levels.extend([formation_levels[-1], row.Top_MD])
                except Exception:
                    pass
                label = new_label
                chrono_label = new_chrono_label
                formation_labels.append(label)
                chrono_formation_labels.append(chrono_label)
                formation_levels.extend([row.Top_MD, row.Base_MD])
        formation_levels = list(dict.fromkeys(formation_levels))
        assert len(chrono_formation_labels) == len(
            formation_labels
        ), "Chronostrat labels not consistent with formations"

        formation_tops_mapper[well] = {
            "group_labels": group_labels,
            "group_labels_chronostrat": chrono_group_labels,
            "group_levels": group_levels,
            "formation_labels": formation_labels,
            "formation_labels_chronostrat": chrono_formation_labels,
            "formation_levels": formation_levels,
        }
    return formation_tops_mapper
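# Usage sketch (illustrative only; requires a configured CogniteClient `client`,
# and the well name and labels are taken from the docstring example):
#
#   >>> tops = get_formation_tops(["31/6-6"], client)
#   >>> tops["31/6-6"]["group_labels"][:2]
#   ['Nordland Group', 'Hordaland Group']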
def get_vertical_depths(
    well_names: List[str],
    client: CogniteClient,
) -> Dict[str, Dict[str, List[float]]]:
    """Makes trajectory queries to CDF for all provided wells and extracts
    vertical and measured depths. Further down the pipeline these depths are
    used to interpolate the vertical depths along the entire wellbore.

    Args:
        well_names (List[str]): list of well names
        client (CogniteClient): cognite client

    Returns:
        Dict[str, Dict[str, List[float]]]: Dictionary containing vertical and
            measured depths (values) for each well (keys). Wells whose
            trajectory query to CDF comes back empty are skipped with a
            warning.
    """
    vertical_depths_mapper = {}
    for well in well_names:
        well_data_cdf = client.sequences.list(
            metadata={"wellbore_name": well, "type": "trajectory"}, limit=None
        )
        if len(well_data_cdf) == 0:
            warnings.warn(
                f"No trajectory information was found for {well}. Skipping it!"
            )
            continue
        well_df_discrete = client.sequences.data.retrieve_dataframe(
            id=well_data_cdf[0].id, start=None, end=None
        )
        if len(well_df_discrete) == 0:
            warnings.warn(
                f"No trajectory information was found for {well}. Skipping it!"
            )
            continue
        well_df_discrete = well_df_discrete.drop_duplicates()
        md_query = well_df_discrete["MD"].to_list()
        tvdkb_query = well_df_discrete["TVDKB"].to_list()
        tvdss_query = well_df_discrete["TVDSS"].to_list()
        tvdss_query = [-x for x in tvdss_query]
        tvdbml_query = well_df_discrete["TVDBML"].to_list()
        vertical_dict_well = {
            "TVDKB": tvdkb_query,
            "TVDSS": tvdss_query,
            "TVDBML": tvdbml_query,
            "MD": md_query,
        }
        vertical_depths_mapper[well] = vertical_dict_well
    return vertical_depths_mapper
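# Usage sketch (illustrative only; requires a configured CogniteClient `client`,
# and the well name is reused from the example above):
#
#   >>> depths = get_vertical_depths(["31/6-6"], client)
#   >>> md, tvdkb = depths["31/6-6"]["MD"], depths["31/6-6"]["TVDKB"]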
def get_calibration_map(
    df: pd.DataFrame,
    curves: List[str],
    location_curves: List[str],
    mode: str,
    id_column: str,
    levels: Optional[List[str]] = None,
    standardize_level_names: bool = True,
) -> Dict[str, pd.DataFrame]:
    """
    Returns calibration maps for each level (typically formation and group),
    per well. Calibration maps are pandas dataframes with the well name and
    unique values for each curve and location, where the value is the chosen
    "mode", such as mean, median, mode, etc, specified by the user.

    Useful for the functions preprocessors.apply_calibration() and
    imputers.fillna_callibration_values().

    Args:
        df (pd.DataFrame): dataframe with wells data
        curves (List[str]): list of curves to fetch unique values for
        location_curves (List[str]): list of curves indicating the location of
            a well/formation/group. Typically latitude, longitude, tvdbml, depth
        mode (str): any method supported by a pandas dataframe for representing
            the curve, such as median, mean, mode, min, max, etc.
        id_column (str): column with well names
        levels (List[str], optional): how to group samples in a well, typically
            per group or formation. Defaults to ["FORMATION", "GROUP"].
        standardize_level_names (bool, optional): whether to standardize
            formation or group names. Defaults to True.

    Returns:
        Dict[str, pd.DataFrame]: dictionary with keys being the level and values
            being the calibration map in dataframe format
    """
    if levels is None:
        levels = ["FORMATION", "GROUP"]
    missing_curves = [
        c
        for c in curves + location_curves + levels + [id_column]
        if c not in df.columns
    ]
    if len(missing_curves) > 0:
        raise ValueError(f"Missing necessary curves in dataframe: {missing_curves}")
    if standardize_level_names and any(c in ["FORMATION", "GROUP"] for c in levels):
        for level in levels:
            df[level] = df[level].apply(standardize_group_formation_name)
    level_tables = {level: None for level in levels}
    for level in levels:
        data = []
        for w in df[id_column].unique():
            df_well = df[df[id_column] == w]
            for g, s in df_well.groupby(level, dropna=True):
                new_row = [
                    w,
                    g,
                    *getattr(
                        s[curves + location_curves].dropna(how="all"), mode
                    )().values,
                ]
                data.append(new_row)
        level_tables[level] = pd.DataFrame(
            data, columns=["well_name", level, *curves, *location_curves]
        )
    return level_tables
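# Usage sketch (illustrative only; the dataframe, curve names and column names
# are assumptions):
#
#   >>> maps = get_calibration_map(
#   ...     df,
#   ...     curves=["GR", "RHOB"],
#   ...     location_curves=["X_COORDINATE", "Y_COORDINATE"],
#   ...     mode="median",
#   ...     id_column="well_name",
#   ... )
#   >>> formation_map = maps["FORMATION"]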
def get_calibration_values(
    df: pd.DataFrame,
    curves: List[str],
    location_curves: List[str],
    level: str,
    mode: str,
    id_column: str,
    distance_thres: float = 99999.0,
    calibration_map: Optional[pd.DataFrame] = None,
    standardize_level_names: bool = True,
) -> Dict[str, pd.DataFrame]:
    """
    Gets the calibration values per well and fills na values (if any) from the
    calibration maps of close-by wells.

    Args:
        df (pd.DataFrame): dataframe
        curves (List[str]): list of curves to take into account for maps
        location_curves (List[str]): which curves to consider when calculating
            the distance between wells
        level (str): how to group samples in a well, typically per group or
            formation
        mode (str): any method supported by a pandas dataframe for representing
            the curve, such as median, mean, mode, min, max, etc.
        id_column (str): column with well names
        distance_thres (float, optional): threshold above which a well is
            considered too far away to count as a close-by well. Defaults to
            99999.0.
        calibration_map (pd.DataFrame, optional): calibration map for the
            level. Defaults to None.
        standardize_level_names (bool, optional): whether to standardize
            formation or group names. Defaults to True.

    Returns:
        Dict[str, pd.DataFrame]: dictionary with well names as keys and, per
            well, a dataframe of calibration values for each curve per level
            entry, with missing values filled from the closest wells where
            possible
    """
    missing_curves = [
        c for c in curves + location_curves + [level, id_column] if c not in df.columns
    ]
    if len(missing_curves) > 0:
        raise ValueError(f"Missing necessary curves in dataframe: {missing_curves}")
    if standardize_level_names and level in ["FORMATION", "GROUP"]:
        df[level] = df[level].apply(standardize_group_formation_name)

    # get closest wells based on location curves
    def get_closest_wells(
        w_name: str,
        well_measures: pd.DataFrame,
        location_curves: List[str],
        calib_map: pd.DataFrame,
        distance_thres: float,
    ) -> Any:
        non_nan_cols = well_measures[location_curves].dropna().index.tolist()
        well_location = well_measures[non_nan_cols].values
        nona_rows = calib_map[non_nan_cols].dropna().index
        calib_locations = calib_map.loc[nona_rows, :][non_nan_cols].values
        if len(non_nan_cols) < len(location_curves):
            if len(non_nan_cols) == 0:
                warnings.warn(
                    f"There are no valid values for {location_curves} "
                    f"in well {w_name} for {well_measures.name}."
                )
                return []
            warnings.warn(
                f"Distance was calculated only with the following features "
                f"{non_nan_cols}, as the rest was missing in well {w_name} "
                f"for {well_measures.name}."
            )
        # distance between well and all others:
        calib_map = calib_map.loc[nona_rows, :]
        calib_map["distance"] = np.linalg.norm(
            np.repeat([well_location], repeats=len(calib_map), axis=0)
            - calib_locations,
            axis=1,
        )
        calib_map = calib_map.loc[calib_map["distance"] <= distance_thres, :]
        # TODO: For now only returning the 10 closest wells if more than 10 are
        # within the distance threshold. Might need to change this to a radius
        # based approach (maybe)
        closest_wells = calib_map.sort_values(by="distance")[id_column][:10]
        return closest_wells.tolist()

    # either get calibration from cdf if None or work on given map
    if calibration_map is None:
        # TODO get calibration map from CDF
        raise ValueError("Getting calibration map from CDF not yet implemented!")

    well_values = {}
    for well in df[id_column].unique():
        df_well = df[df[id_column] == well]
        well_properties: Union[Dict[str, float], pd.DataFrame] = {
            g: getattr(v[curves + location_curves].dropna(how="all"), mode)()
            for g, v in df_well.groupby(level)
        }
        well_properties = pd.DataFrame.from_dict(well_properties, orient="index")
        if well_properties.empty:
            warnings.warn(f"Well {well} could not be processed (all NaN)!")
            continue
        # go through each level value, and find closest well
        for i in well_properties.index:
            if not any(well_properties.loc[i, curves].isna()):
                continue
            mask = (calibration_map[id_column] != well) & (calibration_map[level] == i)
            tmp_calib_map = calibration_map.loc[mask, :].copy()
            if not len(tmp_calib_map) > 0:
                continue
            closest_wells = get_closest_wells(
                well,
                well_properties.loc[i],
                location_curves,
                tmp_calib_map,
                distance_thres,
            )
            cwells_map = tmp_calib_map[tmp_calib_map[id_column].isin(closest_wells)]
            if len(closest_wells) == 0:
                continue
            for c in closest_wells:
                well_properties.update(
                    cwells_map.loc[
                        cwells_map[id_column] == c, [level, *curves]
                    ].set_index(level),
                    overwrite=False,
                )
                if all(well_properties.loc[i, curves].notna()):
                    break
        well_values[well] = well_properties
    return well_values
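# Usage sketch (illustrative only; builds on the calibration map produced by
# get_calibration_map above, and all dataframe, curve and column names are
# assumptions):
#
#   >>> values = get_calibration_values(
#   ...     df,
#   ...     curves=["GR", "RHOB"],
#   ...     location_curves=["X_COORDINATE", "Y_COORDINATE"],
#   ...     level="FORMATION",
#   ...     mode="median",
#   ...     id_column="well_name",
#   ...     calibration_map=maps["FORMATION"],
#   ... )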
### VSH HELPERS ###
def get_violation_indices(mask: pd.Series) -> pd.DataFrame:
    """Helper function to retrieve the indices where a mask series is True

    Args:
        mask (pd.Series): The mask series to retrieve True indices of

    Returns:
        pd.DataFrame: A dataframe with the columns ["first", "last"] denoting
            the start and end indices of each block of True values in the
            passed mask.
    """
    counter = (mask != mask.shift(1)).cumsum()
    indices = (
        counter.index.to_series().groupby(counter, sort=False).agg(["first", "last"])
    )
    values = (
        mask.groupby(counter).unique().apply(lambda x: x[0] if len(x) == 1 else np.nan)
    )
    if values.isna().any():
        raise ValueError("More than one unique value found in one of the mask groups!")
    return indices.loc[values]
def inflection_points(
    df: pd.DataFrame, curveName: str, before: int, after: int
) -> Tuple[int, int]:
    """Helper function for identifying the first inflection point in a curve
    before and after certain indices.

    Args:
        df (pd.DataFrame): The dataframe containing the specified curveName.
        curveName (str): The curve for which to detect inflection points.
        before (int): The index before which inflection points should be detected
        after (int): The index after which inflection points should be detected

    Returns:
        tuple(int, int): The first inflection point in the curve before the
            before index and after the after index. If no inflection point is
            found on one side, np.nan is returned for that side. If no
            inflection point is found either before the before index or after
            the after index, a ValueError is raised.
    """
    before_df = df.loc[:before, curveName][::-1]
    after_df = df.loc[after:, curveName]
    inflection_points = {"before": before_df, "after": after_df}
    for series_name, series in inflection_points.items():
        try:
            with np.errstate(invalid="ignore"):
                first_inflection_point = (
                    np.where(
                        np.diff(
                            pd.Series(np.sign(np.gradient(series)))
                            .replace(0, np.nan)
                            .fillna(method="ffill")
                        )
                        != 0
                    )[0]
                    + 1
                )[0]
            first_inflection_point = series.index[first_inflection_point]
        except ValueError:
            # Not enough data points to take a gradient
            # Just return the only index in the series
            first_inflection_point = series.index[0]
        except IndexError:
            # No inflection point in provided series
            first_inflection_point = np.nan
        inflection_points[series_name] = first_inflection_point
    if np.isnan(list(inflection_points.values())).all():
        raise ValueError("No inflection points found before or after!")
    return inflection_points["before"], inflection_points["after"]
def calculate_sampling_rate(array, max_sampling_rate=1):
    """
    Calculates the sampling rate of an array by calculating the weighted
    average diff between the array's values.

    Args:
        array (pd.Series): The array for which the sampling rate should be
            calculated
        max_sampling_rate: The maximum acceptable sampling rate above which the
            calculated sampling rates should not be included in the weighted
            average calculation (defined in samples/unit length e.g. m).
            Defaults to max 1 sample per m (where m is the assumed unit of the
            provided array)

    Returns:
        float: The weighted average sampling rate of the provided array
    """
    if array.empty or np.isnan(array).all():
        raise ValueError(
            "The provided array is empty or contains only NaNs! "
            "Cannot calculate sampling rate!"
        )
    diffs = pd.Series(np.diff(array.values)).value_counts(normalize=True)
    # Ensure big holes in the index don't affect the weighted average
    # Assuming 1 is a good enough threshold for now
    diffs.loc[diffs.index.to_series().gt(max_sampling_rate)] = np.nan
    sampling_rate = (diffs * diffs.index).sum()
    return sampling_rate
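# Usage sketch (illustrative only; assumes a dataframe `df` with a "DEPTH" column
# in metres):
#
#   >>> rate = calculate_sampling_rate(df["DEPTH"])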