Source code for sensortoolkit.qc._persistent_values

# -*- coding: utf-8 -*-
"""
This module contains a method for flagging consecutive data values where the
recorded value repeats multiple times.

================================================================================

@Author:
  | Samuel Frederick, NSSC Contractor (ORAU)
  | U.S. EPA / ORD / CEMM / AMCD / SFSB

Created:
  Tue Aug 17 15:24:31 2021
Last Updated:
  Tue Aug 17 15:24:31 2021
"""
import pandas as pd
import numpy as np


def persistent_values(df, param, tolerance=3, freq='H', invalidate=False):
    """Flag runs of consecutive, identical parameter values.

    A run is a sequence of adjacent rows whose ``<param>_Value`` entries are
    exactly equal (first difference of zero). Every row belonging to a run of
    at least ``tolerance`` entries is flagged by writing ``'persist'`` to the
    ``<param>_QAQC_Code`` column. Missing values (NaN) never form part of a
    run. The passed DataFrame is modified in place and also returned.

    Args:
        df (pandas DataFrame):
            Dataset containing parameter data to check for repeating values.
            Must contain the column ``<param>_Value``. The column
            ``<param>_QAQC_Code`` is created (filled with NaN) if absent.
        param (str):
            The name of the parameter to check for repeating values.
        tolerance (int, optional):
            The minimum number of consecutive identical entries required for
            the entries to be flagged. Values below 2 are treated as 2, since
            a single entry cannot repeat. Defaults to 3.
        freq (str, optional):
            The sampling frequency or averaging interval of the passed
            dataset, expressed as a pandas offset alias (see a list here
            https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases).
            Not used by this function; retained so existing callers keep
            working. Defaults to 'H' for 1-hour averaged datasets.
        invalidate (bool, optional):
            If truthy, flagged entries in ``<param>_Value`` are set null
            (np.nan). Defaults to False.

    Returns:
        df (pandas DataFrame):
            The same DataFrame object, with flagged entries for repeated
            data entries.

    """
    value_col = param + '_Value'
    code_col = param + '_QAQC_Code'

    if code_col not in df:
        df[code_col] = np.nan

    # A row repeats its predecessor when the first difference is exactly
    # zero. NaN differences compare unequal to zero, so gaps break runs.
    repeats_previous = df[value_col].diff().eq(0)

    # Each row that does NOT repeat its predecessor opens a new run, so the
    # running count of such rows labels every row with its run number.
    run_id = (~repeats_previous).cumsum()
    run_length = run_id.map(run_id.value_counts())

    # A run needs at least two members to be a repetition at all.
    min_run = max(tolerance, 2)
    # Positional mask so duplicate index labels cannot widen the selection.
    flagged = (run_length >= min_run).to_numpy()

    if not flagged.any():
        print('..no persistant values found for ' + param)
        return df

    print('..flagging persistant values for ' + param)

    # The placeholder column is float NaN; cast before writing a string flag
    # so pandas is not asked to silently upcast on assignment.
    if pd.api.types.is_numeric_dtype(df[code_col]):
        df[code_col] = df[code_col].astype(object)

    df.loc[flagged, code_col] = 'persist'  # temporary flag

    if invalidate:
        df.loc[flagged, value_col] = np.nan

    return df