# Source code for sensortoolkit.calculate._cv

# -*- coding: utf-8 -*-
"""
This module computes the coefficient of variation (CV), a measure of precision,
for collocated sensors recording data concurrently.

.. note::

    CV as defined by U.S. EPA's Performance Targets Reports is measured for
    periods where all sensors in the evaluation group are measuring
    concurrently.

    For example, if three sensors are included in a deployment group,
    the CV will only be calculated for periods where all three sensors are operating
    normally and recording values simultaneously.

U.S. EPA's Performance Targets Reports calculate CV as

.. math::

    CV = \\frac{SD}{\\bar{x}}\\times 100

where :math:`\\bar{x}` is the deployment averaged sensor concentration for a
field test, and :math:`SD`, the standard deviation, is defined as

.. math::

    SD = \\sqrt{\\frac{1}{(N\\times M)-1}\\sum_{j=1}^{M}\\left[
    \\sum_{i=1}^{N}(x_{ij} - \\bar{x_i})^2\\right]}

and where:

    :math:`M` = number of identical sensors operated simultaneously during a
    field test

    :math:`N` = number of measurement intervals during which all identical
    instruments are operating and returning valid averages over the duration of
    the field test

    :math:`x_{ij}` = Sensor concentration for measurement interval :math:`i` and
    sensor unit :math:`j`.

    :math:`\\bar{x_i}` = Intersensor average sensor concentration for
    measurement interval :math:`i`. **All sensor units deployed for testing**
    **must have recorded non-null values for measurement interval** :math:`i`
    **to compute** :math:`\\bar{x_i}`.

================================================================================

@Author:
  | Samuel Frederick, NSSC Contractor (ORAU)
  | U.S. EPA / ORD / CEMM / AMCD / SFSB

Created:
  Wed Jan 29 10:03:27 2020
Last Updated:
  Tue Jul 13 09:21:40 2021
"""
import numpy as np
import pandas as pd
from sensortoolkit.datetime_utils import deploy_timestamp_index


def _calculate_cv(cv_df, sensor_numbers, param):
    """Compute CV for a group of collocated, concurrently recording sensors.

    The calculation only uses rows (timestamps) for which every remaining
    column holds a non-null value. Columns that are entirely empty are
    discarded before rows are filtered, so a sensor that never reported is
    left out instead of eliminating every row.

    Args:
        cv_df (pandas DataFrame):
            Dataframe with parameter concentration values, used to
            calculate CV. Expected to hold one column per sensor named
            ``'<sensor number>_<param>'``. The passed dataframe is not
            modified; a copy is used internally.
        sensor_numbers (iterable):
            Identifiers for sensors in the deployment group. Each is
            converted with ``str()`` to build the column names.
        param (str):
            Parameter name to evaluate

    Returns:
        (tuple): four-element tuple containing:

            - cv_df (pandas DataFrame): Copy of the input limited to rows
              where all sensors measured concurrently, with added columns
              for the intersensor mean and each sensor's squared deviation
              from that mean.
            - CV (float): The coefficient of variation of concurrent sensor
              measurements. Calculated as the 100*(standard deviation / mean
              of all concurrent sensor measurements). ``nan`` if there are
              no concurrent measurements.
            - st_dev (float): The standard deviation of concurrent sensor
              measurements. ``nan`` if fewer than two values are available.
            - n_concurr (int): Number of concurrent intervals with all
              sensors reporting pollutant values.

    """
    # Work on a copy so the helper columns added below never leak into the
    # caller's dataframe (which is typically a slice of a larger frame).
    cv_df = cv_df.copy()

    # Describe the averaging interval for the log message. Fall back to a
    # generic label rather than failing when the interval is not 1-hour or
    # 24-hour, or when there are too few rows to infer it.
    time_interval = "unknown-interval"
    if len(cv_df.index) > 1:
        step = cv_df.index[1] - cv_df.index[0]
        if step == pd.Timedelta('0 days 01:00:00'):
            time_interval = "1-Hour"
        elif step == pd.Timedelta('1 days 00:00:00'):
            time_interval = "24-Hour"
        else:
            time_interval = str(step)

    print("Computing CV for " + time_interval + " averaged " + param)

    # Intersensor mean per timestamp (x-bar_i in the module docstring).
    mean_col = param + '_sensor_mean'
    cv_df[mean_col] = cv_df.mean(axis=1)

    # Squared deviation of each sensor from the intersensor mean.
    for sensor_number in sensor_numbers:
        sensor_id = str(sensor_number)
        cv_df[sensor_id + '_val-avg_val_sqrd'] = \
            (cv_df[sensor_id + '_' + param] - cv_df[mean_col])**2

    before_drop_n = len(cv_df)
    cv_df = cv_df.dropna(axis=1, how='all')  # drop empty columns
    cv_df = cv_df.dropna(how='any')  # drop rows with any missing data values
    after_drop_n = len(cv_df)
    n_excluded = before_drop_n - after_drop_n

    print("..N excluded:", n_excluded, "out of", before_drop_n, "total")
    print("..N concurrent:", after_drop_n)

    if after_drop_n == 0:
        # Nothing left to evaluate; report undefined statistics instead of
        # failing on the empty index below.
        print("..No concurrent measurements, CV can not be computed")
        return cv_df, np.nan, np.nan, 0

    print("..Concurrent measurement timeframe:", cv_df.index[0], '-',
          cv_df.index[-1])

    sum_sqrd_diff = 0
    total_n_vals = 0
    pollutant_avg = 0
    n_sensors = 0
    for sensor_n in sensor_numbers:
        sqrd_col = str(sensor_n) + '_val-avg_val_sqrd'
        value_col = str(sensor_n) + '_' + param
        # Columns of sensors without any data were dropped above.
        if sqrd_col not in cv_df.columns or value_col not in cv_df.columns:
            continue
        sum_sqrd_diff += cv_df[sqrd_col].sum()
        total_n_vals += cv_df[sqrd_col].count()
        pollutant_avg += cv_df[value_col].mean()
        n_sensors += 1

    if n_sensors == 0:
        # No sensor columns survived; avoid dividing by zero.
        return cv_df, np.nan, np.nan, 0

    # number of concurrent intervals with all sensors reporting values
    n_concurr = total_n_vals / n_sensors

    pollutant_avg /= n_sensors

    # SD uses (N x M) - 1 degrees of freedom; undefined for a single value.
    if total_n_vals > 1:
        st_dev = np.sqrt(sum_sqrd_diff/(total_n_vals - 1))
    else:
        st_dev = np.nan
    CV = (st_dev / pollutant_avg)*100  # CV reported in percentage

    return cv_df, CV, st_dev, n_concurr


def cv(df_list, deploy_dict, param=None, return_deploy_dict=True):
    """Compute CV for set of sensor dataframes and indicated parameter.

    Loops over the unique deployment groups and constructs a dataframe of
    concurrently recorded sensor measurements which is passed to
    ``_calculate_cv`` to determine CV.

    Args:
        df_list (list):
            List of sensor dataframes. The dataframe at position ``i`` is
            treated as sensor number ``i + 1`` and must contain a
            ``'<param>_Value'`` column.
        deploy_dict (dict):
            A dictionary containing descriptive statistics and textual
            information about the deployment (testing agency, site, time
            period, etc.), sensors tested, and site conditions during the
            evaluation. Updated in place with the precision statistics.
        param (str):
            Parameter name to evaluate
        return_deploy_dict (bool):
            If true, return modified deployment dictionary with precision
            statistics (CV, standard deviation, N concurrent datapoints
            across all sensors).

    Returns:
        If ``return_deploy_dict`` is ``True``, return ``deploy_dict`` with
        updated precision statistics, else return ``CV`` (float) of the last
        deployment group processed (``nan`` if there are no groups).

    Raises:
        TypeError: If ``param`` is ``None``.

    """
    if param is None:
        # Column names are built by string concatenation with param below.
        raise TypeError('param must be a parameter name (str), not None')

    date_index, avg_suffix = deploy_timestamp_index(df_list,
                                                    averaging_suffix=True)

    cv_df = pd.DataFrame(index=date_index)

    # Collect each sensor's values in a column named '<number>_<param>'.
    for sensor_number, df in enumerate(df_list, start=1):
        try:
            cv_df[str(sensor_number)+'_'+param] = df[param + '_Value']
        except KeyError as param_not_found:
            print('Parameter name not found in passed dataframes:',
                  param_not_found)
            continue

    # Defined up front so the final return is valid even if the deployment
    # dictionary lists no groups.
    CV = np.nan

    for group in deploy_dict['Deployment Groups']:
        deploy = deploy_dict['Deployment Groups'][group]
        deploy_sensor_nums = list(deploy['sensors'].keys())

        # Create the nested statistics containers on first use.
        stats_loc = deploy.setdefault(param, {}).setdefault('Precision', {})

        start = date_index.min().floor(freq='H')
        end = date_index.max().ceil(freq='H')

        serials = {str(n): deploy['sensors'][str(n)]['serial_id']
                   for n in deploy['sensors']}

        # Check if issues with individual sensors during deployment, remove
        # from serial dictionary and sensor number list used to build the
        # CV dataframe.
        for n in deploy['sensors']:
            if deploy['sensors'][n]['deploy_issues'] == 'True':
                serials.pop(n)
                deploy_sensor_nums.remove(n)
                print('Sensor', n, 'indicates issues during deployment')
                print('Excluding sensor', n, 'from CV calculation')

        # Set analysis dataframe for computing CV. Copy the selection since
        # the helper adds columns to the dataframe it receives.
        deploy_cols = [n + '_' + param for n in deploy_sensor_nums]
        deploy_cv_df = cv_df.loc[start:end, deploy_cols].copy()

        deploy_cv_df, CV, st_dev, n_concurr = _calculate_cv(deploy_cv_df,
                                                            serials,
                                                            param=param)

        stats_loc['cv' + avg_suffix] = float("{0:.3f}".format(CV))
        stats_loc['std' + avg_suffix] = float("{0:.3f}".format(st_dev))
        stats_loc['n' + avg_suffix] = int(n_concurr)

    if return_deploy_dict is True:
        return deploy_dict

    return CV