Source code for sensortoolkit.datetime_utils._time_averaging

# -*- coding: utf-8 -*-
"""
This module computes 1-hour and 24-hour averaged datasets from sensor and
FRM/FEM datasets recorded at their original sampling frequency.

U.S. EPA's Performance Targets Reports stipulate a 75% data completeness
requirement for each averaging interval. For example, a
:math:`PM_{2.5}` sensor recording concentration measurements every hour would
require a minimum of 18 valid measurements to calculate a valid 24-hour
averaged concentration [i.e., (18/24) * 100% = 75%].
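
As a minimal sketch of this check (the values below are illustrative and not
tied to any particular dataset), the completeness criterion reduces to
comparing the count of valid measurements against the expected count before
averaging:

.. code-block:: python

    import numpy as np

    expected = 24  # hourly measurements expected per 24-hour interval
    values = [8.1, 7.9, 8.4, 8.2] + [8.0]*14 + [np.nan]*6  # 18 valid
    n_valid = int(np.count_nonzero(~np.isnan(values)))
    # (18/24) = 0.75, so the 24-hour average is considered valid
    daily_avg = np.nanmean(values) if n_valid / expected >= 0.75 else np.nan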

U.S. EPA's Performance Targets Reports calculate averages as

.. math::

    x_{kpj} = \\frac{1}{n}\\sum_{i=1}^{n}c_{ij}

where:

    :math:`x_{kpj}` = 1-hour or 24-hour averaged measurement k for hour/day p
    and instrument j

    :math:`n` = number of instrument measurements per averaging interval

    :math:`c_{ij}` = measurement from instrument j for time i of the averaging
    interval
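
Expressed with pandas (a sketch only; the series construction below is
illustrative), each averaged value is the mean of the :math:`n` measurements
falling within one averaging interval:

.. code-block:: python

    import pandas as pd

    idx = pd.date_range('2021-01-01', periods=120, freq='T')
    c = pd.Series(range(120), index=idx, dtype=float)  # 1-minute data
    # n = 60 measurements per 1-hour interval; each element of x is
    # (1/60) * sum of the c values within that hour
    x = c.groupby(pd.Grouper(freq='H')).mean()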

================================================================================

@Author:
  | Samuel Frederick, NSSC Contractor (ORAU)
  | U.S. EPA / ORD / CEMM / AMCD / SFSB

Created:
  Wed Oct 21 14:46:27 2020
Last Updated:
  Tue Jul 13 16:32:44 2021
"""
import os
import pandas as pd
import numpy as np
from sensortoolkit.qc._duplicate_removal import remove_duplicates


def sensor_averaging(full_df_list, sensor_serials=None, name='',
                     write_to_file=True, path=None, **kwargs):
    """Write full (recorded), hourly, and daily averaged datasets to csv.

    Wrapper function for computing hourly and daily averaged DataFrames.

    Args:
        full_df_list (list):
            List of sensor DataFrames at original recorded sampling
            frequency.
        sensor_serials (dict):
            A dictionary of unique serial identifiers for each sensor in
            the testing group.
        name (str):
            The make and model of the sensor being evaluated.
        write_to_file (bool):
            If true, datasets will be written to the path for data at
            original recorded sampling frequency (files ending in
            '_full.csv'), 1-hour averaged datasets (files ending in
            '_hourly.csv'), and 24-hour averaged datasets (files ending in
            '_daily.csv').
        path (str):
            The full directory path to processed sensor data for a given
            sensor make and model.

    **Keyword Arguments:**

    :param float threshold:
        The completeness threshold for averaging datasets to 1-hour or
        24-hour intervals. Defaults to 75% (``0.75``).

    Returns:
        (tuple): Two-element tuple containing:

        - **hourly_df_list** (*list of pandas DataFrames*): List of sensor
          DataFrames of length N (where N is the number of sensor units in
          a testing group), indexed by DateTime at 1-hour averaged
          sampling frequency.
        - **daily_df_list** (*list of pandas DataFrames*): List of sensor
          DataFrames of length N (where N is the number of sensor units in
          a testing group), indexed by DateTime at 24-hour averaged
          sampling frequency.

    """
    print('Averaging datasets to 1-hour and 24-hour intervals:')

    hourly_df_list, daily_df_list = [], []

    if sensor_serials is None:
        n_sensors = len(full_df_list)
        sensor_serials = {i: 'Sensor ' + str(i)
                          for i in range(1, n_sensors + 1)}

    # Loop over each recorded sensor dataset and compute hourly and daily
    # averages
    for full_df, sensor_n in zip(full_df_list, sensor_serials):
        serial_id = sensor_serials[sensor_n]

        full_df = remove_duplicates(full_df, agg_numeric_by='mean',
                                    agg_object_by='first', print_indent=2)

        # Compute timedelta between successive timestamps
        delta = (full_df.index[1:] - full_df.index[0:-1]).to_frame()
        if delta.index.name is None:
            delta.index.name = 'DateTime'
        idx_name = delta.index.name

        # Use the mode of the timedeltas to estimate the number of data
        # points recorded per hour and per day
        time_delta = delta[idx_name].mode()[0]
        hr_count = pd.to_timedelta(1, unit='H') / time_delta
        day_count = pd.to_timedelta(1, unit='D') / time_delta

        # Use a 75% completeness threshold unless otherwise specified
        hr_thres = kwargs.get('threshold', 0.75)
        day_thres = kwargs.get('threshold', 0.75)

        # Print the mode of the sampling interval for recorded sensor data
        # and the number of counts within each hour interval.
        print('..{0:s} recording interval mode: {1:s}, '
              '{2:4.1f} counts per hour'.format(serial_id, str(time_delta),
                                                hr_count))

        hourly_df = interval_averaging(full_df, freq='H',
                                       interval_count=hr_count,
                                       thres=hr_thres)
        if full_df.attrs != {} and hourly_df.attrs == {}:
            hourly_df.attrs = full_df.attrs

        daily_df = interval_averaging(full_df, freq='D',
                                      interval_count=day_count,
                                      thres=day_thres)
        if full_df.attrs != {} and daily_df.attrs == {}:
            daily_df.attrs = full_df.attrs

        hourly_df_list.append(hourly_df)
        daily_df_list.append(daily_df)

        if write_to_file is True:
            print('....writing full, hourly, and daily datasets to '
                  '.csv files')

            # Check if the sensor-specific subfolder exists
            if not os.path.exists(path):
                os.makedirs(path)

            # Convert the index to ISO 8601 formatted strings. Copies are
            # used because the conversion changes the index dtype from
            # datetime64 to object, and modifying the original DataFrames
            # in place could break subsequent date/time-based indexing.
            full_cp = full_df.copy()
            full_cp.index = full_cp.index.to_series().apply(
                pd.Timestamp.isoformat)
            hourly_cp = hourly_df.copy()
            hourly_cp.index = hourly_cp.index.to_series().apply(
                pd.Timestamp.isoformat)
            daily_cp = daily_df.copy()
            daily_cp.index = daily_cp.index.to_series().apply(
                pd.Timestamp.isoformat)

            full_cp.to_csv(path + name + '_' + serial_id + '_full.csv')
            hourly_cp.to_csv(path + name + '_' + serial_id + '_hourly.csv')
            daily_cp.to_csv(path + name + '_' + serial_id + '_daily.csv')

    return hourly_df_list, daily_df_list
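

# A minimal usage sketch for sensor_averaging(). The synthetic data, sensor
# name, and serial ID below are illustrative assumptions, not part of
# sensortoolkit's documented workflow; the 'T'/'H' frequency aliases match
# the module's own usage.
def _sensor_averaging_example():
    """Illustrative only: average one synthetic 1-minute dataset."""
    idx = pd.date_range('2021-01-01', periods=2880, freq='T')  # two days
    full_df = pd.DataFrame({'PM25_Value': np.random.rand(2880)}, index=idx)
    hourly, daily = sensor_averaging([full_df],
                                     sensor_serials={'1': 'SN01'},
                                     name='Example_Make_Model',
                                     write_to_file=False)
    return hourly[0], daily[0]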


def interval_averaging(df, freq='H', interval_count=60, thres=0.75):
    """Average DataFrame to the specified sampling frequency ('freq').

    Numeric columns are averaged for each interval, and a completeness
    threshold (default 75%) must be met; otherwise, averages are null.
    Columns of type 'object' (i.e., text) are aggregated within each
    interval by the mode of unique object values.

    Args:
        df (pandas DataFrame or pandas Series):
            DataFrame or Series for which averages will be computed.
        freq (str):
            The frequency (averaging interval) to which the DataFrame will
            be averaged. Defaults to ``H``. Pandas refers to these as
            'offset aliases', and a list is found here
            (https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases).
        interval_count (int):
            The number of datapoints expected within the passed DataFrame
            for the specified averaging interval ('freq'). Defaults to 60
            for 1-hour averages. E.g., if computing 1-hour averages
            (freq='H') and the passed DataFrame is for a sensor that
            recorded measurements at 1-minute sampling frequency,
            interval_count will equal 60 (expect 60 non-null data points
            per averaging interval).
        thres (float):
            Threshold (ranging from 0 to 1) for the ratio of the number of
            data points recorded within a given averaging interval vs. the
            number of expected data points. Defaults to ``0.75`` (i.e.,
            75%).

    Returns:
        avg_df (pandas DataFrame):
            DataFrame averaged to the DatetimeIndex interval specified by
            'freq'.

    """
    n_thres = interval_count * thres

    # If a Series object is passed, convert it to a DataFrame
    data_type = type(df)
    if data_type is not pd.core.frame.DataFrame:
        df = pd.Series(df).to_frame()

    # List of unique column names with the column order preserved
    col_list = list(dict.fromkeys(df.columns))

    # Split DataFrame into object-like columns and numeric-like columns
    obj_df = df.select_dtypes(include=['object', 'datetime'])
    num_df = df.select_dtypes(exclude=['object', 'datetime'])

    # Merge object columns with the same name by keeping the first
    # non-null instance
    obj_df = column_merger(obj_df, by='first')
    # Merge numeric columns with the same name by their mean
    num_df = column_merger(num_df, by='mean')

    num_df_cols = list(num_df.columns)
    obj_df_cols = list(obj_df.columns)

    num_df = num_df.dropna(axis=1, how='all')

    # Index at the specified interval for empty DataFrames (all NaNs)
    nan_df_idx = pd.date_range(start=obj_df.index[0], end=obj_df.index[-1],
                               freq=freq, normalize=True)

    # Sample object-like data at the specified interval by the mode
    obj_df = obj_df.dropna(how='all', axis=1).fillna('')
    if obj_df.empty:
        avg_obj_df = pd.DataFrame(np.nan, index=nan_df_idx,
                                  columns=obj_df_cols)
    else:
        avg_obj_df = obj_df.groupby([pd.Grouper(freq=freq)]).agg(
            lambda x: object_grouper(x, n_thres))

    dropped_objcols = [col for col in obj_df_cols if col not in avg_obj_df]
    for col in dropped_objcols:
        avg_obj_df[col] = np.nan

    if num_df.empty:
        avg_num_df = pd.DataFrame(np.nan, index=nan_df_idx,
                                  columns=num_df_cols)
    else:
        # Mean parameter values for each averaging interval
        mean_df = num_df[:].groupby([pd.Grouper(freq=freq)]).mean()

        # Counts for each parameter and each averaging interval
        counts_df = num_df[:].groupby(
            [pd.Grouper(freq=freq)]).count().add_suffix('_count' + freq)

        avg_num_df = mean_df.join(counts_df).sort_index(axis=1)

        # List of columns containing the count for each averaging interval
        count_list = list(avg_num_df.columns[
            [col.endswith('count' + freq) for col in avg_num_df.columns]])

        # Set null parameter values for averaging intervals below the
        # completeness threshold
        for col in count_list:
            mean_col = col.replace('_count' + freq, '')
            avg_num_df[mean_col] = avg_num_df[mean_col].where(
                avg_num_df[col] > n_thres, np.nan)

    # Rejoin non-numeric columns on the averaging interval
    avg_df = avg_num_df.join(avg_obj_df)

    # Ensure that any columns with all NaNs in the passed df are in avg_df
    # (numeric type columns that were dropped)
    dropped_numcols = [col for col in col_list if col not in avg_df.columns]
    for col in dropped_numcols:
        avg_df[col] = np.nan

    # Reorder columns before returning
    avg_df = avg_df[col_list]

    return avg_df
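

# A minimal sketch of interval_averaging()'s completeness behavior. The
# column name and values below are illustrative assumptions.
def _interval_averaging_example():
    """Illustrative only: hours below 75% completeness average to NaN."""
    idx = pd.date_range('2021-01-01', periods=120, freq='T')
    df = pd.DataFrame({'PM25_Value': 10.0}, index=idx)
    # Null out 20 of the 60 points in the second hour: 40/60 = ~67% < 75%,
    # so that hour's average should be returned as NaN while the first
    # hour's average remains 10.0.
    df.iloc[60:80] = np.nan
    return interval_averaging(df, freq='H', interval_count=60, thres=0.75)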


def object_grouper(series, number_threshold):
    """Group columns of type `object` by the mode of values within each
    averaging interval.

    Args:
        series (pandas Series):
            An array of values with type object (typically textual
            information) alongside an associated datetime index.
        number_threshold (int or float):
            The number of counts for the modal value within a given
            averaging interval required to assign the modal value to the
            averaging interval. This can be expressed as a completeness
            threshold (typically 75%) multiplied by the number of expected
            counts within a given averaging interval.

    Returns:
        val (str or numpy.nan):
            The mode of the object-type series within the specified
            averaging interval. If the number of counts for the modal
            value is less than the number threshold (75% x expected counts
            within an averaging interval), return numpy.nan (null).

    """
    try:
        counts = series.value_counts().values[0]
        if counts >= number_threshold:
            val = series.value_counts().index[0]
        else:
            val = np.nan
    except IndexError:
        val = np.nan
    return val
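

# A brief sketch of object_grouper(): the modal value is kept only when it
# occurs at least `number_threshold` times. The flag values below are
# illustrative assumptions.
def _object_grouper_example():
    """Illustrative only: modal value vs. the count threshold."""
    flags = pd.Series(['OK', 'OK', 'OK', 'LOW'])
    object_grouper(flags, number_threshold=3)         # -> 'OK' (3 >= 3)
    return object_grouper(flags, number_threshold=4)  # -> nan (3 < 4)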


def column_merger(df, by='first'):
    """Group duplicated column names if detected in the passed dataset.

    Args:
        df (pandas DataFrame):
            Dataset containing columns with the same name.
        by (str, optional):
            Method for keeping entries from duplicated columns. Either
            ``'first'`` (keep the first non-null entries, suited to
            columns of dtype object, i.e., strings) or ``'mean'`` (compute
            the mean of entries for duplicated columns, suited to numeric
            type columns). Defaults to 'first'.

    Returns:
        df (pandas DataFrame):
            Modified dataset with duplicated column entries merged.

    """
    col_counts = {col: list(df.columns).count(col) for col in df.columns
                  if list(df.columns).count(col) > 1}

    if col_counts != {}:
        print('....duplicate column names found in dataset:')
        for col_name, occurrences in col_counts.items():
            print(f'......column name: "{col_name}", '
                  f'occurrences: {occurrences}')

        if by == 'first':
            grouped_df = df.groupby(level=0, axis=1).first()
        if by == 'mean':
            grouped_df = df.groupby(level=0, axis=1).mean()

        print(f'....duplicate column occurrences grouped by {by}')
        df = grouped_df

    return df
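

# A brief sketch of column_merger() on a frame with a duplicated column
# name. The column name and data below are illustrative assumptions.
def _column_merger_example():
    """Illustrative only: merge duplicated numeric columns by their mean."""
    df = pd.DataFrame([[1.0, 3.0], [np.nan, 5.0]],
                      columns=['PM25_Value', 'PM25_Value'])
    # by='mean' averages the duplicated columns row-wise, skipping nulls:
    # the merged 'PM25_Value' column becomes [2.0, 5.0]
    return column_merger(df, by='mean')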