# Source code for sensortoolkit.qc._purpleair_abcleaning

# -*- coding: utf-8 -*-
"""
This module contains a method for averaging the dual PMS5003 PM2.5 data channels
for PurpleAir PA-II and PA-II-SD sensors. This method of data averaging was
developed by Barkjohn et al. 2021 [#f1]_, and is discussed in detail in the
publication section 3.2.3 "Comparison of A and B channels". Briefly, quality
control (QC) criteria are applied during data averaging, whereby the absolute
and percent difference between concurrent (nearest neighbor by logged timestamp)
A and B channel measurement pairs are calculated. A measurement pair is set
null only if both of these QC criteria exceed their respective thresholds, or
if either channel is missing. Otherwise, the A and B channels are averaged.

.. rubric:: Footnotes

.. [#f1] Barkjohn, K. K., Gantt, B., and Clements, A. L.: Development and application of a United States-wide correction for PM2.5 data collected with the PurpleAir sensor, Atmos. Meas. Tech., 14, 4617–4637, https://doi.org/10.5194/amt-14-4617-2021, 2021.

================================================================================

@Author:
  | Samuel Frederick, NSSC Contractor (ORAU)
  | U.S. EPA / ORD / CEMM / AMCD / SFSB

Created:
  Wed Jul  7 13:32:49 2021
Last Updated:
  Wed Aug 18 09:10:24 2021
"""
import pandas as pd
import numpy as np


def purpleair_ab_averages(df, cleaning=True, a_col_name=None, b_col_name=None):
    """Average A and B channel data for PurpleAir sensors.

    QC criteria via Barkjohn et al. 2021, publication link:
    https://amt.copernicus.org/articles/14/4617/2021/

    A measurement pair is invalidated (set null) when the absolute A-B
    difference exceeds 5 ug/m^3 AND the magnitude of the percent difference
    exceeds 70%, or when either channel is null.

    Args:
        df (pandas dataframe):
            PurpleAir dataframe containing columns with A and B channel PM2.5
            data.
        cleaning (bool):
            If true, datapoints outside the QC criteria of Barkjohn et al.
            2021 will be invalidated (set null). Else, QC criteria will not be
            applied. Any truthy value enables cleaning.
        a_col_name (str):
            The column header name for PM2.5 data from channel A.
        b_col_name (str):
            The column header name for PM2.5 data from channel B.

    Returns:
        df (pandas dataframe):
            PurpleAir dataframe with computed AB averages. The columns
            ``PM25_AB_raw`` (unfiltered A/B mean) and ``PM25_Value`` (final
            A/B mean) are always present. With cleaning enabled,
            ``PM25_AB_absdiff`` and ``PM25_AB_pdiff`` are added as well and a
            new dataframe is returned; with cleaning disabled the columns are
            added to the passed dataframe, which is returned.

    Raises:
        KeyError: If ``a_col_name`` or ``b_col_name`` is not specified, or is
            not a column of ``df``.

    """
    if a_col_name is None or b_col_name is None:
        raise KeyError('Both a_col_name and b_col_name must be specified')

    chan_a = df[a_col_name]
    chan_b = df[b_col_name]

    # Raw AB averages (null wherever either channel is null)
    AB_raw = (chan_a + chan_b) / 2

    if not cleaning:
        df['PM25_AB_raw'] = AB_raw
        # Same column name as the cleaned branch so that downstream code does
        # not depend on the value of the cleaning flag.
        df['PM25_Value'] = AB_raw
        return df

    # Exclusion thresholds
    pm_thres = 5.0  # ug/m^3
    AB_pdiff_thres = 0.70  # exclude > 70% difference

    # (A-B) difference and percent difference of A and B
    AB_diff = chan_a - chan_b
    AB_pdiff = 2 * AB_diff / (chan_a + chan_b)

    # Work on a copy so the caller's dataframe is left untouched, and assign
    # columns by name so repeated calls overwrite rather than duplicate them.
    df = df.copy()
    df['PM25_AB_absdiff'] = AB_diff.abs()
    df['PM25_AB_pdiff'] = AB_pdiff

    # Datapoint exclusion mask. Comparisons against NaN evaluate False, so
    # missing channels are flagged explicitly.
    invalid = ((df['PM25_AB_absdiff'] > pm_thres)
               & (df['PM25_AB_pdiff'].abs() > AB_pdiff_thres)) \
        | chan_a.isnull() \
        | chan_b.isnull()

    df['PM25_AB_raw'] = AB_raw

    # Cleaned AB averages. mask() returns a new series, so the raw column
    # keeps its unfiltered values.
    df['PM25_Value'] = AB_raw.mask(invalid)

    return df