# Source code for sensortoolkit.plotting._distribution

# -*- coding: utf-8 -*-
"""
Plotting methods for graphing the distribution of measured quantities such as
reference monitor pollutant concentrations (``ref_distrib()``), meteorological
conditions including temperature and relative humidity (``met_distrib()``),
and the distribution of recording intervals (i.e., the time difference between
consecutive timestamps) in sensor datasets (``recording_interval_histogram()``).

================================================================================

@Author:
  | Samuel Frederick, NSSC Contractor (ORAU)
  | U.S. EPA / ORD / CEMM / AMCD / SFSB

Created:
  Mon Jan 27 08:49:12 2020
Last Updated:
  Wed Jul 28 14:20:18 2021
"""
import os

from pandas.plotting import register_matplotlib_converters
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sensortoolkit.param import Parameter
from sensortoolkit.datetime_utils import get_todays_date
# Executed at import time: register pandas' datetime converters with
# matplotlib so pandas timestamp types can be used on plot axes.
register_matplotlib_converters()
# Executed at import time: apply seaborn's 'darkgrid' style to figures
# created after this module is imported.
sns.set_style('darkgrid')


def ref_distrib(ref_df, param=None, averaging_interval='1-hour',
                font_size=18, write_to_file=True, figure_path=None,
                filename_suffix=''):
    """Plot the distribution of reference values for the passed parameter.

    Args:
        ref_df (pandas DataFrame):
            Dataframe containing reference data for the parameter ``'param'``
            and logged at the specified ``'averaging_interval'``. The
            measurements are read from the ``<param>_Value`` column and the
            monitor name from the ``<param>_Method`` column.
        param (str, optional):
            The name of the parameter for which the distribution plot will
            show the distribution of reference measurements. Defaults to None.
        averaging_interval (str, optional):
            The averaging interval for the passed dataframe. Defaults to
            '1-hour'.
        font_size (int or float, optional):
            The font size for the figure. Defaults to 18.
        write_to_file (bool, optional):
            If true, the figure will be saved as a png image to the
            ``[project_path]/figures`` subdirectory. Defaults to True.
        figure_path (str):
            The full directory path to the folder where figures are saved.
            This should be located at ``[project_path]/figures``. Required
            when ``write_to_file`` is True.
        filename_suffix (str, optional):
            Optional suffix that can be added to the end of filenames to
            ensure previously created files with similar naming are not
            overwritten. Defaults to ''.

    Returns:
        None.

    Raises:
        ValueError: If ``write_to_file`` is True but no ``figure_path`` was
            given.

    """
    # Fail early with a clear message instead of a TypeError from
    # concatenating None further down, after the figure has been drawn.
    if write_to_file is True and figure_path is None:
        raise ValueError('figure_path must be specified when write_to_file '
                         'is True')

    # Resolve everything that depends on column names *before* creating the
    # figure, so a missing column does not leave an orphaned figure open.
    try:
        # Determine name of reference monitor from passed parameter name
        try:
            ref_name = ref_df[param + '_Method'].dropna().unique()[0]
        except IndexError:
            ref_name = 'Unspecified Reference'

        # NOTE(review): the measurements are assumed to live in the
        # '<param>_Value' column (same naming scheme as '<param>_Method').
        # The previous code indexed the dataframe by the monitor name, which
        # is a cell value rather than a column label and always raised
        # KeyError.
        ref_values = ref_df[param + '_Value'].dropna()

        # Format the parameter name for plotting
        param_obj = Parameter(param)
        param_name = param_obj.param_name
        fmt_param = param_obj.param_format_name
        fmt_param_units = param_obj.param_units

    # Exception: Column name for reference monitor data not in passed df
    except KeyError as missing_col:
        print(missing_col, 'not found in passed reference dataframe')
        return

    if ref_values.empty:
        print(f'..no reference data for {param}, unable to plot distribution')
        return

    # Construct plot instance. histplot(kde=True, stat='density', cut=3) is
    # the documented replacement for the deprecated sns.distplot().
    fig, ax = plt.subplots(1, 1, figsize=(6, 5))
    sns.histplot(ref_values, kde=True, stat='density', kde_kws={'cut': 3},
                 label=ref_name + ' ' + fmt_param, ax=ax)

    # Set axes attributes
    ax.set_xlabel(
        f'Reference {averaging_interval} {fmt_param} ({fmt_param_units})',
        fontsize=font_size)
    ax.set_ylabel('Relative Probability', fontsize=font_size)
    ax.tick_params(axis='both', labelsize=0.75*font_size)
    plt.legend(fontsize=0.85*font_size)

    if write_to_file is True:
        file_name = f'{ref_name}_DistPlot_{param_name}_{get_todays_date()}'
        if filename_suffix != '':
            file_name += '_' + filename_suffix

        plt.tight_layout()
        # os.path.join yields the same path as plain concatenation when
        # figure_path ends with a separator, and a correct one when it
        # does not.
        plt.savefig(os.path.join(figure_path, file_name + '.png'), dpi=300)
        plt.close()
def met_distrib(met_ref_data, avg_hrly_df, figure_path, sensor_name=None,
                write_to_file=True):
    """Create distribution plots for meteorological parameters provided in the
    passed met_ref_data dataframe.

    Distributions are displayed as relative frequencies (i.e., percentages of
    the total distribution of measurements). One subplot is drawn per column
    of ``met_ref_data``. When a reference column holds no data, the
    ``'mean_' + column`` series from ``avg_hrly_df`` is plotted instead and
    the subplot is annotated accordingly.

    Args:
        met_ref_data (pandas DataFrame):
            Meteorological reference data (1-hour averages) for temperature,
            relative humidity, and dew point measurements.
        avg_hrly_df (pandas DataFrame):
            Dataframe containing the inter-sensor average value for 1-hour
            averaged air sensor measurements.
        figure_path (str):
            The full directory path to the folder where figures are saved.
            This should be located at ``[project_path]/figures``. The figure
            is written to a ``Met`` subfolder of this path.
        sensor_name (str, optional):
            The name of the air sensor (make, manufacturer). Used as a prefix
            for the saved file name when given. Defaults to None.
        write_to_file (bool, optional):
            If true, the figure will be saved as a png image to the
            ``[project_path]/figures`` subdirectory. Defaults to True.

    Returns:
        None.

    """
    font_size = 10
    detail_font_size = 0.8*font_size

    # (column-name prefix, x-axis label, major tick spacing or None)
    axis_formats = (
        ('RH', 'Relative Humidity (%)', 25),
        ('Temp', 'Temperature ($\\degree$C)', 10),
        ('DP', 'Dew Point ($\\degree$C)', None),
    )
    fill_colors = ['#77529A', '#b06c8b', '#588ded']

    met_params = list(met_ref_data.columns)
    n_var = len(met_params)  # Number of met variables to plot
    if n_var == 0:
        print('..no meteorological parameters passed, unable to plot '
              'distribution')
        return

    # squeeze=False guarantees an array of axes even for a single column;
    # with the default, one column yields a bare Axes that can't be indexed.
    fig, axs = plt.subplots(1, n_var, figsize=(5.15, 2.54), squeeze=False)
    axs = axs.ravel()

    plt.suptitle('Evaluation Site Meteorological Conditions\n',
                 fontsize=font_size)

    fig.subplots_adjust(wspace=.6, hspace=.3, left=.12, right=.88, top=.86,
                        bottom=.17)

    for i, param in enumerate(met_params):
        ax = axs[i]
        sensor_data = False

        data = met_ref_data[param].dropna()

        if data.empty:
            print(f'..Met data empty for {param}, trying sensor measurements')

            try:
                data = avg_hrly_df['mean_' + param].dropna()
            except KeyError:
                # f-prefix added: the message previously printed the literal
                # text '{param}'.
                print(f'..{param} not measured by sensor, unable to plot '
                      'distribution')
                continue
            sensor_data = True

            if data.empty:
                print(f'..no intersensor averaged {param} data, unable to '
                      'plot distribution')
                continue

        # Cycle through the palette so more than three columns do not
        # overrun the colour list.
        sns.histplot(data, ax=ax, bins=15, stat='percent', kde=True,
                     color=fill_colors[i % len(fill_colors)],
                     **{'alpha': 0.6})

        for prefix, label, tick_spacing in axis_formats:
            if not param.startswith(prefix):
                continue
            if sensor_data:
                ax.set_title('*Sensor Measurements Shown*',
                             fontsize=detail_font_size, y=0.97)
            ax.set_xlabel(label, fontsize=detail_font_size)
            if tick_spacing is not None:
                ax.xaxis.set_major_locator(plt.MultipleLocator(tick_spacing))

        ax.set_ylabel('Relative Probability (%)', fontsize=detail_font_size)
        ax.tick_params(axis='both', labelsize=detail_font_size)

    if write_to_file is True:
        # Build the path with os.path.join rather than a hard-coded '\\' so
        # the figure lands in the 'Met' folder on every platform.
        met_dir = os.path.join(figure_path, 'Met')
        os.makedirs(met_dir, exist_ok=True)

        # sensor_name defaults to None; only prefix the file name when a
        # name was actually supplied.
        name_prefix = f'{sensor_name}_' if sensor_name is not None else ''
        file_name = (f'{name_prefix}met_distplot_report_fmt_'
                     f'{get_todays_date()}.png')

        plt.savefig(os.path.join(met_dir, file_name), dpi=300)
        plt.close()
def recording_interval_histogram(full_df_list, xlims=(-10, 120), bar_width=2,
                                 bar_alpha=.4):
    """Plot indicating the uneven time delta in sensor data.

    Graphs bar plot of Log(counts) vs. time delta between consecutive
    timestamp entries. One set of bars is drawn per dataframe on a shared
    axes; the figure is neither saved nor closed by this function.

    Args:
        full_df_list (list):
            List of pandas DataFrames containing timeseries data at the
            original recorded sampling frequency. The time deltas are
            computed from each dataframe's index.
        xlims (Two-element tuple, optional):
            The x-axis limits (in seconds) for displaying the distribution of
            consecutive intervals between recorded timestamps. Defaults to
            (-10, 120).
        bar_width (int or float, optional):
            The width of bars displayed in the figure. Defaults to 2.
        bar_alpha (float, optional):
            The transparency of bars displayed in the figure. Defaults to .4.

    Returns:
        None.

    """
    xmin, xmax = xlims

    if len(full_df_list) == 3:
        color_list = ['#1f77b4', '#d62728', '#9467bd']  # blue, red, purple
    else:
        color_list = ['#9F99C8', '#fb8072', '#80b1d3', '#8dd3c7',
                      '#ffffb3', '#FD9962', '#b3de69']

    fig, ax = plt.subplots(1, 1, figsize=(8, 6))

    # Index into the palette modulo its length: zipping the palette with the
    # dataframe list silently dropped every dataframe past the last colour.
    for i, df in enumerate(full_df_list):
        if len(df.index) < 2:
            # Fewer than two timestamps -> no consecutive interval to count
            continue

        # Operate on the index difference directly rather than routing it
        # through to_frame() and a lookup by index name, which raised
        # KeyError for an unnamed index.
        delta = df.index[1:] - df.index[:-1]
        interval_counts = delta.value_counts()

        ax.bar(interval_counts.index.total_seconds(),
               np.log10(interval_counts.values),
               width=bar_width, alpha=bar_alpha, edgecolor='none',
               color=color_list[i % len(color_list)])

    ax.set_xlim(xmin, xmax)
    ax.set_xlabel(r'$\Delta t$ (seconds)')
    ax.set_ylabel(r'Log$_{10}$(counts)')