Module nais_processor

import numpy as np
from datetime import date, datetime
import pandas as pd
import os
import yaml
import re
from tinydb import TinyDB, Query
import json
import aerosol_functions as af

__pdoc__ = {
    'tubeloss': False,
    'average_mob': False,
    'average_dp': False,
    'find_diagnostic_names': False,
    'process_data': False,
    'correct_data': False,
    'data_cleaner': False,
    'corona_limit': False,
    'corona_ion_cleaner': False
}

# Geometric mean diameters (nm) of the final diameter bins
# and geometric mean mobilities (cm2 V-1 s-1) of the mobility bins
dp_ion = np.array([7.86360416e-10, 9.08232168e-10, 1.04902018e-09, 1.21167006e-09,
1.39958930e-09, 1.61672083e-09, 1.86762862e-09, 2.15759741e-09,
2.49274932e-09, 2.88018000e-09, 3.32811839e-09, 3.84611427e-09,
4.44525917e-09, 5.13844742e-09, 5.94068566e-09, 6.86946146e-09,
7.94518431e-09, 9.19171623e-09, 1.06370142e-08, 1.23139134e-08,
1.42610904e-08, 1.65242568e-08, 1.91576555e-08, 2.22259544e-08,
2.58066722e-08, 2.99933244e-08, 3.48995548e-08, 4.06646353e-08])*1e9

dp_par = np.array([7.498942093324539870e-01,8.659643233600640144e-01,
9.999999999999980016e-01,1.154781984689456031e+00,1.333521432163321974e+00,
1.539926526059490097e+00,1.778279410038920094e+00,2.053525026457140079e+00,
2.371373705661659947e+00,2.738419634264360081e+00,3.162277660168379967e+00,
3.651741272548380213e+00,4.216965034285819591e+00,4.869675251658620141e+00,
5.623413251903479626e+00,6.493816315762099833e+00,7.498942093324560076e+00,
8.659643233600640144e+00,1.000000000000000000e+01,1.154781984689457985e+01,
1.333521432163323972e+01,1.539926526059490008e+01,1.778279410038922137e+01,
2.053525026457139901e+01,2.371373705661660125e+01,2.738419634264360170e+01,
3.162277660168379967e+01,3.651741272548380124e+01,4.216965034285819769e+01])

mob_ion = np.array([3.162277660168379937e-04,2.371373705661659990e-04,
1.778279410038920258e-04,1.333521432163320159e-04,1.000000000000000048e-04,
7.498942093324559917e-05,5.623413251903490022e-05,4.216965034285820205e-05,
3.162277660168380208e-05,2.371373705661660125e-05,1.778279410038919852e-05,
1.333521432163319990e-05,1.000000000000000082e-05,7.498942093324561442e-06,
5.623413251903490361e-06,4.216965034285830030e-06,3.162277660168380038e-06,
2.371373705661659871e-06,1.778279410038920148e-06,1.333521432163330027e-06,
1.000000000000000167e-06,7.498942093324570124e-07,5.623413251903499890e-07,
4.216965034285829924e-07,3.162277660168379721e-07,2.371373705661660136e-07,
1.778279410038920042e-07,1.333521432163329868e-07])*1e4

mob_ion_geomeans=np.array([2.73841963e-04, 2.05352503e-04, 1.53992653e-04, 1.15478198e-04,
8.65964323e-05, 6.49381632e-05, 4.86967525e-05, 3.65174127e-05,
2.73841963e-05, 2.05352503e-05, 1.53992653e-05, 1.15478198e-05,
8.65964323e-06, 6.49381632e-06, 4.86967525e-06, 3.65174127e-06,
2.73841963e-06, 2.05352503e-06, 1.53992653e-06, 1.15478198e-06,
8.65964323e-07, 6.49381632e-07, 4.86967525e-07, 3.65174127e-07,
2.73841963e-07, 2.05352503e-07, 1.53992653e-07])*1e4

dp_par_geomeans=np.array([0.80584219,  0.93057204,  1.07460783,  1.24093776,  1.43301257,
1.6548171 ,  1.91095297,  2.20673407,  2.54829675,  2.94272718,
3.39820833,  3.92418976,  4.53158364,  5.23299115,  6.0429639 ,
6.97830585,  8.05842188,  9.30572041, 10.74607828, 12.40937761,
14.3301257 , 16.548171  , 19.10952975, 22.06734069, 25.48296748,
29.42727176, 33.98208329, 39.24189758])

dlogmob_ion=np.array([0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125,
0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125,
0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125,
0.125])

dlogdp_ion = np.array([0.06257524, 0.0625811 , 0.06259375, 0.06260838, 0.06262533,
0.06264495, 0.06266769, 0.06269404, 0.06272461, 0.06276008,
0.06280128, 0.06284916, 0.06290487, 0.06296974, 0.06304539,
0.0631337 , 0.06323696, 0.06335788, 0.06349974, 0.0636665 ,
0.06386292, 0.06409481, 0.06436924, 0.06469482, 0.06508209,
0.06554394, 0.06609614, 0.06639699])

dlogdp_par=np.array([0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625,
0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625,
0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625,
0.0625, 0.0625, 0.0625, 0.0625, 0.0625])

filename_formats = [
["%Y-%m-%d.ions.nds","%Y-%m-%d.particles.nds","%Y-%m-%d.log"],
["%Y%m%d-block-ions.spectra","%Y%m%d-block-particles.spectra","%Y%m%d-block.records"],
["%Y%m%d-block-ions.spectra","%Y%m%d-block-particles.spectra","%Y%m%d-block.diagnostics"]]

possible_sampleflow_names = [
"pos_sampleflow.mean",
"neg_sampleflow.mean",
"pos_sampleflow",
"neg_sampleflow",
"sampleflow",
"Flowaer"]

possible_temperature_names = [
"temperature.mean",
"temperature",
"temp"]

possible_pressure_names = [
"baro.mean",
"baro"]

# Define standard conditions
temp_ref = 273.15 # K, 0C
pres_ref = 101325.0 # Pa, 1atm

def make_config():
    """
    Make a configuration file for processing NAIS data

    Running `make_config()` asks for information about the
    measurement and the data, then writes the configuration
    file to the specified location.

    """

    # Collect data from the user
    print()
    print("Enter absolute path to configuration file.")
    print("For example: /home/user/campaign.yml")
    while True:
        config_file = input("> ")
        if len(config_file)>0:
            if not config_file.lower().endswith(".yml"):
                config_file = config_file + ".yml"
            break
        else:
            continue

    print()
    print("Enter absolute path(s) to raw data folder(s). Separate multiple paths with comma.")
    print("For example: /home/user/data/2021,/home/user/data/2022")
    while True:
        user_input = input("> ")
        if len(user_input)>0:
            load_path=user_input.split(",")
            break
        else:
            continue

    print()
    print("Enter absolute path to processed data folder.")
    print("For example: /home/user/campaign")
    while True:
        save_path = input("> ")
        if len(save_path)>0:
            break
        else:
            continue

    print()
    print("Enter start of measurement (YYYY-MM-DD)")
    print("Leave empty if you want to start from the earliest date")
    while True:
        start_date = input("> ")
        if len(start_date)==0:
            break
        try:
            start_dt = pd.to_datetime(start_date)
            break
        except:
            continue

    print()
    print("Enter end of measurement (YYYY-MM-DD)")
    print("If empty processor assumes current day")
    while True:
        end_date = input("> ")
        if len(end_date)==0:
            break
        try:
            end_dt = pd.to_datetime(end_date)
            break
        except:
            continue

    print()
    print("Enter absolute path to database file")
    print("For example: /home/user/campaign.json")
    while True:
        database_file = input("> ")
        if len(database_file)>0:
            if not database_file.lower().endswith(".json"):
                database_file = database_file + ".json"
            break
        else:
            continue

    print()
    print("Reprocess on re-run (True/False)")
    while True:
        allow_reprocessing = input("> ")
        if ((allow_reprocessing=='True') or (allow_reprocessing=='False')):
            if (allow_reprocessing=='True'):
                allow_reprocessing=True
            else:
                allow_reprocessing=False
            break
        else:
            continue

    print()
    print("Enter measurement location")
    print("For example: Helsinki, Kumpula")
    location = input("> ")

    ############## CLEANING
    print()
    print("Apply data cleaning procedures (True/False)")
    print("Attempt to remove corona ions and electrometer noise from data")
    while True:
        apply_cleaning = input("> ")
        if ((apply_cleaning=='True') or (apply_cleaning=='False')):
            if (apply_cleaning=='True'):
                apply_cleaning=True
            else:
                apply_cleaning=False
                remove_corona_ions=False
                remove_noisy_electrometers=False
                reclean=False
            break
        else:
            continue

    if apply_cleaning:
        print()
        print("Remove corona charger ions from particle data (True/False)")
        while True:
            remove_corona_ions = input("> ")
            if ((remove_corona_ions=='True') or (remove_corona_ions=='False')):
                if remove_corona_ions=='True':
                    remove_corona_ions=True
                else:
                    remove_corona_ions=False
                break
            else:
                continue

        print()
        print("Remove noisy electrometer data (True/False)")
        while True:
            remove_noisy_electrometers = input("> ")
            if ((remove_noisy_electrometers=='True') or (remove_noisy_electrometers=='False')):
                if remove_noisy_electrometers=='True':
                    remove_noisy_electrometers=True
                else:
                    remove_noisy_electrometers=False
                break
            else:
                continue

    ################### CORRECTIONS
    print()
    print("Apply corrections (True/False)")
    print("Requires a NAIS with temperature and pressure sensors.")
    while True:
        apply_corrections = input("> ")
        if ((apply_corrections=='True') or (apply_corrections=='False')):
            if apply_corrections=='True':
                apply_corrections=True
            else:
                apply_corrections=False
                sealevel_correction=False
                recorrect=False
                inlet_length = 0.0
            break
        else:
            continue

    if apply_corrections:
        print()
        print("Length of the inlet in meters")
        while True:
            inlet_length = input("> ")
            try:
                inlet_length = float(inlet_length)
                break
            except:
                continue

        print()
        print("Correct concentrations to sealevel conditions (True/False)")
        while True:
            sealevel_correction = input("> ")
            if sealevel_correction=='True':
                sealevel_correction=True
                break
            elif sealevel_correction=='False':
                sealevel_correction=False
                break
            else:
                continue

    print()
    print("Configuration saved to: %s"%config_file)
    print()

    # Make a dictionary out of the user input
    config_info = {
        "data_folder": load_path,
        "processed_folder": save_path,
        "start_date": start_date,
        "end_date": end_date,
        "database_file": database_file,
        "location": location,
        "inlet_length": inlet_length,
        "apply_corrections":apply_corrections,
        "sealevel_correction": sealevel_correction,
        "remove_corona_ions": remove_corona_ions,
        "remove_noisy_electrometers": remove_noisy_electrometers,
        "allow_reprocess": allow_reprocessing,
        "apply_cleaning": apply_cleaning
    }

    # Save the config file
    with open(config_file,"w") as cf:
        yaml.dump(config_info,cf)


# Inlet losses
################################################################################
def tubeloss(dpp,pflow,plength,temp,press):
    """ Laminar diffusion losses in circular straight conduit """
    DPP,TEMP = np.meshgrid(dpp,temp)
    DPP,PRESS = np.meshgrid(dpp,press)
    DPP,PFLOW = np.meshgrid(dpp,pflow)
    rmuu = np.pi*af.particle_diffusivity(DPP,TEMP,PRESS)*plength/PFLOW
    pene = np.nan*np.ones(rmuu.shape)
    cond1=rmuu<0.02
    cond2=rmuu>=0.02
    pene[cond1] = 1. - 2.56*rmuu[cond1]**(2./3.) + 1.2*rmuu[cond1]+0.177*rmuu[cond1]**(4./3.)
    pene[cond2] = 0.819*np.exp(-3.657*rmuu[cond2]) + 0.097*np.exp(-22.3*rmuu[cond2]) + 0.032*np.exp(-57.0*rmuu[cond2])
    return pene

# Read raw data file into a dataframe
################################################################################
def read_file(fn):
    """
    Read NAIS raw data file into a pandas DataFrame

    Parameters
    ----------

    fn : str
        Raw data filename

    Returns
    -------

    pandas.DataFrame
        Contents of the file in dataframe

    """

    with open(fn,'r') as f:

        header_found = False
        data_matrix=[]
        lines = f.read().splitlines()
        
        for line in lines:

             # Skip empty and comments
             if (len(line)==0):
                 continue

             if (line[0]=='#'):
                 continue

             # Test if it is a header
             elif (header_found==False):
                 if "opmode" in line:
                     delimiter = re.search('(.)opmode',line).group(1)
                     header = line.split(delimiter)
                     number_of_columns = len(header)
                     header_found = True
                     continue
                 else:
                     continue 
             else:
                 data_line = line.split(delimiter)
                 
                 if ((len(data_line)==number_of_columns) & ("opmode" not in data_line)):
                     data_matrix.append(data_line)
                 continue

    if len(data_matrix)==0:
        return None

    else:
        # Convert anything that can be converted to float; the rest is coerced to NaN
        df = pd.DataFrame(columns = header, data = data_matrix)
        df.iloc[:,3:] = df.iloc[:,3:].apply(pd.to_numeric, errors='coerce').astype(float)

        # Establish begin_time (first column) as index
        df = df.set_index(df.columns[0])
        df.index = pd.to_datetime(df.index)
        
        return df

# Average data into the standard size bins
################################################################################
def average_mob(y,h):
    data = pd.DataFrame([])
    
    for i in range(0,len(mob_ion_geomeans)):
        if i==0:
            y_block = y.iloc[:,h>mob_ion_geomeans[i]]
        else:
            y_block = y.iloc[:,((h>mob_ion_geomeans[i]) & (h<=mob_ion_geomeans[i-1]))]

        data[i] = y_block.median(axis=1)

    y_block = y.iloc[:,h<=mob_ion_geomeans[i]]
    data[i+1] = y_block.mean(axis=1)

    return data

def average_dp(y,h):
    data = pd.DataFrame([])

    for i in range(0,len(dp_par_geomeans)):
        if i==0:
            y_block = y.iloc[:,h<dp_par_geomeans[i]]
        else:
            y_block = y.iloc[:,((h<dp_par_geomeans[i]) & (h>=dp_par_geomeans[i-1]))]

        data[i] = y_block.median(axis=1)

    y_block = y.iloc[:,h>=dp_par_geomeans[i]]
    data[i+1] = y_block.mean(axis=1)

    return data

# Find diagnostic names
################################################################################
def find_diagnostic_names(diag_params):

    sampleflow_name=None
    sampleflow_names=None
    temperature_name=None
    pressure_name=None

    for temp_name in possible_temperature_names:
         if temp_name in diag_params:
             temperature_name = temp_name
             break

    for pres_name in possible_pressure_names:
        if pres_name in diag_params:
            pressure_name = pres_name
            break

    sf_name = []
    for flow_name in possible_sampleflow_names:
        if flow_name in diag_params:
            sf_name.append(flow_name)

    if len(sf_name)==2:
        sampleflow_names = sf_name
    if len(sf_name)==1:
        sampleflow_name = sf_name[0]

    return temperature_name, pressure_name, sampleflow_names, sampleflow_name

# Process the data (convert to dndlogdp & corrections)
################################################################################
def process_data(
    df,
    mode):

    if (df is None):
        return None, None

    elif not df.index.to_series().is_monotonic_increasing:
        return None, None

    else:
        df_columns = df.columns
        df_inverter_reso = int((len(df_columns)-2)/4)

        neg_df = df.iloc[:,2:2+df_inverter_reso]
        pos_df = df.iloc[:,2+2*df_inverter_reso:2+3*df_inverter_reso]

        if mode=="ions":
            mob_ion_inv = np.array([float(re.findall(r"[-+]?\d*\.\d+|\d+",y)[0])
                                    for y in df_columns[2:2+df_inverter_reso]])

            neg_df = average_mob(neg_df,mob_ion_inv)
            pos_df = average_mob(pos_df,mob_ion_inv)

            # Convert to number size distributions
            neg_df = neg_df * dlogmob_ion / dlogdp_ion
            pos_df = pos_df * dlogmob_ion / dlogdp_ion

        if mode=="particles":
            dp_par_inv = 2.0*np.array([float(re.findall(r"[-+]?\d*\.\d+|\d+",y)[0])
                                       for y in df_columns[2:2+df_inverter_reso]])
        
            neg_df = average_dp(neg_df,dp_par_inv)
            pos_df = average_dp(pos_df,dp_par_inv)

        # Construct the headers
        if mode=="ions":
            df_header = dp_ion*1e-9
        if mode=="particles":
            df_header = dp_par*1e-9

        negdf = pd.DataFrame(columns=df_header, index=df.index, data=neg_df.values)
        posdf = pd.DataFrame(columns=df_header, index=df.index, data=pos_df.values)

        negdf.index.name = "Time"
        posdf.index.name= "Time"

        if negdf.isna().all().all():
            negdf = None
        if posdf.isna().all().all():
            posdf = None

        return negdf, posdf

def correct_data(
    df,
    rec,
    mode,
    do_sealevel_corr,
    pipe_length):

    if ((rec is None) or (df is None)):
        return None

    else:        
        # Extract the records that match the mode
        if mode=="ions":
            df_rec = rec[rec.opmode=='ions']
        if mode=="particles":
            df_rec = rec[rec.opmode=='particles']

        if not df_rec.index.to_series().is_monotonic_increasing:
            return None
        
        df_rec = df_rec.reindex(df.index,method="nearest")

        # Check that the relevant diagnostic data is found
        t_name,p_name,sf_names,sf_name = find_diagnostic_names(list(df_rec))
        if ((t_name is None) or
            (p_name is None) or
            ((sf_names is None) and (sf_name is None))):
            return None
    
        # Temperature
        t_df = 273.15 + df_rec[t_name].astype(float).to_frame()

        # Pressure
        p_df = 100.0 * df_rec[p_name].astype(float).to_frame()
    
        # Sampleflow
        if sf_names is not None:
            flow_df = df_rec[sf_names].sum(axis=1,min_count=2).astype(float).to_frame()
        if sf_name is not None:
            flow_df = df_rec[sf_name].astype(float).to_frame()
    
        # Test whether the sample flow is reported in cm3/s (old models)
        # or in l/min, and convert to l/min if needed
        if (np.nanmedian(flow_df)>300):
            flow_df = (flow_df/1000.0) * 60.0
    
        # If all parameters are NaN
        # e.g. sensor is broken
        if (flow_df.isna().all().all() |
            p_df.isna().all().all() |
            t_df.isna().all().all()):
            return None
    
        # Sanity check the values
        t_df = t_df.where(((t_df>=223.)&(t_df<=353.)),np.nan)
        p_df = p_df.where(((p_df>=37000.)&(p_df<=121000.)),np.nan)
        flow_df = flow_df.where(((flow_df>=48.)&(flow_df<=60.)),np.nan)
    
        # Correct the number concentrations to standard conditions (optional)
        if (do_sealevel_corr):
            stp_corr_df = (pres_ref*t_df.values)/(temp_ref*p_df.values)
            df = stp_corr_df * df
       
        # Diffusion loss correction
        if mode=="ions":
            throughput = tubeloss(dp_ion*1e-9,flow_df.values*1.667e-5,pipe_length,t_df.values,p_df.values)
        if mode=="particles":
            throughput = tubeloss(dp_par*1e-9,flow_df.values*1.667e-5,pipe_length,t_df.values,p_df.values)
        
        df = df / throughput
    
        # Robert Wagner's calibration (only ions)
        if mode=="ions":
            roberts_corr = 0.713*dp_ion**0.120
            df = df / roberts_corr
    
        return df

# Data clean-up
################################################################################
def data_cleaner(df):
    """ Returns a cleaned data array and portion of data removed """

    if df is None:
        return None

    # Rolling time window
    reso_in_seconds = (df.index[1]-df.index[0]).seconds
    small_window = int((10.*60.)/(reso_in_seconds))
    large_window = int((24.*60.*60.)/(reso_in_seconds))

    # Calculate standard deviation in 10 min segments
    df2=df.rolling(small_window, min_periods=int((small_window+1.)/2.), center=True).std()

    # In a bigger window (24 hours) calculate the 75th quantile of the standard deviations
    # (semi)continous noise causes higher values compared to normal and rare sudden changes in conc
    df3=df2.rolling(large_window, min_periods=int((large_window+1.)/2.), center=True).quantile(0.75)

    # A good threshold for significant noise seems to be 5 times the median background
    threshold = 5*np.nanmedian(df3)

    df4 = df.where(df3 < threshold, np.nan)

    return df4

def corona_limit(df):
    """ Find corona ion upper limit using maximum concentration difference """

    # Only consider likely limit range
    lower = 1.5e-9
    upper = 5.0e-9
    c = (lower <= df.columns.values) & (upper >= df.columns.values)
    df2 = df.loc[:, c]

    # Find maximum difference between size bin medians
    return df2.columns.values[df2.median().diff().abs().argmax()]

def corona_ion_cleaner(df):
    """ Return a cleaned data array and ratio of data removed """

    if df is None:
        return None

    corona_lim = corona_limit(df)
    df2 = df.copy()

    # Set values below corona ion limit to NaNs
    df2.iloc[:,df2.columns.values<=corona_lim]=np.nan

    return df2

def check_config(f):
    """
    Check that config file is ok

    Parameters
    ----------

    f : `str`
        full path to the configuration file

    Returns
    -------

    boolean
        `True` if file is OK
        `False` if file is not OK

    """

    if not os.path.isfile(f):
        print("Config not found")
        return False

    print(f)

    with open(f,'r') as stream:
        try:
            config = yaml.safe_load(stream)
            load_path = config['data_folder']
            save_path = config['processed_folder']
            start_date = config['start_date']
            database = config['database_file']
            location = config['location']
            end_date = config['end_date']
            allow_reprocess = config["allow_reprocess"]
            pipelength = config['inlet_length']
            sealevel_correction = config['sealevel_correction']
            apply_corrections = config['apply_corrections']
            apply_cleaning=config["apply_cleaning"]
            remove_noisy_electrometers = config["remove_noisy_electrometers"]
            remove_corona_ions = config["remove_corona_ions"]
        except:
            print("Config badly formatted")
            return False

    try:
        db = TinyDB(database)
        check = Query()
    except:
        print("Could not init DB")
        return False

    if start_date=='':
        pass
    elif isinstance(start_date,date):
        pass
    else:
        print("Bad start_date")
        return False

    if end_date=='':
        pass
    elif isinstance(end_date,date):
        pass
    else:
        print("Bad end_date")
        return False

    if not os.path.exists(save_path):
        print("Bad save path")
        return False

    for x in load_path:
        if not os.path.exists(x):
            print("Bad load path")
            return False

    if (allow_reprocess==True) or (allow_reprocess==False):
        pass
    else:
        print("Bad allow_reprocess")
        return False

    if (remove_corona_ions==True) or (remove_corona_ions==False):
        pass
    else:
        print("Bad remove_corona_ions")
        return False

    if (remove_noisy_electrometers==True) or (remove_noisy_electrometers==False):
        pass
    else:
        print("Bad remove_noisy_electrometers")
        return False

    if (sealevel_correction==True) or (sealevel_correction==False):
        pass
    else:
        print("Bad sealevel_correction")
        return False

    if (apply_cleaning==True) or (apply_cleaning==False):
        pass
    else:
        print("Bad apply_cleaning")
        return False

    if (apply_corrections==True) or (apply_corrections==False):
        pass
    else:
        print("Bad apply_corrections")
        return False

    try:
        float(pipelength)
    except:
        print("Bad inlet_length")
        return False

    return True


def nais_processor(config_file):
    """ Function that is called to processes data from the NAIS

    Parameters
    ----------

    config_file : str
        full path to the configuration file

    """

    ################# READING CONFIG
    if not check_config(config_file):
        return 

    # Today
    today_dt = date.today()

    with open(config_file,'r') as stream:
        config = yaml.safe_load(stream)
        load_path = config['data_folder']
        save_path = config['processed_folder']
        start_date = config['start_date']
        database = config['database_file']
        location = config['location']
        end_date = config['end_date']
        allow_reprocess = config["allow_reprocess"]
        pipelength = config['inlet_length']
        sealevel_correction = config['sealevel_correction']
        apply_corrections = config['apply_corrections']
        apply_cleaning=config["apply_cleaning"]
        remove_noisy_electrometers = config["remove_noisy_electrometers"]
        remove_corona_ions = config["remove_corona_ions"]


    ##################### UPDATING DATABASE
    if start_date=='':
        start_date = date(2000,1,1)
    if end_date=='':
        end_date = today_dt

    db = TinyDB(database)
    check = Query()

    start_dt=pd.to_datetime(start_date)
    end_dt=pd.to_datetime(end_date)

    start_date_str = start_dt.strftime("%Y%m%d")
    end_date_str = end_dt.strftime("%Y%m%d")

    # list existing dates
    list_of_existing_dates = [x["timestamp"] for x in db.search(check.diagnostics.exists())]

    if len(list_of_existing_dates)==0:
        print("building database...")
        list_of_datetimes = pd.date_range(start=start_date_str, end=end_date_str)
    else:
        last_existing_date = sorted(list_of_existing_dates)[-1]
        list_of_datetimes = pd.date_range(start=last_existing_date, end=end_date_str)
    
    # Add unprocessed datafiles to the database
    for x in list_of_datetimes:
        if (x.strftime("%Y%m%d") in list_of_existing_dates):
            continue
        else:
            files_found=False
            for z in load_path:
                for y in filename_formats:

                    ion_fn = os.path.join(z,x.strftime(y[0]))
                    particle_fn = os.path.join(z,x.strftime(y[1]))
                    diagnostic_fn = os.path.join(z,x.strftime(y[2]))

                    if ( (os.path.exists(ion_fn) | # ions
                         os.path.exists(particle_fn)) & # particles
                         os.path.exists(diagnostic_fn) # diagnostics
                       ):

                        dtstr = x.strftime("%Y%m%d")

                        db.insert(
                            {"timestamp":dtstr,
                            "diagnostics":diagnostic_fn}
                            )

                        if os.path.exists(ion_fn):
                            db.update(
                                {"ions":ion_fn},
                                check.timestamp==dtstr)

                        if os.path.exists(particle_fn):
                            db.update(
                                {"particles":particle_fn},
                                check.timestamp==dtstr)

                        files_found=True
                        break

                if files_found:
                    break

    # From the database find the last day with processed data
    processed_days = db.search(
        check.processed_neg_ion_file.exists() |
        check.processed_pos_ion_file.exists() |
        check.processed_neg_particle_file.exists() |
        check.processed_pos_particle_file.exists())

    if len(processed_days)!=0:
        last_day=np.max([datetime.strptime(x["timestamp"],"%Y%m%d") for x in processed_days]).strftime("%Y%m%d")
    else:
        last_day=None

    ############## PROCESSING
    # reprocess data in db
    if allow_reprocess:
        iterator1 = iter(db.search(
         (check.diagnostics.exists() &
          (check.ions.exists() |
          check.particles.exists()) &
          (check.timestamp>=start_date_str) &
          (check.timestamp<=end_date_str))))
    else:
        iterator1 = iter(db.search(
            ((check.timestamp==last_day) &
             (check.timestamp>=start_date_str) &
             (check.timestamp<=end_date_str)) |
            (check.diagnostics.exists() &
             (check.ions.exists() |
             check.particles.exists()) &
             ~check.processed_neg_ion_file.exists() &
             ~check.processed_pos_ion_file.exists() &
             ~check.processed_neg_particle_file.exists() &
             ~check.processed_pos_particle_file.exists() &
             (check.timestamp>=start_date_str) &
             (check.timestamp<=end_date_str))))

    for x in iterator1:

        print("processing %s (%s)" % (x["timestamp"],location))

        ions_exist=bool(db.search(
            check.ions.exists() &
            (check.timestamp==x["timestamp"])))

        particles_exist=bool(db.search(
            check.particles.exists() &
            (check.timestamp==x["timestamp"])))

        records = read_file(x["diagnostics"])

        # ions
        if ions_exist:

            ions = read_file(x["ions"])

            negion_datamatrix,posion_datamatrix = process_data(ions,"ions")

            if apply_corrections:
                negion_datamatrix = correct_data(
                       negion_datamatrix,
                       records,
                       "ions",
                       sealevel_correction,
                       pipelength)
    
                posion_datamatrix = correct_data(
                       posion_datamatrix,
                       records,
                       "ions",
                       sealevel_correction,
                       pipelength)

            if apply_cleaning:
                if remove_noisy_electrometers:
                    negion_datamatrix = data_cleaner(negion_datamatrix)
                    posion_datamatrix = data_cleaner(posion_datamatrix)

            if (negion_datamatrix is not None):
                my_save_path_neg=os.path.join(save_path,"NAISn"+x["timestamp"]+"nds.sum")
                negion_datamatrix.to_csv(my_save_path_neg)
                db.update({"processed_neg_ion_file": my_save_path_neg},
                    check.timestamp==x["timestamp"])

            if (posion_datamatrix is not None):
                my_save_path_pos=os.path.join(save_path,"NAISp"+x["timestamp"]+"nds.sum")
                posion_datamatrix.to_csv(my_save_path_pos)
                db.update({"processed_pos_ion_file": my_save_path_pos},
                    check.timestamp==x["timestamp"])

        # particles
        if particles_exist:

            # Process particles
            particles = read_file(x["particles"])

            negpar_datamatrix,pospar_datamatrix = process_data(particles,"particles")

            if apply_corrections:
                negpar_datamatrix = correct_data(
                       negpar_datamatrix,
                       records,
                       "particles",
                       sealevel_correction,
                       pipelength)
    
                pospar_datamatrix = correct_data(
                       pospar_datamatrix,
                       records,
                       "particles",
                       sealevel_correction,
                       pipelength)

            if apply_cleaning:
                if remove_corona_ions:
                    negpar_datamatrix = corona_ion_cleaner(negpar_datamatrix)
                    pospar_datamatrix = corona_ion_cleaner(pospar_datamatrix)
    
                if remove_noisy_electrometers:
                    negpar_datamatrix = data_cleaner(negpar_datamatrix)
                    pospar_datamatrix = data_cleaner(pospar_datamatrix)

            if (negpar_datamatrix is not None):
                my_save_path_neg=os.path.join(save_path,"NAISn"+x["timestamp"]+"np.sum")
                negpar_datamatrix.to_csv(my_save_path_neg)
                db.update({"processed_neg_particle_file": my_save_path_neg},
                    check.timestamp==x["timestamp"])

            if (pospar_datamatrix is not None):
                my_save_path_pos=os.path.join(save_path,"NAISp"+x["timestamp"]+"np.sum")
                pospar_datamatrix.to_csv(my_save_path_pos)
                db.update({"processed_pos_particle_file": my_save_path_pos},
                    check.timestamp==x["timestamp"])

    print("Done!")


def combine_databases(database_list, combined_database):
    """Combine JSON databases

    If the measurement setup changes, one may have to use multiple configuration
    files, which results in multiple databases. With this function you can
    combine the databases into a single database after processing.

    Parameters
    ----------

    database_list : `list of str`
        Full paths to the databases to be combined

        The first database should contain the earliest data, the
        second the second earliest, and so on

    combined_database : `str`
        full path to combined database
    
    """

    DB = {}
    i = 0

    for database in database_list:

        with open(database) as fid:
            database_json = json.load(fid)

        for key in database_json["_default"]:
            DB[i] = database_json["_default"][key]
            i = i + 1

    with open(combined_database, "w") as f:
        json.dump({"_default":DB},f)

def combine_spectra(
    database_file,
    begin_time,
    end_time,
    spectrum_type="negion",
    reso=60):
    """
    Combine (cleaned) processed particle or ion data over a given time range

    Parameters
    ----------

    database_file : `str`
        full path to database_file

    begin_time : `str`
        timezone-aware ISO formatted time string

        For example `"2013-01-02 15:00:00+02:00"`

    end_time : `str`
        timezone-aware ISO formatted time string

        For example `"2013-01-03 17:00:00+02:00"`

    spectrum_type : `str`
        negative ions `negion` (default)

        positive ions `posion`

        negative particles `negpar`

        positive particles `pospar`

    reso : `int`
        desired resolution given in minutes

    Returns
    -------

    pandas.DataFrame
        Combined aerosol number size distribution in the given 
        time interval

    """

    db = TinyDB(database_file)
    check = Query()

    begin_dt=pd.to_datetime(begin_time)
    end_dt=pd.to_datetime(end_time)

    begin_date=begin_dt.strftime("%Y%m%d")
    end_date=end_dt.strftime("%Y%m%d")

    if spectrum_type=="negpar":
        iterator = iter(db.search(
            (check.processed_neg_particle_file.exists()) &
            (check.timestamp>=begin_date) &
            (check.timestamp<=end_date)))
        db_entry = "processed_neg_particle_file"
    elif spectrum_type=="pospar":
        iterator = iter(db.search(
            (check.processed_pos_particle_file.exists()) &
            (check.timestamp>=begin_date) &
            (check.timestamp<=end_date)))
        db_entry = "processed_pos_particle_file"
    elif spectrum_type=="negion":
        iterator = iter(db.search(
            (check.processed_neg_ion_file.exists()) &
            (check.timestamp>=begin_date) &
            (check.timestamp<=end_date)))
        db_entry = "processed_neg_ion_file"
    elif spectrum_type=="posion":
        iterator = iter(db.search(
            (check.processed_pos_ion_file.exists()) &
            (check.timestamp>=begin_date) &
            (check.timestamp<=end_date)))
        db_entry = "processed_pos_ion_file"
    else:
        print("ERROR: %s is not valid 'spectrum_type'" % spectrum_type)
        return

    filenames = [x[db_entry] for x in iterator]

    df = af.stack_data(filenames,begin_time,end_time,reso)

    return df

Functions

def check_config(f)

Check that config file is ok

Parameters

f : str
full path to the configuration file

Returns

boolean
True if file is OK, False if it is not
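
A minimal usage sketch (the configuration path is hypothetical):

from nais_processor import check_config

# Validate the configuration before processing; the function prints
# the reason and returns False if something is wrong.
if check_config("/home/user/campaign.yml"):
    print("Configuration OK")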
def combine_databases(database_list, combined_database)

Combine JSON databases

If the measurement setup changes, one may have to use multiple configuration files, which results in multiple databases. With this function you can combine the databases into a single database after processing.

Parameters

database_list : list of str

Full paths to the databases to be combined

The first database should contain the earliest data, the second the second earliest, and so on

combined_database : str
full path to combined database
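
For example, to merge two campaign databases (paths hypothetical), oldest first:

from nais_processor import combine_databases

combine_databases(
    ["/home/user/campaign2021.json", "/home/user/campaign2022.json"],
    "/home/user/campaign_combined.json")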
def combine_spectra(database_file, begin_time, end_time, spectrum_type='negion', reso=60)

Combine (cleaned) processed particle or ion data over a given time range

Parameters

database_file : str
full path to database_file
begin_time : str

timezone-aware ISO formatted time string

For example "2013-01-02 15:00:00+02:00"

end_time : str

timezone-aware ISO formatted time string

For example "2013-01-03 17:00:00+02:00"

spectrum_type : str

negative ions negion (default)

positive ions posion

negative particles negpar

positive particles pospar

reso : int
desired resolution given in minutes

Returns

pandas.DataFrame
Combined aerosol number size distribution in the given time interval
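
A usage sketch with the example times from above (the database path is hypothetical):

from nais_processor import combine_spectra

# Negative-ion number size distributions at 60 min resolution
df = combine_spectra(
    "/home/user/campaign.json",
    "2013-01-02 15:00:00+02:00",
    "2013-01-03 17:00:00+02:00",
    spectrum_type="negion",
    reso=60)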
def make_config()

Make a configuration file for processing NAIS data

Running make_config() asks for information about the measurement and the data, then writes the configuration file to the specified location.

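For non-interactive setups an equivalent file can be written directly; a minimal sketch with hypothetical paths, using the same keys make_config() saves (note that check_config() accepts start_date and end_date only as dates or empty strings):

from datetime import date
import yaml

config_info = {
    "data_folder": ["/home/user/data/2021"],   # hypothetical paths
    "processed_folder": "/home/user/campaign",
    "start_date": date(2021, 1, 1),
    "end_date": "",  # empty means "up to the current day"
    "database_file": "/home/user/campaign.json",
    "location": "Helsinki, Kumpula",
    "inlet_length": 1.0,
    "apply_corrections": True,
    "sealevel_correction": False,
    "remove_corona_ions": True,
    "remove_noisy_electrometers": True,
    "allow_reprocess": False,
    "apply_cleaning": True,
}

with open("/home/user/campaign.yml", "w") as cf:
    yaml.dump(config_info, cf)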
def nais_processor(config_file)

Process data from the NAIS

Parameters

config_file : str
full path to the configuration file
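
A typical workflow (the configuration path is hypothetical):

from nais_processor import make_config, nais_processor

make_config()                              # answer the prompts interactively
nais_processor("/home/user/campaign.yml")  # then process the raw data it points to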
def read_file(fn)

Read NAIS raw data file into a pandas DataFrame

Parameters

fn : str
Raw data filename

Returns

pandas.DataFrame
Contents of the file in dataframe
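
A minimal sketch, assuming a raw ion file named after one of the supported filename formats:

from nais_processor import read_file

# Returns None if the file contains no data rows
ions = read_file("/home/user/data/2021/20210101-block-ions.spectra")
if ions is not None:
    print(ions.shape)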