Module redvox.common.gap_and_pad_utils

Expand source code
from typing import List, Tuple, Optional
import enum
from math import modf
from dataclasses import dataclass, field

from dataclasses_json import dataclass_json
import pandas as pd
import numpy as np

from redvox.common import date_time_utils as dtu
from redvox.common.errors import RedVoxExceptions, RedVoxError
from redvox.api1000.wrapped_redvox_packet.sensors.audio import AudioCodec
from redvox.api1000.wrapped_redvox_packet.sensors.location import LocationProvider
from redvox.api1000.wrapped_redvox_packet.sensors.image import ImageCodec
from redvox.api1000.wrapped_redvox_packet.station_information import NetworkType, PowerState, CellServiceState

# default maximum number of points required to brute force calculating gap timestamps
DEFAULT_MAX_BRUTE_FORCE_GAP_TIMESTAMPS: int = 5000
# percent of packet duration/sample rate required for gap to be considered a whole unit
DEFAULT_GAP_UPPER_LIMIT: float = 0.8
# percent of packet duration/sample rate required for gap to be considered nothing
DEFAULT_GAP_LOWER_LIMIT: float = 0.02
# columns for audio dataframe
AUDIO_DF_COLUMNS = ["timestamps", "unaltered_timestamps", "microphone"]
# columns that cannot be interpolated
NON_INTERPOLATED_COLUMNS = ["compressed_audio", "image"]
# columns that are not numeric but can be interpolated
NON_NUMERIC_COLUMNS = ["location_provider", "image_codec", "audio_codec",
                       "network_type", "power_state", "cell_service"]


# noinspection Mypy,DuplicatedCode
class DataPointCreationMode(enum.Enum):
    """
    Type of data point to create
    """

    NAN: int = 0
    COPY: int = 1
    INTERPOLATE: int = 2

    @staticmethod
    def list_names() -> List[str]:
        return [n.name for n in DataPointCreationMode]


@dataclass_json()
@dataclass
class GapPadResult:
    """
    The result of filling gaps or padding a time series
    """
    result_df: Optional[pd.DataFrame] = None
    gaps: List[Tuple[float, float]] = field(default_factory=lambda: [])
    errors: RedVoxExceptions = field(default_factory=lambda: RedVoxExceptions("GapPadResult"))

    def add_error(self, error: str):
        """
        add an error to the result
        :param error: error message to add
        """
        self.errors.append(error)


def calc_evenly_sampled_timestamps(
        start: float, samples: int, sample_interval_micros: float
) -> np.array:
    """
    given a start time, calculates `samples` evenly spaced timestamps separated by sample_interval_micros

    :param start: float, start timestamp in microseconds
    :param samples: int, number of samples
    :param sample_interval_micros: float, sample interval in microseconds
    :return: np.array with number of samples timestamps, evenly spaced starting at start
    """
    return start + (np.arange(0, samples) * sample_interval_micros)


def check_gap_list(gaps: List[Tuple[float, float]], start_timestamp: float = None,
                   end_timestamp: float = None) -> List[Tuple[float, float]]:
    """
    removes any gaps where end time <= start time, consolidates overlapping gaps, and ensures that no gap
    starts or ends outside the range [start_timestamp, end_timestamp].  All timestamps are in
    microseconds since epoch UTC

    :param gaps: list of gaps to check
    :param start_timestamp: lowest possible timestamp for a gap to start at
    :param end_timestamp: highest possible timestamp for a gap to end at
    :return: list of correct, valid gaps
    """
    return_gaps: List[Tuple[float, float]] = []
    for gap in gaps:
        if start_timestamp:
            gap = (np.max([start_timestamp, gap[0]]), np.max([start_timestamp, gap[1]]))
        if end_timestamp:
            gap = (np.min([end_timestamp, gap[0]]), np.min([end_timestamp, gap[1]]))
        if gap[0] < gap[1]:
            if len(return_gaps) < 1:
                return_gaps.append(gap)
            for a, r_g in enumerate(return_gaps):
                if (gap[0] < r_g[0] and gap[1] < r_g[0]) or (gap[0] > r_g[1] and gap[1] > r_g[1]):
                    return_gaps.append(gap)
                    break
                else:
                    if gap[0] < r_g[0] < gap[1]:
                        r_g = (gap[0], r_g[1])
                    if gap[0] < r_g[1] < gap[1]:
                        r_g = (r_g[0], gap[1])
                    return_gaps[a] = r_g
    return return_gaps


def pad_data(
        expected_start: float,
        expected_end: float,
        data_df: pd.DataFrame,
        sample_interval_micros: float,
) -> pd.DataFrame:
    """
    Pad the start and end of the dataframe with np.nan

    :param expected_start: timestamp indicating start time of the data to pad from
    :param expected_end: timestamp indicating end time of the data to pad from
    :param data_df: dataframe with timestamps as column "timestamps"
    :param sample_interval_micros: constant sample interval in microseconds
    :return: dataframe padded with np.nans in front and back to meet full size of expected start and end
    """
    # extract the necessary information to pad the data
    data_time_stamps = data_df["timestamps"].to_numpy()
    first_data_timestamp = data_time_stamps[0]
    last_data_timestamp = data_time_stamps[-1]
    result_df = data_df.copy()
    result_before_update_length = len(result_df) - 1
    # FRONT/END GAP FILL!  calculate the samples missing based on inputs
    if expected_start < first_data_timestamp:
        start_diff = first_data_timestamp - expected_start
        num_missing_samples = int(start_diff / sample_interval_micros)
        if num_missing_samples > 0:
            # add the gap data to the result dataframe
            result_df = add_dataless_timestamps_to_df(
                result_df,
                0,
                sample_interval_micros,
                num_missing_samples,
                True
            )
    if expected_end > last_data_timestamp:
        last_diff = expected_end - last_data_timestamp
        num_missing_samples = int(last_diff / sample_interval_micros)
        if num_missing_samples > 0:
            # add the gap data to the result dataframe
            result_df = add_dataless_timestamps_to_df(
                result_df,
                result_before_update_length,
                sample_interval_micros,
                num_missing_samples
            )
    return result_df.sort_values("timestamps", ignore_index=True)


def fill_gaps(
        data_df: pd.DataFrame,
        gaps: List[Tuple[float, float]],
        sample_interval_micros: float,
        copy: bool = False
) -> pd.DataFrame:
    """
    fills gaps in the dataframe with np.nan or copied values, creating new timestamps based on the
    known sample interval

    :param data_df: dataframe with timestamps as column "timestamps"
    :param gaps: list of tuples of known non-inclusive start and end timestamps of the gaps
    :param sample_interval_micros: known sample interval of the data points
    :param copy: if True, copy the edge data point into the gap, otherwise fill with np.nan, default False
    :return: dataframe without gaps
    """
    # extract the necessary information to compute gap size and gap timestamps
    data_time_stamps = data_df["timestamps"].to_numpy()
    if len(data_time_stamps) > 1:
        result_df = data_df.copy()
        data_duration = data_time_stamps[-1] - data_time_stamps[0]
        expected_samples = (np.floor(data_duration / sample_interval_micros)
                            + (1 if data_duration % sample_interval_micros >=
                               sample_interval_micros * DEFAULT_GAP_UPPER_LIMIT else 0)) + 1
        if expected_samples > len(data_time_stamps):
            if copy:
                pcm = DataPointCreationMode["COPY"]
            else:
                pcm = DataPointCreationMode["NAN"]
            # make it safe to alter the gap values
            my_gaps = check_gap_list(gaps, data_time_stamps[0], data_time_stamps[-1])
            for gap in my_gaps:
                # if timestamps are around gaps, we have to update the values
                before_start = np.argwhere([t <= gap[0] for t in data_time_stamps])
                after_end = np.argwhere([t >= gap[1] for t in data_time_stamps])
                if len(before_start) > 0:
                    before_start = before_start[-1][0]
                    # sim = gap[0] - data_time_stamps[before_start]
                    # result_df = add_data_points_to_df(result_df, before_start, sim, point_creation_mode=pcm)
                    gap = (data_time_stamps[before_start], gap[1])
                else:
                    before_start = None
                if len(after_end) > 0:
                    after_end = after_end[0][0]
                    # sim = gap[1] - data_time_stamps[after_end]
                    gap = (gap[0], data_time_stamps[after_end])
                else:
                    after_end = None
                num_new_points = int((gap[1] - gap[0]) / sample_interval_micros) - 1
                if before_start is not None:
                    result_df = add_data_points_to_df(result_df, before_start, sample_interval_micros,
                                                      num_new_points, pcm)
                elif after_end is not None:
                    result_df = add_data_points_to_df(result_df, after_end, -sample_interval_micros,
                                                      num_new_points, pcm)
        return result_df.sort_values("timestamps", ignore_index=True)
    return data_df


def fill_audio_gaps(
        packet_data: List[Tuple[float, np.array]],
        sample_interval_micros: float,
        gap_upper_limit: float = DEFAULT_GAP_UPPER_LIMIT,
        gap_lower_limit: float = DEFAULT_GAP_LOWER_LIMIT
) -> GapPadResult:
    """
    creates an audio dataframe from the packet data, filling gaps between packets with np.nan and
    interpolating timestamps based on the expected sample interval
      * ignores gaps with duration less than or equal to packet length * gap_lower_limit
      * converts gaps with duration greater than or equal to packet length * gap_upper_limit into a multiple of
        packet length

    :param packet_data: list of tuples, each tuple containing two pieces of packet information:
        * packet_start_timestamps: float of packet start timestamp in microseconds
        * audio_data: array of data points
    :param sample_interval_micros: sample interval in microseconds
    :param gap_upper_limit: percentage of packet length required to confirm gap is at least 1 packet,
                            default DEFAULT_GAP_UPPER_LIMIT
    :param gap_lower_limit: percentage of packet length required to disregard gap, default DEFAULT_GAP_LOWER_LIMIT
    :return: GapPadResult containing the dataframe without gaps and the list of non-inclusive (start, end)
             timestamps of the gaps
    """
    result_array = [[], [], []]
    last_data_timestamp: Optional[float] = None
    gaps = []
    for packet in packet_data:
        samples_in_packet = len(packet[1])
        start_ts = packet[0]
        packet_length = sample_interval_micros * samples_in_packet
        if last_data_timestamp:
            last_data_timestamp += sample_interval_micros
            # check if start_ts is close to the last timestamp in data_timestamps
            last_timestamp_diff = start_ts - last_data_timestamp
            if last_timestamp_diff > gap_lower_limit * packet_length:
                fractional_packet, num_packets = modf(last_timestamp_diff /
                                                      (samples_in_packet * sample_interval_micros))
                if fractional_packet >= gap_upper_limit:
                    num_samples = samples_in_packet * (num_packets + 1)
                else:
                    num_samples = np.max([np.floor((fractional_packet + num_packets) * samples_in_packet), 1])
                gap_ts = calc_evenly_sampled_timestamps(last_data_timestamp, num_samples, sample_interval_micros)
                gap_array = [gap_ts, np.full(len(gap_ts), np.nan)]
                start_ts = gap_ts[-1] + sample_interval_micros
                gaps.append((last_data_timestamp, start_ts))
                result_array[0].extend(gap_array[0])
                result_array[1].extend(gap_array[0])
                result_array[2].extend(gap_array[1])
            elif last_timestamp_diff < -gap_lower_limit * packet_length:
                result = GapPadResult()
                result.add_error(f"Packet start timestamp: {dtu.microseconds_to_seconds(start_ts)} "
                                 f"is before last timestamp of previous "
                                 f"packet: {dtu.microseconds_to_seconds(last_data_timestamp)}")
                return result
        estimated_ts = calc_evenly_sampled_timestamps(start_ts, samples_in_packet, sample_interval_micros)
        last_data_timestamp = estimated_ts[-1]
        result_array[0].extend(estimated_ts)
        result_array[1].extend(estimated_ts)
        result_array[2].extend(packet[1])
    return GapPadResult(pd.DataFrame(np.transpose(result_array), columns=AUDIO_DF_COLUMNS), gaps)


def add_data_points_to_df(dataframe: pd.DataFrame,
                          start_index: int,
                          sample_interval_micros: float,
                          num_samples_to_add: int = 1,
                          point_creation_mode: DataPointCreationMode = DataPointCreationMode.COPY,
                          ) -> pd.DataFrame:
    """
    adds data points to the end of the dataframe, starting from the index specified.
        Note:
            * dataframe must not be empty
            * start_index must be non-negative and less than the length of dataframe
            * num_samples_to_add must be greater than 0
            * points are added onto the end and the result is not sorted
        Options for point_creation_mode are:
            * NAN: default values and nans
            * COPY: copies of the start data point
            * INTERPOLATE: interpolated values between start data point and adjacent point

    :param dataframe: dataframe to add data points to
    :param start_index: index of the dataframe to use as starting point for creating new values
    :param sample_interval_micros: sample interval in microseconds of the timestamps; use negative values to
                                    add points before the start_index
    :param num_samples_to_add: the number of timestamps to create, default 1
    :param point_creation_mode: the mode of point creation to use
    :return: updated dataframe with synthetic data points
    """
    if len(dataframe) > start_index and len(dataframe) > 0 and num_samples_to_add > 0:
        start_timestamp = dataframe["timestamps"].iloc[start_index]
        t = start_timestamp + np.arange(1, num_samples_to_add + 1) * sample_interval_micros
        # interpolate mode only uses the first created timestamp
        if point_creation_mode == DataPointCreationMode.COPY:
            empty_df = dataframe.iloc[start_index].copy()
            for column_index in dataframe.columns:
                if column_index in NON_INTERPOLATED_COLUMNS:
                    empty_df[column_index] = np.nan
            empty_df["timestamps"] = t[0]
        elif point_creation_mode == DataPointCreationMode.INTERPOLATE:
            start_point = dataframe.iloc[start_index]
            numeric_start = start_point[[col for col in dataframe.columns
                                         if col not in NON_INTERPOLATED_COLUMNS + NON_NUMERIC_COLUMNS]]
            non_numeric_start = start_point[[col for col in dataframe.columns if col in NON_NUMERIC_COLUMNS]]
            end_point = dataframe.iloc[start_index + (1 if sample_interval_micros > 0 else -1)]
            numeric_end = end_point[[col for col in dataframe.columns
                                     if col not in NON_INTERPOLATED_COLUMNS + NON_NUMERIC_COLUMNS]]
            non_numeric_end = end_point[[col for col in dataframe.columns if col in NON_NUMERIC_COLUMNS]]
            if np.abs(start_point["timestamps"] - t[0]) <= np.abs(end_point["timestamps"] - t[0]):
                non_numeric_diff = non_numeric_start
            else:
                non_numeric_diff = non_numeric_end
            numeric_diff = numeric_end - numeric_start
            numeric_diff = \
                (numeric_diff / numeric_diff["timestamps"]) * \
                (t - numeric_start) + numeric_start
            empty_df = pd.concat([numeric_diff, non_numeric_diff])
        else:
            empty_df = pd.DataFrame(np.full([num_samples_to_add, len(dataframe.columns)], np.nan),
                                    columns=dataframe.columns)
            for column_index in dataframe.columns:
                if column_index == "timestamps":
                    empty_df[column_index] = t
                elif column_index == "location_provider":
                    empty_df[column_index] = [LocationProvider["UNKNOWN"].value for i in range(num_samples_to_add)]
                elif column_index == "image_codec":
                    empty_df[column_index] = [ImageCodec["UNKNOWN"].value for i in range(num_samples_to_add)]
                elif column_index == "audio_codec":
                    empty_df[column_index] = [AudioCodec["UNKNOWN"].value for i in range(num_samples_to_add)]
                elif column_index == "network_type":
                    empty_df[column_index] = [NetworkType["UNKNOWN_NETWORK"].value for i in range(num_samples_to_add)]
                elif column_index == "power_state":
                    empty_df[column_index] = [PowerState["UNKNOWN_POWER_STATE"].value
                                              for i in range(num_samples_to_add)]
                elif column_index == "cell_service":
                    empty_df[column_index] = [CellServiceState["UNKNOWN"].value for i in range(num_samples_to_add)]
        dataframe = dataframe.append(empty_df, ignore_index=True)

    return dataframe


def add_dataless_timestamps_to_df(dataframe: pd.DataFrame,
                                  start_index: int,
                                  sample_interval_micros: float,
                                  num_samples_to_add: int,
                                  add_to_start: bool = False,
                                  copy: bool = True,
                                  ) -> pd.DataFrame:
    """
    adds dataless timestamps directly to a dataframe that already contains data
      Note:
        * dataframe must not be empty
        * start_index must be non-negative and less than the length of dataframe
        * num_samples_to_add must be greater than 0
        * the points are added onto the end and the result is not sorted

    :param dataframe: dataframe to add dataless timestamps to
    :param start_index: index of the dataframe to use as starting point for creating new values
    :param sample_interval_micros: sample interval in microseconds of the timestamps
    :param num_samples_to_add: the number of timestamps to create
    :param add_to_start: if True, subtracts sample_interval_micros from start_timestamp, default False
    :param copy: if True, copy the value of the start point when creating new points, default True
    :return: updated dataframe with synthetic data points
    """
    if len(dataframe) > start_index and len(dataframe) > 0 and num_samples_to_add > 0:
        start_timestamp = dataframe["timestamps"].iloc[start_index]
        dataframe = dataframe.append(
            create_dataless_timestamps_df(start_timestamp, sample_interval_micros,
                                          dataframe.columns, num_samples_to_add, add_to_start),
            ignore_index=True)
    return dataframe


def create_dataless_timestamps_df(
        start_timestamp: float,
        sample_interval_micros: float,
        columns: pd.Index,
        num_samples_to_add: int,
        add_to_start: bool = False,
) -> pd.DataFrame:
    """
    Creates an empty dataframe with num_samples_to_add timestamps, using columns as the columns
    the first timestamp created is one sample_interval_micros from the start_timestamp

    :param start_timestamp: timestamp in microseconds since epoch UTC to start calculating other timestamps from
    :param sample_interval_micros: fixed sample interval in microseconds
    :param columns: the columns to create in the dataframe
    :param num_samples_to_add: the number of timestamps to create
    :param add_to_start: if True, subtracts sample_interval_micros from start_timestamp, default False
    :return: dataframe with timestamps and no data
    """
    empty_df = pd.DataFrame(np.full([num_samples_to_add, len(columns)], np.nan), columns=columns)
    enum_samples = {
        "location_provider": LocationProvider["UNKNOWN"].value,
        "image_codec": ImageCodec["UNKNOWN"].value,
        "audio_codec": AudioCodec["UNKNOWN"].value,
        "network_type": NetworkType["UNKNOWN_NETWORK"].value,
        "power_state": PowerState["UNKNOWN_POWER_STATE"].value,
        "cell_service": CellServiceState["UNKNOWN"].value
    }
    if num_samples_to_add > 0:
        if add_to_start:
            sample_interval_micros = -sample_interval_micros
        t = start_timestamp + np.arange(1, num_samples_to_add + 1) * sample_interval_micros
        for column_index in columns:
            if column_index == "timestamps":
                empty_df[column_index] = t
            elif column_index in enum_samples.keys():
                empty_df[column_index] = [enum_samples[column_index] for i in range(num_samples_to_add)]
            # elif column_index == "location_provider":
            #     empty_df[column_index] = [LocationProvider.UNKNOWN for i in range(num_samples_to_add)]
            # elif column_index == "image_codec":
            #     empty_df[column_index] = [ImageCodec.UNKNOWN for i in range(num_samples_to_add)]
            # elif column_index == "audio_codec":
            #     empty_df[column_index] = [AudioCodec.UNKNOWN for i in range(num_samples_to_add)]
            # elif column_index == "network_type":
            #     empty_df[column_index] = [NetworkType.UNKNOWN_NETWORK for i in range(num_samples_to_add)]
            # elif column_index == "power_state":
            #     empty_df[column_index] = [PowerState.UNKNOWN_POWER_STATE for i in range(num_samples_to_add)]
            # elif column_index == "cell_service":
            #     empty_df[column_index] = [CellServiceState.UNKNOWN for i in range(num_samples_to_add)]
    return empty_df

Functions

def add_data_points_to_df(dataframe: pandas.core.frame.DataFrame, start_index: int, sample_interval_micros: float, num_samples_to_add: int = 1, point_creation_mode: DataPointCreationMode = DataPointCreationMode.COPY) ‑> pandas.core.frame.DataFrame

adds data points to the end of the dataframe, starting from the index specified.
Note:
  * dataframe must not be empty
  * start_index must be non-negative and less than the length of dataframe
  * num_samples_to_add must be greater than 0
  * points are added onto the end and the result is not sorted
Options for point_creation_mode are:
  * NAN: default values and nans
  * COPY: copies of the start data point
  * INTERPOLATE: interpolated values between start data point and adjacent point

:param dataframe: dataframe to add data points to
:param start_index: index of the dataframe to use as starting point for creating new values
:param sample_interval_micros: sample interval in microseconds of the timestamps; use negative values to add points before the start_index
:param num_samples_to_add: the number of timestamps to create, default 1
:param point_creation_mode: the mode of point creation to use
:return: updated dataframe with synthetic data points
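
A minimal usage sketch (not part of the generated docs; the "pressure" column and the values are illustrative, and it assumes a pandas version that still provides DataFrame.append, as the source does):

import pandas as pd
import redvox.common.gap_and_pad_utils as gpu

# three points with a gap between 100 and 500 microseconds
df = pd.DataFrame({"timestamps": [0., 100., 500.], "pressure": [1., 2., 3.]})
# add three NaN points after index 1, then sort by timestamp
result = gpu.add_data_points_to_df(df, start_index=1, sample_interval_micros=100.,
                                   num_samples_to_add=3,
                                   point_creation_mode=gpu.DataPointCreationMode.NAN)
result = result.sort_values("timestamps", ignore_index=True)
# result["timestamps"] -> [0, 100, 200, 300, 400, 500]; "pressure" is NaN for the new rows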

Expand source code
def add_data_points_to_df(dataframe: pd.DataFrame,
                          start_index: int,
                          sample_interval_micros: float,
                          num_samples_to_add: int = 1,
                          point_creation_mode: DataPointCreationMode = DataPointCreationMode.COPY,
                          ) -> pd.DataFrame:
    """
    adds data points to the end of the dataframe, starting from the index specified.
        Note:
            * dataframe must not be empty
            * start_index must be non-negative and less than the length of dataframe
            * num_samples_to_add must be greater than 0
            * points are added onto the end and the result is not sorted
        Options for point_creation_mode are:
            * NAN: default values and nans
            * COPY: copies of the start data point
            * INTERPOLATE: interpolated values between start data point and adjacent point

    :param dataframe: dataframe to add data points to
    :param start_index: index of the dataframe to use as starting point for creating new values
    :param sample_interval_micros: sample interval in microseconds of the timestamps; use negative values to
                                    add points before the start_index
    :param num_samples_to_add: the number of timestamps to create, default 1
    :param point_creation_mode: the mode of point creation to use
    :return: updated dataframe with synthetic data points
    """
    if len(dataframe) > start_index and len(dataframe) > 0 and num_samples_to_add > 0:
        start_timestamp = dataframe["timestamps"].iloc[start_index]
        t = start_timestamp + np.arange(1, num_samples_to_add + 1) * sample_interval_micros
        # interpolate mode only uses the first created timestamp
        if point_creation_mode == DataPointCreationMode.COPY:
            empty_df = dataframe.iloc[start_index].copy()
            for column_index in dataframe.columns:
                if column_index in NON_INTERPOLATED_COLUMNS:
                    empty_df[column_index] = np.nan
            empty_df["timestamps"] = t[0]
        elif point_creation_mode == DataPointCreationMode.INTERPOLATE:
            start_point = dataframe.iloc[start_index]
            numeric_start = start_point[[col for col in dataframe.columns
                                         if col not in NON_INTERPOLATED_COLUMNS + NON_NUMERIC_COLUMNS]]
            non_numeric_start = start_point[[col for col in dataframe.columns if col in NON_NUMERIC_COLUMNS]]
            end_point = dataframe.iloc[start_index + (1 if sample_interval_micros > 0 else -1)]
            numeric_end = end_point[[col for col in dataframe.columns
                                     if col not in NON_INTERPOLATED_COLUMNS + NON_NUMERIC_COLUMNS]]
            non_numeric_end = end_point[[col for col in dataframe.columns if col in NON_NUMERIC_COLUMNS]]
            if np.abs(start_point["timestamps"] - t[0]) <= np.abs(end_point["timestamps"] - t[0]):
                non_numeric_diff = non_numeric_start
            else:
                non_numeric_diff = non_numeric_end
            numeric_diff = numeric_end - numeric_start
            numeric_diff = \
                (numeric_diff / numeric_diff["timestamps"]) * \
                (t - numeric_start) + numeric_start
            empty_df = pd.concat([numeric_diff, non_numeric_diff])
        else:
            empty_df = pd.DataFrame(np.full([num_samples_to_add, len(dataframe.columns)], np.nan),
                                    columns=dataframe.columns)
            for column_index in dataframe.columns:
                if column_index == "timestamps":
                    empty_df[column_index] = t
                elif column_index == "location_provider":
                    empty_df[column_index] = [LocationProvider["UNKNOWN"].value for i in range(num_samples_to_add)]
                elif column_index == "image_codec":
                    empty_df[column_index] = [ImageCodec["UNKNOWN"].value for i in range(num_samples_to_add)]
                elif column_index == "audio_codec":
                    empty_df[column_index] = [AudioCodec["UNKNOWN"].value for i in range(num_samples_to_add)]
                elif column_index == "network_type":
                    empty_df[column_index] = [NetworkType["UNKNOWN_NETWORK"].value for i in range(num_samples_to_add)]
                elif column_index == "power_state":
                    empty_df[column_index] = [PowerState["UNKNOWN_POWER_STATE"].value
                                              for i in range(num_samples_to_add)]
                elif column_index == "cell_service":
                    empty_df[column_index] = [CellServiceState["UNKNOWN"].value for i in range(num_samples_to_add)]
        dataframe = dataframe.append(empty_df, ignore_index=True)

    return dataframe
def add_dataless_timestamps_to_df(dataframe: pandas.core.frame.DataFrame, start_index: int, sample_interval_micros: float, num_samples_to_add: int, add_to_start: bool = False, copy: bool = True) ‑> pandas.core.frame.DataFrame

adds dataless timestamps directly to a dataframe that already contains data
Note:
  * dataframe must not be empty
  * start_index must be non-negative and less than the length of dataframe
  * num_samples_to_add must be greater than 0
  * the points are added onto the end and the result is not sorted

:param dataframe: dataframe to add dataless timestamps to
:param start_index: index of the dataframe to use as starting point for creating new values
:param sample_interval_micros: sample interval in microseconds of the timestamps
:param num_samples_to_add: the number of timestamps to create
:param add_to_start: if True, subtracts sample_interval_micros from start_timestamp, default False
:param copy: if True, copy the value of the start point when creating new points, default True
:return: updated dataframe with synthetic data points
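
A minimal usage sketch (illustrative column names and values; not part of the generated docs):

import pandas as pd
import redvox.common.gap_and_pad_utils as gpu

df = pd.DataFrame({"timestamps": [0., 100., 200.], "pressure": [1., 2., 3.]})
# append two dataless (NaN) rows after the last point
padded = gpu.add_dataless_timestamps_to_df(df, start_index=2,
                                           sample_interval_micros=100.,
                                           num_samples_to_add=2)
# padded["timestamps"] -> [0, 100, 200, 300, 400]; new "pressure" values are NaN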

Expand source code
def add_dataless_timestamps_to_df(dataframe: pd.DataFrame,
                                  start_index: int,
                                  sample_interval_micros: float,
                                  num_samples_to_add: int,
                                  add_to_start: bool = False,
                                  copy: bool = True,
                                  ) -> pd.DataFrame:
    """
    adds dataless timestamps directly to a dataframe that already contains data
      Note:
        * dataframe must not be empty
        * start_index must be non-negative and less than the length of dataframe
        * num_samples_to_add must be greater than 0
        * the points are added onto the end and the result is not sorted

    :param dataframe: dataframe to add dataless timestamps to
    :param start_index: index of the dataframe to use as starting point for creating new values
    :param sample_interval_micros: sample interval in microseconds of the timestamps
    :param num_samples_to_add: the number of timestamps to create
    :param add_to_start: if True, subtracts sample_interval_micros from start_timestamp, default False
    :param copy: if True, copy the value of the start point when creating new points, default True
    :return: updated dataframe with synthetic data points
    """
    if len(dataframe) > start_index and len(dataframe) > 0 and num_samples_to_add > 0:
        start_timestamp = dataframe["timestamps"].iloc[start_index]
        dataframe = dataframe.append(
            create_dataless_timestamps_df(start_timestamp, sample_interval_micros,
                                          dataframe.columns, num_samples_to_add, add_to_start),
            ignore_index=True)
    return dataframe
def calc_evenly_sampled_timestamps(start: float, samples: int, sample_interval_micros: float) ‑> np.array

given a start time, calculates `samples` evenly spaced timestamps separated by sample_interval_micros

:param start: float, start timestamp in microseconds
:param samples: int, number of samples
:param sample_interval_micros: float, sample interval in microseconds
:return: np.array with number of samples timestamps, evenly spaced starting at start
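
A minimal usage sketch (illustrative values; not part of the generated docs):

import redvox.common.gap_and_pad_utils as gpu

# 5 timestamps at an 8000 Hz sample rate (125 microseconds apart), starting at 0
ts = gpu.calc_evenly_sampled_timestamps(start=0., samples=5, sample_interval_micros=125.)
# ts -> array([  0., 125., 250., 375., 500.])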

Expand source code
def calc_evenly_sampled_timestamps(
        start: float, samples: int, sample_interval_micros: float
) -> np.array:
    """
    given a start time, calculates `samples` evenly spaced timestamps separated by sample_interval_micros

    :param start: float, start timestamp in microseconds
    :param samples: int, number of samples
    :param sample_interval_micros: float, sample interval in microseconds
    :return: np.array with number of samples timestamps, evenly spaced starting at start
    """
    return start + (np.arange(0, samples) * sample_interval_micros)
def check_gap_list(gaps: List[Tuple[float, float]], start_timestamp: float = None, end_timestamp: float = None) ‑> List[Tuple[float, float]]

removes any gaps where end time <= start time, consolidates overlapping gaps, and ensures that no gap starts or ends outside the range [start_timestamp, end_timestamp]. All timestamps are in microseconds since epoch UTC

:param gaps: list of gaps to check
:param start_timestamp: lowest possible timestamp for a gap to start at
:param end_timestamp: highest possible timestamp for a gap to end at
:return: list of correct, valid gaps
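
A minimal usage sketch (illustrative values; not part of the generated docs):

import redvox.common.gap_and_pad_utils as gpu

gaps = [(10., 20.), (15., 30.), (40., 35.)]   # third gap is invalid (end <= start)
cleaned = gpu.check_gap_list(gaps, start_timestamp=0., end_timestamp=25.)
# overlapping gaps are merged and clipped to the bounds: [(10.0, 25.0)]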

Expand source code
def check_gap_list(gaps: List[Tuple[float, float]], start_timestamp: float = None,
                   end_timestamp: float = None) -> List[Tuple[float, float]]:
    """
    removes any gaps where end time <= start time, consolidates overlapping gaps, and ensures that no gap
    starts or ends outside the range [start_timestamp, end_timestamp].  All timestamps are in
    microseconds since epoch UTC

    :param gaps: list of gaps to check
    :param start_timestamp: lowest possible timestamp for a gap to start at
    :param end_timestamp: highest possible timestamp for a gap to end at
    :return: list of correct, valid gaps
    """
    return_gaps: List[Tuple[float, float]] = []
    for gap in gaps:
        if start_timestamp:
            gap = (np.max([start_timestamp, gap[0]]), np.max([start_timestamp, gap[1]]))
        if end_timestamp:
            gap = (np.min([end_timestamp, gap[0]]), np.min([end_timestamp, gap[1]]))
        if gap[0] < gap[1]:
            if len(return_gaps) < 1:
                return_gaps.append(gap)
            for a, r_g in enumerate(return_gaps):
                if (gap[0] < r_g[0] and gap[1] < r_g[0]) or (gap[0] > r_g[1] and gap[1] > r_g[1]):
                    return_gaps.append(gap)
                    break
                else:
                    if gap[0] < r_g[0] < gap[1]:
                        r_g = (gap[0], r_g[1])
                    if gap[0] < r_g[1] < gap[1]:
                        r_g = (r_g[0], gap[1])
                    return_gaps[a] = r_g
    return return_gaps
def create_dataless_timestamps_df(start_timestamp: float, sample_interval_micros: float, columns: pandas.core.indexes.base.Index, num_samples_to_add: int, add_to_start: bool = False) ‑> pandas.core.frame.DataFrame

Creates an empty dataframe with num_samples_to_add timestamps, using columns as the columns; the first timestamp created is one sample_interval_micros from the start_timestamp

:param start_timestamp: timestamp in microseconds since epoch UTC to start calculating other timestamps from
:param sample_interval_micros: fixed sample interval in microseconds
:param columns: the columns to create in the dataframe
:param num_samples_to_add: the number of timestamps to create
:param add_to_start: if True, subtracts sample_interval_micros from start_timestamp, default False
:return: dataframe with timestamps and no data
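
A minimal usage sketch (the "pressure" column is illustrative; not part of the generated docs):

import pandas as pd
import redvox.common.gap_and_pad_utils as gpu

df = gpu.create_dataless_timestamps_df(start_timestamp=1000.,
                                       sample_interval_micros=250.,
                                       columns=pd.Index(["timestamps", "pressure"]),
                                       num_samples_to_add=3)
# df["timestamps"] -> [1250.0, 1500.0, 1750.0]; "pressure" is NaN in every row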

Expand source code
def create_dataless_timestamps_df(
        start_timestamp: float,
        sample_interval_micros: float,
        columns: pd.Index,
        num_samples_to_add: int,
        add_to_start: bool = False,
) -> pd.DataFrame:
    """
    Creates an empty dataframe with num_samples_to_add timestamps, using columns as the columns
    the first timestamp created is one sample_interval_micros from the start_timestamp

    :param start_timestamp: timestamp in microseconds since epoch UTC to start calculating other timestamps from
    :param sample_interval_micros: fixed sample interval in microseconds
    :param columns: the columns to create in the dataframe
    :param num_samples_to_add: the number of timestamps to create
    :param add_to_start: if True, subtracts sample_interval_micros from start_timestamp, default False
    :return: dataframe with timestamps and no data
    """
    empty_df = pd.DataFrame(np.full([num_samples_to_add, len(columns)], np.nan), columns=columns)
    enum_samples = {
        "location_provider": LocationProvider["UNKNOWN"].value,
        "image_codec": ImageCodec["UNKNOWN"].value,
        "audio_codec": AudioCodec["UNKNOWN"].value,
        "network_type": NetworkType["UNKNOWN_NETWORK"].value,
        "power_state": PowerState["UNKNOWN_POWER_STATE"].value,
        "cell_service": CellServiceState["UNKNOWN"].value
    }
    if num_samples_to_add > 0:
        if add_to_start:
            sample_interval_micros = -sample_interval_micros
        t = start_timestamp + np.arange(1, num_samples_to_add + 1) * sample_interval_micros
        for column_index in columns:
            if column_index == "timestamps":
                empty_df[column_index] = t
            elif column_index in enum_samples.keys():
                empty_df[column_index] = [enum_samples[column_index] for i in range(num_samples_to_add)]
            # elif column_index == "location_provider":
            #     empty_df[column_index] = [LocationProvider.UNKNOWN for i in range(num_samples_to_add)]
            # elif column_index == "image_codec":
            #     empty_df[column_index] = [ImageCodec.UNKNOWN for i in range(num_samples_to_add)]
            # elif column_index == "audio_codec":
            #     empty_df[column_index] = [AudioCodec.UNKNOWN for i in range(num_samples_to_add)]
            # elif column_index == "network_type":
            #     empty_df[column_index] = [NetworkType.UNKNOWN_NETWORK for i in range(num_samples_to_add)]
            # elif column_index == "power_state":
            #     empty_df[column_index] = [PowerState.UNKNOWN_POWER_STATE for i in range(num_samples_to_add)]
            # elif column_index == "cell_service":
            #     empty_df[column_index] = [CellServiceState.UNKNOWN for i in range(num_samples_to_add)]
    return empty_df
def fill_audio_gaps(packet_data: List[Tuple[float, np.array]], sample_interval_micros: float, gap_upper_limit: float = 0.8, gap_lower_limit: float = 0.02) ‑> GapPadResult

creates an audio dataframe from the packet data, filling gaps between packets with np.nan and interpolating timestamps based on the expected sample interval
  * ignores gaps with duration less than or equal to packet length * gap_lower_limit
  * converts gaps with duration greater than or equal to packet length * gap_upper_limit into a multiple of packet length

:param packet_data: list of tuples, each tuple containing two pieces of packet information:
  * packet_start_timestamps: float of packet start timestamp in microseconds
  * audio_data: array of data points
:param sample_interval_micros: sample interval in microseconds
:param gap_upper_limit: percentage of packet length required to confirm gap is at least 1 packet, default DEFAULT_GAP_UPPER_LIMIT
:param gap_lower_limit: percentage of packet length required to disregard gap, default DEFAULT_GAP_LOWER_LIMIT
:return: GapPadResult containing the dataframe without gaps and the list of non-inclusive (start, end) timestamps of the gaps
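
A minimal usage sketch (illustrative packet values; not part of the generated docs):

import numpy as np
import redvox.common.gap_and_pad_utils as gpu

# two 4-sample packets at a 1000 microsecond interval; one packet's worth of data is missing between them
packets = [(0., np.array([1., 2., 3., 4.])), (8000., np.array([5., 6., 7., 8.]))]
result = gpu.fill_audio_gaps(packets, sample_interval_micros=1000.)
# result.result_df has 12 rows (the missing packet becomes NaN samples)
# result.gaps -> [(4000.0, 8000.0)]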

Expand source code
def fill_audio_gaps(
        packet_data: List[Tuple[float, np.array]],
        sample_interval_micros: float,
        gap_upper_limit: float = DEFAULT_GAP_UPPER_LIMIT,
        gap_lower_limit: float = DEFAULT_GAP_LOWER_LIMIT
) -> GapPadResult:
    """
    creates an audio dataframe from the packet data, filling gaps between packets with np.nan and
    interpolating timestamps based on the expected sample interval
      * ignores gaps with duration less than or equal to packet length * gap_lower_limit
      * converts gaps with duration greater than or equal to packet length * gap_upper_limit into a multiple of
        packet length

    :param packet_data: list of tuples, each tuple containing two pieces of packet information:
        * packet_start_timestamps: float of packet start timestamp in microseconds
        * audio_data: array of data points
    :param sample_interval_micros: sample interval in microseconds
    :param gap_upper_limit: percentage of packet length required to confirm gap is at least 1 packet,
                            default DEFAULT_GAP_UPPER_LIMIT
    :param gap_lower_limit: percentage of packet length required to disregard gap, default DEFAULT_GAP_LOWER_LIMIT
    :return: GapPadResult containing the dataframe without gaps and the list of non-inclusive (start, end)
             timestamps of the gaps
    """
    result_array = [[], [], []]
    last_data_timestamp: Optional[float] = None
    gaps = []
    for packet in packet_data:
        samples_in_packet = len(packet[1])
        start_ts = packet[0]
        packet_length = sample_interval_micros * samples_in_packet
        if last_data_timestamp:
            last_data_timestamp += sample_interval_micros
            # check if start_ts is close to the last timestamp in data_timestamps
            last_timestamp_diff = start_ts - last_data_timestamp
            if last_timestamp_diff > gap_lower_limit * packet_length:
                fractional_packet, num_packets = modf(last_timestamp_diff /
                                                      (samples_in_packet * sample_interval_micros))
                if fractional_packet >= gap_upper_limit:
                    num_samples = samples_in_packet * (num_packets + 1)
                else:
                    num_samples = np.max([np.floor((fractional_packet + num_packets) * samples_in_packet), 1])
                gap_ts = calc_evenly_sampled_timestamps(last_data_timestamp, num_samples, sample_interval_micros)
                gap_array = [gap_ts, np.full(len(gap_ts), np.nan)]
                start_ts = gap_ts[-1] + sample_interval_micros
                gaps.append((last_data_timestamp, start_ts))
                result_array[0].extend(gap_array[0])
                result_array[1].extend(gap_array[0])
                result_array[2].extend(gap_array[1])
            elif last_timestamp_diff < -gap_lower_limit * packet_length:
                result = GapPadResult()
                result.add_error(f"Packet start timestamp: {dtu.microseconds_to_seconds(start_ts)} "
                                 f"is before last timestamp of previous "
                                 f"packet: {dtu.microseconds_to_seconds(last_data_timestamp)}")
                return result
        estimated_ts = calc_evenly_sampled_timestamps(start_ts, samples_in_packet, sample_interval_micros)
        last_data_timestamp = estimated_ts[-1]
        result_array[0].extend(estimated_ts)
        result_array[1].extend(estimated_ts)
        result_array[2].extend(packet[1])
    return GapPadResult(pd.DataFrame(np.transpose(result_array), columns=AUDIO_DF_COLUMNS), gaps)
def fill_gaps(data_df: pandas.core.frame.DataFrame, gaps: List[Tuple[float, float]], sample_interval_micros: float, copy: bool = False) ‑> pandas.core.frame.DataFrame

fills gaps in the dataframe with np.nan or copied values, creating new timestamps based on the known sample interval

:param data_df: dataframe with timestamps as column "timestamps"
:param gaps: list of tuples of known non-inclusive start and end timestamps of the gaps
:param sample_interval_micros: known sample interval of the data points
:param copy: if True, copy the edge data point into the gap, otherwise fill with np.nan, default False
:return: dataframe without gaps
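
A minimal usage sketch (the "pressure" column and values are illustrative; not part of the generated docs):

import pandas as pd
import redvox.common.gap_and_pad_utils as gpu

# a known gap between 200 and 600 microseconds at a 100 microsecond interval
df = pd.DataFrame({"timestamps": [0., 100., 200., 600., 700.],
                   "pressure": [1., 2., 3., 4., 5.]})
filled = gpu.fill_gaps(df, gaps=[(200., 600.)], sample_interval_micros=100.)
# filled["timestamps"] -> [0, 100, 200, 300, 400, 500, 600, 700]
# "pressure" is NaN for the three filled points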

Expand source code
def fill_gaps(
        data_df: pd.DataFrame,
        gaps: List[Tuple[float, float]],
        sample_interval_micros: float,
        copy: bool = False
) -> pd.DataFrame:
    """
    fills gaps in the dataframe with np.nan or copied values, creating new timestamps based on the
    known sample interval

    :param data_df: dataframe with timestamps as column "timestamps"
    :param gaps: list of tuples of known non-inclusive start and end timestamps of the gaps
    :param sample_interval_micros: known sample interval of the data points
    :param copy: if True, copy the edge data point into the gap, otherwise fill with np.nan, default False
    :return: dataframe without gaps
    """
    # extract the necessary information to compute gap size and gap timestamps
    data_time_stamps = data_df["timestamps"].to_numpy()
    if len(data_time_stamps) > 1:
        result_df = data_df.copy()
        data_duration = data_time_stamps[-1] - data_time_stamps[0]
        expected_samples = (np.floor(data_duration / sample_interval_micros)
                            + (1 if data_duration % sample_interval_micros >=
                               sample_interval_micros * DEFAULT_GAP_UPPER_LIMIT else 0)) + 1
        if expected_samples > len(data_time_stamps):
            if copy:
                pcm = DataPointCreationMode["COPY"]
            else:
                pcm = DataPointCreationMode["NAN"]
            # make it safe to alter the gap values
            my_gaps = check_gap_list(gaps, data_time_stamps[0], data_time_stamps[-1])
            for gap in my_gaps:
                # if timestamps are around gaps, we have to update the values
                before_start = np.argwhere([t <= gap[0] for t in data_time_stamps])
                after_end = np.argwhere([t >= gap[1] for t in data_time_stamps])
                if len(before_start) > 0:
                    before_start = before_start[-1][0]
                    # sim = gap[0] - data_time_stamps[before_start]
                    # result_df = add_data_points_to_df(result_df, before_start, sim, point_creation_mode=pcm)
                    gap = (data_time_stamps[before_start], gap[1])
                else:
                    before_start = None
                if len(after_end) > 0:
                    after_end = after_end[0][0]
                    # sim = gap[1] - data_time_stamps[after_end]
                    gap = (gap[0], data_time_stamps[after_end])
                else:
                    after_end = None
                num_new_points = int((gap[1] - gap[0]) / sample_interval_micros) - 1
                if before_start is not None:
                    result_df = add_data_points_to_df(result_df, before_start, sample_interval_micros,
                                                      num_new_points, pcm)
                elif after_end is not None:
                    result_df = add_data_points_to_df(result_df, after_end, -sample_interval_micros,
                                                      num_new_points, pcm)
        return result_df.sort_values("timestamps", ignore_index=True)
    return data_df
def pad_data(expected_start: float, expected_end: float, data_df: pandas.core.frame.DataFrame, sample_interval_micros: float) ‑> pandas.core.frame.DataFrame

Pad the start and end of the dataframe with np.nan

:param expected_start: timestamp indicating start time of the data to pad from
:param expected_end: timestamp indicating end time of the data to pad from
:param data_df: dataframe with timestamps as column "timestamps"
:param sample_interval_micros: constant sample interval in microseconds
:return: dataframe padded with np.nans in front and back to meet full size of expected start and end
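
A minimal usage sketch (the "pressure" column and values are illustrative; not part of the generated docs):

import pandas as pd
import redvox.common.gap_and_pad_utils as gpu

df = pd.DataFrame({"timestamps": [1000., 1100., 1200.], "pressure": [1., 2., 3.]})
padded = gpu.pad_data(expected_start=700., expected_end=1500.,
                      data_df=df, sample_interval_micros=100.)
# padded["timestamps"] -> [700, 800, ..., 1500]; padded "pressure" values are NaN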

Expand source code
def pad_data(
        expected_start: float,
        expected_end: float,
        data_df: pd.DataFrame,
        sample_interval_micros: float,
) -> pd.DataFrame:
    """
    Pad the start and end of the dataframe with np.nan

    :param expected_start: timestamp indicating start time of the data to pad from
    :param expected_end: timestamp indicating end time of the data to pad from
    :param data_df: dataframe with timestamps as column "timestamps"
    :param sample_interval_micros: constant sample interval in microseconds
    :return: dataframe padded with np.nans in front and back to meet full size of expected start and end
    """
    # extract the necessary information to pad the data
    data_time_stamps = data_df["timestamps"].to_numpy()
    first_data_timestamp = data_time_stamps[0]
    last_data_timestamp = data_time_stamps[-1]
    result_df = data_df.copy()
    result_before_update_length = len(result_df) - 1
    # FRONT/END GAP FILL!  calculate the samples missing based on inputs
    if expected_start < first_data_timestamp:
        start_diff = first_data_timestamp - expected_start
        num_missing_samples = int(start_diff / sample_interval_micros)
        if num_missing_samples > 0:
            # add the gap data to the result dataframe
            result_df = add_dataless_timestamps_to_df(
                result_df,
                0,
                sample_interval_micros,
                num_missing_samples,
                True
            )
    if expected_end > last_data_timestamp:
        last_diff = expected_end - last_data_timestamp
        num_missing_samples = int(last_diff / sample_interval_micros)
        if num_missing_samples > 0:
            # add the gap data to the result dataframe
            result_df = add_dataless_timestamps_to_df(
                result_df,
                result_before_update_length,
                sample_interval_micros,
                num_missing_samples
            )
    return result_df.sort_values("timestamps", ignore_index=True)

Classes

class DataPointCreationMode (value, names=None, *, module=None, qualname=None, type=None, start=1)

Type of data point to create
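
A minimal usage sketch (not part of the generated docs):

from redvox.common.gap_and_pad_utils import DataPointCreationMode

DataPointCreationMode.list_names()      # ['NAN', 'COPY', 'INTERPOLATE']
DataPointCreationMode["COPY"].value     # 1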

Expand source code
class DataPointCreationMode(enum.Enum):
    """
    Type of data point to create
    """

    NAN: int = 0
    COPY: int = 1
    INTERPOLATE: int = 2

    @staticmethod
    def list_names() -> List[str]:
        return [n.name for n in DataPointCreationMode]

Ancestors

  • enum.Enum

Class variables

var COPY : int
var INTERPOLATE : int
var NAN : int

Static methods

def list_names() ‑> List[str]
Expand source code
@staticmethod
def list_names() -> List[str]:
    return [n.name for n in DataPointCreationMode]
class GapPadResult (result_df: Optional[pandas.core.frame.DataFrame] = None, gaps: List[Tuple[float, float]] = <factory>, errors: RedVoxExceptions = <factory>)

The result of filling gaps or padding a time series
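
A minimal usage sketch (not part of the generated docs; the error text is illustrative):

from redvox.common.gap_and_pad_utils import GapPadResult

result = GapPadResult()              # no dataframe, no gaps, empty error container
result.add_error("example problem")  # recorded in result.errors
result.gaps                          # []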

Expand source code
@dataclass_json()
@dataclass
class GapPadResult:
    """
    The result of filling gaps or padding a time series
    """
    result_df: Optional[pd.DataFrame] = None
    gaps: List[Tuple[float, float]] = field(default_factory=lambda: [])
    errors: RedVoxExceptions = field(default_factory=lambda: RedVoxExceptions("GapPadResult"))

    def add_error(self, error: str):
        """
        add an error to the result
        :param error: error message to add
        """
        self.errors.append(error)

Class variables

var errors : RedVoxExceptions
var gaps : List[Tuple[float, float]]
var result_df : Optional[pandas.core.frame.DataFrame]

Static methods

def from_dict(kvs: Union[dict, list, str, int, float, bool, NoneType], *, infer_missing=False) ‑> ~A
Expand source code
@classmethod
def from_dict(cls: Type[A],
              kvs: Json,
              *,
              infer_missing=False) -> A:
    return _decode_dataclass(cls, kvs, infer_missing)
def from_json(s: Union[str, bytes, bytearray], *, parse_float=None, parse_int=None, parse_constant=None, infer_missing=False, **kw) ‑> ~A
Expand source code
@classmethod
def from_json(cls: Type[A],
              s: JsonData,
              *,
              parse_float=None,
              parse_int=None,
              parse_constant=None,
              infer_missing=False,
              **kw) -> A:
    kvs = json.loads(s,
                     parse_float=parse_float,
                     parse_int=parse_int,
                     parse_constant=parse_constant,
                     **kw)
    return cls.from_dict(kvs, infer_missing=infer_missing)
def schema(*, infer_missing: bool = False, only=None, exclude=(), many: bool = False, context=None, load_only=(), dump_only=(), partial: bool = False, unknown=None) ‑> dataclasses_json.mm.SchemaF[~A]
Expand source code
@classmethod
def schema(cls: Type[A],
           *,
           infer_missing: bool = False,
           only=None,
           exclude=(),
           many: bool = False,
           context=None,
           load_only=(),
           dump_only=(),
           partial: bool = False,
           unknown=None) -> SchemaType:
    Schema = build_schema(cls, DataClassJsonMixin, infer_missing, partial)

    if unknown is None:
        undefined_parameter_action = _undefined_parameter_action_safe(cls)
        if undefined_parameter_action is not None:
            # We can just make use of the same-named mm keywords
            unknown = undefined_parameter_action.name.lower()

    return Schema(only=only,
                  exclude=exclude,
                  many=many,
                  context=context,
                  load_only=load_only,
                  dump_only=dump_only,
                  partial=partial,
                  unknown=unknown)

Methods

def add_error(self, error: str)

add an error to the result
:param error: error message to add

Expand source code
def add_error(self, error: str):
    """
    add an error to the result
    :param error: error message to add
    """
    self.errors.append(error)
def to_dict(self, encode_json=False) ‑> Dict[str, Union[dict, list, str, int, float, bool, NoneType]]
Expand source code
def to_dict(self, encode_json=False) -> Dict[str, Json]:
    return _asdict(self, encode_json=encode_json)
def to_json(self, *, skipkeys: bool = False, ensure_ascii: bool = True, check_circular: bool = True, allow_nan: bool = True, indent: Union[int, str, NoneType] = None, separators: Tuple[str, str] = None, default: Callable = None, sort_keys: bool = False, **kw) ‑> str
Expand source code
def to_json(self,
            *,
            skipkeys: bool = False,
            ensure_ascii: bool = True,
            check_circular: bool = True,
            allow_nan: bool = True,
            indent: Optional[Union[int, str]] = None,
            separators: Tuple[str, str] = None,
            default: Callable = None,
            sort_keys: bool = False,
            **kw) -> str:
    return json.dumps(self.to_dict(encode_json=False),
                      cls=_ExtendedEncoder,
                      skipkeys=skipkeys,
                      ensure_ascii=ensure_ascii,
                      check_circular=check_circular,
                      allow_nan=allow_nan,
                      indent=indent,
                      separators=separators,
                      default=default,
                      sort_keys=sort_keys,
                      **kw)