--- title: TimeSeries Dataset keywords: fastai sidebar: home_sidebar nb_path: "nbs/data__tsdataset.ipynb" ---
{% raw %}
{% endraw %}

Base Dataset

Transforms a pandas DataFrame into a TimeSeriesDataset for the Dataloader.

{% raw %}
{% endraw %} {% raw %}

class BaseDataset[source]

BaseDataset(*args, **kwds) :: Dataset

A class used to store Time Series data.

{% endraw %} {% raw %}
{% endraw %} {% raw %}
{% endraw %} {% raw %}
{% endraw %} {% raw %}
{% endraw %} {% raw %}
{% endraw %} {% raw %}

BaseDataset.__getitem__[source]

BaseDataset.__getitem__(idx:Union[slice, int])

Creates batch based on index.

Parameters

idx: Union[slice, int] Indexes of time series to consider.

Returns

Dictionary with keys:

- S
- Y
- X
- available_mask
- sample_mask
- idxs
{% endraw %} {% raw %}
{% endraw %} {% raw %}

BaseDataset.__len__[source]

BaseDataset.__len__()

{% endraw %} {% raw %}
{% endraw %} {% raw %}

BaseDataset.get_n_variables[source]

BaseDataset.get_n_variables()

Gets number of exogenous and static variables.

{% endraw %} {% raw %}

BaseDataset.get_n_series[source]

BaseDataset.get_n_series()

Gets number of time series.

{% endraw %} {% raw %}

BaseDataset.get_max_len[source]

BaseDataset.get_max_len()

Gets max len of time series.

{% endraw %} {% raw %}

BaseDataset.get_n_channels[source]

BaseDataset.get_n_channels()

Gets number of channels considered.

{% endraw %} {% raw %}

BaseDataset.get_frequency[source]

BaseDataset.get_frequency()

Gets inferred frequency.

{% endraw %} {% raw %}
{% endraw %} {% raw %}

get_default_mask_df[source]

get_default_mask_df(Y_df:DataFrame, ds_in_test:int, is_test:bool)

Constructs default mask df.

Parameters

Y_df: pd.DataFrame Target time series with columns ['unique_id', 'ds', 'y']. ds_in_test: int Number of datestamps to use as outsample. is_test: bool Whether target time series belongs to test set.

Returns

Mask DataFrame with columns ['unique_id', 'ds', 'available_mask', 'sample_mask'].

{% endraw %} {% raw %}
{% endraw %} {% raw %}

class TimeSeriesDataset[source]

TimeSeriesDataset(*args, **kwds) :: BaseDataset

A class used to store Time Series data.

{% endraw %} {% raw %}
{% endraw %} {% raw %}

TimeSeriesDataset.__getitem__[source]

TimeSeriesDataset.__getitem__(idx:Union[slice, int])

Creates batch based on index.

Parameters

idx: Union[slice, int] Indexes of time series to consider.

Returns

Dictionary with keys:

- S
- Y
- X
- available_mask
- sample_mask
- idxs
{% endraw %} {% raw %}
{% endraw %}

Windows Dataset

{% raw %}

class WindowsDataset[source]

WindowsDataset(*args, **kwds) :: BaseDataset

A class used to store Time Series data.

{% endraw %} {% raw %}
{% endraw %} {% raw %}
{% endraw %} {% raw %}
{% endraw %} {% raw %}

WindowsDataset.__getitem__[source]

WindowsDataset.__getitem__(idx:Union[slice, int])

Creates batch based on index.

Parameters

idx: Union[slice, int] Indexes of time series to consider.

Returns

Dictionary with keys:

- S
- Y
- X
- available_mask
- sample_mask
- idxs
{% endraw %} {% raw %}
{% endraw %}

Default mask example and tests

{% raw %}
def test_default_mask(Y_df, ds_in_test, is_test):
    """Check that get_default_mask_df builds the expected masks.

    For each series, the last `ds_in_test` timestamps get sample_mask 0
    (the mask is inverted when `is_test`), and every timestamp is marked
    available.
    """
    mask_df = get_default_mask_df(Y_df, ds_in_test, is_test)
    assert Y_df.index.equals(mask_df.index), 'Unmatching index between Y_df and mask_df'

    for uid, df in mask_df.groupby('unique_id'):
        len_ts = df.shape[0]
        expected_sample_mask = np.ones(len_ts)
        # Guard ds_in_test == 0: arr[-0:] would zero the WHOLE series.
        if ds_in_test > 0:
            expected_sample_mask[-ds_in_test:] = 0
        if is_test:
            # Test split is the complement of the train split.
            expected_sample_mask = 1 - expected_sample_mask
        expected_available_mask = np.ones(len_ts)

        sample_mask = df['sample_mask'].values
        available_mask = df['available_mask'].values

        assert np.array_equal(sample_mask, expected_sample_mask), (
            f'Error for sample mask for time series {uid}'
        )

        assert np.array_equal(available_mask, expected_available_mask), (
            f'Error for available mask for time series {uid}'
        )
{% endraw %}

Test for synthetic time series data

{% raw %}
from nixtlats.data.utils import create_synthetic_tsdata

# Build synthetic data and exercise the default mask for both splits.
Y_df, X_df, S_df = create_synthetic_tsdata()
ds_in_test = 2
is_test = False
test_default_mask(Y_df, ds_in_test, is_test)
test_default_mask(Y_df, ds_in_test, True)
{% endraw %}

Example and test for datasets with two time series

{% raw %}
import matplotlib.pyplot as plt

from nixtlats.data.datasets.epf import EPF, EPFInfo

# Two-series dataset: hold out the last 728 days of hourly data.
Y_df, X_df, S_df = EPF.load_groups(directory='data', groups=['NP', 'PJM'])
test_default_mask(Y_df, ds_in_test=728 * 24, is_test=False)
mask_df = get_default_mask_df(Y_df=Y_df, ds_in_test=728 * 24, is_test=False)

# Visual sanity check of the train-split sample mask.
plt.plot(mask_df.sample_mask.values)
[<matplotlib.lines.Line2D at 0x7fb7fd6d1ad0>]
{% endraw %} {% raw %}
# Same split with is_test=True: the sample mask is inverted.
mask_df = get_default_mask_df(Y_df=Y_df, ds_in_test=728 * 24, is_test=True)

plt.plot(mask_df.sample_mask.values)
[<matplotlib.lines.Line2D at 0x7fb7fbc96990>]
{% endraw %}

Test for datasets with more than two time series

{% raw %}
from nixtlats.data.datasets.tourism import Tourism, TourismInfo

# Many-series dataset: hold out the forecasting horizon of the Yearly group.
meta = TourismInfo['Yearly']
Y_df, *_ = Tourism.load(directory='data', group=meta.name)
test_default_mask(Y_df, ds_in_test=meta.horizon, is_test=False)
test_default_mask(Y_df, ds_in_test=meta.horizon, is_test=True)
{% endraw %}

Dataset tests

{% raw %}
def instantiate_datasets(Y_df, S_df, X_df, f_cols=None,
                         ds_in_test=0, is_test=False,
                         input_size=15,
                         output_size=1,
                         complete_windows=False,
                         sample_freq=1):
    """Build a TimeSeriesDataset and a WindowsDataset sharing one default mask.

    Returns the tuple (ts_dataset, wd_dataset, mask_df).
    """
    mask = get_default_mask_df(Y_df=Y_df, ds_in_test=ds_in_test, is_test=is_test)

    # Keyword arguments shared by both dataset flavors.
    common = dict(Y_df=Y_df, S_df=S_df, X_df=X_df, f_cols=f_cols,
                  mask_df=mask,
                  input_size=input_size,
                  output_size=output_size,
                  complete_windows=complete_windows)

    ts_ds = TimeSeriesDataset(**common)
    # Only the windows flavor takes a sampling frequency.
    wd_ds = WindowsDataset(sample_freq=sample_freq, **common)

    return ts_ds, wd_ds, mask
{% endraw %} {% raw %}
def test_dataset_attrs(Y_df, S_df, X_df, f_cols, ds_in_test, is_test):
    """Check that ts_tensor and s_matrix agree with Y_df/X_df/mask_df and S_df.

    This catches mismatches between the source DataFrames and the tensors
    stored inside both dataset flavors.
    """
    ts_dataset, wd_dataset, mask_df = instantiate_datasets(Y_df=Y_df, S_df=S_df, X_df=X_df, 
                                           f_cols=f_cols, ds_in_test=ds_in_test, 
                                           is_test=is_test)

    # Align all temporal frames on (unique_id, ds) so columns line up with t_cols.
    dfs = [Y_df, X_df, mask_df]
    dfs = [df.set_index(['unique_id', 'ds']) for df in dfs]
    dfs = dfs[0].join(dfs[1:])

    # Temporal variables: each column of each series must equal the tail of
    # the corresponding channel in ts_tensor.
    for dataset in [ts_dataset, wd_dataset]:
        for idx_ts, (uid, df) in enumerate(dfs.groupby('unique_id')):
            len_ts = dataset.len_series[idx_ts]

            for col in dataset.t_cols:
                ts = t.Tensor(df[col].values)
                idx_tensor = dataset.t_cols.index(col)
                ts_tensor = dataset.ts_tensor[idx_ts, idx_tensor, -len_ts:]

                assert np.array_equal(ts, ts_tensor), (
                    f'Error with time series {uid} and col {col} (idx={idx_ts}).'
                )

        # Static variables: one row per series in s_matrix.
        for idx_ts, (uid, df) in enumerate(S_df.groupby('unique_id')):
            s = df[dataset.s_cols].values
            s_matrix = dataset.s_matrix[[idx_ts]]

            assert np.array_equal(s, s_matrix), (
                f'Error with static variables for time series {uid} (idx={idx_ts})'
            )
def test_get_f_idxs(Y_df, S_df, X_df, f_cols, ds_in_test, is_test, expected_f_idxs):
    """Both dataset flavors must map f_cols to the same expected indexes."""
    datasets = instantiate_datasets(Y_df=Y_df, S_df=S_df, X_df=X_df,
                                    f_cols=f_cols, ds_in_test=ds_in_test,
                                    is_test=is_test)
    # datasets[:2] is (ts_dataset, wd_dataset); the mask is not needed here.
    for dataset in datasets[:2]:
        assert dataset._get_f_idxs(f_cols) == expected_f_idxs
{% endraw %} {% raw %}
def test_ts_tensor(Y_df, S_df, X_df, f_cols, ds_in_test, is_test, 
                   input_size, output_size, ts_idxs):
    """Check filtering ts_tensor by series indexes against a DataFrame rebuild.

    This process only works for balanced datasets (all series share the
    same length). `ts_idxs=None` selects every series.
    """
    ts_dataset, wd_dataset, mask_df = instantiate_datasets(Y_df=Y_df, S_df=S_df, X_df=X_df, 
                                           f_cols=f_cols, ds_in_test=ds_in_test, 
                                           is_test=is_test,
                                           input_size=input_size,
                                           output_size=output_size)

    for dataset in [ts_dataset, wd_dataset]:
        min_len = min(dataset.len_series)
        dfs = [Y_df, X_df, mask_df]
        dfs = [df.set_index(['unique_id', 'ds']) for df in dfs]
        dfs = dfs[0].join(dfs[1:])

        n_ts = Y_df['unique_id'].unique().shape[0]
        n_x = dfs.columns.shape[0]
        # Materialize as list so it is a valid fancy index for both tensors.
        idxs = list(range(n_ts)) if ts_idxs is None else ts_idxs

        e_filtered_tensor = t.Tensor(dfs.values.reshape((n_ts, min_len, n_x))[idxs])
        e_filtered_tensor = np.swapaxes(e_filtered_tensor, 2, 1)
        # Bug fix: index with idxs (not the raw ts_idxs) so that ts_idxs=None
        # selects every series instead of inserting a new axis.
        filtered_tensor = dataset.ts_tensor[idxs, :, dataset.first_ds:]

        assert np.array_equal(e_filtered_tensor, filtered_tensor), (
            "Expected and dataset filtered_tensor are different. Check."
        )
{% endraw %} {% raw %}
from numpy.lib.stride_tricks import sliding_window_view

# This test only works for the synthetic dataset constructed 
# using create_synthetic_tsdata
# and for the 21 time series
def test_batch_construction_windows(Y_df, S_df, X_df, ds_in_test, 
                                    is_test, input_size, output_size, sample_freq, 
                                    ):
    """Test to verify that the batch (of windows) is well constructed.

    Only valid for the synthetic dataset built by create_synthetic_tsdata,
    and hard-coded to the series at index 20 (the 21st).
    """

    # Only the WindowsDataset flavor is exercised here.
    _, dataset, mask_df = instantiate_datasets(Y_df=Y_df, S_df=S_df, X_df=X_df, 
                                               ds_in_test=ds_in_test, is_test=is_test,
                                               input_size=input_size,
                                               output_size=output_size,
                                               sample_freq=sample_freq)
    
    windows_size = input_size + output_size
    max_len = Y_df.groupby('unique_id').size().max()
    
    # Actual windows produced by the dataset for series 20.
    windows = dataset[20]['Y'].numpy()
    
    #Expected windows
    uid = Y_df['unique_id'].unique()[20]
    Y_original = Y_df.query('unique_id == @uid')['y'].values
    size = Y_original.size
    # Left-pad with zeros up to the longest series length.
    Y_sample = np.zeros(max_len)
    Y_sample[-size:] = Y_original
    
    # NOTE(review): the extra input_size+100 left pad presumably mirrors the
    # dataset's internal padding — confirm against WindowsDataset internals.
    Y_sample = np.pad(Y_sample, (input_size+100, output_size))

    e_windows = sliding_window_view(Y_sample, window_shape=windows_size)
    e_windows = e_windows[0:-(ds_in_test+output_size-1):sample_freq] #-1 for at least one available sample

    # This test works assuming there are no series with all values of zero.
    # Keep only windows containing at least one positive (sampleable) value.
    sampleable_windows_idxs = np.where((e_windows > 0).sum(1) >= 1)[0]     
    e_windows = e_windows[sampleable_windows_idxs]
    
    #Comparison
    assert np.array_equal(windows, e_windows), (
        'Expected and actual windows are different'
    )
{% endraw %} {% raw %}
from numpy.lib.stride_tricks import sliding_window_view

# This test only works for the synthetic dataset constructed 
# using create_synthetic_tsdata
# and for the 21 time series
def test_batch_construction_ts(Y_df, S_df, X_df, ds_in_test,
                               is_test, input_size, output_size, sample_freq):
    """Test to verify that the batch (of windows) is well constructed.

    Only valid for the synthetic dataset built by create_synthetic_tsdata,
    and hard-coded to the series at index 20 (the 21st).
    """
    # Only the TimeSeriesDataset flavor is exercised here.
    ts_dataset, _, _ = instantiate_datasets(Y_df=Y_df, S_df=S_df, X_df=X_df,
                                            ds_in_test=ds_in_test, is_test=is_test,
                                            input_size=input_size,
                                            output_size=output_size,
                                            sample_freq=sample_freq)

    batch = ts_dataset[20]['Y'].numpy()
    max_len = Y_df.groupby('unique_id').size().max()

    # Rebuild the expected row: the 21st series, left-padded with zeros
    # up to the longest series length, as a single-row matrix.
    uid = Y_df['unique_id'].unique()[20]
    y_vals = Y_df.query('unique_id == @uid')['y'].values
    expected = np.zeros(max_len)
    expected[-y_vals.size:] = y_vals
    expected = expected[None, :]

    assert np.array_equal(batch, expected), (
        'Expected and actual windows are different'
    )
{% endraw %}

Test for not sorted datasets with more than two time series

{% raw %}
from nixtlats.data.utils import create_synthetic_tsdata

# Unsorted synthetic data; 'future_1' is the only future exogenous column
# and is expected at temporal-column index 2.
Y_df, X_df, S_df = create_synthetic_tsdata()
ds_in_test = 2
is_test = False
f_cols = ['future_1']
expected_f_idxs = [2]
len_sample_chunks = 15 #only for ESRNN
{% endraw %} {% raw %}
# Attribute consistency must hold even for unsorted input data.
test_dataset_attrs(Y_df, S_df, X_df, f_cols=f_cols, ds_in_test=ds_in_test, is_test=is_test)
{% endraw %} {% raw %}
# Future-column index lookup is order-independent.
test_get_f_idxs(Y_df, S_df, X_df, f_cols=f_cols, ds_in_test=ds_in_test, is_test=is_test, 
                expected_f_idxs=expected_f_idxs)
{% endraw %}

Expected error for non-sorted datasets

For the ts_tensor attribute from the dataset and Y_df to have the same order it is necessary that Y_df is sorted by unique_id and ds. This test (expected error) proves that for unordered data the order is different.

{% raw %}
def _fail_non_sorted(): 
    """Expected to fail: tensor order differs from unsorted Y_df order."""
    test_ts_tensor(Y_df, S_df, X_df, f_cols=f_cols, 
                   ds_in_test=ds_in_test, 
                   is_test=is_test,
                   ts_idxs=[1, 0], 
                   output_size=ds_in_test)
# test_fail asserts that the wrapped call raises (expected-error test).
test_fail(_fail_non_sorted)
{% endraw %}

Test using sorted synthetic ts data

{% raw %}
from nixtlats.data.utils import create_synthetic_tsdata

# Same synthetic data, now sorted by unique_id/ds (sort=True).
Y_df, X_df, S_df = create_synthetic_tsdata(sort=True)
ds_in_test = 2
is_test = False
f_cols = ['future_1']
expected_f_idxs = [2]
len_sample_chunks = 15 #only for ESRNN
{% endraw %} {% raw %}
# Window batches must match the sliding-window reconstruction.
test_batch_construction_windows(Y_df, S_df, X_df, ds_in_test=ds_in_test,
                                is_test=is_test, input_size=5, output_size=2,
                                sample_freq=1)
{% endraw %} {% raw %}
# Full-series batches must match the padded raw series.
test_batch_construction_ts(Y_df, S_df, X_df, ds_in_test=ds_in_test,
                           is_test=is_test, input_size=5, output_size=2,
                           sample_freq=1)
{% endraw %}

Test for already sorted datasets

{% raw %}
from nixtlats.data.datasets.epf import EPF, EPFInfo

# Already-sorted two-series dataset with two future exogenous columns.
Y_df, X_df, S_df = EPF.load_groups(directory='data', groups=['NP', 'PJM'])
f_cols = ['Exogenous1', 'Exogenous2']
ds_in_test = 728 * 24
is_test = True
expected_f_idxs = [1, 2] #after y column
{% endraw %} {% raw %}
# Attribute consistency for pre-sorted EPF data.
test_dataset_attrs(Y_df, S_df, X_df, f_cols=f_cols, ds_in_test=ds_in_test, is_test=is_test)
{% endraw %} {% raw %}
# Filtering by reversed series order [1, 0]; assumes both EPF series are
# balanced (same length), as test_ts_tensor requires.
test_ts_tensor(Y_df, S_df, X_df, f_cols=f_cols, ds_in_test=ds_in_test, is_test=is_test,
               input_size=ds_in_test, output_size=ds_in_test, ts_idxs=[1, 0])
{% endraw %} {% raw %}
# Future exogenous columns sit right after the y column.
test_get_f_idxs(Y_df, S_df, X_df, f_cols=f_cols, ds_in_test=ds_in_test, is_test=is_test, 
                expected_f_idxs=expected_f_idxs)
{% endraw %}

Test for already sorted datasets with more than two time series

{% raw %}
from nixtlats.data.datasets.tourism import Tourism, TourismInfo

# Build Y/X/S frames from the Tourism Yearly group with a calendar feature
# and an integer series id as the only static variable.
meta = TourismInfo['Yearly']
df, *_ = Tourism.load(directory='./data', group=meta.name)
df['day_of_week'] = df['ds'].dt.day_of_week
df['id_ts'] = df['unique_id'].astype('category').cat.codes

Y_df = df.filter(items=['unique_id', 'ds', 'y'])
X_df = df.filter(items=['unique_id', 'ds', 'day_of_week'])
S_df = df.filter(items=['unique_id', 'id_ts']).drop_duplicates().reset_index(drop=True)
# Keep the last 11 observations per series, presumably to balance series
# lengths for the tensor-reshape test below — TODO confirm.
Y_df = Y_df.groupby('unique_id').tail(11)
X_df = X_df.groupby('unique_id').tail(11)
{% endraw %} {% raw %}
# Attribute consistency for the many-series Tourism data (no future cols).
test_dataset_attrs(Y_df, S_df, X_df, f_cols=[], ds_in_test=meta.horizon, is_test=True)
{% endraw %} {% raw %}
# Filter a non-trivial subset of series from the balanced (tail(11)) data.
test_ts_tensor(Y_df, S_df, X_df, f_cols=[], ds_in_test=meta.horizon, is_test=False,
               input_size=5, 
               output_size=meta.horizon, ts_idxs=[1, 7, 10, 15])
{% endraw %} {% raw %}
# NOTE(review): ds_in_test and is_test here are the stale values left over
# from the EPF section above (728 * 24 / True), not meta.horizon — confirm
# this is intended rather than an oversight.
test_get_f_idxs(Y_df, S_df, X_df, f_cols=[], ds_in_test=ds_in_test, is_test=is_test, 
                expected_f_idxs=[])
{% endraw %}