--- title: TimeSeries Dataset keywords: fastai sidebar: home_sidebar nb_path: "nbs/data__tsdataset.ipynb" ---
def test_default_mask(Y_df, ds_in_test, is_test):
    """Check that `get_default_mask_df` reserves the last `ds_in_test`
    observations of each series for test and the rest for train.

    Parameters
    ----------
    Y_df: pd.DataFrame
        Target dataframe with a 'unique_id' column.
    ds_in_test: int
        Number of trailing timestamps per series reserved for test.
    is_test: bool
        If True the sample mask is expected to select the test region,
        otherwise the train region.
    """
    mask_df = get_default_mask_df(Y_df, ds_in_test, is_test)
    # The mask must be aligned row-by-row with the target dataframe.
    assert Y_df.index.equals(mask_df.index), 'Unmatching index between Y_df and mask_df'
    for uid, df in mask_df.groupby('unique_id'):
        len_ts = df.shape[0]
        # Train mask: ones everywhere except the last `ds_in_test` steps.
        expected_sample_mask = np.ones(len_ts)
        if ds_in_test > 0:
            # Guard: with ds_in_test == 0, `[-0:]` would zero the whole series.
            expected_sample_mask[-ds_in_test:] = 0
        if is_test:
            # The test mask is the complement of the train mask.
            expected_sample_mask = 1 - expected_sample_mask
        # Every observation is available regardless of the train/test split.
        expected_available_mask = np.ones(len_ts)
        sample_mask = df['sample_mask'].values
        available_mask = df['available_mask'].values
        assert np.array_equal(sample_mask, expected_sample_mask), (
            f'Error for sample mask for time series {uid}'
        )
        assert np.array_equal(available_mask, expected_available_mask), (
            f'Error for available mask for time series {uid}'
        )
from nixtlats.data.utils import create_synthetic_tsdata

# Smoke-test the default mask on a small synthetic panel,
# exercising both the train and the test flavor of the mask.
Y_df, X_df, S_df = create_synthetic_tsdata()
ds_in_test = 2
is_test = False
test_default_mask(Y_df, ds_in_test=ds_in_test, is_test=is_test)
test_default_mask(Y_df, ds_in_test=ds_in_test, is_test=True)
import matplotlib.pyplot as plt
from nixtlats.data.datasets.epf import EPF, EPFInfo

# Validate the default mask on the EPF dataset, reserving the last
# 728 days (hourly data) for test.
Y_df, X_df, S_df = EPF.load_groups(directory='data', groups=['NP', 'PJM'])
test_default_mask(Y_df, ds_in_test=728 * 24, is_test=False)

# Visual check: plot the train mask, then the complementary test mask.
mask_df = get_default_mask_df(Y_df=Y_df, ds_in_test=728 * 24, is_test=False)
plt.plot(mask_df.sample_mask.values)
mask_df = get_default_mask_df(Y_df=Y_df, ds_in_test=728 * 24, is_test=True)
plt.plot(mask_df.sample_mask.values)
from nixtlats.data.datasets.tourism import Tourism, TourismInfo

# Same sanity checks on the yearly Tourism dataset, using the group's
# forecasting horizon as the test size, for both mask flavors.
meta = TourismInfo['Yearly']
Y_df, *_ = Tourism.load(directory='data', group=meta.name)
for flag in (False, True):
    test_default_mask(Y_df, ds_in_test=meta.horizon, is_test=flag)
def instantiate_datasets(Y_df, S_df, X_df, f_cols=None,
                         ds_in_test=0, is_test=False,
                         input_size=15,
                         output_size=1,
                         complete_windows=False,
                         sample_freq=1):
    """Build a `TimeSeriesDataset` and a `WindowsDataset` that share the
    same default mask, and return both together with the mask dataframe.

    Returns
    -------
    (TimeSeriesDataset, WindowsDataset, pd.DataFrame)
    """
    mask_df = get_default_mask_df(Y_df=Y_df, ds_in_test=ds_in_test, is_test=is_test)
    # Both dataset flavors take the same core arguments; only the
    # WindowsDataset additionally needs `sample_freq`.
    shared_kwargs = dict(Y_df=Y_df, S_df=S_df, X_df=X_df, f_cols=f_cols,
                         mask_df=mask_df,
                         input_size=input_size,
                         output_size=output_size,
                         complete_windows=complete_windows)
    ts_dataset = TimeSeriesDataset(**shared_kwargs)
    wd_dataset = WindowsDataset(sample_freq=sample_freq, **shared_kwargs)
    return ts_dataset, wd_dataset, mask_df
def test_dataset_attrs(Y_df, S_df, X_df, f_cols, ds_in_test, is_test):
    """Verify that the tensors stored by both dataset classes match the
    original dataframes, column by column and series by series."""
    # This set catches mistmaches between Y_df and ts_tensor
    ts_dataset, wd_dataset, mask_df = instantiate_datasets(Y_df=Y_df, S_df=S_df, X_df=X_df,
                                                           f_cols=f_cols, ds_in_test=ds_in_test,
                                                           is_test=is_test)
    # Join targets, exogenous variables and mask into a single frame indexed
    # by ('unique_id', 'ds') so each row lines up with one tensor position.
    dfs = [Y_df, X_df, mask_df]
    dfs = [df.set_index(['unique_id', 'ds']) for df in dfs]
    dfs = dfs[0].join(dfs[1:])
    #Temporal variables
    for dataset in [ts_dataset, wd_dataset]:
        for idx_ts, (uid, df) in enumerate(dfs.groupby('unique_id')):
            len_ts = dataset.len_series[idx_ts]
            for col in dataset.t_cols:
                ts = t.Tensor(df[col].values)
                idx_tensor = dataset.t_cols.index(col)
                # Series appear right-aligned in ts_tensor, so compare against
                # the last `len_ts` positions of the time axis.
                ts_tensor = dataset.ts_tensor[idx_ts, idx_tensor, -len_ts:]
                assert np.array_equal(ts, ts_tensor), (
                    f'Error with time series {uid} and col {col} (idx={idx_ts}).'
                )
    #Static variables
    # NOTE(review): this loop sits after the dataset loop and relies on the
    # leaked loop variable `dataset` (the last WindowsDataset). Confirm the
    # static check was not meant to run for both datasets.
    for idx_ts, (uid, df) in enumerate(S_df.groupby('unique_id')):
        len_ts = dataset.len_series[idx_ts]
        s = df[dataset.s_cols].values
        s_matrix = dataset.s_matrix[[idx_ts]]
        assert np.array_equal(s, s_matrix), (
            f'Error with static variables for time series {uid} (idx={idx_ts})'
        )
def test_get_f_idxs(Y_df, S_df, X_df, f_cols, ds_in_test, is_test, expected_f_idxs):
    """Check that `_get_f_idxs` resolves future-column names to the expected
    positional indices for both dataset flavors."""
    instantiated = instantiate_datasets(Y_df=Y_df, S_df=S_df, X_df=X_df,
                                        f_cols=f_cols, ds_in_test=ds_in_test,
                                        is_test=is_test)
    # Same check on the TimeSeriesDataset and the WindowsDataset.
    for dataset in instantiated[:2]:
        assert dataset._get_f_idxs(f_cols) == expected_f_idxs
def test_ts_tensor(Y_df, S_df, X_df, f_cols, ds_in_test, is_test,
                   input_size, output_size, ts_idxs):
    """Compare the stored `ts_tensor`, restricted to `ts_idxs` and to the
    region starting at `first_ds`, against a tensor rebuilt directly from
    the input dataframes."""
    ts_dataset, wd_dataset, mask_df = instantiate_datasets(Y_df=Y_df, S_df=S_df, X_df=X_df,
                                                           f_cols=f_cols, ds_in_test=ds_in_test,
                                                           is_test=is_test,
                                                           input_size=input_size,
                                                           output_size=output_size)
    for dataset in [ts_dataset, wd_dataset]:
        min_len = min(dataset.len_series)
        # Rebuild the expected tensor from a single joined frame indexed
        # by ('unique_id', 'ds').
        dfs = [Y_df, X_df, mask_df]
        dfs = [df.set_index(['unique_id', 'ds']) for df in dfs]
        dfs = dfs[0].join(dfs[1:])
        # This process only works for balanced datasets.
        n_ts = Y_df['unique_id'].unique().shape[0]
        n_x = dfs.columns.shape[0]
        idxs = range(n_ts) if ts_idxs is None else ts_idxs
        # Reshape to (series, time, variable), then swap the last two axes
        # to match the dataset's (series, variable, time) layout.
        e_filtered_tensor = t.Tensor(dfs.values.reshape((n_ts, min_len, n_x))[idxs])
        e_filtered_tensor = np.swapaxes(e_filtered_tensor, 2, 1)
        filtered_tensor = dataset.ts_tensor[ts_idxs, :, dataset.first_ds:]
        assert np.array_equal(e_filtered_tensor, filtered_tensor), (
            "Expected and dataset filtered_tensor are different. Check."
        )
from numpy.lib.stride_tricks import sliding_window_view
# This test only works for the synthetic dataset constructed
# with create_synthetic_tsdata, and checks the 21st time series (index 20).
def test_batch_construction_windows(Y_df, S_df, X_df, ds_in_test,
                                    is_test, input_size, output_size, sample_freq,
                                    ):
    """Test to verify that the batch (of windows) is well constructed."""
    _, dataset, mask_df = instantiate_datasets(Y_df=Y_df, S_df=S_df, X_df=X_df,
                                               ds_in_test=ds_in_test, is_test=is_test,
                                               input_size=input_size,
                                               output_size=output_size,
                                               sample_freq=sample_freq)
    windows_size = input_size + output_size
    max_len = Y_df.groupby('unique_id').size().max()
    # Windows emitted by the WindowsDataset for series index 20.
    windows = dataset[20]['Y'].numpy()
    #Expected windows
    uid = Y_df['unique_id'].unique()[20]
    Y_original = Y_df.query('unique_id == @uid')['y'].values
    size = Y_original.size
    # Left-pad to the panel's maximum length, mimicking the dataset's
    # right-aligned storage.
    Y_sample = np.zeros(max_len)
    Y_sample[-size:] = Y_original
    # NOTE(review): the magic 100 presumably mirrors a fixed internal padding
    # constant of WindowsDataset — confirm against its implementation.
    Y_sample = np.pad(Y_sample, (input_size+100, output_size))
    e_windows = sliding_window_view(Y_sample, window_shape=windows_size)
    # Drop windows that reach into the test region, stepping by sample_freq.
    e_windows = e_windows[0:-(ds_in_test+output_size-1):sample_freq] #-1 for at least one available sample
    # This test works assuming there are no series with all values of zero.
    sampleable_windows_idxs = np.where((e_windows > 0).sum(1) >= 1)[0]
    e_windows = e_windows[sampleable_windows_idxs]
    #Comparison
    assert np.array_equal(windows, e_windows), (
        'Expected and actual windows are different'
    )
from numpy.lib.stride_tricks import sliding_window_view
# This test only works for the synthetic dataset constructed
# with create_synthetic_tsdata, and checks the 21st time series (index 20).
def test_batch_construction_ts(Y_df, S_df, X_df, ds_in_test,
                               is_test, input_size, output_size, sample_freq,
                               ):
    """Check that `TimeSeriesDataset` returns the full, left-padded series
    as a single batch row for series index 20."""
    dataset, _, mask_df = instantiate_datasets(Y_df=Y_df, S_df=S_df, X_df=X_df,
                                               ds_in_test=ds_in_test, is_test=is_test,
                                               input_size=input_size,
                                               output_size=output_size,
                                               sample_freq=sample_freq)
    actual = dataset[20]['Y'].numpy()
    # Rebuild the expected row: the raw series left-padded with zeros up to
    # the longest series in the panel, with a leading batch axis.
    longest = Y_df.groupby('unique_id').size().max()
    uid = Y_df['unique_id'].unique()[20]
    y_vals = Y_df.query('unique_id == @uid')['y'].values
    expected = np.zeros(longest)
    expected[-y_vals.size:] = y_vals
    expected = expected.reshape(1, -1)
    assert np.array_equal(actual, expected), (
        'Expected and actual windows are different'
    )
from nixtlats.data.utils import create_synthetic_tsdata

# Attribute and future-index checks on the synthetic panel.
Y_df, X_df, S_df = create_synthetic_tsdata()
ds_in_test = 2
is_test = False
f_cols = ['future_1']
expected_f_idxs = [2]
len_sample_chunks = 15  # only for ESRNN
test_dataset_attrs(Y_df, S_df, X_df, f_cols=f_cols, ds_in_test=ds_in_test, is_test=is_test)
test_get_f_idxs(Y_df, S_df, X_df, f_cols=f_cols, ds_in_test=ds_in_test, is_test=is_test,
                expected_f_idxs=expected_f_idxs)

def _fail_non_sorted():
    # Non-sorted ts_idxs must make test_ts_tensor raise.
    # BUG FIX: the original call omitted the required `input_size` argument,
    # so test_fail passed because of a TypeError instead of the intended
    # non-sorted-indices failure.
    test_ts_tensor(Y_df, S_df, X_df, f_cols=f_cols,
                   ds_in_test=ds_in_test,
                   is_test=is_test,
                   input_size=ds_in_test,
                   ts_idxs=[1, 0],
                   output_size=ds_in_test)

test_fail(_fail_non_sorted)
from nixtlats.data.utils import create_synthetic_tsdata

# Batch-construction checks require the sorted synthetic panel.
Y_df, X_df, S_df = create_synthetic_tsdata(sort=True)
ds_in_test = 2
is_test = False
f_cols = ['future_1']
expected_f_idxs = [2]
len_sample_chunks = 15  # only for ESRNN
# Run the windows check first, then the full-series check, with the
# same configuration.
for checker in (test_batch_construction_windows, test_batch_construction_ts):
    checker(Y_df, S_df, X_df, ds_in_test=ds_in_test,
            is_test=is_test, input_size=5, output_size=2,
            sample_freq=1)
from nixtlats.data.datasets.epf import EPF, EPFInfo

# Repeat the structural checks on the EPF dataset.
Y_df, X_df, S_df = EPF.load_groups(directory='data', groups=['NP', 'PJM'])
f_cols = ['Exogenous1', 'Exogenous2']
ds_in_test = 728 * 24
is_test = True
expected_f_idxs = [1, 2]  # positions after the y column
split_kwargs = dict(f_cols=f_cols, ds_in_test=ds_in_test, is_test=is_test)
test_dataset_attrs(Y_df, S_df, X_df, **split_kwargs)
test_ts_tensor(Y_df, S_df, X_df, input_size=ds_in_test, output_size=ds_in_test,
               ts_idxs=[1, 0], **split_kwargs)
test_get_f_idxs(Y_df, S_df, X_df, expected_f_idxs=expected_f_idxs, **split_kwargs)
from nixtlats.data.datasets.tourism import Tourism, TourismInfo

# Structural checks on the yearly Tourism dataset with an added calendar
# exogenous variable and a categorical static variable.
meta = TourismInfo['Yearly']
df, *_ = Tourism.load(directory='./data', group=meta.name)
df['day_of_week'] = df['ds'].dt.day_of_week
df['id_ts'] = df['unique_id'].astype('category').cat.codes
Y_df = df.filter(items=['unique_id', 'ds', 'y'])
X_df = df.filter(items=['unique_id', 'ds', 'day_of_week'])
S_df = df.filter(items=['unique_id', 'id_ts']).drop_duplicates().reset_index(drop=True)
# Keep only the last 11 observations per series to obtain a balanced panel.
Y_df = Y_df.groupby('unique_id').tail(11)
X_df = X_df.groupby('unique_id').tail(11)
test_dataset_attrs(Y_df, S_df, X_df, f_cols=[], ds_in_test=meta.horizon, is_test=True)
test_ts_tensor(Y_df, S_df, X_df, f_cols=[], ds_in_test=meta.horizon, is_test=False,
               input_size=5,
               output_size=meta.horizon, ts_idxs=[1, 7, 10, 15])
# BUG FIX: the original call reused the stale module-level `ds_in_test`
# (728 * 24) and `is_test` left over from the EPF section; use this
# dataset's horizon instead, consistent with the calls above.
test_get_f_idxs(Y_df, S_df, X_df, f_cols=[], ds_in_test=meta.horizon, is_test=True,
                expected_f_idxs=[])