--- title: Splitting keywords: fastai sidebar: home_sidebar summary: "Data Splitting Transforms." description: "Data Splitting Transforms." nb_path: "nbs/transforms/transforms.splitting.ipynb" ---
{% raw %}
{% endraw %} {% raw %}
{% endraw %}

Random split

{% raw %}

random_split[source]

random_split(mat, val_p=0.0, test_p=0.2, seed=42)

Randomly split interactions into training, validation, and testing sets.

{% endraw %} {% raw %}
{% endraw %} {% raw %}
import pandas as pd

def test_random_split():
    """Verify ``random_split`` on a small, fixed interaction matrix.

    The fixture holds 24 ratings from 5 users over 9 items. The split is run
    with ``val_p=0.1``, ``test_p=0.2`` and ``seed=42``; each returned matrix
    is compared with a hard-coded expectation (16 / 3 / 5 ratings for
    train / validation / test), and all matrices must agree on row count,
    column count and type.
    """

    def to_coo(frame, dims):
        # Ratings are the stored values; user_id / item_id are the row /
        # column coordinates of each stored value.
        return coo_matrix(
            (frame['rating'], (frame['user_id'], frame['item_id'])),
            shape=dims,
        )

    source_df = pd.DataFrame(
        data={
            'user_id': [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4],
            'item_id': [0, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 4, 1, 2, 4, 5],
            'rating': [1, 2, 3, 4, 5, 4, 3, 2, 1, 1, 2, 3, 4, 2, 3, 4, 5, 1, 5, 4, 2, 3, 5, 4],
        }
    )
    interactions_to_split = to_coo(
        source_df,
        (source_df.user_id.nunique(), source_df.item_id.nunique()),
    )
    # Every expected matrix shares the dimensions of the input matrix.
    dims = (interactions_to_split.shape[0], interactions_to_split.shape[1])

    train_expected = to_coo(
        pd.DataFrame(
            data={
                'user_id': [0, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 0, 0, 0, 0],
                'item_id': [0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 5, 6, 7, 8],
                'rating': [1, 2, 1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 4, 3, 2, 1],
            }
        ),
        dims,
    )
    validate_expected = to_coo(
        pd.DataFrame(
            data={'user_id': [2, 3, 3], 'item_id': [4, 1, 2], 'rating': [5, 1, 5]}
        ),
        dims,
    )
    test_expected = to_coo(
        pd.DataFrame(
            data={
                'user_id': [3, 4, 4, 4, 4],
                'item_id': [4, 1, 2, 4, 5],
                'rating': [4, 2, 3, 5, 4],
            }
        ),
        dims,
    )

    train_actual, validate_actual, test_actual = random_split(
        mat=interactions_to_split, val_p=0.1, test_p=0.2, seed=42
    )

    outcomes = (
        (train_actual, train_expected),
        (validate_actual, validate_expected),
        (test_actual, test_expected),
    )

    # Contents: compare the dense form of each split with its expectation.
    for actual, expected in outcomes:
        np.testing.assert_array_equal(actual.toarray(), expected.toarray())

    # Shape and type: all six matrices must agree with one another.
    every_matrix = [matrix for pair in outcomes for matrix in pair]
    assert len({matrix.shape[0] for matrix in every_matrix}) == 1
    assert len({matrix.shape[1] for matrix in every_matrix}) == 1
    assert len({type(matrix) for matrix in every_matrix}) == 1

test_random_split()
{% endraw %}

Stratified split

{% raw %}

stratified_split[source]

stratified_split(mat, val_p=0.0, test_p=0.2, seed=42)

Split interactions into training, validation, and test datasets in a stratified manner, such that each user appears at least once in each of the datasets. Every user is guaranteed to be represented in the training, validation, and test datasets provided they have at least three interactions. If val_p == 0, every user is guaranteed to appear in the training and test datasets provided they have at least two interactions. If a user has fewer interactions than required, a ValueError is raised.

{% endraw %} {% raw %}
{% endraw %} {% raw %}
import pandas as pd


def test_stratified_split():
    """Verify ``stratified_split`` on a small, fixed interaction matrix.

    The fixture holds 24 ratings from 5 users over 9 items. The split is run
    with ``val_p=0.1``, ``test_p=0.2`` and ``seed=42``; each returned matrix
    is compared with a hard-coded expectation (13 / 5 / 6 ratings for
    train / validation / test, with every user present in each), and all
    matrices must agree on row count, column count and type.
    """

    def to_coo(frame, dims):
        # Ratings are the stored values; user_id / item_id are the row /
        # column coordinates of each stored value.
        return coo_matrix(
            (frame['rating'], (frame['user_id'], frame['item_id'])),
            shape=dims,
        )

    source_df = pd.DataFrame(
        data={
            'user_id': [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4],
            'item_id': [0, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 4, 1, 2, 4, 5],
            'rating': [1, 2, 3, 4, 5, 4, 3, 2, 1, 1, 2, 3, 4, 2, 3, 4, 5, 1, 5, 4, 2, 3, 5, 4],
        }
    )
    interactions_to_split = to_coo(
        source_df,
        (source_df.user_id.nunique(), source_df.item_id.nunique()),
    )
    # Every expected matrix shares the dimensions of the input matrix.
    dims = (interactions_to_split.shape[0], interactions_to_split.shape[1])

    train_expected = to_coo(
        pd.DataFrame(
            data={
                'user_id': [0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 3, 4, 4],
                'item_id': [0, 1, 2, 4, 5, 7, 3, 4, 3, 4, 1, 1, 5],
                'rating': [1, 2, 3, 5, 4, 2, 3, 4, 4, 5, 1, 2, 4],
            }
        ),
        dims,
    )
    validate_expected = to_coo(
        pd.DataFrame(
            data={'user_id': [0, 1, 2, 3, 4], 'item_id': [8, 2, 2, 2, 4], 'rating': [1, 2, 3, 5, 5]}
        ),
        dims,
    )
    test_expected = to_coo(
        pd.DataFrame(
            data={
                'user_id': [0, 0, 1, 2, 3, 4],
                'item_id': [3, 6, 1, 1, 4, 2],
                'rating': [4, 3, 1, 2, 4, 3],
            }
        ),
        dims,
    )

    train_actual, validate_actual, test_actual = stratified_split(
        mat=interactions_to_split, val_p=0.1, test_p=0.2, seed=42
    )

    outcomes = (
        (train_actual, train_expected),
        (validate_actual, validate_expected),
        (test_actual, test_expected),
    )

    # Contents: compare the dense form of each split with its expectation.
    for actual, expected in outcomes:
        np.testing.assert_array_equal(actual.toarray(), expected.toarray())

    # Shape and type: all six matrices must agree with one another.
    every_matrix = [matrix for pair in outcomes for matrix in pair]
    assert len({matrix.shape[0] for matrix in every_matrix}) == 1
    assert len({matrix.shape[1] for matrix in every_matrix}) == 1
    assert len({type(matrix) for matrix in every_matrix}) == 1


test_stratified_split()
{% endraw %}

Split by Ratio

{% raw %}

split_by_ratio[source]

split_by_ratio(data, shuffle=False, test_size=None, pad_unknown=True, filter_unknown=False, seed=42)

{% endraw %} {% raw %}
{% endraw %} {% raw %}
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

# Toy interaction log: eight (user, item) rows for three users, with
# repeated items for user 1.
df = pd.DataFrame(
    dict(
        user=[1, 1, 1, 1, 1, 2, 2, 3],
        item=[1, 2, 3, 2, 2, 1, 2, 3],
    )
)

df
user item
0 1 1
1 1 2
2 1 3
3 1 2
4 1 2
5 2 1
6 2 2
7 3 3
{% endraw %} {% raw %}
# Unshuffled split with a 20% test fraction. In the output printed below,
# only user 1's last row (index 4) lands in the test frame, while users 2
# and 3 keep all of their rows in train.
# NOTE(review): this suggests the ratio is applied per user -- confirm
# against the split_by_ratio implementation.
train, test = split_by_ratio(df, shuffle=False, test_size=0.2, pad_unknown=True, filter_unknown=False)
print("train:\n{}\n\ntest:\n{}".format(train,test))
train:
   user  item
0     1     1
1     1     2
2     1     3
3     1     2
5     2     1
6     2     2
7     3     3

test:
   user  item
4     1     2
{% endraw %} {% raw %}
# Same frame, now with test_size=0.4 and filter_unknown=True. In the output
# printed below, user 1's last two rows (indices 3 and 4) form the test
# frame and the remaining six rows stay in train.
train, test = split_by_ratio(df, shuffle=False, test_size=0.4, pad_unknown=True, filter_unknown=True)
print("train:\n{}\n\ntest:\n{}".format(train,test))
train:
   user  item
0     1     1
1     1     2
2     1     3
5     2     1
6     2     2
7     3     3

test:
   user  item
3     1     2
4     1     2
{% endraw %}

Last-session-out Split

{% raw %}

last_session_out_split[source]

last_session_out_split(data, user_key='user_id', session_key='session_id', time_key='ts')

Assign the last session of every user to the test set and the remaining ones to the training set.

{% endraw %} {% raw %}
{% endraw %} {% raw %}
import pandas as pd

# Five sessions from four users; only user 4296 has more than one session.
df = pd.DataFrame(
    data={
        'session_id': [357, 359, 394, 4127, 6400],
        'sequence': [
            [793, 3489],
            [1762],
            [1256],
            [1948, 1364, 2060, 1115, 6488, 2060],
            [687, 1394],
        ],
        'ts': [1421003874, 1421018535, 1421007470, 1421416896, 1420807778],
        'user_id': [4296, 4296, 30980, 28117, 35247],
    }
)

df		
session_id sequence ts user_id
0 357 [793, 3489] 1421003874 4296
1 359 [1762] 1421018535 4296
2 394 [1256] 1421007470 30980
3 4127 [1948, 1364, 2060, 1115, 6488, 2060] 1421416896 28117
4 6400 [687, 1394] 1420807778 35247
{% endraw %} {% raw %}
# Split using the default column names (user_id / session_id / ts).
# User 4296 is the only user with two sessions in df; in the result shown
# below, the earlier one (session 357) is the single row left in train_data.
train_data, test_data = last_session_out_split(df)
train_data
session_id sequence ts user_id
0 357 [793, 3489] 1421003874 4296
{% endraw %}

Random Split v2

{% raw %}

random_split_v2[source]

random_split_v2(data, ratio=0.8, seed=42)

Pandas random splitter. The splitter randomly splits the input data. Args: data (pandas.DataFrame): Pandas DataFrame to be split. ratio (float or list): Ratio for splitting data. If it is a single float, the data is split into two parts and the ratio argument gives the proportion assigned to the training set; if it is a list of floats, the splitter splits the data into several portions corresponding to the split ratios. If a list is provided and the ratios do not sum to 1, they will be normalized. seed (int): Seed. Returns: list: Splits of the input data as pandas.DataFrame.

{% endraw %} {% raw %}
{% endraw %}

Chrono Split

{% raw %}

chrono_split[source]

chrono_split(data, ratio=0.75, min_rating=1, filter_by='user', col_user='USERID', col_item='ITEMID', col_timestamp='TIMESTAMP')

Pandas chronological splitter. This function splits data in a chronological manner. That is, for each user / item, the split function takes the proportions of ratings specified by the split ratio(s). The split is stratified. Args: data (pandas.DataFrame): Pandas DataFrame to be split. ratio (float or list): Ratio for splitting data. If it is a single float, the data is split into two parts and the ratio argument gives the proportion assigned to the training set; if it is a list of floats, the splitter splits the data into several portions corresponding to the split ratios. If a list is provided and the ratios do not sum to 1, they will be normalized. min_rating (int): minimum number of ratings for user or item. filter_by (str): either "user" or "item", depending on which of the two is filtered by min_rating. col_user (str): column name of user IDs. col_item (str): column name of item IDs. col_timestamp (str): column name of timestamps. Returns: list: Splits of the input data as pandas.DataFrame.

{% endraw %} {% raw %}
{% endraw %}

Stratified Split v2

{% raw %}

stratified_split_v2[source]

stratified_split_v2(data, ratio=0.75, min_rating=1, filter_by='user', col_user='USERID', col_item='ITEMID', seed=42)

Pandas stratified splitter. For each user / item, the split function takes the proportions of ratings specified by the split ratio(s). The split is stratified. Args: data (pandas.DataFrame): Pandas DataFrame to be split. ratio (float or list): Ratio for splitting data. If it is a single float, the data is split into two parts and the ratio argument gives the proportion assigned to the training set; if it is a list of floats, the splitter splits the data into several portions corresponding to the split ratios. If a list is provided and the ratios do not sum to 1, they will be normalized. seed (int): Seed. min_rating (int): minimum number of ratings for user or item. filter_by (str): either "user" or "item", depending on which of the two is filtered by min_rating. col_user (str): column name of user IDs. col_item (str): column name of item IDs. Returns: list: Splits of the input data as pandas.DataFrame.

{% endraw %} {% raw %}
{% endraw %}