---
title: Splitting
keywords: fastai
sidebar: home_sidebar
summary: "Data Splitting Transforms."
description: "Data Splitting Transforms."
nb_path: "nbs/transforms/transforms.splitting.ipynb"
---
import pandas as pd
def test_random_split():
    """Check ``random_split`` against a fixed, seeded expected partition.

    A 5-user x 9-item ratings matrix is split with ``val_p=0.1``,
    ``test_p=0.2`` and ``seed=42``. The train / validate / test results are
    compared with hard-coded expectations: first their dense contents, then
    their row counts, column counts and concrete types.
    """

    def _as_coo(frame, shape):
        # Ratings are the stored values; (user_id, item_id) are the coordinates.
        return coo_matrix(
            (frame['rating'], (frame['user_id'], frame['item_id'])),
            shape=shape,
        )

    def _all_equal(values):
        # Pairwise neighbour comparison, i.e. the same as ``a == b == c == ...``.
        return all(left == right for left, right in zip(values, values[1:]))

    # --- input interactions -------------------------------------------------
    source_df = pd.DataFrame(
        data={
            'user_id': [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4],
            'item_id': [0, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 4, 1, 2, 4, 5],
            'rating': [1, 2, 3, 4, 5, 4, 3, 2, 1, 1, 2, 3, 4, 2, 3, 4, 5, 1, 5, 4, 2, 3, 5, 4]
        }
    )
    interactions_to_split = _as_coo(
        source_df,
        (source_df.user_id.nunique(), source_df.item_id.nunique()),
    )
    # Every expected matrix shares the shape of the input matrix.
    full_shape = (interactions_to_split.shape[0], interactions_to_split.shape[1])

    # --- expected partitions ------------------------------------------------
    train_expected = _as_coo(
        pd.DataFrame(
            data={
                'user_id': [0, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 0, 0, 0, 0],
                'item_id': [0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 5, 6, 7, 8],
                'rating': [1, 2, 1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 4, 3, 2, 1],
            }
        ),
        full_shape,
    )
    validate_expected = _as_coo(
        pd.DataFrame(
            data={'user_id': [2, 3, 3], 'item_id': [4, 1, 2], 'rating': [5, 1, 5]}
        ),
        full_shape,
    )
    test_expected = _as_coo(
        pd.DataFrame(
            data={
                'user_id': [3, 4, 4, 4, 4],
                'item_id': [4, 1, 2, 4, 5],
                'rating': [4, 2, 3, 5, 4],
            }
        ),
        full_shape,
    )

    # --- run the split under test -------------------------------------------
    train_actual, validate_actual, test_actual = random_split(
        mat=interactions_to_split, val_p=0.1, test_p=0.2, seed=42
    )

    # Dense contents must match, partition by partition.
    for produced, wanted in zip(
        (train_actual, validate_actual, test_actual),
        (train_expected, validate_expected, test_expected),
    ):
        np.testing.assert_array_equal(produced.toarray(), wanted.toarray())

    # All six matrices must agree on rows, columns and concrete type.
    everything = [
        train_actual,
        train_expected,
        validate_actual,
        validate_expected,
        test_actual,
        test_expected,
    ]
    assert _all_equal([m.shape[0] for m in everything])
    assert _all_equal([m.shape[1] for m in everything])
    assert _all_equal([type(m) for m in everything])
# Invoke the test right away so an assertion failure surfaces as soon as this
# cell/script is executed (no test runner is involved here).
test_random_split()
import pandas as pd
def test_stratified_split():
    """Check ``stratified_split`` against a fixed, seeded expected partition.

    A 5-user x 9-item ratings matrix is split with ``val_p=0.1``,
    ``test_p=0.2`` and ``seed=42``. The train / validate / test results are
    compared with hard-coded expectations (in which every user appears in all
    three partitions): first their dense contents, then their row counts,
    column counts and concrete types.
    """

    def _as_coo(frame, shape):
        # Ratings are the stored values; (user_id, item_id) are the coordinates.
        return coo_matrix(
            (frame['rating'], (frame['user_id'], frame['item_id'])),
            shape=shape,
        )

    def _all_equal(values):
        # Pairwise neighbour comparison, i.e. the same as ``a == b == c == ...``.
        return all(left == right for left, right in zip(values, values[1:]))

    # --- input interactions -------------------------------------------------
    source_df = pd.DataFrame(
        data={
            'user_id': [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4],
            'item_id': [0, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 4, 1, 2, 4, 5],
            'rating': [1, 2, 3, 4, 5, 4, 3, 2, 1, 1, 2, 3, 4, 2, 3, 4, 5, 1, 5, 4, 2, 3, 5, 4]
        }
    )
    interactions_to_split = _as_coo(
        source_df,
        (source_df.user_id.nunique(), source_df.item_id.nunique()),
    )
    # Every expected matrix shares the shape of the input matrix.
    full_shape = (interactions_to_split.shape[0], interactions_to_split.shape[1])

    # --- expected partitions ------------------------------------------------
    train_expected = _as_coo(
        pd.DataFrame(
            data={
                'user_id': [0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 3, 4, 4],
                'item_id': [0, 1, 2, 4, 5, 7, 3, 4, 3, 4, 1, 1, 5],
                'rating': [1, 2, 3, 5, 4, 2, 3, 4, 4, 5, 1, 2, 4],
            }
        ),
        full_shape,
    )
    validate_expected = _as_coo(
        pd.DataFrame(
            data={'user_id': [0, 1, 2, 3, 4], 'item_id': [8, 2, 2, 2, 4], 'rating': [1, 2, 3, 5, 5]}
        ),
        full_shape,
    )
    test_expected = _as_coo(
        pd.DataFrame(
            data={
                'user_id': [0, 0, 1, 2, 3, 4],
                'item_id': [3, 6, 1, 1, 4, 2],
                'rating': [4, 3, 1, 2, 4, 3],
            }
        ),
        full_shape,
    )

    # --- run the split under test -------------------------------------------
    train_actual, validate_actual, test_actual = stratified_split(
        mat=interactions_to_split, val_p=0.1, test_p=0.2, seed=42
    )

    # Dense contents must match, partition by partition.
    for produced, wanted in zip(
        (train_actual, validate_actual, test_actual),
        (train_expected, validate_expected, test_expected),
    ):
        np.testing.assert_array_equal(produced.toarray(), wanted.toarray())

    # All six matrices must agree on rows, columns and concrete type.
    everything = [
        train_actual,
        train_expected,
        validate_actual,
        validate_expected,
        test_actual,
        test_expected,
    ]
    assert _all_equal([m.shape[0] for m in everything])
    assert _all_equal([m.shape[1] for m in everything])
    assert _all_equal([type(m) for m in everything])
# Invoke the test right away so an assertion failure surfaces as soon as this
# cell/script is executed (no test runner is involved here).
test_stratified_split()
import pandas as pd
import warnings
# Demo of split_by_ratio on a tiny user/item interaction table.
# Suppress every warning from here on so the demo output stays uncluttered.
warnings.filterwarnings('ignore')
# Toy data: user 1 has five interactions (item 2 repeated three times),
# user 2 has two, and user 3 has a single one.
df = pd.DataFrame.from_dict(
{
'user':[1,1,1,1,1,2,2,3],
'item':[1,2,3,2,2,1,2,3]
}
)
# Bare expression: shows the frame when run as a notebook cell; it has no
# effect when the file is executed as a plain script.
df
# First run: 20% test size, no shuffling, pad_unknown on, filter_unknown off.
# NOTE(review): the exact meaning of pad_unknown / filter_unknown is defined by
# split_by_ratio, which is not visible here — confirm against its docs.
train, test = split_by_ratio(df, shuffle=False, test_size=0.2, pad_unknown=True, filter_unknown=False)
print("train:\n{}\n\ntest:\n{}".format(train,test))
# Second run: 40% test size with filter_unknown switched on; `train` and `test`
# from the first run are overwritten.
train, test = split_by_ratio(df, shuffle=False, test_size=0.4, pad_unknown=True, filter_unknown=True)
print("train:\n{}\n\ntest:\n{}".format(train,test))
import pandas as pd
# Demo of last_session_out_split on five sessions; this rebinds `df`.
# Each row is one session: its id, the list stored under 'sequence', a
# timestamp, and the owning user. User 4296 is the only user with two sessions
# (357 and 359); every other user has exactly one.
df = pd.DataFrame.from_dict({
'session_id': [357,359,394,4127,6400],
'sequence': [[793, 3489],[1762],[1256],
[1948, 1364, 2060, 1115, 6488, 2060],
[687, 1394]],
'ts': [1421003874, 1421018535, 1421007470,
1421416896, 1420807778],
'user_id': [4296, 4296, 30980, 28117, 35247]
})
# Bare expression: shows the frame when run as a notebook cell; it has no
# effect when the file is executed as a plain script.
df
# Split with the function's default arguments.
# NOTE(review): presumably each user's most recent session (by 'ts') is held
# out as test data — confirm against last_session_out_split.
train_data, test_data = last_session_out_split(df)
# Notebook-style display of the training portion only.
train_data