---
title: MovieLens Dataset Transformation
keywords: fastai
sidebar: home_sidebar
summary: "Implementation of transformation functions specific to MovieLens datasets."
description: "Implementation of transformation functions specific to MovieLens datasets."
nb_path: "nbs/transforms/datasets/transforms.datasets.movielens.ipynb"
---
{% raw %}
{% endraw %} {% raw %}
{% endraw %} {% raw %}

sparseFeature[source]

sparseFeature(feat, feat_num, embed_dim=4)

Create a dictionary describing a sparse feature. :param feat: The feature name. :param feat_num: The number of distinct values the sparse feature can take. :param embed_dim: The embedding dimension. :return: A dictionary with the keys 'feat', 'feat_num' and 'embed_dim'.

{% endraw %} {% raw %}
{% endraw %} {% raw %}

create_ml_1m_dataset[source]

create_ml_1m_dataset(file, trans_score=2, embed_dim=8, test_neg_num=100)

:param file: A string. The dataset path. :param trans_score: A scalar threshold. Values greater than it map to 1, and values less than it map to 0. :param embed_dim: A scalar. The embedding (latent factor) dimension. :param test_neg_num: A scalar. The number of test negative samples. :return: feature_columns, train, val, test

{% endraw %} {% raw %}
{% endraw %} {% raw %}

create_implicit_ml_1m_dataset[source]

create_implicit_ml_1m_dataset(file, trans_score=2, embed_dim=8, maxlen=40)

:param file: A string. The dataset path. :param trans_score: A scalar threshold. Values greater than it map to 1, and values less than it map to 0. :param embed_dim: A scalar. The embedding (latent factor) dimension. :param maxlen: A scalar. The maximum sequence length; shorter sequences are padded. :return: feature_columns, train, val, test

{% endraw %} {% raw %}
{% endraw %} {% raw %}
!wget -q --show-progress https://files.grouplens.org/datasets/movielens/ml-1m.zip
!unzip ml-1m.zip
ml-1m.zip           100%[===================>]   5.64M  3.71MB/s    in 1.5s    
Archive:  ml-1m.zip
   creating: ml-1m/
  inflating: ml-1m/movies.dat        
  inflating: ml-1m/ratings.dat       
  inflating: ml-1m/README            
  inflating: ml-1m/users.dat         
{% endraw %} {% raw %}
file = 'ml-1m/ratings.dat'
test_neg_num = 100
embed_dim = 64
trans_score = 1
maxlen = 200
{% endraw %} {% raw %}
feature_columns, train, val, test = create_ml_1m_dataset(file, trans_score, embed_dim, test_neg_num)
==========Data Preprocess Start=============
============Negative Sampling===============
100%|██████████| 6040/6040 [00:44<00:00, 134.33it/s]
============Data Preprocess End=============
{% endraw %} {% raw %}
feature_columns
[{'embed_dim': 64, 'feat': 'user_id', 'feat_num': 6041},
 {'embed_dim': 64, 'feat': 'item_id', 'feat_num': 3953}]
{% endraw %} {% raw %}
train
[array([   1,    1,    1, ..., 6040, 6040, 6040]),
 array([1270, 1721, 1022, ..., 2917, 1921, 1784]),
 array([2152, 1229, 3617, ..., 2960, 3686, 1569])]
{% endraw %} {% raw %}
val
[array([   1,    2,    3, ..., 6038, 6039, 6040]),
 array([1907, 1544, 3868, ..., 2700, 1204,  161]),
 array([3132, 1812,  271, ..., 1697, 2718, 3572])]
{% endraw %} {% raw %}
test
[array([   1,    2,    3, ..., 6038, 6039, 6040]),
 array([  48, 1917, 2081, ..., 1183, 1254, 1221]),
 array([[ 426, 1915, 2201, ..., 1687, 2916, 1266],
        [3294, 2362,  167, ..., 1322, 2715, 3013],
        [2973, 3000, 1832, ...,  514, 2845, 1901],
        ...,
        [1258,  335, 3638, ..., 3582, 2221,  763],
        [1767, 2924,  691, ..., 1624, 2493,  371],
        [1106, 3048, 1940, ..., 3520, 2102, 2275]])]
{% endraw %} {% raw %}
feature_columns, train, val, test = create_implicit_ml_1m_dataset(file, trans_score, embed_dim, maxlen)
==========Data Preprocess Start=============
100%|██████████| 6040/6040 [00:35<00:00, 170.13it/s]
==================Padding===================
============Data Preprocess End=============
{% endraw %} {% raw %}
feature_columns
[{'embed_dim': 64, 'feat': 'user_id', 'feat_num': 6041},
 {'embed_dim': 64, 'feat': 'item_id', 'feat_num': 3953}]
{% endraw %} {% raw %}
train
([array([5534, 3031, 1764, ..., 3159, 2137, 3129]),
  array([[   0,    0,    0, ...,  349, 1356, 1580],
         [   0,    0,    0, ..., 3255, 2108,  507],
         [2322, 3316,    9, ..., 2502, 1476, 2759],
         ...,
         [   0,    0,    0, ..., 1258, 1240, 1270],
         [   0,    0,    0, ..., 2038, 1831,   24],
         [2379, 3846, 3041, ..., 2391,  866, 3476]], dtype=int32),
  array([1372, 2309, 3052, ..., 1285, 2668, 2143])],
 array([1, 0, 1, ..., 1, 1, 0]))
{% endraw %} {% raw %}
val
([array([3468, 1903, 1902, ..., 5215, 2977, 3597]),
  array([[   0,    0,    0, ..., 3114,  593, 2345],
         [   0,    0,    0, ..., 1201, 3671, 3681],
         [1754,   44,  247, ..., 1092, 3005, 2605],
         ...,
         [   0,    0,    0, ..., 1252,  720,  745],
         [   0,    0,    0, ..., 2581, 2724, 2763],
         [   0,    0,    0, ..., 2096, 2137, 1032]], dtype=int32),
  array([3951, 1202,  832, ..., 3177,  476, 1029])],
 array([0, 0, 1, ..., 0, 0, 1]))
{% endraw %}