---
title: User Grouping
keywords: fastai
sidebar: home_sidebar
summary: "Generate user groups for training group recommenders."
description: "Generate user groups for training group recommenders."
nb_path: "nbs/transforms/transforms.user_grouping.ipynb"
---
{% raw %}
{% endraw %} {% raw %}
{% endraw %} {% raw %}

class GroupGenerator[source]

GroupGenerator(user_ids, item_ids, ratings, output_path, rating_threshold, num_groups, group_sizes, min_num_ratings, train_ratio, val_ratio, negative_sample_size, verbose=True)

Group Data Generator

{% endraw %} {% raw %}
{% endraw %} {% raw %}
# Download the MovieLens 1M archive and unpack it; this creates ./ml-1m/
# containing movies.dat, ratings.dat, users.dat and README.
!wget -q --show-progress https://files.grouplens.org/datasets/movielens/ml-1m.zip
!unzip ml-1m.zip
ml-1m.zip           100%[===================>]   5.64M  18.2MB/s    in 0.3s    
Archive:  ml-1m.zip
   creating: ml-1m/
  inflating: ml-1m/movies.dat        
  inflating: ml-1m/ratings.dat       
  inflating: ml-1m/README            
  inflating: ml-1m/users.dat         
{% endraw %} {% raw %}
# Load the raw ratings. The file has no header row (hence header=None, which
# yields integer column labels 0..3) and uses the two-character separator
# '::'; pandas only supports multi-character separators with the slower
# 'python' parsing engine.
# Per the MovieLens 1M README the columns are UserID, MovieID, Rating, Timestamp.
ratings = pd.read_csv('ml-1m/ratings.dat', sep='::', engine='python', header=None)
# Preview the first five rows.
ratings.head()
0 1 2 3
0 1 1193 5 978300760
1 1 661 3 978302109
2 1 914 3 978301968
3 1 3408 4 978300275
4 1 2355 5 978824291
{% endraw %} {% raw %}
# Column-wise maxima. The largest values in columns 0 and 1 (6040 and 3952)
# are one less than the np.arange bounds passed as user_ids / item_ids to
# GroupGenerator below.
ratings.max()
0          6040
1          3952
2             5
3    1046454590
dtype: int64
{% endraw %} {% raw %}
# Build the group-recommendation dataset and write it under output_path.
# user_ids / item_ids span 0..6040 and 0..3952, i.e. up to the maxima
# reported by ratings.max() above.
# NOTE(review): the meaning of the remaining arguments is inferred from their
# names only -- confirm against the GroupGenerator implementation:
#   rating_threshold      presumably the minimum rating treated as positive
#   group_sizes           presumably the allowed number of members per group
#   min_num_ratings       presumably the minimum ratings required to keep a user/group
#   train_ratio/val_ratio presumably split fractions, the remainder being test
#   negative_sample_size  presumably negatives sampled per evaluation instance
group_generator = GroupGenerator(
                    user_ids=np.arange(6041),
                    item_ids=np.arange(3953),
                    ratings=ratings,
                    output_path='./data/silver',
                    rating_threshold=4,
                    num_groups=10,
                    group_sizes=[2, 3],
                    min_num_ratings=20,
                    train_ratio=0.7,
                    val_ratio=0.1,
                    negative_sample_size=10)
Save data: ./data/silver
# Users: 23
# Items: 360
# Groups: 10
# U-I ratings: 2866
# G-I ratings: 503
Avg. # ratings / user: 124.61
Avg. # ratings / group: 50.30
Avg. group size: 2.30
{% endraw %} {% raw %}
# Report the size of every regular file found directly in the output directory.
path = './data/silver'

size_of_file = []
for entry in os.listdir(path):
    full_path = os.path.join(path, entry)
    # Skip sub-directories and anything else that is not a regular file.
    if os.path.isfile(full_path):
        size_of_file.append((entry, os.stat(full_path).st_size))

for entry, num_bytes in size_of_file:
    # Sizes are printed in units of 1024 bytes with one decimal place.
    print("{:.1f}K: {}".format(round(num_bytes/(1024),3),entry))
53.0K: userRatingTrain.dat
0.1K: groupMember.dat
5.4K: groupRatingTestNegative.dat
5.0K: userRatingValNegative.dat
1.8K: userRatingVal.dat
15.2K: userRatingTestNegative.dat
5.7K: userRatingTest.dat
6.4K: groupRatingTrain.dat
1.8K: groupRatingTest.dat
0.9K: groupRatingVal.dat
2.8K: groupRatingValNegative.dat
{% endraw %}