---
title: Criteo
keywords: fastai
sidebar: home_sidebar
summary: "Criteo dataset."
description: "Criteo dataset."
nb_path: "nbs/datasets/datasets.criteo.ipynb"
---
{% raw %}
{% endraw %} {% raw %}
{% endraw %}

Criteo sample dataset

{% raw %}

class CriteoSampleDataset[source]

CriteoSampleDataset(root, test_size=0.2, random_seed=42) :: Dataset

Criteo Sample Dataset

Reference:

1. https://github.com/huangjunheng/recommendation_model/blob/master/DCN/dcn.py
{% endraw %} {% raw %}
{% endraw %} {% raw %}
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from sklearn.metrics import log_loss, roc_auc_score

from recohut.models.dcn import DCNv2 as DCN


def get_auc(loader, model):
    """Return the ROC AUC of ``model`` over every batch yielded by ``loader``.

    The model is switched to evaluation mode (and left in it) and gradient
    tracking is disabled while the scores are gathered. Tensors are moved to
    the module-level ``device`` and cast to float before the forward pass.

    Args:
        loader: Iterable yielding ``(x, y)`` tensor batches.
        model: Module called as ``model(x)`` to produce the scores.

    Returns:
        The result of ``roc_auc_score(targets, scores)`` over all batches.
    """
    scores, labels = [], []
    model.eval()
    with torch.no_grad():
        for features, truth in loader:
            features = features.to(device).float()
            truth = truth.to(device).float()
            outputs = model(features)
            # Iterating the arrays appends one entry per row, batch after batch.
            scores.extend(outputs.cpu().numpy())
            labels.extend(truth.cpu().numpy())
    return roc_auc_score(labels, scores)


# --- Run configuration -----------------------------------------------------
root = '/content/data'
batch_size = 1024
lr = 1e-2
wd = 1e-3
epoches = 20
# NOTE: `seed` is declared here but nothing in this cell passes it to an RNG.
seed = 2022
embedding_size = 4
device = 'cpu'

# --- Data: load the train/test splits and wrap them in loaders -------------
ds = CriteoSampleDataset(root=root)
train_tensor_data, test_tensor_data = ds.load()
train_loader = DataLoader(
    train_tensor_data,
    batch_size=batch_size,
    shuffle=True,
)
test_loader = DataLoader(test_tensor_data, batch_size=batch_size)

# --- Model, loss and optimizer ---------------------------------------------
model = DCN(
    ds.feat_sizes,
    embedding_size,
    ds.linear_feature_columns,
    ds.dnn_feature_columns,
).to(device)
loss_func = nn.BCELoss(reduction='mean')
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=lr,
    weight_decay=wd,
)

# --- Training: one pass over train_loader per epoch, then AUC on the test set
for epoch in range(epoches):
    # get_auc() puts the model in eval mode, so re-enable train mode each epoch.
    model.train()
    total_loss_epoch, total_tmp = 0.0, 0
    for index, (x, y) in enumerate(train_loader):
        x = x.to(device).float()
        y = y.to(device).float()
        # Clear gradients left over from the previous step before the new pass.
        optimizer.zero_grad()
        y_hat = model(x)
        loss = loss_func(y_hat, y)
        loss.backward()
        optimizer.step()
        # Accumulate per-batch loss; total_tmp counts batches for the mean.
        total_loss_epoch += loss.item()
        total_tmp += 1
    auc = get_auc(test_loader, model)
    print(
        'epoch/epoches: {}/{}, train loss: {:.3f}, test auc: {:.3f}'.format(
            epoch, epoches, total_loss_epoch / total_tmp, auc
        )
    )
Processing...
Done!
epoch/epoches: 0/20, train loss: 0.536, test auc: 0.679
epoch/epoches: 1/20, train loss: 0.499, test auc: 0.702
epoch/epoches: 2/20, train loss: 0.486, test auc: 0.722
epoch/epoches: 3/20, train loss: 0.477, test auc: 0.740
epoch/epoches: 4/20, train loss: 0.468, test auc: 0.745
epoch/epoches: 5/20, train loss: 0.463, test auc: 0.752
epoch/epoches: 6/20, train loss: 0.460, test auc: 0.757
epoch/epoches: 7/20, train loss: 0.458, test auc: 0.753
epoch/epoches: 8/20, train loss: 0.455, test auc: 0.758
epoch/epoches: 9/20, train loss: 0.452, test auc: 0.759
epoch/epoches: 10/20, train loss: 0.450, test auc: 0.757
epoch/epoches: 11/20, train loss: 0.449, test auc: 0.758
epoch/epoches: 12/20, train loss: 0.444, test auc: 0.758
epoch/epoches: 13/20, train loss: 0.438, test auc: 0.757
epoch/epoches: 14/20, train loss: 0.428, test auc: 0.753
epoch/epoches: 15/20, train loss: 0.413, test auc: 0.750
epoch/epoches: 16/20, train loss: 0.395, test auc: 0.744
epoch/epoches: 17/20, train loss: 0.381, test auc: 0.735
epoch/epoches: 18/20, train loss: 0.365, test auc: 0.737
epoch/epoches: 19/20, train loss: 0.354, test auc: 0.735
{% endraw %}

Criteo dataset

{% raw %}

class CriteoDataset[source]

CriteoDataset(*args, **kwds) :: Dataset

Criteo Display Advertising Challenge Dataset. Data preparation:

* Remove the infrequent features (those appearing in fewer than `min_threshold` instances) and treat them as a single feature
* Discretize numerical values with the log2 transformation proposed by the winner of the Criteo competition

Parameters:

* `dataset_path`: path to the Criteo `train.txt` file.
* `cache_path`: path to the LMDB cache.
* `rebuild_cache`: if True, the LMDB cache is refreshed.
* `min_threshold`: infrequent-feature threshold.

References:

1. https://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset
2. https://www.csie.ntu.edu.tw/~r01922136/kaggle-2014-criteo.pdf

{% endraw %} {% raw %}

convert_numeric_feature[source]

convert_numeric_feature(val:str)

{% endraw %} {% raw %}
{% endraw %}
{% raw %}
%reload_ext watermark
%watermark -a "Sparsh A." -m -iv -u -t -d -p recohut
Author: Sparsh A.

Last updated: 2022-01-07 08:45:18

recohut: 0.0.9

Compiler    : GCC 7.5.0
OS          : Linux
Release     : 5.4.144+
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit

matplotlib: 3.2.2
pandas    : 1.1.5
lmdb      : 0.99
PIL       : 7.1.2
IPython   : 5.5.0
numpy     : 1.19.5
torch     : 1.10.0+cu111

{% endraw %}