---
title: Trainer
keywords: fastai
sidebar: home_sidebar
summary: "Implementation of PyTorch model trainer."
description: "Implementation of PyTorch model trainer."
nb_path: "nbs/trainers/trainers.trainer.ipynb"
---
from recohut.datasets.movielens import ML1mRatingDataset
# models
from recohut.models.afm import AFM
from recohut.models.afn import AFN
from recohut.models.autoint import AutoInt
from recohut.models.dcn import DCN
from recohut.models.deepfm import DeepFM
from recohut.models.ffm import FFM
from recohut.models.fm import FM
from recohut.models.fnfm import FNFM
from recohut.models.fnn import FNN
from recohut.models.hofm import HOFM
from recohut.models.lr import LR
from recohut.models.ncf import NCF
from recohut.models.nfm import NFM
from recohut.models.pnn import PNN
from recohut.models.wide_and_deep import WideAndDeep
from recohut.models.xdeepfm import xDeepFM
ds = ML1mRatingDataset(root='/content/ML1m', min_uc=10, min_sc=5)
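As a quick sanity check, the processed dataset exposes the attributes the `Trainer` below relies on; this snippet is only illustrative and assumes the download/filter step above has completed.

```python
# Illustrative check; attribute names are taken from their use in the Trainer below.
print(len(ds))                                # number of interactions after filtering
print(ds.field_dims)                          # per-field vocabulary sizes, used to size embeddings
print(ds.user_field_idx, ds.item_field_idx)   # field positions consumed by NCF
```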
import torch
import os
import tqdm
from sklearn.metrics import roc_auc_score
from torch.utils.data import DataLoader
class Args:
def __init__(self,
dataset='ml_1m',
model='wide_and_deep'
):
self.dataset = dataset
self.model = model
# dataset
if dataset == 'ml_1m':
self.dataset_root = '/content/ML1m'
self.min_uc = 20
self.min_sc = 20
# model training
self.device = 'cpu' # 'cuda:0'
self.num_workers = 2
self.batch_size = 256
self.lr = 0.001
self.weight_decay = 1e-6
self.save_dir = '/content/chkpt'
self.n_epochs = 2
self.dropout = 0.2
self.log_interval = 100
# model architecture
if model == 'wide_and_deep':
self.embed_dim = 16
self.mlp_dims = (16, 16)
elif model == 'fm':
self.embed_dim = 16
elif model == 'ffm':
self.embed_dim = 4
elif model == 'hofm':
self.embed_dim = 16
self.order = 3
elif model == 'fnn':
self.embed_dim = 16
self.mlp_dims = (16, 16)
elif model == 'ipnn':
self.embed_dim = 16
self.mlp_dims = (16,)
self.method = 'inner'
elif model == 'opnn':
self.embed_dim = 16
self.mlp_dims = (16,)
self.method = 'outer'
elif model == 'dcn':
self.embed_dim = 16
self.num_layers = 3
self.mlp_dims = (16, 16)
elif model == 'nfm':
self.embed_dim = 64
self.mlp_dims = (64,)
self.dropouts = (0.2, 0.2)
elif model == 'ncf':
self.embed_dim = 16
self.mlp_dims = (16, 16)
elif model == 'fnfm':
self.embed_dim = 4
self.mlp_dims = (64,)
self.dropouts = (0.2, 0.2)
elif model == 'deep_fm':
self.embed_dim = 16
self.mlp_dims = (16, 16)
elif model == 'xdeep_fm':
self.embed_dim = 16
self.cross_layer_sizes = (16, 16)
self.split_half = False
self.mlp_dims = (16, 16)
elif model == 'afm':
self.embed_dim = 16
self.attn_size = 16
self.dropouts = (0.2, 0.2)
elif model == 'autoint':
self.embed_dim = 16
self.atten_embed_dim = 64
self.num_heads = 2
self.num_layers = 3
self.mlp_dims = (400, 400)
self.dropouts = (0, 0, 0)
elif model == 'afn':
self.embed_dim = 16
self.LNN_dim = 1500
self.mlp_dims = (400, 400, 400)
self.dropouts = (0, 0, 0)
def get_dataset(self):
if self.dataset == 'ml_1m':
return ML1mRatingDataset(root = self.dataset_root,
min_uc = self.min_uc,
min_sc = self.min_sc
)
def get_model(self, field_dims, user_field_idx=None, item_field_idx=None):
if self.model == 'wide_and_deep':
return WideAndDeep(field_dims,
embed_dim=self.embed_dim,
mlp_dims = self.mlp_dims,
dropout = self.dropout
)
elif self.model == 'fm':
return FM(field_dims,
embed_dim = self.embed_dim
)
elif self.model == 'lr':
return LR(field_dims
)
elif self.model == 'ffm':
return FFM(field_dims,
embed_dim = self.embed_dim
)
elif self.model == 'hofm':
return HOFM(field_dims,
embed_dim = self.embed_dim,
order = self.order
)
elif self.model == 'fnn':
return FNN(field_dims,
embed_dim = self.embed_dim,
mlp_dims = self.mlp_dims,
dropout = self.dropout
)
elif self.model == 'ipnn':
return PNN(field_dims,
embed_dim = self.embed_dim,
mlp_dims = self.mlp_dims,
method = self.method,
dropout = self.dropout
)
elif self.model == 'opnn':
return PNN(field_dims,
embed_dim = self.embed_dim,
mlp_dims = self.mlp_dims,
method = self.method,
dropout = self.dropout
)
elif self.model == 'dcn':
return DCN(field_dims,
embed_dim = self.embed_dim,
mlp_dims = self.mlp_dims,
num_layers = self.num_layers,
dropout = self.dropout,
)
elif self.model == 'nfm':
return NFM(field_dims,
embed_dim = self.embed_dim,
mlp_dims = self.mlp_dims,
dropouts = self.dropouts,
)
elif self.model == 'ncf':
return NCF(field_dims,
embed_dim = self.embed_dim,
mlp_dims = self.mlp_dims,
dropout = self.dropout,
user_field_idx=user_field_idx,
item_field_idx=item_field_idx
)
elif self.model == 'fnfm':
return FNFM(field_dims,
embed_dim = self.embed_dim,
mlp_dims = self.mlp_dims,
dropouts = self.dropouts,
)
elif self.model == 'deep_fm':
return DeepFM(field_dims,
embed_dim = self.embed_dim,
mlp_dims = self.mlp_dims,
dropout = self.dropout,
)
elif self.model == 'xdeep_fm':
return xDeepFM(field_dims,
embed_dim = self.embed_dim,
mlp_dims = self.mlp_dims,
dropout = self.dropout,
cross_layer_sizes = self.cross_layer_sizes,
split_half = self.split_half,
)
elif self.model == 'afm':
return AFM(field_dims,
embed_dim = self.embed_dim,
dropouts = self.dropouts,
attn_size = self.attn_size,
)
elif self.model == 'autoint':
return AutoInt(field_dims,
embed_dim = self.embed_dim,
mlp_dims = self.mlp_dims,
dropouts = self.dropouts,
atten_embed_dim = self.atten_embed_dim,
num_heads = self.num_heads,
num_layers = self.num_layers,
)
elif self.model == 'afn':
return AFN(field_dims,
embed_dim = self.embed_dim,
mlp_dims = self.mlp_dims,
dropouts = self.dropouts,
LNN_dim = self.LNN_dim,
)
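`Args` doubles as a small factory: `get_dataset()` materializes the configured dataset and `get_model()` builds the matching architecture from its field dimensions. A minimal sketch, assuming the ML-1M data is available under `/content/ML1m`:

```python
# Minimal sketch: build a config, the dataset, and the corresponding model.
args = Args(dataset='ml_1m', model='fm')
dataset = args.get_dataset()
model = args.get_model(dataset.field_dims)
print(type(model).__name__, 'built with field dims', dataset.field_dims)
```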
class EarlyStopper(object):
def __init__(self, num_trials, save_path):
self.num_trials = num_trials
self.trial_counter = 0
self.best_accuracy = 0
self.save_path = save_path
def is_continuable(self, model, accuracy):
if accuracy > self.best_accuracy:
self.best_accuracy = accuracy
self.trial_counter = 0
torch.save(model, self.save_path)
return True
elif self.trial_counter + 1 < self.num_trials:
self.trial_counter += 1
return True
else:
return False
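`EarlyStopper` tracks the best validation score, checkpoints the model whenever it improves, and tolerates `num_trials` non-improving epochs before signalling a stop. An illustrative loop with a dummy model and a made-up AUC trajectory:

```python
# Illustrative only: the model, path, and AUC values below are stand-ins.
os.makedirs('/content/chkpt', exist_ok=True)
stopper = EarlyStopper(num_trials=2, save_path='/content/chkpt/dummy.pt')
dummy_model = torch.nn.Linear(2, 1)
for epoch, auc in enumerate([0.71, 0.74, 0.73, 0.72, 0.70]):
    if not stopper.is_continuable(dummy_model, auc):
        print(f'stopping at epoch {epoch}; best auc {stopper.best_accuracy}')
        break
```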
class Trainer:
def __init__(self, args):
device = torch.device(args.device)
# dataset
dataset = args.get_dataset()
# model
model = args.get_model(dataset.field_dims,
user_field_idx = dataset.user_field_idx,
item_field_idx = dataset.item_field_idx)
model = model.to(device)
model_name = type(model).__name__
# data split
train_length = int(len(dataset) * 0.8)
valid_length = int(len(dataset) * 0.1)
test_length = len(dataset) - train_length - valid_length
# data loader
train_dataset, valid_dataset, test_dataset = torch.utils.data.random_split(
dataset, (train_length, valid_length, test_length))
train_data_loader = DataLoader(train_dataset, batch_size=args.batch_size, num_workers=args.num_workers)
valid_data_loader = DataLoader(valid_dataset, batch_size=args.batch_size, num_workers=args.num_workers)
test_data_loader = DataLoader(test_dataset, batch_size=args.batch_size, num_workers=args.num_workers)
# handlers
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
os.makedirs(args.save_dir, exist_ok=True)
early_stopper = EarlyStopper(num_trials=2, save_path=f'{args.save_dir}/{model_name}.pt')
# # scheduler
# # ref - https://github.com/sparsh-ai/stanza/blob/7961a0a00dc06b9b28b71954b38181d6a87aa803/trainer/bert.py#L36
# import torch.optim as optim
# if args.enable_lr_schedule:
# if args.enable_lr_warmup:
# self.lr_scheduler = self.get_linear_schedule_with_warmup(
# optimizer, args.warmup_steps, len(train_data_loader) * self.n_epochs)
# else:
# self.lr_scheduler = optim.lr_scheduler.StepLR(
# optimizer, step_size=args.decay_step, gamma=args.gamma)
# training
for epoch_i in range(args.n_epochs):
self._train(model, optimizer, train_data_loader, criterion, device, log_interval=args.log_interval)
auc = self._test(model, valid_data_loader, device)
print('epoch:', epoch_i, 'validation: auc:', auc)
if not early_stopper.is_continuable(model, auc):
print(f'validation: best auc: {early_stopper.best_accuracy}')
break
auc = self._test(model, test_data_loader, device)  # note: evaluates the final model; the best checkpoint was saved by EarlyStopper under args.save_dir
print(f'test auc: {auc}')
@staticmethod
def _train(model, optimizer, data_loader, criterion, device, log_interval=100):
model.train()
total_loss = 0
tk0 = tqdm.tqdm(data_loader, smoothing=0, mininterval=1.0)
for i, (fields, target) in enumerate(tk0):
fields, target = fields.to(device), target.to(device)
y = model(fields)
loss = criterion(y, target.float())
model.zero_grad()
loss.backward()
# self.clip_gradients(5)
optimizer.step()
# if self.args.enable_lr_schedule:
# self.lr_scheduler.step()
total_loss += loss.item()
if (i + 1) % log_interval == 0:
tk0.set_postfix(loss=total_loss / log_interval)
total_loss = 0
@staticmethod
def _test(model, data_loader, device):
model.eval()
targets, predicts = list(), list()
with torch.no_grad():
for fields, target in tqdm.tqdm(data_loader, smoothing=0, mininterval=1.0):
fields, target = fields.to(device), target.to(device)
y = model(fields)
targets.extend(target.tolist())
predicts.extend(y.tolist())
return roc_auc_score(targets, predicts)
# def clip_gradients(self, limit=5):
# """
# Reference:
# 1. https://github.com/sparsh-ai/stanza/blob/7961a0a00dc06b9b28b71954b38181d6a87aa803/trainer/bert.py#L175
# """
# for p in self.model.parameters():
# nn.utils.clip_grad_norm_(p, 5)
# def _create_optimizer(self):
# args = self.args
# param_optimizer = list(self.model.named_parameters())
# no_decay = ['bias', 'layer_norm']
# optimizer_grouped_parameters = [
# {
# 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
# 'weight_decay': args.weight_decay,
# },
# {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
# ]
# if args.optimizer.lower() == 'adamw':
# return optim.AdamW(optimizer_grouped_parameters, lr=args.lr, eps=args.adam_epsilon)
# elif args.optimizer.lower() == 'adam':
# return optim.Adam(optimizer_grouped_parameters, lr=args.lr, weight_decay=args.weight_decay)
# elif args.optimizer.lower() == 'sgd':
# return optim.SGD(optimizer_grouped_parameters, lr=args.lr, weight_decay=args.weight_decay, momentum=args.momentum)
# else:
# raise ValueError
# def get_linear_schedule_with_warmup(self, optimizer, num_warmup_steps, num_training_steps, last_epoch=-1):
# # based on hugging face get_linear_schedule_with_warmup
# def lr_lambda(current_step: int):
# if current_step < num_warmup_steps:
# return float(current_step) / float(max(1, num_warmup_steps))
# return max(
# 0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps))
# )
# return LambdaLR(optimizer, lr_lambda, last_epoch)
models = [
'wide_and_deep',
'fm',
'lr',
'ffm',
'hofm',
'fnn',
'ipnn',
'opnn',
'dcn',
'nfm',
'ncf',
'fnfm',
'deep_fm',
'xdeep_fm',
'afm',
# 'autoint',
# 'afn'
]
for model in models:
args = Args(model=model)
trainer = Trainer(args)
models = [
'autoint',
'afn'
]
for model in models:
args = Args(model=model)
trainer = Trainer(args)
!tree --du -h -C /content/chkpt
!pip install -q wandb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch import nn
from torch.nn import functional as F
import os
import copy
import random
from pathlib import Path
from collections import defaultdict
from argparse import Namespace
from joblib import dump, load
from tqdm import tqdm
import wandb
from torch.utils.data import DataLoader as dl
class RecsysDataset(torch.utils.data.Dataset):
def __init__(self,df,usr_dict=None,mov_dict=None):
self.df = df
self.usr_dict = usr_dict
self.mov_dict = mov_dict
def __getitem__(self,index):
if self.usr_dict and self.mov_dict:
return [self.usr_dict[int(self.df.iloc[index]['user_id'])],self.mov_dict[int(self.df.iloc[index]['movie_id'])]],self.df.iloc[index]['rating']
else:
return [int(self.df.iloc[index]['user_id']-1),int(self.df.iloc[index]['movie_id']-1)],self.df.iloc[index]['rating']
def __len__(self):
return len(self.df)
sample = pd.DataFrame({'user_id':[1,2,3,2,2,3,2,2],'movie_id':[1,2,3,3,3,2,1,1],'rating':[2.0,1.0,4.0,5.0,1.3,3.5,3.0,4.5]})
trn_ids = random.sample(range(8), 4)
valid_ids = [i for i in range(8) if i not in trn_ids]
sample_trn = copy.deepcopy(sample.iloc[trn_ids].reset_index())
sample_vld = copy.deepcopy(sample.iloc[valid_ids].reset_index())
sample_vld = RecsysDataset(sample_vld)
sample_trn = RecsysDataset(sample_trn)
train_loader = dl(sample_trn, batch_size=2, shuffle=True)
valid_loader = dl(sample_vld, batch_size=2, shuffle=True)
class NCF(nn.Module):
def __init__(self,user_sz,item_sz,embd_sz,dropout_fac,min_r=0.0,max_r=5.0,alpha=0.5,with_variable_alpha=False):
super().__init__()
self.dropout_fac = dropout_fac
self.user_embd_mtrx = nn.Embedding(user_sz,embd_sz)
self.item_embd_mtrx = nn.Embedding(item_sz,embd_sz)
#bias = torch.zeros(size=(user_sz, 1), requires_grad=True)
self.h = nn.Linear(embd_sz,1)
self.fst_lyr = nn.Linear(embd_sz*2,embd_sz)
self.snd_lyr = nn.Linear(embd_sz,embd_sz//2)
self.thrd_lyr = nn.Linear(embd_sz//2,embd_sz//4)
self.out_lyr = nn.Linear(embd_sz//4,1)
self.alpha = torch.tensor(alpha)
self.min_r,self.max_r = min_r,max_r
if with_variable_alpha:
self.alpha = nn.Parameter(torch.tensor(alpha))  # register as a parameter so the optimizer can actually learn the blend weight
def forward(self,x):
user_emd = self.user_embd_mtrx(x[0])
item_emd = self.item_embd_mtrx(x[-1])
#hadamard-product
gmf = user_emd*item_emd
gmf = self.h(gmf)
mlp = torch.cat([user_emd,item_emd],dim=-1)
# MLP branch: concatenated embeddings through three hidden layers (dropout after the first)
mlp = F.relu(self.fst_lyr(mlp))
mlp = F.dropout(mlp, p=self.dropout_fac, training=self.training)  # respect train/eval mode
mlp = F.relu(self.snd_lyr(mlp))
mlp = F.relu(self.thrd_lyr(mlp))
mlp = self.out_lyr(mlp)
# blend the GMF and MLP branches with a mixing weight clipped to [0, 1]
fac = torch.clip(self.alpha, min=0.0, max=1.0)
out = fac*gmf + (1-fac)*mlp
out = torch.clip(out,min=self.min_r,max=self.max_r)
return out
model = NCF(3,3,4,0.5)
for u,r in train_loader:
#user,item = u
print(f'user:{u[0]},item:{u[-1]} and rating:{r}')
#print(u)
out = model(u)
print(f'output of the network=> out:{out},shape:{out.shape}')
break
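Because `forward` clips the blended score to `[min_r, max_r]`, predictions for hand-built index tensors should stay inside the rating range. A small sketch reusing the toy `model` above:

```python
# Sketch: scores for a few (user, item) index pairs should land in [0.0, 5.0].
with torch.no_grad():
    users = torch.tensor([0, 1, 2])
    items = torch.tensor([2, 1, 0])
    print(model([users, items]).squeeze(-1))
```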
class Trainer(object):
def __init__(self, model, device,loss_fn=None, optimizer=None, scheduler=None,artifacts_loc=None,exp_tracker=None):
# Set params
self.model = model
self.device = device
self.loss_fn = loss_fn
self.optimizer = optimizer
self.scheduler = scheduler
self.store_loc = artifacts_loc
self.exp_tracker = exp_tracker
def train_step(self, dataloader):
"""Train step."""
# Set model to train mode
self.model.train()
loss = 0.0
# Iterate over train batches
for i, batch in enumerate(dataloader):
#batch = [item.to(self.device) for item in batch] # Set device
inputs,targets = batch
inputs = [item.to(self.device) for item in inputs]
targets = targets.to(self.device)
#inputs, targets = batch[:-1], batch[-1]
#import pdb;pdb.set_trace()
self.optimizer.zero_grad() # Reset gradients
z = self.model(inputs) # Forward pass
targets = targets.reshape(z.shape)
J = self.loss_fn(z.float(), targets.float()) # Define loss
J.backward() # Backward pass
self.optimizer.step() # Update weights
# Cumulative Metrics
loss += (J.detach().item() - loss) / (i + 1)
return loss
def eval_step(self, dataloader):
"""Validation or test step."""
# Set model to eval mode
self.model.eval()
loss = 0.0
y_trues, y_probs = [], []
# Iterate over val batches
with torch.inference_mode():
for i, batch in enumerate(dataloader):
inputs,y_true = batch
inputs = [item.to(self.device) for item in inputs]
y_true = y_true.to(self.device).float()
# Step
z = self.model(inputs).float() # Forward pass
y_true = y_true.reshape(z.shape)
J = self.loss_fn(z, y_true).item()
# Cumulative Metrics
loss += (J - loss) / (i + 1)
# Store outputs
y_prob = z.cpu().numpy()
y_probs.extend(y_prob)
y_trues.extend(y_true.cpu().numpy())
return loss, np.vstack(y_trues), np.vstack(y_probs)
def predict_step(self, dataloader):
"""Prediction step."""
# Set model to eval mode
self.model.eval()
y_probs = []
# Iterate over val batches
with torch.inference_mode():
for i, batch in enumerate(dataloader):
# Forward pass w/ inputs
inputs, targets = batch
inputs = [item.to(self.device) for item in inputs]  # keep inputs on the same device as the model
z = self.model(inputs).float()
# Store outputs
y_prob = z.cpu().numpy()
y_probs.extend(y_prob)
return np.vstack(y_probs)
def train(self, num_epochs, patience, train_dataloader, val_dataloader,
tolerance=1e-5):
best_val_loss = np.inf
training_stats = defaultdict(list)
for epoch in tqdm(range(num_epochs)):
# Steps
train_loss = self.train_step(dataloader=train_dataloader)
val_loss, _, _ = self.eval_step(dataloader=val_dataloader)
#store stats
training_stats['epoch'].append(epoch)
training_stats['train_loss'].append(train_loss)
training_stats['val_loss'].append(val_loss)
#log-stats
# wandb.init(project=f"{args.trail_id}_{args.dataset}_{args.data_type}",config=config_dict)
if self.exp_tracker == 'wandb':
log_metrics = {'epoch':epoch,'train_loss':train_loss,'val_loss':val_loss}
wandb.log(log_metrics,step=epoch)
self.scheduler.step(val_loss)
# Early stopping
if val_loss < best_val_loss - tolerance:
best_val_loss = val_loss
best_model = copy.deepcopy(self.model)  # snapshot the best weights; a plain reference would keep tracking later updates
_patience = patience # reset _patience
else:
_patience -= 1
if not _patience: # 0
print("Stopping early!")
break
# Tracking
#mlflow.log_metrics({"train_loss": train_loss, "val_loss": val_loss}, step=epoch)
# Logging
if epoch%5 == 0:
print(
f"Epoch: {epoch+1} | "
f"train_loss: {train_loss:.5f}, "
f"val_loss: {val_loss:.5f}, "
f"lr: {self.optimizer.param_groups[0]['lr']:.2E}, "
f"_patience: {_patience}"
)
if self.store_loc:
pd.DataFrame(training_stats).to_csv(self.store_loc/'training_stats.csv',index=False)
return best_model, best_val_loss
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
optimizer, mode="min", factor=0.1, patience=5)
trainer = Trainer(model, 'cpu', loss_fn, optimizer, scheduler)
best_model, best_val_loss = trainer.train(100, 10, train_loader, valid_loader)
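With the trained weights in hand, `predict_step` can score a loader directly; an illustrative follow-up:

```python
# Illustrative follow-up: score the validation loader and report the best validation loss.
val_preds = trainer.predict_step(valid_loader)
print('best val loss:', best_val_loss)
print('predictions shape:', val_preds.shape)
```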