---
title: Title
keywords: fastai
sidebar: home_sidebar
---
%load_ext autoreload
%autoreload 2
%matplotlib inline
import boto3
from collections.abc import Iterable
import matplotlib.pyplot as plt
import numpy as np
from operator import lt, gt, add, sub
import os
import pandas as pd
import requests
from tabulate import tabulate
import torch
import torch.nn as nn
import warnings
from accio.s3tool import S3tool
from htools import auto_repr, valuecheck, save
from incendio.utils import DEVICE
from incendio.optimizers import variable_lr_optimizer, update_optimizer
@auto_repr
class TorchCallback:
    """Base class for training callbacks. Subclasses override whichever hooks
    they need; every hook is a no-op by default.
    """

    def on_train_begin(self, trainer, epochs, lrs, lr_mult, **kwargs):
        pass

    def on_train_end(self, trainer, epoch, stats, val_stats):
        pass

    def on_epoch_begin(self, trainer, epoch, stats, val_stats):
        pass

    def on_epoch_end(self, trainer, epoch, stats, val_stats):
        pass

    def on_batch_begin(self, trainer, i, sum_i, stats):
        pass

    def on_batch_end(self, trainer, i, sum_i, stats):
        pass
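# A minimal sketch (not part of the library) of how the TorchCallback interface
# above is meant to be used: subclass it and override only the hooks you need.
# It assumes `trainer` exposes a `logger`, as SchedulerMixin below does.
class EpochLoggerCallback(TorchCallback):
    """Hypothetical callback that logs a short message at the end of each epoch."""

    def on_epoch_end(self, trainer, epoch, stats, val_stats):
        trainer.logger.info(f'Finished epoch {epoch}. Stats: {stats}')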
class SchedulerMixin(TorchCallback):
    verbose = False

    def on_train_end(self, trainer, *args, **kwargs):
        self.plot_lrs(os.path.join(trainer.out_dir, 'lrs.png'))

    def update_lr(self, trainer, n):
        # Look up the precomputed LR for iteration `n`. If the schedule has
        # been exhausted, leave the optimizer unchanged.
        try:
            lr = self.lrs[n]
        except IndexError:
            return
        update_optimizer(trainer.optim, lr, lr_mult=self.lr_mult)
        if self.verbose:
            trainer.logger.info(f'Set learning rate to {lr:.4f}.')

    def plot_lrs(self, path=None):
        """Display learning rate by iteration.

        Note: If the plot is not as smooth as expected, this likely
        means that there are very few iterations per epoch
        (i.e. the batch size is very large, at least in relative terms).
        """
        plt.plot(self.lrs)
        plt.xlabel('Iteration')
        plt.ylabel('Learning Rate')
        plt.title('Learning Rate Schedule')
        if path:
            plt.savefig(path)
            plt.close()
        else:
            plt.show()
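# A toy sketch (assumption, not library code) of the contract SchedulerMixin
# expects from subclasses: populate `self.lrs` and `self.lr_mult` in
# `on_train_begin`, then call `update_lr` with the running batch index.
class ConstantLRScheduler(SchedulerMixin):
    """Hypothetical scheduler that keeps the LR fixed for the whole run."""

    def __init__(self, lr=1e-3, priority=10):
        self.lr = lr
        self.priority = priority
        self.lrs = []
        self.lr_mult = 1.0

    def on_train_begin(self, trainer, epochs, lrs, lr_mult, **kwargs):
        self.lr_mult = lr_mult
        # One LR per iteration, mirroring how CosineLRScheduler sizes its schedule.
        self.lrs = [self.lr] * (epochs * len(trainer.dl_train))

    def on_batch_begin(self, trainer, i, sum_i, stats):
        self.update_lr(trainer, sum_i)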
class SawtoothScheduler(SchedulerMixin):
    """Adaptive scheduler that produces a sawtooth-shaped LR curve: the LR is
    nudged upward while the training loss keeps improving (and, more gently,
    for a few batches without improvement), then repeatedly scaled down once
    improvement has stalled for `patience` batches.
    """

    def __init__(self, add=1e-5, scale=0.6, patience=5, priority=10):
        self.add = add
        self.scale = scale
        self.patience = patience
        self.priority = priority

        # These are reset in `on_train_begin`, but types remain the same.
        self.lrs = []
        self.since_improve = 0
        self.recent_best = float('inf')
        self.lr_mult = 1.0

    def on_train_begin(self, trainer, epochs, lrs, lr_mult, **kwargs):
        """Reset the scheduler's state at the start of training and store the
        LR multiplier to use when updating the optimizer.
        """
        self.lrs.clear()
        self.since_improve = 0
        self.recent_best = float('inf')
        self.lr_mult = lr_mult
    def on_batch_begin(self, trainer, i, sum_i, stats):
        loss = stats.get('loss')
        if loss is None: return
        lr = max(p['lr'] for p in trainer.optim.param_groups)
        if loss < self.recent_best:
            # New best loss: keep pushing the LR up.
            self.recent_best = loss
            self.since_improve = 0
            lr += self.add
        elif loss >= self.recent_best and self.since_improve < self.patience:
            # No improvement yet, but still within patience: increase more gently.
            self.since_improve += 1
            lr += self.add / (self.since_improve + 1)
        else:
            # Patience exhausted: scale the LR back down.
            self.since_improve += 1
            lr *= self.scale
        # Record the LR being applied so SchedulerMixin.plot_lrs has data to plot.
        self.lrs.append(lr)
        update_optimizer(trainer.optim, lr, lr_mult=self.lr_mult)
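# Standalone sketch (not library code): replay the update rule above on a
# synthetic loss curve to visualize the sawtooth shape. No optimizer is
# involved; the constants mirror SawtoothScheduler's defaults. (`add_` avoids
# shadowing the `add` imported from operator.)
rng = np.random.default_rng(0)
fake_losses = np.linspace(1.0, 0.2, 200) + rng.normal(0, 0.05, 200)
lr, add_, scale, patience = 1e-3, 1e-5, 0.6, 5
best, since_improve, sim_lrs = float('inf'), 0, []
for loss in fake_losses:
    if loss < best:
        best, since_improve = loss, 0
        lr += add_
    elif since_improve < patience:
        since_improve += 1
        lr += add_ / (since_improve + 1)
    else:
        since_improve += 1
        lr *= scale
    sim_lrs.append(lr)
plt.plot(sim_lrs)
plt.title('Simulated Sawtooth LR Schedule')
plt.show()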
class Net(nn.Module):
    """Tiny two-group network used to sanity check the optimizer helpers."""

    def __init__(self, in_size, out_size):
        super().__init__()
        fc1 = nn.Linear(in_size, out_size)
        fc2 = nn.Linear(out_size, 1)
        self.groups = nn.ModuleList([fc1, fc2])

    def forward(self, x):
        for g in self.groups:
            x = torch.sigmoid(g(x))
        return x

# Build a two-group network and inspect the largest learning rate that
# variable_lr_optimizer assigns across parameter groups.
net = Net(10, 4)
opt = variable_lr_optimizer(net, [2e-3, 3e-3])
max(p['lr'] for p in opt.param_groups)
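# The schedulers above apply each new LR via update_optimizer(optim, lr,
# lr_mult=...). Exactly how `lr_mult` spreads the LR across parameter groups is
# an assumption here, so this cell just calls it on the optimizer from the
# previous cell and inspects the resulting group LRs.
update_optimizer(opt, 1e-3, lr_mult=0.5)
[p['lr'] for p in opt.param_groups]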
class CosineLRScheduler(SchedulerMixin):
    """Learning rate scheduler that makes updates each batch."""

    def __init__(self, warm=0.3, restarts=False, cycle_len=5, cycle_decay=0.0,
                 min_lr=None, verbose=False, priority=10):
        """
        Parameters
        ----------
        warm: float
            Fraction of the training run devoted to the increasing portion
            of the schedule. Only used when restarts=False. Default 0.3.
        restarts: bool
            Specifies whether to use restarts, i.e. a cyclical LR.
            True: Cosine annealing with restarts. Within one cycle, the LR
                starts high and gradually decreases; at the start of the
                next cycle it is immediately increased again.
            False: Cosine annealing over the whole run: the LR increases for
                the first `warm` fraction of training (30% by default),
                then decreases for the remainder.
        cycle_len: int
            Number of epochs contained in a single cycle. Only used
            when the scheduler uses restarts.
        cycle_decay: float
            Scalar that decays the learning rate at the end of each cycle.
            This is only used with restarts, since regular cosine annealing
            already decays the LR over time. The LRs in cycle i (0-indexed)
            are the cycle-0 LRs divided by (1 + cycle_decay * i):
            0.0 applies no decay, while 1.0 halves the LRs in cycle 1,
            divides them by 3 in cycle 2, etc.
        min_lr: float
            Minimum learning rate. If None is specified, it will be set
            to max_lr / 10.
        verbose: bool
            If True, log a message each time the LR is updated.
        """
        super().__init__()
        self.warm = warm
        self.cycle_len = cycle_len
        self.cycle_decay = cycle_decay
        self.restarts = restarts
        self.verbose = verbose
        self.min_lr = min_lr
        self.priority = priority

        # Set in `on_train_begin()`.
        self.lrs = None            # Iterable[float]
        self.batches_per_e = None  # int
        self.batches = None        # int
        self.max_lr = None         # float
        self.lr_mult = None        # float
    def on_train_begin(self, trainer, epochs, lrs, lr_mult, **kwargs):
        """Compute the learning rate schedule for the whole training run.

        Depending on `self.restarts`, this is either a single cosine
        annealing curve spanning all of training or a cyclical schedule
        with restarts. The result is stored in `self.lrs`, where
        `self.lrs[i]` is the LR to use at iteration i.
        """
        self.batches_per_e = len(trainer.dl_train)
        self.batches = epochs * self.batches_per_e
        self.max_lr = max(lrs) if isinstance(lrs, Iterable) else lrs
        self.lr_mult = lr_mult
        if not self.min_lr: self.min_lr = self.max_lr / 10

        # `cycle_len` is measured in epochs, so compare it against epochs
        # rather than total batches.
        if self.restarts and epochs < self.cycle_len:
            warnings.warn('Training will be less than 1 full cycle.')

        if self.restarts:
            self.lrs = self._cosine_restarts_schedule()
        else:
            self.lrs = self._cosine_schedule()

    def on_batch_begin(self, trainer, i, sum_i, stats):
        self.update_lr(trainer, sum_i)
    @staticmethod
    def _cosine_anneal(batches, lr1, lr2):
        """Helper function for _cosine_schedule().

        Parameters
        ----------
        batches: int
            Number of batches in segment.
        lr1: float
            Learning rate at start of segment.
        lr2: float
            Learning rate at end of segment.

        Returns
        -------
        np.array
        """
        i = np.arange(batches)
        return lr2 + (lr1 - lr2) * (1 + np.cos(np.pi * i / batches)) / 2
    def _cosine_schedule(self):
        """Cosine annealing scheduler. Computes learning rates for each
        iteration: an increasing segment covering the first `warm` fraction
        of training followed by a decreasing segment for the remainder.

        Returns
        -------
        np.array
        """
        seg1 = self._cosine_anneal(int(self.warm * self.batches),
                                   self.min_lr, self.max_lr)
        seg2 = self._cosine_anneal(int(np.ceil((1 - self.warm) * self.batches)),
                                   self.max_lr, self.min_lr)
        return np.concatenate((seg1, seg2))
    def _cosine_restarts_schedule(self):
        """Cosine annealing with restarts. Each cycle anneals from max_lr down
        to min_lr, and the LRs in cycle i are divided by (1 + cycle_decay * i).

        Returns
        -------
        np.array
        """
        cycles = int(np.ceil(self.batches / (self.cycle_len * self.batches_per_e)))
        cycle_batches = self.cycle_len * self.batches_per_e
        lrs = [self._cosine_anneal(cycle_batches, self.max_lr, self.min_lr)
               / (1 + self.cycle_decay * i) for i in range(cycles)]
        return np.concatenate(lrs)
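# Quick visual check: fill in the attributes that on_train_begin would normally
# derive from the trainer, then plot both schedule variants. The specific
# numbers here are arbitrary.
sched = CosineLRScheduler(warm=0.3)
sched.batches, sched.max_lr, sched.min_lr = 1000, 3e-3, 3e-4
sched.lrs = sched._cosine_schedule()
sched.plot_lrs()

restart_sched = CosineLRScheduler(restarts=True, cycle_len=5, cycle_decay=1.0)
restart_sched.batches_per_e, restart_sched.batches = 50, 1000
restart_sched.max_lr, restart_sched.min_lr = 3e-3, 3e-4
restart_sched.lrs = restart_sched._cosine_restarts_schedule()
restart_sched.plot_lrs()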