---
title: Title
keywords: fastai
sidebar: home_sidebar
---
{% raw %}
{% endraw %} {% raw %}
%load_ext autoreload
%autoreload 2
%matplotlib inline
{% endraw %} {% raw %}
import boto3
from collections.abc import Iterable
import matplotlib.pyplot as plt
import numpy as np
from operator import lt, gt, add, sub
import os
import pandas as pd
import requests
from tabulate import tabulate
import torch
import torch.nn as nn
import warnings

from accio.s3tool import S3tool
from htools import auto_repr, valuecheck, save
from incendio.utils import DEVICE
from incendio.optimizers import variable_lr_optimizer, update_optimizer
{% endraw %} {% raw %}
@auto_repr
class TorchCallback:
    """Base class for training callbacks. Every hook is a no-op by default,
    so subclasses only override the hooks they need.
    """

    def on_train_begin(self, trainer, epochs, lrs, lr_mult, **kwargs):
        pass

    def on_train_end(self, trainer, epoch, stats, val_stats):
        pass

    def on_epoch_begin(self, trainer, epoch, stats, val_stats):
        pass

    def on_epoch_end(self, trainer, epoch, stats, val_stats):
        pass

    def on_batch_begin(self, trainer, i, sum_i, stats):
        pass

    def on_batch_end(self, trainer, i, sum_i, stats):
        pass
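{% endraw %}

To illustrate the hook interface, here is a minimal sketch of a custom callback. The `EpochLogger` class and the assumption that `val_stats` is a dict of metric names to values are illustrative only; a subclass simply overrides the hooks it cares about.

{% raw %}
class EpochLogger(TorchCallback):
    """Hypothetical callback that logs validation loss after each epoch."""

    def on_epoch_end(self, trainer, epoch, stats, val_stats):
        # Assumes `val_stats` maps metric names to values.
        val_loss = val_stats.get('loss')
        if val_loss is not None:
            trainer.logger.info(f'Epoch {epoch}: val loss {val_loss:.4f}')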
{% endraw %} {% raw %}
class SchedulerMixin(TorchCallback):

    verbose = False

    def on_train_end(self, trainer, *args, **kwargs):
        self.plot_lrs(os.path.join(trainer.out_dir, 'lrs.png'))

    def update_lr(self, trainer, n):
        try:
            lr = self.lrs[n]
        except IndexError as e:
            return

        update_optimizer(trainer.optim, lr, lr_mult=self.lr_mult)
        if self.verbose:
            trainer.logger.info(f'Set learning rate to {lr:.4f}.')

    def plot_lrs(self, path=None):
        """Display learning rate by iteration.

        Note: If the plot is not as smooth as expected, this likely
        means that there are very few iterations per epoch
        (i.e. the batch size is very large, at least in relative terms).
        """
        plt.plot(self.lrs)
        plt.xlabel('Iteration')
        plt.ylabel('Learning Rate')
        plt.title('Learning Rate Schedule')
        if path:
            plt.savefig(path)
            plt.close()
        else:
            plt.show()
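{% endraw %}

As a quick sanity check of the mixin on its own, the sketch below fills in the two attributes a subclass is expected to provide (`lrs` and `lr_mult`) and calls `plot_lrs` directly. The `DummyScheduler` class and its linear schedule are assumptions for illustration only.

{% raw %}
class DummyScheduler(SchedulerMixin):
    """Hypothetical scheduler that just holds a precomputed schedule."""

    def __init__(self, lrs, lr_mult=1.0):
        self.lrs = lrs
        self.lr_mult = lr_mult

sched = DummyScheduler(list(np.linspace(1e-3, 1e-4, 100)))
sched.plot_lrs()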
{% endraw %} {% raw %}
class SawtoothScheduler(SchedulerMixin):

    def __init__(self, add=1e-5, scale=0.6, patience=5, priority=10):
        """
        Parameters
        ----------
        add: float
            Amount added to the learning rate after a batch where the loss
            reaches a new best. Smaller increments are added while fewer
            than `patience` batches have passed without improvement.
        scale: float
            Multiplier applied to the learning rate once `patience` batches
            pass without a new best loss.
        patience: int
            Number of batches without improvement to tolerate before
            scaling the learning rate down.
        priority: int
            Callback priority.
        """
        self.add = add
        self.scale = scale
        self.patience = patience
        self.priority = priority
        
        # These are reset in `on_train_begin`, but types remain the same.
        self.lrs = []
        self.since_improve = 0
        self.recent_best = float('inf')
        self.lr_mult = 1.0

    def on_train_begin(self, trainer, epochs, lrs, lr_mult, **kwargs):
        """Reset scheduler state at the start of training: clear the list
        of recorded learning rates, the count of batches since the last
        improvement, and the best loss seen so far, and store the LR
        multiplier for later optimizer updates.
        """
        self.lrs.clear()
        self.since_improve = 0
        self.recent_best = float('inf')
        self.lr_mult = lr_mult

    def on_batch_begin(self, trainer, i, sum_i, stats):
        loss = stats.get('loss')
        if loss is None: return

        lr = max(p['lr'] for p in trainer.optim.param_groups)
        if loss < self.recent_best:
            # New best loss: bump the LR up by the full increment.
            self.recent_best = loss
            self.since_improve = 0
            lr += self.add
        elif loss >= self.recent_best and self.since_improve < self.patience:
            # No improvement yet, but still within patience: add a
            # progressively smaller increment.
            self.since_improve += 1
            lr += self.add / (self.since_improve+1)
        else:
            # Patience exceeded: scale the LR down until the loss improves.
            self.since_improve += 1
            lr *= self.scale
        # Record the LR so the mixin can plot the schedule after training.
        self.lrs.append(lr)
        update_optimizer(trainer.optim, lr, lr_mult=self.lr_mult)
{% endraw %} {% raw %}
class Net(nn.Module):
    
    def __init__(self, in_size, out_size):
        super().__init__()
        fc1 = nn.Linear(in_size, out_size)
        fc2 = nn.Linear(out_size, 1)
        self.groups = nn.ModuleList([fc1, fc2])
        
    def forward(self, x):
        for g in self.groups:
            x = torch.sigmoid(g(x))
        return x

net = Net(10, 4)
opt = variable_lr_optimizer(net, [2e-3, 3e-3])
max(p['lr'] for p in opt.param_groups)
0.003
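{% endraw %}

The `net` and `opt` from the cell above are enough to watch the sawtooth rule in action without a full training loop. This is a minimal sketch: the `SimpleNamespace` stand-in for the trainer and the synthetic loss curve are assumptions for illustration, not part of the library.

{% raw %}
from types import SimpleNamespace

# Hypothetical trainer stand-in: the scheduler only touches `optim`.
trainer = SimpleNamespace(optim=opt)

sched = SawtoothScheduler(add=1e-4, scale=0.6, patience=5)
sched.on_train_begin(trainer, epochs=1, lrs=3e-3, lr_mult=1.0)

# Synthetic loss curve: noisy but generally decreasing.
losses = 1 / np.sqrt(np.arange(1, 201)) + np.random.rand(200) * 0.05
for i, loss in enumerate(losses):
    sched.on_batch_begin(trainer, i, i, {'loss': loss})
sched.plot_lrs()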
{% endraw %} {% raw %}
class CosineLRScheduler(SchedulerMixin):
    """Learning rate scheduler that makes updates each batch.
    """

    def __init__(self, warm=0.3, restarts=False, cycle_len=5, cycle_decay=0.0,
                 min_lr=None, verbose=False, priority=10):
        """
        Parameters
        ----------
        warm: float
            Percent of training run (or cycle length) devoted to the increasing
            portion of the schedule. Default 0.3.
        restarts: bool
            Specifies whether to use restarts, i.e. use a cyclical LR.
            True: Version of cosine annealing with restarts. In one
                  cycle, LR starts high and gradually decreases.
                  At the start of the next cycle, it is
                  immediately increased again.
            False: Version of cosine annealing where LR increases
                   for the first `warm` fraction of training (30% by
                   default), then decreases for the remainder.
        cycle_len: int
            Number of epochs contained in a single cycle. Only used
            when scheduler uses restarts.
        cycle_decay: float
            Scalar used to decay the learning rate at the end of each cycle.
            Cycle i (zero-indexed) uses the base LRs divided by
            (1 + cycle_decay * i), so 0.0 (the default) applies no decay,
            while e.g. 0.5 gives cycle 2 LRs = cycle 1 LRs / 1.5,
            cycle 3 LRs = cycle 1 LRs / 2, etc.
            This is only used with restarts, since the regular cosine
            annealing already decays the LR over time.
        min_lr: float
            Minimum learning rate. If None is specified, it will be set
            to max_lr / 10.
        verbose: bool
            If True, log a message each time the learning rate is updated.
        priority: int
            Callback priority.
        """
        super().__init__()
        self.warm = warm
        self.cycle_len = cycle_len
        self.cycle_decay = cycle_decay
        self.restarts = restarts
        self.verbose = verbose
        self.min_lr = min_lr
        self.priority = priority

        # Set in `on_train_begin()`.
        self.lrs = None             # Iterable[float]
        self.batches_per_e = None   # int
        self.batches = None         # int
        self.max_lr = None          # float
        self.lr_mult = None         # float

    def on_train_begin(self, trainer, epochs, lrs, lr_mult, **kwargs):
        """Compute the full learning rate schedule before training starts.

        Parameters
        ----------
        trainer: Trainer
            Used to determine the number of batches per epoch.
        epochs: int
            Number of epochs the model will train for.
        lrs: float or Iterable[float]
            Learning rate(s) passed to the trainer; the maximum is used
            as the peak of the schedule.
        lr_mult: float
            Multiplier passed through to the optimizer updates.

        Sets `self.lrs` to an np.array where `self.lrs[i]` is the LR to
        use at iteration i. Which schedule is used depends on
        `self.restarts`.
        """
        self.batches_per_e = len(trainer.dl_train)
        self.batches = epochs * self.batches_per_e
        self.max_lr = max(lrs) if isinstance(lrs, Iterable) else lrs
        self.lr_mult = lr_mult
        if not self.min_lr: self.min_lr = self.max_lr / 10

        # `cycle_len` is measured in epochs, so compare in batches.
        if self.restarts and self.batches < self.cycle_len * self.batches_per_e:
            warnings.warn('Training will be less than 1 full cycle.')

        if self.restarts:
            self.lrs = self._cosine_restarts_schedule()
        else:
            self.lrs = self._cosine_schedule()

    def on_batch_begin(self, trainer, i, sum_i, stats):
        self.update_lr(trainer, sum_i)

    @staticmethod
    def _cosine_anneal(batches, lr1, lr2):
        """Helper function for _cosine_schedule().

        Parameters
        ----------
        batches: int
            Number of batches in segment.
        lr1: float
            Learning rate at start of segment.
        lr2: float
            Learning rate at end of segment.

        Returns
        -------
        np.array
        """
        i = np.arange(batches)
        return lr2 + (lr1 - lr2)*(1 + np.cos(np.pi * i/batches))/2

    def _cosine_schedule(self):
        """Cosine annealing scheduler. Computes learning rates for each
        iteration.

        Returns
        -------
        np.array
        """
        seg1 = self._cosine_anneal(int(self.warm * self.batches),
                                   self.min_lr, self.max_lr)
        seg2 = self._cosine_anneal(int(np.ceil((1 - self.warm) * self.batches)),
                                   self.max_lr, self.min_lr)
        return np.concatenate((seg1, seg2))

    def _cosine_restarts_schedule(self):
        """Cosine annealing with restarts."""
        cycles = int(np.ceil(self.batches / (self.cycle_len * self.batches_per_e)))
        cycle_batches = self.cycle_len * self.batches_per_e
        lrs = [self._cosine_anneal(cycle_batches, self.max_lr, self.min_lr)
               / (1 + self.cycle_decay * i) for i in range(cycles)]
        return np.concatenate(lrs)
{% endraw %}
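Since the schedule is computed up front, it is easy to inspect before training. The sketch below fills in the attributes that `on_train_begin` would normally derive from the trainer (the batch counts and learning rates here are arbitrary assumptions) and plots both variants of the schedule.

{% raw %}
# Hypothetical setup mirroring 10 epochs at 100 batches per epoch
# with a peak LR of 3e-3 and a floor of 3e-4.
sched = CosineLRScheduler(warm=0.3, restarts=False)
sched.batches_per_e, sched.batches = 100, 1000
sched.max_lr, sched.min_lr = 3e-3, 3e-4
sched.lrs = sched._cosine_schedule()
sched.plot_lrs()

# Same settings, but with restarts every 2 epochs and per-cycle decay.
sched_r = CosineLRScheduler(restarts=True, cycle_len=2, cycle_decay=0.5)
sched_r.batches_per_e, sched_r.batches = 100, 1000
sched_r.max_lr, sched_r.min_lr = 3e-3, 3e-4
sched_r.lrs = sched_r._cosine_restarts_schedule()
sched_r.plot_lrs()
{% endraw %}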