In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

Recorder and Annealing

Parameter Scheduling

In [2]:

from exp.nb_04 import *

import torch.nn.functional as F
import torch.nn as nn
from functools import partial

Get Data

In [3]:
# Load the four arrays and wrap each split in its own Dataset.
x_train, y_train, x_valid, y_valid = get_data()
train_ds = Dataset(x_train, y_train)
valid_ds = Dataset(x_valid, y_valid)

nh, bs = 50, 512
# The class count is derived as the largest training label plus one.
c = y_train.max().item() + 1
loss_func = F.cross_entropy
In [4]:
# Unpack whatever get_dls returns positionally into DataBunch, together with the class count c.
data = DataBunch(*get_dls(train_ds, valid_ds, bs), c=c)

Create Learner and Test

In [5]:
def create_learner(model_func, loss_func, data):
    """Build a Learner from a model factory.

    ``model_func`` is called with ``data``; whatever it returns is unpacked as the
    leading positional arguments of ``Learner``, followed by ``loss_func`` and ``data``.
    """
    model_parts = model_func(data)
    return Learner(*model_parts, loss_func, data)
In [6]:
# Learner built from get_model with its default arguments, plus a runner whose only
# callback is AvgStatsCallback(accuracy).
learn = create_learner(get_model, loss_func, data)
run = Runner(cbs=[AvgStatsCallback(accuracy)])
In [7]:
run.fit(3, learn)
train: [0.667735703125, tensor(0.8055)]
valid: [0.33708525390625, tensor(0.8984)]
train: [0.30191533203125, tensor(0.9112)]
valid: [0.243979052734375, tensor(0.9282)]
train: [0.24016150390625, tensor(0.9305)]
valid: [0.202760400390625, tensor(0.9431)]

To change the learning rate, we need to override get_model's default lr argument before handing the function to create_learner. We do this with a partial:

In [8]:
# Pre-bind lr=0.3 on get_model so create_learner can still call it with just `data`.
learn = create_learner(partial(get_model, lr=0.3), loss_func, data)
In [9]:
run = Runner(cbs=[AvgStatsCallback(accuracy)])
# Three epochs, matching the three train/valid pairs logged below.
run.fit(3, learn)
train: [0.77007109375, tensor(0.7936)]
valid: [0.369775732421875, tensor(0.8933)]
train: [0.346526640625, tensor(0.9011)]
valid: [0.2892301513671875, tensor(0.9174)]
train: [0.29322056640625, tensor(0.9165)]
valid: [0.251543115234375, tensor(0.9279)]

So we can wrap that partial in a function:

In [10]:
def get_model_func(lr=0.5):
    """Return ``get_model`` with its ``lr`` keyword pre-bound to ``lr``."""
    return partial(get_model, lr=lr)

Recorder & Parameter Scheduling

Recent research has demonstrated the importance of varying certain parameters over the course of a training epoch.

Hyper-parameters like learning rate, momentum, and weight decay should be tuned and change according to the position in the training.

To do this we'll make two callbacks:

  • Recorder which will track (or record) the loss and any other parameter we want
  • ParamScheduler which will change any parameter that is registered in our optimizer param dict.

Let's start with the Recorder

In [11]:
class Callback():
    """Base class for callbacks; attributes not found on the callback are read from the runner."""

    _order = 0

    def set_runner(self, run):
        """Store the runner so that later attribute lookups can be delegated to it."""
        self.run = run

    def __getattr__(self, k):
        # Only reached when normal attribute lookup fails. If `run` itself is missing
        # (set_runner not called yet), delegating would recurse without end, so raise.
        if k == "run":
            raise AttributeError(k)
        return getattr(self.run, k)

    def name(self):
        """Return the class name, minus a trailing 'Callback', passed through camel2snake."""
        # NOTE(review): this may have been intended as a @property; confirm against the Runner.
        name = re.sub(r'Callback$', '', self.__class__.__name__)  # strip the 'Callback' suffix
        return camel2snake(name or "callback")
In [12]:
class Recorder(Callback):
    """Callback that keeps a per-step history of the loss and learning rate while training.

    NOTE(review): the bodies of after_step, plot_losses and plot_lr were missing from
    this cell; they are reconstructed here and should be confirmed against the Runner.
    """
    def begin_fit(self):
        # Start every fit with empty histories.
        self.losses = []
        self.lrs = []
    def after_step(self):
        if not self.in_train: return # don't record anything outside of training
        # Reads the 'lr' entry of the last optimizer param group.
        self.lrs.append(self.opt.param_groups[-1]['lr'])
        # presumably the runner exposes the current batch loss as a tensor named `loss` - TODO confirm
        self.losses.append(self.loss.detach().cpu())
    def plot_losses(self):
        """Plot the recorded losses in recording order."""
        plt.plot(self.losses)
    def plot_lr(self):
        """Plot the recorded learning rates in recording order."""
        plt.plot(self.lrs)
In [13]:
learn = create_learner(get_model_func(), loss_func, data)
# NOTE(review): other cells pass callback instances through `cbs`; confirm that
# `cb_funcs` accepts instances as well.
run = Runner(cb_funcs=[AvgStatsCallback(accuracy), Recorder()])
# Three epochs, matching the three train/valid pairs logged below.
run.fit(3, learn)
train: [0.67179203125, tensor(0.8049)]
valid: [0.3221376953125, tensor(0.9067)]
train: [0.29242376953125, tensor(0.9155)]
valid: [0.2788956787109375, tensor(0.9176)]
train: [0.23997259765625, tensor(0.9303)]
valid: [0.2074302734375, tensor(0.9439)]

Alright our recorder is working. We can plot the losses:

In [14]:

We can also plot the learning rate:

In [15]:

The learning rate here is constant for the entire duration of the epoch.

Let's fix that with a ParamScheduler callback:

In [16]:
class ParamScheduler(Callback):
    """Callback that overwrites one optimizer hyper-parameter before each training batch.

    ``pname`` is the key written in every optimizer param group; ``sched_func`` maps
    the ratio ``n_epochs / epochs`` to the value to write.
    """
    _order = 1

    def __init__(self, pname, sched_func):
        self.pname, self.sched_func = pname, sched_func

    def set_param(self):
        """Write the scheduled value for ``pname`` into every optimizer param group."""
        for group in self.opt.param_groups:
            group[self.pname] = self.sched_func(self.n_epochs / self.epochs)

    def begin_batch(self):
        # Leave the parameter untouched outside of training.
        if not self.in_train: return
        self.set_param()


Let's start easy with a linear scheduler.

We want a function that takes a place to start, an end, and the number of steps to take.

The start and end should be established before training starts and the function should then take in the current position and return the value.

We'll have to use partial for this.

In [17]:
def linsched(start, stop):
    """Build a linear schedule between ``start`` and ``stop``.

    Returns a callable of ``pos`` that evaluates ``start + (stop - start) * pos``.
    """
    def _interpolate(lo, hi, pos):
        span = hi - lo
        return lo + span * pos
    return partial(_interpolate, start, stop)
In [18]:
# End points of the example schedule.
start, end = 0.1, 0.5
ls = linsched(start, end)

A more pythonic and cleaner way of doing this would be to use a decorator:

In [19]:

def annealer(f):
    """Decorator turning ``f(start, end, pos)`` into a factory ``f(start, end)``.

    The factory returns ``partial(f, start, end)``, i.e. a callable that only needs ``pos``.
    """
    def _inner(start, end): return partial(f, start, end)
    return _inner

@annealer
def sched_lin(start, end, pos):
    """Linear schedule: ``start`` at ``pos == 0`` and ``end`` at ``pos == 1``."""
    return start + (end - start) * pos
In [20]:
# Same end points as the linsched example, now passed to the sched_lin factory.
start, end = 0.1, 0.5
ls = sched_lin(start, end)

Now we can use this decorator and define simple sched functions.

They take start and end args to initialize; the function they return is then called with a position in the epoch (a fraction between 0 and 1) and returns the parameter value at that position.

The first and most obvious is sched_no which does nothing.

In [21]:

import math

@annealer
def sched_no(start, end, pos):
    """Constant schedule: always returns ``start``; ``end`` and ``pos`` are ignored."""
    return start

@annealer
def sched_cos(start, end, pos):
    """Cosine schedule: ``start`` at ``pos == 0`` and ``end`` at ``pos == 1``."""
    # (1 + cos(pi * (1 - pos))) / 2 goes from 0 at pos == 0 to 1 at pos == 1.
    return start + (1 + math.cos(math.pi*(1-pos))) * (end-start) / 2

@annealer
def sched_exp(start, end, pos):
    """Exponential schedule: ``start`` at ``pos == 0`` and ``end`` at ``pos == 1``.

    ``start`` must be non-zero because the ratio ``end / start`` is computed.
    """
    return start * (end/start) ** pos
In [22]:
def cos_1cycle_anneal(start, high, end):
    """Return two cosine schedules: ``start`` to ``high``, then ``high`` to ``end``."""
    rising = sched_cos(start, high)
    falling = sched_cos(high, end)
    return [rising, falling]

Plotting the different schedulers gives a clear picture of what they are doing over the course of an epoch.

In [23]:
annealings = "NO LINEAR COS EXP".split()

iterations = torch.arange(0,100)
pos = torch.linspace(0.01, 1, 100)

funcs = [sched_no, sched_lin, sched_cos, sched_exp]

# One curve per scheduler, all running between the same two end values.
for fn, title in zip(funcs, annealings):
    f = fn(1e-04, 3e-2)
    plt.plot(iterations, [f(o) for o in pos], label=title)

plt.xlabel("Iteration")
plt.ylabel("Param Value")
# Without a legend the per-curve labels set above are never displayed.
plt.legend()
<matplotlib.legend.Legend at 0x257000b11f0>

The trickier part of scheduling is combining these functions to design how our parameters are scheduled. We don't necessarily want them to change only linearly, exponentially, or along a single cosine curve.

In [24]:
def combine_scheds(pcts, scheds):
    """Chain several schedules, each covering a fraction of the overall run.

    Args:
        pcts: non-negative fractions, one per schedule, summing to 1.
        scheds: callables of a position in [0, 1]; ``scheds[i]`` covers ``pcts[i]``.

    Returns:
        A function of the overall position ``pos``. It picks the schedule whose segment
        contains ``pos`` and calls it with ``pos`` rescaled to that segment.
    """
    pcts = list(pcts)
    # Compare with a tolerance: fractions such as [0.1] * 10 do not sum to exactly 1.0.
    assert abs(sum(pcts) - 1.) < 1e-6
    pcts = torch.tensor([0] + pcts)
    assert torch.all(pcts >= 0)
    pcts = torch.cumsum(pcts, 0)  # segment boundaries: 0, p0, p0 + p1, ...
    last = len(pcts) - 2  # index of the final segment
    def _inner(pos):
        idx = (pos >= pcts).nonzero().max()
        # At the very end pos also matches the closing boundary; keep it in the last
        # segment instead of indexing past the final schedule.
        idx = min(int(idx), last)
        actual_pos = (pos-pcts[idx]) / (pcts[idx+1]-pcts[idx])
        return scheds[idx](actual_pos)
    return _inner
In [25]:
# Cosine from 0.3 up to 0.6 over the first 40%, then cosine down to 0.2 over the rest.
warmup, cooldown = sched_cos(0.3, 0.6), sched_cos(0.6, 0.2)
sched = combine_scheds([0.4, 0.6], [warmup, cooldown])
In [26]:
# Evaluate the combined schedule at 100 positions and plot it against `iterations`.
pos = torch.linspace(0.01, 1, 100)
sched_values = [sched(o) for o in pos]
plt.plot(iterations, sched_values)
[<matplotlib.lines.Line2D at 0x257001bc9d0>]


In [27]:
# Factory: calling scheduler() builds a ParamScheduler that writes 'lr' from `sched`.
scheduler = partial(ParamScheduler, 'lr', sched)
In [28]:
# NOTE(review): Recorder() is passed as an instance through `cb_funcs`, while the other
# callbacks go through `cbs`; confirm this is what Runner expects.
learn = create_learner(get_model_func(0.3), loss_func, data)
run = Runner(cbs=[AvgStatsCallback(accuracy), scheduler()], cb_funcs=Recorder())
In [29]:
run.fit(1, learn)
train: [0.732454609375, tensor(0.7949)]
valid: [0.30394931640625, tensor(0.9126)]
In [30]:
In [31]:
In [32]:
# NOTE(review): this passes the .ipynb file straight to python. The "Converted ... to exp\"
# output below suggests the name of an export script is missing here - confirm.
!python 05_recorder_annealing.ipynb
Converted 05_recorder_annealing.ipynb to exp\