%reload_ext autoreload
%autoreload 2
%matplotlib inline
An initialization technique for deep architectures: Layer-Sequential Unit-Variance (LSUV), from the paper [All You Need is a Good Init](https://arxiv.org/abs/1511.06422) (Mishkin & Matas, 2015).
From the abstract:
Layer-sequential unit-variance (LSUV) initialization – a simple method for weight initialization for deep net learning – is proposed. The method consists of the two steps. First, pre-initialize weights of each convolution or inner-product layer with orthonormal matrices. Second, proceed from the first to the final layer, normalizing the variance of the output of each layer to be equal to one.
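Before wiring this into our framework, here is a minimal sketch of those two steps in plain PyTorch, assuming a sequential model whose modules() come in forward order. Everything here (lsuv_sketch, the tolerance, the iteration cap) is a hypothetical illustration, not code from the paper:
import torch
import torch.nn as nn

def lsuv_sketch(model, xb, tol=1e-3, max_iters=10):
    "Hypothetical sketch of LSUV's two steps on a plain PyTorch model."
    layers = [m for m in model.modules() if isinstance(m, (nn.Conv2d, nn.Linear))]
    # Step 1: pre-initialize every weight matrix to be orthonormal.
    for m in layers: nn.init.orthogonal_(m.weight)
    # Step 2: from the first to the last layer, rescale the weights until
    # the layer's output has unit variance on one batch.
    for m in layers:
        stats = {}
        h = m.register_forward_hook(lambda mod, inp, out: stats.update(std=out.std().item()))
        for _ in range(max_iters):
            model(xb)                          # forward pass; the hook records the std
            if abs(stats['std'] - 1) < tol: break
            m.weight.data /= stats['std']      # nudge the output std toward 1
        h.remove()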
#export
from exp.nb_08 import *
x_train, y_train, x_valid, y_valid = get_data()
train_ds, valid_ds = Dataset(x_train, y_train), Dataset(x_valid, y_valid)
nh,bs = 50,512
c = y_train.max().item()+1
loss_func = F.cross_entropy
data = DataBunch(*get_dls(train_ds, valid_ds, bs), c)
len(data.train_ds)
mnist_view = view_tfm(1,28,28)
callbacks = [
Recorder,
partial(AvgStatsCallback, accuracy),
partial(BatchTransformXCallback, mnist_view),
CudaCallback
]
nfs = [8,16,32,64,64]
Let's refactor our conv_layer into a ConvLayer module with two new properties, bias and weight, which will make it easier to apply the LSUV algorithm:
class ConvLayer(nn.Module):
    def __init__(self, ni, nf, ks=3, stride=2, sub=0., **kwargs):
        super().__init__()
        self.conv = nn.Conv2d(ni, nf, ks, padding=ks//2, stride=stride, bias=True)
        self.relu = GeneralRelu(sub=sub, **kwargs)

    def forward(self, x):
        return self.relu(self.conv(x))

    # Expose the amount subtracted after the activation as `bias`, so LSUV
    # can shift this layer's output by writing to `m.bias`.
    @property
    def bias(self): return -self.relu.sub
    @bias.setter
    def bias(self, v): self.relu.sub = -v
    @property
    def weight(self): return self.conv.weight
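A quick sanity check of that indirection, with arbitrary numbers: writing to bias updates relu.sub in place, which is exactly what LSUV's m.bias -= h.mean will rely on.
cl = ConvLayer(1, 8)
cl.bias = 0.3                     # the setter stores -0.3 in relu.sub
assert cl.relu.sub == -0.3
cl.bias -= 0.1                    # read, shift, write back through the property
assert abs(cl.bias - 0.2) < 1e-6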
learn, run = get_learn_run(data, nfs, ConvLayer, 0.6, cbs=callbacks)
Here is our baseline without initializing the weights with LSUV:
run.fit(2, learn)
The LSUV process looks like this: we create a fresh model, grab one batch, pass it through, and adjust each layer's weights and bias in turn until its output has zero mean and unit variance.
learn, run = get_learn_run(data, nfs, ConvLayer, 0.4, cbs=callbacks)
A helper function grabs one batch from the dataloader and runs the runner's callbacks on it, so the batch gets the same transforms and device placement as in training:
#export
def get_batch(dl, run):
    run.xb, run.yb = next(iter(dl))
    for cb in run.cbs: cb.set_runner(run)
    run('begin_batch')
    return run.xb, run.yb
xb, yb = get_batch(learn.data.train_dl, run)
Next we walk the model and select only the linear/convolutional layers (not the ReLU or adaptive-pooling layers) with a small recursive function; sum(lists, []) concatenates the per-child results.
#export
def find_mods(m, func):
    if func(m): return [m]
    return sum([find_mods(o, func) for o in m.children()], [])

def is_lin_layer(l):
    # Only genuinely linear layers; ReLU and pooling are deliberately excluded.
    layers = (nn.Conv1d, nn.Conv2d, nn.Conv3d, nn.Linear)
    return isinstance(l, layers)
mods = find_mods(learn.model, lambda o: isinstance(o, ConvLayer))
mods
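For this predicate, the explicit recursion is equivalent to filtering PyTorch's built-in recursive iterator (a spot check, assuming no ConvLayer is nested inside another; find_mods stops descending at a match, modules() does not):
mods2 = [o for o in learn.model.modules() if isinstance(o, ConvLayer)]
assert mods2 == mods              # same modules in the same (pre-order) traversal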
This hook simply records the mean and standard deviation of a layer's output:
#export
def lsuv_append_stat(hook, mod, inp, outp):
    # Store the layer's output statistics on the hook itself.
    d = outp.data
    hook.mean, hook.std = d.mean().item(), d.std().item()
Hooking every ConvLayer shows how far the raw initialization is from zero mean and unit variance:
mdl = learn.model.cuda()
with Hooks(mods, lsuv_append_stat) as hooks:
    mdl(xb)
    for hook in hooks: print(hook.mean, hook.std)
#export
def lsuv_module(m, xb):
    h = Hook(m, lsuv_append_stat)

    # `mdl(xb) is not None` is just a way to run a forward pass (which updates
    # the hook's stats) inside the loop condition. First shift the bias until
    # the output mean is ~0, then rescale the weights until the std is ~1.
    while mdl(xb) is not None and abs(h.mean)    > 1e-3: m.bias -= h.mean
    while mdl(xb) is not None and abs(h.std - 1) > 1e-3: m.weight.data /= h.std

    h.remove()
    return h.mean, h.std
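Note the ordering: the std loop runs after the mean loop, and rescaling the weights also shifts the mean, so the final mean won't be exactly zero. A toy illustration with plain tensors (numbers are arbitrary):
t = torch.randn(1000)*2 + 0.5     # fake activations: std ~2, mean ~0.5
sub = t.mean()                    # the mean loop ends with mean(t - sub) ~ 0
t2 = t / t.std()                  # then the std loop rescales the activations...
print((t2 - sub).mean())          # ...and mean(t2 - sub) is no longer ~ 0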
The mean ends up slightly off zero for exactly that reason, but we get unit variance all the way through the network:
for m in mods: print(lsuv_module(m, xb))
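To double-check the final state end to end, we can hook the model once more; every layer should now report a mean near zero and a std near one (this just reuses the Hooks block from above):
with Hooks(mods, lsuv_append_stat) as hooks:
    mdl(xb)
    for hook in hooks: print(hook.mean, hook.std)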
And it works, improving accuracy over our baseline to 96%:
%time run.fit(2, learn)
nb_auto_export()