# Reload edited modules automatically before each cell runs (autoreload mode 2).
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
Forward and backward passes
With matrix multiplication out of the way we can now get to work constructing a basic neural network that can perform a forward and backward pass.
#export
from exp.nb_01a import *
Now grab the data and convert into tensors
# Load the training and validation splits (get_data presumably comes from the star import above).
x_train, y_train, x_valid, y_valid = get_data()
# Mean and std of the training inputs; the trailing expression displays them in the notebook.
train_mean, train_std = x_train.mean(), x_train.std(); train_mean, train_std
# Both splits are normalized with the *training* statistics.
x_train = normalize(x_train, train_mean, train_std)
x_valid = normalize(x_valid, train_mean, train_std)
get_stats(x_train)
# n, m: the two dimensions of x_train (presumably samples x features -- confirm).
n,m = x_train.shape
# c: largest target value plus one (presumably the number of classes -- confirm).
c = y_train.max()+1
n,m,c
nh = 50 # number of hidden units
# First layer parameters: (m, nh) standard-normal weights and a zero bias of length nh.
w1 = torch.randn(m,nh)
b1 = torch.zeros(nh)
# Second layer parameters: (nh, 1) standard-normal weights and a single zero bias.
w2 = torch.randn(nh,1)
b2 = torch.zeros(1)
def relu(x):
    """Return a shifted ReLU of ``x``: negatives clamped to 0, then 0.5 subtracted."""
    clamped = x.clamp_min(0.)
    return clamped - 0.5
# Re-draw w1, this time scaled by sqrt(2/m); this replaces the unscaled w1 created earlier.
# NOTE(review): looks like Kaiming/He-style initialization -- confirm intent. w2 is left unscaled.
w1 = torch.randn(m,nh)*math.sqrt(2./m )
w1.shape, w2.shape
m, nh
def model(xb):
    """Forward pass: linear layer -> relu -> linear layer.

    Uses the notebook-level parameters ``w1``, ``b1``, ``w2``, ``b2`` and
    returns the output of the second linear layer.
    """
    hidden = relu(lin(xb, w1, b1))
    return lin(hidden, w2, b2)
# Time the forward pass over the validation inputs, 10 loops per timing run.
%timeit -n 10 _=model(x_valid)
The basic model we've built is capable of making a forward pass - it takes an input, does some calculations on it and outputs a single value.
We now need some way to determine how wrong that prediction is from the correct answer and adjust the model's parameters accordingly.
Determining how wrong the prediction is from the correct answer is done with a loss function.
To start, we'll use the Mean Squared Error (MSE). It does not really make sense for our data, but it simplifies the gradient calculations in the adjustment ("learning") phase.
# Inspect the shape of the raw model output.
model(x_valid).shape
We'll reshape this with squeeze to remove that trailing 1.
# After dropping the last dimension, the output shape can be compared with the target shape.
model(x_valid).squeeze(-1).shape, y_valid.shape
def mse(output, target):
    """Mean squared error between ``output`` (last dimension squeezed away) and ``target``."""
    residual = output.squeeze(-1) - target
    return residual.pow(2).mean()
# Show the current tensor type of the targets.
y_train.type()
# Cast the targets to float before using them in mse against the model output.
y_train,y_valid = y_train.float(),y_valid.float()
preds = model(x_train)
mse(preds, y_train)
We're ready to use all of the components we've written so far and piece them together into a model.
The last piece we need to write is the backward pass. We'll do this manually for this first basic model and then we'll rely on Pytorch's autograd.
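We need to imagine our forward pass, including the loss, as simply a function composition:
$$ \text{loss} = \text{mse}(\text{Linear}(\text{Relu}(\text{Linear}(\text{input})))) $$
With this in mind we can write it as follows, where $g$ is the first linear layer, $u$ the ReLU, $f$ the second linear layer and $L$ the loss function:
$$ \text{loss} = L(f(u(g(x)))) $$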
Then to calculate the backward pass...
def mse_grad(inp, target):
    """Store the gradient of the MSE loss with respect to ``inp`` in ``inp.g``.

    The gradient is ``2 * (inp - target) / batch``, where ``batch`` is the
    first dimension of ``inp``, with a trailing unit dimension added back.
    Returns nothing; the result is attached to ``inp`` as an attribute.
    """
    batch = inp.shape[0]
    residual = inp.squeeze() - target
    inp.g = 2. * residual.unsqueeze(-1) / batch
def relu_grad(inp, out):
    """Store the gradient flowing back through the relu in ``inp.g``.

    The upstream gradient ``out.g`` passes through wherever ``inp`` was
    strictly positive and is zeroed elsewhere.
    """
    passthrough = (inp > 0).float()
    inp.g = passthrough * out.g
def lin_grad(inp, out, w, b):
    """Backward pass of a linear layer ``out = inp @ w + b``.

    Reads the upstream gradient from ``out.g`` and stores the gradients on
    the tensors themselves:

    * ``inp.g`` -- gradient with respect to the layer input
    * ``w.g``   -- gradient with respect to the weights (same shape as ``w``)
    * ``b.g``   -- gradient with respect to the bias

    Returns nothing.
    """
    inp.g = out.g @ w.t()
    # Sum over the batch of the outer products inp[k] (x) out.g[k]. Written as
    # a single matmul, this avoids materializing the (batch, n_in, n_out)
    # intermediate that the broadcast-multiply-then-sum form creates.
    w.g = inp.t() @ out.g
    b.g = out.g.sum(0)
def forward_and_backward(inp, target):
    """Run one forward pass and one manual backward pass, printing shapes.

    Uses the notebook-level parameters ``w1``, ``b1``, ``w2``, ``b2``. The
    loss is only printed, not returned; gradients are stored as ``.g``
    attributes on ``inp``, ``w1``, ``b1``, ``w2``, ``b2`` and on the
    intermediate activations.
    """
    # ---- forward pass ----
    print("Input: ", inp.shape)
    hidden = inp @ w1 + b1
    print("l1: ", hidden.shape)
    activated = relu(hidden)
    print("relu: ", activated.shape)
    out = activated @ w2 + b2
    print("out: ", out.shape)
    print("loss:", mse(out, target))

    # ---- backward pass: same layers, in reverse order ----
    # The model output plays the role of the "inp" of the loss.
    mse_grad(out, target)
    # The relu activations are the input of the second linear layer.
    lin_grad(activated, out, w2, b2)
    relu_grad(hidden, activated)
    lin_grad(inp, hidden, w1, b1)
# Run the manual forward/backward pass on the full training set; gradients end up in the .g attributes.
forward_and_backward(x_train, y_train)
The goal of backprop is to find the gradients of the loss function with respect to the weights and biases. This gives the model a "direction" in which to shift its parameters to learn a function which can predict `y` based on a given `x`.
So the gradient matrix should be the same shape as our weights as we'll be subtracting a fraction of them from the weights.
# The gradient must have the same shape as the weights it will be used to update.
assert w1.g.shape == w1.shape
Looks good.
To test the accuracy of our calculations let's clone the gradients and compare them to Pytorch's gradients.
First, we'll make a copy of them to compare with later:
# Snapshot the manually computed gradients so later runs that overwrite .g can be compared against them.
w1g = w1.g.clone()
w2g = w2.g.clone()
b1g = b1.g.clone()
b2g = b2.g.clone()
ig = x_train.g.clone()
Then we'll clone the weights and turn on autograd
# Independent copies of the input and parameters with autograd tracking switched on.
xt2 = x_train.clone().requires_grad_(True)
w12 = w1.clone().requires_grad_(True)
w22 = w2.clone().requires_grad_(True)
b12 = b1.clone().requires_grad_(True)
b22 = b2.clone().requires_grad_(True)
Run the same basic model again with the new weights that have Pytorch's gradients enabled:
def forward(inp, targ):
    """Forward pass through the autograd-enabled parameter copies; returns the MSE loss.

    Uses the notebook-level tensors ``w12``, ``b12``, ``w22``, ``b22``.
    """
    activated = relu(inp @ w12 + b12)
    return mse(activated @ w22 + b22, targ)
# Compute the loss with the tracked tensors, then let autograd populate their .grad fields.
loss = forward(xt2, y_train)
loss.backward()
And it looks like our manual version was successful.
# Compare each autograd gradient with the saved manually computed one.
test_near(w22.grad, w2g)
test_near(b22.grad, b2g)
test_near(w12.grad, w1g)
test_near(b12.grad, b1g)
test_near(xt2.grad, ig)
We'll now refactor the basic model from above by implementing each function as its own class.
Relu and MSE do not have any parameters so they have no need of __init__
class Relu():
    """Shifted ReLU layer that remembers its input and output for the backward pass."""

    def __call__(self, inp):
        """Clamp negatives to 0, subtract 0.5, and cache both input and output."""
        self.inp = inp
        activated = inp.clamp_min(0.) - .5
        self.out = activated
        return activated

    def backward(self):
        """Store in ``inp.g`` the upstream gradient ``out.g`` masked to positive inputs."""
        passthrough = (self.inp > 0).float()
        self.inp.g = passthrough * self.out.g
class Mse():
    """Mean-squared-error loss that caches its arguments for the backward pass."""

    def __call__(self, inp, targ):
        """Return the mean of the squared differences between squeezed ``inp`` and ``targ``."""
        self.inp = inp
        self.targ = targ
        residual = inp.squeeze() - targ
        self.out = residual.pow(2).mean()
        return self.out

    def backward(self):
        """Store ``2 * (inp - targ) / len(targ)`` (with a trailing unit dim) in ``inp.g``."""
        count = self.targ.shape[0]
        residual = self.inp.squeeze() - self.targ
        self.inp.g = 2. * residual.unsqueeze(-1) / count
class Lin():
    """Linear layer ``inp @ w + b`` that caches input and output for the backward pass."""

    def __init__(self, w, b):
        """Keep references to the weight ``w`` and bias ``b`` tensors."""
        self.w, self.b = w, b

    def __call__(self, inp):
        """Apply the affine map and remember ``inp`` and the result."""
        self.inp = inp
        self.out = inp @ self.w + self.b
        return self.out

    def backward(self):
        """Store gradients in ``inp.g``, ``w.g`` and ``b.g`` from the upstream ``out.g``."""
        upstream = self.out.g
        self.inp.g = upstream @ self.w.t()
        # Per-sample outer products built by broadcasting, then summed over
        # the batch dimension. For 2-D inputs this creates a 3-D intermediate.
        outer = self.inp.unsqueeze(-1) * upstream.unsqueeze(1)
        self.w.g = outer.sum(0)
        self.b.g = upstream.sum(0)
class Model():
    """Two-layer network (Lin -> Relu -> Lin) with an Mse loss, built on the notebook-level parameters."""

    def __init__(self):
        """Wire the layers to the global ``w1``, ``b1``, ``w2``, ``b2`` tensors."""
        self.layers = [Lin(w1, b1), Relu(), Lin(w2, b2)]
        self.loss = Mse()

    def __call__(self, x, targ):
        """Feed ``x`` through every layer in order and return the loss against ``targ``."""
        for layer in self.layers:
            x = layer(x)
        return self.loss(x, targ)

    def backward(self):
        """Back-propagate: loss first, then each layer from last to first."""
        self.loss.backward()
        for layer in self.layers[::-1]:
            layer.backward()
# Clear the gradients left by the earlier manual run so the checks below see fresh values.
w1.g,b1.g,w2.g,b2.g = [None]*4
model = Model()
# Time a single forward pass (including the loss) over the training set.
%time loss = model(x_train, y_train)
The backward pass is very slow relative to the forward. Why??
# Time a single backward pass.
%time model.backward()
# The class-based gradients should match the ones saved from the function-based version.
test_near(w2g, w2.g)
test_near(b2g, b2.g)
test_near(w1g, w1.g)
test_near(b1g, b1.g)
test_near(ig, x_train.g)
When refactoring we should be on the lookout for redundant code or patterns of code that can be condensed and then reused.
We'll start with a base class called `Module` that all of our layers will inherit from. It will set up the `__call__` method to call a `forward` pass that needs to be implemented by each subclass.
This will give us an insight into how PyTorch's `nn.Module` is structured.
class Module():
    """Base class for layers: caches call arguments and output, and dispatches backward.

    Subclasses implement ``forward(*args)`` and ``bwd(out, *args)``.
    Calling an instance runs ``forward`` and stores both the arguments and
    the result, so that ``backward()`` can hand them to ``bwd``.
    """

    def __call__(self, *args):
        """Run ``forward`` on ``args``, remembering the arguments and the output."""
        self.args = args
        self.out = self.forward(*args)
        return self.out

    def forward(self, *args):
        """Compute the layer output; must be overridden.

        Accepts ``*args`` so that calling an un-overridden layer reports
        "not implemented" rather than an unrelated arity ``TypeError``.

        Raises:
            NotImplementedError: always, in the base class.
        """
        raise NotImplementedError('not implemented')

    def bwd(self, out, *args):
        """Store gradients given the cached output and inputs; must be overridden.

        Raises:
            NotImplementedError: always, in the base class.
        """
        raise NotImplementedError('not implemented')

    def backward(self):
        """Call ``bwd`` with the output and arguments cached by the last forward call."""
        self.bwd(self.out, *self.args)
class Relu(Module):
    """Shifted ReLU expressed through the ``Module`` forward/bwd interface."""

    def forward(self, inp):
        """Clamp negatives to 0, then subtract 0.5."""
        clamped = inp.clamp_min(0.)
        return clamped - 0.5

    def bwd(self, out, inp):
        """Store in ``inp.g`` the upstream gradient ``out.g`` masked to positive inputs."""
        passthrough = (inp > 0).float()
        inp.g = passthrough * out.g
class Lin(Module):
    """Linear layer ``inp @ w + b`` expressed through the ``Module`` forward/bwd interface."""

    def __init__(self, w, b):
        """Keep references to the weight ``w`` and bias ``b`` tensors."""
        self.w = w
        self.b = b

    def forward(self, inp):
        """Apply the affine map to ``inp``."""
        return inp @ self.w + self.b

    def bwd(self, out, inp):
        """Store gradients in ``inp.g``, ``w.g`` and ``b.g`` from the upstream ``out.g``."""
        upstream = out.g
        inp.g = upstream @ self.w.t()
        # einsum contracts the shared batch index "b", i.e. it sums the
        # per-sample outer products without building them explicitly.
        # The original author notes that `inp.t() @ out.g` is faster still.
        self.w.g = torch.einsum("bi,bj->ij", inp, upstream)
        self.b.g = upstream.sum(0)
class Mse(Module):
    """Mean-squared-error loss expressed through the ``Module`` forward/bwd interface."""

    def forward(self, inp, target):
        """Return the mean of the squared differences between squeezed ``inp`` and ``target``."""
        residual = inp.squeeze() - target
        return residual.pow(2).mean()

    def bwd(self, out, inp, target):
        """Store ``2 * (inp - target) / len(target)`` (with a trailing unit dim) in ``inp.g``."""
        count = target.shape[0]
        residual = inp.squeeze() - target
        inp.g = 2. * residual.unsqueeze(-1) / count
This model is the same as before.
class Model():
    """Two-layer network (Lin -> Relu -> Lin) with an Mse loss, built on the notebook-level parameters."""

    def __init__(self):
        """Wire the layers to the global ``w1``, ``b1``, ``w2``, ``b2`` tensors."""
        self.layers = [Lin(w1, b1), Relu(), Lin(w2, b2)]
        self.loss = Mse()

    def __call__(self, x, targ):
        """Feed ``x`` through every layer in order and return the loss against ``targ``."""
        for layer in self.layers:
            x = layer(x)
        return self.loss(x, targ)

    def backward(self):
        """Back-propagate: loss first, then each layer from last to first."""
        self.loss.backward()
        for layer in self.layers[::-1]:
            layer.backward()
# Clear the previously stored gradients before re-running with the Module-based layers.
w1.g,b1.g,w2.g,b2.g = [None]*4
model = Model()
# Time a single forward pass (including the loss) over the training set.
%time loss = model(x_train, y_train)
The backward pass is much faster than above... WHY?
# Time a single backward pass of the Module-based model.
%time model.backward()
# The gradients should still match the ones saved from the function-based version.
test_near(w2g, w2.g)
test_near(b2g, b2.g)
test_near(w1g, w1.g)
test_near(b1g, b1.g)
test_near(ig, x_train.g)
We're ready to use PyTorch's `torch.nn.Module`.
This is what the docs say:
`Base class for all neural network modules.
Your models should also subclass this class.`
class Model(nn.Module):
    """Two-layer fully connected network built from PyTorch layers, returning the MSE loss.

    Args:
        n_in: number of input features.
        nh: number of hidden units.
        n_out: number of outputs of the final linear layer.
    """

    def __init__(self, n_in, nh, n_out):
        super().__init__()
        # nn.ModuleList registers every layer as a submodule, so the weights
        # show up in parameters(), state_dict(), .to(device) and so on.
        # A plain Python list would hide them from nn.Module.
        self.layers = nn.ModuleList(
            [nn.Linear(n_in, nh), nn.ReLU(), nn.Linear(nh, n_out)]
        )
        self.loss = mse

    def forward(self, x, target):
        """Run ``x`` through the layers and return the loss against ``target``.

        Implemented as ``forward`` rather than by overriding ``__call__``, so
        that ``model(x, target)`` goes through ``nn.Module``'s own call
        machinery (hooks included).
        """
        for layer in self.layers:
            x = layer(x)
        return self.loss(x.squeeze(), target)
# Build the nn.Module-based model with m inputs, nh hidden units and a single output.
model = Model(m,nh, 1)
%time loss = model(x_train, y_train)
# Gradients now come from autograd rather than from the hand-written backward methods.
%time loss.backward()
# Shell out to the notebook2script.py helper, passing this notebook's filename.
!python notebook2script.py 02_fully_connected.ipynb