Bag of Tricks

Refinements and improvements to CNNs for image classification

From the paper: Bag of Tricks for Image Classification with Convolutional Neural Networks

By: Tong He, Zhi Zhang, Hang Zhang, Zhongyue Zhang, Junyuan Xie, Mu Li

Much of the recent progress made in image classification research can be credited to training procedure refinements, such as changes in data augmentations and optimization methods. In the literature, however, most refinements are either briefly mentioned as implementation details or only visible in source code. In this paper, we will examine a collection of such refinements and empirically evaluate their impact on the final model accuracy through ablation study. We will show that, by combining these refinements together, we are able to improve various CNN models significantly. For example, we raise ResNet-50's top-1 validation accuracy from 75.3% to 79.29% on ImageNet. We will also demonstrate that improvement on image classification accuracy leads to better transfer learning performance in other application domains such as object detection and semantic segmentation.
In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
In [2]:
#export
from exp.nb_09 import *

Setup

Data to DataBunch

In [3]:
bs = 64

path = datasets.untar_data(datasets.URLs.IMAGENETTE_160) # downloads and returns a path to folder
tfms = [make_rgb, ResizeFixed(128), to_byte_tensor, to_float_tensor] # transforms to be applied to images

il = ImageList.from_files(path, tfms=tfms) # ImageList from files
sd = SplitData.split_by_func(il, partial(grandparent_splitter, valid_name="val")) # split train/valid by grandparent folder
ll = label_by_func(sd, parent_labeler, proc_y=CategoryProcessor()) # label each image by its parent folder
data = ll.to_databunch(bs, c_in=3, c_out=10)
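
For reference, here's a minimal sketch of what grandparent_splitter and parent_labeler do, consistent with the earlier course notebooks (the _sketch names are illustrative, not the actual exported definitions):

In [ ]:
# sketch of the helpers used above: grandparent_splitter assigns a file to
# train/valid based on its grandparent folder name; parent_labeler labels
# each file by its parent folder name
def grandparent_splitter_sketch(fn, valid_name='valid', train_name='train'):
    gp = fn.parent.parent.name
    return True if gp == valid_name else False if gp == train_name else None

def parent_labeler_sketch(fn): return fn.parent.name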

We'll use callbacks to track average loss and accuracy, run on the GPU, and normalize each batch with Imagenette statistics:

In [4]:
callbacks = [partial(AvgStatsCallback, accuracy),
             CudaCallback,
             partial(BatchTransformXCallback, norm_imagenette)]

Model Arch

In [5]:
nfs = [64,64,128,256]

The first layer widths are derived from the input: a 3x3 kernel over c_in channels sees c_in*3*3 values per output position, and we take the previous power of two of that count:

$$ \mathrm{prev\_pow\_2}(x) = 2^{\lfloor \log_2 x \rfloor} $$

In [7]:
#export
def prev_pow_2(x): return 2**math.floor(math.log2(x))

For 1- and 3-channel inputs this gives:

In [22]:
for i in [1,3]:
    print(f'{i}:', prev_pow_2(i*3*3))
1: 8
3: 16

Now we'll define a function that automatically generates our model's layers following the geometry suggested in the BoT paper: a stem of three 3x3 convolutions whose first width comes from prev_pow_2, followed by the widths in nfs:

In [11]:
#export
def get_cnn_layers(data, nfs, layer, **kwargs):
    
    def f(ni, nf, stride=2): 
        return layer(ni, nf, ks=3, stride=stride, **kwargs)
    
    l1 = data.c_in          # input channels from the databunch
    l2 = prev_pow_2(l1*3*3) # first width: previous power of 2 of c_in*3*3
    
    layers = [f(l1,   l2,   stride=1), # stem: one stride-1 conv...
              f(l2,   l2*2, stride=2), # ...then two stride-2 convs,
              f(l2*2, l2*4, stride=2)] # doubling the width each time
    nfs = [l2*4] + nfs
    
    layers += [f(nfs[i], nfs[i+1]) for i in range(len(nfs)-1)]
    layers += [nn.AdaptiveAvgPool2d(1), Lambda(flatten), nn.Linear(nfs[-1], data.c_out)]
    
    return layers
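
As a quick sanity check (not in the original notebook), we can print the channel progression this produces for our 3-channel input and nfs = [64,64,128,256]:

In [ ]:
# channel progression for c_in=3: the stem doubles the width twice,
# then the remaining layers follow nfs
l2 = prev_pow_2(3*3*3)                # 16
chans = [3, l2, l2*2] + [l2*4] + nfs  # [3, 16, 32, 64, 64, 64, 128, 256]
print(' -> '.join(map(str, chans)))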
In [23]:
#export
def get_cnn_model(data, nfs, layer, **kwargs):
    return nn.Sequential(*get_cnn_layers(data, nfs, layer, **kwargs))

def get_learn_run(data, nfs, layer, lr, cbs=None, opt_func=None, uniform=False, **kwargs):
    model = get_cnn_model(data, nfs, layer, **kwargs)
    init_cnn(model, uniform=uniform)
    return get_runner(model, data, lr=lr, cbs=cbs, opt_func=opt_func)

Next we build the learning-rate schedule: a cosine warm-up from 0.1 to 0.3 over the first 30% of training, then a cosine anneal from 0.3 down to 0.05 over the remaining 70%:

In [24]:
sched = combine_scheds([0.3, 0.7], cos_1cycle_anneal(0.1, 0.3, 0.05))
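
To see its shape, we can sample the schedule at a few points; this assumes sched is callable on a training fraction in [0, 1), as in the earlier scheduling notebook:

In [ ]:
# sample the combined schedule at a few training fractions (illustrative)
for pos in [0.0, 0.15, 0.3, 0.6, 0.99]:
    print(f'{pos:.2f}: lr = {float(sched(pos)):.3f}')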
In [25]:
learn, run = get_learn_run(data, nfs, conv_layer, lr=0.2, cbs=callbacks+[partial(ParamScheduler, 'lr', sched)])
In [26]:
run.fit(1, learn)
train: [1.727458206549002, tensor(0.4080, device='cuda:0')]
valid: [1.3914296377388535, tensor(0.5299, device='cuda:0')]

A function that prints a summary of our model's layers and their output activation shapes would be very helpful.

We can do this by registering Hooks and sending a batch through the model to print what happens at every stage:

In [27]:
#export
def model_summary(run, learn, data, find_all=False):
    xb, yb = get_batch(data.valid_dl, run)
    device = next(learn.model.parameters()).device
    xb, yb = xb.to(device), yb.to(device)
    hf = lambda hook,mod,inp,outp: print(f'{mod}\nOutput:{outp.shape}\n')
    mods = find_mods(learn.model, is_lin_layer) if find_all else learn.model.children()
    with Hooks(mods, hf) as hook: learn.model(xb)
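
For comparison, here's a minimal sketch of the same idea using raw PyTorch forward hooks instead of the course's Hooks class:

In [ ]:
# register a forward hook on each top-level child, run one batch,
# then remove the hooks again
xb, yb = get_batch(data.valid_dl, run)
xb = xb.to(next(learn.model.parameters()).device)
handles = [m.register_forward_hook(
               lambda mod, inp, outp: print(f'{mod}\nOutput:{outp.shape}\n'))
           for m in learn.model.children()]
with torch.no_grad(): learn.model(xb)
for h in handles: h.remove()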
In [28]:
model_summary(run, learn, data)
Sequential(
  (0): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (1): GeneralRelu()
  (2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
Output:torch.Size([128, 16, 128, 128])

Sequential(
  (0): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
  (1): GeneralRelu()
  (2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
Output:torch.Size([128, 32, 64, 64])

Sequential(
  (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
  (1): GeneralRelu()
  (2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
Output:torch.Size([128, 64, 32, 32])

Sequential(
  (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
  (1): GeneralRelu()
  (2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
Output:torch.Size([128, 64, 16, 16])

Sequential(
  (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
  (1): GeneralRelu()
  (2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
Output:torch.Size([128, 64, 8, 8])

Sequential(
  (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
  (1): GeneralRelu()
  (2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
Output:torch.Size([128, 128, 4, 4])

Sequential(
  (0): Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
  (1): GeneralRelu()
  (2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
Output:torch.Size([128, 256, 2, 2])

AdaptiveAvgPool2d(output_size=1)
Output:torch.Size([128, 256, 1, 1])

Lambda()
Output:torch.Size([128, 256])

Linear(in_features=256, out_features=10, bias=True)
Output:torch.Size([128, 10])

Note that the batch dimension is 128 rather than bs=64 because the validation DataLoader uses twice the training batch size.

Training

In [29]:
%time run.fit(5, learn)
train: [1.2597042611086176, tensor(0.5824, device='cuda:0')]
valid: [1.4853317824442676, tensor(0.5177, device='cuda:0')]
train: [1.1744825012870947, tensor(0.6126, device='cuda:0')]
valid: [1.2985995969347133, tensor(0.5809, device='cuda:0')]
train: [0.8226048796566428, tensor(0.7270, device='cuda:0')]
valid: [1.2384917396496815, tensor(0.6094, device='cuda:0')]
train: [0.43220947858637393, tensor(0.8683, device='cuda:0')]
valid: [1.1581633160828027, tensor(0.6492, device='cuda:0')]
train: [0.18811001638158067, tensor(0.9600, device='cuda:0')]
valid: [1.1953391222133758, tensor(0.6530, device='cuda:0')]
Wall time: 1min 16s
In [ ]:
nb_auto_export()