LeNet with No Batch Norm

The code below is inspired by the wonderful book Dive into Deep Learning. Its authors have created their own d2l package that wraps many of the complex implementations involved in training a model, which makes it easy to try many different approaches to test assumptions and theories.

Show Code
import torch
from torch import nn
from d2l import torch as d2l
c:\Users\user\Documents\GitHub\simpe-AI\d2l_env\lib\site-packages\torch\cuda\__init__.py:52: UserWarning: CUDA initialization: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx (Triggered internally at  ..\c10\cuda\CUDAFunctions.cpp:100.)
  return torch._C._cuda_getDeviceCount() > 0
Show Code
def init_cnn(module):  # @save
    """Initialize weights for CNNs.

    Applies Xavier (Glorot) uniform initialization to the weight tensor of
    any ``nn.Linear`` or ``nn.Conv2d`` module; every other module type is
    left untouched. Intended to be passed to ``nn.Module.apply`` (or, as
    below, to ``apply_init``), which visits each submodule in turn.

    Args:
        module: a submodule visited during ``model.apply(init_cnn)``.
    """
    # isinstance is the idiomatic type check (also matches subclasses),
    # unlike the original exact `type(...) ==` comparison.
    if isinstance(module, (nn.Linear, nn.Conv2d)):
        nn.init.xavier_uniform_(module.weight)


class LeNet(d2l.Classifier):  # @save
    """The LeNet-5 model.

    Two convolution/pooling stages followed by three fully connected
    layers, with sigmoid activations throughout, as in the classic
    LeNet-5 architecture. Expects 1-channel 28x28 inputs (padding=2 on
    the first conv keeps the spatial size at 28x28).
    """

    def __init__(self, lr=0.1, num_classes=10):
        super().__init__()
        self.save_hyperparameters()
        feature_extractor = [
            nn.Conv2d(in_channels=1, out_channels=6,
                      kernel_size=5, padding=2),
            nn.Sigmoid(),
            nn.AvgPool2d(kernel_size=2, stride=2),
            nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5),
            nn.Sigmoid(),
            nn.AvgPool2d(kernel_size=2, stride=2),
        ]
        classifier_head = [
            nn.Flatten(),
            nn.Linear(16 * 5 * 5, 120),  # 16 x 5 x 5 after the two pools
            nn.Sigmoid(),
            nn.Linear(120, 84),
            nn.Sigmoid(),
            nn.Linear(84, num_classes),
        ]
        self.net = nn.Sequential(*feature_extractor, *classifier_head)
Show Code
@d2l.add_to_class(d2l.Classifier)  # @save
def layer_summary(self, X_shape):
    """Feed a random tensor of shape X_shape through self.net, printing
    each layer's class name and the shape of its output."""
    activation = torch.randn(*X_shape)
    for module in self.net:
        activation = module(activation)
        print(module.__class__.__name__, 'output shape:\t', activation.shape)


# Instantiate the plain LeNet and print the activation shape after each
# layer for a single 1-channel 28x28 input (batch size 1).
model = LeNet()
model.layer_summary((1, 1, 28, 28))
Conv2d output shape:     torch.Size([1, 6, 28, 28])
Sigmoid output shape:    torch.Size([1, 6, 28, 28])
AvgPool2d output shape:  torch.Size([1, 6, 14, 14])
Conv2d output shape:     torch.Size([1, 16, 10, 10])
Sigmoid output shape:    torch.Size([1, 16, 10, 10])
AvgPool2d output shape:  torch.Size([1, 16, 5, 5])
Flatten output shape:    torch.Size([1, 400])
Linear output shape:     torch.Size([1, 120])
Sigmoid output shape:    torch.Size([1, 120])
Linear output shape:     torch.Size([1, 84])
Sigmoid output shape:    torch.Size([1, 84])
Linear output shape:     torch.Size([1, 10])
Show Code
# Train plain LeNet on Fashion-MNIST for 10 epochs.
# NOTE(review): num_gpus=1 is requested, but the CUDA warning above shows no
# NVIDIA driver is available; the Trainer source printed later clamps with
# min(num_gpus, d2l.num_gpus()), so this falls back to CPU.
trainer = d2l.Trainer(max_epochs=10, num_gpus=1)
data = d2l.FashionMNIST(batch_size=128)
data.num_workers = 0  # single-process data loading
model = LeNet(lr=0.1)
# apply_init runs a forward pass on one real batch, then applies init_cnn
# (Xavier init) to the conv/linear layers.
model.apply_init([next(iter(data.get_dataloader(True)))[0]], init_cnn)
trainer.fit(model, data)

Retrain the model, this time with Batch Normalization layers added

Show Code
class BNLeNet(d2l.Classifier):
    """LeNet-5 with a BatchNorm layer inserted before every activation."""

    def __init__(self, lr=0.1, num_classes=10):
        super().__init__()
        self.save_hyperparameters()
        layers = []
        # Convolutional stages: conv -> batch norm -> sigmoid -> avg pool.
        layers += [
            nn.Conv2d(1, 6, kernel_size=5, padding=2),
            nn.BatchNorm2d(6),
            nn.Sigmoid(),
            nn.AvgPool2d(kernel_size=2, stride=2),
        ]
        layers += [
            nn.Conv2d(6, 16, kernel_size=5),
            nn.BatchNorm2d(16),
            nn.Sigmoid(),
            nn.AvgPool2d(kernel_size=2, stride=2),
        ]
        # Dense stages: linear -> batch norm -> sigmoid, then the head.
        layers += [
            nn.Flatten(),
            nn.Linear(16 * 5 * 5, 120),
            nn.BatchNorm1d(120),
            nn.Sigmoid(),
            nn.Linear(120, 84),
            nn.BatchNorm1d(84),
            nn.Sigmoid(),
            nn.Linear(84, num_classes),
        ]
        self.net = nn.Sequential(*layers)
        
Show Code
# NOTE(review): this re-registers the same layer_summary already defined
# earlier in the file — the body is identical, so the redefinition is
# redundant but harmless.
@d2l.add_to_class(d2l.Classifier)  # @save
def layer_summary(self, X_shape):
    """Feed a random tensor of shape X_shape through self.net, printing
    each layer's class name and the shape of its output."""
    X = torch.randn(*X_shape)
    for layer in self.net:
        X = layer(X)
        print(layer.__class__.__name__, 'output shape:\t', X.shape)


# Summarize BNLeNet with a full batch of 128 rather than a single sample.
# NOTE(review): presumably because the BatchNorm layers are in training
# mode here and compute batch statistics, so a batch-sized dummy input is
# used — confirm that a batch of 1 would fail for BatchNorm1d.
model2 = BNLeNet()
model2.layer_summary((128, 1, 28, 28))
Conv2d output shape:     torch.Size([128, 6, 28, 28])
BatchNorm2d output shape:    torch.Size([128, 6, 28, 28])
Sigmoid output shape:    torch.Size([128, 6, 28, 28])
AvgPool2d output shape:  torch.Size([128, 6, 14, 14])
Conv2d output shape:     torch.Size([128, 16, 10, 10])
BatchNorm2d output shape:    torch.Size([128, 16, 10, 10])
Sigmoid output shape:    torch.Size([128, 16, 10, 10])
AvgPool2d output shape:  torch.Size([128, 16, 5, 5])
Flatten output shape:    torch.Size([128, 400])
Linear output shape:     torch.Size([128, 120])
BatchNorm1d output shape:    torch.Size([128, 120])
Sigmoid output shape:    torch.Size([128, 120])
Linear output shape:     torch.Size([128, 84])
BatchNorm1d output shape:    torch.Size([128, 84])
Sigmoid output shape:    torch.Size([128, 84])
Linear output shape:     torch.Size([128, 10])
Show Code
# Train the BatchNorm variant with the same trainer/data settings as the
# plain LeNet run above, so the two results are directly comparable.
trainer = d2l.Trainer(max_epochs=10, num_gpus=1)
data = d2l.FashionMNIST(batch_size=128)
data.num_workers = 0  # single-process data loading
model2 = BNLeNet(lr=0.1)

# Pass a full batch instead of a single sample
batch = next(iter(data.get_dataloader(True)))[0]  # shape: [128, 1, 28, 28]
model2.apply_init([batch], init_cnn)
trainer.fit(model2, data)

Notice the large improvement in accuracy and loss achieved by introducing BatchNorm layers, compared to the first model, which was trained without batch normalization.

Show Code
import inspect

# Print the d2l.Trainer implementation for reference; its source is
# reproduced in the output below.
print(inspect.getsource(d2l.Trainer))
class Trainer(d2l.HyperParameters):
    """The base class for training models with data.

    Defined in :numref:`subsec_oo-design-models`"""
    def __init__(self, max_epochs, num_gpus=0, gradient_clip_val=0):
        self.save_hyperparameters()
        assert num_gpus == 0, 'No GPU support yet'

    def prepare_data(self, data):
        self.train_dataloader = data.train_dataloader()
        self.val_dataloader = data.val_dataloader()
        self.num_train_batches = len(self.train_dataloader)
        self.num_val_batches = (len(self.val_dataloader)
                                if self.val_dataloader is not None else 0)

    def prepare_model(self, model):
        model.trainer = self
        model.board.xlim = [0, self.max_epochs]
        self.model = model

    def fit(self, model, data):
        self.prepare_data(data)
        self.prepare_model(model)
        self.optim = model.configure_optimizers()
        self.epoch = 0
        self.train_batch_idx = 0
        self.val_batch_idx = 0
        for self.epoch in range(self.max_epochs):
            self.fit_epoch()

    def fit_epoch(self):
        raise NotImplementedError

    def prepare_batch(self, batch):
        """Defined in :numref:`sec_linear_scratch`"""
        return batch

    def fit_epoch(self):
        """Defined in :numref:`sec_linear_scratch`"""
        self.model.train()
        for batch in self.train_dataloader:
            loss = self.model.training_step(self.prepare_batch(batch))
            self.optim.zero_grad()
            with torch.no_grad():
                loss.backward()
                if self.gradient_clip_val > 0:  # To be discussed later
                    self.clip_gradients(self.gradient_clip_val, self.model)
                self.optim.step()
            self.train_batch_idx += 1
        if self.val_dataloader is None:
            return
        self.model.eval()
        for batch in self.val_dataloader:
            with torch.no_grad():
                self.model.validation_step(self.prepare_batch(batch))
            self.val_batch_idx += 1

    def __init__(self, max_epochs, num_gpus=0, gradient_clip_val=0):
        """Defined in :numref:`sec_use_gpu`"""
        self.save_hyperparameters()
        self.gpus = [d2l.gpu(i) for i in range(min(num_gpus, d2l.num_gpus()))]
    

    def prepare_batch(self, batch):
        """Defined in :numref:`sec_use_gpu`"""
        if self.gpus:
            batch = [d2l.to(a, self.gpus[0]) for a in batch]
        return batch
    

    def prepare_model(self, model):
        """Defined in :numref:`sec_use_gpu`"""
        model.trainer = self
        model.board.xlim = [0, self.max_epochs]
        if self.gpus:
            model.to(self.gpus[0])
        self.model = model

    def clip_gradients(self, grad_clip_val, model):
        """Defined in :numref:`sec_rnn-scratch`"""
        params = [p for p in model.parameters() if p.requires_grad]
        norm = torch.sqrt(sum(torch.sum((p.grad ** 2)) for p in params))
        if norm > grad_clip_val:
            for param in params:
                param.grad[:] *= grad_clip_val / norm

Comments