MNIST exercise (handwritten digit recognition tutorial)¶

Goal: An introduction to convolution, filters, and feature maps

Exercise:

Run the notebook and observe the images of filter weights and activations (at the end).

Try changing the filter size for the first convolution layer to something larger (like 9x9 or 16x16). How does that change the images of filter weights and activations?


Question to consider: for 10 digits, what is the minimum number of filters needed?
In [ ]:
# ----------- IMPORT STATEMENTS ---------------
import argparse
import torch
import torch.nn.functional as F
from torchvision import datasets, transforms
import os
import sys
import numpy as np
import time

#---------------------------------------------
print('import done')
In [ ]:
# -----------------------------------
#Parameters for training
# -----------------------------------
num_worker2use = 4     #for parallel reading/prefetching of data (for bigger data)
batch_size     = 256  
max_numtrain   = 1024       #for this exercise, train on a limited number of inputs to save time
max_numtest    = batch_size # and test on a limited number of inputs
epochs         = 10
lrate          = 0.01
numfilt        = 16   #Try 8 or 24? or a minimal number like 2?

# --------------------------------------------------------------
# NOTE: a 3x3 kernel leaves 12x12 after max pooling, so use 12 for reduced_size
#       a 16x16 kernel leaves 6x6
#       a 9x9 kernel leaves 9x9
# --------------------------------------------------------------
kernel_size2use= 3   #Try 9 or even 16,
reduced_size   = 12    # also, see the note below in fwd method for 'MyNet' class
# --------------------------------------------------------------
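# a minimal sketch (added for illustration, not part of the original exercise)
# of where the sizes in the note above come from: the conv output is
# 28 - kernel_size + 1 (stride 1, no padding), and F.max_pool2d with kernel 3
# and stride 2 then gives floor((n - 3)/2) + 1
def post_pool_size(kernel_size, img_size=28, pool_kernel=3, pool_stride=2):
    conv_out = img_size - kernel_size + 1
    return (conv_out - pool_kernel) // pool_stride + 1
print('post-pool sizes:', [(k, post_pool_size(k)) for k in (3, 9, 16)])  # [(3, 12), (9, 9), (16, 6)]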

data_path      = './data'
torch.manual_seed(777)
In [ ]:
# -------------------------------------------------------------
#   Define network class object and its 
#             initialization and forward function
#             (other functions are inherited from torch.nn)
# -------------------------------------------------------------
class MyNet(torch.nn.Module):
    def __init__(self):
        super(MyNet, self).__init__()
        #Conv: input has 1 channel, output has numfilt channels;
        #  the batch dimension of the input is implicit
        # see:   https://docs.pytorch.org/docs/stable/generated/torch.nn.Conv2d.html
        self.conv1   = torch.nn.Conv2d(in_channels=1,out_channels=numfilt,kernel_size=kernel_size2use,stride=1) 
        self.linear1 = torch.nn.Linear(numfilt*reduced_size*reduced_size,16) #after max pooling the feature map will be reduced_size x reduced_size (12x12 for a 3x3 kernel)
        self.linear2 = torch.nn.Linear(16, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        #Uncomment this to see what the size actually is after conv1 + relu
        #print('MYINFO  fwd, after conv1relu, x shape:',x.shape)

        x = F.max_pool2d(x, 3, 2)   #max pooling with kernel size 3, stride 2
        # <<<<<<<<<<<<<<<<<--------------------
        #Uncomment this to see what the size actually is after max pooling
        #print('MYINFO  fwd, after max, x shape:',x.shape)

        x = torch.flatten(x, 1)
        x = self.linear1(x)
        x = F.relu(x)
        x = self.linear2(x)
        #no extra ReLU here: log_softmax below expects the raw scores (logits)
        output = F.log_softmax(x, dim=1)  #log_softmax pairs with nll_loss for multi-class classification
        return output
print('Net class defined ')
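In [ ]:
# Optional sanity check (added for illustration): run one dummy batch through
# an untrained MyNet to confirm the flattened conv output matches
# numfilt*reduced_size*reduced_size -- if kernel_size2use is changed without
# updating reduced_size, this will raise a shape-mismatch error.
dummy_in  = torch.zeros(1, 1, 28, 28)           #one fake 28x28 grayscale image
dummy_out = MyNet()(dummy_in)
print('dummy output shape:', dummy_out.shape)   #expect torch.Size([1, 10])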
In [ ]:
# --------------------------------------------------------
#   Define training function
# --------------------------------------------------------
def train(model, device, train_loader, optimizer, epoch):
    ''' This is called for each epoch.  
        Arguments:  the model, the device to run on, data loader, optimizer, and current epoch
    ''' 
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
      if batch_idx*batch_size>= max_numtrain:
           break
      else:
        if batch_idx==0:  #print one message
          print('INFO train, ep:',epoch,' batidx:',batch_idx, ' batch size:',target.shape[0])
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()                 #reset optimizer state
        output = model(data)                  #get predictions
        loss = F.nll_loss(output, target)     #get loss (nll_loss for softmax outputs)
        loss.backward()                       #backprop loss
        optimizer.step()                      #update weights

# -------------------------------------------------------------
#   Define test function
# -------------------------------------------------------------
def test(model, device, test_loader):
    ''' This is called after each training epoch 
        Arguments:  the model, the device to run on, test data loader
    ''' 
    model.eval()

    #accumulate loss, accuracy info
    total_loss    = 0
    total_correct = 0
    total         = 0
    with torch.no_grad():
      for batch_idx, (data, target) in enumerate(test_loader):
        if batch_idx*batch_size>= max_numtest:
           break
        else:
            data, target = data.to(device), target.to(device)
            output       = model(data)
            total_loss  += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss

            _, predicted  = torch.max(output, dim=1)
            total_correct += (predicted == target).sum().item()
            total         += output.shape[0]
           
    acc       = total_correct/total
    test_loss = total_loss/total 
    print('INFO evaluation acc:',f'{acc:.4}',' loss:',f'{test_loss:.4}','tot:',total)
    return acc,test_loss
def get_activation(name, activation):
    ''' Return a forward hook that stores a layer's output (detached)
        in the activation dict under the given name. '''
    def hook(model, input, output):
        activation[name] = output.detach()
    return hook
    
print('Train, test, and support functions defined')
In [ ]:
# -------------------------------------------------
#  Get device  
#  (note, this is set up for 1 GPU device
#    if this were to run on a full GPU node with >1 gpu device, you would
#     want to get rank, world size info and set device id 
#     as in:   torch.cuda.set_device(local_rank) 
#     and then also run distributed initialization )
# -------------------------------------------------
use_cuda = torch.cuda.is_available() 
if use_cuda:
        num_gpu = torch.cuda.device_count()
        print('INFO,  cuda, num gpu:',num_gpu)
        device     = torch.cuda.current_device()
        print('environ visdevs:',os.environ.get("CUDA_VISIBLE_DEVICES", "<not set>"))
else:
        num_gpu = 0
        print('INFO, cuda not available')
        device  = torch.device("cpu")   
print('INFO, device is:', device)
In [ ]:
# -------------------------------------------
#prepare images for network as they are loaded
#   crop or other functions can be added here
# -------------------------------------------
transform=transforms.Compose([
        transforms.ToTensor(),                      #also scales image pixels from the 0-255 range to 0-1
        transforms.Normalize((0.1307,), (0.3081,))  #normalize with the MNIST mean and std
        ])

dataset1 = datasets.MNIST(data_path, train=True, download=True,transform=transform)
dataset2 = datasets.MNIST(data_path, train=False,download=True,transform=transform)

train_loader =torch.utils.data.DataLoader(dataset1, 
            batch_size =batch_size,     sampler   =None,
            num_workers=num_worker2use, pin_memory=True, drop_last=True)
test_loader = torch.utils.data.DataLoader(dataset2, 
            batch_size =batch_size,     sampler   =None,
            num_workers=num_worker2use, pin_memory=True, drop_last=True)
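In [ ]:
# An alternative sketch (added for illustration, not the approach used below):
# instead of breaking out of the batch loop inside train()/test(), the dataset
# itself can be limited up front with torch.utils.data.Subset.
small_train  = torch.utils.data.Subset(dataset1, range(max_numtrain))
small_loader = torch.utils.data.DataLoader(small_train,
            batch_size =batch_size,     num_workers=num_worker2use,
            pin_memory=True, drop_last=True)
print('Subset example:', len(small_train), 'training examples in', len(small_loader), 'batches')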
In [ ]:
# -------------------------------------------
#  Set up model
# -------------------------------------------
mymodel = MyNet().to(device)

#summary(mymodel,input_size=(1, 1, 28, 28))
In [ ]:
# -------------------------------------------
#  Do training loop
# -------------------------------------------

# Dictionary to store activations
activations = {}
# Register hooks
mymodel.conv1.register_forward_hook(get_activation('conv1', activations))

optimizer = torch.optim.Adam(mymodel.parameters(), lr=lrate)

train_results = []
test_results  = []
for epoch in range(epochs):
        print('INFO about to train epoch:',epoch)
        start_time=time.time()
        train(mymodel, device, train_loader, optimizer, epoch)
        print('INFO training time:', f'{time.time()-start_time:.5f}')
        print('INFO train metrics for epoch:',epoch)
        train_results.append(test(mymodel, device, train_loader))
        print('INFO test metrics for epoch:',epoch)
        test_results.append(test(mymodel, device, test_loader))

print('INFO  done');
In [ ]:
 

Below is code to plot and inspect the results¶

In [ ]:
#convert the per-epoch results lists to numpy arrays for plotting
train_results=np.array(train_results)
test_results =np.array(test_results)
In [ ]:
# plot accuracy over epochs

import matplotlib.pyplot as plt      #These provide matlab type of plotting functions
import matplotlib.image as mpimg
%matplotlib inline                   

plt.figure()
plt.axis([0 ,epochs, 0, 1])
plt.plot(train_results[:,0]) #0th col is accuracy, col 1 is loss
plt.plot(test_results[:,0])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()
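In [ ]:
# A companion sketch (added for illustration) that plots the loss curves the
# same way, using column 1 of the results arrays (column 0 is accuracy).
plt.figure()
plt.plot(train_results[:,1])
plt.plot(test_results[:,1])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()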
In [ ]:
# Get some sample predictions
with torch.no_grad():
  for batch_idx, (data, target) in enumerate(test_loader):
        data, target = data.to(device), target.to(device)
        output       = mymodel(data)
        _, predicted = torch.max(output, dim=1)
        break
output=output.cpu().numpy()
predicted=predicted.cpu().numpy()
In [ ]:
#To view a sample image and predictions
import matplotlib.pyplot as plt      
import matplotlib.image as mpimg

#set range_start to pick which 5 consecutive test examples to show (e.g. 0, 5, 11, ...)
range_start=11
for i in range(range_start,range_start+5):
  #print('For example i:',i,' rawoutput:',np.round(output[i,:],1))
  print('For example i:',i,' predicted:',predicted[i])
  print('----------------------------------------------------')
for i in range(5):
    plt.subplot(1,5,i+1)
    plt.xticks([])
    plt.yticks([])
    tmpimg=np.squeeze(data[range_start+i,:,:,:].cpu()).reshape((28,28))
    plt.imshow(tmpimg,'gray')   
In [ ]:
# ------------ GET WEIGHTS From Convolution Layer and make mosaic image

#take weights from conv layer and detach from model, move to cpu (in case we are on gpu)
Wlist   =mymodel.conv1.weight.detach().cpu()  #tensor of shape (numfilt, 1, kernel, kernel)

W3Dchan     =np.squeeze(Wlist) #drop the singleton channel dimension -> (numfilt, kernel, kernel)
print("W3D shape:"+str(W3Dchan.shape))

#plot a mosaic of the filter weights (at most 16 filters shown)
ncol =4
nrow =np.ceil(16/ncol).astype(int)   #mosaic layout: 4 columns
plt.figure()
for i in range(min(16,W3Dchan.shape[0])):
   plt.subplot(nrow,ncol,i+1)
   plt.imshow(W3Dchan[i],'gray')
   plt.axis('off')

plt.show()
print('done plotting weights mosaic')
In [ ]:
#  ---------------- NOW visualize the activations for the first test example --------
#   1. gather activations from the model layers
# -------------------------------------------------------------------------

with torch.no_grad():
  for batch_idx, (data, target) in enumerate(test_loader):
      break
#try different images by changing 0:1 to 1:2, etc.
test_img   = data[0:1,:,:,:].to(device)
model_pred = mymodel(test_img) #run model on 1 input
conv1_act  = np.squeeze(activations['conv1'].detach().cpu())
print('activation array shape:',conv1_act.shape)
# 2.  Now output a mosaic of layer 1
ncol =4
nrow =np.ceil(16/ncol).astype(int)
plt.figure()
for i in range(min(conv1_act.shape[0],16)):  
   plt.subplot(nrow,ncol,i+1)
   plt.imshow(conv1_act[i,:,:],'gray')
   plt.axis('off')
#plt.savefig("test.png", bbox_inches='tight')
plt.show()
print('done plotting layer1 activation output mosaic')