PyTorch¶

In [1]:
# To install PyTorch on your machine 
#!pip install torch
In [2]:
import numpy as np 

# import PyTorch
import torch
import torch.nn as nn
from torchviz import make_dot

Tensors¶

The tensor data structure is very similar to NumPy arrays. The main differences are that tensors can also be loaded onto GPUs and that PyTorch can track operations on them for automatic differentiation.

In [3]:
data = [[1, 2],[3, 4]]
np_array = np.array(data)
np_array
Out[3]:
array([[1, 2],
       [3, 4]])
In [4]:
tensor = torch.from_numpy(np_array)
tensor
Out[4]:
tensor([[1, 2],
        [3, 4]])
In [5]:
tensor.shape
Out[5]:
torch.Size([2, 2])
In [6]:
# Check which device it's on
tensor.device
Out[6]:
device(type='cpu')
In [7]:
# Cast back into numpy array 
np_array_back = tensor.detach().numpy() # .detach() is needed if the tensor requires grad; add .cpu() first if it's on a GPU
np_array_back
Out[7]:
array([[1, 2],
       [3, 4]])
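Since tensors can live on a GPU, here is a minimal sketch of moving a tensor onto a GPU and back; it falls back to the CPU if no CUDA device is available, and the variable names are just illustrative.

In [ ]:
# Move the tensor to a GPU if one is available (falls back to CPU otherwise)
device = "cuda" if torch.cuda.is_available() else "cpu"
tensor_on_device = tensor.to(device)
print(tensor_on_device.device)
# Converting back to numpy requires moving to the CPU first
np_array_back_again = tensor_on_device.cpu().detach().numpy()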

Automatically obtaining the computation graph and gradients¶

  • Elegant design principle: the computation graph is created dynamically as you specify the forward pass.
  • PyTorch keeps track of which operations have been performed, and then replays them backward to compute the gradients.
In [8]:
# Using the example from Computation graph #1 yesterday 
In [9]:
# Input data 
xyz = torch.tensor([-2.0, 5.0, -4.0], requires_grad=True) 
#requires_grad=True tells PyTorch that we want to backprop through these variables
In [10]:
# The forward pass calculates the function 
# f = (x+y)*z
f = (xyz[0]+xyz[1])*xyz[2]
In [11]:
f
Out[11]:
tensor(-12., grad_fn=<MulBackward0>)
In [12]:
f.backward() #Tell Pytorch to do the backwards pass through the computation graph 
In [13]:
# Partial derivatives of f with respect to each element
# in vector xyz
print(xyz.grad)
tensor([-4., -4.,  3.])
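By hand: since f = (x + y) * z with x = -2, y = 5, z = -4, the partial derivatives are ∂f/∂x = z = -4, ∂f/∂y = z = -4, and ∂f/∂z = x + y = 3, which matches the gradient tensor above.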
In [14]:
# Slight tweak on computation graph #1
xyz = torch.tensor([-2.0, 5.0, -4.0, 1.0], requires_grad=True) 
f2 = 3*(xyz[0]+xyz[1])*xyz[2]
In [15]:
f2
Out[15]:
tensor(-36., grad_fn=<MulBackward0>)
In [16]:
f2.backward()
In [17]:
print(xyz.grad)
tensor([-12., -12.,   9.,   0.])
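One detail worth seeing directly: .grad accumulates across backward passes rather than being overwritten. A small sketch using the same function as above (the name f3 is just for illustration):

In [ ]:
# Gradients *accumulate* in .grad across backward passes
f3 = 3*(xyz[0]+xyz[1])*xyz[2]
f3.backward()
print(xyz.grad)    # expect tensor([-24., -24., 18., 0.]) -- the previous gradients plus the new ones
xyz.grad.zero_()   # clear the stored gradients (this is what optimizer.zero_grad() does during training)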
In [18]:
# Note: this install command uses Homebrew, which is macOS-specific
#!brew install graphviz
In [19]:
# # Visualize the computation graph 
# # Blue is the shape of the input tensor, xyz
# # Green is the output
# make_dot(f2, params={'xyz': xyz})

Logistic Regression with PyTorch¶

In [20]:
# Same toy data as HW3
NUM_CLASSES = 2 #let's do binary logistic regression again 

X_train = torch.tensor([[1.0, 0.0, 0.0], 
                        [0.0, 1.0, 1.0], 
                        [1.0, 1.0, 0.0], 
                        [0.0, 0.0, 1.0]])
Y_train = torch.tensor([0, 1, 0, 1])

num_examples = X_train.shape[0]
num_words = X_train.shape[1]

assert X_train.shape[0] == Y_train.shape[0]
In [21]:
# Create a linear layer
theta = nn.Linear(num_words, NUM_CLASSES) 
out = theta(X_train)
out.shape
Out[21]:
torch.Size([4, 2])
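Under the hood, nn.Linear stores a weight matrix of shape (out_features, in_features) and a bias vector of shape (out_features,), and computes X @ W.T + b. A quick shape check (the values themselves are randomly initialized):

In [ ]:
# The linear layer's parameters
print(theta.weight.shape)   # torch.Size([2, 3]), i.e. (NUM_CLASSES, num_words)
print(theta.bias.shape)     # torch.Size([2])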
In [22]:
# Softmax via log softmax (the log is for numerical stability)
log_softmax = nn.LogSoftmax(dim=1)
log_probs = log_softmax(out)
In [23]:
# Get back to numpy 
log_probs_numpy = log_probs.detach().numpy()
log_probs_numpy
Out[23]:
array([[-1.941936  , -0.15481459],
       [-0.5606183 , -0.84596276],
       [-1.4656038 , -0.26258427],
       [-0.8536415 , -0.5548843 ]], dtype=float32)
In [24]:
# How do we get probabilities out?
np.exp(log_probs_numpy)
Out[24]:
array([[0.143426  , 0.856574  ],
       [0.570856  , 0.429144  ],
       [0.23093851, 0.76906157],
       [0.42586133, 0.5741387 ]], dtype=float32)
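Because softmax normalizes over the class dimension, each row of these probabilities should sum to 1 (up to floating-point error):

In [ ]:
# Sanity check: probabilities for each example sum to 1 across the two classes
np.exp(log_probs_numpy).sum(axis=1)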

Put into a class structure¶

In [25]:
class BinaryLogisticRegressionModel(nn.Module):
    """
    Pytorch implementation for binary logistic regression 
    """
    def __init__(self, num_words, num_classes):
        super().__init__()
        
        # Create the network architecture ("stack the legos")
        self.theta = nn.Linear(num_words, num_classes)
        self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self, X):
        out = self.theta(X)
        log_probs = self.log_softmax(out)
        return log_probs
    
    def train_model(self, X, Y, loss_fn, optimizer, num_iterations): 
        """
        Training our binary logistic regression model 

        Note: Here we are not using mini-batches; we go through the entire
            dataset in every iteration (not recommended beyond toy examples)
        """
        self.train() # tells nn.Module it's in training mode 
                     # (important when we get to things like dropout)

        for t in range(num_iterations): 
            # Forward pass 
            pred = self.forward(X)
            loss = loss_fn(pred, Y)

            #Backprop
            optimizer.zero_grad() # clears the gradients from the previous iteration
                                  # this step is important because otherwise PyTorch will 
                                  # *accumulate* gradients across iterations (all backward passes)
            loss.backward() # calculate gradients from forward step 
            optimizer.step() # gradient descent update equation 

            loss_value = loss.item() # .item() extracts the loss as a plain Python number
            print(f"Iteration={t}, Loss={loss_value}") 
    
    def predict(self, X): 
        self.eval() # tells nn.Module it's NOT in training mode 
                    # (important when we get to things like dropout)
    
        pred_log_probs = self.forward(X)
        log_pred_pos_class = pred_log_probs[:,1].detach().numpy() #get only the positive class 
        pred_pos_class = np.exp(log_pred_pos_class) #exp to undo the log 
        # decision threshold
        y_pred = np.zeros(X.shape[0])
        y_pred[pred_pos_class>= 0.5] = 1 
        return y_pred, pred_pos_class
In [26]:
LEARNING_RATE = 1e-3
NUMBER_ITERATIONS = 10 
loss_fn = nn.NLLLoss() # choose our loss function 
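nn.NLLLoss expects log-probabilities (which is why the model ends in LogSoftmax) together with integer class labels. An equivalent alternative, sketched here, is nn.CrossEntropyLoss, which applies the log softmax internally, so the model would then return the raw linear outputs (logits) instead:

In [ ]:
# Equivalent formulation: CrossEntropyLoss = LogSoftmax + NLLLoss
# (if used, drop the LogSoftmax from the model and return the raw linear outputs)
alternative_loss_fn = nn.CrossEntropyLoss()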
In [27]:
# Wouldn't expect the loss to improve dramatically each iteration because we're not
# using mini-batches here 
model = BinaryLogisticRegressionModel(num_words, NUM_CLASSES) # initialize
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE) #stochastic gradient descent 
model.train_model(X_train, Y_train, loss_fn, optimizer, NUMBER_ITERATIONS)
Iteration=0, Loss=0.7447301745414734
Iteration=1, Loss=0.744430661201477
Iteration=2, Loss=0.7441313862800598
Iteration=3, Loss=0.7438322901725769
Iteration=4, Loss=0.7435333728790283
Iteration=5, Loss=0.7432346343994141
Iteration=6, Loss=0.7429361343383789
Iteration=7, Loss=0.7426378130912781
Iteration=8, Loss=0.742339551448822
Iteration=9, Loss=0.7420416474342346
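Each optimizer.step() applies the gradient descent update θ ← θ − lr·∇θ(loss), so with lr = 1e-3 and only 10 iterations the loss barely moves. A sketch of a longer run on this toy data (the learning rate and iteration count below are illustrative, not tuned):

In [ ]:
# Re-train with a larger learning rate and more iterations (illustrative values)
model_long = BinaryLogisticRegressionModel(num_words, NUM_CLASSES)
optimizer_long = torch.optim.SGD(model_long.parameters(), lr=0.1)
model_long.train_model(X_train, Y_train, loss_fn, optimizer_long, 100)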
In [28]:
# Inference time (predict on a single example) 
X_test = torch.tensor([[1.0, 0.0, 0.0]])
y_pred, pred_pos_class = model.predict(X_test)
In [29]:
pred_pos_class
Out[29]:
array([0.42864218], dtype=float32)
In [30]:
y_pred
Out[30]:
array([0.])
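The predicted positive-class probability (about 0.43) is below the 0.5 decision threshold, so the model labels this example as class 0.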
In [31]:
# print(f"Model structure: {model}\n")

# for name, param in model.named_parameters():
#     print(f"Layer: {name} | Size: {param.size()} | Values : {param[:2]} \n")
In [32]:
# make_dot(model.forward(X_train))

#Addmm is matrix multiplication plus addition (the linear layer's XW^T + b)
# Blue: thetas (weight and bias terms)
# Green: output of the forward pass (num_examples, num_classes)

Shallow -> Deep Network¶

In [33]:
class BasicFeedForwardModel(nn.Module):
    """
    PyTorch implementation of a feed-forward model with a single hidden layer 
    """
    def __init__(self, num_words, num_classes, hidden_dim1):
        super().__init__()
        
        # Create the network architecture ("stack the legos")
        
        ###### UPDATE FROM LOGREG ######
        self.hidden1 = nn.Linear(num_words, hidden_dim1)
        self.theta = nn.Linear(hidden_dim1, num_classes)
        #################################
        self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self, X):
        ###### UPDATE FROM LOGREG ######
        hid1 = self.hidden1(X)
        hid1 = nn.functional.relu(hid1)
        out = self.theta(hid1)
        #################################
        log_probs = self.log_softmax(out)
        return log_probs
    
    def train_model(self, X, Y, loss_fn, optimizer, num_iterations): 
        """
        Training our feed-forward model 

        Note: Here we are not using mini-batches; we go through the entire
            dataset in every iteration (not recommended beyond toy examples)
        """
        self.train() # tells nn.Module it's in training mode 
                     # (important when we get to things like dropout)

        for t in range(num_iterations): 
            # Forward pass 
            pred = self.forward(X)
            loss = loss_fn(pred, Y)

            #Backprop
            optimizer.zero_grad() # clears the gradients from the previous iteration
                                  # this step is important because otherwise PyTorch will 
                                  # *accumulate* gradients across iterations (all backward passes)
            loss.backward() # calculate gradients from forward step 
            optimizer.step() # gradient descent update equation 

            loss_value = loss.item() # .item() extracts the loss as a plain Python number
            print(f"Iteration={t}, Loss={loss_value}") 
    
    def predict(self, X): 
        self.eval() # tells nn.Module it's NOT in training mode 
                    # (important when we get to things like dropout)
    
        pred_log_probs = self.forward(X)
        log_pred_pos_class = pred_log_probs[:,1].detach().numpy() #get only the positive class 
        pred_pos_class = np.exp(log_pred_pos_class) #exp to undo the log 
        # decision threshold
        y_pred = np.zeros(X.shape[0])
        y_pred[pred_pos_class>= 0.5] = 1 
        return y_pred, pred_pos_class
In [34]:
HIDDEN_DIM = 5

model = BasicFeedForwardModel(num_words, NUM_CLASSES, HIDDEN_DIM) # initialize
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE) #stochastic gradient descent 
model.train_model(X_train, Y_train, loss_fn, optimizer, NUMBER_ITERATIONS)
Iteration=0, Loss=0.704086184501648
Iteration=1, Loss=0.7040359377861023
Iteration=2, Loss=0.703985869884491
Iteration=3, Loss=0.7039358019828796
Iteration=4, Loss=0.7038857340812683
Iteration=5, Loss=0.7038357257843018
Iteration=6, Loss=0.7037858963012695
Iteration=7, Loss=0.7037360072135925
Iteration=8, Loss=0.7036861777305603
Iteration=9, Loss=0.7036364674568176
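With HIDDEN_DIM = 5, this network has 3*5 + 5 = 20 parameters in hidden1 and 5*2 + 2 = 12 in theta, 32 in total. Going deeper just means stacking more Linear + nonlinearity pairs; a minimal sketch of a two-hidden-layer variant (the class name and hidden_dim2 are illustrative, and train_model/predict would be the same as above):

In [ ]:
class TwoHiddenLayerModel(nn.Module):
    """Illustrative sketch: the same recipe as above with a second hidden layer"""
    def __init__(self, num_words, num_classes, hidden_dim1, hidden_dim2):
        super().__init__()
        self.hidden1 = nn.Linear(num_words, hidden_dim1)
        self.hidden2 = nn.Linear(hidden_dim1, hidden_dim2)
        self.theta = nn.Linear(hidden_dim2, num_classes)
        self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self, X):
        hid1 = nn.functional.relu(self.hidden1(X))
        hid2 = nn.functional.relu(self.hidden2(hid1))
        return self.log_softmax(self.theta(hid2))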