PyTorch¶

In [1]:
# To install PyTorch on your machine 
#!pip install torch
In [2]:
import numpy as np 

# import PyTorch
import torch
import torch.nn as nn
from torchviz import make_dot

Tensors¶

The tensor data structure is very similar to NumPy arrays. The main differences are that tensors can also be loaded onto GPUs and that PyTorch can track operations on them for automatic differentiation.

In [3]:
data = [[1, 2],[3, 4]]
np_array = np.array(data)
np_array
Out[3]:
array([[1, 2],
       [3, 4]])
In [4]:
tensor = torch.from_numpy(np_array)
tensor
Out[4]:
tensor([[1, 2],
        [3, 4]])
In [5]:
tensor.shape
Out[5]:
torch.Size([2, 2])
In [6]:
# Check which device it's on
tensor.device
Out[6]:
device(type='cpu')
In [7]:
# Cast back into numpy array 
np_array_back = tensor.detach().numpy() # .detach() is needed if the tensor requires grad; add .cpu() first if it's on a GPU
np_array_back
Out[7]:
array([[1, 2],
       [3, 4]])
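Since tensors can live on a GPU, here is a minimal sketch of moving a tensor onto a GPU and back; it falls back to the CPU if no CUDA device is available, and the variable names are just illustrative.

In [ ]:
# Move the tensor to a GPU if one is available (falls back to CPU otherwise)
device = "cuda" if torch.cuda.is_available() else "cpu"
tensor_on_device = tensor.to(device)
print(tensor_on_device.device)
# Converting back to numpy requires moving to the CPU first
np_array_back_again = tensor_on_device.cpu().detach().numpy()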

Automatically obtaining the computation graph and gradients¶

  • Elegant design principle: the computation graph is created dynamically as you specify the forward pass.
  • PyTorch keeps track of which operations have been performed, and then replays them backward to compute the gradients.
In [8]:
# Using the example from Computation graph #1 yesterday 
In [9]:
# Input data 
xyz = torch.tensor([-2.0, 5.0, -4.0], requires_grad=True) 
#requires_grad=True tells PyTorch that we want to backprop through these variables
In [10]:
# The forward pass calculates the function 
# f = (x+y)*z
f = (xyz[0]+xyz[1])*xyz[2]
In [11]:
f
Out[11]:
tensor(-12., grad_fn=<MulBackward0>)
In [12]:
f.backward() #Tell Pytorch to do the backwards pass through the computation graph 
In [13]:
# Partial derivatives of f with respect to each element
# in vector xyz
print(xyz.grad)
tensor([-4., -4.,  3.])
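By hand: since f = (x + y) * z with x = -2, y = 5, z = -4, the partial derivatives are ∂f/∂x = z = -4, ∂f/∂y = z = -4, and ∂f/∂z = x + y = 3, which matches the gradient tensor above.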
In [14]:
# Slight tweak on computation graph #1
xyz = torch.tensor([-2.0, 5.0, -4.0, 1.0], requires_grad=True) 
f2 = 3*(xyz[0]+xyz[1])*xyz[2]
In [15]:
f2
Out[15]:
tensor(-36., grad_fn=<MulBackward0>)
In [16]:
f2.backward()
In [17]:
print(xyz.grad)
tensor([-12., -12.,   9.,   0.])
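One detail worth seeing directly: .grad accumulates across backward passes rather than being overwritten. A small sketch using the same function as above (the name f3 is just for illustration):

In [ ]:
# Gradients *accumulate* in .grad across backward passes
f3 = 3*(xyz[0]+xyz[1])*xyz[2]
f3.backward()
print(xyz.grad)    # expect tensor([-24., -24., 18., 0.]) -- the previous gradients plus the new ones
xyz.grad.zero_()   # clear the stored gradients (this is what optimizer.zero_grad() does during training)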
In [18]:
# Note: this install command uses Homebrew, which is macOS-specific
#!brew install graphviz
In [19]:
# # Visualize the computation graph 
# # Blue is the shape of the input tensor, xyz
# # Green is the output
# make_dot(f2, params={'xyz': xyz})

Logistic Regression with PyTorch¶

In [20]:
# Same toy data as HW3
NUM_CLASSES = 2 #let's do binary logistic regression again 

X_train = torch.tensor([[1.0, 0.0, 0.0], 
                        [0.0, 1.0, 1.0], 
                        [1.0, 1.0, 0.0], 
                        [0.0, 0.0, 1.0]])
Y_train = torch.tensor([0, 1, 0, 1])

num_examples = X_train.shape[0]
num_words = X_train.shape[1]

assert X_train.shape[0] == Y_train.shape[0]
In [21]:
# Create a linear layer
theta = nn.Linear(num_words, NUM_CLASSES) 
out = theta(X_train)
out.shape
Out[21]:
torch.Size([4, 2])
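Under the hood, nn.Linear stores a weight matrix of shape (out_features, in_features) and a bias vector of shape (out_features,), and computes X @ W.T + b. A quick shape check (the values themselves are randomly initialized):

In [ ]:
# The linear layer's parameters
print(theta.weight.shape)   # torch.Size([2, 3]), i.e. (NUM_CLASSES, num_words)
print(theta.bias.shape)     # torch.Size([2])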
In [22]:
# Softmax via log softmax (the log is for numerical stability)
log_softmax = nn.LogSoftmax(dim=1)
log_probs = log_softmax(out)
In [23]:
# Get back to numpy 
log_probs_numpy = log_probs.detach().numpy()
log_probs_numpy
Out[23]:
array([[-1.941936  , -0.15481459],
       [-0.5606183 , -0.84596276],
       [-1.4656038 , -0.26258427],
       [-0.8536415 , -0.5548843 ]], dtype=float32)
In [24]:
# How do we get probabilities out?
np.exp(log_probs_numpy)
Out[24]:
array([[0.143426  , 0.856574  ],
       [0.570856  , 0.429144  ],
       [0.23093851, 0.76906157],
       [0.42586133, 0.5741387 ]], dtype=float32)
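Because softmax normalizes over the class dimension, each row of these probabilities should sum to 1 (up to floating-point error):

In [ ]:
# Sanity check: probabilities for each example sum to 1 across the two classes
np.exp(log_probs_numpy).sum(axis=1)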

Put into a class structure¶

In [25]:
class BinaryLogisticRegressionModel(nn.Module):
    """
    Pytorch implementation for binary logistic regression 
    """
    def __init__(self, num_words, num_classes):
        super().__init__()
        
        # Create the network architecture ("stack the legos")
        self.theta = nn.Linear(num_words, num_classes)
        self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self, X):
        out = self.theta(X)
        log_probs = self.log_softmax(out)
        return log_probs
    
    def train_model(self, X, Y, loss_fn, optimizer, num_iterations): 
        """
        Training our binary logistic regression model 

        Note: Here we are not using mini-batches; we go through the entire
            dataset in every iteration (not recommended beyond toy examples)
        """
        self.train() # tells nn.Module it's in training mode 
                     # (important when we get to things like dropout)

        for t in range(num_iterations): 
            # Forward pass 
            pred = self.forward(X)
            loss = loss_fn(pred, Y)

            #Backprop
            optimizer.zero_grad() # clears the gradients from the previous iteration
                                  # this step is important because otherwise PyTorch will 
                                  # *accumulate* gradients across iterations (all backward passes)
            loss.backward() # calculate gradients from forward step 
            optimizer.step() # gradient descent update equation 

            loss_value = loss.item() # .item() extracts the loss as a plain Python number
            print(f"Iteration={t}, Loss={loss_value}") 
    
    def predict(self, X): 
        self.eval() # tells nn.Module it's NOT in training mode 
                    # (important when we get to things like dropout)
    
        pred_log_probs = self.forward(X)
        log_pred_pos_class = pred_log_probs[:,1].detach().numpy() #get only the positive class 
        pred_pos_class = np.exp(log_pred_pos_class) #exp to undo the log 
        # decision threshold
        y_pred = np.zeros(X.shape[0])
        y_pred[pred_pos_class>= 0.5] = 1 
        return y_pred, pred_pos_class
In [26]:
LEARNING_RATE = 1e-3
NUMBER_ITERATIONS = 10 
loss_fn = nn.NLLLoss() # choose our loss function 
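nn.NLLLoss expects log-probabilities (which is why the model ends in LogSoftmax) together with integer class labels. An equivalent alternative, sketched here, is nn.CrossEntropyLoss, which applies the log softmax internally, so the model would then return the raw linear outputs (logits) instead:

In [ ]:
# Equivalent formulation: CrossEntropyLoss = LogSoftmax + NLLLoss
# (if used, drop the LogSoftmax from the model and return the raw linear outputs)
alternative_loss_fn = nn.CrossEntropyLoss()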
In [27]:
# Wouldn't expect the loss to improve dramatically each iteration because we're not
# using mini-batches here 
model = BinaryLogisticRegressionModel(num_words, NUM_CLASSES) # initialize
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE) #stochastic gradient descent 
model.train_model(X_train, Y_train, loss_fn, optimizer, NUMBER_ITERATIONS)
Iteration=0, Loss=0.7447301745414734
Iteration=1, Loss=0.744430661201477
Iteration=2, Loss=0.7441313862800598
Iteration=3, Loss=0.7438322901725769
Iteration=4, Loss=0.7435333728790283
Iteration=5, Loss=0.7432346343994141
Iteration=6, Loss=0.7429361343383789
Iteration=7, Loss=0.7426378130912781
Iteration=8, Loss=0.742339551448822
Iteration=9, Loss=0.7420416474342346
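Each optimizer.step() applies the gradient descent update θ ← θ − lr·∇θ(loss), so with lr = 1e-3 and only 10 iterations the loss barely moves. A sketch of a longer run on this toy data (the learning rate and iteration count below are illustrative, not tuned):

In [ ]:
# Re-train with a larger learning rate and more iterations (illustrative values)
model_long = BinaryLogisticRegressionModel(num_words, NUM_CLASSES)
optimizer_long = torch.optim.SGD(model_long.parameters(), lr=0.1)
model_long.train_model(X_train, Y_train, loss_fn, optimizer_long, 100)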
In [28]:
# Inference time (predict on a single example) 
X_test = torch.tensor([[1.0, 0.0, 0.0]])
y_pred, pred_pos_class = model.predict(X_test)
In [29]:
pred_pos_class
Out[29]:
array([0.42864218], dtype=float32)
In [30]:
y_pred
Out[30]:
array([0.])
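The predicted positive-class probability (about 0.43) is below the 0.5 decision threshold, so the model labels this example as class 0.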
In [31]:
# print(f"Model structure: {model}\n")

# for name, param in model.named_parameters():
#     print(f"Layer: {name} | Size: {param.size()} | Values : {param[:2]} \n")
In [32]:
# make_dot(model.forward(X_train))

#Addmm is matrix multiplication plus addition (the linear layer's XW^T + b)
# Blue: thetas (weight and bias terms)
# Green: output of the forward pass (num_examples, num_classes)

Shallow -> Deep Network¶

In [33]:
class BasicFeedForwardModel(nn.Module):
    """
    PyTorch implementation of a feed-forward model with a single hidden layer 
    """
    def __init__(self, num_words, num_classes, hidden_dim1):
        super().__init__()
        
        # Create the network architecture ("stack the legos")
        
        ###### UPDATE FROM LOGREG ######
        self.hidden1 = nn.Linear(num_words, hidden_dim1)
        self.theta = nn.Linear(hidden_dim1, num_classes)
        #################################
        self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self, X):
        ###### UPDATE FROM LOGREG ######
        hid1 = self.hidden1(X)
        hid1 = nn.functional.relu(hid1)
        out = self.theta(hid1)
        #################################
        log_probs = self.log_softmax(out)
        return log_probs
    
    def train_model(self, X, Y, loss_fn, optimizer, num_iterations): 
        """
        Training our feed-forward model 

        Note: Here we are not using mini-batches; we go through the entire
            dataset in every iteration (not recommended beyond toy examples)
        """
        self.train() # tells nn.Module it's in training mode 
                     # (important when we get to things like dropout)

        for t in range(num_iterations): 
            # Forward pass 
            pred = self.forward(X)
            loss = loss_fn(pred, Y)

            #Backprop
            optimizer.zero_grad() # clears the gradients from the previous iteration
                                  # this step is important because otherwise PyTorch will 
                                  # *accumulate* gradients across iterations (all backward passes)
            loss.backward() # calculate gradients from forward step 
            optimizer.step() # gradient descent update equation 

            loss_value = loss.item() # .item() extracts the loss as a plain Python number
            print(f"Iteration={t}, Loss={loss_value}") 
    
    def predict(self, X): 
        self.eval() # tells nn.Module it's NOT in training mode 
                    # (important when we get to things like dropout)
    
        pred_log_probs = self.forward(X)
        log_pred_pos_class = pred_log_probs[:,1].detach().numpy() #get only the positive class 
        pred_pos_class = np.exp(log_pred_pos_class) #exp to undo the log 
        # decision threshold
        y_pred = np.zeros(X.shape[0])
        y_pred[pred_pos_class>= 0.5] = 1 
        return y_pred, pred_pos_class
In [34]:
HIDDEN_DIM = 5

model = BasicFeedForwardModel(num_words, NUM_CLASSES, HIDDEN_DIM) # initialize
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE) #stochastic gradient descent 
model.train_model(X_train, Y_train, loss_fn, optimizer, NUMBER_ITERATIONS)
Iteration=0, Loss=0.704086184501648
Iteration=1, Loss=0.7040359377861023
Iteration=2, Loss=0.703985869884491
Iteration=3, Loss=0.7039358019828796
Iteration=4, Loss=0.7038857340812683
Iteration=5, Loss=0.7038357257843018
Iteration=6, Loss=0.7037858963012695
Iteration=7, Loss=0.7037360072135925
Iteration=8, Loss=0.7036861777305603
Iteration=9, Loss=0.7036364674568176
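With HIDDEN_DIM = 5, this network has 3*5 + 5 = 20 parameters in hidden1 and 5*2 + 2 = 12 in theta, 32 in total. Going deeper just means stacking more Linear + nonlinearity pairs; a minimal sketch of a two-hidden-layer variant (the class name and hidden_dim2 are illustrative, and train_model/predict would be the same as above):

In [ ]:
class TwoHiddenLayerModel(nn.Module):
    """Illustrative sketch: the same recipe as above with a second hidden layer"""
    def __init__(self, num_words, num_classes, hidden_dim1, hidden_dim2):
        super().__init__()
        self.hidden1 = nn.Linear(num_words, hidden_dim1)
        self.hidden2 = nn.Linear(hidden_dim1, hidden_dim2)
        self.theta = nn.Linear(hidden_dim2, num_classes)
        self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self, X):
        hid1 = nn.functional.relu(self.hidden1(X))
        hid2 = nn.functional.relu(self.hidden2(hid1))
        return self.log_softmax(self.theta(hid2))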