### Pretrained models

References: 
- See tutorials from the company "huggingface" [1](https://colab.research.google.com/github/huggingface/notebooks/blob/master/course/en/chapter1/section3.ipynb#scrollTo=tn8H2n8VeDcL) [2](https://colab.research.google.com/github/huggingface/notebooks/blob/master/course/en/chapter3/section3.ipynb)

In [1]:
# You must install transformers library first 
# !pip install transformers

In [2]:
# Ignore warnings for the demo 
# TODO: comment this out if you'd like to see the warnings 
import logging
logging.disable(logging.WARNING)

#### Softmax 
This function generalizes the logistic/sigmoid to multiple classes

In [3]:
import torch
torch.set_printoptions(sci_mode=False) # no scientific notation on prints
                                       # just to visualize outputs better

In [4]:
a = torch.tensor([2.0, -3.0, 5.0, 7.0])
a

tensor([ 2., -3.,  5.,  7.])

In [5]:
softmax = torch.nn.Softmax(dim=0) #the dimension of the input tensor
                                  # for which you want it to sum to 1
out = softmax(a)
out 

tensor([    0.0059,     0.0000,     0.1185,     0.8756])

In [6]:
sum(out)

tensor(1.)

#### Generation 

In [7]:
from transformers import pipeline

In [8]:
# "Distilled" version of GPT-2 (smaller but still decent performance)
generator = pipeline("text-generation", model="distilgpt2")

In [9]:
generator(
    "Hath in the",
    max_length=30,
    num_return_sequences=5,
)



[{'generated_text': 'Hath in the Middle East,‚Äù he said.\n\n\n\n"At the same time, I think it\'s interesting that we still'},
 {'generated_text': 'Hath in the dark. The eyes of her father-in-law, Srinagar, are now glazed with red lights.\n\n'},
 {'generated_text': 'Hath in the morning, the man was rushed to the scene and died in a hospital at the scene. (Published Thursday, Jan. 21,'},
 {'generated_text': 'Hath in the United States.\n\n\nThe Obama administration‚Ä§s attempt to build on the campaign rhetoric and the press conferences around it at'},
 {'generated_text': 'Hath in the last few episodes.\n\n\n\n\n\nThis episode also includes exclusive material and the interview with Eileen Lee (Dr.'}]

In [10]:
# Why don't these sound like Shakespeare?

#### Zero-shot text classification 

No fine-tuning on in-domain data. 

In [11]:
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

In [12]:
classifier(
    "while the total impact on grain production is unclear china has said the flooding slashed summer grain output by 11 percent from last year's harvest",
    candidate_labels=["not about aid", "about aid"],
)

{'sequence': "while the total impact on grain production is unclear china has said the flooding slashed summer grain output by 11 percent from last year's harvest",
 'labels': ['about aid', 'not about aid'],
 'scores': [0.6267188191413879, 0.3732811510562897]}

#### Fine-tuning


In [13]:
import csv
import os 
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification
import torch
from sklearn.metrics import accuracy_score

Load data 

In [14]:
#change with your own path to HW 2 data
path_to_hw2_data = './data/triage/'

In [15]:
def load_triage_data(data_dir, split_name):
    all_texts = []
    all_labels = []
    
    with open(os.path.join(data_dir, split_name + ".csv"),
              newline='', mode="r", encoding="utf8") as infile:
        reader = csv.DictReader(infile, delimiter="|")
        for row in reader:
            text = row["Text"]
            label = int(row["Label"])
            all_texts.append(text)
            all_labels.append(label)
    print(f'read {len(all_texts)} lines of data')
    return all_texts, all_labels

In [16]:
train_texts, train_labels = load_triage_data(path_to_hw2_data, 'train')

read 21046 lines of data


In [17]:
dev_texts, dev_labels = load_triage_data(path_to_hw2_data, 'dev')

read 2573 lines of data


Tokenize

In [18]:
MAX_LENGTH = 20 #specify the maximum length of a sentence 
MODEL_NAME = 'distilbert-base-uncased' 
#DistilBERT is a small, fast, cheap and light Transformer model 

In [19]:
tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_NAME) # The model_name needs to match our pre-trained model.

In [20]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=MAX_LENGTH)
dev_encodings  = tokenizer(dev_texts, truncation=True, padding=True, max_length=MAX_LENGTH)

In [21]:
train_labels_encoded = torch.tensor(train_labels)
dev_labels_encoded = torch.tensor(dev_labels)

In [22]:
' '.join(train_encodings[0].tokens[0:MAX_LENGTH])

'[CLS] i would like to know when the national archives will begin to work [SEP] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [23]:
' '.join(train_encodings[200].tokens[0:MAX_LENGTH])

'[CLS] night time loading takes some time under the flood ##lights [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [24]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [25]:
train_dataset = MyDataset(train_encodings, train_labels_encoded)
dev_dataset = MyDataset(dev_encodings, dev_labels_encoded)

In [26]:
model = DistilBertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

In [27]:
# Hyperparameters you need to set! 
training_args = TrainingArguments(
    num_train_epochs=1,              # total number of training epochs
    per_device_train_batch_size=50,  # batch size per device during training
    per_device_eval_batch_size=50,   # batch size for evaluation
    learning_rate=1e-5,              # initial learning rate for Adam optimizer
    output_dir='./results',          # output directory
    logging_steps=10,               # number of steps to output logging (set lower because of small dataset size)
    evaluation_strategy='steps',     # evaluate during fine-tuning so that we can see progress
)

In [28]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(axis=-1)
    acc = accuracy_score(labels, preds)
    return {
      'accuracy': acc,
    }

In [29]:
trainer = Trainer(
    model=model,                         # the instantiated ü§ó Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=dev_dataset,           # evaluation dataset (usually a validation set; here we just send our test set)
    compute_metrics=compute_metrics      # our custom evaluation function 
)

In [30]:
trainer.train()

  item['labels'] = torch.tensor(self.labels[idx])


Step,Training Loss,Validation Loss,Accuracy
10,0.7021,0.679824,0.592305
20,0.6781,0.664889,0.592693
30,0.6672,0.652154,0.592693
40,0.665,0.642443,0.688302
50,0.6384,0.61392,0.691411
60,0.5979,0.588133,0.687136
70,0.5886,0.564596,0.73766
80,0.547,0.548516,0.741935
90,0.5616,0.533032,0.753206
100,0.5628,0.531821,0.746211


  item['labels'] = torch.tensor(self.labels[idx])
  item['labels'] = torch.tensor(self.labels[idx])
  item['labels'] = torch.tensor(self.labels[idx])
  item['labels'] = torch.tensor(self.labels[idx])
  item['labels'] = torch.tensor(self.labels[idx])
  item['labels'] = torch.tensor(self.labels[idx])
  item['labels'] = torch.tensor(self.labels[idx])
  item['labels'] = torch.tensor(self.labels[idx])
  item['labels'] = torch.tensor(self.labels[idx])
  item['labels'] = torch.tensor(self.labels[idx])
  item['labels'] = torch.tensor(self.labels[idx])
  item['labels'] = torch.tensor(self.labels[idx])
  item['labels'] = torch.tensor(self.labels[idx])
  item['labels'] = torch.tensor(self.labels[idx])
  item['labels'] = torch.tensor(self.labels[idx])
  item['labels'] = torch.tensor(self.labels[idx])
  item['labels'] = torch.tensor(self.labels[idx])
  item['labels'] = torch.tensor(self.labels[idx])
  item['labels'] = torch.tensor(self.labels[idx])
  item['labels'] = torch.tensor(self.labels[idx])


  item['labels'] = torch.tensor(self.labels[idx])
  item['labels'] = torch.tensor(self.labels[idx])
  item['labels'] = torch.tensor(self.labels[idx])
  item['labels'] = torch.tensor(self.labels[idx])
  item['labels'] = torch.tensor(self.labels[idx])
  item['labels'] = torch.tensor(self.labels[idx])
  item['labels'] = torch.tensor(self.labels[idx])
  item['labels'] = torch.tensor(self.labels[idx])
  item['labels'] = torch.tensor(self.labels[idx])
  item['labels'] = torch.tensor(self.labels[idx])
  item['labels'] = torch.tensor(self.labels[idx])
  item['labels'] = torch.tensor(self.labels[idx])
  item['labels'] = torch.tensor(self.labels[idx])
  item['labels'] = torch.tensor(self.labels[idx])
  item['labels'] = torch.tensor(self.labels[idx])
  item['labels'] = torch.tensor(self.labels[idx])
  item['labels'] = torch.tensor(self.labels[idx])


TrainOutput(global_step=421, training_loss=0.5281096048423061, metrics={'train_runtime': 499.8907, 'train_samples_per_second': 42.101, 'train_steps_per_second': 0.842, 'total_flos': 108902690316960.0, 'train_loss': 0.5281096048423061, 'epoch': 1.0})

In [31]:
trainer.evaluate()

  item['labels'] = torch.tensor(self.labels[idx])


{'eval_loss': 0.4891909658908844,
 'eval_accuracy': 0.7691410804508356,
 'eval_runtime': 7.294,
 'eval_samples_per_second': 352.754,
 'eval_steps_per_second': 7.129,
 'epoch': 1.0}