# multiplying many tiny probabilities underflows 64-bit floats to 0.0
a = 1e-10
b = 1e-90
c = 1e-30
d = 5e-130
e = 1e-40
f = 1e-100
a*b*c*d*e*f
0.0
# summing logs avoids the underflow: log(a*b*...*f) = log(a) + log(b) + ... + log(f)
import numpy as np
np.log(a) + np.log(b) + np.log(c) + np.log(d) + np.log(e) + np.log(f)
-919.4245992851843
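Even in log-space, the result is mainly useful for comparing probabilities: exponentiating it just underflows again, as this quick check (using the values above) shows.
log_prob = np.log(a) + np.log(b) + np.log(c) + np.log(d) + np.log(e) + np.log(f)
np.exp(log_prob)  # exp(-919.4) is below the smallest positive float64 (~5e-324)
0.0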
# Below, we annotate the name argument as being a str type.
# -> indicates the type of the output of the function,
# in this case also a string.
def greet(name: str) -> str:
    return "Hello " + name
from typing import List, Tuple
These were covered in the Python tutorial in HW0, but as a reminder: we'll use them a lot in the weeks to come.
from collections import defaultdict, Counter
# defaultdict is a special dictionary that returns a default value
# when a queried key isn't found (here int() == 0)
adict = defaultdict(int)
adict['cat'] = 5
print(adict['cat'])
print(adict['dog'])
5
0
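One subtlety: the failed lookup above actually inserted 'dog' into the dictionary with the default value.
list(adict.items())
[('cat', 5), ('dog', 0)]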
# A Counter is a dictionary with a default value of 0
counter1 = Counter()
counter1['t'] = 10
counter1['t'] += 1
counter1['e'] += 1
print(counter1)
Counter({'t': 11, 'e': 1})
# Below, we'll implement our language model as a defaultdict of Counters
lm = defaultdict(Counter)
# outer keys: previous N-1 words (as a tuple)
# outer values: Counter dictionaries with
#   keys: w_n
#   values: count of times you saw w_n after the previous N-1 words
# trigram language model
lm[('the', 'dog')]['ate'] += 1
lm[('the', 'dog')]['slept'] += 1
lm[('the', 'dog')]['ate'] += 1
lm[('the', 'dog')]['ate']
2
# helpful because you don't get errors with keys that don't exist
lm[('the', 'dog')]['drank']
0
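Unlike a defaultdict, a Counter does not insert missing keys on lookup, so 'drank' is not stored.
lm[('the', 'dog')]
Counter({'ate': 2, 'slept': 1})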
import re
import random
# Read in the complete works of William Shakespeare (preamble removed),
# downloaded from https://www.kaggle.com/datasets/kewagbln/shakespeareonline
shakespeare_text = open("t8.shakespeare-remove-preamble.txt").read()
type(shakespeare_text)
str
#look at some of the text
print(shakespeare_text[20000:21000])
xpense of many a vanished sight. Then can I grieve at grievances foregone, And heavily from woe to woe tell o'er The sad account of fore-bemoaned moan, Which I new pay as if not paid before. But if the while I think on thee (dear friend) All losses are restored, and sorrows end. 31 Thy bosom is endeared with all hearts, Which I by lacking have supposed dead, And there reigns love and all love's loving parts, And all those friends which I thought buried. How many a holy and obsequious tear Hath dear religious love stol'n from mine eye, As interest of the dead, which now appear, But things removed that hidden in thee lie. Thou art the grave where buried love doth live, Hung with the trophies of my lovers gone, Who all their parts of me to thee did give, That due of many, now is thine alone. Their images I loved, I view in thee, And thou (all they) hast all the all of me. 32 If thou survive m
# Simple tokenization: just split on any run of non-word characters
tokens = re.split(r"\W+", shakespeare_text.lower())
print('Total number of tokens=', len(tokens))
print('Total number of word types=', len(set(tokens)))
Total number of tokens= 927705
Total number of word types= 23724
#look at some of the tokens
tokens[105:115]
['else', 'this', 'glutton', 'be', 'to', 'eat', 'the', 'world', 's', 'due']
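Note that splitting on \W+ breaks possessives like "world's" into two tokens ('world', 's'), as in the sample above.
re.split(r"\W+", "the world's due")
['the', 'world', 's', 'due']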
def create_ngrams(toks: list, N: int) -> List[tuple]:
    """
    Iterate through the tokens in the order they appear in the corpus.
    Slide a window of size N to create the n-grams.
    Returns: list of tuples of n-grams
    Example bigram (N=2) output:
        [('else', 'this'),
         ('this', 'glutton'),
         ('glutton', 'be'),
         ('be', 'to'),
         ('to', 'eat'),]
    """
    all_ngrams = []
    for i in range(len(toks) - N + 1):
        ngram = toks[i:i+N]
        all_ngrams.append(tuple(ngram))
    return all_ngrams
bigrams = create_ngrams(tokens, 2)
bigrams[105:115]
[('else', 'this'), ('this', 'glutton'), ('glutton', 'be'), ('be', 'to'), ('to', 'eat'), ('eat', 'the'), ('the', 'world'), ('world', 's'), ('s', 'due'), ('due', 'by')]
trigrams = create_ngrams(tokens, 3)
trigrams[105:115]
[('else', 'this', 'glutton'), ('this', 'glutton', 'be'), ('glutton', 'be', 'to'), ('be', 'to', 'eat'), ('to', 'eat', 'the'), ('eat', 'the', 'world'), ('the', 'world', 's'), ('world', 's', 'due'), ('s', 'due', 'by'), ('due', 'by', 'the')]
def count_ngrams(ngrams: List[tuple]) -> Counter:
    """
    Count the occurrence of each unique n-gram.
    Returns: Counter with
        keys = ngram as a tuple
        values = count of that ngram
    """
    ngram_counts = Counter()
    for item in ngrams:
        ngram_counts[item] += 1
    return ngram_counts
bigram_counts = count_ngrams(bigrams)
#look at ranked list
topk = 10
sorted(bigram_counts.items(), key=lambda kv: -kv[1])[0:topk]
[(('i', 'am'), 1855), (('i', 'll'), 1745), (('of', 'the'), 1715), (('my', 'lord'), 1666), (('in', 'the'), 1643), (('i', 'have'), 1620), (('i', 'will'), 1566), (('to', 'the'), 1430), (('it', 'is'), 1078), (('to', 'be'), 973)]
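Counter has a built-in method for this ranking; the sort above is equivalent to:
bigram_counts.most_common(topk)  # same ranked list, no manual sort needed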
trigram_counts = count_ngrams(trigrams)
#look at ranked list
topk = 10
sorted(trigram_counts.items(), key=lambda kv: -kv[1])[0:topk]
[(('i', 'pray', 'you'), 242), (('so', 'long', 'as'), 234), (('of', 'the', 'complete'), 219), (('the', 'complete', 'works'), 219), (('complete', 'works', 'of'), 219), (('works', 'of', 'william'), 219), (('of', 'william', 'shakespeare'), 219), (('this', 'electronic', 'version'), 218), (('electronic', 'version', 'of'), 218), (('version', 'of', 'the'), 218)]
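Notice that several of the top trigrams ('of the complete works', 'this electronic version', ...) appear to come from a licensing header repeated once per work in the source file (hence the counts of 219 and 218), not from Shakespeare's own text.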
The remainder of the code is adapted from Yoav Goldberg's character-level language model.
# The function below spells the computation out across many lines;
# a two-line list-comprehension version follows it
def probability_over_all_words(word2counts: Counter) -> List[tuple]:
    """
    Input: a Counter with
        keys = words
        values = counts of that word
    Returns: List of (word, probability)
    Example:
    >>> word2counts = {'too': 1, 'dost': 1, 'resemble': 1, 'grow': 1}
    >>> probability_over_all_words(word2counts)
    [('too', 0.25), ('dost', 0.25), ('resemble', 0.25), ('grow', 0.25)]
    """
    total_counts = float(sum(word2counts.values()))
    word_probs = []
    for word, count in word2counts.items():
        prob = count / total_counts
        word_probs.append((word, prob))
    return word_probs
# we can also do this in just two lines via a list comprehension:
# total_counts = float(sum(word2counts.values()))
# return [(word, count/total_counts) for word, count in word2counts.items()]
Above, the list comprehension (the two-line version) is much faster than appending to a list in a loop, because it pushes the loop from the interpreter into compiled C code.
word2counts = {'too': 1, 'dost': 1, 'resemble': 1, 'grow': 1, 'prove': 1, 'was': 1, 'put': 1, 'are': 1}
probability_over_all_words(word2counts)
[('too', 0.125), ('dost', 0.125), ('resemble', 0.125), ('grow', 0.125), ('prove', 0.125), ('was', 0.125), ('put', 0.125), ('are', 0.125)]
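As a quick sanity check, the returned probabilities form a valid distribution: they sum to 1.
sum(prob for _, prob in probability_over_all_words(word2counts))
1.0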
def train_lm(tokens: List, N: int) -> dict:
    """
    Use maximum likelihood estimates to train (learn the probabilities of)
    the language model.
    Returns: Language model dict
        keys = previous N-1 words
        values = List of tuples
            first item = word w_n
            second item = probability of w_n given the previous N-1 words (the key)
    Example (N=3):
        {('sweet', 'self'):
            [('too', 0.25),
             ('dost', 0.25),
             ('resemble', 0.25),
             ('grow', 0.25)]
        }
    """
    lm = defaultdict(Counter)
    # get the counts
    for i in range(len(tokens) - (N - 1)):
        previous_words = tokens[i:i+N-1]
        next_word = tokens[i+N-1]
        lm[tuple(previous_words)][next_word] += 1
    # turn counts into probabilities
    outlm = {previous_words: probability_over_all_words(words2count)
             for previous_words, words2count in lm.items()}
    return outlm
N = 4
outlm = train_lm(tokens, N)
type(outlm)
dict
#just visualize some of the outputs
list(outlm.items())[125:135]
[(('thy', 'brow', 'and'), [('dig', 1.0)]), (('brow', 'and', 'dig'), [('deep', 1.0)]), (('and', 'dig', 'deep'), [('trenches', 1.0)]), (('dig', 'deep', 'trenches'), [('in', 1.0)]), (('deep', 'trenches', 'in'), [('thy', 1.0)]), (('trenches', 'in', 'thy'), [('beauty', 1.0)]), (('in', 'thy', 'beauty'), [('s', 1.0)]), (('thy', 'beauty', 's'), [('field', 0.2), ('use', 0.2), ('legacy', 0.2), ('form', 0.2), ('image', 0.2)]), (('beauty', 's', 'field'), [('thy', 1.0)]), (('s', 'field', 'thy'), [('youth', 1.0)])]
# random.random() returns a float in the half-open interval [0.0, 1.0)
x = random.random()
x
0.16802632186102107
# Intuition behind how we sample from the following distribution
distribution = [('world', 0.5), ('skirts', 0.2), ('tables', 0.3)]
count_chosen = Counter()
# Simulation with 10,000 trials; see if we get the expected proportions
for sim_num in range(10000):
    random.shuffle(distribution)  # shuffle which word we consider first
    x = random.random()  # uniform real number in [0, 1)
    for word_candidate, prob in distribution:
        x = x - prob  # higher-probability words are more likely to push x below 0
        if x <= 0:
            count_chosen[word_candidate] += 1
            break
count_chosen
Counter({'skirts': 2041, 'world': 4951, 'tables': 3008})
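With probabilities 0.5, 0.2, and 0.3, we expect roughly 5000, 2000, and 3000 selections over 10,000 trials, which matches the simulation above.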
def generate_word(lm, N, previous_words):
    """
    Generates a single word from the learned language model.
    """
    distribution = lm[tuple(previous_words)]
    random.shuffle(distribution)  # shuffle (in place) which word we consider first
    x = random.random()  # uniform real number in [0, 1)
    for word_candidate, prob in distribution:
        x = x - prob  # higher-probability words are more likely to push x below 0
        if x <= 0:
            return word_candidate
        # if the candidate is not accepted, x has already been decremented
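For reference, the standard library can do this weighted sampling in one call. A minimal alternative sketch, assuming the same lm structure as above (generate_word_choices is an illustrative name, not used elsewhere):
# equivalent sampler using random.choices (Python 3.6+)
def generate_word_choices(lm, N, previous_words):
    words, probs = zip(*lm[tuple(previous_words)])
    return random.choices(words, weights=probs, k=1)[0]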
# only three continuations were learned for this starting context
outlm[("hath", "in", "the")]
[('world', 0.3333333333333333), ('skirts', 0.3333333333333333), ('tables', 0.3333333333333333)]
generate_word(outlm, 4, ["hath", "in", "the"])
'skirts'
outlm[("the", "tables", "of")]
[('his', 0.5), ('their', 0.5)]
def generate_text(lm, N, start_grams, nwords=100):
    """
    Generates nwords of text given:
    - the trained language model (lm)
    - the start grams
    """
    assert len(start_grams) == N-1
    previous_words = start_grams
    out = []
    for i in range(nwords):
        generated_word = generate_word(lm, N, previous_words)
        # slide the context window: drop the oldest word, append the new one
        previous_words = previous_words[1:N-1] + [generated_word]
        out.append(generated_word)
    return " ".join(out)
generate_text(outlm, 4, ["hath", "in", "the"], nwords=20)
'world that caius marcius wears this war s garland in token of the which five hundred were but yesterday dubb'
# How many bigrams were actually produced?
vocab = len(set(tokens))
possible_bigrams = vocab**2
actual_bigrams = len(set(bigram_counts.keys()))
print(f"""vocab size = {vocab}
possible bigrams = {possible_bigrams}
actual bigrams = {actual_bigrams}
% never seen bigrams = {(possible_bigrams-actual_bigrams)/possible_bigrams*100}
""")
vocab size = 23724
possible bigrams = 562828176
actual bigrams = 346130
% never seen bigrams = 99.93850165738681
# Motivates using Laplace smoothing
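A minimal sketch of what add-one (Laplace) smoothing could look like for bigram probabilities, assuming bigram_counts, tokens, and vocab from above (laplace_bigram_prob is an illustrative name, not part of the original code):
# add 1 to every bigram count and V to the denominator so that
# unseen bigrams receive a small but nonzero probability
unigram_counts = Counter(tokens)

def laplace_bigram_prob(w_prev, w):
    return (bigram_counts[(w_prev, w)] + 1) / (unigram_counts[w_prev] + vocab)

laplace_bigram_prob('of', 'the')        # frequent bigram, slightly discounted
laplace_bigram_prob('of', 'xylophone')  # unseen bigram, small but nonzero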