Coding tips¶

Underflow example¶

In [1]:
a = 1e-10
b = 1e-90
c = 1e-30
d = 5e-130
e = 1e-40
f = 1e-100
a*b*c*d*e*f
Out[1]:
0.0
In [2]:
import numpy as np 
np.log(a) + np.log(b) + np.log(c) + np.log(d) + np.log(e) + np.log(f)
Out[2]:
-919.4245992851843
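The six tiny probabilities above multiply to something smaller than the smallest positive float, so the product underflows to 0.0, while the sum of their logs stays finite. A minimal sketch of why this matters (the second list of probabilities below is made up for illustration): two products that both underflow can still be compared via their log sums.

# Both products underflow to 0.0, so comparing them directly tells us nothing
probs_a = [1e-10, 1e-90, 1e-30, 5e-130, 1e-40, 1e-100]
probs_b = [1e-20, 1e-80, 1e-35, 1e-120, 1e-45, 1e-95]
print(np.prod(probs_a), np.prod(probs_b))   # 0.0 0.0

# In log space the sums stay finite and remain comparable
log_a = np.sum(np.log(probs_a))   # ~ -919.4
log_b = np.sum(np.log(probs_b))   # ~ -909.5
print(log_b > log_a)              # True: product b is the larger one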

Type hints¶

  • Type hints statically indicate the types of values within Python code
  • We use them to add type information to functions
  • At runtime, type hints aren't enforced; they exist for documentation and to help other developers
In [3]:
# Below, we annotate the name argument as being a str type 
# -> indicates the type of the output of the function, 
# in this case also a string
def greet(name: str) -> str: 
    return "Hello"+ name 
In [4]:
from typing import List, Tuple
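These imports let us annotate container types. A small illustrative example (the function name here is hypothetical, just to show the syntax):

# Hypothetical example: List[...] and Tuple[...] annotate what a container holds
def first_bigram(toks: List[str]) -> Tuple[str, str]:
    return (toks[0], toks[1])

first_bigram(["the", "dog", "ate"])   # ('the', 'dog')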

Defaultdict and Counter¶

These were in the Python tutorial in HW0, but as a reminder, we'll use them a lot in the weeks to come.

In [5]:
from collections import defaultdict, Counter 
In [6]:
# defaultdict is a special dictionary that returns a default value
# when a queried key isn't found.
adict = defaultdict(int)
adict['cat'] = 5
print(adict['cat'])
print(adict['dog'])
5
0
In [7]:
# A Counter is a dictionary with a default value of 0 
counter1 = Counter()
counter1['t'] = 10
counter1['t'] += 1
counter1['e'] += 1
print(counter1)
Counter({'t': 11, 'e': 1})
In [8]:
# Below, we'll implement our language model as a defaultdict of Counters
lm = defaultdict(Counter) 
# keys: previous N-1 words 
# values: Counter dictionaries 
    #keys: w_n
    #values: count of times you saw w_n with the previous N-1 words 
    
#trigram language model 
lm[('the', 'dog')]['ate'] += 1 
lm[('the', 'dog')]['slept'] += 1
lm[('the', 'dog')]['ate'] += 1 
In [9]:
lm[('the', 'dog')]['ate']
Out[9]:
2
In [10]:
# helpful because you don't get a KeyError for keys that don't exist 
lm[('the', 'dog')]['drank']
Out[10]:
0

(Unsmoothed) Ngram language model¶

In [11]:
import re 
import random 

1. Load and preprocess¶

In [12]:
# Downloading the complete works of William Shakespeare 
#https://www.kaggle.com/datasets/kewagbln/shakespeareonline
In [13]:
shakespeare_text = open("t8.shakespeare-remove-preamble.txt").read()
In [14]:
type(shakespeare_text)
Out[14]:
str
In [15]:
#look at some of the text  
print(shakespeare_text[20000:21000])
xpense of many a vanished sight.
  Then can I grieve at grievances foregone,
  And heavily from woe to woe tell o'er
  The sad account of fore-bemoaned moan,
  Which I new pay as if not paid before.
    But if the while I think on thee (dear friend)
    All losses are restored, and sorrows end.


                     31  
  Thy bosom is endeared with all hearts,
  Which I by lacking have supposed dead,
  And there reigns love and all love's loving parts,
  And all those friends which I thought buried.
  How many a holy and obsequious tear
  Hath dear religious love stol'n from mine eye,
  As interest of the dead, which now appear,
  But things removed that hidden in thee lie.
  Thou art the grave where buried love doth live,
  Hung with the trophies of my lovers gone,
  Who all their parts of me to thee did give,
  That due of many, now is thine alone.
    Their images I loved, I view in thee,
    And thou (all they) hast all the all of me.


                     32
  If thou survive m
In [16]:
# Simple tokenization: split on any run of non-word characters (\W+)
tokens = re.split(r"\W+", shakespeare_text.lower())
In [17]:
print('Total number of tokens=', len(tokens))
print('Total number of word types=', len(set(tokens)))
Total number of tokens= 927705
Total number of word types= 23724
In [18]:
#look at some of the tokens
tokens[105:115]
Out[18]:
['else', 'this', 'glutton', 'be', 'to', 'eat', 'the', 'world', 's', 'due']
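Note that splitting on \W+ also breaks apostrophes apart (which is why 'world', 's' appears above) and can leave empty strings at the edges of the text. A quick illustration:

# \W+ splits on apostrophes and punctuation; a trailing period
# leaves an empty string at the end of the result
re.split(r"\W+", "the world's due.")
# ['the', 'world', 's', 'due', '']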

2. Create n-grams¶

In [19]:
def create_ngrams(toks: List[str], N: int) -> List[tuple]:
    """
    Iterate through the tokens in the order they appear in the corpus,
    using a sliding window of size N to create the n-grams.
    
    Returns: list of tuples of n-grams
    
    Example bigram (N=2) output: 
        [('else', 'this'),
         ('this', 'glutton'),
         ('glutton', 'be'),
         ('be', 'to'),
         ('to', 'eat'),]
    """
    all_ngrams = []
    for i in range(len(toks)-N+1): 
        ngram = toks[i:i+N]
        all_ngrams.append(tuple(ngram))
    return all_ngrams
In [20]:
bigrams = create_ngrams(tokens, 2)
bigrams[105:115]
Out[20]:
[('else', 'this'),
 ('this', 'glutton'),
 ('glutton', 'be'),
 ('be', 'to'),
 ('to', 'eat'),
 ('eat', 'the'),
 ('the', 'world'),
 ('world', 's'),
 ('s', 'due'),
 ('due', 'by')]
In [21]:
trigrams = create_ngrams(tokens, 3)
trigrams[105:115]
Out[21]:
[('else', 'this', 'glutton'),
 ('this', 'glutton', 'be'),
 ('glutton', 'be', 'to'),
 ('be', 'to', 'eat'),
 ('to', 'eat', 'the'),
 ('eat', 'the', 'world'),
 ('the', 'world', 's'),
 ('world', 's', 'due'),
 ('s', 'due', 'by'),
 ('due', 'by', 'the')]
In [22]:
def count_ngrams(ngrams: List[tuple]) -> dict:
    """
    Count the occurrence of each unique n-gram 
    
    Returns: Dictionary with 
        keys = ngram as a tuple 
        values = count of those ngrams 
    """
    ngram_counts = Counter()
    for item in ngrams: 
        ngram_counts[item] += 1
    return ngram_counts
In [23]:
bigram_counts = count_ngrams(bigrams)
In [24]:
#look at ranked list
topk = 10
sorted(bigram_counts.items(), key=lambda kv: -kv[1])[0:topk]
Out[24]:
[(('i', 'am'), 1855),
 (('i', 'll'), 1745),
 (('of', 'the'), 1715),
 (('my', 'lord'), 1666),
 (('in', 'the'), 1643),
 (('i', 'have'), 1620),
 (('i', 'will'), 1566),
 (('to', 'the'), 1430),
 (('it', 'is'), 1078),
 (('to', 'be'), 973)]
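As an aside, Counter's built-in most_common method produces the same ranked list without the explicit sort:

# Equivalent to the sorted(...) call above
bigram_counts.most_common(topk)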
In [25]:
trigram_counts = count_ngrams(trigrams)
In [26]:
#look at ranked list
topk = 10
sorted(trigram_counts.items(), key=lambda kv: -kv[1])[0:topk]
Out[26]:
[(('i', 'pray', 'you'), 242),
 (('so', 'long', 'as'), 234),
 (('of', 'the', 'complete'), 219),
 (('the', 'complete', 'works'), 219),
 (('complete', 'works', 'of'), 219),
 (('works', 'of', 'william'), 219),
 (('of', 'william', 'shakespeare'), 219),
 (('this', 'electronic', 'version'), 218),
 (('electronic', 'version', 'of'), 218),
 (('version', 'of', 'the'), 218)]

The remainder of the code is adapted from Yoav Goldberg's character-level language model

In [27]:
# The same computation written out across many lines 
# (a two-line list-comprehension version is commented out at the end)
def probability_over_all_words(word2counts: Counter) -> List[tuple]:
    """
    Input: a Counter with 
        keys = words
        values = counts of that word 
        
    Returns: List of (word, probability)
    
    Example: 
        >>> word2counts = {'too': 1, 'dost': 1, 'resemble': 1, 'grow': 1} 
        >>> probability_over_all_words(word2counts)
        [('too', 0.25), ('dost', 0.25), ('resemble', 0.25), ('grow', 0.25)]
    """
    total_counts = float(sum(word2counts.values()))
    word_probs = []
    for word, count in word2counts.items():
        prob = count/total_counts
        word_probs.append((word, prob))
    return word_probs

    # we can also do this in just two lines via a list comprehension:
    #     total_counts = float(sum(word2counts.values()))
    #     return [(word, count/total_counts) for word, count in word2counts.items()]

Above, list comprehensions (the two-liner version) are much faster than appending to a list in a loop, because they push the loop from the interpreter into compiled C code. See this reference.
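If you want to check the speed claim yourself, here is a rough timing sketch with timeit (numbers vary by machine; the toy Counter and helper names below are made up for illustration):

import timeit

# Toy data just for timing, not part of the language model
toy_counts = Counter({f"w{i}": i + 1 for i in range(10_000)})

def with_loop():
    total = float(sum(toy_counts.values()))
    out = []
    for word, count in toy_counts.items():
        out.append((word, count / total))
    return out

def with_comprehension():
    total = float(sum(toy_counts.values()))
    return [(word, count / total) for word, count in toy_counts.items()]

print(timeit.timeit(with_loop, number=1000))
print(timeit.timeit(with_comprehension, number=1000))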

In [28]:
word2counts = {'too': 1, 'dost': 1, 'resemble': 1, 'grow': 1, 'prove': 1, 'was': 1, 'put': 1, 'are': 1}
probability_over_all_words(word2counts)
Out[28]:
[('too', 0.125),
 ('dost', 0.125),
 ('resemble', 0.125),
 ('grow', 0.125),
 ('prove', 0.125),
 ('was', 0.125),
 ('put', 0.125),
 ('are', 0.125)]
In [29]:
def train_lm(tokens: List, N: int) -> dict:
    """
    Use maximum likelihood estimation to train (learn the probabilities of) 
    the language model.
    
    Returns: Language model dict  
        keys = previous N-1 words 
        values = List of tuples
            first item = word w_n 
            second item = probability w_n given previous N-1 words (keys)
            
    Example (N=3):  
        {('sweet', 'self'):
              [('too', 0.25),
               ('dost', 0.25),
               ('resemble', 0.25),
               ('grow', 0.25)]
        }
    """
    lm = defaultdict(Counter)
    
    # get the counts
    for i in range(len(tokens)-(N-1)):
        previous_words =  tokens[i:i+N-1]
        next_word = tokens[i+N-1]
        lm[tuple(previous_words)][next_word] += 1
        
    # turn counts into probabilities 
    outlm = {previous_words: probability_over_all_words(words2count) for previous_words, words2count in lm.items()}
    return outlm
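As a quick sanity check, here is the trained structure on a toy corpus (the toy tokens below are made up for illustration):

# Toy trigram model: the context ('the', 'dog') is followed by 'ate' and 'slept'
toy_lm = train_lm(["the", "dog", "ate", "the", "dog", "slept"], 3)
toy_lm[("the", "dog")]   # [('ate', 0.5), ('slept', 0.5)]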
In [30]:
N = 4
In [31]:
outlm = train_lm(tokens, N)
In [32]:
type(outlm)
Out[32]:
dict
In [45]:
#just visualize some of the outputs
list(outlm.items())[125:135]
Out[45]:
[(('thy', 'brow', 'and'), [('dig', 1.0)]),
 (('brow', 'and', 'dig'), [('deep', 1.0)]),
 (('and', 'dig', 'deep'), [('trenches', 1.0)]),
 (('dig', 'deep', 'trenches'), [('in', 1.0)]),
 (('deep', 'trenches', 'in'), [('thy', 1.0)]),
 (('trenches', 'in', 'thy'), [('beauty', 1.0)]),
 (('in', 'thy', 'beauty'), [('s', 1.0)]),
 (('thy', 'beauty', 's'),
  [('field', 0.2),
   ('use', 0.2),
   ('legacy', 0.2),
   ('form', 0.2),
   ('image', 0.2)]),
 (('beauty', 's', 'field'), [('thy', 1.0)]),
 (('s', 'field', 'thy'), [('youth', 1.0)])]

3. Generate¶

In [34]:
# random.random() returns a float in the interval [0, 1)
x = random.random()
x
Out[34]:
0.16802632186102107
In [35]:
# Intuition behind how we sample from the following distribution 
distribution = [('world', 0.5), ('skirts',0.2), ('tables',0.3)]

count_chosen = Counter()

# Simulation with 10,000 trials to see if we get roughly the expected proportions
for sim_num in range(10000): 
    random.shuffle(distribution) #shuffle which word we consider first 
    x = random.random() # random float in [0, 1)
    for word_candidate, prob in distribution:
        x = x - prob # higher-probability words are more likely to drive x below 0
        if x <= 0:
            count_chosen[word_candidate] += 1
            break
In [36]:
count_chosen
Out[36]:
Counter({'skirts': 2041, 'world': 4951, 'tables': 3008})
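As an aside, Python's built-in random.choices does weighted sampling directly, so the same simulation can be cross-checked in one line (this is not what the notebook uses below, just a sanity check):

# Weighted sampling with random.choices; counts should come out near 5000/2000/3000
Counter(random.choices(['world', 'skirts', 'tables'], weights=[0.5, 0.2, 0.3], k=10000))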
In [37]:
def generate_word(lm, N, previous_words):
    """
    Generates a single word from the learned language model 
    """
    distribution = lm[tuple(previous_words)]
    random.shuffle(distribution) #shuffle which word we consider first 
    x = random.random() # random float in [0, 1)
    for word_candidate, prob in distribution:
        x = x - prob # higher-probability words are more likely to drive x below 0
        if x <= 0: return word_candidate
        # if the candidate is not accepted, x has already been decremented
In [38]:
# only three learned options for the next word after this context 
outlm[("hath", "in", "the")]
Out[38]:
[('world', 0.3333333333333333),
 ('skirts', 0.3333333333333333),
 ('tables', 0.3333333333333333)]
In [39]:
generate_word(outlm, 4, ["hath", "in", "the"])
Out[39]:
'skirts'
In [40]:
outlm[("the", "tables", "of")]
Out[40]:
[('his', 0.5), ('their', 0.5)]
In [41]:
def generate_text(lm, N, start_grams, nwords=100):
    """
    Generates nwords of text given: 
        - the trained language model (lm)
        - the start grams (the N-1 words to condition on first)
    """
    assert len(start_grams) == N-1
    previous_words = start_grams 
    out = []
    for i in range(nwords):
        generated_word = generate_word(lm, N, previous_words)
        previous_words = previous_words[1:N-1] + [generated_word]
        out.append(generated_word)
    return " ".join(out)
In [42]:
generate_text(outlm, 4, ["hath", "in", "the"], nwords=20)
Out[42]:
'world that caius marcius wears this war s garland in token of the which five hundred were but yesterday dubb'
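Generation is stochastic, so repeated runs give different text. Seeding the RNG (not done in this notebook; the seed value below is arbitrary) makes a run reproducible:

# Fix the random seed so repeated runs generate the same text
random.seed(0)
print(generate_text(outlm, 4, ["hath", "in", "the"], nwords=20))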

Sparsity¶

In [43]:
# How many of the possible bigrams are actually observed? 
vocab = len(set(tokens))
possible_bigrams = vocab**2
actual_bigrams = len(set(bigram_counts.keys()))
print(f"""vocab size = {vocab}
possible bigrams = {possible_bigrams} 
actual bigrams = {actual_bigrams}
% never seen bigrams = {(possible_bigrams-actual_bigrams)/possible_bigrams*100}
""")
vocab size = 23724
possible bigrams = 562828176 
actual bigrams = 346130
% never seen bigrams = 99.93850165738681

In [44]:
# This sparsity motivates Laplace (add-one) smoothing
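As a preview, here is a minimal sketch of add-one (Laplace) smoothing for bigram probabilities. This is an illustration of the idea, not code from this notebook; the function and variable unigram_counts below are introduced just for the sketch, while bigram_counts, tokens, and vocab come from the earlier cells.

# Add-one (Laplace) smoothing sketch: every bigram gets a pseudo-count of 1,
# so unseen bigrams still receive nonzero probability
unigram_counts = Counter(tokens)

def laplace_bigram_prob(w_prev: str, w_next: str) -> float:
    # P(w_next | w_prev) ~ (count(w_prev, w_next) + 1) / (count(w_prev) + V)
    return (bigram_counts[(w_prev, w_next)] + 1) / (unigram_counts[w_prev] + vocab)

laplace_bigram_prob('the', 'dog')        # seen bigram: small but > 0
laplace_bigram_prob('dog', 'glutton')    # likely unseen bigram: still > 0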