import re 
import nltk


# Sample passage to tokenize; note the embedded newlines and the double space
# after "York.".
text = (
    "Good muffins cost $303.8\n"
    "in New York.  Please buy me\n"
    "two of them. Thanks!!"
)

# Expected token *types* for the passage above. "." occurs twice in the text
# but, being a set member, it is counted once (16 distinct types in total).
gold_standard = {
    'Good', 'muffins', 'cost', '$303.8',
    'in', 'New', 'York', '.',
    'Please', 'buy', 'me',
    'two', 'of', 'them', 'Thanks', '!!',
}


def score(regex, text, gold_standard):
    """Print precision and recall of a regex tokenizer, measured over word types.

    The text is tokenized with ``nltk.regexp_tokenize``; the resulting tokens
    and the gold standard are then compared as sets, so a token that occurs
    several times is counted only once.

    Args:
        regex: Pattern handed to ``nltk.regexp_tokenize``.
        text: String to tokenize.
        gold_standard: Iterable of the expected tokens (duplicates are ignored).

    Returns:
        None. The tokens, the confusion counts, precision and recall are
        printed. Precision is reported as 0.0 when the regex yields no tokens,
        and recall as 0.0 when the gold standard is empty, instead of raising
        ``ZeroDivisionError``.
    """
    tokens = nltk.regexp_tokenize(text, regex)  # split text into substrings matching regex
    print('regex: ', regex)
    print('regex returns: ', tokens)

    guesses = set(tokens)
    # Coerce to a set so the set differences below also work for a list/tuple.
    gold = set(gold_standard)

    tp = len(guesses & gold)   # types the regex found that are expected
    fp = len(guesses - gold)   # types the regex found that are not expected
    fn = len(gold - guesses)   # expected types the regex missed

    # Guard the denominators: tp + fp == 0 means nothing was guessed,
    # tp + fn == 0 means nothing was expected.
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0

    print('---'*10)
    print('Total word types from regex =', len(guesses))
    print('True positives =', tp)
    print('False positives=', fp)
    print('False negatives=', fn)
    print('Precision =', precision)
    print('Recall =', recall)


# Baseline: \w+ matches runs of one or more word characters (letters, digits
# and underscore). Punctuation and symbols such as "$", "." and "!" are not
# word characters, so they are never captured and they split tokens like
# "$303.8" apart.
regex = r"\w+"
score(regex, text, gold_standard)

regex:  \w+
regex returns:  ['Good', 'muffins', 'cost', '303', '8', 'in', 'New', 'York', 'Please', 'buy', 'me', 'two', 'of', 'them', 'Thanks']
------------------------------
Total word types from regex = 15
True positives = 13
False positives= 2
False negatives= 3
Precision = 0.8666666666666667
Recall = 0.8125


# Three alternatives, tried in this order at each position:
#   [A-Za-z]+   a run of ASCII letters (ordinary words)
#   \$[\d\.]+   a dollar sign followed by digits/dots (a price such as $303.8)
#   \S+         fallback: any run of non-whitespace (punctuation like "." or "!!")
regex = r"[A-Za-z]+|\$[\d\.]+|\S+"
score(regex, text, gold_standard)

regex:  [A-Za-z]+|\$[\d\.]+|\S+
regex returns:  ['Good', 'muffins', 'cost', '$303.8', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '!!']
------------------------------
Total word types from regex = 16
True positives = 16
False positives= 0
False negatives= 0
Precision = 1.0
Recall = 1.0


# The text is scanned left to right and, at each position, the alternatives
# are tried in order. "dog" cannot match at position 0, so "cat" is still the
# first match even though "dog" is listed first in the pattern.
re.findall(r"dog|cat", "catdogcat")

['cat', 'dog', 'cat']


# "cat" and "dog" can never both match at the same position, so the order in
# which they are listed in the pattern has no effect on the result here.
re.findall(r"cat|dog", "catdogcat")

['cat', 'dog', 'cat']


# When several alternatives can match at the same position, the left-most one
# in the pattern wins. [A-Za-z]+ is listed first, so it takes "Good" and the
# \S+ fallback only picks up the remaining "!!!".
text = "Good!!!"
regex = r"[A-Za-z]+|\$[\d\.]+|\S+"
re.findall(regex, text)

['Good', '!!!']


# Here the \S+ catch-all is listed first: it matches at position 0 and
# consumes the whole non-whitespace run "Good!!!", so the more specific
# alternatives after it never get a chance to match.
text = "Good!!!"
regex = r"\S+|[A-Za-z]+|\$[\d\.]+"
re.findall(regex, text)

['Good!!!']