import re
import nltk
# Sample passage to tokenize; the embedded newlines are part of the input.
text = (
    "Good muffins cost $303.8\n"
    "in New York. Please buy me\n"
    "two of them. Thanks!!"
)

# Expected word *types* for the passage. '.' occurs twice in the text but,
# being a set member, counts only once (16 distinct types in total).
gold_standard = {
    'Good', 'muffins', 'cost', '$303.8', 'in', 'New', 'York', '.',
    'Please', 'buy', 'me', 'two', 'of', 'them', 'Thanks', '!!',
}
def score(regex, text, gold_standard):
    """Print precision and recall of a tokenizing regex, measured over word types.

    The text is tokenized with ``nltk.regexp_tokenize`` and the resulting
    tokens are reduced to a set, so every distinct token (word *type*) is
    counted once no matter how often it occurs.

    Args:
        regex: Regular-expression pattern describing a single token.
        text: The string to tokenize.
        gold_standard: Collection of the expected word types. Any iterable
            of strings is accepted; it is converted to a set internally.

    Returns:
        None. The raw tokens, the counts and the two metrics are printed.
        Precision is reported as 0.0 when the regex produces no tokens, and
        recall as 0.0 when the gold standard is empty, instead of raising
        ZeroDivisionError.
    """
    tokens = nltk.regexp_tokenize(text, regex)  # split the text into substrings using the regex
    print('regex: ', regex)
    print('regex returns: ', tokens)

    guesses = set(tokens)
    # Set difference below needs a real set, so accept lists/tuples as well.
    gold = set(gold_standard)

    tp = len(guesses & gold)   # types the regex found that are expected
    fp = len(guesses - gold)   # types the regex found that are not expected
    fn = len(gold - guesses)   # expected types the regex missed

    # Guard the degenerate cases where a denominator would be zero.
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0

    print('---'*10)
    print('Total word types from regex =', len(guesses))
    print('True positives =', tp)
    print('False positives=', fp)
    print('False negatives=', fn)
    print('Precision =', precision)
    print('Recall =', recall)
# Baseline: treat every run of word characters as one token.
regex = r"\w+"  # \w matches letters, digits and underscore; + means one or more times
score(regex, text, gold_standard)
regex: \w+
regex returns: ['Good', 'muffins', 'cost', '303', '8', 'in', 'New', 'York', 'Please', 'buy', 'me', 'two', 'of', 'them', 'Thanks']
------------------------------
Total word types from regex = 15
True positives = 13
False positives= 2
False negatives= 3
Precision = 0.8666666666666667
Recall = 0.8125
# Three alternatives, tried left to right at each position:
#   [A-Za-z]+   a run of ASCII letters (a word)
#   \$[\d\.]+   a dollar sign followed by digits and/or periods (a price)
#   \S+         any other run of non-whitespace (punctuation such as '.' or '!!')
regex = r"[A-Za-z]+|\$[\d\.]+|\S+"
score(regex, text, gold_standard)
regex: [A-Za-z]+|\$[\d\.]+|\S+
regex returns: ['Good', 'muffins', 'cost', '$303.8', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '!!']
------------------------------
Total word types from regex = 16
True positives = 16
False positives= 0
False negatives= 0
Precision = 1.0
Recall = 1.0
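Examine how Python's regular expression library (`re`) handles disjunctions (alternation with `|`): at each position in the string the alternatives are tried from left to right, and the first one that matches is used.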
# Matches come back in the order they occur in the string, not in the order
# the alternatives are written in the pattern.
re.findall(r"dog|cat", "catdogcat")
['cat', 'dog', 'cat']
# Swapping the alternatives gives the same result here, because 'cat' and
# 'dog' can never both match at the same position.
re.findall(r"cat|dog", "catdogcat")
['cat', 'dog', 'cat']
# At each position the alternatives are tried left to right and the first one
# that matches is used. With [A-Za-z]+ listed first, 'Good' is matched as a
# word, and only the remaining '!!!' falls through to \S+.
text = "Good!!!"
regex = r"[A-Za-z]+|\$[\d\.]+|\S+"
re.findall(regex, text)
['Good', '!!!']
# Alternatives are tried left to right, so order matters when they overlap:
# with the catch-all \S+ listed first it consumes the whole non-whitespace
# run 'Good!!!', and the more specific alternatives are never reached.
text = "Good!!!"
regex = r"\S+|[A-Za-z]+|\$[\d\.]+"
re.findall(regex, text)
['Good!!!']