In [1]:
import re 
import nltk
In [2]:
# Sample passage to tokenize; the line breaks are embedded as "\n" escapes.
text = (
    "Good muffins cost $303.8\n"
    "in New York.  Please buy me\n"
    "two of them. Thanks!!"
)
In [3]:
# Reference word types for the sample text. '.' occurs twice as a token in the
# text but counts once as a type, so the set holds 16 entries.
gold_standard = {
    'Good', 'muffins', 'cost', '$303.8',
    'in', 'New', 'York', '.',
    'Please', 'buy', 'me',
    'two', 'of', 'them', 'Thanks', '!!',
}
In [4]:
def score(regex, text, gold_standard):
    """Print precision and recall of a tokenizing regex, measured over word types.

    The text is tokenized with ``nltk.regexp_tokenize`` and the tokens are
    collapsed into a set, so each distinct token (word type) counts once no
    matter how many times it occurs.

    Parameters
    ----------
    regex : str
        Pattern handed to ``nltk.regexp_tokenize``.
    text : str
        Text to tokenize.
    gold_standard : iterable of str
        Reference word types. Converted to a set before comparison, so a
        list or tuple is accepted as well as a set.

    Returns
    -------
    None
        The report is written to stdout.
    """
    tokens = nltk.regexp_tokenize(text, regex)  # splits a string into substrings using a regex
    print('regex: ', regex)
    print('regex returns: ', tokens)

    guesses = set(tokens)
    gold = set(gold_standard)  # set difference below requires a set, not a list
    tp = len(guesses & gold)
    fp = len(guesses - gold)
    fn = len(gold - guesses)

    # A regex that matches nothing (or an empty gold standard) makes the ratio
    # 0/0; report 0.0 rather than raising ZeroDivisionError.
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0

    print('---'*10)
    print('Total word types from regex =', len(guesses))
    print('True positives =', tp)
    print('False positives=', fp)
    print('False negatives=', fn)
    print('Precision =', precision)
    print('Recall =', recall)
In [5]:
# Baseline tokenizer: \w+ takes maximal runs of word characters
# (letters, digits and underscore), so punctuation and '$' are dropped.
regex = r"\w+"
score(regex=regex, text=text, gold_standard=gold_standard)
regex:  \w+
regex returns:  ['Good', 'muffins', 'cost', '303', '8', 'in', 'New', 'York', 'Please', 'buy', 'me', 'two', 'of', 'them', 'Thanks']
------------------------------
Total word types from regex = 15
True positives = 13
False positives= 2
False negatives= 3
Precision = 0.8666666666666667
Recall = 0.8125
In [6]:
# Three alternatives, tried in this order at each position:
#   [A-Za-z]+   a run of ASCII letters
#   \$[\d\.]+   a dollar sign followed by digits and/or dots
#   \S+         any other run of non-whitespace characters
regex = r"[A-Za-z]+|\$[\d\.]+|\S+" 
score(regex=regex, text=text, gold_standard=gold_standard)
regex:  [A-Za-z]+|\$[\d\.]+|\S+
regex returns:  ['Good', 'muffins', 'cost', '$303.8', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '!!']
------------------------------
Total word types from regex = 16
True positives = 16
False positives= 0
False negatives= 0
Precision = 1.0
Recall = 1.0

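Examine how Python's regular expression library (`re`) handles disjunctions (`|`): at each position in the string the alternatives are tried from left to right, and the first one that matches is used, even if a later alternative would match a longer span.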

In [7]:
# Matches are reported in the order they occur in the string,
# not in the order the alternatives are written in the pattern.
re.compile(r"dog|cat").findall("catdogcat")
Out[7]:
['cat', 'dog', 'cat']
In [8]:
# "cat" and "dog" can never both match at the same position, so listing
# "cat" first in the pattern does not change which substrings are found.
re.compile(r"cat|dog").findall("catdogcat")
Out[8]:
['cat', 'dog', 'cat']
In [9]:
# At each position the alternatives are tried left to right and the first
# that matches wins. [A-Za-z]+ is listed first, so it consumes "Good" and
# stops at "!"; the remaining "!!!" is then picked up by \S+.
text = "Good!!!"
regex = r"[A-Za-z]+|\$[\d\.]+|\S+"
re.compile(regex).findall(text)
Out[9]:
['Good', '!!!']
In [10]:
# Same alternatives with \S+ moved to the front: it is tried first and
# matches the entire string from position 0, so the more specific
# alternatives after it never get a chance to split off "Good".
text = "Good!!!"
regex = r"\S+|[A-Za-z]+|\$[\d\.]+"
re.compile(regex).findall(text)
Out[10]:
['Good!!!']