Cliff Whitworth

spaCy Matcher

import nltk
import spacy
from spacy.matcher import Matcher

# Load the small English pipeline. Only POS tags are needed for the token
# pattern below, so the named-entity recognizer is switched off ('textcat' is
# listed as well in case the loaded model ships one).
nlp = spacy.load('en_core_web_sm', disable=['ner', 'textcat'])
text = '''This was done on the third of July. 
        The third is the day before the fourth of July. 
        The day after the fourth is the fifth of July.
        Next month will be the third of August.'''

# Token pattern: a noun or adjective (e.g. "third"), followed by an
# adposition (e.g. "of"), followed by the literal word "july" in any casing.
pattern = [
    {'POS': {'IN': ['NOUN', 'ADJ']}},
    {'POS': 'ADP'},
    {'LOWER': 'july'},
]

matcher = Matcher(nlp.vocab)

# Matcher.add(key, patterns): the second argument is a LIST of patterns.
# This form is accepted by spaCy >= 2.2.2 and is the only form accepted by
# spaCy 3.x (the older `add(key, None, pattern)` call raises there).
matcher.add('FindDayOfJuly', [pattern])

# Split the text into sentences with NLTK so each one is matched separately
tokenized_sentence = nltk.sent_tokenize(text)

for sentence in tokenized_sentence:
    doc = nlp(sentence)  # Run the spaCy pipeline on this sentence
    matches = matcher(doc)
    words = nltk.word_tokenize(sentence)  # Alt to see word tags in sentence
    # https://stackoverflow.com/questions/29332851/what-does-nn-vbd-in-dt-nns-rb-means-in-nltk
    print(nltk.pos_tag(words))
    # Each match is (match_id, start, end); match_id is the hash of the key
    # passed to matcher.add and is resolved back to text via the vocab.
    for match_id, start, end in matches:
        string_id = nlp.vocab.strings[match_id]
        print(f'\n{string_id}: {doc[start:end]}\n')

spaCy PhraseMatcher

import bs4 as bs
import urllib.request
import re
import spacy
nlp = spacy.load('en_core_web_sm')

from spacy.matcher import PhraseMatcher
pmatcher = PhraseMatcher(nlp.vocab)

# Download the article. The context manager closes the HTTP response even if
# reading fails, instead of leaving the connection open.
with urllib.request.urlopen('https://en.wikipedia.org/wiki/Alice%27s_Adventures_in_Wonderland') as response:
    source = response.read()
soup = bs.BeautifulSoup(source, 'lxml')

# Concatenate the text of every <p> element in one pass
text = ''.join(paragraph.text for paragraph in soup.find_all('p'))

# Lower-case the text, replace numeric citation markers such as "[12]" with a
# space, then collapse every run of whitespace (newlines included) to a
# single space.
text = re.sub(r'\[[0-9]+\]', ' ', text.lower()).strip()
text = re.sub(r'\s+', ' ', text)

txt = nlp(text)

# Phrases are lower-case because the article text was lower-cased above.
phrase_list = ['white rabbit', 'blue caterpillar', 'mock turtle', 'march hare', 'cheshire cat']
# Patterns only need tokenizing, so make_doc is used rather than running the
# whole pipeline on each phrase.
phrase_patterns = [nlp.make_doc(p) for p in phrase_list]
# PhraseMatcher.add(key, docs): the second argument is a LIST of Doc patterns.
# This form is accepted by spaCy >= 2.2.2 and is the only form accepted by
# spaCy 3.x (the older `add(key, None, *docs)` call raises there).
pmatcher.add('AliceWonders', phrase_patterns)
matches_found = pmatcher(txt)

# Record the text of every match and print its token offsets
matches_list = []
for _match_id, start, end in matches_found:
    span = txt[start:end]
    matches_list.append(span.text)
    print(start, end, span.text)

print()
sentences = list(txt.sents)
# The same phrase is usually matched many times; de-duplicate so that each
# sentence is scanned once per distinct phrase.
matched_phrases = set(matches_list)
for s in sentences:
    sentence_text = s.text
    # Print each sentence at most once if it contains any matched phrase
    if any(phrase in sentence_text for phrase in matched_phrases):
        print(sentence_text.strip())

Notes

Recent Notes

spaCy Matcher