Cliff Whitworth

NGrams
import nltk
import random

# Build a lookup table from every run of n consecutive tokens (joined with a
# single space) to the list of tokens that immediately follow that run in the
# corpus. A follower is recorded once per occurrence, so the lists may contain
# duplicates.
n = 2
ngrams = {}
words = nltk.word_tokenize(text)
# words[n:] holds exactly the tokens that have a full n-token prefix; the
# follower at position `start` in that slice is preceded by words[start:start+n].
for start, follower in enumerate(words[n:]):
    gram = ' '.join(words[start:start + n])
    ngrams.setdefault(gram, []).append(follower)


# Generate text by walking the n-gram table, starting from the first n tokens
# of the corpus and adding at most 100 more tokens.
#
# At each step one of the tokens recorded after the current n-gram is picked
# at random, appended to the output, and the n-token window slides forward by
# one. The walk stops early when the current n-gram has no recorded follower.
#
# Outputs: `tokenized_result` (the generated tokens as a list) and `result`
# (the same tokens joined with single spaces).
tokenized_result = list(words[0:n])
currentGram = ' '.join(tokenized_result)
for _ in range(100):
    possibilities = ngrams.get(currentGram)
    if not possibilities:
        break

    nextItem = random.choice(possibilities)
    tokenized_result.append(nextItem)
    # Slide the window over the generated tokens themselves. The previous
    # code sliced with an undefined name (`rWords`) and the corpus length,
    # and re-tokenized the whole output string on every iteration; keeping
    # the tokens in a list avoids both problems and guarantees the key is
    # built from the same tokens that were used to build `ngrams`.
    currentGram = ' '.join(tokenized_result[len(tokenized_result) - n:])
result = ' '.join(tokenized_result)
Notes

Recent Notes

NGrams