Cliff Whitworth

RNN NLP

Shoutout

Sequence Tokens

from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D, LSTM, Embedding

# Turn raw text into integer token sequences.
# The vocabulary is learned from the training texts only; the tokenizer is
# configured with num_words=20000, which limits how many of the most
# frequent words are kept when texts are converted to sequences.
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(X_train)

# Encode both splits with the same fitted tokenizer (train first, then test).
seq_train, seq_test = map(tokenizer.texts_to_sequences, (X_train, X_test))

Pad sequences for equal length

# Pad every training sequence to the length of the longest one
# (pad_sequences defaults, written out explicitly: pad and truncate at the
# front).
pad_train = pad_sequences(sequences=seq_train, padding='pre', truncating='pre')

# Force the test set to the same width as the training matrix so both can be
# fed to a model with a fixed input length.
pad_test = pad_sequences(
    sequences=seq_test,
    maxlen=pad_train.shape[1],
    padding='pre',
    truncating='pre',
)

Create model

# Build the model: Embedding -> LSTM -> global max-pool over time -> sigmoid.
T = pad_train.shape[1]  # padded sequence length (time steps per sample)

# Largest token index the network can receive. word_index lists every word
# seen while fitting, but when num_words is set texts_to_sequences only emits
# indices below num_words, so sizing the embedding from word_index alone
# allocates rows that are never looked up (and never trained).
V = len(tokenizer.word_index)
if tokenizer.num_words:
    V = min(V, tokenizer.num_words - 1)

D = 20  # embedding dimension
M = 15  # LSTM units

i = Input(shape=(T,))
x = Embedding(V + 1, D)(i)  # +1: token indices start at 1, 0 is the pad value
x = LSTM(M, return_sequences=True)(x)  # keep every time step for the pooling
x = GlobalMaxPooling1D()(x)
x = Dense(1, activation='sigmoid')(x)  # single probability for binary target

model = Model(i, x)
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'])

Train model

# Fit for 10 epochs, evaluating on the held-out test split after each epoch.
# The returned History object is kept in `r`.
r = model.fit(
    x=pad_train,
    y=y_train,
    epochs=10,
    validation_data=(pad_test, y_test),
)

Make predictions

# Evaluate on the test split: confusion matrix and per-class report.
from sklearn.metrics import classification_report, confusion_matrix

# Round the model outputs to the nearest integer to obtain hard labels.
predictions = model.predict(pad_test).round()

cm = confusion_matrix(y_test, predictions)
cr = classification_report(y_test, predictions)

# Matrix first, a blank line, then the report.
print(cm, cr, sep="\n\n")

Enumerate predictions to find samples classified as 1

# First 20 (row index, prediction) pairs whose rounded prediction equals 1.
[
    (row, label)
    for row, label in enumerate(predictions)
    if label == 1
][:20]

Make single prediction

# Predict a single test sample. Slicing d:d+1 (rather than indexing with d)
# keeps the leading batch axis that model.predict is given elsewhere.
d = 185
print(
    model.predict(pad_test[d:d + 1]).round()
)

Notes

Recent Notes

RNN NLP