RNN NLP
Sequence Tokens
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D, LSTM, Embedding
# Tokenize: learn the word -> integer index mapping from the training texts
# only (vocabulary capped via num_words=20000), then encode both splits as
# lists of integer indices using that same mapping.
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(X_train)
seq_train, seq_test = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)
Pad sequences for equal length
# Pad the training sequences to a common length (no maxlen given, so the
# width is whatever pad_sequences picks for this data), then force the test
# sequences to that same width so both arrays fit the same model input.
pad_train = pad_sequences(seq_train)
train_width = pad_train.shape[1]
pad_test = pad_sequences(seq_test, maxlen=train_width)
Create model
# Create the model: Embedding -> LSTM -> global max pool over time -> sigmoid.
T = pad_train.shape[1]  # sequence length: number of columns in the padded training array
V = len(tokenizer.word_index)  # number of distinct words seen while fitting the tokenizer
D = 20  # embedding dimensionality (size of each word vector)
M = 15  # LSTM units

# Size the embedding table by the indices the tokenizer can actually emit.
# word_index holds every word seen during fitting, but when num_words is set
# texts_to_sequences drops any index >= num_words, so rows beyond that cap
# would never be looked up. With no cap (num_words falsy) fall back to V + 1;
# the +1 accounts for index 0, which is used for padding rather than a word.
vocab_size = min(V + 1, tokenizer.num_words or V + 1)

i = Input(shape=(T,))
x = Embedding(vocab_size, D)(i)
# return_sequences=True keeps the LSTM output at every time step so the
# pooling layer below can take the maximum across the whole sequence.
x = LSTM(M, return_sequences=True)(x)
x = GlobalMaxPooling1D()(x)
# Single sigmoid unit: one probability per sample for binary classification.
x = Dense(1, activation='sigmoid')(x)

model = Model(i, x)
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
Train model
# Fit the model for 10 epochs, evaluating on (pad_test, y_test) after each
# epoch. The object returned by fit() is kept in `r`.
# NOTE(review): the test split doubles as validation data here, so the
# validation metrics are not an independent estimate — confirm this is intended.
r = model.fit(
    pad_train,
    y_train,
    epochs=10,
    validation_data=(pad_test, y_test),
)
Make predictions
# Evaluate on the test split: confusion matrix followed by the per-class
# classification report.
from sklearn.metrics import confusion_matrix, classification_report

# Round the sigmoid outputs to the nearest integer to get hard 0/1 labels.
predictions = model.predict(pad_test).round()

cm = confusion_matrix(y_test, predictions)
print(cm, end="\n\n")  # matrix, then one blank separator line

cr = classification_report(y_test, predictions)
print(cr)
Enumerate the predictions to list the first 20 samples predicted as class 1
# First 20 (index, prediction) pairs whose rounded prediction equals 1.
# Bare expression: its value is only shown when run as a notebook cell.
[pair for pair in enumerate(predictions) if pair[1] == 1][:20]
Make single prediction
# Predict a single test sample. Slicing with d:d+1 (rather than indexing with
# d) keeps the leading batch axis, so the model receives a batch of one row.
d = 185
single_sample = pad_test[d:d + 1]
print(model.predict(single_sample).round())