IMDB GRU With Tokenization
Beginning
This is another version of the RNN model that classifies the IMDB reviews, but this time we're going to tokenize the text ourselves instead of using the pre-tokenized tensorflow-datasets version, and we'll use a GRU (Gated Recurrent Unit) rather than an LSTM.
Imports
Python
from argparse import Namespace
PyPi
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import hvplot.pandas
import numpy
import pandas
import tensorflow
import tensorflow_datasets
Other
from graeae import Timer, EmbedHoloviews
Set Up
The Timer
TIMER = Timer()
Plotting
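The plots at the end are saved with EmbedHoloviews. Here's a minimal sketch of the setup, assuming Embed is just EmbedHoloviews bound to this post's output folder (the folder path is a placeholder, not the real site layout).
from functools import partial

# Embed renders the HoloViews plot to HTML so it can be embedded in the post.
# The folder path is a placeholder - point it at wherever the post's files live.
SLUG = "imdb-gru-with-tokenization"
Embed = partial(EmbedHoloviews, folder_path=f"../../files/posts/keras/{SLUG}")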
Middle
Set Up the Data
imdb, info = tensorflow_datasets.load("imdb_reviews",
                                      with_info=True,
                                      as_supervised=True)
WARNING: Logging before flag parsing goes to stderr.
W0924 21:52:10.158111 139862640383808 dataset_builder.py:439] Warning: Setting shuffle_files=True because split=TRAIN and shuffle_files=None. This behavior will be deprecated on 2019-08-06, at which point shuffle_files=False will be the default for all splits.
training, testing = imdb["train"], imdb["test"]
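Since we loaded the dataset with with_info=True we also got back a DatasetInfo object. A quick peek at it (an added check, not in the original code) shows how many examples are in each supervised split.
# Check how many reviews are in each of the supervised splits.
print(info.splits["train"].num_examples)
print(info.splits["test"].num_examples)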
Building Up the Tokenizer
Since we didn't pass in a specifier for the configuration we wanted (e.g. imdb_reviews/subwords8k) it defaulted to giving us the plain-text reviews (and their labels), so we have to build the tokenizer ourselves.
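For comparison, here's a sketch of what loading the pre-tokenized configuration would look like; the subwords8k configuration carries its own SubwordTextEncoder in the dataset info, so there'd be nothing to fit ourselves.
# For comparison only: load the pre-tokenized subwords configuration instead.
imdb_subwords, subword_info = tensorflow_datasets.load("imdb_reviews/subwords8k",
                                                       with_info=True,
                                                       as_supervised=True)
# The encoder that built the subword vocabulary rides along in the dataset info.
encoder = subword_info.features["text"].encoder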
Split Up the Sentences and Their Labels
As you might recall, the data set consists of 50,000 IMDB movie reviews (25,000 for training and 25,000 for testing) categorized as positive or negative. To build the tokenizer we first have to split the sentences from their labels.
training_sentences = []
training_labels = []
testing_sentences = []
testing_labels = []
with TIMER:
    for sentence, label in training:
        training_sentences.append(str(sentence.numpy()))
        # keep the labels numeric (0 or 1) so they can go straight into a numpy array
        training_labels.append(label.numpy())

    for sentence, label in testing:
        testing_sentences.append(str(sentence.numpy()))
        testing_labels.append(label.numpy())
2019-09-24 21:52:11,396 graeae.timers.timer start: Started: 2019-09-24 21:52:11.395126
2019-09-24 21:52:18,667 graeae.timers.timer end: Ended: 2019-09-24 21:52:18.667789
2019-09-24 21:52:18,670 graeae.timers.timer end: Elapsed: 0:00:07.272663
training_labels_final = numpy.array(training_labels)
testing_labels_final = numpy.array(testing_labels)
Some Constants
Text = Namespace(
    vocab_size=10000,
    embedding_dim=16,
    max_length=120,
    trunc_type="post",
    oov_token="<OOV>",
)
Build the Tokenizer
tokenizer = Tokenizer(num_words=Text.vocab_size, oov_token=Text.oov_token)
with TIMER:
    tokenizer.fit_on_texts(training_sentences)
    word_index = tokenizer.word_index

    sequences = tokenizer.texts_to_sequences(training_sentences)
    padded = pad_sequences(sequences, maxlen=Text.max_length,
                           truncating=Text.trunc_type)

    # truncate the test sequences the same way so train and test match
    testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
    testing_padded = pad_sequences(testing_sequences, maxlen=Text.max_length,
                                   truncating=Text.trunc_type)
2019-09-24 21:52:21,705 graeae.timers.timer start: Started: 2019-09-24 21:52:21.705287
2019-09-24 21:52:32,152 graeae.timers.timer end: Ended: 2019-09-24 21:52:32.152267
2019-09-24 21:52:32,154 graeae.timers.timer end: Elapsed: 0:00:10.446980
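As a quick sanity check (not part of the original code), we can push an unseen sentence through the tokenizer: any word that wasn't among the top ten thousand fitted words should come back as the <OOV> index (1, since Keras gives the OOV token the first slot), and padding should give us rows of Text.max_length.
# Sanity check: out-of-vocabulary words should map to the <OOV> index and the
# padded output should have Text.max_length (120) columns.
sample = tokenizer.texts_to_sequences(["an utterly kafkaesque popcorn flick"])
print(sample)
print(pad_sequences(sample, maxlen=Text.max_length).shape)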
Decoder Ring
index_to_word = {value: key for key, value in word_index.items()}

def decode_review(text: numpy.ndarray) -> str:
    return " ".join([index_to_word.get(item, "<?>") for item in text])
Build the Model
This time we're going to build a four-layer model with one Bidirectional layer that uses a GRU (Gated Recurrent Unit) instead of an LSTM.
model = tensorflow.keras.Sequential([
    tensorflow.keras.layers.Embedding(Text.vocab_size, Text.embedding_dim,
                                      input_length=Text.max_length),
    tensorflow.keras.layers.Bidirectional(
        tensorflow.compat.v2.keras.layers.GRU(32)),
    tensorflow.keras.layers.Dense(6, activation="relu"),
    tensorflow.keras.layers.Dense(1, activation="sigmoid"),
])
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
print(model.summary())
Model: "sequential" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= embedding (Embedding) (None, 120, 16) 160000 _________________________________________________________________ bidirectional (Bidirectional (None, 64) 9600 _________________________________________________________________ dense (Dense) (None, 6) 390 _________________________________________________________________ dense_1 (Dense) (None, 1) 7 ================================================================= Total params: 169,997 Trainable params: 169,997 Non-trainable params: 0 _________________________________________________________________ None
Train it
EPOCHS = 50
ONCE_PER_EPOCH = 2
batch_size = 8
history = model.fit(padded, training_labels_final,
                    epochs=EPOCHS,
                    batch_size=batch_size,
                    validation_data=(testing_padded, testing_labels_final),
                    verbose=ONCE_PER_EPOCH)
Plot It
data = pandas.DataFrame(history.history)
plot = data.hvplot().opts(title="GRU Training Performance", width=1000, height=800)
Embed(plot=plot, file_name="gru_training")()