NER: Testing the Model

Cloistered Monkey

2021-01-13 15:03

Table of Contents

Testing New Sentences

# python
from pathlib import Path

# pypi
from trax import layers

import numpy

# this project
from neurotic.nlp.named_entity_recognition import (NER,
                                                   NERData,
                                                   TOKEN)

Set Up the Model and Maps

# the data maps are loaded first because their sizes fix the model's
# embedding (vocabulary) and output (tag) dimensions
data = NERData().data
model = NER(vocabulary_size=len(data.vocabulary),
            tag_count=len(data.tags)).model
# ``weights_only`` is an option of ``init_from_file``; it has to be passed
# to the loader, not to the ``Path`` constructor
model.init_from_file(Path("~/models/ner/model.pkl.gz").expanduser(),
                     weights_only=True)
print(model)

Serial[
  Embedding_35180_50
  LSTM_50
  Dense_18
  LogSoftmax
]

Middle

def predict(sentence: str,
            model: layers.Serial=model,
            vocabulary: dict=data.vocabulary,
            tags: dict=data.tags,
            unknown: int=data.vocabulary[TOKEN.unknown]) -> list:
    """Predicts the named entities in a sentence

    Args:
     sentence: the sentence to analyze (split on whitespace into tokens)
     model: the NER model
     vocabulary: token to id map
     tags: tag to id map
     unknown: id to substitute for tokens that aren't in the vocabulary

    Returns:
     list with one predicted tag per whitespace-separated token
    """
    tokens = [vocabulary.get(token, unknown)
              for token in sentence.split()]
    if not tokens:
        # nothing to tag, so don't hand the model an empty batch
        return []

    # the model is called with a batch, so wrap the sentence in a batch of one
    batch = numpy.array([tokens]).astype(int)
    output = model(batch)

    # highest-scoring tag id for each token of the only sentence in the batch
    best = numpy.argmax(output, axis=-1)[0]

    # invert the map explicitly instead of relying on the dict's key order
    # happening to match the tag ids
    id_to_tag = {identifier: tag for tag, identifier in tags.items()}
    return [id_to_tag[int(index)] for index in best]

sentence = "Bilbo Baggins, the Shire's director of trade and manufacturing policy for the Lord Sauron, said in an interview on Sunday morning that Rumblefish was working to prepare for the possibility of a second wave of the Coronavirus in the Fall, although he said it wouldn’t necessarily come before the fall of the Empire and the rise of the corpse brigade in July"

def print_predictions(sentence: str):
    """Prints the words that were tagged as something other than 'O'

    Each match is printed on its own line as ``<word> - <tag>``.

    Args:
     sentence: text to run through ``predict`` (split on whitespace)
    """
    tags = predict(sentence)
    tagged = [(token, tag)
              for token, tag in zip(sentence.split(), tags)
              if tag != 'O']
    for token, tag in tagged:
        print(f"{token} - {tag}")

print_predictions(sentence)

Lord - B-org
Sauron, - I-org
Sunday - B-tim
morning - I-tim
July - B-tim

print_predictions("anyone lived in a pretty how town "
                  "(with up so floating many bells down) "
                  "spring summer autumn winter "
                  "he sang his didn't he danced his did.")

summer - I-tim
autumn - I-tim

Hmm, that's interesting.

print_predictions("Spring Summer Autumn Winter")

Summer - B-eve

Some kind of anti-spring bias.

print_predictions("Boogie booty bunny butt")

booty - B-per

Well, I suppose I'd have to match the dataset to put more weird things in there.