Parts-of-Speech Tagging: Training

Beginning

Imports

# python
from collections import defaultdict

# pypi
from dotenv import load_dotenv

# this repository
from neurotic.nlp.parts_of_speech.preprocessing import Environment, DataLoader

Set Up

The Environment

load_dotenv("posts/nlp/.env")

The Data

loader = DataLoader()

Training

In this section, you will start by separating the ambiguous words from the unambiguous ones.

  • For example, the word "is" is always a verb, so it is not ambiguous.
  • In the WSJ corpus, 86% of the tokens are unambiguous (meaning they have only one tag).
  • About 14% are ambiguous (meaning they have more than one tag).

Before you start predicting the tags of each word, you will need to compute a few dictionaries that will help you to generate the tables.

Preprocessing

# replace the next three code blocks once the assignment is done
import string

punct = set(string.punctuation)
noun_suffix = ["action", "age", "ance", "cy", "dom", "ee", "ence", "er", "hood", "ion", "ism", "ist", "ity", "ling", "ment", "ness", "or", "ry", "scape", "ship", "ty"]
verb_suffix = ["ate", "ify", "ise", "ize"]
adj_suffix = ["able", "ese", "ful", "i", "ian", "ible", "ic", "ish", "ive", "less", "ly", "ous"]
adv_suffix = ["ward", "wards", "wise"]
def get_word_tag(line: str, vocab: dict) -> tuple:
    """Splits the line and handles unknowns and empty lines

    Args:
     line: whitespace-separated string with a word and its tag
     vocab: hashable that holds the known words

    Returns:
     (word, tag) with unknown words replaced by unknown tokens
    """
    if not line.split():
        # an empty line marks a sentence boundary
        word = "--n--"
        tag = "--s--"
    else:
        word, tag = line.split()
        if word not in vocab:
            # Handle unknown words
            word = assign_unk(word)
    return word, tag
def assign_unk(tok: str) -> str:
    """Assigns an unknown-word token based on the token's shape"""
    # Digits
    if any(char.isdigit() for char in tok):
        return "--unk_digit--"

    # Punctuation
    elif any(char in punct for char in tok):
        return "--unk_punct--"

    # Upper-case
    elif any(char.isupper() for char in tok):
        return "--unk_upper--"

    # Nouns
    elif any(tok.endswith(suffix) for suffix in noun_suffix):
        return "--unk_noun--"

    # Verbs
    elif any(tok.endswith(suffix) for suffix in verb_suffix):
        return "--unk_verb--"

    # Adjectives
    elif any(tok.endswith(suffix) for suffix in adj_suffix):
        return "--unk_adj--"

    # Adverbs
    elif any(tok.endswith(suffix) for suffix in adv_suffix):
        return "--unk_adv--"

    return "--unk--"

Transition counts

  • The first dictionary is the transition_counts dictionary, which counts the number of times each tag appeared next to another tag.

This dictionary will be used to compute: \[ P(t_i|t_{i-1}) \]

This is the probability of a tag at position i given the tag at position i-1.

To compute this probability, you will create a transition_counts dictionary where

  • The keys are (prev_tag, tag)
  • The values are the number of times those two tags appeared in that order.
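
The toy sketch below (a made-up sentence, not the WSJ data) illustrates that structure:

# toy transition counting for the tagged sentence "the/DT dog/NN barks/VBZ"
toy_tags = ["--s--", "DT", "NN", "VBZ"]
toy_transitions = defaultdict(int)
for previous_tag, tag in zip(toy_tags, toy_tags[1:]):
    toy_transitions[(previous_tag, tag)] += 1
print(dict(toy_transitions))
# {('--s--', 'DT'): 1, ('DT', 'NN'): 1, ('NN', 'VBZ'): 1}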

Emission counts

The second dictionary you will compute is the emission_counts dictionary. This dictionary will be used to compute:

\[ P(w_i|t_i) \]

In other words, you will use it to compute the probability of a word given its tag.

To compute this probability, you will create an emission_counts dictionary where

  • The keys are (tag, word)
  • The values are the number of times that pair showed up in your training set.
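
Continuing the made-up example, the same kind of sketch shows the emission structure:

# toy emission counting over a few made-up (word, tag) pairs
toy_pairs = [("the", "DT"), ("dog", "NN"), ("the", "DT"), ("cat", "NN")]
toy_emissions = defaultdict(int)
for word, tag in toy_pairs:
    toy_emissions[(tag, word)] += 1
print(dict(toy_emissions))
# {('DT', 'the'): 2, ('NN', 'dog'): 1, ('NN', 'cat'): 1}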

Tag counts

The last dictionary you will compute is the tag_counts dictionary.

  • The key is the tag
  • The value is the number of times each tag appeared.
def create_dictionaries(training_corpus: list, vocab: dict):
    """Create the three training dictionaries

    Args:
       ``training_corpus``: a corpus where each line has a word followed by its tag.
       ``vocab``: a dictionary where the keys are words in the vocabulary and the values are indices
    Returns:
       ``emission_counts``: a dictionary where the keys are (tag, word) and the values are the counts
       ``transition_counts``: a dictionary where the keys are (prev_tag, tag) and the values are the counts
       ``tag_counts``: a dictionary where the keys are the tags and the values are the counts
    """
    # initialize the dictionaries using defaultdict
    emission_counts = defaultdict(int)
    transition_counts = defaultdict(int)
    tag_counts = defaultdict(int)

    # Initialize "prev_tag" (previous tag) with the start state, denoted by '--s--'
    prev_tag = '--s--'

    # use 'i' to track the line number in the corpus
    i = 0

    # Each item in the training corpus contains a word and its POS tag
    for word_tag in training_corpus:

        # Increment the line count
        i += 1

        # Every 50,000 lines, print the progress
        if i % 50000 == 0:
            print(f"word count = {i}")

        # get the word and tag using the get_word_tag helper defined above
        word, tag = get_word_tag(word_tag, vocab)

        # Increment the transition count for the previous tag and this tag
        transition_counts[(prev_tag, tag)] += 1

        # Increment the emission count for the tag and word
        emission_counts[(tag, word)] += 1

        # Increment the tag count
        tag_counts[tag] += 1

        # Set the previous tag to this tag (for the next iteration of the loop)
        prev_tag = tag

    return emission_counts, transition_counts, tag_counts
emission_counts, transition_counts, tag_counts = create_dictionaries(loader.training_corpus, loader.vocabulary)
emission_counts, transition_counts, tag_counts = create_dictionaries(loader.training_corpus, loader.vocabulary)

Get all the POS states.

states = sorted(tag_counts.keys())
print(f"Number of POS tags (number of 'states'): {len(states)}")
print("View these POS tags (states)")
print(states)

expected_states = ['#', '$', "''", '(', ')', ',', '--s--', '.', ':', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', '``']

print(set(expected_states) - set(states))
for expected, actual in zip(expected_states, states):
    assert expected == actual, (expected, actual)
assert len(states) == 46, len(states)    

Number of POS tags (number of 'states'): 46
View these POS tags (states)
['#', '$', "''", '(', ')', ',', '--s--', '.', ':', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', '``']
set()
print("transition examples: ")
expected = ((('--s--', 'IN'), 5050),
            (('IN', 'DT'), 32364),
            (('DT', 'NNP'), 9044))

for index, example in enumerate(list(transition_counts.items())[:3]):
    print(example)
    assert example == expected[index]
transition examples: 
(('--s--', 'IN'), 5050)
(('IN', 'DT'), 32364)
(('DT', 'NNP'), 9044)
expectations = ((('DT', 'any'), 721),
                (('NN', 'decrease'), 7),
                (('NN', 'insider-trading'), 5))

print("emission examples: ")
for actual, expected in zip(list(emission_counts.items())[200:203], expectations):
    print(actual)
    assert actual == expected
emission examples: 
(('DT', 'any'), 721)
(('NN', 'decrease'), 7)
(('NN', 'insider-trading'), 5)
expected = ((('RB', 'back'), 304),
            (('VB', 'back'), 20),
            (('RP', 'back'), 84),
            (('JJ', 'back'), 25),
            (('NN', 'back'), 29),
            (('VBP', 'back'), 4))

print("ambiguous word example: ")
counter = 0
for tup, cnt in emission_counts.items():
    if tup[1] == 'back':
        print(tup, cnt)
        assert expected[counter] == (tup, cnt)
        counter += 1
ambiguous word example: 
('RB', 'back') 304
('VB', 'back') 20
('RP', 'back') 84
('JJ', 'back') 25
('NN', 'back') 29
('VBP', 'back') 4
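
Since back shows up with six different tags, it's ambiguous. As a rough cross-check of the ambiguity figures quoted at the top of this section, the sketch below computes the fraction of word types with more than one tag (a type-level number, so don't expect it to match the 14% token-level figure exactly):

# fraction of word *types* that received more than one tag
tags_for_word = defaultdict(set)
for tag, word in emission_counts:
    tags_for_word[word].add(tag)
ambiguous_types = sum(1 for tags in tags_for_word.values() if len(tags) > 1)
print(f"Ambiguous word types: {ambiguous_types/len(tags_for_word):.1%}")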

Bundle It Up

Imports

# python
from collections import defaultdict, Counter
# pypi
import attr

The Trainer

@attr.s(auto_attribs=True)
class TheTrainer:
    """Trains the POS model

    Args:
     corpus: iterable of word, tag tuples
    """
    corpus: list
    _transition_counts: dict = None
    _emission_counts: dict = None
    _tag_counts: dict = None

Transition Counts

This dictionary will be used to compute: \[ P(t_i|t_{i-1}) \]

This is the probability of a tag at position i given the tag at position i-1.

@property
def transition_counts(self) -> dict:
    """maps previous, next tags to counts"""
    if self._transition_counts is None:
        self._transition_counts = defaultdict(int)
        previous_tag = "--s--"
        for word, tag in self.corpus:
            self._transition_counts[(previous_tag, tag)] += 1
            previous_tag = tag
    return self._transition_counts
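
To see how these counts feed the probability above, here's a minimal sketch (a hypothetical helper, not a method of TheTrainer) that turns them into an unsmoothed maximum-likelihood estimate:

def transition_probability(previous_tag: str, tag: str,
                           transition_counts: dict, tag_counts: dict) -> float:
    """P(tag | previous_tag) as a raw maximum-likelihood estimate (no smoothing)"""
    return transition_counts[(previous_tag, tag)] / tag_counts[previous_tag]

For example, transition_probability("IN", "DT", trainer.transition_counts, trainer.tag_counts) would divide the 32,364 ('IN', 'DT') transitions by the total count of 'IN' tags.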

Emission Counts

The second dictionary you will compute is the emission_counts dictionary. This dictionary will be used to compute:

\[ P(w_i|t_i) \]

In other words, you will use it to compute the probability of a word given its tag.

@property
def emission_counts(self) -> dict:
    """Maps tag, word pairs to counts"""
    if self._emission_counts is None:
        self._emission_counts = Counter(
            ((tag, word) for word, tag in self.corpus)
        )
    return self._emission_counts
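
The same maximum-likelihood idea applies to emissions; a hypothetical counterpart to the transition helper sketched earlier:

def emission_probability(tag: str, word: str,
                         emission_counts: dict, tag_counts: dict) -> float:
    """P(word | tag) as a raw maximum-likelihood estimate (no smoothing)"""
    return emission_counts[(tag, word)] / tag_counts[tag]

Note that Counter is just a one-pass shortcut for the defaultdict-based counting used earlier; both produce the same mapping.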

Tag Counts

@property
def tag_counts(self) -> dict:
    """Count of tags"""
    if self._tag_counts is None:
        self._tag_counts = Counter((tag for word, tag in self.corpus))
    return self._tag_counts

Test It Out

from neurotic.nlp.parts_of_speech.training import TheTrainer

trainer = TheTrainer(loader.processed_training)

Tag Counts

states = sorted(trainer.tag_counts.keys())
print(f"Number of POS tags (number of 'states'): {len(states)}")
print("View these POS tags (states)")
print(states)

assert len(states) == 46, len(states)
expected_states = ['#', '$', "''", '(', ')', ',', '--s--', '.', ':', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', '``']
for expected, actual in zip(expected_states, states):
    assert expected == actual
Number of POS tags (number of 'states'): 46
View these POS tags (states)
['#', '$', "''", '(', ')', ',', '--s--', '.', ':', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', '``']

Transition Counts

print("transition examples: ")
expected = ((('--s--', 'IN'), 5050),
            (('IN', 'DT'), 32364),
            (('DT', 'NNP'), 9044))

for index, example in enumerate(list(trainer.transition_counts.items())[:3]):
    print(example)
    assert example == expected[index]
transition examples: 
(('--s--', 'IN'), 5050)
(('IN', 'DT'), 32364)
(('DT', 'NNP'), 9044)

Emission Counts

expectations = ((('DT', 'any'), 721),
                (('NN', 'decrease'), 7),
                (('NN', 'insider-trading'), 5))

print("emission examples: ")
for actual, expected in zip(list(trainer.emission_counts.items())[200:203], expectations):
    print(actual)
    assert actual == expected
emission examples: 
(('DT', 'any'), 721)
(('NN', 'decrease'), 7)
(('NN', 'insider-trading'), 5)

Ambiguous Word Emission Counts

expected = ((('RB', 'back'), 304),
            (('VB', 'back'), 20),
            (('RP', 'back'), 84),
            (('JJ', 'back'), 25),
            (('NN', 'back'), 29),
            (('VBP', 'back'), 4))

print("ambiguous word example: ")
counter = 0
for tag_word, count in trainer.emission_counts.items():
    if tag_word[1] == 'back':
        print(tag_word, count)
        assert expected[counter] == (tag_word, count)
        counter += 1
ambiguous word example: 
('RB', 'back') 304
('VB', 'back') 20
('RP', 'back') 84
('JJ', 'back') 25
('NN', 'back') 29
('VBP', 'back') 4