Parts-of-Speech Tagging: Training
Table of Contents
Beginning
Imports
# python
from collections import defaultdict
# pypi
from dotenv import load_dotenv
# this repository
from neurotic.nlp.parts_of_speech.preprocessing import Environment, DataLoader
Set Up
The Environment
# read the environment variables defined in the post's .env file
# (presumably the settings the DataLoader relies on - confirm)
load_dotenv("posts/nlp/.env")
The Data
# the loader comes from this repository's preprocessing module; the code
# below reads its training_corpus, vocabulary and processed_training attributes
loader = DataLoader()
Training
In this section, you will find the words that are not ambiguous.
- For example, the word `is` is a verb and it is not ambiguous.
- In the `WSJ` corpus, 86% of the tokens are unambiguous (meaning they have only one tag).
- About 14% are ambiguous (meaning that they have more than one tag).
Before you start predicting the tags of each word, you will need to compute a few dictionaries that will help you to generate the tables.
Preprocessing
# replace the next three code blocks once the assignment is done
import string
# any of these characters marks an unknown token as punctuation
punct = set(string.punctuation)

# word endings used by ``assign_unk`` to guess the class of an unknown token
noun_suffix = [
    "action", "age", "ance", "cy", "dom", "ee", "ence", "er", "hood", "ion",
    "ism", "ist", "ity", "ling", "ment", "ness", "or", "ry", "scape", "ship",
    "ty",
]
verb_suffix = [
    "ate", "ify", "ise", "ize",
]
adj_suffix = [
    "able", "ese", "ful", "i", "ian", "ible", "ic", "ish", "ive", "less",
    "ly", "ous",
]
adv_suffix = [
    "ward", "wards", "wise",
]
def get_word_tag(line: str, vocab: dict):
    """Splits a corpus line into a word and its tag

    Blank lines are turned into the placeholder pair ``("--n--", "--s--")``
    and words that aren't in ``vocab`` are replaced with one of the
    unknown-word tokens produced by ``assign_unk``.

    Args:
        line: whitespace separated string with word and tag
        vocab: container that holds the known words (only membership is tested)

    Returns:
        tuple of (word, tag)

    Raises:
        ValueError: a non-blank line doesn't split into exactly two tokens
    """
    # split once and re-use the result for both the blank-line check
    # and the unpacking
    tokens = line.split()
    if not tokens:
        # blank line: placeholder word paired with the start-state tag
        return "--n--", "--s--"

    word, tag = tokens
    if word not in vocab:
        # Handle unknown words
        word = assign_unk(word)
    return word, tag
def assign_unk(tok):
    """Pick the unknown-word token that stands in for ``tok``

    The checks run in a fixed order and the first match wins: digits,
    punctuation, upper-case characters, then the noun, verb, adjective
    and adverb suffixes.

    Args:
        tok: the token to classify

    Returns:
        the matching ``--unk_*--`` token or ``--unk--`` if nothing matched
    """
    # character-level checks
    if any(map(str.isdigit, tok)):
        return "--unk_digit--"
    if not punct.isdisjoint(tok):
        return "--unk_punct--"
    if any(map(str.isupper, tok)):
        return "--unk_upper--"

    # suffix checks, in priority order
    suffix_classes = (
        ("--unk_noun--", noun_suffix),
        ("--unk_verb--", verb_suffix),
        ("--unk_adj--", adj_suffix),
        ("--unk_adv--", adv_suffix),
    )
    for label, suffixes in suffix_classes:
        if tok.endswith(tuple(suffixes)):
            return label
    return "--unk--"
Transition counts
- The first dictionary is the `transition_counts` dictionary, which counts the number of times each tag appeared immediately after another tag.
This dictionary will be used to compute: \[ P(t_i |t_{i-1}) \]
This is the probability of a tag at position i given the tag at position i-1.
In order for you to compute equation 1, you will create a transition_counts
dictionary where
- The keys are
(prev_tag, tag)
- The values are the number of times those two tags appeared in that order.
Emission counts
The second dictionary you will compute is the emission_counts
dictionary. This dictionary will be used to compute:
\[ P(w_i|t_i) \]
In other words, you will use it to compute the probability of a word given its tag.
In order for you to compute equation 2, you will create an emission_counts
dictionary where
- The keys are
(tag, word)
- The values are the number of times that pair showed up in your training set.
Tag counts
The last dictionary you will compute is the tag_counts
dictionary.
- The key is the tag
- The value is the number of times each tag appeared.
def create_dictionaries(training_corpus: list, vocab: dict):
    """Create the three training dictionaries

    Args:
        ``training_corpus``: a corpus where each line has a word followed by its tag.
        ``vocab``: a dictionary where keys are words in vocabulary and value is an index

    Returns:
        ``emission_counts``: a dictionary where the keys are (tag, word) and the values are the counts
        ``transition_counts``: a dictionary where the keys are (prev_tag, tag) and the values are the counts
        ``tag_counts``: a dictionary where the keys are the tags and the values are the counts
    """
    # defaultdicts so that unseen keys start counting from zero
    emission_counts = defaultdict(int)
    transition_counts = defaultdict(int)
    tag_counts = defaultdict(int)

    # the first transition is counted from the start state, denoted by '--s--'
    prev_tag = '--s--'

    # line numbers start at one so the progress message reports lines processed
    for line_number, word_tag in enumerate(training_corpus, start=1):
        # Every 50,000 words, print the word count
        if line_number % 50000 == 0:
            print(f"word count = {line_number}")

        # get_word_tag handles the blank lines and the unknown words
        word, tag = get_word_tag(word_tag, vocab)

        transition_counts[(prev_tag, tag)] += 1
        emission_counts[(tag, word)] += 1
        tag_counts[tag] += 1

        # this tag is the previous tag for the next line
        prev_tag = tag
    return emission_counts, transition_counts, tag_counts
# build the three count dictionaries from the loader's training corpus and vocabulary
emission_counts, transition_counts, tag_counts = create_dictionaries(loader.training_corpus, loader.vocabulary)
Get all the POS states.
# the tags counted during training, in sorted order
states = sorted(tag_counts)
print(f"Number of POS tags (number of 'states'): {len(states)}")
print("View these POS tags (states)")
print(states)

expected_states = [
    '#', '$', "''", '(', ')', ',', '--s--', '.', ':', 'CC', 'CD', 'DT', 'EX',
    'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS',
    'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH',
    'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', '``',
]

# show the expected tags that never turned up in the counts
print(set(expected_states).difference(states))

# pairwise comparison so that a failure reports the first mismatched pair
for expected, actual in zip(expected_states, states):
    assert expected == actual, (expected, actual)

# zip stops at the shorter list so the length is checked separately
assert len(states) == 46, len(states)
:RESULTS:
Number of POS tags (number of 'states'): 45 View these POS tags (states) ['#', '$', "''", '(', ')', ',', '.', ':', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', '``'] {'--s--'}
print("transition examples: ")
expected = (
    (('--s--', 'IN'), 5050),
    (('IN', 'DT'), 32364),
    (('DT', 'NNP'), 9044),
)

# the first three transitions in the dictionary's insertion order
examples = list(transition_counts.items())[:3]

# without this a dictionary with fewer than three entries would skip
# some (or all) of the comparisons and still pass
assert len(examples) == len(expected), len(examples)

for example, wanted in zip(examples, expected):
    print(example)
    assert example == wanted
transition examples: (('--s--', 'IN'), 5050) (('IN', 'DT'), 32364) (('DT', 'NNP'), 9044)
expected = (
    (('DT', 'any'), 721),
    (('NN', 'decrease'), 7),
    (('NN', 'insider-trading'), 5),
)
print("emission examples: ")

# entries 200 through 202 in the dictionary's insertion order
examples = list(emission_counts.items())[200:203]

# a short or empty slice would otherwise let the check pass without
# comparing anything
assert len(examples) == len(expected), len(examples)

# the loop variable gets its own name so that it doesn't clobber ``expected``
for actual, wanted in zip(examples, expected):
    print(actual)
    assert actual == wanted
emission examples: (('DT', 'any'), 721) (('NN', 'decrease'), 7) (('NN', 'insider-trading'), 5)
expected = (
    (('RB', 'back'), 304),
    (('VB', 'back'), 20),
    (('RP', 'back'), 84),
    (('JJ', 'back'), 25),
    (('NN', 'back'), 29),
    (('VBP', 'back'), 4),
)
print("ambiguous word example: ")

# every (tag, word) count for the word 'back', in insertion order
back_counts = [(tup, cnt) for tup, cnt in emission_counts.items()
               if tup[1] == 'back']
for tup, cnt in back_counts:
    print(tup, cnt)

# comparing the whole sequence also catches missing or extra entries,
# which the item-by-item check let through (or hit as an IndexError)
assert tuple(back_counts) == expected, back_counts
ambiguous word example: ('RB', 'back') 304 ('VB', 'back') 20 ('RP', 'back') 84 ('JJ', 'back') 25 ('NN', 'back') 29 ('VBP', 'back') 4
Bundle It Up
Imports
# python
from collections import defaultdict, Counter
# pypi
import attr
The Trainer
@attr.s(auto_attribs=True)
class TheTrainer:
    """Trains the POS model

    The training consists of counting the tag transitions, the
    (tag, word) emissions and the tags found in the corpus. Each set of
    counts is exposed as a property of the same name.

    Args:
        corpus: iterable of word, tag tuples
    """
    # the (word, tag) pairs that the count properties iterate over
    corpus: list
    # caches for the count properties - each stays None until the
    # matching property is first read
    _transition_counts: dict=None
    _emission_counts: dict=None
    _tag_counts: dict=None
Transition Counts
This dictionary will be used to compute: \[ P(t_i |t_{i-1}) \]
This is the probability of a tag at position i given the tag at position i-1.
@property
def transition_counts(self) -> dict:
"""maps previous, next tags to counts"""
if self._transition_counts is None:
self._transition_counts = defaultdict(int)
previous_tag = "--s--"
for word, tag in self.corpus:
self._transition_counts[(previous_tag, tag)] += 1
previous_tag = tag
return self._transition_counts
Emission Counts
The second dictionary you will compute is the emission_counts
dictionary. This dictionary will be used to compute:
\[ P(w_i|t_i) \]
In other words, you will use it to compute the probability of a word given its tag.
@property
def emission_counts(self) -> dict:
"""Maps tag, word pairs to counts"""
if self._emission_counts is None:
self._emission_counts = Counter(
((tag, word) for word, tag in self.corpus)
)
return self._emission_counts
Tag Counts
@property
def tag_counts(self) -> dict:
"""Count of tags"""
if self._tag_counts is None:
self._tag_counts = Counter((tag for word, tag in self.corpus))
return self._tag_counts
Test It Out
from neurotic.nlp.parts_of_speech.training import TheTrainer
# the loader's processed training data is the trainer's corpus - the
# trainer's properties unpack each item as a (word, tag) pair
trainer = TheTrainer(loader.processed_training)
Tag Counts
# the tags the trainer counted, in sorted order
states = sorted(trainer.tag_counts)
print(f"Number of POS tags (number of 'states'): {len(states)}")
print("View these POS tags (states)")
print(states)

# zip stops at the shorter list so the length is checked on its own
assert len(states) == 46, len(states)

expected_states = [
    '#', '$', "''", '(', ')', ',', '--s--', '.', ':', 'CC', 'CD', 'DT', 'EX',
    'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS',
    'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH',
    'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', '``',
]

# pairwise comparison of the expected and the counted tags
for expected, actual in zip(expected_states, states):
    assert expected == actual
Number of POS tags (number of 'states'): 46 View these POS tags (states) ['#', '$', "''", '(', ')', ',', '--s--', '.', ':', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', '``']
Transition Counts
print("transition examples: ")
expected = (
    (('--s--', 'IN'), 5050),
    (('IN', 'DT'), 32364),
    (('DT', 'NNP'), 9044),
)

# the first three transitions in the dictionary's insertion order
examples = list(trainer.transition_counts.items())[:3]

# without this a dictionary with fewer than three entries would skip
# some (or all) of the comparisons and still pass
assert len(examples) == len(expected), len(examples)

for example, wanted in zip(examples, expected):
    print(example)
    assert example == wanted
transition examples: (('--s--', 'IN'), 5050) (('IN', 'DT'), 32364) (('DT', 'NNP'), 9044)
Emission Counts
expected = (
    (('DT', 'any'), 721),
    (('NN', 'decrease'), 7),
    (('NN', 'insider-trading'), 5),
)
print("emission examples: ")

# entries 200 through 202 in the counter's insertion order
examples = list(trainer.emission_counts.items())[200:203]

# a short or empty slice would otherwise let the check pass without
# comparing anything
assert len(examples) == len(expected), len(examples)

# the loop variable gets its own name so that it doesn't clobber ``expected``
for actual, wanted in zip(examples, expected):
    print(actual)
    assert actual == wanted
emission examples: (('DT', 'any'), 721) (('NN', 'decrease'), 7) (('NN', 'insider-trading'), 5)
Ambiguous Word Emission Counts
expected = (
    (('RB', 'back'), 304),
    (('VB', 'back'), 20),
    (('RP', 'back'), 84),
    (('JJ', 'back'), 25),
    (('NN', 'back'), 29),
    (('VBP', 'back'), 4),
)
print("ambiguous word example: ")

# every (tag, word) count for the word 'back', in insertion order
back_counts = [(tag_word, count)
               for tag_word, count in trainer.emission_counts.items()
               if tag_word[1] == 'back']
for tag_word, count in back_counts:
    print(tag_word, count)

# comparing the whole sequence also catches missing or extra entries,
# which the item-by-item check let through (or hit as an IndexError)
assert tuple(back_counts) == expected, back_counts
ambiguous word example: ('RB', 'back') 304 ('VB', 'back') 20 ('RP', 'back') 84 ('JJ', 'back') 25 ('NN', 'back') 29 ('VBP', 'back') 4