Parts-of-Speech Tagging: Numpy
Table of Contents
Beginning
Imports
# python
from functools import partial
from itertools import product
import math
# pypi
from tabulate import tabulate
import numpy
import pandas
Set Up
The Parts-of-Speech Decoder
# Read the first table on the Penn Treebank tag page (first row as header)
# and map each tag to its description.
URL = "https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html"
data = pandas.read_html(URL, header=0)[0]
TRANSLATOR = dict(zip(data["Tag"], data["Description"]))
Tabulate
# Pre-configured ``tabulate`` call: org-table format, headers taken from the
# data's keys.
TABLE = partial(tabulate, headers="keys", tablefmt="orgtbl")
Middle
The Tags
We're only going to use three tags.
# The three tags used in this walkthrough, printed with the description
# looked up in TRANSLATOR.
tags = ['RB', 'NN', 'TO']
for tag in tags:
    print(f" - {tag} ({TRANSLATOR[tag]})")
- RB (Adverb)
- NN (Noun, singular or mass)
- TO (to)
Start with a Dictionary
transition_counts
is a dictionary with (previous tag, this tag)
tuples as keys and the number of times these tags appeared together as the values.
# Counts of how often each (previous tag, this tag) pair appeared together.
transition_counts = {
    ("NN", "NN"): 16241,
    ("RB", "RB"): 2263,
    ("TO", "TO"): 2,
    ("NN", "TO"): 5256,
    ("RB", "TO"): 855,
    ("TO", "NN"): 734,
    ("NN", "RB"): 2431,
    ("RB", "NN"): 358,
    ("TO", "RB"): 200,
}
We're going to need the individual tags later on.
# Every tag that appears on either side of a transition pair, de-duplicated
# and sorted alphabetically.
tags = sorted({tag for pair in transition_counts for tag in pair})
I don't know where these counts come from; presumably it's the Wall Street Journal file that we used in the previous exercise.
A Transition Matrix
We're going to build a transition matrix from the transition_counts
dictionary, with the previous tag of each key as the row and the current tag as the column.
# Build a square count matrix: rows are the previous tag, columns are the tag
# that follows it, both in the order given by ``tags``.
tag_count = len(tags)
transition_matrix = numpy.zeros((tag_count, tag_count), dtype=int)
for (row, previous_tag), (column, this_tag) in product(enumerate(tags), repeat=2):
    # A pair that was never counted stays at zero instead of raising KeyError.
    transition_matrix[row, column] = transition_counts.get(
        (previous_tag, this_tag), 0
    )

# Label both axes with the tag names so the printed table is readable.
transitions = pandas.DataFrame(transition_matrix, index=tags, columns=tags)
print(TABLE(transitions))
| | NN | RB | TO |
---|---|---|---|
NN | 16241 | 2431 | 5256 |
RB | 358 | 2263 | 855 |
TO | 734 | 200 | 2 |
Normalization
We're going to normalize each row so that each value is equal to \(\frac{value}{\textit{sum of row}}\).
# Normalize each row so that its values sum to 1.
# ``axis="columns"`` collapses the columns, giving one total per row;
# ``axis="rows"`` would instead give one total per column.
row_sums = transitions.sum(axis="columns")
# A plain ``/`` would align ``row_sums`` with the column labels, so use
# ``div`` with ``axis="index"`` to divide every row by its own total.
normalized = transitions.div(row_sums, axis="index")
print(TABLE(normalized))
| | NN | RB | TO |
---|---|---|---|
NN | 0.936999 | 0.496731 | 0.859807 |
RB | 0.0206542 | 0.462403 | 0.139866 |
TO | 0.042347 | 0.0408664 | 0.000327172 |
Log Sum
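Now we'll add a log term to each value along the diagonal. The stated goal is to add the log of the sum of the current row, but note that the code below (and the brute-force check that follows it) adds the log of the diagonal value itself.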
# Work on a float copy of the counts; ``transitions`` itself is not modified.
values = transitions.values.astype("float64")
row, column = numpy.diag_indices_from(values)

# Each diagonal entry becomes itself plus its own natural log.
diagonal = values[row, column]
diagonal = diagonal + numpy.log(diagonal)
values[row, column] = diagonal

diagonalized = pandas.DataFrame(values, index=tags, columns=tags)
print(TABLE(diagonalized))
| | NN | RB | TO |
---|---|---|---|
NN | 16277.7 | 2431 | 5256 |
RB | 358 | 2291.73 | 855 |
TO | 734 | 200 | 2.69315 |
Brute Force Check
# Recompute every cell one at a time and compare it with ``diagonalized``:
# off-diagonal cells must equal the raw count, diagonal cells must equal the
# count plus its own log.
rows, columns = numpy.diag_indices_from(transitions.values)
indices = set(zip(rows, columns))
for row, column in product(range(len(tags)), repeat=2):
    expected = transitions.iloc[row, column]
    if (row, column) in indices:
        expected += numpy.log(transitions.iloc[row, column])
    actual = diagonalized.iloc[row, column]
    assert math.isclose(expected, actual), f"({row, column}) expected: {expected}, actual: {actual} {expected - actual}"