Parts-of-Speech Tagging: Numpy

Beginning

Imports

# python
from functools import partial
from itertools import product

import math
# pypi
from tabulate import tabulate

import numpy
import pandas

Set Up

The Parts-of-Speech Decoder

# Download the tag table from the page below and build a lookup that maps each
# part-of-speech tag to its human-readable description.
URL = "https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html"
data = pandas.read_html(URL, header=0)[0]

# Pair the two columns up directly instead of walking the rows one at a time.
TRANSLATOR = dict(zip(data["Tag"], data["Description"]))

Tabulate

# Pre-configured tabulate call: org-mode table output, using the keys of the
# data (e.g. DataFrame column names) as the header row.
TABLE = partial(tabulate, headers="keys", tablefmt="orgtbl")

Middle

The Tags

We're only going to use three tags.

# The three tags used in this example, listed with their descriptions.
tags = ["RB", "NN", "TO"]
lines = [f" - {tag} ({TRANSLATOR[tag]})" for tag in tags]
print("\n".join(lines))
  • RB (Adverb)
  • NN (Noun, singular or mass)
  • TO (to)

Start with a Dictionary

  • transition_counts is a dictionary with (previous tag, this tag) tuples as keys and the number of times these tags appeared together as the values.
# (previous tag, current tag) -> number of times the pair was seen together.
transition_counts = {
    ("NN", "NN"): 16241,
    ("RB", "RB"): 2263,
    ("TO", "TO"): 2,
    ("NN", "TO"): 5256,
    ("RB", "TO"): 855,
    ("TO", "NN"): 734,
    ("NN", "RB"): 2431,
    ("RB", "NN"): 358,
    ("TO", "RB"): 200,
}

We're going to need the individual tags later on.

# Every tag that appears in either position of a key, de-duplicated and sorted.
tags = sorted({tag for pair in transition_counts for tag in pair})

I don't know where these counts come from; presumably they were taken from the Wall Street Journal file that we used in the previous exercise.

A Transition Matrix

We're going to build a transition matrix from transition_counts, with the previous tag as the row and the current tag as the column.

# Square matrix of counts: the row is the previous tag, the column is the
# current tag, both in the order given by ``tags``.
tag_count = len(tags)
transition_matrix = numpy.zeros(shape=(tag_count, tag_count), dtype=int)

# Walk every (previous, current) pairing along with its matrix position.
for (row, previous), (column, current) in product(enumerate(tags), repeat=2):
    transition_matrix[row, column] = transition_counts[previous, current]

# Label both axes with the tags so the printed table is readable.
transitions = pandas.DataFrame(data=transition_matrix, index=tags, columns=tags)
print(TABLE(transitions))
  NN RB TO
NN 16241 2431 5256
RB 358 2263 855
TO 734 200 2

Normalization

We're going to normalize each row so that each value is equal to \(\frac{value}{\textit{sum of row}}\).

# Normalize each row so that its values sum to 1.
#
# Summing over the columns gives one total per row (indexed by the row labels).
# ``div`` with ``axis="index"`` lines those totals up with the row labels; the
# plain ``/`` operator would line them up with the column labels instead and
# end up normalizing the columns.
row_sums = transitions.sum(axis="columns")
normalized = transitions.div(row_sums, axis="index")
print(TABLE(normalized))
  NN RB TO
NN 0.936999 0.496731 0.859807
RB 0.0206542 0.462403 0.139866
TO 0.042347 0.0408664 0.000327172

Log Sum

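Now we'll add the log of each diagonal value to that value, leaving the off-diagonal entries unchanged. (Note that the code below takes the log of the diagonal entry itself, not of the sum of its row.)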

# Work on a float copy of the counts so the log terms are not truncated.
values = transitions.to_numpy().astype("float64")

# Pick out the diagonal cells and add the natural log of each one to itself.
row, column = numpy.diag_indices(values.shape[0])
diagonal = values[row, column]
diagonal = diagonal + numpy.log(diagonal)
values[row, column] = diagonal

# Re-attach the tag labels for display.
diagonalized = pandas.DataFrame(data=values, index=tags, columns=tags)
print(TABLE(diagonalized))
  NN RB TO
NN 16277.7 2431 5256
RB 358 2291.73 855
TO 734 200 2.69315

Brute Force Check

# Re-derive every cell one at a time and compare it with the vectorized result:
# diagonal cells should be count + log(count), all other cells the bare count.
indices = set(zip(*numpy.diag_indices_from(transitions.to_numpy())))
for row, column in product(range(len(tags)), repeat=2):
    original = transitions.iloc[row, column]
    on_diagonal = (row, column) in indices
    expected = original + numpy.log(original) if on_diagonal else original
    actual = diagonalized.iloc[row, column]
    assert math.isclose(expected, actual), f"({row, column}) expected: {expected}, actual: {actual} {expected - actual}"