Parts-of-Speech Tagging: Creating a Vocabulary
Table of Contents
Beginning
Imports
# python
from argparse import Namespace
from collections import Counter
from functools import partial
import os
import random
import re
import string
# from pypi
from dotenv import load_dotenv
import pandas
Set Up
# Load environment variables from the post's .env file; the data-file path
# is read from os.environ below (presumably defined in that file).
load_dotenv("posts/nlp/.env")
Middle
Reading Text Data
We're going to start with a pre-tagged dataset taken from the Wall Street Journal.
# The location of the tagged corpus comes from the environment.
path = os.environ["WALL_STREET_JOURNAL_POS"]

# Read the whole file and break it on newlines; blank lines (and the empty
# string after a trailing newline) are kept as empty entries.
with open(path) as reader:
    contents = reader.read()
lines = contents.split("\n")
Here's what the head of the file looks like.
# Show the first five raw lines of the file.
head = lines[:5]
for line in head:
    print(line)
In IN an DT Oct. NNP 19 CD review NN
It's a two-column (tab-separated) file with no header, but we're told that the first column is the word being tagged for its part-of-speech and the second column is the tag itself.
Word Counts
Here we'll count the number of times a word appears in our data set and filter out words that only appear once.
# The word is everything before the first tab on each line.
words = [line.partition("\t")[0] for line in lines]
print(f"Pre-Filtered word count: {len(words):,}")

# Tally occurrences and keep only the words seen more than once.
counts = Counter(words)
words = [word for word, count in counts.items() if count > 1]
print(f"Filtered Word Count: {len(words):,}")
Pre-Filtered word count: 989,861 Filtered Word Count: 23,768
Just a quick check to make sure the counts are right.
# Index of the count within a (word, count) pair.
COUNT = 1


def grab_count(pair):
    """Return the count from a (word, count) pair."""
    return pair[COUNT]


remaining = len(words)

# The most common ``remaining`` entries should all have been kept (count > 1)...
kept = counts.most_common(remaining)
least_common_kept = min(kept, key=grab_count)
assert grab_count(least_common_kept) > 1

# ...and everything after them should have been filtered out (count < 2).
rejected = counts.most_common()[remaining:]
most_common_rejected = max(rejected, key=grab_count)
assert grab_count(most_common_rejected) < 2
Now, a sorted version.
# Sort the vocabulary and confirm it is still a list.
words.sort()
assert isinstance(words, list)
And a peek at some of the values.
# Print five words drawn at random (without replacement) from the vocabulary.
peek = random.sample(words, 5)
for word in peek:
    print(word)
shifts solvency downbeat reassurance UFOs
Known Unknowns
We have a labeled vocabulary, but any new documents we encounter might have words that aren't in our vocabulary, in which case we will label them as "unknown". There are, however, some unknowns that we can classify based on certain conditions (like their suffix).
Known Stuff
- Suffixes
# Word endings used to guess the part of speech of an out-of-vocabulary word.
# Each attribute is a list of suffixes for one part-of-speech category.
Suffixes = Namespace(
    noun=[
        "action", "age", "ance", "cy", "dom", "ee", "ence", "er", "hood",
        "ion", "ism", "ist", "ity", "ling", "ment", "ness", "or", "ry",
        "scape", "ship", "ty",
    ],
    verb=["ate", "ify", "ise", "ize"],
    adjective=[
        "able", "ese", "ful", "i", "ian", "ible", "ic", "ish", "ive",
        "less", "ly", "ous",
    ],
    adverb=["ward", "wards", "wise"],
)
- Labels for the Unknowns
# Template for the category-specific unknown labels.
UNKNOWN = "--unknown-{}--"

# One label per recognizable category, plus a generic fallback label.
Label = Namespace(
    **{
        category: UNKNOWN.format(category)
        for category in (
            "digit",
            "punctuation",
            "uppercase",
            "noun",
            "verb",
            "adjective",
            "adverb",
        )
    },
    unknown="--unknown--",
)
- Bundle Them Up
# Everything needed to classify an out-of-vocabulary word, in one place.
Unknown = Namespace()
Unknown.punctuation = set(string.punctuation)
Unknown.suffix = Suffixes
Unknown.label = Label
# Patterns that match a word containing at least one digit / ASCII capital.
Unknown.has_digit = re.compile(r"\d")
Unknown.has_uppercase = re.compile("[A-Z]")
Label the Unknowns
def label_unknown(word: str) -> str:
    """Assign an "unknown" label to a word that is not in the vocabulary

    The checks are applied in this order: contains a digit, contains a
    punctuation character, contains an upper-case letter, then suffix
    matching. For the suffixes the category owning the *longest* matching
    suffix wins, so a word ending in "wise" is labeled as an adverb even
    though it also ends with the verb suffix "ise". If two categories match
    with suffixes of the same length the earlier one (noun, verb, adjective,
    adverb) is used.

    Args:
     word: word not in our vocabulary

    Returns:
     label for the word
    """
    if Unknown.has_digit.search(word):
        return Unknown.label.digit

    # isdisjoint accepts any iterable, so the characters of the word are
    # checked directly without building an intermediate set
    if not Unknown.punctuation.isdisjoint(word):
        return Unknown.label.punctuation

    if Unknown.has_uppercase.search(word):
        return Unknown.label.uppercase

    categories = (
        (Unknown.suffix.noun, Unknown.label.noun),
        (Unknown.suffix.verb, Unknown.label.verb),
        (Unknown.suffix.adjective, Unknown.label.adjective),
        (Unknown.suffix.adverb, Unknown.label.adverb),
    )
    best_length, best_label = 0, Unknown.label.unknown
    for suffixes, label in categories:
        longest = max(
            (len(suffix) for suffix in suffixes if word.endswith(suffix)),
            default=0,
        )
        # strictly greater: on a tie the earlier category keeps the word
        if longest > best_length:
            best_length, best_label = longest, label
    return best_label
# One sample word per rule in label_unknown, ending with a word that should
# match none of them.
candidates = [
    "cow2pig",
    "cow,pig",
    "cowPig",
    f"cowpig{random.choice(Unknown.suffix.noun)}",
    f"cowpig{random.choice(Unknown.suffix.verb)}",
    f"cowpig{random.choice(Unknown.suffix.adjective)}",
    f"cowpig{random.choice(Unknown.suffix.adverb)}",
    "cowdog",
]
for candidate in candidates:
    print(label_unknown(candidate))
--unknown-digit-- --unknown-punctuation-- --unknown-uppercase-- --unknown-noun-- --unknown-verb-- --unknown-adjective-- --unknown-adverb-- --unknown--
Getting Tags
I don't know what the Coursera example is for - they check to see if an already tagged word is in our vocabulary and then clobber the word with an unknown tag if it isn't and return the original tag. There must be a reason for this, but it isn't explained in the notebook so I'm going to do something different. I'm going to assume that the word isn't tagged and we want to tag it.
POS Tag Interpreter
The notebook doesn't say whose tagging system is being used so I'm going to assume that it's the Penn Treebank P.O.S. system. I'll make an interpreter for the tags, since I have no idea what some of them mean.
URL = "https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html"

# Take the first table on the page, using its first row as the column names.
tables = pandas.read_html(URL, header=0)
data = tables[0]

# Map each tag to its human-readable description.
TRANSLATOR = dict(zip(data["Tag"], data["Description"]))
# Drop the empty lines, split what is left on tabs, and build the
# word -> tag lookup (a later occurrence of a word overwrites an earlier one).
cleaned = filter(None, lines)
pairs = map(lambda line: line.split("\t"), cleaned)
VOCABULARY = dict(pairs)
# Placeholders returned in place of (word, tag, description) for an empty line.
EMPTY_LINE = "--n--"
TAG_FOR_EMPTY_LINE = "--s--"
DESCRIPTION_FOR_EMPTY_LINE = "--d--"


def tag_word(word: str, vocabulary: dict, translator: dict) -> tuple:
    """gets the part-of-speech tag for the word

    Args:
     word: the word to tag
     vocabulary: word to tag dictionary
     translator: part of speech tag description

    Returns:
     word, part-of-speech tag, description
    """
    # an empty string stands for an empty line, not a word
    if not word:
        return EMPTY_LINE, TAG_FOR_EMPTY_LINE, DESCRIPTION_FOR_EMPTY_LINE

    # out-of-vocabulary words get a best-guess label and a generic description
    if word not in vocabulary:
        return word, label_unknown(word), Unknown.label.unknown

    # look the tag up once; tags without a description fall back to unknown
    tag = vocabulary[word]
    return word, tag, translator.get(tag, Unknown.label.unknown)
# Pre-bind the vocabulary and the tag descriptions so that only the word
# has to be passed in.
tagger = partial(tag_word, vocabulary=VOCABULARY, translator=TRANSLATOR)
Special Character
# "\n" is a non-empty string, so tag_word treats it as a word rather than as an empty line.
print(tagger("\n"))
('\n', '--unknown--', '--unknown--')
Empty String
# The empty string takes tag_word's early return and yields the empty-line placeholders.
print(tagger(""))
('--n--', '--s--', '--d--')
Known Preposition
# "In" is the first word shown in the head of the data file.
print(tagger("In"))
('In', 'IN', 'Preposition or subordinating conjunction')
Nouns
Noun
# A noun that, per the recorded output, is found in the vocabulary.
print(tagger("bicycle"))
('bicycle', 'NN', 'Noun, singular or mass')
# Per the recorded output this word is not in the vocabulary; it ends with the noun suffix "ence".
print(tagger("flatulence"))
('flatulence', '--unknown-noun--', '--unknown--')
Unknown Unknown
# No digit, punctuation, upper-case letter, or listed suffix; the recorded output shows the generic unknown label.
print(tagger("tardigrade"))
('tardigrade', '--unknown--', '--unknown--')
Verbs
# A verb that, per the recorded output, is found in the vocabulary.
print(tagger("scrutinize"))
('scrutinize', 'VB', 'Verb, base form')
# Per the recorded output this word is not in the vocabulary; it ends with the verb suffix "ize".
print(tagger("euthanize"))
('euthanize', '--unknown-verb--', '--unknown--')
Adjectives
# An adjective that, per the recorded output, is found in the vocabulary.
print(tagger("venerable"))
('venerable', 'JJ', 'Adjective')
# Per the recorded output this word is not in the vocabulary; it ends with the adjective suffix "ous".
print(tagger("malodorous"))
('malodorous', '--unknown-adjective--', '--unknown--')
Adverbs
# An adverb that, per the recorded output, is found in the vocabulary.
print(tagger("backwards"))
('backwards', 'RB', 'Adverb')
# "bitwise" ends with the adverb suffix "wise", which itself ends with the verb suffix "ise".
print(tagger("bitwise"))
('bitwise', '--unknown-verb--', '--unknown--')
End
So, there you have it, a rudimentary way to handle tagging parts of speech for words outside of our vocabulary.