Parts-of-Speech Tagging: The Data

Beginning

Imports

# python
from argparse import Namespace
from collections import Counter

import os
import random
import re
import string

# from pypi
from dotenv import load_dotenv

Set Up

The Environment

# Read the variables defined in the dotenv file into the process environment.
load_dotenv("posts/nlp/.env")

# Each attribute holds the *name* of an environment variable; the value of
# that variable is looked up in ``os.environ`` when a file is opened.
Environment = Namespace(**{
    "training_corpus": "WALL_STREET_JOURNAL_POS",
    "vocabulary": "WALL_STREET_JOURNAL_VOCABULARY",
    "test_corpus": "WALL_STREET_JOURNAL_TEST_POS",
    "test_words": "WALL_STREET_JOURNAL_TEST_WORDS",
})

Middle

The Training Corpus

The training corpus is made up of words and parts-of-speech tags from the Wall Street Journal (previously looked at in this post).

# Read the whole tagged corpus and keep one entry per line of the file.
training_path = os.environ[Environment.training_corpus]
with open(training_path) as reader:
    training_corpus = reader.read().split("\n")

# Peek at the first few rows.
print(*(f" - {row}" for row in training_corpus[:5]), sep="\n")
- In    IN
- an    DT
- Oct.  NNP
- 19    CD
- review        NN

The Vocabulary

There is also a pre-processed version of that same file that has only the words without the parts-of-speech tags that we can load. It has been altered slightly as well:

  • Words that appear only once have been removed
  • Words that don't exist in the original have been added so that there are some unknown words to handle.
# Read the pre-processed word list, one entry per line of the file.
vocabulary_path = os.environ[Environment.vocabulary]
with open(vocabulary_path) as reader:
    vocabulary_words = reader.read().split("\n")

# Peek at the first few entries.
print(*(f" - {row}" for row in vocabulary_words[:5]), sep="\n")
- !
- #
- $
- %
- &

Odd.

# The head of the list is all punctuation, so look at a random handful instead.
print(*(f" - {row}" for row in random.sample(vocabulary_words, 5)), sep="\n")
- cabinet
- Byrd
- done
- Fueling
- investments

Our actual vocabulary is going to be a dictionary mapping each word to its index in the list we just loaded.

# Sort first so that each word's index is its rank in the ordered word list.
vocabulary_words = sorted(vocabulary_words)
vocabulary = {key: index for index, key in enumerate(vocabulary_words)}
assert len(vocabulary) == len(vocabulary_words)

# Words that show up more than once in the training corpus; whatever the
# vocabulary holds beyond these did not come from the corpus itself.
training_counter = Counter(row.split("\t")[0] for row in training_corpus)
filtered = {key for key, value in training_counter.items() if value > 1}
extra = set(vocabulary) - filtered
print(f"{len(extra):,}")

# ``random.sample`` needs a sequence: passing a set was deprecated in
# Python 3.9 and raises a TypeError from 3.11 on.
for item in random.sample(sorted(extra), 5):
    print(f" - {item}")
9
 - --unk_adj--
 - --n--
 - --unk--
 - --unk_noun--
 - --unk_adv--

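So it looks like the extra "unknown" entries are placeholder tokens that were added when the vocabulary file was pre-processed, rather than words from the corpus.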

The Test Corpus

This is another list of words taken from the Wall Street Journal with parts-of-speech tags added that we'll use as the test-set.

# Read the tagged test-set and keep one entry per line of the file.
test_path = os.environ[Environment.test_corpus]
with open(test_path) as reader:
    test_corpus = reader.read().split("\n")

The Test Vocabulary

There is also a set of words that we're going to try and tag. These need to be pre-processed.

Handle Empty

We are going to replace empty lines with a special token.

def handle_empty(words: list, empty_token="--n--"):
    """Swap blank entries for a placeholder token

    Args:
     words: iterable of strings to process
     empty_token: what to emit in place of an empty or whitespace-only string

    Yields:
     the original word, or ``empty_token`` if the word was blank
    """
    for word in words:
        # a word made up only of whitespace counts as empty too
        yield word if word.strip() else empty_token

Labeling Unknowns

  • Suffixes
    # Word endings used to guess the part of speech of an unknown word.
    Suffixes = Namespace(**{
        "noun": [
            "action", "age", "ance", "cy", "dom", "ee", "ence",
            "er", "hood", "ion", "ism", "ist", "ity", "ling",
            "ment", "ness", "or", "ry", "scape", "ship", "ty",
        ],
        "verb": ["ate", "ify", "ise", "ize"],
        "adjective": [
            "able", "ese", "ful", "i", "ian", "ible",
            "ic", "ish", "ive", "less", "ly", "ous",
        ],
        "adverb": ["ward", "wards", "wise"],
    })
    
  • Labels for the Unknowns
    # Template for the tokens that stand in for an unknown word of each kind.
    UNKNOWN = "--unknown-{}--"
    Label = Namespace(
        **{
            kind: UNKNOWN.format(kind)
            for kind in ("digit", "punctuation", "uppercase", "noun",
                         "verb", "adjective", "adverb")
        },
        # the fallback when none of the specific checks match
        unknown="--unknown--",
    )
    
  • Bundle Them Up
    # Everything the unknown-word labeler needs, gathered in one place.
    Unknown = Namespace(**{
        "punctuation": set(string.punctuation),
        "suffix": Suffixes,
        "label": Label,
        "has_digit": re.compile(r"\d"),
        "has_uppercase": re.compile("[A-Z]"),
    })
    
    def label_unknowns(words: str, vocabulary: set) -> str:
        """Swap out-of-vocabulary words for an unknown-word label

        The checks run in a fixed order and the first one that matches wins:
        digit, punctuation, uppercase, then the noun, verb, adjective and
        adverb suffixes. A word matching none of them gets the generic label.

        Args:
         words: iterable of words to check
         vocabulary: something to check if a word is known

        Yields:
         the word itself if it is known, otherwise the label for it
        """
        for word in words:
            if word in vocabulary:
                token = word
            elif Unknown.has_digit.search(word):
                token = Unknown.label.digit
            elif Unknown.punctuation & set(word):
                token = Unknown.label.punctuation
            elif Unknown.has_uppercase.search(word):
                token = Unknown.label.uppercase
            elif word.endswith(tuple(Unknown.suffix.noun)):
                token = Unknown.label.noun
            elif word.endswith(tuple(Unknown.suffix.verb)):
                token = Unknown.label.verb
            elif word.endswith(tuple(Unknown.suffix.adjective)):
                token = Unknown.label.adjective
            elif word.endswith(tuple(Unknown.suffix.adverb)):
                token = Unknown.label.adverb
            else:
                token = Unknown.label.unknown
            yield token
    

The Pre-Processor

def preprocess(words: list, vocabulary: set) -> list:
    """Strip the words, mark blanks and label unknown words

    Args:
     words: words to pre-process
     vocabulary: something to check if a word is known

    Returns:
     words with empty lines and unknown words labeled
    """
    stripped = (word.strip() for word in words)
    return list(label_unknowns(handle_empty(stripped), vocabulary))

Load the Test Words

with open(os.environ[Environment.test_words]) as reader:
    # One entry per line. Without the split ``test_words`` is a single string
    # and ``preprocess`` walks it character by character.
    test_words = reader.read().split("\n")
before = len(test_words)
test_words = preprocess(test_words, vocabulary)
# pre-processing relabels entries but must never add or drop any
assert len(test_words) == before

print(f"{len(test_words):,}")
for word in random.sample(test_words, 5):
    print(f" - {word}")
180,264
 - --unknown--
 - --n--
 - e
 - r
 - --unknown--

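Weird that the letters "e" and "r" are known words… Single characters showing up here, together with a count (180,264) far above the 34,200 test words the loader reports further down, suggest the text was processed character by character rather than line by line.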

End

So, that's our data. To make it replicable for the next section I'm going to tangle it out.

The Code

Imports

# from python
from argparse import Namespace

import os
import re
import string

# pypi
import attr

The Environment

I don't really know that I should save these keys, but I don't really want to cut and paste all the time…

# Each attribute holds the *name* of an environment variable, not a path.
Environment = Namespace()
Environment.training_corpus = "WALL_STREET_JOURNAL_POS"
Environment.vocabulary = "WALL_STREET_JOURNAL_VOCABULARY"
Environment.test_corpus = "WALL_STREET_JOURNAL_TEST_POS"
Environment.test_words = "WALL_STREET_JOURNAL_TEST_WORDS"

The Suffixes

# Word endings used to guess the part of speech of an unknown word.
Suffixes = Namespace()
Suffixes.noun = [
    "action", "age", "ance", "cy", "dom", "ee", "ence",
    "er", "hood", "ion", "ism", "ist", "ity", "ling",
    "ment", "ness", "or", "ry", "scape", "ship", "ty",
]
Suffixes.verb = ["ate", "ify", "ise", "ize"]
Suffixes.adjective = [
    "able", "ese", "ful", "i", "ian", "ible",
    "ic", "ish", "ive", "less", "ly", "ous",
]
Suffixes.adverb = ["ward", "wards", "wise"]

The Label

# Template for the tokens that stand in for an unknown word of each kind.
UNKNOWN = "--unknown-{}--"
Label = Namespace(
    **{
        kind: UNKNOWN.format(kind)
        for kind in ("digit", "punctuation", "uppercase", "noun",
                     "verb", "adjective", "adverb")
    },
    # the fallback when none of the specific checks match
    unknown="--unknown--",
)

Theirs To Ours

The vocabulary file has their unknown tags already inserted, which don't match mine, so rather than redoing everything I'm going to translate theirs to match mine.

THEIR_UNKNOWN = "--unk_{}--"

# Maps each unknown-word token used in the vocabulary file to the matching
# label defined in this module.
THEIRS_TO_MINE = {"--unk--": "--unknown--"}
THEIRS_TO_MINE.update(
    (THEIR_UNKNOWN.format(theirs), mine)
    for theirs, mine in (
        ("digit", Label.digit),
        ("punct", Label.punctuation),
        ("upper", Label.uppercase),
        ("noun", Label.noun),
        ("verb", Label.verb),
        ("adj", Label.adjective),
        ("adv", Label.adverb),
    )
)

The Unknown

# Everything the unknown-word labelers need, gathered in one place.
Unknown = Namespace(**{
    "punctuation": set(string.punctuation),
    "suffix": Suffixes,
    "label": Label,
    "has_digit": re.compile(r"\d"),
    "has_uppercase": re.compile("[A-Z]"),
})

The Empty

# Placeholder word and tag emitted in place of an empty corpus line.
Empty = Namespace(**{"word": "--n--", "tag": "--s--"})

The Corpus Pre-Processor

@attr.s(auto_attribs=True)
class CorpusProcessor:
    """Pre-processes the corpus

    Calling an instance splits each corpus line on whitespace, replaces empty
    lines with the ``Empty`` word and tag, and swaps words that are not in
    the vocabulary for an unknown-word label.

    Args:
     vocabulary: holder of our known words
    """
    # only membership (``word in vocabulary``) is checked by the methods
    vocabulary: dict
  • Split Tuples
    def split_tuples(self, lines: list):
        """Splits each corpus line on whitespace

        Args:
         lines: iterable of lines from the corpus

        Yields:
         list of the whitespace-separated tokens in the line (empty for a blank line)
        """
        yield from (line.split() for line in lines)
    
  • Handle Empty Lines
    def handle_empty(self, tuples: list):
        """Fills in a placeholder for lines that had no tokens

        Args:
         tuples: the split corpus lines

        Yields:
         the line unchanged, or (``Empty.word``, ``Empty.tag``) if it was empty
        """
        for line in tuples:
            yield line if line else (Empty.word, Empty.tag)
    
  • Handle Unknowns
    def label_unknowns(self, tuples: list) -> str:
        """Swaps out-of-vocabulary words for an unknown-word label

        The checks run in a fixed order and the first one that matches wins:
        digit, punctuation, uppercase, then the noun, verb, adjective and
        adverb suffixes. A word matching none of them gets the generic label.
        The tag is always passed through unchanged.

        Args:
         tuples: word, tag tuples

        Yields:
         (word (or label for the word if unknown), tag) tuple
        """
        for word, tag in tuples:
            if word in self.vocabulary:
                yield word, tag
                continue

            labels, suffixes = Unknown.label, Unknown.suffix
            if Unknown.has_digit.search(word):
                label = labels.digit
            elif not Unknown.punctuation.isdisjoint(word):
                label = labels.punctuation
            elif Unknown.has_uppercase.search(word):
                label = labels.uppercase
            elif word.endswith(tuple(suffixes.noun)):
                label = labels.noun
            elif word.endswith(tuple(suffixes.verb)):
                label = labels.verb
            elif word.endswith(tuple(suffixes.adjective)):
                label = labels.adjective
            elif word.endswith(tuple(suffixes.adverb)):
                label = labels.adverb
            else:
                label = labels.unknown
            yield label, tag
    
  • The Call
    def __call__(self, tuples: list) -> list:
        """Runs the corpus lines through every pre-processing step

        Args:
         tuples: the corpus lines (each is split into a word and tag here)

        Returns:
         list of (word, tag) tuples with empty lines and unknown words replaced
        """
        split = self.split_tuples(tuples)
        filled = self.handle_empty(split)
        return list(self.label_unknowns(filled))
    

The Data Pre-Processor

@attr.s(auto_attribs=True)
class DataPreprocessor:
    """A pre-processor for the data

    Calling an instance strips each word, replaces empty strings with
    ``Empty.word`` and swaps words that are not in the vocabulary for an
    unknown-word label.

    Args:
     vocabulary: holder of our known words
    """
    # only membership (``word in vocabulary``) is checked by the methods
    vocabulary: dict
  • Handle Empty Lines
    def handle_empty(self, words: list):
        """Replaces blank strings with ``Empty.word``

        Args:
         words: iterable of strings to process

        Yields:
         the original word, or ``Empty.word`` if it was empty or only whitespace
        """
        for word in words:
            yield word if word.strip() else Empty.word
    
  • Label Unknowns
    def label_unknowns(self, words: list) -> str:
        """Swaps out-of-vocabulary words for an unknown-word label

        The checks run in a fixed order and the first one that matches wins:
        digit, punctuation, uppercase, then the noun, verb, adjective and
        adverb suffixes. A word matching none of them gets the generic label.

        Args:
         words: iterable of words to check

        Yields:
         word or label for the word if unknown
        """
        for word in words:
            if word in self.vocabulary:
                token = word
            elif Unknown.has_digit.search(word):
                token = Unknown.label.digit
            elif Unknown.punctuation.intersection(word):
                token = Unknown.label.punctuation
            elif Unknown.has_uppercase.search(word):
                token = Unknown.label.uppercase
            elif any(map(word.endswith, Unknown.suffix.noun)):
                token = Unknown.label.noun
            elif any(map(word.endswith, Unknown.suffix.verb)):
                token = Unknown.label.verb
            elif any(map(word.endswith, Unknown.suffix.adjective)):
                token = Unknown.label.adjective
            elif any(map(word.endswith, Unknown.suffix.adverb)):
                token = Unknown.label.adverb
            else:
                token = Unknown.label.unknown
            yield token
    
  • The Call
    def __call__(self, words: list) -> list:
        """Runs the words through every pre-processing step

        Args:
         words: list of words to process

        Returns:
         the stripped words with blanks and unknown words replaced
        """
        stripped = (word.strip() for word in words)
        return list(self.label_unknowns(self.handle_empty(stripped)))
    

The Data Loader

@attr.s(auto_attribs=True)
class DataLoader:
    """Loads the training and test data

    Args:
     environment: namespace with keys for the environment to load paths
    """
    environment: Namespace=Environment
    # Caches for the lazily-built properties of the same name (minus the
    # underscore); ``None`` means the value has not been built yet.
    _preprocess: DataPreprocessor=None
    _vocabulary_words: list=None
    _vocabulary: dict=None
    _training_corpus: list=None
    _training_data: dict=None
    _processed_training: list=None
    _test_data_raw: list=None
    _test_data_tuples: list=None
    # NOTE(review): no property in view reads ``_test_data`` - confirm it is still needed
    _test_data: dict=None
    _test_words: list=None

The Preprocessor

@property
def preprocess(self) -> DataPreprocessor:
    """The pre-processor for the test words (built on first use)"""
    if self._preprocess is not None:
        return self._preprocess
    self._preprocess = DataPreprocessor(self.vocabulary)
    return self._preprocess

The Vocabulary Words

@property
def vocabulary_words(self) -> list:
    """The list of vocabulary words for training

    The unknown-word tokens found in the file are translated to this
    module's labels using ``THEIRS_TO_MINE``.

    Note:
     This is ``hmm_vocab.txt``
    """
    if self._vocabulary_words is None:
        path = os.environ[self.environment.vocabulary]
        self._vocabulary_words = [
            THEIRS_TO_MINE.get(word, word) for word in self.load(path)
        ]
    return self._vocabulary_words

The Vocabulary

@property
def vocabulary(self) -> dict:
    """Converts the vocabulary list of words to a dict

    Returns:
     <word>: <index of the word in the sorted vocabulary words> dictionary
    """
    if self._vocabulary is None:
        ordered = sorted(self.vocabulary_words)
        self._vocabulary = dict(zip(ordered, range(len(ordered))))
    return self._vocabulary

The Training Corpus

@property
def training_corpus(self) -> list:
    """The raw lines of the training corpus

    Note:
     ``WSJ_02_20.pos`` lines (<word><tab><pos> all as one string)
    """
    if self._training_corpus is None:
        path = os.environ[self.environment.training_corpus]
        self._training_corpus = self.load(path)
    return self._training_corpus

The Training Data

@property
def training_data(self) -> dict:
    """The word-tag training data

    Lines that do not split into exactly two tokens are skipped, and a word
    that appears more than once keeps the tag from its last line.

    Note:
     this is the ``training_corpus`` converted to a <word>:<pos> dictionary
    """
    if self._training_data is None:
        split_lines = (line.split() for line in self.training_corpus)
        self._training_data = {
            tokens[0]: tokens[1] for tokens in split_lines if len(tokens) == 2
        }
    return self._training_data

Processed Training

@property
def processed_training(self) -> list:
    """Pre-processes the training corpus

    Note:
     ``training_corpus`` converted to (word, tag) tuples with unknown tags added
    """
    if self._processed_training is not None:
        return self._processed_training
    process = CorpusProcessor(self.vocabulary)
    self._processed_training = process(self.training_corpus)
    return self._processed_training

The Raw Test Corpus

@property
def test_data_raw(self) -> list:
    """The lines for testing

    Note:
     The assignment expects the lines to be un-processed so this is just the
     raw lines from ``WSJ_24.pos``
    """
    if self._test_data_raw is None:
        path = os.environ[self.environment.test_corpus]
        self._test_data_raw = self.load(path)
    return self._test_data_raw

The Raw Test Tuples

@property
def test_data_tuples(self) -> list:
    """The test data lines split on whitespace

    Note:
     this is the test_data_raw with the lines split
    """
    if self._test_data_tuples is not None:
        return self._test_data_tuples
    self._test_data_tuples = [line.split() for line in self.test_data_raw]
    return self._test_data_tuples

Test Words

@property
def test_words(self) -> list:
    """The pre-processed test words

    Note:
     ``test.words`` with some pre-processing done
    """
    if self._test_words is None:
        # Only cache the finished result: storing the raw lines first would
        # leave un-processed words in the cache if ``preprocess`` raised.
        raw_words = self.load(os.environ[self.environment.test_words])
        self._test_words = self.preprocess(raw_words)
    return self._test_words

Load Method

def load(self, path: str, keep_empty: bool=True) -> list:
    """Reads a text file and splits it on newlines

    Args:
     path: path to the text file
     keep_empty: keep empty lines if true

    Returns:
     list of lines from the file
    """
    with open(path) as reader:
        lines = reader.read().split("\n")
    if keep_empty:
        return lines
    return [line for line in lines if line]

Test it Out

from neurotic.nlp.parts_of_speech.preprocessing import Environment, DataLoader

loader = DataLoader()
# NOTE(review): ``loader.vocabulary_words`` has the "--unk..." tokens
# translated to the "--unknown..." form, so this equality looks like it would
# fail on those entries - confirm against the tangled module.
for theirs, mine in zip(vocabulary_words, loader.vocabulary_words):
    assert theirs == mine
for theirs, mine in zip(vocabulary, loader.vocabulary):
    assert vocabulary[theirs] == loader.vocabulary[mine]
loader = DataLoader(Environment)

print(f"{len(loader.vocabulary_words):,}")
print(random.sample(loader.vocabulary_words, 2))
assert len(loader.vocabulary_words) == len(vocabulary_words)

print(f"\n{len(loader.training_corpus):,}")
assert (len(loader.training_corpus)) == len(training_corpus)
print(random.sample(loader.training_corpus, 2))

print(f"\n{len(loader.vocabulary):,}")
assert len(loader.vocabulary) == len(vocabulary)

# The loader exposes the raw test-set lines as ``test_data_raw``; the class
# has no ``test_corpus`` attribute.
print(f"\n{len(loader.test_data_raw):,}")
assert len(loader.test_data_raw) == len(test_corpus)
print(random.sample(loader.test_data_raw, 2))


print(f"\n{len(loader.test_words):,}")
print(random.sample(loader.test_words, 2))
23,777
['Island', 'Gibbons']

989,861
['nine\tCD', 'in\tIN']

23,777

34,200
['at\tIN', 'to\tTO']

34,200
['the', ';']

Re-Test

I'm trying to troubleshoot differences between my output and the original notebook.

# Lookups used by ``assign_unk``.
punct = set(string.punctuation)

noun_suffix = ("action age ance cy dom ee ence er hood ion ism ist ity ling "
               "ment ness or ry scape ship ty").split()
verb_suffix = "ate ify ise ize".split()
adj_suffix = "able ese ful i ian ible ic ish ive less ly ous".split()
adv_suffix = "ward wards wise".split()
def preprocess(vocab, data_fp):
    """Preprocess the words in a file the way the original notebook does

    Args:
     vocab: container of the known words
     data_fp: path to a text file with one word per line

    Returns:
     (orig, prep): the stripped lines and their pre-processed counterparts
    """
    orig = []
    prep = []

    with open(data_fp, "r") as data_file:
        for line in data_file:
            word = line.strip()
            orig.append(word)

            if not line.split():
                # blank line: end of sentence
                prep.append("--n--")
            elif word not in vocab:
                # The raw line (trailing newline included) is passed on
                # purpose: it looks like a bug, but it is what their code
                # does and this function exists to reproduce their output.
                prep.append(assign_unk(line))
            else:
                prep.append(word)

    # Sanity check: one entry per line of the file. The file is re-read in a
    # context manager so that the handle gets closed.
    with open(data_fp, "r") as data_file:
        line_count = sum(1 for _ in data_file)
    assert len(orig) == line_count
    assert len(prep) == line_count

    return orig, prep
def assign_unk(tok):
    """Pick the unknown-word token for a word

    The checks run in order and the first match wins: digit, punctuation,
    upper-case, then the noun, verb, adjective and adverb suffixes.

    Args:
     tok: the word to label

    Returns:
     the matching ``--unk_*--`` token, or ``--unk--`` if nothing matched
    """
    if any(map(str.isdigit, tok)):
        return "--unk_digit--"
    if not punct.isdisjoint(tok):
        return "--unk_punct--"
    if any(map(str.isupper, tok)):
        return "--unk_upper--"

    # suffix checks, most specific label first
    by_suffix = (
        (noun_suffix, "--unk_noun--"),
        (verb_suffix, "--unk_verb--"),
        (adj_suffix, "--unk_adj--"),
        (adv_suffix, "--unk_adv--"),
    )
    for suffixes, label in by_suffix:
        if tok.endswith(tuple(suffixes)):
            return label
    return "--unk--"
# a fresh loader so nothing cached earlier is carried over
loader = DataLoader(environment=Environment)

The Vocabulary

UNKNOWN = "--unknown-{}--"
THEIR_UNKNOWN = "--unk_{}--"

# their unknown-word tokens mapped to the ones the loader uses
THEIRS_TO_MINE = {
    "--unk--": "--unknown--",
    THEIR_UNKNOWN.format("digit"): UNKNOWN.format("digit"),
    THEIR_UNKNOWN.format("punct"): UNKNOWN.format("punctuation"),
    THEIR_UNKNOWN.format("upper"): UNKNOWN.format("uppercase"),
    THEIR_UNKNOWN.format("noun"): UNKNOWN.format("noun"),
    THEIR_UNKNOWN.format("verb"): UNKNOWN.format("verb"),
    THEIR_UNKNOWN.format("adj"): UNKNOWN.format("adjective"),
    THEIR_UNKNOWN.format("adv"): UNKNOWN.format("adverb"),
}

with open(os.environ[Environment.vocabulary]) as reader:
    voc_l = reader.read().split("\n")

# once translated, every raw word should equal the loader's word at that spot
for index, (theirs, mine) in enumerate(zip(voc_l, loader.vocabulary_words)):
    assert THEIRS_TO_MINE.get(theirs, theirs) == mine, (index, theirs, mine)

Vocabulary Word

They include empty strings so the dictionaries won't match exactly.

# their vocabulary: each word mapped to its position in the sorted word list
vocab = {word: index for index, word in enumerate(sorted(voc_l))}

# every one of their words, once translated, should be in the loader's vocabulary
for word in vocab:
    assert THEIRS_TO_MINE.get(word, word) in loader.vocabulary, (word)

The Preprocessed Test Words

original, prep = preprocess(vocab, os.environ[Environment.test_words])

# report every position where their token (translated) differs from the loader's
for index, (theirs, mine) in enumerate(zip(prep, loader.test_words)):
    if THEIRS_TO_MINE.get(theirs, theirs) != mine:
        token = original[index]
        print(index, theirs, mine, token, assign_unk(token))
9 --unk-- --unknown-noun-- vantage --unk_noun--
228 --unk-- --unknown-verb-- exacerbate --unk_verb--
845 --unk-- --unknown-adjective-- relentless --unk_adj--
896 --unk-- --unknown-adjective-- slickly --unk_adj--
906 --unk-- --unknown-adjective-- cognoscenti --unk_adj--
919 --unk-- --unknown-noun-- depiction --unk_noun--
1076 --unk-- --unknown-adjective-- perfidious --unk_adj--
1160 --unk-- --unknown-adjective-- innocently --unk_adj--
1196 --unk-- --unknown-adjective-- loutish --unk_adj--
1244 --unk-- --unknown-verb-- premise --unk_verb--
1268 --unk-- --unknown-adjective-- conspicuously --unk_adj--
1331 --unk-- --unknown-noun-- ignition --unk_noun--
1347 --unk-- --unknown-noun-- obligatory --unk_noun--
1448 --unk-- --unknown-adjective-- lavishly --unk_adj--
1468 --unk-- --unknown-adjective-- palatable --unk_adj--
1564 --unk-- --unknown-adjective-- inconsiderable --unk_adj--
1678 --unk-- --unknown-adjective-- discreetly --unk_adj--
1774 --unk-- --unknown-adjective-- marvelously --unk_adj--
1855 --unk-- --unknown-adjective-- passable --unk_adj--
1970 --unk-- --unknown-noun-- diaper --unk_noun--
3248 --unk-- --unknown-adjective-- comparably --unk_adj--
3672 --unk-- --unknown-noun-- imperialism --unk_noun--
3791 --unk-- --unknown-noun-- livelier --unk_noun--
3793 --unk-- --unknown-noun-- bolder --unk_noun--
4132 --unk-- --unknown-noun-- intimidation --unk_noun--
4648 --unk-- --unknown-adjective-- politic --unk_adj--
4661 --unk-- --unknown-verb-- mutate --unk_verb--
4682 --unk-- --unknown-noun-- aspersion --unk_noun--
4777 --unk-- --unknown-adjective-- utopian --unk_adj--
4802 --unk-- --unknown-adjective-- psychic --unk_adj--
4892 --unk-- --unknown-noun-- coercion --unk_noun--
5043 --unk-- --unknown-noun-- centrist --unk_noun--
5090 --unk-- --unknown-noun-- schooling --unk_noun--
5131 --unk-- --unknown-noun-- voucher --unk_noun--
5270 --unk-- --unknown-noun-- recruitment --unk_noun--
5316 --unk-- --unknown-noun-- expansionism --unk_noun--
5429 --unk-- --unknown-verb-- palate --unk_verb--
5440 --unk-- --unknown-noun-- engorgement --unk_noun--
5557 --unk-- --unknown-noun-- patronage --unk_noun--
5912 --unk-- --unknown-noun-- volunteerism --unk_noun--
5990 --unk-- --unknown-adjective-- utopian --unk_adj--
6709 --unk-- --unknown-noun-- citizenship --unk_noun--
6738 --unk-- --unknown-noun-- abomination --unk_noun--
6793 --unk-- --unknown-adjective-- reflexively --unk_adj--
6809 --unk-- --unknown-noun-- regimentation --unk_noun--
6876 --unk-- --unknown-noun-- compulsory --unk_noun--
7150 --unk-- --unknown-adjective-- arguably --unk_adj--
7192 --unk-- --unknown-noun-- compulsion --unk_noun--
7208 --unk-- --unknown-adjective-- unenforceable --unk_adj--
7374 --unk-- --unknown-noun-- recruitment --unk_noun--
7468 --unk-- --unknown-adjective-- challengeable --unk_adj--
7591 --unk-- --unknown-adjective-- unprecedentedly --unk_adj--
7657 --unk-- --unknown-adjective-- cooperatively --unk_adj--
8182 --unk-- --unknown-noun-- reticence --unk_noun--
8772 --unk-- --unknown-noun-- render --unk_noun--
11071 --unk-- --unknown-noun-- breather --unk_noun--
11942 --unk-- --unknown-verb-- unify --unk_verb--
12859 --unk-- --unknown-verb-- frigate --unk_verb--
12907 --unk-- --unknown-noun-- disarmament --unk_noun--
12936 --unk-- --unknown-noun-- affinity --unk_noun--
13240 --unk-- --unknown-verb-- patriarchate --unk_verb--
13438 --unk-- --unknown-noun-- underselling --unk_noun--
13650 --unk-- --unknown-adjective-- wrathful --unk_adj--
13725 --unk-- --unknown-noun-- gist --unk_noun--
14081 --unk-- --unknown-noun-- segmentation --unk_noun--
14110 --unk-- --unknown-adjective-- brightly --unk_adj--
15331 --unk-- --unknown-noun-- connector --unk_noun--
17846 --unk-- --unknown-noun-- readjustment --unk_noun--
18397 --unk-- --unknown-noun-- observation --unk_noun--
18410 --unk-- --unknown-noun-- ticker --unk_noun--
18566 --unk-- --unknown-noun-- trickery --unk_noun--
18750 --unk-- --unknown-adjective-- supersonic --unk_adj--
18854 --unk-- --unknown-noun-- existance --unk_noun--
18885 --unk-- --unknown-adjective-- thankfully --unk_adj--
19011 --unk-- --unknown-noun-- coherence --unk_noun--
19304 --unk-- --unknown-adjective-- autocratic --unk_adj--
19305 --unk-- --unknown-noun-- pseudosocialism --unk_noun--
19326 --unk-- --unknown-noun-- encircling --unk_noun--
19389 --unk-- --unknown-noun-- miscegenation --unk_noun--
19470 --unk-- --unknown-adjective-- ostensible --unk_adj--
19479 --unk-- --unknown-adjective-- purportedly --unk_adj--
19554 --unk-- --unknown-noun-- accuser --unk_noun--
19568 --unk-- --unknown-noun-- embezzler --unk_noun--
19961 --unk-- --unknown-adjective-- unwittingly --unk_adj--
19970 --unk-- --unknown-noun-- platter --unk_noun--
20129 --unk-- --unknown-noun-- centrist --unk_noun--
20384 --unk-- --unknown-adjective-- improbable --unk_adj--
20472 --unk-- --unknown-adjective-- glaringly --unk_adj--
20493 --unk-- --unknown-noun-- clarity --unk_noun--
20496 --unk-- --unknown-noun-- rectification --unk_noun--
20539 --unk-- --unknown-noun-- wiser --unk_noun--
21215 --unk-- --unknown-adjective-- concurrently --unk_adj--
21310 --unk-- --unknown-verb-- revolutionize --unk_verb--
22257 --unk-- --unknown-noun-- compensatory --unk_noun--
22270 --unk-- --unknown-noun-- respiratory --unk_noun--
23603 --unk-- --unknown-noun-- milion --unk_noun--
23928 --unk-- --unknown-noun-- poultry --unk_noun--
23947 --unk-- --unknown-noun-- cessation --unk_noun--
24005 --unk-- --unknown-noun-- mothballing --unk_noun--
24168 --unk-- --unknown-noun-- synthesizer --unk_noun--
24195 --unk-- --unknown-adjective-- distinctly --unk_adj--
24280 --unk-- --unknown-noun-- synthesizer --unk_noun--
24293 --unk-- --unknown-adjective-- robotic --unk_adj--
24294 --unk-- --unknown-noun-- flatness --unk_noun--
24861 --unk-- --unknown-noun-- carnage --unk_noun--
25248 --unk-- --unknown-adjective-- frantic --unk_adj--
25252 --unk-- --unknown-noun-- hammer --unk_noun--
26001 --unk-- --unknown-noun-- whisper --unk_noun--
27468 --unk-- --unknown-noun-- prophecy --unk_noun--
27538 --unk-- --unknown-noun-- encircling --unk_noun--
27716 --unk-- --unknown-verb-- realestate --unk_verb--
27994 --unk-- --unknown-noun-- insolvency --unk_noun--
28110 --unk-- --unknown-adjective-- forceful --unk_adj--
28195 --unk-- --unknown-adjective-- zealous --unk_adj--
28485 --unk-- --unknown-adjective-- belatedly --unk_adj--
29370 --unk-- --unknown-verb-- duplicate --unk_verb--
29902 --unk-- --unknown-adjective-- reptilian --unk_adj--
30078 --unk-- --unknown-noun-- industrialization --unk_noun--
31292 --unk-- --unknown-adjective-- vociferous --unk_adj--
31303 --unk-- --unknown-adjective-- powerless --unk_adj--
31549 --unk-- --unknown-noun-- provocation --unk_noun--
31648 --unk-- --unknown-adjective-- archaic --unk_adj--
31700 --unk-- --unknown-noun-- lament --unk_noun--
32149 --unk-- --unknown-adjective-- tantalizingly --unk_adj--
32393 --unk-- --unknown-noun-- hammer --unk_noun--
33208 --unk-- --unknown-adjective-- defiantly --unk_adj--
33238 --unk-- --unknown-noun-- rickety --unk_noun--
33280 --unk-- --unknown-noun-- unionist --unk_noun--
33336 --unk-- --unknown-noun-- dapper --unk_noun--
33721 --unk-- --unknown-adjective-- repressive --unk_adj--
33832 --unk-- --unknown-noun-- disillusionment --unk_noun--
33861 --unk-- --unknown-noun-- agitation --unk_noun--
# list the unknown-word tokens that ended up in the loader's vocabulary
for word in filter(lambda entry: entry.startswith("--u"), loader.vocabulary_words):
    print(word)
--unknown--
--unknown-adjective--
--unknown-adverb--
--unknown-digit--
--unknown-noun--
--unknown-punctuation--
--unknown-uppercase--
--unknown-verb--