Parts-of-Speech Tagging: The Data
Table of Contents
- Beginning
- Middle
- End
- The Code
- Imports
- The Environment
- The Suffixes
- The Label
- Theirs To Ours
- The Unknown
- The Empty
- The Corpus Pre-Processor
- The Data Pre-Processor
- The Data Loader
- The Preprocessor
- The Vocabulary Words
- The Vocabulary
- The Training Corpus
- The Training Data
- Processed Training
- The Raw Test Corpus
- The Raw Test Tuples
- Test Words
- Load Method
- Test it Out
- Re-Test
- The Code
Beginning
Imports
# python
from argparse import Namespace
from collections import Counter
import os
import random
import re
import string
# from pypi
from dotenv import load_dotenv
Set Up
The Environment
load_dotenv("posts/nlp/.env")
Environment = Namespace(
training_corpus="WALL_STREET_JOURNAL_POS",
vocabulary="WALL_STREET_JOURNAL_VOCABULARY",
test_corpus= "WALL_STREET_JOURNAL_TEST_POS",
test_words="WALL_STREET_JOURNAL_TEST_WORDS",
)
Middle
The Training Corpus
The training corpus is made up of words and parts-of-speech tags from the Wall Street Journal (previously looked at in this post).
with open(os.environ[Environment.training_corpus]) as reader:
training_corpus = reader.read().split("\n")
for row in training_corpus[:5]:
print(f" - {row}")
 - In	IN
 - an	DT
 - Oct.	NNP
 - 19	CD
 - review	NN
The Vocabulary
There is also a pre-processed version of that same file that contains only the words, without the parts-of-speech tags. It has been altered slightly as well:
- Words that appear only once have been removed
- Words that don't exist in the original have been added so that there are some unknown words to handle.
with open(os.environ[Environment.vocabulary]) as reader:
vocabulary_words = reader.read().split("\n")
for row in vocabulary_words[:5]:
print(f" - {row}")
 - !
 - #
 - $
 - %
 - &
Odd.
for row in random.sample(vocabulary_words, 5):
print(f" - {row}")
 - cabinet
 - Byrd
 - done
 - Fueling
 - investments
Our actual vocabulary is going to be a dictionary mapping each word to its index in the sorted version of the list we just loaded.
vocabulary_words = sorted(vocabulary_words)
vocabulary = {key: index for index, key in enumerate(vocabulary_words)}
assert len(vocabulary) == len(vocabulary_words)
training_counter = Counter([row.split("\t")[0] for row in training_corpus])
filtered = set(key for key, value in training_counter.items() if value > 1)
extra = set(vocabulary) - filtered
print(f"{len(extra):,}")
for item in random.sample(list(extra), 5):
print(f" - {item}")
9
 - --unk_adj--
 - --n--
 - --unk--
 - --unk_noun--
 - --unk_adv--
So, it looks like the "unknowns" are pre-processing tokens.
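To sanity-check that description, here's a quick sketch (not the original preparation script) that rebuilds a comparable vocabulary from the counts above: the training words that appear more than once plus the special tokens we just found.
# a guess at how the vocabulary file was prepared
rebuilt = sorted(filtered | extra)
print(f"rebuilt: {len(rebuilt):,}  loaded: {len(vocabulary_words):,}")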
The Test Corpus
This is another list of words taken from the Wall Street Journal with parts-of-speech tags added that we'll use as the test-set.
with open(os.environ[Environment.test_corpus]) as reader:
test_corpus = reader.read().split("\n")
The Test Vocabulary
There is also a set of words that we're going to try and tag. These need to be pre-processed.
Handle Empty
We are going to replace empty lines with a special token.
def handle_empty(words: list, empty_token="--n--"):
    """replace empty strings with empty_token

    Args:
     words: list to process
     empty_token: what to replace empty strings with

    Yields:
     processed words
    """
    for word in words:
        if not word.strip():
            yield empty_token
        else:
            yield word
    return
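A quick check on a made-up list shows the substitution:
# blank and whitespace-only entries become the --n-- token
print(list(handle_empty(["The", "", "end", "  "])))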
Labeling Unknowns
- Suffixes
Suffixes = Namespace(
    noun = ["action", "age", "ance", "cy", "dom", "ee", "ence", "er", "hood",
            "ion", "ism", "ist", "ity", "ling", "ment", "ness", "or", "ry",
            "scape", "ship", "ty"],
    verb = ["ate", "ify", "ise", "ize"],
    adjective = ["able", "ese", "ful", "i", "ian", "ible", "ic", "ish", "ive",
                 "less", "ly", "ous"],
    adverb = ["ward", "wards", "wise"]
)
- Labels for the Unknowns
UNKNOWN = "--unknown-{}--"

Label = Namespace(
    digit=UNKNOWN.format("digit"),
    punctuation=UNKNOWN.format("punctuation"),
    uppercase=UNKNOWN.format("uppercase"),
    noun=UNKNOWN.format("noun"),
    verb=UNKNOWN.format("verb"),
    adjective=UNKNOWN.format("adjective"),
    adverb=UNKNOWN.format("adverb"),
    unknown="--unknown--"
)
- Bundle Them Up
Unknown = Namespace(
    punctuation = set(string.punctuation),
    suffix = Suffixes,
    label=Label,
    has_digit=re.compile(r"\d"),
    has_uppercase=re.compile("[A-Z]")
)
def label_unknowns(words: list, vocabulary: set) -> str:
    """Assign tokens to unknown words

    Args:
     words: iterable of words to check
     vocabulary: something to check if a word is known

    Yields:
     the word, or a label for the word if it is unknown
    """
    for word in words:
        if word in vocabulary:
            yield word
        elif Unknown.has_digit.search(word):
            yield Unknown.label.digit
        elif not Unknown.punctuation.isdisjoint(set(word)):
            yield Unknown.label.punctuation
        elif Unknown.has_uppercase.search(word):
            yield Unknown.label.uppercase
        elif any(word.endswith(suffix) for suffix in Unknown.suffix.noun):
            yield Unknown.label.noun
        elif any(word.endswith(suffix) for suffix in Unknown.suffix.verb):
            yield Unknown.label.verb
        elif any(word.endswith(suffix) for suffix in Unknown.suffix.adjective):
            yield Unknown.label.adjective
        elif any(word.endswith(suffix) for suffix in Unknown.suffix.adverb):
            yield Unknown.label.adverb
        else:
            yield Unknown.label.unknown
    return
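A quick check with a toy vocabulary (these words are made up) shows the rules firing in order:
# "the" is known, "1984" has a digit, "Xylo" has an upper-case letter,
# and "frobulance" ends with the noun suffix "ance"
print(list(label_unknowns(["the", "1984", "Xylo", "frobulance"], {"the"})))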
The Pre-Processor
def preprocess(words: list, vocabulary: set) -> list:
    """Preprocess words

    Args:
     words: words to pre-process
     vocabulary: the known words

    Returns:
     words with empty lines and unknown words labeled
    """
    processed = (word.strip() for word in words)
    processed = handle_empty(processed)
    processed = [word for word in label_unknowns(processed, vocabulary)]
    return processed
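Putting the two steps together on a toy list (the made-up vocabulary includes the --n-- token, as the real one does, so the empty-line marker isn't re-labeled):
# the blank entry becomes --n--, the out-of-vocabulary words get unknown labels
print(preprocess(["the", "", "Frobulator", "quickly"], {"the", "--n--"}))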
Load the Test Words
with open(os.environ[Environment.test_words]) as reader:
test_words = reader.read().strip()
before = len(test_words)
test_words = preprocess(test_words, vocabulary)
assert len(test_words) == before
print(f"{len(test_words):,}")
for word in random.sample(test_words, 5):
print(f" - {word}")
180,264
 - --unknown--
 - --n--
 - e
 - r
 - --unknown--
Weird that the letters "e" and "r" show up as known words… that's because the file was read in as a single string, so preprocess iterated over characters rather than lines (which is also why the count is 180,264 instead of the 34,200 lines we'll see later). The DataLoader below reads the file line by line instead.
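For reference, this is what the line-based loading looks like (it's what the DataLoader class below does through its load method); the variable name here is just for illustration.
with open(os.environ[Environment.test_words]) as reader:
    test_word_lines = reader.read().split("\n")
test_word_lines = preprocess(test_word_lines, vocabulary)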
End
So, that's our data. To make it replicable for the next section I'm going to tangle it out.
The Code
Imports
# from python
from argparse import Namespace
import os
import re
import string
# pypi
import attr
The Environment
I don't really know that I should save these keys, but I don't really want to cut and paste all the time…
Environment = Namespace(
training_corpus="WALL_STREET_JOURNAL_POS",
vocabulary="WALL_STREET_JOURNAL_VOCABULARY",
test_corpus= "WALL_STREET_JOURNAL_TEST_POS",
test_words="WALL_STREET_JOURNAL_TEST_WORDS",
)
The Suffixes
Suffixes = Namespace(
noun = ["action", "age", "ance", "cy", "dom", "ee", "ence", "er", "hood",
"ion", "ism", "ist", "ity", "ling", "ment", "ness", "or", "ry",
"scape", "ship", "ty"],
verb = ["ate", "ify", "ise", "ize"],
adjective = ["able", "ese", "ful", "i", "ian", "ible", "ic", "ish", "ive",
"less", "ly", "ous"],
adverb = ["ward", "wards", "wise"]
)
The Label
UNKNOWN = "--unknown-{}--"
Label = Namespace(
digit=UNKNOWN.format("digit"),
punctuation=UNKNOWN.format("punctuation"),
uppercase=UNKNOWN.format("uppercase"),
noun=UNKNOWN.format("noun"),
verb=UNKNOWN.format("verb"),
adjective=UNKNOWN.format("adjective"),
adverb=UNKNOWN.format("adverb"),
unknown="--unknown--",
)
Theirs To Ours
The vocabulary file already has their unknown tags inserted, which don't match mine, so rather than redoing everything I'm going to translate theirs to match mine.
THEIR_UNKNOWN = "--unk_{}--"
THEIRS_TO_MINE = {
"--unk--": "--unknown--",
THEIR_UNKNOWN.format("digit"): Label.digit,
THEIR_UNKNOWN.format("punct"): Label.punctuation,
THEIR_UNKNOWN.format("upper"): Label.uppercase,
THEIR_UNKNOWN.format("noun"): Label.noun,
THEIR_UNKNOWN.format("verb"): Label.verb,
THEIR_UNKNOWN.format("adj"): Label.adjective,
THEIR_UNKNOWN.format("adv"): Label.adverb,
}
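A quick illustration of how the translation gets used — tokens in the mapping are renamed, everything else passes through unchanged:
# their adjective token becomes mine
print(THEIRS_TO_MINE.get("--unk_adj--", "--unk_adj--"))
# ordinary words are left alone
print(THEIRS_TO_MINE.get("cabinet", "cabinet"))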
The Unknown
Unknown = Namespace(
punctuation = set(string.punctuation),
suffix = Suffixes,
label=Label,
has_digit=re.compile(r"\d"),
has_uppercase=re.compile("[A-Z]")
)
The Empty
Empty = Namespace(
word="--n--",
tag="--s--",
)
The Corpus Pre-Processor
@attr.s(auto_attribs=True)
class CorpusProcessor:
"""Pre-processes the corpus
Args:
vocabulary: holder of our known words
"""
vocabulary: dict
- Split Tuples
def split_tuples(self, lines: list):
    """Generates tuples

    Args:
     lines: iterable of lines from the corpus

    Yields:
     whitespace split of line
    """
    for line in lines:
        yield line.split()
    return
- Handle Empty Lines
def handle_empty(self, tuples: list):
    """checks for empty strings

    Args:
     tuples: tuples of corpus lines

    Yields:
     line with empty string marked
    """
    for line in tuples:
        if not line:
            yield Empty.word, Empty.tag
        else:
            yield line
    return
- Handle Unknowns
def label_unknowns(self, tuples: list) -> str:
    """Assign tokens to unknown words

    Args:
     tuples: word, tag tuples

    Yields:
     (word (or label for the word if unknown), tag) tuple
    """
    for word, tag in tuples:
        if word in self.vocabulary:
            yield word, tag
        elif Unknown.has_digit.search(word):
            yield Unknown.label.digit, tag
        elif not Unknown.punctuation.isdisjoint(set(word)):
            yield Unknown.label.punctuation, tag
        elif Unknown.has_uppercase.search(word):
            yield Unknown.label.uppercase, tag
        elif any(word.endswith(suffix) for suffix in Unknown.suffix.noun):
            yield Unknown.label.noun, tag
        elif any(word.endswith(suffix) for suffix in Unknown.suffix.verb):
            yield Unknown.label.verb, tag
        elif any(word.endswith(suffix) for suffix in Unknown.suffix.adjective):
            yield Unknown.label.adjective, tag
        elif any(word.endswith(suffix) for suffix in Unknown.suffix.adverb):
            yield Unknown.label.adverb, tag
        else:
            yield Unknown.label.unknown, tag
    return
- The Call
def __call__(self, tuples: list) -> list:
    """preprocesses the words and tags

    Args:
     tuples: list of words and tags to process

    Returns:
     preprocessed version of words, tags
    """
    processed = self.split_tuples(tuples)
    processed = self.handle_empty(processed)
    processed = [(word, tag) for (word, tag) in self.label_unknowns(processed)]
    return processed
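Once the class is tangled together, a quick check with a made-up vocabulary (which, like the real one, contains the --n-- token) shows the whole chain:
processor = CorpusProcessor(vocabulary={"In": 0, "--n--": 1})
# lines get split, the empty line becomes the (--n--, --s--) pair,
# and the unknown word is re-labeled by its "ance" suffix
print(processor(["In\tIN", "", "frobulance\tNN"]))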
The Data Pre-Processor
@attr.s(auto_attribs=True)
class DataPreprocessor:
"""A pre-processor for the data
Args:
vocabulary: holder of our known words
"""
vocabulary: dict
- Handle Empty Lines
def handle_empty(self, words: list):
    """replace empty strings with the empty-line token

    Args:
     words: list to process

    Yields:
     processed words
    """
    for word in words:
        if not word.strip():
            yield Empty.word
        else:
            yield word
    return
- Label Unknowns
def label_unknowns(self, words: list) -> str:
    """Assign tokens to unknown words

    Args:
     words: iterable of words to check

    Yields:
     word or label for the word if unknown
    """
    for word in words:
        if word in self.vocabulary:
            yield word
        elif Unknown.has_digit.search(word):
            yield Unknown.label.digit
        elif not Unknown.punctuation.isdisjoint(set(word)):
            yield Unknown.label.punctuation
        elif Unknown.has_uppercase.search(word):
            yield Unknown.label.uppercase
        elif any(word.endswith(suffix) for suffix in Unknown.suffix.noun):
            yield Unknown.label.noun
        elif any(word.endswith(suffix) for suffix in Unknown.suffix.verb):
            yield Unknown.label.verb
        elif any(word.endswith(suffix) for suffix in Unknown.suffix.adjective):
            yield Unknown.label.adjective
        elif any(word.endswith(suffix) for suffix in Unknown.suffix.adverb):
            yield Unknown.label.adverb
        else:
            yield Unknown.label.unknown
    return
- The Call
def __call__(self, words: list) -> list:
    """preprocesses the words

    Args:
     words: list of words to process

    Returns:
     preprocessed version of words
    """
    processed = (word.strip() for word in words)
    processed = self.handle_empty(processed)
    processed = [word for word in self.label_unknowns(processed)]
    return processed
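And the same kind of check for the word-only data (again with a made-up vocabulary that includes the --n-- token):
preprocessor = DataPreprocessor(vocabulary={"the": 0, "--n--": 1})
# the empty string becomes --n-- and the unknown capitalized word gets re-labeled
print(preprocessor(["the", "", "Frobulator"]))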
The Data Loader
@attr.s(auto_attribs=True)
class DataLoader:
"""Loads the training and test data
Args:
environment: namespace with keys for the environment to load paths
"""
environment: Namespace=Environment
_preprocess: DataPreprocessor=None
_vocabulary_words: list=None
_vocabulary: dict=None
_training_corpus: list=None
_training_data: dict=None
_processed_training: list=None
_test_data_raw: list=None
_test_data_tuples: list=None
_test_data: dict=None
_test_words: list=None
The Preprocessor
@property
def preprocess(self) -> DataPreprocessor:
"""The Preprocessor for the data"""
if self._preprocess is None:
self._preprocess = DataPreprocessor(self.vocabulary)
return self._preprocess
The Vocabulary Words
@property
def vocabulary_words(self) -> list:
"""The list of vocabulary words for training
Note:
This is ``hmm_vocab.txt``
"""
if self._vocabulary_words is None:
words = self.load(os.environ[self.environment.vocabulary])
self._vocabulary_words = [THEIRS_TO_MINE.get(word, word) for word in words]
return self._vocabulary_words
The Vocabulary
@property
def vocabulary(self) -> dict:
"""Converts the vocabulary list of words to a dict
Returns:
word : index of word in vocabulary words> dictionary
"""
if self._vocabulary is None:
self._vocabulary = {
word: index
for index, word in enumerate(sorted(self.vocabulary_words))}
return self._vocabulary
The Training Corpus
@property
def training_corpus(self) -> list:
"""The corpus for training
Note:
``WSJ_02_20.pos`` lines (<word><tab><pos> all as one string)
"""
if self._training_corpus is None:
self._training_corpus = self.load(os.environ[self.environment.training_corpus])
return self._training_corpus
The Training Data
@property
def training_data(self) -> dict:
"""The word-tag training data
Note:
this is the ``training_corpus`` converted to a <word>:<pos> dictionary
"""
if self._training_data is None:
words_tags = (line.split() for line in self.training_corpus)
words_tags = (tokens for tokens in words_tags if len(tokens) == 2)
self._training_data = {word: tag for word, tag in words_tags}
return self._training_data
Processed Training
@property
def processed_training(self) -> list:
"""Pre-processes the training corpus
Note:
``training_corpus`` converted to (word, tag) tuples with unknown tags added
"""
if self._processed_training is None:
processor = CorpusProcessor(self.vocabulary)
self._processed_training = processor(self.training_corpus)
return self._processed_training
The Raw Test Corpus
@property
def test_data_raw(self) -> list:
"""The lines for testing
Note:
The assignment expects the lines to be un-processed so this is just the
raw lines from ``WSJ_24.pos``
"""
if self._test_data_raw is None:
self._test_data_raw = self.load(os.environ[self.environment.test_corpus])
return self._test_data_raw
The Raw Test Tuples
@property
def test_data_tuples(self) -> list:
"""The test data lines split into tuples
Note:
this is the test_data_raw with the lines split
"""
if self._test_data_tuples is None:
self._test_data_tuples = [line.split() for line in self.test_data_raw]
return self._test_data_tuples
Test Words
@property
def test_words(self) -> list:
"""The pre-processed test words
Note:
``test.words`` with some pre-processing done
"""
if self._test_words is None:
self._test_words = self.load(os.environ[self.environment.test_words])
self._test_words = self.preprocess(self._test_words)
return self._test_words
Load Method
def load(self, path: str, keep_empty: bool=True) -> list:
"""Loads the strings from the file
Args:
path: path to the text file
keep_empty: keep empty lines if true
Returns:
list of lines from the file
"""
with open(path) as reader:
lines = [line for line in reader.read().split("\n") if keep_empty or line]
return lines
Test it Out
from neurotic.nlp.parts_of_speech.preprocessing import Environment, DataLoader
loader = DataLoader()
for theirs, mine in zip(vocabulary_words, loader.vocabulary_words):
assert theirs == mine
for theirs, mine in zip(vocabulary, loader.vocabulary):
assert vocabulary[theirs] == loader.vocabulary[mine]
loader = DataLoader(Environment)
print(f"{len(loader.vocabulary_words):,}")
print(random.sample(loader.vocabulary_words, 2))
assert len(loader.vocabulary_words) == len(vocabulary_words)
print(f"\n{len(loader.training_corpus):,}")
assert (len(loader.training_corpus)) == len(training_corpus)
print(random.sample(loader.training_corpus, 2))
print(f"\n{len(loader.vocabulary):,}")
assert len(loader.vocabulary) == len(vocabulary)
print(f"\n{len(loader.test_corpus):,}")
assert len(loader.test_corpus) == len(test_corpus)
print(random.sample(loader.test_corpus, 2))
print(f"\n{len(loader.test_words):,}")
print(random.sample(loader.test_words, 2))
23,777
['Island', 'Gibbons']

989,861
['nine\tCD', 'in\tIN']

23,777

34,200
['at\tIN', 'to\tTO']

34,200
['the', ';']
Re-Test
I'm trying to troubleshoot differences between my output and the original notebook.
punct = set(string.punctuation)
noun_suffix = ["action", "age", "ance", "cy", "dom", "ee", "ence", "er", "hood", "ion", "ism", "ist", "ity", "ling", "ment", "ness", "or", "ry", "scape", "ship", "ty"]
verb_suffix = ["ate", "ify", "ise", "ize"]
adj_suffix = ["able", "ese", "ful", "i", "ian", "ible", "ic", "ish", "ive", "less", "ly", "ous"]
adv_suffix = ["ward", "wards", "wise"]
def preprocess(vocab, data_fp):
"""
Preprocess data
"""
orig = []
prep = []
# Read data
with open(data_fp, "r") as data_file:
for cnt, word in enumerate(data_file):
# End of sentence
if not word.split():
orig.append(word.strip())
word = "--n--"
prep.append(word)
continue
# Handle unknown words
elif word.strip() not in vocab:
orig.append(word.strip())
# this seems like a bug: ``word`` still has its trailing newline here, so
# the suffix checks in assign_unk never match and these words all fall
# through to "--unk--"
word = assign_unk(word)
# stripping the word first gives more specific labels, but isn't in their code
# word = assign_unk(word.strip())
prep.append(word)
continue
else:
orig.append(word.strip())
prep.append(word.strip())
assert(len(orig) == len(open(data_fp, "r").readlines()))
assert(len(prep) == len(open(data_fp, "r").readlines()))
return orig, prep
def assign_unk(tok):
"""
Assign unknown word tokens
"""
# Digits
if any(char.isdigit() for char in tok):
return "--unk_digit--"
# Punctuation
elif any(char in punct for char in tok):
return "--unk_punct--"
# Upper-case
elif any(char.isupper() for char in tok):
return "--unk_upper--"
# Nouns
elif any(tok.endswith(suffix) for suffix in noun_suffix):
return "--unk_noun--"
# Verbs
elif any(tok.endswith(suffix) for suffix in verb_suffix):
return "--unk_verb--"
# Adjectives
elif any(tok.endswith(suffix) for suffix in adj_suffix):
return "--unk_adj--"
# Adverbs
elif any(tok.endswith(suffix) for suffix in adv_suffix):
return "--unk_adv--"
return "--unk--"
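To see the effect of that un-stripped newline, compare assign_unk on the same token with and without it (these calls are just for illustration):
print(assign_unk("vantage\n"))  # falls through to --unk--
print(assign_unk("vantage"))    # matches the "age" suffix: --unk_noun--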
loader = DataLoader(Environment)
The Vocabulary
UNKNOWN = "--unknown-{}--"
THEIR_UNKNOWN = "--unk_{}--"
THEIRS_TO_MINE = {
"--unk--": "--unknown--",
THEIR_UNKNOWN.format("digit"):UNKNOWN.format("digit"),
THEIR_UNKNOWN.format("punct"):UNKNOWN.format("punctuation"),
THEIR_UNKNOWN.format("upper"):UNKNOWN.format("uppercase"),
THEIR_UNKNOWN.format("noun"):UNKNOWN.format("noun"),
THEIR_UNKNOWN.format("verb"):UNKNOWN.format("verb"),
THEIR_UNKNOWN.format("adj"): UNKNOWN.format("adjective"),
THEIR_UNKNOWN.format("adv"):UNKNOWN.format("adverb"),
}
with open(os.environ[Environment.vocabulary]) as reader:
voc_l = reader.read().split("\n")
index = 0
for theirs, mine in zip(voc_l, loader.vocabulary_words):
assert THEIRS_TO_MINE.get(theirs, theirs)==mine, (index, theirs, mine)
index += 1
The Vocabulary Dictionary
They include empty strings so the dictionaries won't match exactly.
vocab = {}
for index, word in enumerate(sorted(voc_l)):
vocab[word] = index
for word in vocab:
assert THEIRS_TO_MINE.get(word, word) in loader.vocabulary, (word)
The Preprocessed Test Words
original, prep = preprocess(vocab, os.environ[Environment.test_words])
index = 0
for theirs, mine in zip(prep, loader.test_words):
if THEIRS_TO_MINE.get(theirs, theirs) != mine:
token = original[index]
print(index, theirs, mine, token, assign_unk(token))
index += 1
9 --unk-- --unknown-noun-- vantage --unk_noun-- 228 --unk-- --unknown-verb-- exacerbate --unk_verb-- 845 --unk-- --unknown-adjective-- relentless --unk_adj-- 896 --unk-- --unknown-adjective-- slickly --unk_adj-- 906 --unk-- --unknown-adjective-- cognoscenti --unk_adj-- 919 --unk-- --unknown-noun-- depiction --unk_noun-- 1076 --unk-- --unknown-adjective-- perfidious --unk_adj-- 1160 --unk-- --unknown-adjective-- innocently --unk_adj-- 1196 --unk-- --unknown-adjective-- loutish --unk_adj-- 1244 --unk-- --unknown-verb-- premise --unk_verb-- 1268 --unk-- --unknown-adjective-- conspicuously --unk_adj-- 1331 --unk-- --unknown-noun-- ignition --unk_noun-- 1347 --unk-- --unknown-noun-- obligatory --unk_noun-- 1448 --unk-- --unknown-adjective-- lavishly --unk_adj-- 1468 --unk-- --unknown-adjective-- palatable --unk_adj-- 1564 --unk-- --unknown-adjective-- inconsiderable --unk_adj-- 1678 --unk-- --unknown-adjective-- discreetly --unk_adj-- 1774 --unk-- --unknown-adjective-- marvelously --unk_adj-- 1855 --unk-- --unknown-adjective-- passable --unk_adj-- 1970 --unk-- --unknown-noun-- diaper --unk_noun-- 3248 --unk-- --unknown-adjective-- comparably --unk_adj-- 3672 --unk-- --unknown-noun-- imperialism --unk_noun-- 3791 --unk-- --unknown-noun-- livelier --unk_noun-- 3793 --unk-- --unknown-noun-- bolder --unk_noun-- 4132 --unk-- --unknown-noun-- intimidation --unk_noun-- 4648 --unk-- --unknown-adjective-- politic --unk_adj-- 4661 --unk-- --unknown-verb-- mutate --unk_verb-- 4682 --unk-- --unknown-noun-- aspersion --unk_noun-- 4777 --unk-- --unknown-adjective-- utopian --unk_adj-- 4802 --unk-- --unknown-adjective-- psychic --unk_adj-- 4892 --unk-- --unknown-noun-- coercion --unk_noun-- 5043 --unk-- --unknown-noun-- centrist --unk_noun-- 5090 --unk-- --unknown-noun-- schooling --unk_noun-- 5131 --unk-- --unknown-noun-- voucher --unk_noun-- 5270 --unk-- --unknown-noun-- recruitment --unk_noun-- 5316 --unk-- --unknown-noun-- expansionism --unk_noun-- 5429 --unk-- --unknown-verb-- palate --unk_verb-- 5440 --unk-- --unknown-noun-- engorgement --unk_noun-- 5557 --unk-- --unknown-noun-- patronage --unk_noun-- 5912 --unk-- --unknown-noun-- volunteerism --unk_noun-- 5990 --unk-- --unknown-adjective-- utopian --unk_adj-- 6709 --unk-- --unknown-noun-- citizenship --unk_noun-- 6738 --unk-- --unknown-noun-- abomination --unk_noun-- 6793 --unk-- --unknown-adjective-- reflexively --unk_adj-- 6809 --unk-- --unknown-noun-- regimentation --unk_noun-- 6876 --unk-- --unknown-noun-- compulsory --unk_noun-- 7150 --unk-- --unknown-adjective-- arguably --unk_adj-- 7192 --unk-- --unknown-noun-- compulsion --unk_noun-- 7208 --unk-- --unknown-adjective-- unenforceable --unk_adj-- 7374 --unk-- --unknown-noun-- recruitment --unk_noun-- 7468 --unk-- --unknown-adjective-- challengeable --unk_adj-- 7591 --unk-- --unknown-adjective-- unprecedentedly --unk_adj-- 7657 --unk-- --unknown-adjective-- cooperatively --unk_adj-- 8182 --unk-- --unknown-noun-- reticence --unk_noun-- 8772 --unk-- --unknown-noun-- render --unk_noun-- 11071 --unk-- --unknown-noun-- breather --unk_noun-- 11942 --unk-- --unknown-verb-- unify --unk_verb-- 12859 --unk-- --unknown-verb-- frigate --unk_verb-- 12907 --unk-- --unknown-noun-- disarmament --unk_noun-- 12936 --unk-- --unknown-noun-- affinity --unk_noun-- 13240 --unk-- --unknown-verb-- patriarchate --unk_verb-- 13438 --unk-- --unknown-noun-- underselling --unk_noun-- 13650 --unk-- --unknown-adjective-- wrathful --unk_adj-- 13725 --unk-- --unknown-noun-- gist --unk_noun-- 14081 --unk-- --unknown-noun-- 
segmentation --unk_noun-- 14110 --unk-- --unknown-adjective-- brightly --unk_adj-- 15331 --unk-- --unknown-noun-- connector --unk_noun-- 17846 --unk-- --unknown-noun-- readjustment --unk_noun-- 18397 --unk-- --unknown-noun-- observation --unk_noun-- 18410 --unk-- --unknown-noun-- ticker --unk_noun-- 18566 --unk-- --unknown-noun-- trickery --unk_noun-- 18750 --unk-- --unknown-adjective-- supersonic --unk_adj-- 18854 --unk-- --unknown-noun-- existance --unk_noun-- 18885 --unk-- --unknown-adjective-- thankfully --unk_adj-- 19011 --unk-- --unknown-noun-- coherence --unk_noun-- 19304 --unk-- --unknown-adjective-- autocratic --unk_adj-- 19305 --unk-- --unknown-noun-- pseudosocialism --unk_noun-- 19326 --unk-- --unknown-noun-- encircling --unk_noun-- 19389 --unk-- --unknown-noun-- miscegenation --unk_noun-- 19470 --unk-- --unknown-adjective-- ostensible --unk_adj-- 19479 --unk-- --unknown-adjective-- purportedly --unk_adj-- 19554 --unk-- --unknown-noun-- accuser --unk_noun-- 19568 --unk-- --unknown-noun-- embezzler --unk_noun-- 19961 --unk-- --unknown-adjective-- unwittingly --unk_adj-- 19970 --unk-- --unknown-noun-- platter --unk_noun-- 20129 --unk-- --unknown-noun-- centrist --unk_noun-- 20384 --unk-- --unknown-adjective-- improbable --unk_adj-- 20472 --unk-- --unknown-adjective-- glaringly --unk_adj-- 20493 --unk-- --unknown-noun-- clarity --unk_noun-- 20496 --unk-- --unknown-noun-- rectification --unk_noun-- 20539 --unk-- --unknown-noun-- wiser --unk_noun-- 21215 --unk-- --unknown-adjective-- concurrently --unk_adj-- 21310 --unk-- --unknown-verb-- revolutionize --unk_verb-- 22257 --unk-- --unknown-noun-- compensatory --unk_noun-- 22270 --unk-- --unknown-noun-- respiratory --unk_noun-- 23603 --unk-- --unknown-noun-- milion --unk_noun-- 23928 --unk-- --unknown-noun-- poultry --unk_noun-- 23947 --unk-- --unknown-noun-- cessation --unk_noun-- 24005 --unk-- --unknown-noun-- mothballing --unk_noun-- 24168 --unk-- --unknown-noun-- synthesizer --unk_noun-- 24195 --unk-- --unknown-adjective-- distinctly --unk_adj-- 24280 --unk-- --unknown-noun-- synthesizer --unk_noun-- 24293 --unk-- --unknown-adjective-- robotic --unk_adj-- 24294 --unk-- --unknown-noun-- flatness --unk_noun-- 24861 --unk-- --unknown-noun-- carnage --unk_noun-- 25248 --unk-- --unknown-adjective-- frantic --unk_adj-- 25252 --unk-- --unknown-noun-- hammer --unk_noun-- 26001 --unk-- --unknown-noun-- whisper --unk_noun-- 27468 --unk-- --unknown-noun-- prophecy --unk_noun-- 27538 --unk-- --unknown-noun-- encircling --unk_noun-- 27716 --unk-- --unknown-verb-- realestate --unk_verb-- 27994 --unk-- --unknown-noun-- insolvency --unk_noun-- 28110 --unk-- --unknown-adjective-- forceful --unk_adj-- 28195 --unk-- --unknown-adjective-- zealous --unk_adj-- 28485 --unk-- --unknown-adjective-- belatedly --unk_adj-- 29370 --unk-- --unknown-verb-- duplicate --unk_verb-- 29902 --unk-- --unknown-adjective-- reptilian --unk_adj-- 30078 --unk-- --unknown-noun-- industrialization --unk_noun-- 31292 --unk-- --unknown-adjective-- vociferous --unk_adj-- 31303 --unk-- --unknown-adjective-- powerless --unk_adj-- 31549 --unk-- --unknown-noun-- provocation --unk_noun-- 31648 --unk-- --unknown-adjective-- archaic --unk_adj-- 31700 --unk-- --unknown-noun-- lament --unk_noun-- 32149 --unk-- --unknown-adjective-- tantalizingly --unk_adj-- 32393 --unk-- --unknown-noun-- hammer --unk_noun-- 33208 --unk-- --unknown-adjective-- defiantly --unk_adj-- 33238 --unk-- --unknown-noun-- rickety --unk_noun-- 33280 --unk-- --unknown-noun-- unionist --unk_noun-- 33336 --unk-- 
--unknown-noun-- dapper --unk_noun-- 33721 --unk-- --unknown-adjective-- repressive --unk_adj-- 33832 --unk-- --unknown-noun-- disillusionment --unk_noun-- 33861 --unk-- --unknown-noun-- agitation --unk_noun--
for word in (line for line in loader.vocabulary_words if line.startswith("--u")):
print(word)
--unknown--
--unknown-adjective--
--unknown-adverb--
--unknown-digit--
--unknown-noun--
--unknown-punctuation--
--unknown-uppercase--
--unknown-verb--