NER: Pre-Processing the Data
Table of Contents
Preprocessing The Data
We will be using a dataset from Kaggle which appears to have originally come from the Groningen Meaning Bank (a bank of texts, not money). The original data consists of four columns, the sentence number, the word, the part of speech of the word, and the tags. A few tags you might expect to see are:
- geo: geographical entity
- org: organization
- per: person
- gpe: geopolitical entity
- tim: time indicator
- art: artifact
- eve: event
- nat: natural phenomenon
- O: filler word
# python
from collections import namedtuple
from pathlib import Path
import os
# pypi
from dotenv import load_dotenv
from expects import equal, expect
from sklearn.model_selection import train_test_split
from tabulate import tabulate
import pandas
Set Up
The Dataset
Note: to get the encoding for the file, use the `file` command:
file -bi ner_dataset.csv
In this case we get:
application/csv; charset=iso-8859-1
Since it isn't ASCII or UTF-8 we'll have to tell pandas what the encoding is.
# Pull NER_DATASET (the location of the Kaggle CSV) into the environment,
# letting the .env file win over any value that is already set.
load_dotenv("posts/nlp/.env", override=True)
dataset_location = os.environ["NER_DATASET"]
path = Path(dataset_location).expanduser()
# The file is ISO-8859-1, so pandas has to be told the encoding explicitly.
data = pandas.read_csv(path, encoding="ISO-8859-1")
The Kaggle Data
# Show the first five rows as an org-mode table.
preview = data.head(5)
print(tabulate(preview, tablefmt="orgtbl", headers="keys"))
  | Sentence # | Word | POS | Tag |
0 | Sentence: 1 | Thousands | NNS | O |
1 | nan | of | IN | O |
2 | nan | demonstrators | NNS | O |
3 | nan | have | VBP | O |
4 | nan | marched | VBN | O |
As you can (kind of) tell, the sentences are broken up so that each row has one word in it.
To make it easier to work with I'm going to rename the columns.
# Shorter, attribute-friendly column names ("POS" is left as it is).
renamed_columns = {"Sentence #": "sentence", "Word": "word", "Tag": "tag"}
data = data.rename(columns=renamed_columns)
Words and Tags
The first thing we're going to do is separate out the words to build our vocabulary. The vocabulary
will be a mapping of each word to an index so that we can convert our text to numbers for our model. In addition we're going to add a <PAD>
token so that if our input is too short we can pad it to be the right size. And an UNK
token in case we don't know a word.
# The two special tokens: padding for short inputs and a stand-in for
# words that are not in the vocabulary.
token = namedtuple("Token", ["pad", "unknown"])
Token = token(pad="<PAD>", unknown="UNK")

# Every distinct word gets the index of its first appearance...
unique_words = data.word.unique()
vocabulary = dict(zip(unique_words, range(len(unique_words))))
# ...and the special tokens are appended after the real words (pad, then unknown).
for special_token in Token:
    vocabulary[special_token] = len(vocabulary)
We're going to do the same with the Tag column.
# Map each distinct tag to the index of its first appearance in the data.
unique_tags = data.tag.unique()
tags = dict(zip(unique_tags, range(len(unique_tags))))
{'O': 0, 'B-geo': 1, 'B-gpe': 2, 'B-per': 3, 'I-geo': 4, 'B-org': 5, 'I-org': 6, 'B-tim': 7, 'B-art': 8, 'I-art': 9, 'I-per': 10, 'I-gpe': 11, 'I-tim': 12, 'B-nat': 13, 'B-eve': 14, 'I-eve': 15, 'I-nat': 16}
Note: This is actually cheating because I am using the whole dataset. Later on make sure to only use the training data.
Sentences and Labels
We're also going to need to smash the words back into sentences. There's probably a clever pandas way to do this, but I'll just brute-force it. We'll also need to join the labels for the sentences into strings.
# Rebuild whole sentences (and their matching tag lists) from the
# one-word-per-row table.
sentences = []
labels = []
sentence = None
for row in data.itertuples():
    if not pandas.isna(row.sentence):
        # A non-NaN "sentence" cell marks the first word of a new sentence,
        # so store the one that was being built (if any) and start over.
        if sentence:
            sentences.append(sentence)
            labels.append(label)
        sentence = [row.word]
        label = [row.tag]
    else:
        sentence.append(row.word)
        label.append(row.tag)
# NOTE(review): the sentence still being accumulated when the loop ends is
# never appended, so the final sentence of the file is left out. The counts
# shown below and the split sizes used later rely on that total.
47,958 47,958 ['Thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'London', 'to', 'protest', 'the', 'war', 'in', 'Iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'British', 'troops', 'from', 'that', 'country', '.'] ['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']
We're going to convert them to numbers so I didn't join them into strings.
To Numbers
# Words missing from the vocabulary must map to the *index* of the unknown
# token, not to the "UNK" string itself, so that every entry is an integer.
unknown_index = vocabulary[Token.unknown]
sentence_vectors = [
    [vocabulary.get(word, unknown_index) for word in sentence]
    for sentence in sentences
]
assert len(sentence_vectors) == len(sentences)
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 9, 15, 1, 16, 17, 18, 19, 20, 21]
# Every tag is looked up directly: the tag map was built from this same data,
# so a missing tag would be an error rather than something to paper over.
label_vectors = [
    [tags[label] for label in sentence_labels] for sentence_labels in labels
]
assert len(label_vectors) == len(labels)
[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0]
In this case we're assuming that there are no unknown tags, because the tags are only used for training and testing, so we wouldn't expect to see one that isn't in our current dataset. The sentences, on the other hand, are going to be used with new data and so might have tokens we haven't seen before.
We could add the padding here, but instead we're going to do it in the batch generator.
The Train-Test Split
This time we're going to do a real train-validation-test split.
# Sizes of the three partitions (together they add up to 47,958).
splits = namedtuple("Split", ["train", "validation", "test"])
Split = splits(train=33570, validation=7194, test=7194)

# Carve off the training set first, then divide what is left over into the
# validation and test sets.
x_train, x_leftovers, y_train, y_leftovers = train_test_split(
    sentences, labels, train_size=Split.train)
x_validation, x_test, y_validation, y_test = train_test_split(
    x_leftovers, y_leftovers, test_size=Split.test)

# Each x/y pair has to come out at exactly the requested size.
for subset, expected_size in (
        (x_train, Split.train),
        (y_train, Split.train),
        (x_validation, Split.validation),
        (y_validation, Split.validation),
        (x_test, Split.test),
        (y_test, Split.test),
):
    assert len(subset) == expected_size
Bundling This Up
# python
from collections import namedtuple
from functools import partial
from pathlib import Path
import os
# pypi
from dotenv import load_dotenv
from sklearn.model_selection import train_test_split
import attr
import pandas
Some Constants
# How to find and read the dataset: the dotenv file to load, the environment
# variable holding the path to the CSV, and the encoding of the CSV.
Read = namedtuple("Read", "dotenv key encoding".split())
READ = Read(dotenv="posts/nlp/.env", key="NER_DATASET",
            encoding="ISO-8859-1")

# Original kaggle column names mapped to the names used in the code.
COLUMNS = {"Sentence #": "sentence",
           "Word": "word",
           "Tag": "tag"}

# Special vocabulary entries: padding and out-of-vocabulary stand-in.
Token = namedtuple("Token", "pad unknown".split())
TOKEN = Token(pad="<PAD>", unknown="UNK")

# Number of sentences for each partition.
Splits = namedtuple("Split", "train validation test".split())
SPLIT = Splits(train=33570, validation=7194, test=7194)

# The six partitions produced by the splitter (and by the vectorizer).
DataSets = namedtuple("DataSets", [
    "x_train",
    "y_train",
    "x_validate",
    "y_validate",
    "x_test",
    "y_test",
])

# Everything the master data preparer hands back.
TheData = namedtuple("TheData", [
    "vocabulary",
    "tags",
    "raw_data_sets",
    "data_sets",
])
The Data Processor
Each of the three sets needs to be vectorized since I'm not saving the sentences beforehand. So this class handles that.
@attr.s(auto_attribs=True)
class DataFlattener:
    """Converts the kaggle data to sentences and labels

    The source table has one word per row; only the first row of each
    sentence has a non-NaN value in the ``sentence`` column.

    Args:
     data: the data to convert
    """
    data: pandas.DataFrame
    _sentences: list=None
    _labels: list=None

    @property
    def sentences(self) -> list:
        """List of sentences from the data"""
        if self._sentences is None:
            self.set_sentences_and_labels()
        return self._sentences

    @property
    def labels(self) -> list:
        """List of labels from the data"""
        if self._labels is None:
            self.set_sentences_and_labels()
        return self._labels

    def set_sentences_and_labels(self) -> None:
        """Converts the data to lists of sentence token lists and also sets the labels

        Both ``_sentences`` and ``_labels`` are rebuilt from scratch; entry
        ``i`` of one lines up with entry ``i`` of the other.
        """
        self._sentences = []
        self._labels = []
        sentence = None
        labels = None
        for row in self.data.itertuples():
            if not pandas.isna(row.sentence):
                # a non-NaN sentence cell starts a new sentence, so store
                # the one that was being built (if there was one)
                if sentence:
                    self._sentences.append(sentence)
                    self._labels.append(labels)
                sentence = [row.word]
                labels = [row.tag]
            else:
                sentence.append(row.word)
                labels.append(row.tag)
        # nothing follows the last sentence to trigger the append inside the
        # loop, so it has to be stored here or it would be silently dropped
        if sentence:
            self._sentences.append(sentence)
            self._labels.append(labels)
        return
Data Vectorizer
@attr.s(auto_attribs=True)
class DataVectorizer:
    """Converts the data-set strings to vectors

    Args:
     data_sets: the split up data sets
     vocabulary: map from token to index
     tags: map from tag to index
    """
    data_sets: namedtuple
    vocabulary: dict
    tags: dict
    _vectorized_datasets: namedtuple=None

    @property
    def vectorized_datasets(self) -> namedtuple:
        """the original data sets converted to indices"""
        if self._vectorized_datasets is None:
            # x-sets are converted with the vocabulary, y-sets with the tags
            sentence_vectors = partial(self.to_vectors,
                                       to_index=self.vocabulary)
            label_vectors = partial(self.to_vectors,
                                    to_index=self.tags)
            self._vectorized_datasets = DataSets(
                x_train=sentence_vectors(self.data_sets.x_train),
                y_train=label_vectors(self.data_sets.y_train),
                x_validate=sentence_vectors(self.data_sets.x_validate),
                y_validate=label_vectors(self.data_sets.y_validate),
                x_test=sentence_vectors(self.data_sets.x_test),
                y_test=label_vectors(self.data_sets.y_test),
            )
        return self._vectorized_datasets

    def to_vectors(self, source: list, to_index: dict) -> list:
        """Sentences converted to Integers

        Args:
         source: iterator of tokenized strings to convert
         to_index: map to convert the tokens to indices

        Returns:
         tokens in source converted to indices
        """
        # Tokens missing from the map become the *index* of the unknown
        # token rather than the "UNK" string. If the map has no entry for
        # the unknown token the string is used, as it always was before.
        unknown = to_index.get(TOKEN.unknown, TOKEN.unknown)
        vectors = [
            [to_index.get(token, unknown) for token in line]
            for line in source
        ]
        assert len(vectors) == len(source)
        return vectors
The Splitter
@attr.s(auto_attribs=True)
class DataSplitter:
    """Splits up the training, testing, etc.

    Args:
     split: constants with the train, test counts
     sentences: input data to split
     labels: y-data to split
     random_state: seed for the splitting
    """
    split: namedtuple
    sentences: list
    labels: list
    random_state: int=None
    _data_sets: namedtuple=None

    @property
    def data_sets(self) -> namedtuple:
        """The Split data sets"""
        if self._data_sets is None:
            # take the training set off first...
            x_train, x_leftovers, y_train, y_leftovers = train_test_split(
                self.sentences, self.labels,
                train_size=self.split.train,
                random_state=self.random_state)
            # ...then the test set; validation is whatever remains
            x_validate, x_test, y_validate, y_test = train_test_split(
                x_leftovers, y_leftovers,
                test_size=self.split.test,
                random_state=self.random_state)
            self._data_sets = DataSets(x_train=x_train,
                                       y_train=y_train,
                                       x_validate=x_validate,
                                       y_validate=y_validate,
                                       x_test=x_test,
                                       y_test=y_test,
                                       )
            # every input sentence should have landed in exactly one set
            assert (len(x_train) + len(x_validate) + len(x_test)
                    == len(self.sentences))
        return self._data_sets
The Loader
@attr.s(auto_attribs=True)
class DataLoader:
    """Loads and converts the kaggle data

    Args:
     read: the stuff to download the data
    """
    read: namedtuple=READ
    _data: pandas.DataFrame=None
    _vocabulary: dict=None
    _tags: dict=None

    @property
    def data(self) -> pandas.DataFrame:
        """The original kaggle dataset"""
        if self._data is None:
            # NOTE(review): override=True mirrors the exploratory code
            # earlier in this post -- confirm it is wanted here too
            load_dotenv(self.read.dotenv, override=True)
            path = Path(os.environ[self.read.key]).expanduser()
            self._data = pandas.read_csv(path, encoding=self.read.encoding)
            self._data = self._data.rename(columns=COLUMNS)
        return self._data

    @property
    def vocabulary(self) -> dict:
        """map of word to index

        Note:
         This is creating a transformation of the entire data-set so it
         comes before the train-test-split so it uses the whole dataset,
         not just training
        """
        if self._vocabulary is None:
            self._vocabulary = {
                word: index
                for index, word in enumerate(self.data.word.unique())}
            # the special tokens go after the real words
            self._vocabulary[TOKEN.pad] = len(self._vocabulary)
            self._vocabulary[TOKEN.unknown] = len(self._vocabulary)
        return self._vocabulary

    @property
    def tags(self) -> dict:
        """map of tag to index"""
        if self._tags is None:
            self._tags = {
                tag: index
                for index, tag in enumerate(self.data.tag.unique())}
            self._tags[TOKEN.unknown] = len(self._tags)
        return self._tags
The Processor
@attr.s(auto_attribs=True)
class NERData:
    """Master NER Data preparer

    Each helper is built the first time it is asked for and then re-used.

    Args:
     read_constants: stuff to help load the dataset
     split_constants: stuff to help split the dataset
     random_state: seed for the splitting
    """
    read_constants: namedtuple=READ
    split_constants: namedtuple=SPLIT
    random_state: int=33
    _data: namedtuple=None
    _loader: DataLoader=None
    _flattener: DataFlattener=None
    _splitter: DataSplitter=None
    # was ``_vectorizer = DataVectorizer=None``: a chained assignment that
    # also bound ``DataVectorizer`` to None in the class namespace
    _vectorizer: DataVectorizer=None

    @property
    def data(self) -> namedtuple:
        """The split up data sets"""
        if self._data is None:
            self._data = TheData(
                vocabulary=self.loader.vocabulary,
                tags=self.loader.tags,
                raw_data_sets=self.splitter.data_sets,
                data_sets=self.vectorizer.vectorized_datasets,
            )
        return self._data

    @property
    def loader(self) -> DataLoader:
        """The loader of the data"""
        if self._loader is None:
            self._loader = DataLoader(
                read=self.read_constants,
            )
        return self._loader

    @property
    def flattener(self) -> DataFlattener:
        """The sentence and label builder"""
        if self._flattener is None:
            self._flattener = DataFlattener(
                data=self.loader.data,
            )
        return self._flattener

    @property
    def splitter(self) -> DataSplitter:
        """The splitter upper for the data"""
        if self._splitter is None:
            self._splitter = DataSplitter(
                split=self.split_constants,
                sentences=self.flattener.sentences,
                labels=self.flattener.labels,
                random_state=self.random_state,
            )
        return self._splitter

    @property
    def vectorizer(self) -> DataVectorizer:
        """Vectorizes the raw-data sets"""
        if self._vectorizer is None:
            self._vectorizer = DataVectorizer(
                data_sets=self.splitter.data_sets,
                tags=self.loader.tags,
                vocabulary=self.loader.vocabulary,
            )
        return self._vectorizer
Testing It Out
from neurotic.nlp.named_entity_recognition import NERData
# Build the preparer with its defaults (READ, SPLIT, random_state=33).
# NOTE(review): the properties are lazy, so presumably nothing is loaded
# until something like ``ner.data`` is accessed -- confirm.
ner = NERData()