Deep N-Grams: Loading the Data
Table of Contents
Text to Tensor
In this section we're going to load the text data and transform it into tensors.
Imports
# python
from pathlib import Path
import os
# pypi
from dotenv import load_dotenv
from expects import (be_true,
contain_exactly,
equal,
expect)
Set Up
The path to the data is kept in a .env
file so we'll load it into the environment here.
# The location of the plays lives in the .env file, so load that first.
load_dotenv("posts/nlp/.env", override=True)
data_path = Path(os.environ["SHAKESPEARE"])
data_path = data_path.expanduser()
# Stop right away if the folder isn't where the .env file says it is.
expect(data_path.is_dir()).to(be_true)
Middle
Loading the Data
We're going to be using the plays of Shakespeare. Unlike previously, this data source has them in separate files so we'll have to load each one separately. We're going to be generating characters, not words, so each character has to be given an integer ID. We'll use the Unicode values given to us by the built-in ord function.
# Collect the non-empty, whitespace-stripped lines of every play.
lines = []
for filename in data_path.glob("*.txt"):
    with filename.open() as play:
        cleaned = map(str.strip, play)
        lines.extend(text for text in cleaned if text)
This only strips the leading and trailing whitespace from each line; other things, like tabs inside a line, are still in there.
# Keep the count around so it can be compared against after later transformations.
line_count = len(lines)
print(f"Number of lines: {line_count:,}")
# Peek at a couple of lines to see what the cleaned text looks like.
print(f"Sample line at position 0: {lines[0]}")
print(f"Sample line at position 999: {lines[999]}")
Number of lines: 125,097 Sample line at position 0: king john Sample line at position 999: as it makes harmful all that speak of it.
To make this a little easier, we'll convert all characters to lowercase. This way, for example, the model only needs to predict the likelihood that a letter is 'a' and not decide between uppercase 'A' and lowercase 'a'.
lines = list(map(str.lower, lines))
new_line_count = len(lines)
# Lower-casing should neither add nor drop any lines.
expect(new_line_count).to(equal(line_count))
print(f"Number of lines: {new_line_count:,}")
print(f"Sample line at position 0: {lines[0]}")
print(f"Sample line at position 999: {lines[999]}")
Number of lines: 125,097 Sample line at position 0: king john Sample line at position 999: as it makes harmful all that speak of it.
Once again, we're going to do a straight split to create the training and validation data instead of using randomization.
SPLIT = 1000
# No shuffling: the last SPLIT lines are held out, everything before them trains.
training, validation = lines[:-SPLIT], lines[-SPLIT:]
print(f"Number of lines for training: {len(training):,}")
print(f"Number of lines for validation: {len(validation):,}")
Number of lines for training: 124,097 Number of lines for validation: 1,000
To Tensors
As I mentioned before, we're going to use Python's ord
function to convert the letters to integers.
# Show the integer that ``ord`` gives for a few sample characters.
for character in "abc xyz123":
    code_point = ord(character)
    print(f"{character}: {code_point}")
a: 97 b: 98 c: 99 : 32 x: 120 y: 121 z: 122 1: 49 2: 50 3: 51
def line_to_tensor(line: str, EOS_int: int=1) -> list:
    """Encodes a line of text as a list of integers

    Args:
     line: A single line of text.
     EOS_int: End-of-sentence integer placed after the characters. Defaults to 1.

    Returns:
     the unicode value of every character in ``line``, in order, followed by ``EOS_int``.
    """
    # ord gives the unicode integer for each character; the marker goes last
    return [*map(ord, line), EOS_int]
Test the Output
# Seven characters in, eight integers out: the trailing 1 is the default
# end-of-sentence marker.
actual = line_to_tensor('abc xyz')
expected = [97, 98, 99, 32, 120, 121, 122, 1]
expect(actual).to(contain_exactly(*expected))
Bundle It Up
This is going to be needed in future posts so I'm going to put it in a class.
Imports
# python
from pathlib import Path
import os
# pypi
from dotenv import load_dotenv
import attr
The Data Loader
@attr.s(auto_attribs=True)
class DataLoader:
    """Loads the text data and converts lines to lists of integers ('tensors')

    Args:
     env_path: path to the env file (as a string)
     env_key: name of the environment variable that holds the path to the data
     validation_size: how many lines to set aside for the validation set
     end_of_sentence: integer used to mark the end of a sentence
    """
    env_path: str = "posts/nlp/.env"
    env_key: str = "SHAKESPEARE"
    validation_size: int = 1000
    end_of_sentence: int = 1
    # Storage for values that start out unset (None).
    _data_path: Path = None
    _lines: list = None
    _training: list = None
    _validation: list = None
The Data Path
@property
def data_path(self) -> Path:
    """The folder that holds the text files

    On first access the dotenv file at ``env_path`` is loaded and the
    environment variable named by ``env_key`` is read and user-expanded.
    The path is cached only once it has been confirmed to be a directory.

    Raises:
     AssertionError: if the path isn't an existing directory
     KeyError: if ``env_key`` isn't set in the environment
    """
    if self._data_path is None:
        load_dotenv(self.env_path, override=True)
        candidate = Path(os.environ[self.env_key]).expanduser()
        # Validate *before* caching: storing the path first would mean a
        # failed check is only reported once, and every later access would
        # quietly hand back the bad path. The error is raised explicitly
        # (not via an ``assert`` statement) so the check still runs under
        # ``python -O``.
        if not candidate.is_dir():
            raise AssertionError(f"'{candidate}' is not a directory")
        self._data_path = candidate
    return self._data_path
The Lines
@property
def lines(self) -> list:
    """The lines of text-data

    Every ``*.txt`` file directly inside ``data_path`` is read. Each line is
    stripped of leading and trailing whitespace and lower-cased; lines that
    are empty after stripping are dropped. The list is cached once all files
    have been read.
    """
    if self._lines is None:
        # Accumulate into a local list and cache it only after every file
        # has been read. Caching an empty list up front would mean that a
        # failure part-way through (bad data path, unreadable file) leaves
        # an empty or partial list that later accesses silently return.
        loaded = []
        for filename in self.data_path.glob("*.txt"):
            with filename.open() as play:
                cleaned = (line.strip() for line in play)
                loaded.extend(line.lower() for line in cleaned if line)
        self._lines = loaded
    return self._lines
The Training Set
@property
def training(self) -> list:
    """Subset of the lines for training

    Everything except the last ``validation_size`` lines. A
    ``validation_size`` of zero keeps every line for training; a size at
    least as large as the data leaves the training set empty.
    """
    if self._training is None:
        # Slice up to a computed index instead of ``[:-validation_size]``:
        # with a size of 0 the negative form becomes ``[:0]``, which is an
        # empty list rather than all of the lines.
        cutoff = max(len(self.lines) - self.validation_size, 0)
        self._training = self.lines[:cutoff]
    return self._training
The Validation Set
@property
def validation(self) -> list:
    """The validation subset of the lines

    The last ``validation_size`` lines. A ``validation_size`` of zero gives
    an empty validation set; a size at least as large as the data gives
    every line.
    """
    if self._validation is None:
        # Slice from a computed index instead of ``[-validation_size:]``:
        # with a size of 0 the negative form becomes ``[0:]``, which is
        # every line rather than none of them.
        cutoff = max(len(self.lines) - self.validation_size, 0)
        self._validation = self.lines[cutoff:]
    return self._validation
To Tensor
def to_tensor(self, line: str) -> list:
    """Encodes a line as unicode integers plus the end-of-sentence marker

    Args:
     line: the text to convert

    Returns:
     the unicode value of each character in ``line`` followed by ``end_of_sentence``
    """
    encoded = list(map(ord, line))
    encoded.append(self.end_of_sentence)
    return encoded
Check the Data Loader
from neurotic.nlp.deep_rnn.data_loader import DataLoader
loader = DataLoader()
# The class should reproduce the counts worked out step by step above.
expect(len(loader.lines)).to(equal(line_count))
expect(len(loader.validation)).to(equal(SPLIT))
expect(len(loader.training)).to(equal(line_count - SPLIT))
# ...and the same encoding as the stand-alone function.
expected = [97, 98, 99, 32, 120, 121, 122, 1]
actual = loader.to_tensor('abc xyz')
expect(actual).to(contain_exactly(*expected))
for line in loader.lines[:10]:
    print(line)
king john dramatis personae king john: prince henry son to the king. arthur duke of bretagne, nephew to the king. the earl of pembroke (pembroke:) the earl of essex (essex:) the earl of salisbury (salisbury:)