Loading the English and French Word Embeddings

Cloistered Monkey

2020-10-13 17:04

Beginning

This is the first post in a series - the document with links to all the posts in the series is this post.

The Machine Translation exercise uses word embeddings that are subsets of prebuilt Word2Vec (English) embeddings (GoogleNews-vectors-negative300.bin.gz) and prebuilt French Embeddings (wiki.multi.fr.vec). Coursera provides them but I thought it would be a good exercise to look at how they're built.

Imports

# python
from pathlib import Path
import os

# pypi
from dotenv import load_dotenv

Set Up

The Dotenv

To make loading files more or less portable I'm using a .env file with the paths to the data sets. This loads it into the environment so the values are accessible using os.environ.

# Load the data-set paths from the post's .env file into os.environ;
# override=True lets the file's values replace variables that are already set.
load_dotenv("posts/nlp/.env", override=True)

Middle

The Embeddings

As I noted, the English and French embeddings are available from the web. I was thinking of adding a download for when the files don't exist, but the Google News embeddings file is pretty big and the download takes a while on my internet connection, so I thought it'd be better to download it from a browser anyway. I'm going to assume the files are downloaded and the Google News embeddings are un-zipped (probably using gunzip or pigz, both of which are installed by default on Ubuntu 20.04).

Notes

"""This is a module for word embeddings loaders.
"""

Imports

# python
from argparse import Namespace
from pathlib import Path

import os
import pickle

# from pypi
from gensim.models.keyedvectors import BaseKeyedVectors, KeyedVectors

import attr
import pandas

The Raw Loader

@attr.s(auto_attribs=True)
class Embeddings:
    """Holder for word embeddings stored in the word2vec format

    Args:
     path: location of the embeddings file
     binary: whether the file is in the binary word2vec format
    """
    path: str
    binary: bool
    _embeddings: BaseKeyedVectors = None

The Embeddings

@property
def embeddings(self) -> BaseKeyedVectors:
    """The keyed vectors, read from ``self.path`` the first time they're needed"""
    if self._embeddings is not None:
        return self._embeddings
    self._embeddings = KeyedVectors.load_word2vec_format(
        self.path, binary=self.binary)
    return self._embeddings

The Subset Builder

@attr.s(auto_attribs=True)
class SubsetBuilder:
    """Builds matching subsets of two sets of embeddings

    Args:
     embeddings_1: word embeddings to look the dictionary keys up in
     embeddings_2: word embeddings to look the dictionary values up in
     subset_dict: maps words in embeddings_1 to words in embeddings_2
     output_1: path for the pickled subset of embeddings_1
     output_2: path for the pickled subset of embeddings_2
    """
    embeddings_1: KeyedVectors
    embeddings_2: KeyedVectors
    subset_dict: dict
    output_1: Path
    output_2: Path

    _vocabulary_1: set = None
    _vocabulary_2: set = None
    _subset_1: dict = None
    _subset_2: dict = None

Subset 1

@property
def subset_1(self) -> dict:
    """Subset of embeddings 1

    If it hasn't been set yet it's un-pickled from ``output_1`` when that
    file exists, otherwise it stays ``None``.
    """
    if self._subset_1 is not None:
        return self._subset_1
    if self.output_1.is_file():
        with self.output_1.open("rb") as source:
            self._subset_1 = pickle.load(source)
    return self._subset_1

Subset 2

@property
def subset_2(self) -> dict:
    """Subset of embeddings 2

    If it hasn't been set yet it's un-pickled from ``output_2`` when that
    file exists, otherwise it stays ``None``.
    """
    if self._subset_2 is not None:
        return self._subset_2
    if self.output_2.is_file():
        with self.output_2.open("rb") as source:
            self._subset_2 = pickle.load(source)
    return self._subset_2

Save

def pickle_it(self):
    """Save the subsets

    Each subset that isn't ``None`` is pickled to its output path. The
    output's parent folder is created first if it's missing so that saving
    to a fresh location doesn't fail with ``FileNotFoundError``.
    """
    if self.subset_1 is not None:
        self.output_1.parent.mkdir(parents=True, exist_ok=True)
        with self.output_1.open("wb") as writer:
            pickle.dump(self.subset_1, writer)
    if self.subset_2 is not None:
        self.output_2.parent.mkdir(parents=True, exist_ok=True)
        with self.output_2.open("wb") as writer:
            pickle.dump(self.subset_2, writer)
    return

Clean it

def clean(self) -> None:
    """Delete the pickled subsets and forget the ones held in memory"""
    for output in (self.output_1, self.output_2):
        if not output.is_file():
            continue
        output.unlink()
    self._subset_1 = None
    self._subset_2 = None
    return

Call the Subset Builder

def __call__(self, pickle_it: bool=True) -> None:
    """Makes sure both subsets exist, building them if either is missing

    Nothing happens if both subsets are already available (set or loadable
    from their pickles). Otherwise any old pickles are removed and the
    subsets are rebuilt from the pairs in ``subset_dict`` whose key is in
    the first embeddings and whose value is in the second.

    Args:
     pickle_it: whether to save newly built subsets
    """
    if self.subset_1 is not None and self.subset_2 is not None:
        return
    self.clean()
    self._subset_1, self._subset_2 = {}, {}
    for source_word, target_word in self.subset_dict.items():
        if source_word not in self.embeddings_1:
            continue
        if target_word not in self.embeddings_2:
            continue
        self._subset_1[source_word] = self.embeddings_1[source_word]
        self._subset_2[target_word] = self.embeddings_2[target_word]
    if pickle_it:
        self.pickle_it()
    return

Dict Loader

@attr.s(auto_attribs=True)
class DictLoader:
    """Loader for the english and french dictionaries

    This is specifically for the training and testing files
     - CSV-ish (separated by spaces instead of commas)
     - No header: column 1 = English, column 2 = French

    Args:
     path: path to the file
     columns: list of strings (the names to give the file's columns)
     delimiter: separator for the columns in the source file
    """
    path: str
    # a factory so every instance gets its own list rather than sharing one
    columns: list=attr.Factory(lambda: ["English", "French"])
    delimiter: str=" "

    _dataframe: pandas.DataFrame=None
    _dictionary: dict=None

Data Frame

@property
def dataframe(self) -> pandas.DataFrame:
    """The source file as a dataframe (read once, then re-used)"""
    if self._dataframe is not None:
        return self._dataframe
    self._dataframe = pandas.read_csv(
        self.path, names=self.columns, delimiter=self.delimiter)
    return self._dataframe

Dictionary

@property
def dictionary(self) -> dict:
    """Maps the terms in the first column to the terms in the second"""
    if self._dictionary is not None:
        return self._dictionary
    sources = self.dataframe[self.columns[0]]
    targets = self.dataframe[self.columns[1]]
    self._dictionary = dict(zip(sources, targets))
    return self._dictionary

Loading It

from neurotic.nlp.word_embeddings.embeddings import Embeddings

# the Google News file is loaded as binary, the French file as text
english_embeddings = Embeddings(path=os.environ["GOOGLE_EMBEDDINGS"],
                                binary=True)
french_embeddings = Embeddings(path=os.environ["FRENCH_EMBEDDINGS"],
                               binary=False)
print(english_embeddings.embeddings)

from neurotic.nlp.word_embeddings.embeddings import DictLoader

training = DictLoader(path=os.environ["ENGLISH_FRENCH_TRAINING"])
testing = DictLoader(path=os.environ["ENGLISH_FRENCH_TESTING"])

# the English terms that show up in both dictionaries
train_keys = set(training.dictionary)
test_keys = set(testing.dictionary)
print(train_keys & test_keys)

set()

After I made the subset builder it occurred to me that if there was overlap between the testing and training sets, but the shared terms mapped to different definitions, then the way I was going to build them would require two separate dictionaries. As you can see, though, the training and testing sets don't have any English terms in common.

# one dictionary with the training and the testing pairs
merged = {**training.dictionary, **testing.dictionary}
print(len(training.dictionary))
print(len(testing.dictionary))
print(len(merged))

5000
1500
6500

from neurotic.nlp.word_embeddings.embeddings import SubsetBuilder

# where the pickled subsets get saved
english_path = Path(os.environ["ENGLISH_EMBEDDINGS_SUBSET"])
french_path = Path(os.environ["FRENCH_EMBEDDINGS_SUBSET"])

# build the subsets from the full embeddings and pickle them
# (the build is skipped if both pickles already exist)
builder = SubsetBuilder(embeddings_1=english_embeddings.embeddings,
                        embeddings_2=french_embeddings.embeddings,
                        subset_dict=merged,
                        output_1=english_path, output_2=french_path)
builder()

# once the pickles exist the builder's subset_1 and subset_2 can be loaded
# from them without the embeddings or the dictionary
builder = SubsetBuilder(embeddings_1=None,
                        embeddings_2=None,
                        subset_dict=None,
                        output_1=english_path, output_2=french_path)

More Builders

After I tried using the EmbeddingsLoader on a different computer I realized that I didn't really simplify the creation of the embeddings all that much so I'm going to make an overall builder that maybe hides it from the end-user (although not entirely since I use environment variables that have to be set).

Source Keys

# names of the environment variables holding the paths to the source files
SourceKeys = Namespace(**{
    "english": "GOOGLE_EMBEDDINGS",
    "french": "FRENCH_EMBEDDINGS",
    "training": "ENGLISH_FRENCH_TRAINING",
    "testing": "ENGLISH_FRENCH_TESTING",
})

Target Keys

# names of the environment variables holding the paths for the built subsets
TargetKeys = Namespace(**{
    "english": "ENGLISH_EMBEDDINGS_SUBSET",
    "french": "FRENCH_EMBEDDINGS_SUBSET",
})

Keys

# the source and target environment variable names gathered in one place
Keys = Namespace(source=SourceKeys, target=TargetKeys)

Source Paths

@attr.s(auto_attribs=True)
class SourcePaths:
    """Paths to the files that come from outside of this project

    Each path is read from the environment variable named in
    ``keys.source`` the first time it's asked for and then kept.

    Args:
     keys: namespace with the names of the environment variables
    """
    keys: Namespace = Keys
    _english: Path = None
    _french: Path = None
    _training: Path = None
    _testing: Path = None

    @property
    def english(self) -> Path:
        """Where the english word-embeddings are"""
        if self._english is not None:
            return self._english
        variable = self.keys.source.english
        self._english = Path(os.environ[variable])
        return self._english

    @property
    def french(self) -> Path:
        """Where the french word-embeddings are"""
        if self._french is not None:
            return self._french
        variable = self.keys.source.french
        self._french = Path(os.environ[variable])
        return self._french

    @property
    def training(self) -> Path:
        """Where the training dictionary is"""
        if self._training is not None:
            return self._training
        variable = self.keys.source.training
        self._training = Path(os.environ[variable])
        return self._training

    @property
    def testing(self) -> Path:
        """Where the testing dictionary is"""
        if self._testing is not None:
            return self._testing
        variable = self.keys.source.testing
        self._testing = Path(os.environ[variable])
        return self._testing

Target Paths

@attr.s(auto_attribs=True)
class TargetPaths:
    """Paths for the files this project creates

    Each path is read from the environment variable named in
    ``keys.target`` the first time it's asked for and then kept.

    Args:
     keys: namespace with the names of the environment variables
    """
    keys: Namespace = Keys
    _english: Path = None
    _french: Path = None

    @property
    def english(self) -> Path:
        """Where the subset of the english embeddings goes"""
        if self._english is not None:
            return self._english
        variable = self.keys.target.english
        self._english = Path(os.environ[variable])
        return self._english

    @property
    def french(self) -> Path:
        """Where the subset of the french embeddings goes"""
        if self._french is not None:
            return self._french
        variable = self.keys.target.french
        self._french = Path(os.environ[variable])
        return self._french

Paths

@attr.s(auto_attribs=True)
class Paths:
    """Class to build and hold the source and target file paths

    The two path-holders are built the first time they're asked for unless
    they were passed in when the object was created.
    """
    # annotated with what's actually stored (path-holders, not single paths)
    _target: TargetPaths=None
    _source: SourcePaths=None

    @property
    def target(self) -> TargetPaths:
        """Holds object with paths to created embeddings subsets"""
        if self._target is None:
            self._target = TargetPaths()
        return self._target

    @property
    def source(self) -> SourcePaths:
        """Holds object with paths to original source files"""
        if self._source is None:
            self._source = SourcePaths()
        return self._source

Load And Build

@attr.s(auto_attribs=True)
class LoadAndBuild:
    """Loads the source files and builds the embeddings subsets

    Every part is created the first time it's asked for and then kept.
    Calling the object runs the subset builder.
    """
    _paths: Paths = None
    _english_embeddings: BaseKeyedVectors = None
    _french_embeddings: BaseKeyedVectors = None
    _training: dict = None
    _testing: dict = None
    _merged_dicts: dict = None
    _subset_builder: SubsetBuilder = None

    @property
    def paths(self) -> Paths:
        """Holder of the source and target paths"""
        if self._paths is not None:
            return self._paths
        self._paths = Paths()
        return self._paths

    @property
    def english_embeddings(self) -> BaseKeyedVectors:
        """The English word embeddings (loaded as a binary file)"""
        if self._english_embeddings is not None:
            return self._english_embeddings
        loader = Embeddings(self.paths.source.english, binary=True)
        self._english_embeddings = loader.embeddings
        return self._english_embeddings

    @property
    def french_embeddings(self) -> BaseKeyedVectors:
        """The French word embeddings (loaded as a text file)"""
        if self._french_embeddings is not None:
            return self._french_embeddings
        loader = Embeddings(self.paths.source.french, binary=False)
        self._french_embeddings = loader.embeddings
        return self._french_embeddings

    @property
    def training(self) -> dict:
        """Dictionary loaded from the training file"""
        if self._training is not None:
            return self._training
        loader = DictLoader(self.paths.source.training)
        self._training = loader.dictionary
        return self._training

    @property
    def testing(self) -> dict:
        """Dictionary loaded from the testing file"""
        if self._testing is not None:
            return self._testing
        loader = DictLoader(self.paths.source.testing)
        self._testing = loader.dictionary
        return self._testing

    @property
    def merged_dicts(self) -> dict:
        """The training and testing dictionaries as one dictionary

        The assertion checks that no key was in both dictionaries.
        """
        if self._merged_dicts is not None:
            return self._merged_dicts
        self._merged_dicts = self.training.copy()
        self._merged_dicts.update(self.testing)
        expected = len(self.training) + len(self.testing)
        assert len(self._merged_dicts) == expected
        return self._merged_dicts

    @property
    def subset_builder(self) -> SubsetBuilder:
        """The object that builds (or loads) the embeddings subsets"""
        if self._subset_builder is not None:
            return self._subset_builder
        self._subset_builder = SubsetBuilder(
            embeddings_1=self.english_embeddings,
            embeddings_2=self.french_embeddings,
            subset_dict=self.merged_dicts,
            output_1=self.paths.target.english,
            output_2=self.paths.target.french)
        return self._subset_builder

    def __call__(self) -> None:
        """Runs the subset builder"""
        self.subset_builder()
        return

A Loader

As a convenience I'm going to make a loader for all the parts.

# environment variable names for the subsets and the dictionaries
EmbeddingsKeys = Namespace(**{
    "english_subset": "ENGLISH_EMBEDDINGS_SUBSET",
    "french_subset": "FRENCH_EMBEDDINGS_SUBSET",
    "training": "ENGLISH_FRENCH_TRAINING",
    "testing": "ENGLISH_FRENCH_TESTING",
})

@attr.s(auto_attribs=True)
class EmbeddingsLoader:
    """Loader for the embeddings subsets and the dictionaries

    Warning:
     the environment variables that say where the files are have to be
    set before this is used - ``load_dotenv`` isn't called here

    """
    _loader_builder: LoadAndBuild = None
    _english_subset: dict = None
    _french_subset: dict = None
    _training: dict = None
    _testing: dict = None

@property
def loader_builder(self) -> LoadAndBuild:
    """The object that loads the sources and builds the subsets"""
    if self._loader_builder is not None:
        return self._loader_builder
    self._loader_builder = LoadAndBuild()
    return self._loader_builder

@property
def english_subset(self) -> dict:
    """The english embeddings subset

    This is the subset of the Google News embeddings whose keys match the
    english to french dictionaries. It's un-pickled if the target file
    exists, otherwise the loader-builder is run to create it.
    """
    if self._english_subset is not None:
        return self._english_subset
    pickled = self.loader_builder.paths.target.english
    if pickled.is_file():
        with pickled.open("rb") as reader:
            self._english_subset = pickle.load(reader)
    else:
        self.loader_builder()
        self._english_subset = self.loader_builder.subset_builder.subset_1
    return self._english_subset

@property
def french_subset(self) -> dict:
    """The french embeddings subset (from the MUSE French embeddings)

    It's un-pickled if the target file exists, otherwise the
    loader-builder is run to create it.
    """
    if self._french_subset is not None:
        return self._french_subset
    pickled = self.loader_builder.paths.target.french
    if not pickled.is_file():
        self.loader_builder()
        self._french_subset = self.loader_builder.subset_builder.subset_2
        return self._french_subset
    with pickled.open("rb") as reader:
        self._french_subset = pickle.load(reader)
    return self._french_subset

@property
def training(self) -> dict:
    """The dictionary loaded from the training file"""
    if self._training is not None:
        return self._training
    source = self.loader_builder.paths.source.training
    self._training = DictLoader(source).dictionary
    return self._training

@property
def testing(self) -> dict:
    """The dictionary loaded from the testing file"""
    if self._testing is not None:
        return self._testing
    source = self.loader_builder.paths.source.testing
    self._testing = DictLoader(source).dictionary
    return self._testing

End

The next step is to convert the embeddings to a data set.
The page that collects all the pages for this project is the Machine Translation page.

Table of Contents

Beginning

Imports

Set Up

The Dotenv

Middle

The Embeddings

Notes

Imports

The Raw Loader

The Subset Builder

Dict Loader

Loading It

More Builders

Source Keys

Target Keys

Keys

Source Paths

Target Paths

Paths

Load And Build

A Loader

End