Wasserstein GAN With Gradient Penalty

A Wasserstein GAN with Gradient Penalty (WGAN-GP)

We're going to build a Wasserstein GAN with Gradient Penalty (WGAN-GP), which addresses some of the training-stability problems of ordinary GANs. Specifically, we'll use a special kind of loss function known as the W-loss (W for Wasserstein) along with a gradient penalty to help prevent mode collapse (see Wasserstein Metric).

The Wasserstein metric is named after Leonid Vaseršteĭn, a mathematician at Penn State.
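
For reference, here is the objective we'll be implementing below (this is the standard WGAN-GP formulation from the Gulrajani et al. paper cited at the end; C is the critic, G the generator, λ the penalty weight, and x̂ = εx + (1 − ε)G(z) is a random interpolation between a real image and a generated one):

$$L_{\text{critic}} = \mathbb{E}\big[C(G(z))\big] - \mathbb{E}\big[C(x)\big] + \lambda\,\mathbb{E}\big[\big(\lVert \nabla_{\hat{x}} C(\hat{x}) \rVert_2 - 1\big)^2\big]$$

$$L_{\text{generator}} = -\mathbb{E}\big[C(G(z))\big]$$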

Imports

# python
from pathlib import Path

# from pypi
from torch import nn
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.datasets import MNIST
from torchvision.utils import make_grid

import holoviews
import matplotlib.pyplot as pyplot
import torch

# my stuff
from graeae import EmbedHoloviews, Timer

Set Up

The Random Seed

torch.manual_seed(0)

Plotting and the Timer

TIMER = Timer()
SLUG = "wasserstein-gan-with-gradient-penalty"

Helper Functions

def save_tensor_images(image_tensor: torch.Tensor,
                       filename: str, 
                       title: str,
                       folder: str=f"files/posts/gans{SLUG}",
                       num_images: int=25, size: tuple=(1, 28, 28)):
    """Plot an Image Tensor

    Args:
     image_tensor: tensor with the values for the image to plot
     filename: name to save the file under
     folder: path to put the file in
     title: title for the image
     num_images: how many images from the tensor to use
     size: the dimensions for each image
    """
    image_tensor = (image_tensor + 1) / 2
    image_unflat = image_tensor.detach().cpu()
    image_grid = make_grid(image_unflat[:num_images], nrow=5)
    pyplot.title(title)
    pyplot.grid(False)
    pyplot.imshow(image_grid.permute(1, 2, 0).squeeze())
    pyplot.tick_params(bottom=False, top=False, labelbottom=False,
                       right=False, left=False, labelleft=False)
    pyplot.savefig(folder + filename)
    print(f"[[file:{filename}]]")
    return
def holoviews_image(image: torch.Tensor, num_images: int=25) -> holoviews.Image:
    """Convert a batch of image tensors to a holoviews Image grid"""
    image_tensor = (image + 1) / 2
    image_unflat = image_tensor.detach().cpu()
    image_grid = make_grid(image_unflat[:num_images], nrow=5)
    # make_grid replicates the single MNIST channel, so keep one channel for a 2D image
    return holoviews.Image(image_grid[0].numpy())

Gradient Hook

This helps to keep track of the gradient for plotting

def make_grad_hook() -> tuple:
    """
    Function to keep track of gradients for visualization purposes, 
    which fills the grads list when using model.apply(grad_hook).
    """
    grads = []
    def grad_hook(m):
        if isinstance(m, nn.Conv2d) or isinstance(m, nn.ConvTranspose2d):
            grads.append(m.weight.grad)
    return grads, grad_hook
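
The hook isn't actually used in the training run below, but here's a minimal usage sketch (the throwaway `model` and the call pattern are illustrative assumptions, not code from this post):

# sketch: collect Conv2d weight gradients after a backward pass
import torch
from torch import nn

grads, hook = make_grad_hook()
model = nn.Sequential(nn.Conv2d(1, 4, kernel_size=3))
model(torch.randn(1, 1, 8, 8)).sum().backward()
model.apply(hook)              # appends each Conv2d weight gradient to `grads`
print(len(grads), grads[0].shape)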

Noise

def make_noise(n_samples: int, z_dim: int, device: str='cpu') -> torch.Tensor:
    """Alias for torch.randn

    Args:
      n_samples: the number of samples to generate
      z_dim: the dimension of the noise vector
      device: the device type

    Returns:
     tensor with random numbers from the normal distribution.
    """
    return torch.randn(n_samples, z_dim, device=device)

Middle

The Generator

This is the generator from the Deep Convolutional GAN (DCGAN) post.

class Generator(nn.Module):
    """The DCGAN Generator

    Args:
       z_dim: the dimension of the noise vector
       im_chan: the number of channels in the images, fitted for the dataset used
             (MNIST is black-and-white, so 1 channel is your default)
       hidden_dim: the inner dimension,
    """
    def __init__(self, z_dim: int=10, im_chan: int=1, hidden_dim: int=64):
        super().__init__()
        self.z_dim = z_dim

        self.gen = nn.Sequential(
            self.make_gen_block(z_dim, hidden_dim * 4),
            self.make_gen_block(hidden_dim * 4, hidden_dim * 2, kernel_size=4, stride=1),
            self.make_gen_block(hidden_dim * 2, hidden_dim),
            self.make_gen_block(hidden_dim, im_chan, kernel_size=4, final_layer=True),
        )

    def make_gen_block(self, input_channels: int, output_channels: int,
                       kernel_size: int=3, stride: int=2,
                       final_layer: bool=False) -> nn.Sequential:
        """Creates a block for the generator (sub sequence)

       The parts
        - a transposed convolution
        - a batchnorm (except for in the last layer)
        - an activation.

       Args:
           input_channels: how many channels the input feature representation has
           output_channels: how many channels the output feature representation should have
           kernel_size: the size of each convolutional filter, equivalent to (kernel_size, kernel_size)
           stride: the stride of the convolution
           final_layer: a boolean, true if it is the final layer and false otherwise 
                     (affects activation and batchnorm)

       Returns:
        the sub-sequence of layers
       """

        if not final_layer:
            return nn.Sequential(
                nn.ConvTranspose2d(input_channels, output_channels, kernel_size, stride),
                nn.BatchNorm2d(output_channels),
                nn.ReLU(inplace=True),
            )
        else:
            return nn.Sequential(
                nn.ConvTranspose2d(input_channels, output_channels, kernel_size, stride),
                nn.Tanh(),
            )

    def forward(self, noise: torch.Tensor) -> torch.Tensor:
        """complete a forward pass of the generator: Given a noise tensor, 

       Args:
        noise: a noise tensor with dimensions (n_samples, z_dim)

       Returns:
        generated images.
       """
        # unsqueeze the noise
        x = noise.view(len(noise), self.z_dim, 1, 1)
        return self.gen(x)

The Critic

This is essentially the same as the Discriminator class from before, but here its output is treated as an unbounded score of how real the image looks rather than as a classification.

class Critic(nn.Module):
    """
    Critic Class

    Args:
       im_chan: the number of channels in the images, fitted for the dataset used
             (MNIST is black-and-white, so 1 channel is your default)
       hidden_dim: the inner dimension
    """
    def __init__(self, im_chan: int=1, hidden_dim: int=64):
        super().__init__()
        self.crit = nn.Sequential(
            self.make_crit_block(im_chan, hidden_dim),
            self.make_crit_block(hidden_dim, hidden_dim * 2),
            self.make_crit_block(hidden_dim * 2, 1, final_layer=True),
        )

    def make_crit_block(self, input_channels: int, output_channels: int,
                        kernel_size: int=4, stride: int=2,
                        final_layer: bool=False) -> nn.Sequential:
        """Creates a sub-block for the network

        - a convolution
        - a batchnorm (except in the final layer)
        - an activation (except in the final layer).

       Args:
           input_channels: how many channels the input feature representation has
           output_channels: how many channels the output feature representation should have
           kernel_size: the size of each convolutional filter, equivalent to (kernel_size, kernel_size)
           stride: the stride of the convolution
           final_layer: a boolean, true if it is the final layer and false otherwise 
                     (affects activation and batchnorm)
       """
        if not final_layer:
            return nn.Sequential(
                nn.Conv2d(input_channels, output_channels, kernel_size,
                          stride),
                nn.BatchNorm2d(output_channels),
                nn.LeakyReLU(0.2),
            )
        else:
            return nn.Sequential(
                nn.Conv2d(input_channels, output_channels, kernel_size,
                          stride),
            )

    def forward(self, image: torch.Tensor) -> torch.Tensor:
        """Run a forward pass of the critic

       Args:
           image: an image tensor of shape (batch, im_chan, height, width)

       Returns:
        a 1-dimension tensor representing fake/real.
       """
        crit_pred = self.crit(image)
        return crit_pred.view(len(crit_pred), -1)
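
As a quick sanity check (a sketch, not part of the original training code), we can confirm that a noise batch flows through both networks with the expected shapes:

# sketch: the generator should produce 28x28 images and the critic one score per image
gen_check = Generator(z_dim=64)
crit_check = Critic()
noise_check = make_noise(4, 64)
images_check = gen_check(noise_check)
print(images_check.shape)               # torch.Size([4, 1, 28, 28])
print(crit_check(images_check).shape)   # torch.Size([4, 1])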

Training

Hyperparameters

As usual, we'll start by setting the parameters:

  • n_epochs: the number of times you iterate through the entire dataset when training
  • z_dim: the dimension of the noise vector
  • display_step: how often to display/visualize the images
  • batch_size: the number of images per forward/backward pass
  • lr: the learning rate
  • beta_1, beta_2: the momentum terms for the Adam optimizers
  • c_lambda: the weight of the gradient penalty
  • crit_repeats: the number of times to update the critic per generator update (there are more details about this in the Running the Training section below)
  • device: the device type
n_epochs = 100
z_dim = 64
display_step = 50
batch_size = 128
lr = 0.0002
beta_1 = 0.5
beta_2 = 0.999
c_lambda = 10
crit_repeats = 5
device = 'cuda'

The Data

Once again we'll be using the MNIST dataset.

transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),
])

data_path = Path("~/pytorch-data/MNIST/").expanduser()
dataloader = DataLoader(
    MNIST(data_path, download=True, transform=transform),
    batch_size=batch_size,
    shuffle=True)

Setup For Training

gen = Generator(z_dim).to(device)
gen_opt = torch.optim.Adam(gen.parameters(), lr=lr, betas=(beta_1, beta_2))
crit = Critic().to(device) 
crit_opt = torch.optim.Adam(crit.parameters(), lr=lr, betas=(beta_1, beta_2))
def weights_init(m):
    if isinstance(m, nn.Conv2d) or isinstance(m, nn.ConvTranspose2d):
        torch.nn.init.normal_(m.weight, 0.0, 0.02)
    if isinstance(m, nn.BatchNorm2d):
        torch.nn.init.normal_(m.weight, 0.0, 0.02)
        torch.nn.init.constant_(m.bias, 0)
    return
gen = gen.apply(weights_init)
crit = crit.apply(weights_init)

The Gradient

Calculating the gradient penalty can be broken into two functions: (1) compute the gradient with respect to the images and (2) compute the gradient penalty given the gradient.

You can start by getting the gradient. The gradient is computed by first creating a mixed image: weight the real and fake images by epsilon and (1 - epsilon) and add them together. Once you have this intermediate image, get the critic's output on it. Finally, compute the gradient of the critic's scores on the mixed images (the outputs) with respect to the pixels of the mixed images (the inputs).

def get_gradient(crit: Critic, real: torch.Tensor, fake: torch.Tensor,
                 epsilon: torch.Tensor) -> torch.Tensor:
    """Gradient of the critic's scores with respect to mixes of real and fake images.

    Args:
       crit: the critic model
       real: a batch of real images
       fake: a batch of fake images
       epsilon: a vector of the uniformly random proportions of real/fake per mixed image

    Returns:
       gradient: the gradient of the critic's scores, with respect to the mixed image
    """
    # Mix the images together
    mixed_images = real * epsilon + fake * (1 - epsilon)

    # Calculate the critic's scores on the mixed images
    mixed_scores = crit(mixed_images)

    # Take the gradient of the scores with respect to the images
    gradient = torch.autograd.grad(
        # take the gradient of the critic's scores (outputs) with respect to the mixed images (inputs)
        inputs=mixed_images,
        outputs=mixed_scores,
        # these other parameters have to do with how the pytorch autograd engine works
        grad_outputs=torch.ones_like(mixed_scores),
        create_graph=True,
        retain_graph=True,
    )[0]
    return gradient
  • Unit Tests
    def test_get_gradient(image_shape):
        real = torch.randn(*image_shape, device=device) + 1
        fake = torch.randn(*image_shape, device=device) - 1
        epsilon_shape = [1 for _ in image_shape]
        epsilon_shape[0] = image_shape[0]
        epsilon = torch.rand(epsilon_shape, device=device).requires_grad_()
        gradient = get_gradient(crit, real, fake, epsilon)
        assert tuple(gradient.shape) == image_shape
        assert gradient.max() > 0
        assert gradient.min() < 0
        return gradient
    
    gradient = test_get_gradient((256, 1, 28, 28))
    

The Gradient Penalty

The second function you need to complete is to compute the gradient penalty given the gradient. First, you calculate the magnitude of each image's gradient. The magnitude of a gradient is also called the norm. Then, you calculate the penalty by squaring the distance between each magnitude and the ideal norm of 1 and taking the mean of all the squared distances.

  1. Make sure to take the mean at the end.
  2. The magnitude of each image's gradient is its L2 norm over the flattened pixels.
def gradient_penalty(gradient: torch.Tensor) -> torch.Tensor:
    """Calculate the size of each image's gradient
    and penalize the mean quadratic distance of each magnitude to 1.

    Args:
       gradient: the gradient of the critic's scores, with respect to the mixed image

    Returns:
       penalty: the gradient penalty
    """
    # Flatten the gradients so that each row captures one image
    gradient = gradient.view(len(gradient), -1)

    # Calculate the magnitude of every row
    gradient_norm = gradient.norm(2, dim=1)

    # Penalize the mean squared distance of the gradient norms from 1
    penalty = torch.mean(torch.square(gradient_norm - 1))
    return penalty
  • Unit Testing
    def test_gradient_penalty(image_shape: tuple):
        bad_gradient = torch.zeros(*image_shape)
        bad_gradient_penalty = gradient_penalty(bad_gradient)
        assert torch.isclose(bad_gradient_penalty, torch.tensor(1.))
    
        image_size = torch.prod(torch.Tensor(image_shape[1:]))
        good_gradient = torch.ones(*image_shape) / torch.sqrt(image_size)
        good_gradient_penalty = gradient_penalty(good_gradient)
        assert torch.isclose(good_gradient_penalty, torch.tensor(0.))
    
        random_gradient = test_get_gradient(image_shape)
        random_gradient_penalty = gradient_penalty(random_gradient)
        assert torch.abs(random_gradient_penalty - 1) < 0.1
    
    test_gradient_penalty((256, 1, 28, 28))
    

Losses

Next, you need to calculate the loss for the generator and the critic.

  • Generator Loss

    For the generator, the loss is calculated by maximizing the critic's prediction on the generator's fake images. The argument has the scores for all fake images in the batch, but you will use the mean of them.

    1. This can be written in one line.
    2. This is the negative of the mean of the critic's scores.
    def get_gen_loss(crit_fake_pred: torch.Tensor) -> torch.Tensor:
        """loss of generator given critic's scores of generator's fake images.
    
        Args:
           crit_fake_pred: the critic's scores of the fake images
    
        Returns:
           gen_loss: a scalar loss value for the current batch of the generator
        """
        return -torch.mean(crit_fake_pred)
    
    assert torch.isclose(
        get_gen_loss(torch.tensor(1.)), torch.tensor(-1.0)
    )
    
    assert torch.isclose(
        get_gen_loss(torch.rand(10000)), torch.tensor(-0.5), 0.05
    )
    
  • The Critic Loss

    For the critic, the loss is calculated by maximizing the distance between the critic's scores on the real images and its scores on the fake images, while also adding a gradient penalty. The gradient penalty is weighted by lambda. The arguments are the scores for all the images in the batch, and you will use the mean of them.

    1. The higher the mean fake score, the higher the critic's loss.
    2. Conversely, the higher the mean real score, the lower the critic's loss.
    3. The higher the gradient penalty, the higher the critic's loss, in proportion to lambda.
    def get_crit_loss(crit_fake_pred: torch.Tensor, crit_real_pred: torch.Tensor,
                      gp: torch.Tensor, c_lambda: float) -> torch.Tensor:
        """loss of a critic given critic's scores for fake and real images,
        the gradient penalty, and gradient penalty weight.
    
        Args:
           crit_fake_pred: the critic's scores of the fake images
           crit_real_pred: the critic's scores of the real images
           gp: the unweighted gradient penalty
           c_lambda: the current weight of the gradient penalty 
    
        Returns:
           crit_loss: a scalar for the critic's loss, accounting for the relevant factors
        """
        return torch.mean(crit_fake_pred - crit_real_pred + gp * c_lambda)
    
    assert torch.isclose(
        get_crit_loss(torch.tensor(1.), torch.tensor(2.), torch.tensor(3.), 0.1),
        torch.tensor(-0.7)
    )
    assert torch.isclose(
        get_crit_loss(torch.tensor(20.), torch.tensor(-20.), torch.tensor(2.), 10),
        torch.tensor(60.)
    )
    

Running the Training

Before you put everything together, there are a few things to note.

  1. Even on GPU, the training will run more slowly than previous labs because the gradient penalty requires you to compute the gradient of a gradient – this means potentially a few minutes per epoch! For best results, run this for as long as you can while on GPU.
  2. One important difference from earlier versions is that you will update the critic multiple times for every generator update. This helps prevent the generator from overpowering the critic. Sometimes, you might see the reverse, with the generator updated more times than the critic. This depends on architectural choices (e.g. the depth and width of the network) and algorithmic choices (e.g. which loss you're using).
  3. WGAN-GP isn't necessarily meant to improve the overall performance of a GAN; it mainly increases stability and helps avoid mode collapse. In general, a WGAN will train in a much more stable way than the vanilla DCGAN from the previous post, though it will generally run a bit slower. You should also be able to train the model for more epochs without it collapsing.
def update_critic(critic, critic_optimizer, generator, generator_optimizer, batch_size, z_dim, real):
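    """Update the critic on one batch; returns this step's share of the critic loss (loss / crit_repeats) and the fake images."""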
    critic_optimizer.zero_grad()
    fake_noise = make_noise(batch_size, z_dim, device=device)
    fake = generator(fake_noise)
    crit_fake_pred = critic(fake.detach())
    crit_real_pred = critic(real)

    epsilon = torch.rand(len(real), 1, 1, 1, device=device, requires_grad=True)
    gradient = get_gradient(critic, real, fake.detach(), epsilon)
    gp = gradient_penalty(gradient)
    crit_loss = get_crit_loss(crit_fake_pred, crit_real_pred, gp, c_lambda)

    # Keep track of the average critic loss in this batch
    mean_iteration_critic_loss = crit_loss.detach().item() / crit_repeats
    # Update gradients
    crit_loss.backward()
    # Update the critic's weights
    critic_optimizer.step()
    return mean_iteration_critic_loss, fake
def update_generator(generator, generator_optimizer, critic, critic_optimizer,
                     batch_size, z_dim):
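    """Update the generator once; returns its loss for this step in a single-element list."""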
    generator_optimizer.zero_grad()
    fake_noise_2 = make_noise(batch_size, z_dim, device=device)
    fake_2 = generator(fake_noise_2)
    crit_fake_pred = critic(fake_2)

    gen_loss = get_gen_loss(crit_fake_pred)
    gen_loss.backward()

    # Update the generator's weights
    generator_optimizer.step()
    return [gen_loss.detach().item()]
cur_step = 0
generator_losses = []
critic_losses = []
fakes = []

with TIMER:
    for epoch in range(n_epochs):
        # Dataloader returns the batches
        for real, _ in dataloader:
            cur_batch_size = len(real)
            real = real.to(device)

            mean_iteration_critic_loss = 0
            for _ in range(crit_repeats):
                ### Update critic ###
                this_loss, fake = update_critic(crit, crit_opt, gen, gen_opt,
                                                cur_batch_size, z_dim, real)
                mean_iteration_critic_loss += this_loss
            critic_losses += [mean_iteration_critic_loss]

            ### Update generator ###
            # Keep track of the average generator loss
            generator_losses += update_generator(gen, gen_opt, crit, crit_opt,
                                                 cur_batch_size, z_dim)

            ### Visualization code ###
            if cur_step % display_step == 0 and cur_step > 0:
                gen_mean = sum(generator_losses[-display_step:]) / display_step
                crit_mean = sum(critic_losses[-display_step:]) / display_step
                print(f"Step {cur_step}: Generator loss: {gen_mean}, critic loss: {crit_mean}")
                fakes.append(fake)

            cur_step += 1
Started: 2021-04-23 16:44:37.086571
Step 50: Generator loss: 1.2940945455431938, critic loss: -2.5389487731456755
Step 100: Generator loss: 1.8233803486824036, critic loss: -10.170887191772463
Step 150: Generator loss: -0.8236922709643841, critic loss: -25.889275665283208
Step 200: Generator loss: -1.9489177632331849, critic loss: -57.93669644165039
Step 250: Generator loss: -1.6910316547751427, critic loss: -98.02721130371094
Step 300: Generator loss: -1.057899413406849, critic loss: -148.77607403564457
Step 350: Generator loss: -1.0930944073200226, critic loss: -199.94886077880858
Step 400: Generator loss: 1.900166620016098, critic loss: -245.53067184448247
Step 450: Generator loss: -18.928784263134002, critic loss: -251.46439450645448
Step 500: Generator loss: -7.688082475662231, critic loss: -289.45334830856325
Step 550: Generator loss: 12.447209596633911, critic loss: -395.351733947754
Step 600: Generator loss: 4.604712443947792, critic loss: -442.96986193847647
Step 650: Generator loss: 2.1788939160108565, critic loss: -480.044010131836
Step 700: Generator loss: 2.979072951376438, critic loss: -519.4769331054689
Step 750: Generator loss: -49.77768729448319, critic loss: -406.99980457305907
Step 800: Generator loss: 0.28986886143684387, critic loss: -444.8244698066711
Step 850: Generator loss: 31.1217813873291, critic loss: -608.1500103759765
Step 900: Generator loss: 12.006675623655319, critic loss: -632.0770750808719
Step 950: Generator loss: -0.15041383981704712, critic loss: -659.3277660064699
Step 1000: Generator loss: 15.936325817108154, critic loss: -629.952421447754
Step 1050: Generator loss: -43.25309041261673, critic loss: -504.54743419075004
Step 1100: Generator loss: -127.80617136001587, critic loss: -347.7993973159789
Step 1150: Generator loss: 4.186352119445801, critic loss: -461.6966292152405
Step 1200: Generator loss: 19.471285017728807, critic loss: -417.6742295103073
Step 1250: Generator loss: 34.04052387237549, critic loss: -327.74495936584475
Step 1300: Generator loss: -61.267093954086306, critic loss: -114.96264076042176
Step 1350: Generator loss: -56.96540081501007, critic loss: -257.8397505912781
Step 1400: Generator loss: -58.51407446861267, critic loss: -284.2404485015868
Step 1450: Generator loss: -31.23556293010712, critic loss: -282.15282668590544
Step 1500: Generator loss: 21.97936663866043, critic loss: -201.8184239835738
Step 1550: Generator loss: -35.051265001297, critic loss: -268.2542330398559
Step 1600: Generator loss: -13.768656857013703, critic loss: -201.92625104904172
Step 1650: Generator loss: 22.134875717163087, critic loss: -222.15251140356065
Step 1700: Generator loss: -33.80421092987061, critic loss: -196.00927429389947
Step 1750: Generator loss: -57.25435597419739, critic loss: -182.85244289588928
Step 1800: Generator loss: -41.60410815238953, critic loss: -213.254286611557
Step 1850: Generator loss: -4.978743267059326, critic loss: -101.88668561553959
Step 1900: Generator loss: 43.375376815795896, critic loss: 24.468120357513428
Step 1950: Generator loss: 37.55927352905273, critic loss: 19.142875072479246
Step 2000: Generator loss: 30.793880767822266, critic loss: 27.632160606384268
Step 2050: Generator loss: 28.9916410446167, critic loss: 37.41749234771728
Step 2100: Generator loss: 28.57459102630615, critic loss: 36.46667390441895
Step 2150: Generator loss: 27.179994583129883, critic loss: 37.36057964324953
Step 2200: Generator loss: 26.722407608032228, critic loss: 36.42123816680908
Step 2250: Generator loss: 26.215636711120606, critic loss: 35.10568865203857
Step 2300: Generator loss: 25.28977954864502, critic loss: 38.4949776916504
Step 2350: Generator loss: 25.161714172363283, critic loss: 30.91700393295288
Step 2400: Generator loss: 25.609521713256836, critic loss: 27.127794273376463
Step 2450: Generator loss: 26.457210426330565, critic loss: 23.25596778869629
Step 2500: Generator loss: 27.144473686218262, critic loss: 18.423582084655763
Step 2550: Generator loss: 28.104863624572754, critic loss: 16.720462280273438
Step 2600: Generator loss: 29.460466690063477, critic loss: 13.846090631484987
Step 2650: Generator loss: 31.16196632385254, critic loss: 10.717047594070436
Step 2700: Generator loss: 32.86851013183594, critic loss: 8.973742393493652
Step 2750: Generator loss: 33.90616256713867, critic loss: 9.844469717025756
Step 2800: Generator loss: 34.65669334411621, critic loss: 8.557393852233888
Step 2850: Generator loss: 35.84923110961914, critic loss: 6.227309632301333
Step 2900: Generator loss: 37.1290372467041, critic loss: 4.664727992773057
Step 2950: Generator loss: 39.00773422241211, critic loss: 3.6960949053764343
Step 3000: Generator loss: 41.04932693481445, critic loss: 3.064339481592179
Step 3050: Generator loss: 43.54303398132324, critic loss: 1.5976664029359815
Step 3100: Generator loss: 46.25879165649414, critic loss: 0.43558707976341254
Step 3150: Generator loss: 48.358483200073245, critic loss: -0.8735819962918758
Step 3200: Generator loss: 49.9193138885498, critic loss: -1.9399951877593993
Step 3250: Generator loss: 50.604149169921875, critic loss: -2.96596682035923
Step 3300: Generator loss: 51.37260269165039, critic loss: -4.266795755624772
Step 3350: Generator loss: 50.53414665222168, critic loss: -6.2572406907081595
Step 3400: Generator loss: 49.34995780944824, critic loss: -8.031075536847114
Step 3450: Generator loss: 46.14337966918945, critic loss: -8.019683789610863
Step 3500: Generator loss: 42.769298782348635, critic loss: -9.498445952415468
Step 3550: Generator loss: 37.38293798446655, critic loss: -9.02791331624985
Step 3600: Generator loss: 32.84453460693359, critic loss: -9.934100509524345
Step 3650: Generator loss: 29.88087886810303, critic loss: -9.069164658904075
Step 3700: Generator loss: 27.295934791564942, critic loss: -12.109804625511167
Step 3750: Generator loss: 23.694135398864745, critic loss: -14.327697192192076
Step 3800: Generator loss: 22.836445541381835, critic loss: -15.450897558450697
Step 3850: Generator loss: 21.66964967727661, critic loss: -18.371595690727236
Step 3900: Generator loss: 22.644691734313966, critic loss: -18.472765784740442
Step 3950: Generator loss: 23.275020160675048, critic loss: -14.622903740763663
Step 4000: Generator loss: 20.404177145957945, critic loss: -20.265531128883364
Step 4050: Generator loss: 20.57322360277176, critic loss: -22.811122689247135
Step 4100: Generator loss: 20.653975734710695, critic loss: -21.081045699119564
Step 4150: Generator loss: 22.07396845817566, critic loss: -25.1140656299591
Step 4200: Generator loss: 23.147041385173797, critic loss: -25.637423175573346
Step 4250: Generator loss: 24.7466512966156, critic loss: -27.446938713431358
Step 4300: Generator loss: 23.155011949539183, critic loss: -29.866371290445326
Step 4350: Generator loss: 28.670740413665772, critic loss: -27.526438851594932
Step 4400: Generator loss: 28.197952184677124, critic loss: -32.777981144189845
Step 4450: Generator loss: 30.352355518341064, critic loss: -27.1594803442955
Step 4500: Generator loss: 28.54464930534363, critic loss: -33.94081681919097
Step 4550: Generator loss: 30.315768175125122, critic loss: -32.86432695555688
Step 4600: Generator loss: 31.542511186599732, critic loss: -30.20407930350304
Step 4650: Generator loss: 32.1046596121788, critic loss: -25.409390352845193
Step 4700: Generator loss: 32.14258025169372, critic loss: -31.69375462341309
Step 4750: Generator loss: 34.99601099014282, critic loss: -17.207461384415634
Step 4800: Generator loss: 34.72456073760986, critic loss: -28.68983098757266
Step 4850: Generator loss: 43.15867195129395, critic loss: -3.741025509417056
Step 4900: Generator loss: 39.205870933532715, critic loss: -10.995340047717095
Step 4950: Generator loss: 33.214964599609374, critic loss: -22.35341439080238
Step 5000: Generator loss: 36.83505029678345, critic loss: -22.059852074146274
Step 5050: Generator loss: 44.310142288208006, critic loss: -9.833503689646719
Step 5100: Generator loss: 46.455570983886716, critic loss: -6.97827914196253
Step 5150: Generator loss: 50.3965446472168, critic loss: 2.86564082187414
Step 5200: Generator loss: 49.87795219421387, critic loss: -1.3452879690229889
Step 5250: Generator loss: 47.53674819946289, critic loss: -2.096805039405823
Step 5300: Generator loss: 46.8746314239502, critic loss: -3.2593628435134883
Step 5350: Generator loss: 45.44812057495117, critic loss: -8.15779336643219
Step 5400: Generator loss: 44.419895820617675, critic loss: -14.570247013330457
Step 5450: Generator loss: 46.02410781860352, critic loss: -15.177982830524446
Step 5500: Generator loss: 49.54875686645508, critic loss: -9.89209368979931
Step 5550: Generator loss: 48.06167510986328, critic loss: -14.110691767692567
Step 5600: Generator loss: 49.201857833862306, critic loss: -14.137419148623945
Step 5650: Generator loss: 50.152088012695316, critic loss: -12.306397112727165
Step 5700: Generator loss: 48.29638786315918, critic loss: -16.661144974470133
Step 5750: Generator loss: 48.57353067398071, critic loss: -14.890159791767603
Step 5800: Generator loss: 49.75064552307129, critic loss: -18.844482659339906
Step 5850: Generator loss: 60.04904914855957, critic loss: -6.717597324132919
Step 5900: Generator loss: 51.537723999023434, critic loss: -16.97626993632317
Step 5950: Generator loss: 53.64197952270508, critic loss: -17.934735801696778
Step 6000: Generator loss: 58.61811660766602, critic loss: -12.544874910593034
Step 6050: Generator loss: 57.9530167388916, critic loss: -12.869983579158779
Step 6100: Generator loss: 58.112417755126955, critic loss: -14.860800614833833
Step 6150: Generator loss: 59.45550857543945, critic loss: -16.21854728984833
Step 6200: Generator loss: 61.55990020751953, critic loss: -13.752459713578226
Step 6250: Generator loss: 63.91949012756348, critic loss: -15.32866345870495
Step 6300: Generator loss: 61.11529357910156, critic loss: -19.138810309529305
Step 6350: Generator loss: 68.78476165771484, critic loss: -3.858711770117282
Step 6400: Generator loss: 72.07508163452148, critic loss: -3.3317795319557204
Step 6450: Generator loss: 62.11038558959961, critic loss: -12.74781008577347
Step 6500: Generator loss: 66.10368064880372, critic loss: -13.66576182627678
Step 6550: Generator loss: 62.73857864379883, critic loss: -19.79733684468269
Step 6600: Generator loss: 64.86283889770507, critic loss: -15.91535943055153
Step 6650: Generator loss: 65.02771781921386, critic loss: -16.515603628635407
Step 6700: Generator loss: 73.10651649475098, critic loss: -7.974747009277344
Step 6750: Generator loss: 69.39200439453126, critic loss: -12.647881946563723
Step 6800: Generator loss: 70.61859390258789, critic loss: -14.981548887073998
Step 6850: Generator loss: 71.39209846496583, critic loss: -12.02037605035305
Step 6900: Generator loss: 68.91642692565918, critic loss: -17.377452049493794
Step 6950: Generator loss: 73.83714424133301, critic loss: -14.842290714025498
Step 7000: Generator loss: 76.0492682647705, critic loss: -4.022153543114662
Step 7050: Generator loss: 73.60314575195312, critic loss: -11.167652189731598
Step 7100: Generator loss: 73.69744178771973, critic loss: -16.215790304422377
Step 7150: Generator loss: 73.02161018371582, critic loss: -11.844917020320892
Step 7200: Generator loss: 84.43860961914062, critic loss: -4.338678442955016
Step 7250: Generator loss: 72.4216611480713, critic loss: -16.95018665671349
Step 7300: Generator loss: 75.08161041259766, critic loss: -13.94019952297211
Step 7350: Generator loss: 76.7044221496582, critic loss: -14.254385577440262
Step 7400: Generator loss: 81.03584564208984, critic loss: -3.171723330259324
Step 7450: Generator loss: 80.19454528808593, critic loss: -6.323260527610778
Step 7500: Generator loss: 74.55620361328126, critic loss: -8.62027923491597
Step 7550: Generator loss: 84.05591217041015, critic loss: -3.5706960783004775
Step 7600: Generator loss: 81.2258724975586, critic loss: -8.142396178722382
Step 7650: Generator loss: 73.19812255859375, critic loss: -16.196065732836722
Step 7700: Generator loss: 74.52944702148437, critic loss: -15.7419521817565
Step 7750: Generator loss: 80.32163719177247, critic loss: -7.413010147571564
Step 7800: Generator loss: 76.99493499755859, critic loss: -12.079633572757244
Step 7850: Generator loss: 81.32430145263672, critic loss: -2.8193510160446174
Step 7900: Generator loss: 80.63022003173828, critic loss: -3.1151746976375576
Step 7950: Generator loss: 75.89005561828613, critic loss: -8.688790566921234
Step 8000: Generator loss: 72.94720428466798, critic loss: -14.186805599212649
Step 8050: Generator loss: 80.84135955810547, critic loss: -11.586392744839191
Step 8100: Generator loss: 79.48079322814941, critic loss: -1.3788062819838527
Step 8150: Generator loss: 72.63796539306641, critic loss: -14.767250993669036
Step 8200: Generator loss: 76.29679145812989, critic loss: -16.04671211397648
Step 8250: Generator loss: 72.60974617004395, critic loss: -17.008654308915133
Step 8300: Generator loss: 75.25621772766114, critic loss: -12.109682399034496
Step 8350: Generator loss: 81.09654647827148, critic loss: -10.706179085254668
Step 8400: Generator loss: 77.28005485534668, critic loss: -4.09239830350876
Step 8450: Generator loss: 83.45014526367187, critic loss: -3.1862959499359125
Step 8500: Generator loss: 80.24715942382812, critic loss: -4.144565615177154
Step 8550: Generator loss: 76.43464157104492, critic loss: -9.53649512773752
Step 8600: Generator loss: 73.67140350341796, critic loss: -15.18680296653509
Step 8650: Generator loss: 75.6114599609375, critic loss: -10.128391755342484
Step 8700: Generator loss: 73.68272163391113, critic loss: -16.97586714470387
Step 8750: Generator loss: 83.1702619934082, critic loss: -0.6609140309095384
Step 8800: Generator loss: 80.41752578735351, critic loss: -4.212692310333251
Step 8850: Generator loss: 71.03237358093261, critic loss: -14.983835175275805
Step 8900: Generator loss: 75.80495880126954, critic loss: -12.667168443322183
Step 8950: Generator loss: 81.14228034973145, critic loss: 2.7472501730918872
Step 9000: Generator loss: 81.20193344116211, critic loss: -3.052738008499146
Step 9050: Generator loss: 73.43904174804688, critic loss: -7.423715700268742
Step 9100: Generator loss: 73.12181861877441, critic loss: -14.306883191585541
Step 9150: Generator loss: 76.89906158447266, critic loss: -13.396733086347583
Step 9200: Generator loss: 75.99712623596191, critic loss: -12.318668732821939
Step 9250: Generator loss: 77.78204513549805, critic loss: -6.621456883490087
Step 9300: Generator loss: 77.82661689758301, critic loss: -11.999425900220869
Step 9350: Generator loss: 81.48483535766601, critic loss: -11.480147421479224
Step 9400: Generator loss: 75.37383903503418, critic loss: -11.605070021390913
Step 9450: Generator loss: 83.24758972167969, critic loss: -1.770111013114451
Step 9500: Generator loss: 75.71745803833008, critic loss: -14.370290687352417
Step 9550: Generator loss: 80.75228134155273, critic loss: -12.244659341961144
Step 9600: Generator loss: 80.36522689819336, critic loss: -9.994889120757579
Step 9650: Generator loss: 79.76879989624024, critic loss: -12.11628355455398
Step 9700: Generator loss: 75.03965270996093, critic loss: -15.582087687492374
Step 9750: Generator loss: 78.26055725097656, critic loss: -9.227732668161394
Step 9800: Generator loss: 86.73946716308593, critic loss: -3.9114915781021113
Step 9850: Generator loss: 77.57634506225585, critic loss: -16.903033419966697
Step 9900: Generator loss: 79.62038360595703, critic loss: -13.387711975812913
Step 9950: Generator loss: 83.48049461364747, critic loss: 0.4212318459749224
Step 10000: Generator loss: 86.0385548400879, critic loss: -3.0202082567214954
Step 10050: Generator loss: 84.96556030273437, critic loss: -3.2984186277389527
Step 10100: Generator loss: 82.55163467407226, critic loss: -5.651416356563568
Step 10150: Generator loss: 72.47459297180175, critic loss: -16.2935069770813
Step 10200: Generator loss: 77.47050117492675, critic loss: -14.219993201971054
Step 10250: Generator loss: 82.40048095703125, critic loss: -9.951535837292676
Step 10300: Generator loss: 78.51686393737793, critic loss: -5.037457182884218
Step 10350: Generator loss: 79.00918548583985, critic loss: -10.983480290770531
Step 10400: Generator loss: 79.10479446411132, critic loss: -11.458023426651957
Step 10450: Generator loss: 79.01952590942383, critic loss: -13.550984252214432
Step 10500: Generator loss: 79.7324333190918, critic loss: -15.04755926167965
Step 10550: Generator loss: 83.25529792785645, critic loss: -10.678096773743627
Step 10600: Generator loss: 78.7729409790039, critic loss: -14.363517974853519
Step 10650: Generator loss: 83.80620101928712, critic loss: -12.4009742795825
Step 10700: Generator loss: 83.44554489135743, critic loss: -5.4632708239853365
Step 10750: Generator loss: 84.38950912475586, critic loss: -4.946207571595907
Step 10800: Generator loss: 84.90599151611327, critic loss: -10.688541789770127
Step 10850: Generator loss: 80.39469886779786, critic loss: -13.391746405303474
Step 10900: Generator loss: 79.68403381347656, critic loss: -14.792330410242082
Step 10950: Generator loss: 84.55435623168945, critic loss: -12.792006389081477
Step 11000: Generator loss: 85.3377848815918, critic loss: -1.002582928955554
Step 11050: Generator loss: 76.42176498413086, critic loss: -16.618346381425855
Step 11100: Generator loss: 82.8500619506836, critic loss: -10.213502784013746
Step 11150: Generator loss: 80.111083984375, critic loss: -16.506468793153765
Step 11200: Generator loss: 81.84511749267578, critic loss: -14.588824108004571
Step 11250: Generator loss: 82.36108421325683, critic loss: -14.826971750736238
Step 11300: Generator loss: 82.89525245666503, critic loss: -14.743118989944467
Step 11350: Generator loss: 78.9609211730957, critic loss: -6.072368972778322
Step 11400: Generator loss: 79.75704879760742, critic loss: -11.66915795624256
Step 11450: Generator loss: 92.73718231201173, critic loss: -8.626956017732619
Step 11500: Generator loss: 76.74110557556152, critic loss: -13.485125755786896
Step 11550: Generator loss: 86.92150177001953, critic loss: -11.96049699956179
Step 11600: Generator loss: 87.94025703430175, critic loss: -7.829241111636162
Step 11650: Generator loss: 78.58638778686523, critic loss: -13.818019400000573
Step 11700: Generator loss: 82.94163925170898, critic loss: -16.088717435359957
Step 11750: Generator loss: 82.20194442749023, critic loss: -13.443735618114472
Step 11800: Generator loss: 77.3590771484375, critic loss: -0.26538432469963885
Step 11850: Generator loss: 87.65712219238281, critic loss: -2.2925723257064816
Step 11900: Generator loss: 86.44266906738281, critic loss: -2.755362086296081
Step 11950: Generator loss: 85.7614064025879, critic loss: -2.9416364326477047
Step 12000: Generator loss: 84.22476821899414, critic loss: -3.100327790260315
Step 12050: Generator loss: 81.84705871582031, critic loss: -3.3889783926010137
Step 12100: Generator loss: 74.62463600158691, critic loss: -9.155223772525787
Step 12150: Generator loss: 83.41003746032715, critic loss: -7.312069640517238
Step 12200: Generator loss: 77.82574188232422, critic loss: -10.063361536026001
Step 12250: Generator loss: 77.09058532714843, critic loss: -15.389594004154203
Step 12300: Generator loss: 85.65135437011719, critic loss: -11.597671725511553
Step 12350: Generator loss: 79.91491325378418, critic loss: -0.8456090040206905
Step 12400: Generator loss: 83.31446044921876, critic loss: -3.5672192862033842
Step 12450: Generator loss: 80.4154541015625, critic loss: -9.493659735798834
Step 12500: Generator loss: 77.22660888671875, critic loss: -11.343838263094426
Step 12550: Generator loss: 76.51863540649414, critic loss: -15.957162732720372
Step 12600: Generator loss: 71.82434341430664, critic loss: -15.232202378749843
Step 12650: Generator loss: 81.55846801757812, critic loss: -12.02893185913563
Step 12700: Generator loss: 77.01351791381836, critic loss: -14.394531373143197
Step 12750: Generator loss: 82.79933059692382, critic loss: -10.995534277558324
Step 12800: Generator loss: 80.33022705078125, critic loss: -7.422801446437835
Step 12850: Generator loss: 77.88019416809082, critic loss: -10.48680070441961
Step 12900: Generator loss: 77.28355583190918, critic loss: -15.062006795048712
Step 12950: Generator loss: 72.02762420654297, critic loss: -18.125201426446434
Step 13000: Generator loss: 78.97825164794922, critic loss: -11.02606911355257
Step 13050: Generator loss: 76.02745002746582, critic loss: -13.242777463912965
Step 13100: Generator loss: 82.44893028259277, critic loss: -10.203380972802634
Step 13150: Generator loss: 80.63447105407715, critic loss: -11.436619911789894
Step 13200: Generator loss: 69.52673934936523, critic loss: -12.998723325610163
Step 13250: Generator loss: 75.26367416381837, critic loss: -12.58380482053757
Step 13300: Generator loss: 78.29216751098633, critic loss: 0.21028297042846839
Step 13350: Generator loss: 70.94842475891113, critic loss: -8.405993442416191
Step 13400: Generator loss: 77.60350791931153, critic loss: -12.201066960632803
Step 13450: Generator loss: 78.38650337219238, critic loss: -13.255251537919046
Step 13500: Generator loss: 72.39071220397949, critic loss: -13.91472595399618
Step 13550: Generator loss: 78.81595336914063, critic loss: -12.717635474145416
Step 13600: Generator loss: 69.23250061035156, critic loss: -15.01334501111508
Step 13650: Generator loss: 77.3666291809082, critic loss: -16.321711009979246
Step 13700: Generator loss: 73.45859939575195, critic loss: -17.17580293393135
Step 13750: Generator loss: 74.07134948730469, critic loss: -14.143001305580142
Step 13800: Generator loss: 68.98319381713867, critic loss: -18.013431072473526
Step 13850: Generator loss: 73.18379371643067, critic loss: -13.245033169150352
Step 13900: Generator loss: 73.70108238220215, critic loss: -15.747089947700497
Step 13950: Generator loss: 71.67143341064452, critic loss: -6.442092946648602
Step 14000: Generator loss: 74.99322380065918, critic loss: -5.310949310302733
Step 14050: Generator loss: 69.55456466674805, critic loss: -7.584069814443586
Step 14100: Generator loss: 68.11343818664551, critic loss: -15.932588892817499
Step 14150: Generator loss: 73.32868095397949, critic loss: -14.538219540774824
Step 14200: Generator loss: 71.54050506591797, critic loss: -6.507004916965961
Step 14250: Generator loss: 73.50055587768554, critic loss: -12.074983437180519
Step 14300: Generator loss: 75.37609176635742, critic loss: -12.215355042934414
Step 14350: Generator loss: 78.41978523254394, critic loss: -13.282461894750588
Step 14400: Generator loss: 69.06725090026856, critic loss: -8.44315874606371
Step 14450: Generator loss: 77.47375007629394, critic loss: -10.59642046368122
Step 14500: Generator loss: 72.112548828125, critic loss: -9.080148652315138
Step 14550: Generator loss: 71.41747200012207, critic loss: -12.610691975355143
Step 14600: Generator loss: 68.53853507995605, critic loss: -14.517420025825501
Step 14650: Generator loss: 71.00217765808105, critic loss: -16.055311642885208
Step 14700: Generator loss: 75.56183944702148, critic loss: -4.261986103117466
Step 14750: Generator loss: 68.21860916137695, critic loss: -14.03696541213989
Step 14800: Generator loss: 71.7959959411621, critic loss: -13.989702057063587
Step 14850: Generator loss: 76.38227409362793, critic loss: -10.939811514139176
Step 14900: Generator loss: 67.81556015014648, critic loss: -15.070325279712678
Step 14950: Generator loss: 71.62906150817871, critic loss: -12.239016912937165
Step 15000: Generator loss: 73.60893783569335, critic loss: -5.476252611890436
Step 15050: Generator loss: 64.43828086853027, critic loss: -11.680644391536712
Step 15100: Generator loss: 66.76135398864746, critic loss: -18.935012437820443
Step 15150: Generator loss: 64.45999412536621, critic loss: -16.77594568133354
Step 15200: Generator loss: 68.81907485961914, critic loss: -16.819265387773513
Step 15250: Generator loss: 71.44663459777831, critic loss: -14.780536164999004
Step 15300: Generator loss: 69.05639785766601, critic loss: -15.773872276782981
Step 15350: Generator loss: 72.00610313415527, critic loss: -12.428475862145426
Step 15400: Generator loss: 66.33817840576172, critic loss: -11.460507846534249
Step 15450: Generator loss: 73.98657371520996, critic loss: -12.046799251675607
Step 15500: Generator loss: 71.14604766845703, critic loss: -14.48868891143799
Step 15550: Generator loss: 72.55676879882813, critic loss: -9.285633412837981
Step 15600: Generator loss: 72.27706947326661, critic loss: -11.433179477930068
Step 15650: Generator loss: 70.14899436950684, critic loss: -14.64919223260879
Step 15700: Generator loss: 70.52759353637695, critic loss: -13.822800672113893
Step 15750: Generator loss: 66.5163092803955, critic loss: -13.497988208055496
Step 15800: Generator loss: 65.68713722229003, critic loss: -13.090139507174491
Step 15850: Generator loss: 68.86076667785645, critic loss: -12.112882311582563
Step 15900: Generator loss: 72.71573020935058, critic loss: -11.739855915784835
Step 15950: Generator loss: 69.23649925231933, critic loss: 1.5610642746686931
Step 16000: Generator loss: 63.27606719970703, critic loss: -6.625546929836272
Step 16050: Generator loss: 65.96758232116699, critic loss: -17.750343059539794
Step 16100: Generator loss: 62.09547576904297, critic loss: -17.4672027888298
Step 16150: Generator loss: 75.76868995666504, critic loss: -12.86666469740868
Step 16200: Generator loss: 64.08884880065918, critic loss: -7.587684287369252
Step 16250: Generator loss: 64.09755882263184, critic loss: -10.57423495966196
Step 16300: Generator loss: 66.86840660095214, critic loss: -3.5877239196300508
Step 16350: Generator loss: 70.60188285827637, critic loss: -7.692209842979907
Step 16400: Generator loss: 62.29129165649414, critic loss: -12.677523095130923
Step 16450: Generator loss: 62.98073165893555, critic loss: -13.866112356960771
Step 16500: Generator loss: 61.780632400512694, critic loss: -6.281874860048294
Step 16550: Generator loss: 62.74724609375, critic loss: -13.956338333368299
Step 16600: Generator loss: 61.48925178527832, critic loss: -16.810678883075717
Step 16650: Generator loss: 52.7329150390625, critic loss: -18.111987345457074
Step 16700: Generator loss: 60.26760322570801, critic loss: -17.837719259858133
Step 16750: Generator loss: 60.27441291809082, critic loss: -14.668455944180492
Step 16800: Generator loss: 64.81710945129394, critic loss: -8.937785160303115
Step 16850: Generator loss: 61.87463485717773, critic loss: -13.174851733446122
Step 16900: Generator loss: 66.52726516723632, critic loss: -17.641908020138743
Step 16950: Generator loss: 63.35795883178711, critic loss: -17.725372922539712
Step 17000: Generator loss: 67.46929817199707, critic loss: -13.343407141447067
Step 17050: Generator loss: 59.79177055358887, critic loss: -16.512492282271385
Step 17100: Generator loss: 66.42052528381348, critic loss: -9.183917128443717
Step 17150: Generator loss: 59.208996353149416, critic loss: -13.243339603602893
Step 17200: Generator loss: 63.88817520141602, critic loss: -13.442776112914084
Step 17250: Generator loss: 69.03452033996582, critic loss: -11.0614826682806
Step 17300: Generator loss: 57.58331108093262, critic loss: -13.529039879202841
Step 17350: Generator loss: 67.6368569946289, critic loss: -11.620229701399802
Step 17400: Generator loss: 60.044710845947264, critic loss: -9.055887681692841
Step 17450: Generator loss: 64.43620628356933, critic loss: -11.920627628207207
Step 17500: Generator loss: 56.046851272583005, critic loss: -22.301562000870714
Step 17550: Generator loss: 62.282958908081056, critic loss: -14.955312865734099
Step 17600: Generator loss: 65.897964553833, critic loss: -6.340100202620029
Step 17650: Generator loss: 58.14865257263184, critic loss: -10.649906709671022
Step 17700: Generator loss: 66.7437523651123, critic loss: -12.996105446338657
Step 17750: Generator loss: 63.59517837524414, critic loss: -12.67168800020218
Step 17800: Generator loss: 65.87414787292481, critic loss: -13.124171116769311
Step 17850: Generator loss: 66.03790901184082, critic loss: -13.294757736086847
Step 17900: Generator loss: 56.273787307739255, critic loss: -18.649981175422667
Step 17950: Generator loss: 68.3505224609375, critic loss: -13.77443748676777
Step 18000: Generator loss: 59.758854675292966, critic loss: -13.528435281991955
Step 18050: Generator loss: 70.61318840026856, critic loss: -12.014795050919052
Step 18100: Generator loss: 62.63155372619629, critic loss: -9.601117482304572
Step 18150: Generator loss: 58.44773849487305, critic loss: -9.71425095164776
Step 18200: Generator loss: 59.341090240478515, critic loss: -16.878086137115954
Step 18250: Generator loss: 57.848808708190916, critic loss: -19.507797758817674
Step 18300: Generator loss: 63.10433967590332, critic loss: -4.3734778246283526
Step 18350: Generator loss: 57.56446601867676, critic loss: -10.789146659135817
Step 18400: Generator loss: 51.76399398803711, critic loss: -15.076736944794657
Step 18450: Generator loss: 57.02366355895996, critic loss: -12.479052137970923
Step 18500: Generator loss: 62.833531875610355, critic loss: -12.99720428943634
Step 18550: Generator loss: 56.588841400146485, critic loss: -14.211519970417026
Step 18600: Generator loss: 61.620222854614255, critic loss: -14.894168957710265
Step 18650: Generator loss: 59.04514297485352, critic loss: -3.9987226614952096
Step 18700: Generator loss: 54.68501613616943, critic loss: -13.798751793980603
Step 18750: Generator loss: 60.477030181884764, critic loss: -13.97087904036045
Step 18800: Generator loss: 59.51054759979248, critic loss: -18.694762709856033
Step 18850: Generator loss: 53.82080192565918, critic loss: -14.210277070969342
Step 18900: Generator loss: 64.43251205444336, critic loss: -13.768319549560543
Step 18950: Generator loss: 56.9788289642334, critic loss: -10.571144456863403
Step 19000: Generator loss: 59.03595703125, critic loss: -12.603199533462528
Step 19050: Generator loss: 60.19775802612305, critic loss: -14.499388661146167
Step 19100: Generator loss: 59.592409973144534, critic loss: -8.202755635917187
Step 19150: Generator loss: 55.50546585083008, critic loss: -16.347688998579976
Step 19200: Generator loss: 61.19869083404541, critic loss: -18.950819284915923
Step 19250: Generator loss: 66.31558391571045, critic loss: -12.890463754177098
Step 19300: Generator loss: 57.29240139007568, critic loss: -18.10998232960701
Step 19350: Generator loss: 59.32999900817871, critic loss: -13.577078444600104
Step 19400: Generator loss: 65.96876052856446, critic loss: -11.8188825455904
Step 19450: Generator loss: 56.72755683898926, critic loss: -14.319641982913016
Step 19500: Generator loss: 57.38858169555664, critic loss: -17.450813733339313
Step 19550: Generator loss: 66.02516723632813, critic loss: -10.693548452854154
Step 19600: Generator loss: 54.7833975982666, critic loss: -13.142704640865325
Step 19650: Generator loss: 57.28132354736328, critic loss: -14.967523851156233
Step 19700: Generator loss: 59.98361915588379, critic loss: -16.183865994155408
Step 19750: Generator loss: 58.18478466033935, critic loss: -15.35918751955032
Step 19800: Generator loss: 63.54511661529541, critic loss: -10.502776491999626
Step 19850: Generator loss: 56.24938293457031, critic loss: -6.75664558506012
Step 19900: Generator loss: 60.40652961730957, critic loss: -13.489446130156516
Ended: 2021-04-23 17:33:03.659309
Elapsed: 0:48:26.572738
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-35-08d8bddbdcc4> in <module>
     14             for _ in range(crit_repeats):
     15                 ### Update critic ###
---> 16                 this_loss, fake = update_critic(crit, crit_opt, gen, gen_opt,
     17                                                 cur_batch_size, z_dim, real)
     18                 mean_iteration_critic_loss += this_loss

<ipython-input-33-696fefd91963> in update_critic(critic, critic_optimizer, generator, generator_optimizer, batch_size, z_dim, real)
      7 
      8     epsilon = torch.rand(len(real), 1, 1, 1, device=device, requires_grad=True)
----> 9     gradient = get_gradient(critic, real, fake.detach(), epsilon)
     10     gp = gradient_penalty(gradient)
     11     crit_loss = get_crit_loss(crit_fake_pred, crit_real_pred, gp, c_lambda)

<ipython-input-16-06dea6615386> in get_gradient(crit, real, fake, epsilon)
     19 
     20     # Take the gradient of the scores with respect to the images
---> 21     gradient = torch.autograd.grad(
     22         # Note: You need to take the gradient of outputs with respect to inputs.
     23         #### START CODE HERE ####

~/.conda/envs/neurotic-pytorch/lib/python3.9/site-packages/torch/autograd/__init__.py in grad(outputs, inputs, grad_outputs, retain_graph, create_graph, only_inputs, allow_unused)
    221         retain_graph = create_graph
    222 
--> 223     return Variable._execution_engine.run_backward(
    224         outputs, grad_outputs_, retain_graph, create_graph,
    225         inputs, allow_unused, accumulate_grad=False)

RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 7.92 GiB total capacity; 7.13 GiB already allocated; 24.62 MiB free; 7.13 GiB reserved in total by PyTorch)
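
The run eventually died with a CUDA out-of-memory error. One plausible contributor (an assumption on my part, not something verified by re-running) is that fakes.append(fake) stores tensors that are still on the GPU and still attached to the generator's computation graph, so every saved batch keeps extra memory alive on top of the already memory-hungry gradient-of-the-gradient computation. A cheap mitigation would be to detach the saved images and move them to the CPU:

# sketch of a mitigation (hypothetical fix, not re-run here): keep only the
# pixel values on the CPU so the GPU graph for each saved batch can be freed
fakes.append(fake.detach().cpu())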

End

  • Arjovsky M, Chintala S, Bottou L. Wasserstein generative adversarial networks. International Conference on Machine Learning, 2017, pp. 214-223. PMLR. (arxiv.org)
  • Gulrajani I, Ahmed F, Arjovsky M, Dumoulin V, Courville A. Improved training of Wasserstein GANs. arXiv preprint arXiv:1704.00028, 2017. (arxiv.org)

CNN GAN

Deep Convolutional GAN (DCGAN)

We're going to build a Generative Adversarial Network to generate handwritten digits. Instead of using fully-connected layers, we'll use convolutional layers.

Here are the main features of a DCGAN (a small shape sketch follows the list).

  • Replace any pooling layers with strided convolutions (discriminator) and fractional-strided convolutions (generator).
  • Use BatchNorm in both the generator and the discriminator.
  • Remove fully connected hidden layers for deeper architectures.
  • ReLU activation in generator for all layers except for the output, which uses Tanh.
  • Use LeakyReLU activation in the discriminator for all layers.
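
To make the first point concrete, here is a small shape sketch (illustrative only, not part of the DCGAN code below): a strided convolution downsamples the feature map, while a transposed (fractional-strided) convolution upsamples it.

# sketch: strided conv downsamples, transposed conv upsamples
import torch
from torch import nn

x = torch.randn(1, 1, 28, 28)
down = nn.Conv2d(1, 8, kernel_size=4, stride=2)           # 28x28 -> 13x13
up = nn.ConvTranspose2d(8, 1, kernel_size=4, stride=2)    # 13x13 -> 28x28
print(down(x).shape)       # torch.Size([1, 8, 13, 13])
print(up(down(x)).shape)   # torch.Size([1, 1, 28, 28])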

Imports

# python
from collections import namedtuple
from functools import partial
from pathlib import Path

# conda
from torch import nn
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.datasets import MNIST
from torchvision.utils import make_grid

import holoviews
import hvplot.pandas
import matplotlib.pyplot as pyplot
import pandas
import torch
# my stuff
from graeae import EmbedHoloviews, Timer

Set Up

The Random Seed

torch.manual_seed(0)

Plotting and Timing

TIMER = Timer()
slug = "cnn-gan"

Embed = partial(EmbedHoloviews, folder_path=f"files/posts/gans/{slug}")

Plot = namedtuple("Plot", ["width", "height", "fontscale", "tan", "blue", "red"])
PLOT = Plot(
    width=900,
    height=750,
    fontscale=2,
    tan="#ddb377",
    blue="#4687b7",
    red="#ce7b6d",
 )

Helper Functions

A Plotter

def plot_image(image: torch.Tensor,
                filename: str,
                title: str,
                num_images: int=25,
                size: tuple=(1, 28, 28),
                folder: str=f"files/posts/gans/{slug}/") -> None:
    """Plot the image and save it

    Args:
     image: the tensor with the image to plot
     filename: name for the final image file
     title: title to put on top of the image
     num_images: how many images to put in the composite image
     size: the size for the image
     folder: sub-folder to save the file in
    """
    unflattened_image = image.detach().cpu().view(-1, *size)
    image_grid = make_grid(unflattened_image[:num_images], nrow=5)

    pyplot.title(title)
    pyplot.grid(False)
    pyplot.imshow(image_grid.permute(1, 2, 0).squeeze())

    pyplot.tick_params(bottom=False, top=False, labelbottom=False,
                       right=False, left=False, labelleft=False)
    pyplot.savefig(folder + filename)
    print(f"[[file:{filename}]]")
    return

A Noise Maker

def make_some_noise(n_samples: int, z_dim: int, device: str="cpu") -> torch.Tensor:
    """create noise vectors

    creates 
    Args:
       n_samples: the number of samples to generate, a scalar
       z_dim: the dimension of the noise vector, a scalar
       device: the device type (cpu or cuda)

    Returns:
     tensor with random numbers from the normal distribution.
    """

    return torch.randn(n_samples, z_dim, device=device)

Middle

The Generator

The first component you will make is the generator. You may notice that instead of passing in the image dimension, you will pass the number of image channels to the generator. This is because with DCGAN you use convolutions, which don't depend on the number of pixels in an image. However, the number of channels is important for determining the size of the filters.

You will build a generator using 4 layers (3 hidden layers + 1 output layer). As before, you will need to write a function to create a single block for the generator's neural network. From the paper:

  • "[U]se batchnorm in both the generator and the discriminator."
  • "[U]se ReLU activation in generator for all layers except for the output, which uses Tanh."

Since in DCGAN the activation function will be different for the output layer, you will need to check what layer is being created.

At the end of the generator class, you are given a forward pass function that takes in a noise vector and generates an image of the output dimension using your neural network. You are also given a function to create a noise vector. These functions are the same as the ones from the last assignment.
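
A quick sanity check of my own: with no padding, a ConvTranspose2d produces an output of size (in - 1) * stride + kernel_size, so the kernel sizes and strides used in the class below take the unsqueezed 1 x 1 noise up to the 28 x 28 MNIST size.

def transposed_size(size: int, kernel_size: int, stride: int) -> int:
    """Output size of a ConvTranspose2d with no padding or dilation."""
    return (size - 1) * stride + kernel_size

size = 1  # the unsqueezed noise starts as a 1 x 1 "image"
for kernel_size, stride in [(3, 2), (4, 1), (3, 2), (4, 2)]:
    size = transposed_size(size, kernel_size, stride)
    print(size)  # 3, 6, 13, 28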

The Generator Class

class Generator(nn.Module):
    """The DCGAN Generator

    Args:
       z_dim: the dimension of the noise vector
       im_chan: the number of channels in the images, fitted for the dataset used
             (MNIST is black-and-white, so 1 channel is your default)
       hidden_dim: the inner dimension,
    """
    def __init__(self, z_dim: int=10, im_chan: int=1, hidden_dim: int=64):
        super().__init__()
        self.z_dim = z_dim
        # Build the neural network
        self.gen = nn.Sequential(
            self.make_gen_block(z_dim, hidden_dim * 4),
            self.make_gen_block(hidden_dim * 4, hidden_dim * 2, kernel_size=4, stride=1),
            self.make_gen_block(hidden_dim * 2, hidden_dim),
            self.make_gen_block(hidden_dim, im_chan, kernel_size=4, final_layer=True),
        )

    def make_gen_block(self, input_channels: int, output_channels: int,
                       kernel_size: int=3, stride: int=2,
                       final_layer: bool=False) -> nn.Sequential:
        """Creates a block for the generator (sub sequence)

       The parts
        - a transposed convolution
        - a batchnorm (except for in the last layer)
        - an activation.

       Args:
           input_channels: how many channels the input feature representation has
           output_channels: how many channels the output feature representation should have
           kernel_size: the size of each convolutional filter, equivalent to (kernel_size, kernel_size)
           stride: the stride of the convolution
           final_layer: a boolean, true if it is the final layer and false otherwise 
                     (affects activation and batchnorm)

       Returns:
        the sub-sequence of layers
       """
        if not final_layer:
            return nn.Sequential(
                nn.ConvTranspose2d(input_channels, output_channels, kernel_size, stride),
                nn.BatchNorm2d(output_channels),
                nn.ReLU()
            )
        else: # Final Layer
            return nn.Sequential(
                nn.ConvTranspose2d(input_channels, output_channels, kernel_size, stride),
                nn.Tanh()
            )

    def unsqueeze_noise(self, noise: torch.Tensor) -> torch.Tensor:
        """transforms the noise tensor

       Args:
           noise: a noise tensor with dimensions (n_samples, z_dim)

       Returns:
        copy of noise with width and height = 1 and channels = z_dim.
       """
        return noise.view(len(noise), self.z_dim, 1, 1)

    def forward(self, noise: torch.Tensor) -> torch.Tensor:
        """complete a forward pass of the generator: Given a noise tensor, 

       Args:
        noise: a noise tensor with dimensions (n_samples, z_dim)

       Returns:
        generated images.
       """
        x = self.unsqueeze_noise(noise)
        return self.gen(x)

Setup Testing

gen = Generator()
num_test = 100

# Test the hidden block
test_hidden_noise = make_some_noise(num_test, gen.z_dim)
test_hidden_block = gen.make_gen_block(10, 20, kernel_size=4, stride=1)
test_uns_noise = gen.unsqueeze_noise(test_hidden_noise)
hidden_output = test_hidden_block(test_uns_noise)

# Check that it works with other strides
test_hidden_block_stride = gen.make_gen_block(20, 20, kernel_size=4, stride=2)

test_final_noise = make_some_noise(num_test, gen.z_dim) * 20
test_final_block = gen.make_gen_block(10, 20, final_layer=True)
test_final_uns_noise = gen.unsqueeze_noise(test_final_noise)
final_output = test_final_block(test_final_uns_noise)

# Test the whole thing:
test_gen_noise = make_some_noise(num_test, gen.z_dim)
test_uns_gen_noise = gen.unsqueeze_noise(test_gen_noise)
gen_output = gen(test_uns_gen_noise)

Unit Tests

assert tuple(hidden_output.shape) == (num_test, 20, 4, 4)
assert hidden_output.max() > 1
assert hidden_output.min() == 0
assert hidden_output.std() > 0.2
assert hidden_output.std() < 1
assert hidden_output.std() > 0.5

assert tuple(test_hidden_block_stride(hidden_output).shape) == (num_test, 20, 10, 10)

assert final_output.max().item() == 1
assert final_output.min().item() == -1

assert tuple(gen_output.shape) == (num_test, 1, 28, 28)
assert gen_output.std() > 0.5
assert gen_output.std() < 0.8
print("Success!")

The Discriminator

The second component you need to create is the discriminator.

You will use 3 layers in your discriminator's neural network. As with the generator, you will need a method that creates a single neural network block for the discriminator (a quick shape check follows the excerpt from the paper below).

From the paper:

  • [u]se LeakyReLU activation in the discriminator for all layers.
  • For the LeakyReLUs, "the slope of the leak was set to 0.2" in DCGAN.
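
With the defaults used by make_disc_block in the class below (kernel size 4, stride 2, no padding), the three convolutions collapse a 28 x 28 MNIST image down to a single value per image, which is why forward can reshape the output to (batch_size, 1). Here's a quick sanity check of my own, using the fact that a Conv2d with no padding produces floor((in - kernel_size) / stride) + 1.

def conv_size(size: int, kernel_size: int=4, stride: int=2) -> int:
    """Output size of a Conv2d with no padding or dilation."""
    return (size - kernel_size) // stride + 1

size = 28  # MNIST images are 28 x 28
for _ in range(3):
    size = conv_size(size)
    print(size)  # 13, then 5, then 1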

The Discriminator Class

class Discriminator(nn.Module):
    """The DCGAN Discriminator

    Args:
     im_chan: the number of channels in the images, fitted for the dataset used
             (MNIST is black-and-white, so 1 channel is the default)
     hidden_dim: the inner dimension,
    """
    def __init__(self, im_chan: int=1, hidden_dim: int=16):
        super(Discriminator, self).__init__()
        self.disc = nn.Sequential(
            self.make_disc_block(im_chan, hidden_dim),
            self.make_disc_block(hidden_dim, hidden_dim * 2),
            self.make_disc_block(hidden_dim * 2, 1, final_layer=True),
        )
        return

    def make_disc_block(self, input_channels: int, output_channels: int,
                        kernel_size: int=4, stride: int=2,
                        final_layer: bool=False) -> nn.Sequential:
        """Make a sub-block of layers for the discriminator

        - a convolution
        - a batchnorm (except for in the last layer)
        - an activation.

       Args:
         input_channels: how many channels the input feature representation has
         output_channels: how many channels the output feature representation should have
         kernel_size: the size of each convolutional filter, equivalent to (kernel_size, kernel_size)
         stride: the stride of the convolution
         final_layer: true if this is the final layer, false otherwise
                     (affects activation and batchnorm)
       """        
        # Build the neural block
        if not final_layer:
            return nn.Sequential(
                nn.Conv2d(input_channels, output_channels, kernel_size, stride),
                nn.BatchNorm2d(output_channels),
                nn.LeakyReLU(0.2)
            )
        else: # Final Layer
            return nn.Sequential(
                nn.Conv2d(input_channels, output_channels, kernel_size, stride),
            )

    def forward(self, image: torch.Tensor) -> torch.Tensor:
        """Complete a forward pass of the discriminator

       Args:
         image: an image tensor with dimensions (batch_size, im_chan, height, width)

       Returns:
        a 1-dimension tensor representing fake/real.
       """
        disc_pred = self.disc(image)
        return disc_pred.view(len(disc_pred), -1)

Set Up Testing

num_test = 100

gen = Generator()
disc = Discriminator()
test_images = gen(make_some_noise(num_test, gen.z_dim))

# Test the hidden block
test_hidden_block = disc.make_disc_block(1, 5, kernel_size=6, stride=3)
hidden_output = test_hidden_block(test_images)

# Test the final block
test_final_block = disc.make_disc_block(1, 10, kernel_size=2, stride=5, final_layer=True)
final_output = test_final_block(test_images)

# Test the whole thing:
disc_output = disc(test_images)

Unit Testing

  • The Hidden Block
    assert tuple(hidden_output.shape) == (num_test, 5, 8, 8)
    # Because of the LeakyReLU slope
    assert -hidden_output.min() / hidden_output.max() > 0.15
    assert -hidden_output.min() / hidden_output.max() < 0.25
    assert hidden_output.std() > 0.5
    assert hidden_output.std() < 1
    
  • The Final Block
    assert tuple(final_output.shape) == (num_test, 10, 6, 6)
    assert final_output.max() > 1.0
    assert final_output.min() < -1.0
    assert final_output.std() > 0.3
    assert final_output.std() < 0.6
    
  • The Whole Thing
    assert tuple(disc_output.shape) == (num_test, 1)
    assert disc_output.std() > 0.25
    assert disc_output.std() < 0.5
    print("Success!")
    

Training The Model

Remember that these are your parameters:

  • criterion: the loss function
  • n_epochs: the number of times you iterate through the entire dataset when training
  • z_dim: the dimension of the noise vector
  • display_step: how often to display/visualize the images
  • batch_size: the number of images per forward/backward pass
  • lr: the learning rate
  • beta_1, beta_2: the momentum terms for the Adam optimizer
  • device: the device type

Set Up The Data

criterion = nn.BCEWithLogitsLoss()
z_dim = 64
batch_size = 128
# A learning rate of 0.0002 works well on DCGAN
lr = 0.0002

# These parameters control the optimizer's momentum, which you can read more about here:
# https://distill.pub/2017/momentum/ but you don’t need to worry about it for this course!
beta_1 = 0.5 
beta_2 = 0.999
device = 'cuda'

# You can transform the image values to be between -1 and 1 (the range of the tanh activation)
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),
])

path = Path("~/pytorch-data/MNIST").expanduser()
dataloader = DataLoader(
    MNIST(path, download=True, transform=transform),
    batch_size=batch_size,
    shuffle=True)

Set Up the GAN

gen = Generator(z_dim).to(device)
gen_opt = torch.optim.Adam(gen.parameters(), lr=lr, betas=(beta_1, beta_2))
disc = Discriminator().to(device) 
disc_opt = torch.optim.Adam(disc.parameters(), lr=lr, betas=(beta_1, beta_2))

A Weight Initializer

def initial_weights(m):
    """Initialize the weights to the normal distribution

     - mean 0
     - standard deviation 0.02

    Args:
     m: layer whose weights to initialize
    """
    if isinstance(m, nn.Conv2d) or isinstance(m, nn.ConvTranspose2d):
        torch.nn.init.normal_(m.weight, 0.0, 0.02)
    if isinstance(m, nn.BatchNorm2d):
        torch.nn.init.normal_(m.weight, 0.0, 0.02)
        torch.nn.init.constant_(m.bias, 0)
    return
gen = gen.apply(initial_weights)
disc = disc.apply(initial_weights)

Train it

For each epoch, you will process the entire dataset in batches. For every batch, you will update the discriminator and generator. Then, you can see DCGAN's results!

Here's roughly the progression you should be expecting. On a GPU this takes about 30 seconds per thousand steps. On a CPU, it can take about 8 hours per thousand steps. You might notice that in the image at step 5000, the generator is disproportionately producing things that look like ones. If the discriminator didn't learn to detect this imbalance quickly enough, the generator could just keep producing ones, tricking the discriminator so well that there would be no more improvement, a failure known as mode collapse.

n_epochs = 100
cur_step = 0
display_step = 1000
mean_generator_loss = 0
mean_discriminator_loss = 0
generator_losses = []
discriminator_losses = []
steps = []

best_loss = float("inf")
best_step = 0
best_path = Path("~/models/gans/mnist-dcgan/best_model.pth").expanduser()

with TIMER:
    for epoch in range(n_epochs):
        # Dataloader returns the batches
        for real, _ in dataloader:
            cur_batch_size = len(real)
            real = real.to(device)

            ## Update discriminator ##
            disc_opt.zero_grad()
            fake_noise = make_some_noise(cur_batch_size, z_dim, device=device)
            fake = gen(fake_noise)
            disc_fake_pred = disc(fake.detach())
            disc_fake_loss = criterion(disc_fake_pred, torch.zeros_like(disc_fake_pred))
            disc_real_pred = disc(real)
            disc_real_loss = criterion(disc_real_pred, torch.ones_like(disc_real_pred))
            disc_loss = (disc_fake_loss + disc_real_loss) / 2

            # Keep track of the average discriminator loss
            mean_discriminator_loss += disc_loss.item() / display_step
            # Update gradients
            disc_loss.backward(retain_graph=True)
            # Update optimizer
            disc_opt.step()

            ## Update generator ##
            gen_opt.zero_grad()
            fake_noise_2 = make_some_noise(cur_batch_size, z_dim, device=device)
            fake_2 = gen(fake_noise_2)
            disc_fake_pred = disc(fake_2)
            gen_loss = criterion(disc_fake_pred, torch.ones_like(disc_fake_pred))
            gen_loss.backward()
            gen_opt.step()

            # Keep track of the average generator loss
            mean_generator_loss += gen_loss.item() / display_step
            if mean_generator_loss < best_loss:
                best_loss, best_step = mean_generator_loss, cur_step
                with best_path.open("wb") as writer:
                    torch.save(gen, writer)
            ## Visualization code ##
            if cur_step % display_step == 0 and cur_step > 0:
                print(f"Epoch {epoch}, step {cur_step}: Generator loss:"
                        f" {mean_generator_loss}, discriminator loss:"
                        f" {mean_discriminator_loss}")

                steps.append(cur_step)
                generator_losses.append(mean_generator_loss)
                discriminator_losses.append(mean_discriminator_loss)

                mean_generator_loss = 0
                mean_discriminator_loss = 0
            cur_step += 1
Started: 2021-04-21 12:45:12.452739
Epoch 2, step 1000: Generator loss: 1.2671969079673289, discriminator loss: 0.43014343224465823
Epoch 4, step 2000: Generator loss: 1.1353899730443968, discriminator loss: 0.5306872705817226
Epoch 6, step 3000: Generator loss: 0.8764803466945883, discriminator loss: 0.611450107574464
Epoch 8, step 4000: Generator loss: 0.7747784045338618, discriminator loss: 0.6631499938964849
Epoch 10, step 5000: Generator loss: 0.7640163034200661, discriminator loss: 0.6734729865789411
Epoch 12, step 6000: Generator loss: 0.7452541967928404, discriminator loss: 0.6805261079072958
Epoch 14, step 7000: Generator loss: 0.7337032879889016, discriminator loss: 0.6874966211915009
Epoch 17, step 8000: Generator loss: 0.7245009585618979, discriminator loss: 0.6908933531045917
Epoch 19, step 9000: Generator loss: 0.7180560626983646, discriminator loss: 0.6936621717810626
Epoch 21, step 10000: Generator loss: 0.7115822317004211, discriminator loss: 0.695760274052621
Epoch 23, step 11000: Generator loss: 0.7090291924774644, discriminator loss: 0.6962701203227039
Epoch 25, step 12000: Generator loss: 0.7059894913136957, discriminator loss: 0.6973492541313167
Epoch 27, step 13000: Generator loss: 0.7030480077862743, discriminator loss: 0.6978999735713001
Epoch 29, step 14000: Generator loss: 0.7028095332086096, discriminator loss: 0.6974007876515396
Epoch 31, step 15000: Generator loss: 0.7027116653919212, discriminator loss: 0.6965595571994787
Epoch 34, step 16000: Generator loss: 0.7005282629728309, discriminator loss: 0.6962912415862079
Epoch 36, step 17000: Generator loss: 0.7007142878770828, discriminator loss: 0.6961965024471283
Epoch 38, step 18000: Generator loss: 0.699474583208561, discriminator loss: 0.6952810400128371
Epoch 40, step 19000: Generator loss: 0.6989677719473828, discriminator loss: 0.6954642050266268
Epoch 42, step 20000: Generator loss: 0.6977452509403238, discriminator loss: 0.695180906951427
Epoch 44, step 21000: Generator loss: 0.6973587237596515, discriminator loss: 0.6950308464765543
Epoch 46, step 22000: Generator loss: 0.6960379970669743, discriminator loss: 0.6949119175076485
Epoch 49, step 23000: Generator loss: 0.6957966268062581, discriminator loss: 0.6948324624896048
Epoch 51, step 24000: Generator loss: 0.6958502059578898, discriminator loss: 0.6945331234931943
Epoch 53, step 25000: Generator loss: 0.6954856168627734, discriminator loss: 0.6943869084119801
Epoch 55, step 26000: Generator loss: 0.6957543395757682, discriminator loss: 0.694317172288894
Epoch 57, step 27000: Generator loss: 0.6947923063635825, discriminator loss: 0.694082073867321
Epoch 59, step 28000: Generator loss: 0.6945026598572728, discriminator loss: 0.6939926172494871
Epoch 61, step 29000: Generator loss: 0.6947789136767392, discriminator loss: 0.6938506522774704
Epoch 63, step 30000: Generator loss: 0.6946699734926227, discriminator loss: 0.6937169924378406
Epoch 66, step 31000: Generator loss: 0.6944284628629694, discriminator loss: 0.6936815274357805
Epoch 68, step 32000: Generator loss: 0.6940396347641948, discriminator loss: 0.6935891906023032
Epoch 70, step 33000: Generator loss: 0.6946771386265761, discriminator loss: 0.6937210547327995
Epoch 72, step 34000: Generator loss: 0.693429798424244, discriminator loss: 0.6937174627780922
Epoch 74, step 35000: Generator loss: 0.6937471128702157, discriminator loss: 0.6935204346776015
Epoch 76, step 36000: Generator loss: 0.6938841561675072, discriminator loss: 0.6934832554459566
Epoch 78, step 37000: Generator loss: 0.6934520475268362, discriminator loss: 0.6934578058719627
Epoch 81, step 38000: Generator loss: 0.6936635475754732, discriminator loss: 0.6934186050295835
Epoch 83, step 39000: Generator loss: 0.6936795052289972, discriminator loss: 0.6935187472105031
Epoch 85, step 40000: Generator loss: 0.6933113215565679, discriminator loss: 0.6933534587025645
Epoch 87, step 41000: Generator loss: 0.6934976277351385, discriminator loss: 0.6933284662365923
Epoch 89, step 42000: Generator loss: 0.6933313971757892, discriminator loss: 0.693348657488824
Epoch 91, step 43000: Generator loss: 0.6937436528205883, discriminator loss: 0.6933502901792529
Epoch 93, step 44000: Generator loss: 0.6943431540131578, discriminator loss: 0.6933887023925772
Epoch 95, step 45000: Generator loss: 0.6938722513914105, discriminator loss: 0.6932663491368296
Epoch 98, step 46000: Generator loss: 0.6933276618123067, discriminator loss: 0.6934270900487906
Ended: 2021-04-21 13:06:00.256725
Elapsed: 0:20:47.803986

Looking at the Final model.

fake_noise = make_some_noise(cur_batch_size, z_dim, device=device)

best_model = torch.load(best_path)
fake = best_model(fake_noise)
plot_image(image=fake, filename="fake_digits.png", title="Fake Digits")

fake_digits.png

plot_image(real, filename="real_digits.png", title="Real Digits")

real_digits.png

plotting = pandas.DataFrame.from_dict({
    "Step": steps,
    "Generator Loss": generator_losses,
    "Discriminator Loss": discriminator_losses
})

best = plotting.iloc[plotting["Generator Loss"].argmin()]
best_line = holoviews.VLine(best.Step)
gen_plot = plotting.hvplot(x="Step", y="Generator Loss", color=PLOT.blue)
disc_plot = plotting.hvplot(x="Step", y="Discriminator Loss", color=PLOT.red)

plot = (gen_plot * disc_plot * best_line).opts(title="Training Losses",
                                               height=PLOT.height,
                                               width=PLOT.width,
                                               ylabel="Loss",
                                               fontscale=PLOT.fontscale)
output = Embed(plot=plot, file_name="losses")()
print(output)

Figure Missing

End

Sources

  • Radford A, Metz L, Chintala S. Unsupervised representation learning with deep convolutional generative adversarial networks. arXiv preprint arXiv:1511.06434. 2015 Nov 19. (PDF)

PyTorch Linear Regression

Beginning

Imports

# python
from collections import namedtuple
from functools import partial

# pypi
from torch import nn
from torch.utils.data import Dataset, DataLoader

import hvplot.pandas
import numpy
import pandas
import torch

# local stuff
from graeae import EmbedHoloviews

Set Up

random_generator = numpy.random.default_rng(seed=2021)
slug = "pytorch-linear-regression"
Embed = partial(EmbedHoloviews, folder_path=f"files/posts/pytorch/{slug}")

Plot = namedtuple("Plot", ["width", "height", "fontscale", "tan", "blue", "red"])
PLOT = Plot(
    width=900,
    height=750,
    fontscale=2,
    tan="#ddb377",
    blue="#4687b7",
    red="#ce7b6d",
 )
def sample(start: float, stop: float, shape: tuple, uniform: bool=True) -> numpy.ndarray:
    """Create a random sample

    Args:
     start: lowest allowed value
     stop: highest allowed value
     shape: shape for the final array (just an int for single values)
     uniform: use the uniform distribution instead of the standard normal
    """
    if uniform:
        return (stop - start) * random_generator.random(shape) + start
    return (stop - start) * random_generator.standard_normal(shape) + start

Middle

SAMPLES = 200
X_RANGE = 5
x_values = sample(-X_RANGE, X_RANGE, SAMPLES)
SLOPE = sample(-5, 5, 1)
INTERCEPT = sample(-5, 5, 1)
noise = sample(-2, 2, SAMPLES, uniform=False)
y_values = SLOPE * x_values + INTERCEPT + noise
data_frame = pandas.DataFrame.from_dict(dict(X=x_values, Y=y_values))
first, last = x_values.min(), x_values.max()
line_frame = pandas.DataFrame.from_dict(
    dict(X=[first, last],
         Y=[SLOPE * first + INTERCEPT,
            SLOPE * last + INTERCEPT]))
line_plot = line_frame.hvplot(x="X", y="Y", color=PLOT.blue)
data_plot = data_frame.hvplot.scatter(x="X", y="Y", title="Sample Data",
                                      color=PLOT.tan)
plot = (data_plot * line_plot).opts(
    height=PLOT.height,
    width=PLOT.width,
    fontscale=PLOT.fontscale
)
output = Embed(plot=plot, file_name="sample_data")()

Figure Missing

class XY(Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y
        return

    def __len__(self):
        return len(self.x)

    def __getitem__(self, index):
        if torch.is_tensor(index):
            index = index.tolist()

        return {"x": self.x[index], "y": self.y[index]}
dataset = XY(x_values, y_values)
loader = DataLoader(dataset, batch_size=4)
model = nn.Linear(1, 1)

MNIST GAN

Beginning

Imports

# python
from collections import namedtuple
from functools import partial
from pathlib import Path

# from pypi
from torch import nn
from torchvision import transforms
from torchvision.datasets import MNIST
from torchvision.utils import make_grid
from torch.utils.data import DataLoader

import hvplot.pandas
import matplotlib.pyplot as pyplot
import pandas
import torch

# local code
from graeae import EmbedHoloviews, Timer

Some Setup

First we'll set the manual seed to make this reproducible.

torch.manual_seed(0)

This is a convenience object to time the training.

TIMER = Timer()

This is for plotting.

slug = "mnist-gan"

Embed = partial(EmbedHoloviews, folder_path=f"files/posts/gans/{slug}")

Plot = namedtuple("Plot", ["width", "height", "fontscale",
                           "tan", "blue", "red", "sizing_mode"],
                  defaults=[
                      900,
                      556,
                      2,
                      "#ddb377",
                      "#4687b7",
                      "#ce7b6d",
                      "scale_both",
                  ])()

GANParts = namedtuple("GANParts", ["generator", "generator_optimizer",
                                   "discriminator", "discriminator_optimizer"])

PlotData = namedtuple("PlotData", ["steps", "generator_losses", "discriminator_losses"])

Middle

The MNIST Dataset

The training images we will be using are from a dataset called MNIST. The dataset contains 60,000 images of handwritten digits, from 0 to 9.

The images are 28 pixels x 28 pixels in size. The small size of its images makes MNIST ideal for simple training. Additionally, these images are black-and-white, so only one dimension, or "color channel", is needed to represent them. PyTorch has a version ready-made for its system, so we'll use theirs.
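
As a quick sanity check (a sketch of my own; the download path here is just a placeholder, not the one used later in this post), each sample really is a single-channel 28 x 28 tensor.

check_set = MNIST(root="/tmp/mnist-check", download=True,
                  transform=transforms.ToTensor())
image, label = check_set[0]
print(len(check_set))  # 60000 training images
print(image.shape)     # torch.Size([1, 28, 28]) - one channel, 28 x 28 pixels
print(label)           # 5 (the digit in the first training image)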

The Generator

The first step is to build the generator component.

We'll start by creating a function to make a single layer/block for the generator's neural network. Each block should include a linear transformation (\(y=xA^T + b\)) to the input to another shape, batch normalization for stabilization, and finally a non-linear activation function (ReLU in this case).

def generator_block(input_features: int, output_features: int) -> nn.Sequential:
    """
    Creates a block of the generator's neural network

    Args:
      input_features: the dimension of the input vector
      output_features: the dimension of the output vector

    Returns:
       a generator neural network layer, with a linear transformation 
         followed by a batch normalization and then a relu activation
    """
    return nn.Sequential(
        nn.Linear(input_features, output_features),
        nn.BatchNorm1d(output_features),
        nn.ReLU(inplace=True),
    )

Verify the generator block function

def test_gen_block(in_features: int, out_features: int,
                   test_rows: int=1000) -> None:
    """Test the generator block creator

    Args:
     in_features: number of features for the block input
     out_features: the final number of features for it to output
     test_rows: how many rows to put in the test Tensor

    Raises:
     AssertionError: something isn't right
    """
    block = generator_block(in_features, out_features)

    # Check the three parts
    assert len(block) == 3
    assert type(block[0]) == nn.Linear
    assert type(block[1]) == nn.BatchNorm1d
    assert type(block[2]) == nn.ReLU

    # Check the output shape
    test_output = block(torch.randn(test_rows, in_features))
    assert tuple(test_output.shape) == (test_rows, out_features)

    # check the normalization
    assert 0.65 > test_output.std() > 0.55
    return

test_gen_block(25, 12)
test_gen_block(15, 28)

Building the Generator Class

Now that we have the block-builder we can define our Generator network. It's going to contain a sequence of blocks created by our block-building function, followed by two final layers: a linear transformation again, but without normalization, and a Sigmoid Function instead of the ReLU. Each block's output will be double the size of the previous one.

generator.png
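
To spell out what "double the size of the previous one" means with the defaults used below, here's a quick sketch of my own tracing the widths, ending with the final linear layer that maps back down to the 784 image pixels.

input_dimension, hidden_dimension, image_dimension = 10, 128, 784  # the defaults below
widths = ([input_dimension]
          + [hidden_dimension * 2 ** power for power in range(4)]
          + [image_dimension])
print(" -> ".join(str(width) for width in widths))  # 10 -> 128 -> 256 -> 512 -> 1024 -> 784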

class Generator(nn.Module):
    """Generator Class

    Args:
      input_dimension: the dimension of the noise vector
      image_dimension: the dimension of the images, fitted for the dataset used
        (MNIST images are 28 x 28 = 784 so that is the default)
      hidden_dimension: the initial hidden-layer dimension
    """
    def __init__(self, input_dimension: int=10, image_dimension: int=784,
                 hidden_dimension: int=128):
        super().__init__()

        self.generator = nn.Sequential(
            generator_block(input_dimension, hidden_dimension),
            generator_block(hidden_dimension, hidden_dimension * 2),
            generator_block(hidden_dimension * 2, hidden_dimension * 4),
            generator_block(hidden_dimension * 4, hidden_dimension * 8),
            nn.Linear(hidden_dimension * 8, image_dimension),
            nn.Sigmoid()
        )
        return

    def forward(self, noise: torch.Tensor) -> torch.Tensor:
        """
       Method for a forward pass of the generator

       Args:
        noise: a noise tensor with dimensions (n_samples, z_dim)

       Returns: 
        generated images.
       """
        return self.generator(noise)

Verify the Generator Class

def test_generator(z_dim: int, im_dim: int, hidden_dim: int, 
                   num_test: int=10000) -> None:
    """Test the Generator Class

    Args:
     z_dim: the size of the input
     im_dim: the size of the image
     hidden_dim: the size of the initial hidden layer

    Raises:
     AssertionError: something is wrong
    """
    gen = Generator(z_dim, im_dim, hidden_dim).generator

    # Check there are six modules in the sequential part
    assert len(gen) == 6
    test_input = torch.randn(num_test, z_dim)
    test_output = gen(test_input)

    # Check that the output shape is correct
    assert tuple(test_output.shape) == (num_test, im_dim)

    # Check the output
    assert 0 < test_output.max() < 1, "Make sure to use a sigmoid"
    assert test_output.min() < 0.5, "Don't use a block in your solution"
    assert 0.15 > test_output.std() > 0.05, "Don't use batchnorm here"
    return

test_generator(5, 10, 20)
test_generator(20, 8, 24)

Noise

To be able to use the generator, we will need to be able to create noise vectors. The noise vector z has the important role of making sure the images generated from the same class don't all look the same – think of it as a random seed. You will generate it randomly using PyTorch by sampling random numbers from the normal distribution. Since multiple images will be processed per pass, you will generate all the noise vectors at once.

Note that whenever you create a new tensor using torch.ones, torch.zeros, or torch.randn, you either need to create it on the target device, e.g. torch.ones(3, 3, device=device), or move it onto the target device using torch.ones(3, 3).to(device). You do not need to do this if you're creating a tensor by manipulating another tensor or by using a variation that defaults the device to the input, such as torch.ones_like. In general, use torch.ones_like and torch.zeros_like instead of torch.ones or torch.zeros where possible.
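
A small sketch of that point (mine, not from the original notebook): either create the tensor on the target device directly or derive it from a tensor that's already there.

device = "cuda" if torch.cuda.is_available() else "cpu"

noise = torch.randn(4, 64, device=device)  # created on the target device
labels = torch.ones_like(noise)            # inherits noise's device (and dtype)
print(noise.device, labels.device)         # both report the same device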

get_noise = partial(torch.randn, device="cuda")
# def get_noise(n_samples: int, z_dim: int, device='cuda') -> torch.Tensor:
#     """create noise vectors
# 
#     Args:
#         n_samples: the number of samples to generate, a scalar
#         z_dim: the dimension of the noise vector, a scalar
#         device: the device type
#     """
#     return torch.randn(n_samples, z_dim, device=device)

Verify the noise vector function

def test_get_noise(n_samples, z_dim, device='cpu'):
    noise = get_noise(n_samples, z_dim, device=device)

    # Make sure a normal distribution was used
    assert tuple(noise.shape) == (n_samples, z_dim)
    assert torch.abs(noise.std() - torch.tensor(1.0)) < 0.01
    assert str(noise.device).startswith(device)

test_get_noise(1000, 32)

The Discriminator

The second component that you need to construct is the discriminator. As with the generator component, you will start by creating a function that builds a neural network block for the discriminator.

Note: You use leaky ReLUs to prevent the "dying ReLU" problem, which refers to the phenomenon where the parameters stop changing due to consistently negative values passed to a ReLU, which result in a zero gradient.
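
To see what the leak buys you, here's a small sketch of my own comparing the gradient that ReLU and LeakyReLU pass back for a negative input.

x = torch.tensor([-2.0], requires_grad=True)
nn.ReLU()(x).sum().backward()
print(x.grad)  # tensor([0.]) - the gradient dies for negative inputs

x = torch.tensor([-2.0], requires_grad=True)
nn.LeakyReLU(0.2)(x).sum().backward()
print(x.grad)  # tensor([0.2000]) - the leak keeps some gradient flowing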

def get_discriminator_block(input_dim: int, output_dim: int,
                            negative_slope: float=0.2) -> nn.Sequential:
    """Create the Discriminator block

    Args:
      input_dim: the dimension of the input vector, a scalar
      output_dim: the dimension of the output vector, a scalar
      negative_slope: the slope used for negative inputs

    Returns:
       a discriminator neural network layer, with a linear transformation 
         followed by an nn.LeakyReLU activation with negative slope of 0.2 
    """
    return nn.Sequential(
        nn.Linear(input_dim, output_dim),
        nn.LeakyReLU(negative_slope=negative_slope)
    )

Verify the discriminator block function

def test_disc_block(in_features, out_features, num_test=10000):
    block = get_discriminator_block(in_features, out_features)

    # Check there are two parts
    assert len(block) == 2
    test_input = torch.randn(num_test, in_features)
    test_output = block(test_input)

    # Check that the shape is right
    assert tuple(test_output.shape) == (num_test, out_features)

    # Check that the LeakyReLU slope is about 0.2
    assert -test_output.min() / test_output.max() > 0.1
    assert -test_output.min() / test_output.max() < 0.3
    assert test_output.std() > 0.3
    assert test_output.std() < 0.5

test_disc_block(25, 12)
test_disc_block(15, 28)

The Discriminator Class

The discriminator class holds 2 values:

  • The image dimension
  • The hidden dimension

The discriminator will build a neural network with 4 layers. It will start with the image tensor and transform it until it returns a single number (a 1-dimensional tensor) as output. This output classifies whether an image is fake or real. Note that you do not need a sigmoid after the output layer since it is included in the loss function. Finally, to use your discriminator's neural network you are given a forward pass function that takes in an image tensor to be classified.
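
Here's a quick check of that last claim (a sketch of my own): BCEWithLogitsLoss applies the sigmoid internally, so feeding it raw logits matches applying the sigmoid yourself and using BCELoss.

logits = torch.randn(5, 1)
targets = torch.ones(5, 1)

with_logits = nn.BCEWithLogitsLoss()(logits, targets)
by_hand = nn.BCELoss()(torch.sigmoid(logits), targets)
print(torch.isclose(with_logits, by_hand))  # tensor(True)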

class Discriminator(nn.Module):
    """The Discriminator Class

    Args:
       im_dim: the dimension of the images, fitted for the dataset used, a scalar
           (MNIST images are 28x28 = 784 so that is your default)
       hidden_dim: the inner dimension, a scalar
    """
    def __init__(self, im_dim: int=784, hidden_dim: int=128):
        super().__init__()
        self.disc = nn.Sequential(
            get_discriminator_block(im_dim, hidden_dim * 4),
            get_discriminator_block(hidden_dim * 4, hidden_dim * 2),
            get_discriminator_block(hidden_dim * 2, hidden_dim),
            nn.Linear(hidden_dim, 1)
        )

    def forward(self, image: torch.Tensor) -> torch.Tensor:
        """forward pass of the discriminator

       Args:
           image: a flattened image tensor with dimension (im_dim)

       Returns a 1-dimension tensor representing fake/real.
       """
        return self.disc(image)
  • Verify the discriminator class
    def test_discriminator(z_dim, hidden_dim, num_test=100):
    
        disc = Discriminator(z_dim, hidden_dim).disc
    
        # Check there are four parts
        assert len(disc) == 4
    
        # Check the linear layer is correct
        test_input = torch.randn(num_test, z_dim)
        test_output = disc(test_input)
        assert tuple(test_output.shape) == (num_test, 1)
    
        # Don't use a block
        assert not isinstance(disc[-1], nn.Sequential)
    
    test_discriminator(5, 10)
    test_discriminator(20, 8)
    

Training

First, you will set your parameters:

  • criterion: the loss function (BCEWithLogitsLoss)
  • n_epochs: the number of times you iterate through the entire dataset when training
  • z_dim: the dimension of the noise vector
  • display_step: how often to display/visualize the images
  • batch_size: the number of images per forward/backward pass
  • lr: the learning rate
  • device: the device type, here using a GPU (which runs CUDA), not CPU

Next, you will load the MNIST dataset as tensors using a dataloader.

Set your parameters

criterion = nn.BCEWithLogitsLoss()
z_dim = 64
batch_size = 128
lr = 0.00001

Load MNIST dataset as tensors

data_path = Path("~/data/datasets/pytorch/").expanduser()
dataloader = DataLoader(
    MNIST(root=data_path, download=True, transform=transforms.ToTensor()),
    batch_size=batch_size,
    shuffle=True)

Now, you can initialize your generator, discriminator, and optimizers. Note that each optimizer only takes the parameters of one particular model, since we want each optimizer to optimize only one of the models.

def build_parts(z_dim: int=z_dim, learning_rate: float=lr) -> GANParts:
    device = "cuda"
    generator = Generator(z_dim).to(device)
    gen_optimizer = torch.optim.Adam(generator.parameters(), lr=learning_rate)
    discriminator = Discriminator().to(device) 
    disc_optimizer = torch.optim.Adam(discriminator.parameters(), lr=learning_rate)
    return GANParts(generator, gen_optimizer, discriminator, disc_optimizer)

gen, gen_opt, disc, disc_opt = build_parts()

This next bit is from https://stackoverflow.com/questions/48152674/how-to-check-if-pytorch-is-using-the-gpu.

def check_gpu():
    assert torch.cuda.is_available()
    current_device = torch.cuda.current_device()
    print(current_device)
    print(torch.cuda.device_count())
    print(torch.cuda.get_device_name(current_device))
    print('Memory Usage:')
    print(f"Allocated: {torch.cuda.memory_allocated(current_device)/1024**2:,}"
          " MB")
    print(f"Cached:   {torch.cuda.memory_reserved(0)/1024**2:,} MB")
    return

check_gpu()
0
1
NVIDIA GeForce GTX 1070 Ti
Memory Usage:
Allocated: 7.92138671875 MB
Cached:   22.0 MB

Before you train your GAN, you will need to create functions to calculate the discriminator's loss and the generator's loss. This is how the discriminator and generator will know how they are doing and improve themselves. Since the generator is needed when calculating the discriminator's loss, you will need to call .detach() on the generator result to ensure that only the discriminator is updated!

Remember that you have already defined a loss function earlier (criterion) and you are encouraged to use torch.ones_like and torch.zeros_like instead of torch.ones or torch.zeros. If you use torch.ones or torch.zeros, you'll need to pass device=device to them.

def get_disc_loss(gen: Generator, disc: Discriminator,
                  criterion: nn.BCEWithLogitsLoss,
                  real: torch.Tensor,
                  num_images: int, z_dim: int, 
                  device: str="cuda"):
    """
    Get the loss of the discriminator given inputs.

    Args:
       gen: the generator model, which returns an image given z-dimensional noise
       disc: the discriminator model, which returns a single-dimensional prediction of real/fake
       criterion: the loss function, which should be used to compare 
              the discriminator's predictions to the ground truth reality of the images 
              (e.g. fake = 0, real = 1)
       real: a batch of real images
       num_images: the number of images the generator should produce, 
               which is also the length of the real images
       z_dim: the dimension of the noise vector, a scalar
       device: the device type

    Returns:
       disc_loss: a torch scalar loss value for the current batch
    """
    noise = torch.randn(num_images, z_dim, device=device)
    fakes = gen(noise).detach()

    fake_prediction = disc(fakes)
    fake_loss = criterion(fake_prediction, torch.zeros_like(fake_prediction))

    real_prediction = disc(real)
    real_loss = criterion(real_prediction, torch.ones_like(real_prediction))
    disc_loss = (fake_loss + real_loss)/2
    return disc_loss
def test_disc_reasonable(num_images=10):
    # Don't use explicit casts to cuda - use the device argument
    import inspect, re
    lines = inspect.getsource(get_disc_loss)
    assert (re.search(r"to\(.cuda.\)", lines)) is None
    assert (re.search(r"\.cuda\(\)", lines)) is None

    z_dim = 64
    gen = torch.zeros_like
    disc = lambda x: x.mean(1)[:, None]
    criterion = torch.mul # Multiply
    real = torch.ones(num_images, z_dim)
    disc_loss = get_disc_loss(gen, disc, criterion, real, num_images, z_dim, 'cpu')
    assert torch.all(torch.abs(disc_loss.mean() - 0.5) < 1e-5)

    gen = torch.ones_like
    criterion = torch.mul # Multiply
    real = torch.zeros(num_images, z_dim)
    assert torch.all(torch.abs(get_disc_loss(gen, disc, criterion, real, num_images, z_dim, 'cpu')) < 1e-5)

    gen = lambda x: torch.ones(num_images, 10)
    disc = lambda x: x.mean(1)[:, None] + 10
    criterion = torch.mul # Multiply
    real = torch.zeros(num_images, 10)
    assert torch.all(torch.abs(get_disc_loss(gen, disc, criterion, real, num_images, z_dim, 'cpu').mean() - 5) < 1e-5)

    gen = torch.ones_like
    disc = nn.Linear(64, 1, bias=False)
    real = torch.ones(num_images, 64) * 0.5
    disc.weight.data = torch.ones_like(disc.weight.data) * 0.5
    disc_opt = torch.optim.Adam(disc.parameters(), lr=lr)
    criterion = lambda x, y: torch.sum(x) + torch.sum(y)
    disc_loss = get_disc_loss(gen, disc, criterion, real, num_images, z_dim, 'cpu').mean()
    disc_loss.backward()
    assert torch.isclose(torch.abs(disc.weight.grad.mean() - 11.25), torch.tensor(3.75))
    return

test_disc_reasonable()
device = "cuda"
def test_disc_loss(max_tests = 10):
    z_dim = 64
    gen = Generator(z_dim).to(device)
    gen_opt = torch.optim.Adam(gen.parameters(), lr=lr)
    disc = Discriminator().to(device) 
    disc_opt = torch.optim.Adam(disc.parameters(), lr=lr)
    num_steps = 0
    for real, _ in dataloader:
        cur_batch_size = len(real)
        real = real.view(cur_batch_size, -1).to(device)

        ### Update discriminator ###
        # Zero out the gradient before backpropagation
        disc_opt.zero_grad()

        # Calculate discriminator loss
        disc_loss = get_disc_loss(gen, disc, criterion, real, cur_batch_size, z_dim, device)
        assert (disc_loss - 0.68).abs() < 0.05, disc_loss

        # Update gradients
        disc_loss.backward(retain_graph=True)

        # Check that they detached correctly
        assert gen.generator[0][0].weight.grad is None

        # Update optimizer
        old_weight = disc.disc[0][0].weight.data.clone()
        disc_opt.step()
        new_weight = disc.disc[0][0].weight.data

        # Check that some discriminator weights changed
        assert not torch.all(torch.eq(old_weight, new_weight))
        num_steps += 1
        if num_steps >= max_tests:
            break

test_disc_loss()

Generator Loss

def get_gen_loss(gen: Generator,
                 disc: Discriminator,
                 criterion: nn.BCEWithLogitsLoss,
                 num_images: int,
                 z_dim: int, device: str="cuda") -> torch.Tensor:
    """Calculates the loss for the generator

    Args:
       gen: the generator model, which returns an image given z-dimensional noise
       disc: the discriminator model, which returns a single-dimensional prediction of real/fake
       criterion: the loss function, which should be used to compare 
              the discriminator's predictions to the ground truth reality of the images 
              (e.g. fake = 0, real = 1)
       num_images: the number of images the generator should produce, 
               which is also the length of the real images
       z_dim: the dimension of the noise vector, a scalar
       device: the device type
    Returns:
       gen_loss: a torch scalar loss value for the current batch
    """
    noise = torch.randn(num_images, z_dim, device=device)
    fakes = gen(noise)
    fake_prediction = disc(fakes)
    gen_loss = criterion(fake_prediction, torch.ones_like(fake_prediction))
    return gen_loss
def test_gen_reasonable(num_images=10):
    # Don't use explicit casts to cuda - use the device argument
    import inspect, re
    lines = inspect.getsource(get_gen_loss)
    assert (re.search(r"to\(.cuda.\)", lines)) is None
    assert (re.search(r"\.cuda\(\)", lines)) is None

    z_dim = 64
    gen = torch.zeros_like
    disc = nn.Identity()
    criterion = torch.mul # Multiply
    gen_loss_tensor = get_gen_loss(gen, disc, criterion, num_images, z_dim, 'cpu')
    assert torch.all(torch.abs(gen_loss_tensor) < 1e-5)
    #Verify shape. Related to gen_noise parametrization
    assert tuple(gen_loss_tensor.shape) == (num_images, z_dim)

    gen = torch.ones_like
    disc = nn.Identity()
    criterion = torch.mul # Multiply
    real = torch.zeros(num_images, 1)
    gen_loss_tensor = get_gen_loss(gen, disc, criterion, num_images, z_dim, 'cpu')
    assert torch.all(torch.abs(gen_loss_tensor - 1) < 1e-5)
    #Verify shape. Related to gen_noise parametrization
    assert tuple(gen_loss_tensor.shape) == (num_images, z_dim)
    return
test_gen_reasonable(10)
def test_gen_loss(num_images):
    z_dim = 64
    gen = Generator(z_dim).to(device)
    gen_opt = torch.optim.Adam(gen.parameters(), lr=lr)
    disc = Discriminator().to(device) 
    disc_opt = torch.optim.Adam(disc.parameters(), lr=lr)

    gen_loss = get_gen_loss(gen, disc, criterion, num_images, z_dim, device)

    # Check that the loss is reasonable
    assert (gen_loss - 0.7).abs() < 0.1
    gen_loss.backward()
    old_weight = gen.generator[0][0].weight.clone()
    gen_opt.step()
    new_weight = gen.generator[0][0].weight
    assert not torch.all(torch.eq(old_weight, new_weight))
test_gen_loss(18)

All Together

For each epoch, you will process the entire dataset in batches. For every batch, you will need to update the discriminator and generator using their loss. Batches are sets of images that are predicted on before the loss functions are calculated (instead of calculating the loss after each image). Note that you may see a loss greater than 1; this is okay, since binary cross entropy loss can be any positive number for a sufficiently confident wrong guess.

It’s also often the case that the discriminator will outperform the generator, especially at the start, because its job is easier. It's important that neither one gets too good (that is, near-perfect accuracy), which would cause the entire model to stop learning. Balancing the two models is actually remarkably hard to do in a standard GAN and something you will see more of in later lectures and assignments.

After you've submitted a working version with the original architecture, feel free to play around with the architecture if you want to see how different architectural choices can lead to better or worse GANs. For example, consider changing the size of the hidden dimension, or making the networks shallower or deeper by changing the number of layers.

def train(epochs: int=2000, gen: Generator=gen,
          gen_opt: torch.optim.Adam=gen_opt,
          disc: Discriminator=disc,
          disc_opt: torch.optim.Adam=disc_opt,
          start_step: int=0) -> PlotData:
    cur_step = start_step
    mean_generator_loss = 0
    mean_discriminator_loss = 0
    test_generator = True # Whether the generator should be tested
    gen_loss = False
    error = False
    display_step = 4100
    generator_losses = []
    discriminator_losses = []
    steps = []

    with TIMER:
        for epoch in range(epochs):

            # Dataloader returns the batches
            for real, _ in dataloader:
                cur_batch_size = len(real)

                # Flatten the batch of real images from the dataset
                real = real.view(cur_batch_size, -1).to(device)

                ### Update discriminator ###
                # Zero out the gradients before backpropagation
                disc_opt.zero_grad()

                # Calculate discriminator loss
                disc_loss = get_disc_loss(gen, disc, criterion, real, cur_batch_size, z_dim, device)

                # Update gradients
                disc_loss.backward(retain_graph=True)

                # Update optimizer
                disc_opt.step()

                # For testing purposes, to keep track of the generator weights
                if test_generator:
                    old_generator_weights = gen.generator[0][0].weight.detach().clone()

                ### Update generator ###
                gen_opt.zero_grad()
                gen_loss = get_gen_loss(gen, disc, criterion, cur_batch_size, z_dim, device)
                gen_loss.backward(retain_graph=True)
                gen_opt.step()

                # For testing purposes, to check that your code changes the generator weights
                if test_generator:
                    try:
                        assert lr > 0.0000002 or (gen.generator[0][0].weight.grad.abs().max() < 0.0005 and epoch == 0)
                        assert torch.any(gen.generator[0][0].weight.detach().clone() != old_generator_weights)
                    except:
                        error = True
                        print("Runtime tests have failed")

                # Keep track of the average discriminator loss
                mean_discriminator_loss += disc_loss.item() / display_step

                # Keep track of the average generator loss
                mean_generator_loss += gen_loss.item() / display_step

                if cur_step % display_step == 0 and cur_step > 0:
                    print(f"Epoch {epoch}, step {cur_step}: Generator loss:"
                            f" {mean_generator_loss}, discriminator loss:"
                            f" {mean_discriminator_loss}")
                    steps.append(cur_step)
                    generator_losses.append(mean_generator_loss)
                    discriminator_losses.append(mean_discriminator_loss)

                    mean_generator_loss = 0
                    mean_discriminator_loss = 0
                cur_step += 1
    return PlotData(steps=steps, generator_losses=generator_losses,
                    discriminator_losses=discriminator_losses)
print(train(epochs=10))
Started: 2023-06-25 21:54:06.871834
Epoch 8, step 4100: Generator loss: 2.0954834696577818, discriminator loss: 0.19236493878793443
Ended: 2023-06-25 21:55:15.804335
Elapsed: 0:01:08.932501
PlotData(steps=[4100], generator_losses=[2.0954834696577818], discriminator_losses=[0.19236493878793443])
def run_batch(parts: GANParts, plot_data: PlotData,
              batch_size: int=100) -> PlotData:
    """Run a smaller batch of epochs

    Args:
     parts: the GAN parts
     plot_data: the accumulated output of the training

    Returns:
     updated plot_data
    """
    next_step = max(plot_data.steps) + 1 if plot_data.steps else 0
    output = train(gen=parts.generator,
                   gen_opt=parts.generator_optimizer,
                   disc=parts.discriminator,
                   disc_opt=parts.discriminator_optimizer, epochs=batch_size,
                   start_step=next_step)
    return PlotData(
        steps=plot_data.steps + output.steps,
        generator_losses=plot_data.generator_losses + output.generator_losses,
        discriminator_losses=(plot_data.discriminator_losses +
                              output.discriminator_losses))

At about one epoch a minute, this should take about an hour and forty minutes or so.

parts = build_parts()
plot_data = run_batch(parts, PlotData([], [], []))
Started: 2023-06-25 21:55:59.764625
Epoch 8, step 4100: Generator loss: 2.0340036781241237, discriminator loss: 0.1983713445948756
Epoch 17, step 8200: Generator loss: 3.8651721296077866, discriminator loss: 0.06043561847562473
Epoch 26, step 12300: Generator loss: 4.252211730131294, discriminator loss: 0.05903724503090109
Epoch 34, step 16400: Generator loss: 4.04407673754343, discriminator loss: 0.08460633859674378
Epoch 43, step 20500: Generator loss: 3.758469139889969, discriminator loss: 0.11290085612546387
Epoch 52, step 24600: Generator loss: 3.38570730540812, discriminator loss: 0.14871580843972707
Epoch 61, step 28700: Generator loss: 2.912376526158033, discriminator loss: 0.1914764872665816
Epoch 69, step 32800: Generator loss: 2.7313904397080573, discriminator loss: 0.20848654878030437
Epoch 78, step 36900: Generator loss: 2.433680108175046, discriminator loss: 0.2548566825288091
Epoch 87, step 41000: Generator loss: 2.2282438069436625, discriminator loss: 0.2819251123615879
Epoch 96, step 45100: Generator loss: 2.1413849820741846, discriminator loss: 0.293808027022496
Ended: 2023-06-25 22:07:28.587673
Elapsed: 0:11:28.823048

Twelve minutes… something's wrong with my math (or my code).

plot_data = run_batch(parts, plot_data, batch_size=500)
Started: 2021-12-11 01:13:48.089630
Epoch 8, step 49200: Generator loss: 2.2106766842923546, discriminator loss: 0.30767942854180547
Epoch 17, step 53300: Generator loss: 2.053519683320349, discriminator loss: 0.3469244217182084
Epoch 26, step 57400: Generator loss: 1.8833183762213095, discriminator loss: 0.3716685724476485
Epoch 34, step 61500: Generator loss: 1.7962335860147725, discriminator loss: 0.388955959706772
Epoch 43, step 65600: Generator loss: 1.7297733351079438, discriminator loss: 0.39347998913468357
Epoch 52, step 69700: Generator loss: 1.6756027169634629, discriminator loss: 0.40696758046382825
Epoch 61, step 73800: Generator loss: 1.7340464807719738, discriminator loss: 0.391147572165583
Epoch 69, step 77900: Generator loss: 1.6250884978945666, discriminator loss: 0.4112582277960894
Epoch 78, step 82000: Generator loss: 1.5282696703294425, discriminator loss: 0.4417608343828005
Epoch 87, step 86100: Generator loss: 1.551421322909793, discriminator loss: 0.4309799511621639
Epoch 96, step 90200: Generator loss: 1.502451353887236, discriminator loss: 0.43569522036284997
Epoch 104, step 94300: Generator loss: 1.5355568281034093, discriminator loss: 0.4274125394515864
Epoch 113, step 98400: Generator loss: 1.515106762124273, discriminator loss: 0.4361268874203291
Epoch 122, step 102500: Generator loss: 1.4262471299345922, discriminator loss: 0.4538826259389165
Epoch 131, step 106600: Generator loss: 1.3722852372541716, discriminator loss: 0.46847428283313336
Epoch 139, step 110700: Generator loss: 1.30627704975082, discriminator loss: 0.4916798695195014
Epoch 148, step 114800: Generator loss: 1.3066031820744994, discriminator loss: 0.48832398747525635
Epoch 157, step 118900: Generator loss: 1.2358002482972494, discriminator loss: 0.5049220843794864
Epoch 166, step 123000: Generator loss: 1.2148369922434428, discriminator loss: 0.5165507747632705
Epoch 174, step 127100: Generator loss: 1.1939138480802867, discriminator loss: 0.5140434620365866
Epoch 183, step 131200: Generator loss: 1.1655712901092137, discriminator loss: 0.5210341898333737
Epoch 192, step 135300: Generator loss: 1.0877681193119135, discriminator loss: 0.5537625885663968
Epoch 201, step 139400: Generator loss: 1.1292353331461187, discriminator loss: 0.5308915733782256
Epoch 209, step 143500: Generator loss: 1.097566315502655, discriminator loss: 0.5411388381806799
Epoch 218, step 147600: Generator loss: 1.0757532430131274, discriminator loss: 0.5517329544628548
Epoch 227, step 151700: Generator loss: 1.0799634633267818, discriminator loss: 0.5458411719668192
Epoch 236, step 155800: Generator loss: 1.0768744062068984, discriminator loss: 0.5462928909935603
Epoch 244, step 159900: Generator loss: 1.038065175501311, discriminator loss: 0.5608314768788278
Epoch 253, step 164000: Generator loss: 1.0237846635173, discriminator loss: 0.5654601171176611
Epoch 262, step 168100: Generator loss: 0.9991180751236483, discriminator loss: 0.5736101391184627
Epoch 271, step 172200: Generator loss: 1.0070789654516565, discriminator loss: 0.5714522401516029
Epoch 279, step 176300: Generator loss: 0.9614028337670539, discriminator loss: 0.5910174207425695
Epoch 288, step 180400: Generator loss: 0.9706563392499603, discriminator loss: 0.5850461383781773
Epoch 297, step 184500: Generator loss: 0.9717842458661009, discriminator loss: 0.5814466277055627
Epoch 305, step 188600: Generator loss: 0.9541865834084969, discriminator loss: 0.5840334389413279
Epoch 314, step 192700: Generator loss: 0.9712330036628544, discriminator loss: 0.5782463719932038
Epoch 323, step 196800: Generator loss: 0.9551204285098255, discriminator loss: 0.5880173592596538
Epoch 332, step 200900: Generator loss: 0.9226158920584665, discriminator loss: 0.5996578313373941
Epoch 340, step 205000: Generator loss: 0.9246665488074481, discriminator loss: 0.597404850198001
Epoch 349, step 209100: Generator loss: 0.9608930256744725, discriminator loss: 0.5801783614260392
Epoch 358, step 213200: Generator loss: 0.9612123412766118, discriminator loss: 0.5802033745152197
Epoch 367, step 217300: Generator loss: 0.9180114618307212, discriminator loss: 0.5937960715264811
Epoch 375, step 221400: Generator loss: 0.9368741252364194, discriminator loss: 0.5900343924106617
Epoch 384, step 225500: Generator loss: 0.9440083874725718, discriminator loss: 0.5846795598762786
Epoch 393, step 229600: Generator loss: 0.9536703778330882, discriminator loss: 0.5829839877384473
Epoch 402, step 233700: Generator loss: 0.9294257469729658, discriminator loss: 0.5916450903909986
Epoch 410, step 237800: Generator loss: 0.9466903525445554, discriminator loss: 0.5856489780472574
Epoch 419, step 241900: Generator loss: 0.9522996573477257, discriminator loss: 0.5799181649743058
Epoch 428, step 246000: Generator loss: 0.9839635591972143, discriminator loss: 0.5714024179156216
Epoch 437, step 250100: Generator loss: 0.9648611798373641, discriminator loss: 0.5808709646216277
Epoch 445, step 254200: Generator loss: 0.9566089589712128, discriminator loss: 0.5805564494249301
Epoch 454, step 258300: Generator loss: 0.9783696506226937, discriminator loss: 0.5709307750914155
Epoch 463, step 262400: Generator loss: 0.9719752054098166, discriminator loss: 0.5762244856866391
Epoch 472, step 266500: Generator loss: 0.999251530417581, discriminator loss: 0.5657230390717329
Epoch 480, step 270600: Generator loss: 0.9960989969387296, discriminator loss: 0.5667149156113943
Epoch 489, step 274700: Generator loss: 1.0056089845953926, discriminator loss: 0.5628991482534059
Epoch 498, step 278800: Generator loss: 0.9715120762295836, discriminator loss: 0.5786686657042022
Ended: 2021-12-11 02:15:09.581779
Elapsed: 1:01:21.492149
def hook(plot, element):
    figure = plot.state
    figure["layout"]["sizing_mode"] = Plot.sizing_mode
    return

def plot_losses(plot_data: PlotData, file_name: str="losses",
                title: str="Training Loss"):
    """Plot the losses in Holoviews

    Args:
     plot_data: namedtuple with the losses over time
     file_name: name to save the plot (without extension)
     title: title for the plot
    """
    plotting = pandas.DataFrame.from_dict({
        "Step": plot_data.steps,
        "Generator Loss": plot_data.generator_losses,
        "Discriminator Loss": plot_data.discriminator_losses
    })

    gen_plot = plotting.hvplot(x="Step", y="Generator Loss", color=Plot.blue)
    disc_plot = plotting.hvplot(x="Step", y="Discriminator Loss", color=Plot.red)

    plot = (gen_plot * disc_plot).opts(title=title,
                                       height=Plot.height,
                                       width=Plot.width,
                                       ylabel="Loss",
                                       hooks=[hook],
                                       fontscale=Plot.fontscale)
    return Embed(plot=plot, file_name=file_name)()
output = plot_losses(plot_data)
print(output)

Figure Missing

plot_data = run_batch(parts, plot_data, batch_size=500)
output = plot_losses(plot_data, file_name="losses_2", title="Training Loss 2")
print(output)

Figure Missing

Looking at the final model.

def plot_image(image: torch.Tensor,
                filename: str,
                title: str,
                num_images: int=25,
                size: tuple=(1, 28, 28),
                folder: str="files/posts/gans/mnist-gan/") -> None:
    """Plot the image and save it

    Args:
     image: the tensor with the image to plot
     filename: name for the final image file
     title: title to put on top of the image
     num_images: how many images to put in the composite image
     size: the size for the image
     folder: sub-folder to save the file in
    """
    unflattened_image = image.detach().cpu().view(-1, *size)
    image_grid = make_grid(unflattened_image[:num_images], nrow=5)

    pyplot.title(title)
    pyplot.grid(False)
    pyplot.imshow(image_grid.permute(1, 2, 0).squeeze())

    pyplot.tick_params(bottom=False, top=False, labelbottom=False,
                       right=False, left=False, labelleft=False)
    pyplot.savefig(folder + filename)
    print(f"[[file:{filename}]]")
    return
fake_noise = get_noise(500, z_dim, device=device)
fake = parts.generator(fake_noise)
plot_image(image=fake, filename="fake_digits.png", title="Fake Digits")

fake_digits.png

At first I thought something was wrong with the losses, since they seem to drift the wrong way over time, but each loss measures how well the other network is doing its job, so as the Generator gets better the Discriminator's loss goes up. The Discriminator loss is the main one to watch, since it reflects how often the Generator fools it. Since it's still going up, this likely means the Generator could still improve with more training.

Neural Machine Translation: Helper Functions

Helper Functions

We will first implement a few functions that we will use later on. These will be for:

  • the input encoder
  • the pre-attention decoder
  • preparation of the queries, keys, values, and mask.

Imports

# from pypi
from trax import layers
from trax.fastmath import numpy as fastmath_numpy

import trax

Helper functions

Input encoder

The input encoder runs on the input tokens, creates their embeddings, and feeds them to an LSTM network, which outputs the activations that will be the keys and values for attention. It is a Serial network which uses:

  • tl.Embedding: Converts each token to its vector representation. In this case, it is the size of the vocabulary by the dimension of the model: tl.Embedding(vocab_size, d_model). vocab_size is the number of entries in the given vocabulary. d_model is the number of elements in the word embedding.
  • tl.LSTM: LSTM layer of size d_model. We want to be able to configure how many encoder layers we have, so we create as many LSTM layers as the n_encoder_layers parameter specifies.
def input_encoder(input_vocab_size: int, d_model: int,
                     n_encoder_layers: int) -> layers.Serial:
    """ Input encoder runs on the input sentence and creates
    activations that will be the keys and values for attention.

    Args:
       input_vocab_size: vocab size of the input
       d_model:  depth of embedding (n_units in the LSTM cell)
       n_encoder_layers: number of LSTM layers in the encoder

    Returns:
       tl.Serial: The input encoder
    """
    input_encoder = layers.Serial( 
        layers.Embedding(input_vocab_size, d_model),
        [layers.LSTM(d_model) for _ in range(n_encoder_layers)]
    )
    return input_encoder
def test_input_encoder_fn(input_encoder_fn):
    target = input_encoder_fn
    success = 0
    fails = 0

    input_vocab_size = 10
    d_model = 2
    n_encoder_layers = 6

    encoder = target(input_vocab_size, d_model, n_encoder_layers)

    lstms = "\n".join([f'  LSTM_{d_model}'] * n_encoder_layers)

    expected = f"Serial[\n  Embedding_{input_vocab_size}_{d_model}\n{lstms}\n]"

    proposed = str(encoder)

    # Test all layers are in the expected sequence
    try:
        assert(proposed.replace(" ", "") == expected.replace(" ", ""))
        success += 1
    except:
        fails += 1
        print("Wrong model. \nProposed:\n%s" %proposed, "\nExpected:\n%s" %expected)

    # Test the output type
    try:
        assert(isinstance(encoder, trax.layers.combinators.Serial))
        success += 1
        # Test the number of layers
        try:
            # Test 
            assert len(encoder.sublayers) == (n_encoder_layers + 1)
            success += 1
        except:
            fails += 1
            print('The number of sublayers does not match %s <>' %len(encoder.sublayers), " %s" %(n_encoder_layers + 1))
    except:
        fails += 1
        print("The enconder is not an object of ", trax.layers.combinators.Serial)


    if fails == 0:
        print("\033[92m All tests passed")
    else:
        print('\033[92m', success," Tests passed")
        print('\033[91m', fails, " Tests failed")
test_input_encoder_fn(input_encoder)
[92m All tests passed
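
As a sanity check on the shapes, here is a small sketch that is not part of the original assignment: it assumes trax's usual init-then-apply convention, and the toy_encoder and tokens names plus the toy values are made up for illustration.

# python
import numpy

# pypi
from trax import shapes

toy_encoder = input_encoder(input_vocab_size=10, d_model=2, n_encoder_layers=2)

# one sentence of four token ids (0 is the padding id)
tokens = numpy.array([[1, 2, 3, 0]])
toy_encoder.init(shapes.signature(tokens))
activations = toy_encoder(tokens)

# one d_model-sized activation per token, so we expect (1, 4, 2)
print(activations.shape)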

Pre-attention decoder

The pre-attention decoder runs on the targets and creates activations that are used as queries in attention. This is a Serial network which is composed of the following:

  • tl.ShiftRight: This pads a token to the beginning of your target tokens (e.g. [8, 34, 12] shifted right is [0, 8, 34, 12]). This will act like a start-of-sentence token that will be the first input to the decoder. During training, this shift also allows the target tokens to be passed as input to do teacher forcing.
  • tl.Embedding: Like in the previous function, this converts each token to its vector representation. In this case, it is the size of the vocabulary by the dimension of the model: tl.Embedding(vocab_size, d_model). vocab_size is the number of entries in the given vocabulary. d_model is the number of elements in the word embedding.
  • tl.LSTM: LSTM layer of size d_model.
def pre_attention_decoder(mode: str, target_vocab_size: int, d_model: int) -> layers.Serial:
    """ Pre-attention decoder runs on the targets and creates
    activations that are used as queries in attention.

    Args:
       mode: 'train' or 'eval'
       target_vocab_size: vocab size of the target
       d_model:  depth of embedding (n_units in the LSTM cell)
    Returns:
       tl.Serial: The pre-attention decoder
    """
    return layers.Serial(
        layers.ShiftRight(mode=mode),
        layers.Embedding(target_vocab_size, d_model),
        layers.LSTM(d_model)
    )
def test_pre_attention_decoder_fn(pre_attention_decoder_fn):
    target = pre_attention_decoder_fn
    success = 0
    fails = 0

    mode = 'train'
    target_vocab_size = 10
    d_model = 2

    decoder = target(mode, target_vocab_size, d_model)

    expected = f"Serial[\n  ShiftRight(1)\n  Embedding_{target_vocab_size}_{d_model}\n  LSTM_{d_model}\n]"

    proposed = str(decoder)

    # Test all layers are in the expected sequence
    try:
        assert(proposed.replace(" ", "") == expected.replace(" ", ""))
        success += 1
    except:
        fails += 1
        print("Wrong model. \nProposed:\n%s" %proposed, "\nExpected:\n%s" %expected)

    # Test the output type
    try:
        assert(isinstance(decoder, trax.layers.combinators.Serial))
        success += 1
        # Test the number of layers
        try:
            # Test 
            assert len(decoder.sublayers) == 3
            success += 1
        except:
            fails += 1
            print('The number of sublayers does not match %s <>' %len(decoder.sublayers), " %s" %3)
    except:
        fails += 1
        print("The enconder is not an object of ", trax.layers.combinators.Serial)


    if fails == 0:
        print("\033[92m All tests passed")
    else:
        print('\033[92m', success," Tests passed")
        print('\033[91m', fails, " Tests failed")

They changed the behavior of the Fn (or something in there) so that it always wraps the ShiftRight in a Serial layer, which means it no longer matches the test's expected string. Testing strings is kind of gimpy anyway…

It looks like they're using a decorator to check the shape which then wraps it in a Serial layer. See trax.layers.assert_shape.AssertFunction

test_pre_attention_decoder_fn(pre_attention_decoder)
Wrong model. 
Proposed:
Serial[
  Serial[
    ShiftRight(1)
  ]
  Embedding_10_2
  LSTM_2
] 
Expected:
Serial[
  ShiftRight(1)
  Embedding_10_2
  LSTM_2
]
[92m 2  Tests passed
[91m 1  Tests failed
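
Since the only failure is the exact-string comparison, a looser check (just a sketch, not the course's grader) can verify the layer order and the sublayer count without caring whether ShiftRight gets wrapped in its own Serial:

decoder = pre_attention_decoder(mode="train", target_vocab_size=10, d_model=2)
representation = str(decoder)

# the layers should appear in this order, wrapped or not
assert (representation.index("ShiftRight")
        < representation.index("Embedding_10_2")
        < representation.index("LSTM_2"))
assert len(decoder.sublayers) == 3
print("looser checks passed")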

Preparing the attention input

This function will prepare the inputs to the attention layer. We want to take in the encoder and pre-attention decoder activations and assign them to the queries, keys, and values. In addition, another output here will be the mask to distinguish real tokens from padding tokens. This mask will be used internally by Trax when computing the softmax so that padding tokens will not have an effect on the computed probabilities. From the data preparation steps in Section 1 of this assignment, you should know which tokens in the input correspond to padding.

def prepare_attention_input(encoder_activations: fastmath_numpy.array,
                            decoder_activations: fastmath_numpy.array,
                            inputs: fastmath_numpy.array) -> tuple:
    """Prepare queries, keys, values and mask for attention.

    Args:
       encoder_activations fastnp.array(batch_size, padded_input_length, d_model): output from the input encoder
       decoder_activations fastnp.array(batch_size, padded_input_length, d_model): output from the pre-attention decoder
       inputs fastnp.array(batch_size, padded_input_length): padded input tokens

    Returns:
       queries, keys, values and mask for attention.
    """
    # the encoder output supplies the keys and values;
    # the pre-attention decoder output supplies the queries
    keys = encoder_activations
    values = encoder_activations
    queries = decoder_activations

    # True for real tokens, False for padding (token id 0)
    mask = inputs != 0

    # reshape to (batch_size, 1, 1, padded_input_length), then broadcast along the
    # query axis to (batch_size, 1, padded_target_length, padded_input_length)
    mask = fastmath_numpy.reshape(mask, (mask.shape[0], 1, 1, mask.shape[1]))
    mask += fastmath_numpy.zeros((1, 1, decoder_activations.shape[1], 1))
    return queries, keys, values, mask
def test_prepare_attention_input(prepare_attention_input):
    target = prepare_attention_input
    success = 0
    fails = 0

    #This unit test consider a batch size = 2, number_of_tokens = 3 and embedding_size = 4

    enc_act = fastmath_numpy.array([[[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0]],
               [[1, 0, 1, 0], [0, 1, 0, 1], [0, 0, 0, 0]]])
    dec_act = fastmath_numpy.array([[[2, 0, 0, 0], [0, 2, 0, 0], [0, 0, 2, 0]], 
               [[2, 0, 2, 0], [0, 2, 0, 2], [0, 0, 0, 0]]])
    inputs =  fastmath_numpy.array([[1, 2, 3], [1, 4, 0]])

    exp_mask = fastmath_numpy.array([[[[1., 1., 1.], [1., 1., 1.], [1., 1., 1.]]], 
                             [[[1., 1., 0.], [1., 1., 0.], [1., 1., 0.]]]])

    exp_type = type(enc_act)

    queries, keys, values, mask = target(enc_act, dec_act, inputs)

    try:
        assert(fastmath_numpy.allclose(queries, dec_act))
        success += 1
    except:
        fails += 1
        print("Queries does not match the decoder activations")
    try:
        assert(fastmath_numpy.allclose(keys, enc_act))
        success += 1
    except:
        fails += 1
        print("Keys does not match the encoder activations")
    try:
        assert(fastmath_numpy.allclose(values, enc_act))
        success += 1
    except:
        fails += 1
        print("Values does not match the encoder activations")
    try:
        assert(fastmath_numpy.allclose(mask, exp_mask))
        success += 1
    except:
        fails += 1
        print("Mask does not match expected tensor. \nExpected:\n%s" %exp_mask, "\nOutput:\n%s" %mask)

    # Test the output type
    try:
        assert(isinstance(queries, exp_type))
        assert(isinstance(keys, exp_type))
        assert(isinstance(values, exp_type))
        assert(isinstance(mask, exp_type))
        success += 1
    except:
        fails += 1
        print("One of the output object are not of type ", jax.interpreters.xla.DeviceArray)

    if fails == 0:
        print("\033[92m All tests passed")
    else:
        print('\033[92m', success," Tests passed")
        print('\033[91m', fails, " Tests failed")
test_prepare_attention_input(prepare_attention_input)
[92m All tests passed

Logistic Regression With Neural Networks

Beginning

In this post we will build a logistic regression classifier to recognize cats.

Instructions

  • Do not use loops (for/while) in your code, unless the instructions explicitly ask you to do so.

You will learn to:

  • Build the general architecture of a learning algorithm, including:
    • Initializing parameters
    • Calculating the cost function and its gradient
    • Using an optimization algorithm (gradient descent)
  • Gather all three functions above into a main model function, in the right order.

1 - Packages

First, let's run the cell below to import all the packages that you will need during this assignment.

  • numpy is the fundamental package for scientific computing with Python.
  • h5py is a common package to interact with a dataset that is stored on an H5 file.
  • matplotlib is a famous library to plot graphs in Python.
  • PIL and scipy are used here to test your model with your own picture at the end.
import numpy as np
import matplotlib.pyplot as plt
import h5py
import scipy
from PIL import Image
from scipy import ndimage
from lr_utils import load_dataset
get_ipython().run_line_magic('matplotlib', 'inline')
TOLERANCE = (0.1)**5

2 - Overview of the Problem set

Problem Statement: You are given a dataset ("data.h5") containing:

  • a training set of m_train images labeled as cat (y=1) or non-cat (y=0)
  • a test set of m_test images labeled as cat or non-cat
  • each image is of shape (num_px, num_px, 3) where 3 is for the 3 channels (RGB). Thus, each image is square (height = num_px) and (width = num_px).

You will build a simple image-recognition algorithm that can correctly classify pictures as cat or non-cat.

Let's get more familiar with the dataset. Load the data by running the following code.

# Loading the data (cat/non-cat)
train_set_x_orig, train_set_y, test_set_x_orig, test_set_y, classes = load_dataset()

We added "_orig" at the end of image datasets (train and test) because we are going to preprocess them. After preprocessing, we will end up with train_set_x and test_set_x (the labels train_set_y and test_set_y don't need any preprocessing).

Each line of your train_set_x_orig and test_set_x_orig is an array representing an image. You can visualize an example by running the following code. Feel free also to change the `index` value and re-run to see other images.

# Example of a picture
index = 25
image = plt.imshow(train_set_x_orig[index])

picture_example.png

print ("y = " + str(train_set_y[:, index]) + ", it's a '" + classes[np.squeeze(train_set_y[:, index])].decode("utf-8") +  "' picture.")

Many software bugs in deep learning come from having matrix/vector dimensions that don't fit. If you can keep your matrix/vector dimensions straight you will go a long way toward eliminating many bugs.

Exercise: Find the values for:

  • m_train (number of training examples)
  • m_test (number of test examples)
  • num_px (= height = width of a training image)

Remember that `train_set_x_orig` is a numpy-array of shape (m_train, num_px, num_px, 3). For instance, you can access `m_train` by writing `train_set_x_orig.shape[0]`.

### START CODE HERE ### (≈ 3 lines of code)
m_train = train_set_x_orig.shape[0]
m_test = test_set_x_orig.shape[0]
num_px = train_set_x_orig.shape[1]
### END CODE HERE ###
assert m_train == 209
assert m_test == 50
assert num_px == 64
print ("Number of training examples: m_train = " + str(m_train))
print ("Number of testing examples: m_test = " + str(m_test))
print ("Height/Width of each image: num_px = " + str(num_px))
print ("Each image is of size: (" + str(num_px) + ", " + str(num_px) + ", 3)")
print ("train_set_x shape: " + str(train_set_x_orig.shape))
print ("train_set_y shape: " + str(train_set_y.shape))
print ("test_set_x shape: " + str(test_set_x_orig.shape))
print ("test_set_y shape: " + str(test_set_y.shape))

For convenience, you should now reshape images of shape (num_px, num_px, 3) into a numpy-array of shape (num_px \(\times\) num_px \(\times\) 3, 1). After this, our training (and test) dataset is a numpy-array where each column represents a flattened image. There should be m_train (respectively m_test) columns.

Exercise: Reshape the training and test data sets so that images of size (num_px, num_px, 3) are flattened into single vectors of shape (num_px \(\times\) num_px \(\times\) 3, 1).

A trick when you want to flatten a matrix X of shape (a, b, c, d) to a matrix X_flatten of shape (b \(\times\) c \(\times\) d, a) is to use:

X_flatten = X.reshape(X.shape[0], -1).T      # X.T is the transpose of X

Reshape the training and test examples

### START CODE HERE ### (≈ 2 lines of code)
train_set_x_flatten = train_set_x_orig.reshape(train_set_x_orig.shape[0], -1).T
test_set_x_flatten = test_set_x_orig.reshape(test_set_x_orig.shape[0], -1).T
### END CODE HERE ###
assert train_set_x_flatten.shape == (12288, 209)
assert train_set_y.shape == (1, 209)
assert test_set_x_flatten.shape == (12288, 50)
assert test_set_y.shape == (1, 50)
assert (train_set_x_flatten[0:5, 0] == np.array([17, 31, 56, 22, 33])).all()
print ("train_set_x_flatten shape: " + str(train_set_x_flatten.shape))
print ("train_set_y shape: " + str(train_set_y.shape))
print ("test_set_x_flatten shape: " + str(test_set_x_flatten.shape))
print ("test_set_y shape: " + str(test_set_y.shape))
print ("sanity check after reshaping: " + str(train_set_x_flatten[0:5,0]))

To represent color images, the red, green and blue channels (RGB) must be specified for each pixel, and so the pixel value is actually a vector of three numbers ranging from 0 to 255.

One common preprocessing step in machine learning is to center and standardize your dataset, meaning that you subtract the mean of the whole numpy array from each example, and then divide each example by the standard deviation of the whole numpy array. But for picture datasets, it is simpler and more convenient, and works almost as well, to just divide every row of the dataset by 255 (the maximum value of a pixel channel).


Let's standardize our dataset.

train_set_x = train_set_x_flatten/255.
test_set_x = test_set_x_flatten/255.
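
For reference, the fuller center-and-scale preprocessing mentioned above would look something like this sketch; the _standardized names are just illustrative, and the rest of the post sticks with the divide-by-255 data.

# center and scale using statistics from the training set only
train_mean = train_set_x_flatten.mean()
train_std = train_set_x_flatten.std()
train_set_x_standardized = (train_set_x_flatten - train_mean) / train_std
test_set_x_standardized = (test_set_x_flatten - train_mean) / train_std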

What you need to remember:

Common steps for pre-processing a new dataset are:

  • Figure out the dimensions and shapes of the problem (m_train, m_test, num_px, …)
  • Reshape the datasets such that each example is now a vector of size (num_px \(\times\) num_px \(\times\) 3, 1)
  • "Standardize" the data

3 - General Architecture of the learning algorithm

It's time to design a simple algorithm to distinguish cat images from non-cat images.

You will build a Logistic Regression, using a Neural Network mindset. The following Figure explains why Logistic Regression is actually a very simple Neural Network!

<img src="images/LogReg_kiank.png" style="width:650px;height:400px;">

Mathematical expression of the algorithm, for one example \(x^{(i)}\):

\[z^{(i)} = w^T x^{(i)} + b\]

\[\hat{y}^{(i)} = a^{(i)} = \sigma(z^{(i)})\]

\[\mathcal{L}(a^{(i)}, y^{(i)}) = -y^{(i)}\log(a^{(i)}) - (1 - y^{(i)})\log(1 - a^{(i)})\]

The cost is then the average of the losses over all training examples:

\[J = \frac{1}{m}\sum_{i=1}^{m}\mathcal{L}(a^{(i)}, y^{(i)})\]

Key steps: In this exercise, you will carry out the following steps:

  • Initialize the parameters of the model
  • Learn the parameters for the model by minimizing the cost
  • Use the learned parameters to make predictions (on the test set)
  • Analyse the results and conclude

4 - Building the parts of our algorithm

The main steps for building a Neural Network are:

  1. Define the model structure (such as number of input features)
  2. Initialize the model's parameters
  3. Loop:
    • Calculate current loss (forward propagation)
    • Calculate current gradient (backward propagation)
    • Update parameters (gradient descent)

You often build 1-3 separately and integrate them into one function we call `model()`.

4.1 - Helper functions

Exercise: Using your code from "Python Basics", implement `sigmoid()`. As you've seen in the figure above, you need to compute \(sigmoid( w^T x + b) = \frac{1}{1 + e^{-(w^T x + b)}}\) to make predictions. Use np.exp().

# GRADED FUNCTION: sigmoid

def sigmoid(z):
    """
    Compute the sigmoid of z

    Arguments:
    z -- A scalar or numpy array of any size.

    Return:
    s -- sigmoid(z)
    """

    ### START CODE HERE ### (≈ 1 line of code)
    s = 1/(1 + np.exp(-z))
    ### END CODE HERE ###

    return s
expected = np.array([0.5, 0.88079708])
actual = sigmoid(np.array([0,2]))
print ("sigmoid([0, 2]) = " + str(actual))
assert (np.abs(expected - actual) < TOLERANCE).all()

4.2 - Initializing parameters

Exercise: Implement parameter initialization in the cell below. You have to initialize w as a vector of zeros. If you don't know what numpy function to use, look up np.zeros() in the Numpy library's documentation.

# GRADED FUNCTION: initialize_with_zeros

def initialize_with_zeros(dim):
    """
    This function creates a vector of zeros of shape (dim, 1) for w and initializes b to 0.

    Argument:
    dim -- size of the w vector we want (or number of parameters in this case)

    Returns:
    w -- initialized vector of shape (dim, 1)
    b -- initialized scalar (corresponds to the bias)
    """

    ### START CODE HERE ### (≈ 1 line of code)
    w = np.zeros((dim, 1))
    b = 0
    ### END CODE HERE ###

    assert(w.shape == (dim, 1))
    assert(isinstance(b, float) or isinstance(b, int))

    return w, b
dim = 2
w, b = initialize_with_zeros(dim)
print ("w = " + str(w))
print ("b = " + str(b))

For image inputs, w will be of shape (num_px \(\times\) num_px \(\times\) 3, 1).

4.3 - Forward and Backward propagation

Now that your parameters are initialized, you can do the "forward" and "backward" propagation steps for learning the parameters.

Exercise: Implement a function `propagate()` that computes the cost function and its gradient.

Hints:

Forward Propagation:

  • You get X
  • You compute \(A = \sigma(w^T X + b) = (a^{(1)}, a^{(2)}, \ldots, a^{(m)})\)
  • You calculate the cost function: \(J = -\frac{1}{m}\sum_{i=1}^{m}\left[y^{(i)}\log(a^{(i)})+(1-y^{(i)})\log(1-a^{(i)})\right]\)

Here are the two formulas you will be using:

\[ \frac{\partial J}{\partial w} = \frac{1}{m}X(A-Y)^T\tag{7}\] \[ \frac{\partial J}{\partial b} = \frac{1}{m} \sum_{i=1}^m (a^{(i)}-y^{(i)})\tag{8}\]

# GRADED FUNCTION: propagate

def propagate(w, b, X, Y):
    """
    Implement the cost function and its gradient for the propagation explained above

    Arguments:
    w -- weights, a numpy array of size (num_px * num_px * 3, 1)
    b -- bias, a scalar
    X -- data of size (num_px * num_px * 3, number of examples)
    Y -- true "label" vector (containing 0 if non-cat, 1 if cat) of size (1, number of examples)

    Return:
    cost -- negative log-likelihood cost for logistic regression
    dw -- gradient of the loss with respect to w, thus same shape as w
    db -- gradient of the loss with respect to b, thus same shape as b

    Tips:
    - Write your code step by step for the propagation. np.log(), np.dot()
    """

    m = X.shape[1]

    # FORWARD PROPAGATION (FROM X TO COST)
    ### START CODE HERE ### (≈ 2 lines of code)
    A = sigmoid(np.dot(w.T, X) + b)                                    # compute activation
    cost = -(Y * np.log(A) + (1 - Y) * np.log(1 - A)).mean()
    ### END CODE HERE ###

    # BACKWARD PROPAGATION (TO FIND GRAD)
    ### START CODE HERE ### (≈ 2 lines of code)
    dz = A - Y
    dw = np.dot(X, dz.T)/m
    db = dz.mean()
    ### END CODE HERE ###

    assert(dw.shape == w.shape)
    assert(db.dtype == float)
    cost = np.squeeze(cost)
    assert(cost.shape == ())

    grads = {"dw": dw,
             "db": db}

    return grads, cost
w, b, X, Y = np.array([[1.],[2.]]), 2., np.array([[1.,2.,-1.],[3.,4.,-3.2]]), np.array([[1,0,1]])
grads, cost = propagate(w, b, X, Y)
print ("dw = " + str(grads["dw"]))
print ("db = " + str(grads["db"]))
print ("cost = " + str(cost))
assert (grads["dw"] - np.abs(np.array([[0.99845601], [2.39507239]])) < TOLERANCE).all()
assert np.abs(grads["db"] - 0.00145557813678) < TOLERANCE
assert abs(cost - 5.801545319394553) < TOLERANCE

4.4 - Optimization

  • You have initialized your parameters.
  • You are also able to compute a cost function and its gradient.
  • Now, you want to update the parameters using gradient descent.

Exercise: Write down the optimization function. The goal is to learn \(w\) and \(b\) by minimizing the cost function \(J\). For a parameter \(\theta\), the update rule is \(\theta = \theta - \alpha \, d\theta\), where \(\alpha\) is the learning rate.

# GRADED FUNCTION: optimize

def optimize(w, b, X, Y, num_iterations, learning_rate, print_cost = False):
    """
    This function optimizes w and b by running a gradient descent algorithm

    Arguments:
    w -- weights, a numpy array of size (num_px * num_px * 3, 1)
    b -- bias, a scalar
    X -- data of shape (num_px * num_px * 3, number of examples)
    Y -- true "label" vector (containing 0 if non-cat, 1 if cat), of shape (1, number of examples)
    num_iterations -- number of iterations of the optimization loop
    learning_rate -- learning rate of the gradient descent update rule
    print_cost -- True to print the loss every 100 steps

    Returns:
    params -- dictionary containing the weights w and bias b
    grads -- dictionary containing the gradients of the weights and bias with respect to the cost function
    costs -- list of all the costs computed during the optimization, this will be used to plot the learning curve.

    Tips:
    You basically need to write down two steps and iterate through them:
        1) Calculate the cost and the gradient for the current parameters. Use propagate().
        2) Update the parameters using gradient descent rule for w and b.
    """

    costs = []

    for i in range(num_iterations):


        # Cost and gradient calculation (≈ 1-4 lines of code)
        ### START CODE HERE ### 
        grads, cost = propagate(w, b, X, Y)
        ### END CODE HERE ###

        # Retrieve derivatives from grads
        dw = grads["dw"]
        db = grads["db"]

        # update rule (≈ 2 lines of code)
        ### START CODE HERE ###
        w = w - learning_rate * dw
        b = b - learning_rate * db
        ### END CODE HERE ###

        # Record the costs
        if i % 100 == 0:
            costs.append(cost)

        # Print the cost every 100 training examples
        if print_cost and i % 100 == 0:
            print ("Cost after iteration %i: %f" %(i, cost))

    params = {"w": w,
              "b": b}

    grads = {"dw": dw,
             "db": db}

    return params, grads, costs
params, grads, costs = optimize(w, b, X, Y, num_iterations= 100, learning_rate = 0.009, print_cost = False)
print ("w = " + str(params["w"]))
print ("b = " + str(params["b"]))
print ("dw = " + str(grads["dw"]))
print ("db = " + str(grads["db"]))
assert (np.abs(params["w"] - np.array([[ 0.19033591], [ 0.12259159]])) < TOLERANCE).all()
assert abs(params["b"] - 1.92535983008) < TOLERANCE
assert (np.abs(grads['dw'] - np.array([[ 0.67752042], [ 1.41625495]])) < TOLERANCE).all()
assert abs(grads["db"] - 0.219194504541) < TOLERANCE

Exercise: The previous function will output the learned w and b. We are able to use w and b to predict the labels for a dataset X. Implement the `predict()` function. There are two steps to computing predictions:

  1. Calculate \(\hat{Y} = A = \sigma(w^T X + b)\)
  2. Convert the entries of A into 0 (if activation <= 0.5) or 1 (if activation > 0.5) and store the predictions in a vector `Y_prediction`. If you wish, you can use an `if`/`else` statement in a `for` loop (though there is also a way to vectorize this, as shown after the test below).
# GRADED FUNCTION: predict

def predict(w, b, X):
    '''
    Predict whether the label is 0 or 1 using learned logistic regression parameters (w, b)

    Arguments:
    w -- weights, a numpy array of size (num_px * num_px * 3, 1)
    b -- bias, a scalar
    X -- data of size (num_px * num_px * 3, number of examples)

    Returns:
    Y_prediction -- a numpy array (vector) containing all predictions (0/1) for the examples in X
    '''

    m = X.shape[1]
    Y_prediction = np.zeros((1,m))
    w = w.reshape(X.shape[0], 1)

    # Compute vector "A" predicting the probabilities of a cat being present in the picture
    ### START CODE HERE ### (≈ 1 line of code)
    A = sigmoid(np.dot(w.T, X) + b)
    ### END CODE HERE ###

    for i in range(A.shape[1]):

        # Convert probabilities A[0,i] to actual predictions p[0,i]
        ### START CODE HERE ### (≈ 4 lines of code)
        Y_prediction[0, i] = 1 if A[0, i] > 0.5 else 0
        ### END CODE HERE ###

    assert(Y_prediction.shape == (1, m))

    return Y_prediction
w = np.array([[0.1124579],[0.23106775]])
b = -0.3
X = np.array([[1.,-1.1,-3.2],[1.2,2.,0.1]])
predictions = predict(w, b, X)
print ("predictions = " + str(predictions))
assert (np.abs(predictions - np.array([1., 1., 0.])) < TOLERANCE).all()
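
As the exercise hints, the thresholding loop can also be vectorized. Here's a minimal sketch (`predict_vectorized` is just an illustrative name, not part of the graded exercise), checked against the w, b, X, and predictions from the test above:

def predict_vectorized(w, b, X):
    """Vectorized predict: no explicit loop over the examples."""
    A = sigmoid(np.dot(w.reshape(X.shape[0], 1).T, X) + b)
    return (A > 0.5).astype(float)

assert (predict_vectorized(w, b, X) == predictions).all()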

What to remember: You've implemented several functions that:

  • Initialize (w,b)
  • Optimize the loss iteratively to learn parameters (w,b):
    • computing the cost and its gradient
    • updating the parameters using gradient descent
  • Use the learned (w,b) to predict the labels for a given set of examples

5 - Merge all functions into a model

You will now see how the overall model is structured by putting all the building blocks (the functions implemented in the previous parts) together in the right order.

Exercise: Implement the model function. Use the following notation:

  • Y_prediction for your predictions on the test set
  • Y_prediction_train for your predictions on the train set
  • w, costs, grads for the outputs of optimize()
# GRADED FUNCTION: model

def model(X_train, Y_train, X_test, Y_test, num_iterations = 2000, learning_rate = 0.5, print_cost = False):
    """
    Builds the logistic regression model by calling the function you've implemented previously

    Arguments:
    X_train -- training set represented by a numpy array of shape (num_px * num_px * 3, m_train)
    Y_train -- training labels represented by a numpy array (vector) of shape (1, m_train)
    X_test -- test set represented by a numpy array of shape (num_px * num_px * 3, m_test)
    Y_test -- test labels represented by a numpy array (vector) of shape (1, m_test)
    num_iterations -- hyperparameter representing the number of iterations to optimize the parameters
    learning_rate -- hyperparameter representing the learning rate used in the update rule of optimize()
    print_cost -- Set to true to print the cost every 100 iterations

    Returns:
    d -- dictionary containing information about the model.
    """

    ### START CODE HERE ###

    # initialize parameters with zeros (≈ 1 line of code)
    w, b = initialize_with_zeros(X_train.shape[0])

    # Gradient descent (≈ 1 line of code)
    parameters, grads, costs = optimize(w, b, X_train, Y_train, num_iterations, learning_rate, print_cost)

    # Retrieve parameters w and b from dictionary "parameters"
    w = parameters["w"]
    b = parameters["b"]

    # Predict test/train set examples (≈ 2 lines of code)
    Y_prediction_test = predict(w, b, X_test)
    Y_prediction_train = predict(w, b, X_train)

    ### END CODE HERE ###

    # Print train/test Errors
    print("train accuracy: {} %".format(100 - np.mean(np.abs(Y_prediction_train - Y_train)) * 100))
    print("test accuracy: {} %".format(100 - np.mean(np.abs(Y_prediction_test - Y_test)) * 100))


    d = {"costs": costs,
         "Y_prediction_test": Y_prediction_test, 
         "Y_prediction_train" : Y_prediction_train, 
         "w" : w, 
         "b" : b,
         "learning_rate" : learning_rate,
         "num_iterations": num_iterations}

    return d
d = model(train_set_x, train_set_y, test_set_x, test_set_y, num_iterations = 2000, learning_rate = 0.005, print_cost = True)

Comment: Training accuracy is close to 100%. This is a good sanity check: your model is working and has high enough capacity to fit the training data. Test accuracy is 68%. It is actually not bad for this simple model, given the small dataset we used and that logistic regression is a linear classifier. But no worries, you'll build an even better classifier next week!

Also, you see that the model is clearly overfitting the training data. Later in this specialization you will learn how to reduce overfitting, for example by using regularization. Using the code below (and changing the `index` variable) you can look at predictions on pictures of the test set.

# Example of a picture that was wrongly classified.
index = 1
plt.imshow(test_set_x[:,index].reshape((num_px, num_px, 3)))
<matplotlib.image.AxesImage at 0x7f02ff4aaa90>

wrong_classification.png

y_actual = test_set_y[0,index]
y_prediction_test = d["Y_prediction_test"]
prediction = classes[int(y_prediction_test[0,index])].decode("utf-8")
print ("y = " + str(y_actual) + ", you predicted that it is a \"" + prediction +  "\" picture.")
# Plot learning curve (with costs)
costs = np.squeeze(d['costs'])
plt.plot(costs)
plt.ylabel('cost')
plt.xlabel('iterations (per hundreds)')
plt.title("Learning rate =" + str(d["learning_rate"]))
plt.show()

learning_curve.png

Interpretation: You can see the cost decreasing. It shows that the parameters are being learned. However, you see that you could train the model even more on the training set. Try to increase the number of iterations in the cell above and rerun the cells. You might see that the training set accuracy goes up, but the test set accuracy goes down. This is called overfitting.

6 - Further analysis (optional/ungraded exercise)

Congratulations on building your first image classification model. Let's analyze it further, and examine possible choices for the learning rate \(\alpha\).

#### Choice of learning rate ####

Reminder: In order for Gradient Descent to work you must choose the learning rate wisely. The learning rate \(\alpha\) determines how rapidly we update the parameters. If the learning rate is too large we may "overshoot" the optimal value. Similarly, if it is too small we will need too many iterations to converge to the best values. That's why it is crucial to use a well-tuned learning rate.

Let's compare the learning curve of our model with several choices of learning rates. Run the cell below. This should take about 1 minute. Feel free also to try different values than the three we have initialized the `learning_rates` variable to contain, and see what happens.

learning_rates = [0.01, 0.001, 0.0001]
models = {}
for i in learning_rates:
    print ("learning rate is: " + str(i))
    models[str(i)] = model(train_set_x, train_set_y, test_set_x, test_set_y, num_iterations = 1500, learning_rate = i, print_cost = False)
    print ('\n' + "-------------------------------------------------------" + '\n')
for i in learning_rates:
    plt.plot(np.squeeze(models[str(i)]["costs"]), label= str(models[str(i)]["learning_rate"]))

plt.ylabel('cost')
plt.xlabel('iterations')

legend = plt.legend(loc='upper center', shadow=True)
frame = legend.get_frame()
frame.set_facecolor('0.90')
plt.show()

tuning_alpha.png

Interpretation:

  • Different learning rates give different costs and thus different predictions results.
  • If the learning rate is too large (0.01), the cost may oscillate up and down. It may even diverge (though in this example, using 0.01 still eventually ends up at a good value for the cost).
  • A lower cost doesn't mean a better model. You have to check if there is possibly overfitting. It happens when the training accuracy is a lot higher than the test accuracy.
  • In deep learning, we usually recommend that you:
    • Choose the learning rate that better minimizes the cost function.
    • If your model overfits, use other techniques to reduce overfitting. (We'll talk about this in later videos.)

7 - Test with your own image (optional/ungraded exercise)

Congratulations on finishing this assignment. You can use your own image and see the output of your model. To do that:

  1. Click on "File" in the upper bar of this notebook, then click "Open" to go on your Coursera Hub.
  2. Add your image to this Jupyter Notebook's directory, in the "images" folder
  3. Change your image's name in the following code
  4. Run the code and check if the algorithm is right (1 = cat, 0 = non-cat)!
## START CODE HERE ## (PUT YOUR IMAGE NAME) 
my_image = "my_image.jpg"   # change this to the name of your image file 
## END CODE HERE ##
# We preprocess the image to fit your algorithm.
fname = "images/" + my_image
image = np.array(ndimage.imread(fname, flatten=False))
my_image = scipy.misc.imresize(image, size=(num_px,num_px)).reshape((1, num_px*num_px*3)).T
my_predicted_image = predict(d["w"], d["b"], my_image)
plt.imshow(image)
print("y = " + str(np.squeeze(my_predicted_image)) + ", your algorithm predicts a \"" + classes[int(np.squeeze(my_predicted_image)),].decode("utf-8") +  "\" picture.")

What to remember from this assignment:

  1. Preprocessing the dataset is important.
  2. You implemented each function separately: initialize(), propagate(), optimize(). Then you built a model().
  3. Tuning the learning rate (which is an example of a "hyperparameter") can make a big difference to the algorithm. You will see more examples of this later in this course!

Finally, if you'd like, we invite you to try different things on this Notebook. Make sure you submit before trying anything. Once you submit, things you can play with include:

  • Play with the learning rate and the number of iterations
  • Try different initialization methods and compare the results
  • Test other preprocessings (center the data, or divide each row by its standard deviation)


Basic Numpy for Neural Networks

Beginning

Numpy is the main package for scientific computing in Python. It is maintained by a large community (www.numpy.org). In this exercise you will learn several key numpy functions such as np.exp, np.log, and np.reshape.

Imports

# python
from collections import namedtuple
from functools import partial

import math
import time

# pypi
from expects import be_true, equal, expect

import hvplot.pandas
import numpy
import pandas

# my stuff
from graeae import EmbedHoloviews

Set Up

slug = "basic-numpy-for-neural-networks"
Embed = partial(EmbedHoloviews, folder_path=f"files/posts/first-course/{slug}")

Plot = namedtuple("Plot", ["width", "height", "fontscale", "tan", "blue", "red"])
PLOT = Plot(
    width=900,
    height=750,
    fontscale=2,
    tan="#ddb377",
    blue="#4687b7",
    red="#ce7b6d",
 )

Building basic functions with numpy

sigmoid function, np.exp()

Before using np.exp(), you will use math.exp() to implement the sigmoid function. You will then see why np.exp() is preferable to math.exp(). (see python's power and logarithmic functions)

Build a function that returns the sigmoid of a real number x using math.exp(x) for the exponential function.

\(sigmoid(x) = \frac{1}{1+e^{-x}}\) is sometimes also known as the logistic function. It is a non-linear function used not only in Machine Learning (Logistic Regression), but also in Deep Learning.

TOLERANCE = 0.000001
def basic_sigmoid(x: float) -> float:
    """Compute sigmoid of x.

    Args:
     x: A scalar

    Returns:
     s: sigmoid(x)
    """
    return 1/(1 + math.exp(-x))
expected = 0.9525741268224334
actual = basic_sigmoid(3)

expect(math.isclose(expected, actual, rel_tol=TOLERANCE)).to(be_true)
print(f"sigmoid of 3: {actual:0.3f}")

Actually, we rarely use the "math" library in deep learning, because its functions only accept real-number (scalar) inputs. In deep learning we mostly use matrices and vectors, which is why numpy is more useful.

One reason why we use "numpy" instead of "math" in Deep Learning

x = [1, 2, 3]
try:
    basic_sigmoid(x) # you will see this give an error when you run it, because x is a vector.
except TypeError as error:
    print(str(error))

In fact, if \(x = (x_1, x_2, ..., x_n)\) is a row vector then \(np.exp(x)\) will apply the exponential function to every element of x. The output will thus be: \(np.exp(x) = (e^{x_1}, e^{x_2}, ..., e^{x_n})\). (see numpy.exp).

x = numpy.array([1, 2, 3])
print(numpy.exp(x))
[ 2.71828183  7.3890561  20.08553692]

Furthermore, if x is a vector, then a Python operation such as \(s = x + 3\) or \(s = \frac{1}{x}\) will output s as a vector of the same size as x.

x = numpy.array([1, 2, 3])
print (x + 3)
[4 5 6]
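
The reciprocal case mentioned above works the same way:

x = numpy.array([1, 2, 3])
print(1 / x)   # element-wise reciprocals: 1, 0.5, 0.333...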

Any time you need more info on a numpy function, we encourage you to look at the official documentation.

Implement the sigmoid function using numpy.

Instructions: x could now be either a real number, a vector, or a matrix. The data structures we use in numpy to represent these shapes (vectors, matrices…) are called numpy arrays. You don't need to know more for now.

\[ \text{For } x \in \mathbb{R}^n \text{, } sigmoid(x) = sigmoid\begin{pmatrix} x_1 \\ x_2 \\ ... \\ x_n \\ \end{pmatrix} = \begin{pmatrix} \frac{1}{1+e^{-x_1}} \\ \frac{1}{1+e^{-x_2}} \\ ... \\ \frac{1}{1+e^{-x_n}} \\ \end{pmatrix}\tag{1} \]

def sigmoid(x):
    """
    Compute the sigmoid of x

    Args:
     x: A scalar or numpy array of any size

    Returns:
     s: sigmoid(x)
    """
    return 1/(1 + numpy.exp(-x))
x = numpy.array([1, 2, 3])
expected = numpy.array([ 0.73105858,  0.88079708,  0.95257413])
actual = sigmoid(x)
print(actual)
expect(numpy.allclose(expected, actual, TOLERANCE)).to(be_true)
[0.73105858 0.88079708 0.95257413]

Sigmoid gradient

You will need to compute gradients to optimize loss functions using backpropagation. Let's code your first gradient function.

The formula is:

\[ sigmoid\_derivative(x) = \sigma'(x) = \sigma(x) (1 - \sigma(x))\tag{2} \]

You often code this function in two steps:

  1. Set s to be the sigmoid of x. You might find your sigmoid(x) function useful.
  2. Compute \(\sigma'(x) = s(1-s)\)

numpy.random.randn generates a sample from the standard normal distribution.

a = numpy.random.randn(2, 3)
b = numpy.random.randn(2, 1)
c = a + b
def sigmoid_derivative(x):
    """
    Compute the gradient (also called the slope or derivative) of the sigmoid
    function with respect to its input x.

    Args:
     x: A scalar or numpy array

    Returns:
     ds: Your computed gradient.
    """
    s = sigmoid(x)
    return s * (1 - s)
x = numpy.array([1, 2, 3])
expected = numpy.array([0.19661193, 0.10499359, 0.04517666])
actual = sigmoid_derivative(x)
print (f"sigmoid_derivative(x) = {actual}")
expect(numpy.allclose(expected, actual, TOLERANCE)).to(be_true)
sigmoid_derivative(x) = [0.19661193 0.10499359 0.04517666]

Plotting The Sigmoid and Its Derivative

x = numpy.linspace(-10, 10)
siggy = sigmoid(x)
siggy_slope = sigmoid_derivative(x)
frame = pandas.DataFrame.from_dict(dict(Sigmoid=siggy, Slope=siggy_slope))
frame = frame.set_index(x)
plot = frame.hvplot(title="Sigmoid and Derivative").opts(
    height=PLOT.height,
    width=PLOT.width,
    fontscale=PLOT.fontscale,
    ylim=(0, 1),
)

output = Embed(plot=plot, file_name="sigmoid")()
print(output)

Figure Missing

Reshaping arrays

Two common numpy functions used in deep learning are np.shape and np.reshape().

  • X.shape is used to get the shape (dimension) of a matrix/vector X.
  • X.reshape(…) is used to reshape X into some other dimension.

For example, in computer science, an image is represented by a 3D array of shape \((length, height, depth = 3)\). However, when you read an image as the input of an algorithm you convert it to a vector of shape \((length \times height \times 3, 1)\). In other words, you "unroll", or reshape, the 3D array into a 1D vector.

We'll implement image2vector(), a function that takes an input of shape (length, height, 3) and returns a vector of shape \((length\times height\times 3, 1)\). For example, if you would like to reshape an array v of shape (a, b, c) into a vector of shape (\(a \times b, c\)) you would do:

v = v.reshape((v.shape[0] * v.shape[1], v.shape[2]))
def image2vector(image: numpy.ndarray) -> numpy.ndarray:
    """Unroll the image

    Args:
     image: array of shape (length, height, depth)

    Returns:
     v: vector of shape (length*height*depth, 1)
    """
    length, height, depth = image.shape
    return image.reshape((length * height * depth, 1))

Our image will be a 3 by 3 by 2 array; typically images will be \((\textrm{number of pixels}_x, \textrm{number of pixels}_y, 3)\) where 3 represents the RGB values.

image = numpy.array([[[ 0.67826139,  0.29380381],
                      [ 0.90714982,  0.52835647],
                      [ 0.4215251 ,  0.45017551]],

                     [[ 0.92814219,  0.96677647],
                      [ 0.85304703,  0.52351845],
                      [ 0.19981397,  0.27417313]],

                     [[ 0.60659855,  0.00533165],
                      [ 0.10820313,  0.49978937],
                      [ 0.34144279,  0.94630077]]])

expected = numpy.array([[ 0.67826139],
                        [ 0.29380381],
                        [ 0.90714982],
                        [ 0.52835647],
                        [ 0.4215251 ],
                        [ 0.45017551],
                        [ 0.92814219],
                        [ 0.96677647],
                        [ 0.85304703],
                        [ 0.52351845],
                        [ 0.19981397],
                        [ 0.27417313],
                        [ 0.60659855],
                        [ 0.00533165],
                        [ 0.10820313],
                        [ 0.49978937],
                        [ 0.34144279],
                        [ 0.94630077]])

actual = image2vector(image)
print (f"image2vector(image) = {actual}")
length, height, depth = image.shape
expect(actual.shape == (length * height * depth, 1)).to(be_true)
expect(numpy.allclose(actual, expected, TOLERANCE)).to(be_true)
image2vector(image) = [[0.67826139]
 [0.29380381]
 [0.90714982]
 [0.52835647]
 [0.4215251 ]
 [0.45017551]
 [0.92814219]
 [0.96677647]
 [0.85304703]
 [0.52351845]
 [0.19981397]
 [0.27417313]
 [0.60659855]
 [0.00533165]
 [0.10820313]
 [0.49978937]
 [0.34144279]
 [0.94630077]]

Normalizing rows

Another common technique we use in Machine Learning and Deep Learning is to normalize our data. It often leads to better performance because gradient descent converges faster after normalization. Here, by normalization we mean changing x to \(\frac{x}{\|x\|}\) (dividing each row vector of x by its norm).

For example, if \[ x = \begin{bmatrix} 0 & 3 & 4 \\ 2 & 6 & 4 \\ \end{bmatrix}\tag{3} \]

then

\[ \| x\| = np.linalg.norm(x, axis = 1, keepdims = True) = \begin{bmatrix} 5 \\ \sqrt{56} \\ \end{bmatrix}\tag{4} \] and

\[ x\_normalized = \frac{x}{\| x\|} = \begin{bmatrix} 0 & \frac{3}{5} & \frac{4}{5} \\ \frac{2}{\sqrt{56}} & \frac{6}{\sqrt{56}} & \frac{4}{\sqrt{56}} \\ \end{bmatrix}\tag{5} \]

Note that you can divide matrices of different sizes and it works fine: this is called broadcasting and you're going to learn about it further down.

Now we'll implement normalizeRows() to normalize the rows of a matrix. After applying this function to an input matrix x, each row of x should be a vector of unit length (meaning length 1).

See: numpy.linalg.norm

def normalizeRows(x: numpy.ndarray) -> numpy.ndarray:
    """
    Implement a function that normalizes each row of the matrix x 
    (to have unit length).

    Args:
     x: A numpy matrix of shape (n, m)

    Returns:
     x: The normalized (by row) numpy matrix.
    """
    x_norm = numpy.linalg.norm(x, ord=2, axis=1, keepdims=True)    
    x = x/x_norm
    return x
x = numpy.array([
    [0, 3, 4],
    [1, 6, 4]])

expected = numpy.array([[ 0., 0.6, 0.8],
                        [ 0.13736056,  0.82416338,  0.54944226]])
actual = normalizeRows(x)

print(f"normalizeRows(x) = {actual}")
expect(numpy.allclose(expected, actual, TOLERANCE)).to(be_true)
normalizeRows(x) = [[0.         0.6        0.8       ]
 [0.13736056 0.82416338 0.54944226]]

We can check that each row is a unit vector by calculating the Euclidean distance.

\[ Euclidean = \sqrt{\sum X^2} \]

SUM_ROWS = 1
print(numpy.sqrt(numpy.sum(actual**2, axis=SUM_ROWS)))
[1. 1.]

Note: x_norm and x have different shapes. This is normal given that x_norm takes the norm of each row of x. So x_norm has the same number of rows but only 1 column. As a consequence you can't use x /= x_norm instead of x = x/x_norm. So how did it work when you divided x by x_norm? This is called broadcasting and we'll talk about it next.

Broadcasting and the softmax function

A very important concept to understand in numpy is "broadcasting". It is very useful for performing mathematical operations between arrays of different shapes. For the full details on broadcasting, you can read the official broadcasting documentation.

We'll implement a softmax function using numpy. You can think of softmax as a normalizing function used when your algorithm needs to classify two or more classes.

The Mathy Definitions: \[ \text{for } x \in \mathbb{R}^{1\times n} \text{, } softmax(x) = softmax(\begin{bmatrix} x_1 && x_2 && \ldots && x_n \end{bmatrix}) = \begin{bmatrix} \frac{e^{x_1}}{\sum_{j}e^{x_j}} && \frac{e^{x_2}}{\sum_{j}e^{x_j}} && \ldots && \frac{e^{x_n}}{\sum_{j}e^{x_j}} \end{bmatrix} \]

\(\text{for a matrix } x \in \mathbb{R}^{m \times n}\text{, } x_{ij}\) maps to the element in the \(i^{th}\) row and \(j^{th}\) column of x, thus we have: \[ softmax(x) = softmax\begin{bmatrix} x_{11} & x_{12} & x_{13} & \dots & x_{1n} \\ x_{21} & x_{22} & x_{23} & \dots & x_{2n} \\ \vdots & \vdots & \vdots & \ddots & \vdots \\ x_{m1} & x_{m2} & x_{m3} & \dots & x_{mn} \end{bmatrix} = \begin{bmatrix} \frac{e^{x_{11}}}{\sum_{j}e^{x_{1j}}} & \frac{e^{x_{12}}}{\sum_{j}e^{x_{1j}}} & \frac{e^{x_{13}}}{\sum_{j}e^{x_{1j}}} & \dots & \frac{e^{x_{1n}}}{\sum_{j}e^{x_{1j}}} \\ \frac{e^{x_{21}}}{\sum_{j}e^{x_{2j}}} & \frac{e^{x_{22}}}{\sum_{j}e^{x_{2j}}} & \frac{e^{x_{23}}}{\sum_{j}e^{x_{2j}}} & \dots & \frac{e^{x_{2n}}}{\sum_{j}e^{x_{2j}}} \\ \vdots & \vdots & \vdots & \ddots & \vdots \\ \frac{e^{x_{m1}}}{\sum_{j}e^{x_{mj}}} & \frac{e^{x_{m2}}}{\sum_{j}e^{x_{mj}}} & \frac{e^{x_{m3}}}{\sum_{j}e^{x_{mj}}} & \dots & \frac{e^{x_{mn}}}{\sum_{j}e^{x_{mj}}} \end{bmatrix} = \begin{pmatrix} softmax\text{(first row of x)} \\ softmax\text{(second row of x)} \\ \ldots \\ softmax\text{(last row of x)} \\ \end{pmatrix} \]

See also: numpy.sum

ROW_SUMS = 1

def softmax(x: numpy.ndarray) -> numpy.ndarray:
    """Calculates the softmax for each row of the input x.

    Args:
     x: A numpy matrix of shape (n,m)

    Returns:
     s: A numpy matrix equal to the softmax of x, of shape (n,m)
    """
    x_exp = numpy.exp(x)
    x_sum = numpy.sum(x_exp, axis=ROW_SUMS, keepdims=True)

    return x_exp/x_sum

expected = numpy.array([[ 9.80897665e-01, 8.94462891e-04, 1.79657674e-02,
                          1.21052389e-04, 1.21052389e-04],
                        [ 8.78679856e-01, 1.18916387e-01, 8.01252314e-04,
                          8.01252314e-04, 8.01252314e-04]])

x = numpy.array([
    [9, 2, 5, 0, 0],
    [7, 5, 0, 0 ,0]])

actual = softmax(x)
print(f"softmax(x) = {actual}")
expect(numpy.allclose(expected, actual, TOLERANCE)).to(be_true)
softmax(x) = [[9.80897665e-01 8.94462891e-04 1.79657674e-02 1.21052389e-04
  1.21052389e-04]
 [8.78679856e-01 1.18916387e-01 8.01252314e-04 8.01252314e-04
  8.01252314e-04]]

Note:

  • If you print the shapes of x_exp, x_sum and s above and rerun the assessment cell, you will see that x_sum is of shape (2,1) while x_exp and s are of shape (2,5). x_exp/x_sum works due to python broadcasting.
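
A quick shape check makes the note concrete (this re-uses the x from the softmax test above):

x_exp = numpy.exp(x)
x_sum = numpy.sum(x_exp, axis=ROW_SUMS, keepdims=True)
s = x_exp / x_sum
print(x_exp.shape, x_sum.shape, s.shape)
(2, 5) (2, 1) (2, 5)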

What you need to remember:

  • np.exp(x) works for any np.array x and applies the exponential function to every coordinate
  • the sigmoid function and its gradient
  • Some equivalent of image2vector is commonly used in deep learning
  • np.reshape is widely used. In the future, you'll see that keeping your matrix/vector dimensions straight will go toward eliminating a lot of bugs.
  • numpy has efficient built-in functions
  • broadcasting is extremely useful

Vectorization

In deep learning, you deal with very large datasets. Hence, a non-computationally-optimal function can become a huge bottleneck in your algorithm and can result in a model that takes ages to run. To make sure that your code is computationally efficient, you will use vectorization. For example, try to tell the difference between the following implementations of the dot/outer/elementwise product.

x1 = [9, 2, 5, 0, 0, 7, 5, 0, 0, 0, 9, 2, 5, 0, 0]
x2 = [9, 2, 2, 9, 0, 9, 2, 5, 0, 0, 9, 2, 5, 0, 0]
CLASSIC = dict()

Classic (Non-Vectorized)

Dot Product Of Vectors Implementation

tic = time.process_time()
dot = 0
for i in range(len(x1)):
    dot+= x1[i]*x2[i]
toc = time.process_time()
CLASSIC["dot"] = 1000 * (toc - tic)
print (f"dot = {dot} \n ----- Computation time = {CLASSIC['dot']} ms")
dot = 278 
 ----- Computation time = 0.09222100000005895 ms

Outer Product Implementation

tic = time.process_time()
outer = numpy.zeros((len(x1),len(x2)))
for i in range(len(x1)):
    for j in range(len(x2)):
        outer[i,j] = x1[i]*x2[j]
toc = time.process_time()
CLASSIC["outer"] = 1000*(toc - tic)
print (f"outer = {outer}\n ----- Computation time = {CLASSIC['outer']} ms")
outer = [[81. 18. 18. 81.  0. 81. 18. 45.  0.  0. 81. 18. 45.  0.  0.]
 [18.  4.  4. 18.  0. 18.  4. 10.  0.  0. 18.  4. 10.  0.  0.]
 [45. 10. 10. 45.  0. 45. 10. 25.  0.  0. 45. 10. 25.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [63. 14. 14. 63.  0. 63. 14. 35.  0.  0. 63. 14. 35.  0.  0.]
 [45. 10. 10. 45.  0. 45. 10. 25.  0.  0. 45. 10. 25.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [81. 18. 18. 81.  0. 81. 18. 45.  0.  0. 81. 18. 45.  0.  0.]
 [18.  4.  4. 18.  0. 18.  4. 10.  0.  0. 18.  4. 10.  0.  0.]
 [45. 10. 10. 45.  0. 45. 10. 25.  0.  0. 45. 10. 25.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]]
 ----- Computation time = 0.2285300000002266 ms

Elementwise Implementation

tic = time.process_time()
mul = numpy.zeros(len(x1))
for i in range(len(x1)):
    mul[i] = x1[i]*x2[i]
toc = time.process_time()
CLASSIC["elementwise"] = 1000*(toc - tic)
print(f"elementwise multiplication = {mul}\n ----- Computation time = {CLASSIC['elementwise']} ms")
elementwise multiplication = [81.  4. 10.  0.  0. 63. 10.  0.  0.  0. 81.  4. 25.  0.  0.]
 ----- Computation time = 0.10630600000016699 ms

General Dot Product Implementation

W = numpy.random.rand(3,len(x1))
tic = time.process_time()
gdot = numpy.zeros(W.shape[0])
for i in range(W.shape[0]):
    for j in range(len(x1)):
        gdot[i] += W[i,j]*x1[j]
toc = time.process_time()
CLASSIC["general_dot"] = 1000*(toc - tic)
print(f"gdot = {gdot}\n ----- Computation time = {CLASSIC['general_dot']} ms")
gdot = [26.7997887  21.98533453 17.23427487]
 ----- Computation time = 0.14043400000041117 ms

Vectorized

Dot Product Of Vectors

tic = time.process_time()
dot = numpy.dot(x1,x2)
toc = time.process_time()
DOT = 1000*(toc - tic)
print(f"dot = {dot}\n ----- Computation time = {DOT} ms")
print(f"Difference: {CLASSIC['dot'] - DOT} ms")
dot = 278
 ----- Computation time = 0.11425399999964725 ms
Difference: -0.0220329999995883 ms

So for this small input, pure Python is faster.

Outer Product

tic = time.process_time()
outer = numpy.outer(x1,x2)
toc = time.process_time()
OUTER = 1000*(toc - tic)
print(f"outer = {outer}\n ----- Computation time = {OUTER} ms")
print(f"Difference: {CLASSIC['outer'] - OUTER} ms")
outer = [[81 18 18 81  0 81 18 45  0  0 81 18 45  0  0]
 [18  4  4 18  0 18  4 10  0  0 18  4 10  0  0]
 [45 10 10 45  0 45 10 25  0  0 45 10 25  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [63 14 14 63  0 63 14 35  0  0 63 14 35  0  0]
 [45 10 10 45  0 45 10 25  0  0 45 10 25  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [81 18 18 81  0 81 18 45  0  0 81 18 45  0  0]
 [18  4  4 18  0 18  4 10  0  0 18  4 10  0  0]
 [45 10 10 45  0 45 10 25  0  0 45 10 25  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]]
 ----- Computation time = 0.09857899999943243 ms
Difference: 0.12995100000079418 ms

Now numpy is a little faster.

Elementwise Multiplication

tic = time.process_time()
mul = numpy.multiply(x1,x2)
toc = time.process_time()
ELEMENTWISE = 1000*(toc - tic)
print(f"elementwise multiplication = {mul}\n ----- Computation time = {ELEMENTWISE} ms")
print(f"Difference: {CLASSIC['elementwise'] - ELEMENTWISE} ms")
elementwise multiplication = [81  4 10  0  0 63 10  0  0  0 81  4 25  0  0]
 ----- Computation time = 0.07506199999962604 ms
Difference: 0.03124400000054095 ms

General Dot Product

tic = time.process_time()
dot = numpy.dot(W,x1)
toc = time.process_time()
GENERAL = 1000*(toc - tic)
print(f"gdot = {dot}\n ----- Computation time = {GENERAL} ms")
print(f"Difference: {CLASSIC['general_dot'] - GENERAL} ms")
gdot = [26.7997887  21.98533453 17.23427487]
 ----- Computation time = 0.10962399999936423 ms
Difference: 0.030810000001046944 ms

As you may have noticed, the vectorized implementation is much cleaner and somewhat more efficient. For bigger vectors/matrices, the differences in running time become even bigger.

Note that np.dot() performs a matrix-matrix or matrix-vector multiplication. This is different from np.multiply() and the * operator (the equivalent of .* in Matlab/Octave), which perform element-wise multiplication.
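
To make the distinction concrete, here's a quick check using the x1 and x2 vectors from above:

v1, v2 = numpy.array(x1), numpy.array(x2)
print(numpy.dot(v1, v2))  # a single number: 278
print(v1 * v2)            # a vector, the same as numpy.multiply(v1, v2)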

The L1 and L2 loss functions

L1

Now we'll implement the numpy vectorized version of the L1 loss.

Reminder: The loss is used to evaluate the performance of your model. The bigger your loss is, the more different your predictions (\( \hat{y} \)) are from the true values (y). In deep learning, you use optimization algorithms like Gradient Descent to train your model and to minimize the cost. L1 loss is defined as:

\[ \begin{align*} & L_1(\hat{y}, y) = \sum_{i=0}^m \left|y^{(i)} - \hat{y}^{(i)}\right| \end{align*}\tag{6} \]

def L1(yhat: numpy.ndarray, y: numpy.ndarray) -> numpy.ndarray:
    """L1 Loss

    Args:
     yhat: vector of size m (predicted labels)
     y: vector of size m (true labels)

    Returns:
     loss: the value of the L1 loss function defined above
    """
    return numpy.sum(numpy.abs(y - yhat))
yhat = numpy.array([.9, 0.2, 0.1, .4, .9])
y = numpy.array([1, 0, 0, 1, 1])
expected = 1.1
actual = L1(yhat, y)
print(f"L1 = {actual}")
expect(actual).to(equal(expected))
L1 = 1.1

L2 Loss

Next we'll implement the numpy vectorized version of the L2 loss. There are several ways of implementing it: the version below uses np.sum on the squared differences, and np.dot() is an equally handy alternative since, if \(x = [x_1, x_2, \ldots, x_n]\), then np.dot(x,x) = \(\sum_{j=0}^n x_j^{2}\).

L2 loss is defined as \[ L_2(\hat{y},y) = \sum_{i=0}^m\left(y^{(i)} - \hat{y}^{(i)}\right)^2\tag{7} \]

def L2(yhat: numpy.ndarray, y: numpy.ndarray) -> numpy.ndarray:
    """Calculate the L2 Loss

    Args:
     yhat: vector of size m (predicted labels)
     y: vector of size m (true labels)

    Returns:
     loss: the value of the L2 loss function defined above
    """
    return numpy.sum((y - yhat)**2)
yhat = numpy.array([.9, 0.2, 0.1, .4, .9])
y = numpy.array([1, 0, 0, 1, 1])
expected = 0.43
actual = L2(yhat,y)
print(f"L2 = {actual}")
expect(actual).to(equal(expected))
L2 = 0.43
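
As a quick check of the np.dot() route mentioned above (not part of the original exercise), the same value comes out of the dot product of the difference with itself:

difference = y - yhat
print(numpy.dot(difference, difference))  # also 0.43 (up to floating point)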

End

What to remember:

  • Vectorization is very important in deep learning. It provides computational efficiency and clarity.
  • You have reviewed the L1 and L2 loss.
  • You are familiar with many numpy functions such as np.sum, np.dot, np.multiply, np.maximum, etc…

Source

This was an exercise from DeepLearning.ai's first Coursera course. (link to come)

Neural Machine Translation: Testing the Model

Table of Contents

Testing the Model

In the previous post we trained our machine translation model so now it's time to test it and see how well it does.

End

The overview post with links to all the posts in this series is here.

Raw

# # Part 4:  Testing
# 
# We will now be using the model you just trained to translate English sentences to German. We will implement this with two functions: The first allows you to identify the next symbol (i.e. output token). The second one takes care of combining the entire translated string.
# 
# We will start by first loading in a pre-trained copy of the model you just coded. Please run the cell below to do just that.

# In[ ]:


# instantiate the model we built in eval mode
model = NMTAttn(mode='eval')

# initialize weights from a pre-trained model
model.init_from_file("model.pkl.gz", weights_only=True)
model = tl.Accelerate(model)


# <a name="4.1"></a>
# ## 4.1  Decoding
# 
# As discussed in the lectures, there are several ways to get the next token when translating a sentence. For instance, we can just get the most probable token at each step (i.e. greedy decoding) or get a sample from a distribution. We can generalize the implementation of these two approaches by using the `tl.logsoftmax_sample()` method. Let's briefly look at its implementation:
# 
# ```python
# def logsoftmax_sample(log_probs, temperature=1.0):  # pylint: disable=invalid-name
#   """Returns a sample from a log-softmax output, with temperature.
# 
#   Args:
#     log_probs: Logarithms of probabilities (often coming from LogSoftmax)
#     temperature: For scaling before sampling (1.0 = default, 0.0 = pick argmax)
#   """
#   # This is equivalent to sampling from a softmax with temperature.
#   u = np.random.uniform(low=1e-6, high=1.0 - 1e-6, size=log_probs.shape)
#   g = -np.log(-np.log(u))
#   return np.argmax(log_probs + g * temperature, axis=-1)
# ```
# 
# The key things to take away here are: 1. it gets random samples with the same shape as your input (i.e. `log_probs`), and 2. the amount of "noise" added to the input by these random samples is scaled by a `temperature` setting. You'll notice that setting it to `0` will just make the return statement equal to getting the argmax of `log_probs`. This will come in handy later. 
# 
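# As a quick, optional sanity check (not part of the original notebook): with temperature
# 0.0 the Gumbel noise is multiplied by zero, so the sample is exactly the argmax. This
# assumes numpy is available as np here, as in the hints below.

# In[ ]:


# temperature 0.0 should reduce to argmax
toy_log_probs = np.log(np.array([0.1, 0.7, 0.2]))
print(int(tl.logsoftmax_sample(toy_log_probs, temperature=0.0)))  # 1
print(int(np.argmax(toy_log_probs)))                              # 1

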
# <a name="ex06"></a>
# ### Exercise 06
# 
# **Instructions:** Implement the `next_symbol()` function that takes in the `input_tokens` and the `cur_output_tokens`, then return the index of the next word. You can click below for hints in completing this exercise.
# 
# <details>    
# <summary>
#     <font size="3" color="darkgreen"><b>Click Here for Hints</b></font>
# </summary>
# <p>
# <ul>
#     <li>To get the next power of two, you can compute <i>2^log_2(token_length + 1)</i> . We add 1 to avoid <i>log(0).</i></li>
#     <li>You can use <i>np.ceil()</i> to get the ceiling of a float.</li>
#     <li><i>np.log2()</i> will get the logarithm base 2 of a value</li>
#     <li><i>int()</i> will cast a value into an integer type</li>
#     <li>From the model diagram in part 2, you know that it takes two inputs. You can feed these with this syntax to get the model outputs: <i>model((input1, input2))</i>. It's up to you to determine which variables below to substitute for input1 and input2. Remember also from the diagram that the output has two elements: [log probabilities, target tokens]. You won't need the target tokens so we assigned it to _ below for you. </li>
#     <li> The log probabilities output will have the shape: (batch size, decoder length, vocab size). It will contain log probabilities for each token in the <i>cur_output_tokens</i> plus 1 for the start symbol introduced by the ShiftRight in the preattention decoder. For example, if cur_output_tokens is [1, 2, 5], the model will output an array of log probabilities each for tokens 0 (start symbol), 1, 2, and 5. To generate the next symbol, you just want to get the log probabilities associated with the last token (i.e. token 5 at index 3). You can slice the model output at [0, 3, :] to get this. It will be up to you to generalize this for any length of cur_output_tokens </li>
# </ul>
# 

# In[ ]:


# UNQ_C6
# GRADED FUNCTION
def next_symbol(NMTAttn, input_tokens, cur_output_tokens, temperature):
    """Returns the index of the next token.

    Args:
        NMTAttn (tl.Serial): An LSTM sequence-to-sequence model with attention.
        input_tokens (np.ndarray 1 x n_tokens): tokenized representation of the input sentence
        cur_output_tokens (list): tokenized representation of previously translated words
        temperature (float): parameter for sampling ranging from 0.0 to 1.0.
            0.0: same as argmax, always pick the most probable token
            1.0: sampling from the distribution (can sometimes say random things)

    Returns:
        int: index of the next token in the translated sentence
        float: log probability of the next symbol
    """

    ### START CODE HERE (REPLACE INSTANCES OF `None` WITH YOUR CODE) ###

    # set the length of the current output tokens
    token_length = None

    # calculate next power of 2 for padding length 
    padded_length = None

    # pad cur_output_tokens up to the padded_length
    padded = cur_output_tokens + None
    
    # model expects the output to have an axis for the batch size in front so
    # convert `padded` list to a numpy array with shape (x, <padded_length>) where the
    # x position is the batch axis. (hint: you can use np.expand_dims() with axis=0 to insert a new axis)
    padded_with_batch = None

    # get the model prediction. remember to use the `NMTAttn` argument defined above.
    # hint: the model accepts a tuple as input (e.g. `my_model((input1, input2))`)
    output, _ = None
    
    # get log probabilities from the last token output
    log_probs = output[None]

    # get the next symbol by getting a logsoftmax sample (*hint: cast to an int)
    symbol = None
    
    ### END CODE HERE ###

    return symbol, float(log_probs[symbol])
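

# Below is a possible completion of `next_symbol()`, following the hints above. It is a
# sketch rather than the graded solution, and it uses a different name so it does not
# shadow the graded stub.

# In[ ]:


def next_symbol_sketch(NMTAttn, input_tokens, cur_output_tokens, temperature):
    # length of the current output token list
    token_length = len(cur_output_tokens)

    # next power of two for the padded length (the + 1 avoids log(0))
    padded_length = 2 ** int(np.ceil(np.log2(token_length + 1)))

    # pad the current output tokens with zeros up to padded_length
    padded = cur_output_tokens + [0] * (padded_length - token_length)

    # add a batch axis in front: shape (1, padded_length)
    padded_with_batch = np.expand_dims(np.array(padded), axis=0)

    # run the model on the (input tokens, current output tokens) pair
    output, _ = NMTAttn((input_tokens, padded_with_batch))

    # log probabilities for the position after the last current token
    # (index token_length, because ShiftRight adds a start symbol in front)
    log_probs = output[0, token_length, :]

    # sample the next symbol (argmax when temperature is 0.0)
    symbol = int(tl.logsoftmax_sample(log_probs, temperature))

    return symbol, float(log_probs[symbol])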


# In[ ]:


# BEGIN UNIT TEST
w1_unittest.test_next_symbol(next_symbol, model)
# END UNIT TEST


# Now you will implement the `sampling_decode()` function. This will call the `next_symbol()` function above several times until the next output is the end-of-sentence token (i.e. `EOS`). It takes in an input string and returns the translated version of that string.
# 
# <a name="ex07"></a>
# ### Exercise 07
# 
# **Instructions**: Implement the `sampling_decode()` function.

# In[ ]:


# UNQ_C7
# GRADED FUNCTION
def sampling_decode(input_sentence, NMTAttn = None, temperature=0.0, vocab_file=None, vocab_dir=None):
    """Returns the translated sentence.

    Args:
        input_sentence (str): sentence to translate.
        NMTAttn (tl.Serial): An LSTM sequence-to-sequence model with attention.
        temperature (float): parameter for sampling ranging from 0.0 to 1.0.
            0.0: same as argmax, always pick the most probable token
            1.0: sampling from the distribution (can sometimes say random things)
        vocab_file (str): filename of the vocabulary
        vocab_dir (str): path to the vocabulary file

    Returns:
        tuple: (list, float, str)
            list of int: tokenized version of the translated sentence
            float: log probability of the translated sentence
            str: the translated sentence
    """
    
    ### START CODE HERE (REPLACE INSTANCES OF `None` WITH YOUR CODE) ###
    
    # encode the input sentence
    input_tokens = None
    
    # initialize the list of output tokens
    cur_output_tokens = None
    
    # initialize an integer that represents the current output index
    cur_output = None
    
    # Set the encoding of the "end of sentence" as 1
    EOS = None
    
    # check that the current output is not the end of sentence token
    while cur_output != EOS:
        
        # update the current output token by getting the index of the next word (hint: use next_symbol)
        cur_output, log_prob = None
        
        # append the current output token to the list of output tokens
        cur_output_tokens.append(cur_output)
    
    # detokenize the output tokens
    sentence = None
    
    ### END CODE HERE ###
    
    return cur_output_tokens, log_prob, sentence
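

# A possible completion of `sampling_decode()` (a sketch, not the graded solution). It
# assumes the tokenize() and detokenize() helpers (taking vocab_file and vocab_dir) from
# the earlier parts of this notebook; treat those names as assumptions here. It calls the
# next_symbol_sketch() defined above.

# In[ ]:


def sampling_decode_sketch(input_sentence, NMTAttn=None, temperature=0.0,
                           vocab_file=None, vocab_dir=None):
    # encode the input sentence into token ids
    input_tokens = tokenize(input_sentence, vocab_file=vocab_file, vocab_dir=vocab_dir)

    # start with an empty list of output tokens
    cur_output_tokens = []

    # anything that is not EOS, just to enter the loop
    cur_output = 0

    # the integer that encodes the end-of-sentence token
    EOS = 1

    while cur_output != EOS:
        # get the next token and its log probability
        cur_output, log_prob = next_symbol_sketch(NMTAttn, input_tokens,
                                                  cur_output_tokens, temperature)
        # append it to the running translation
        cur_output_tokens.append(cur_output)

    # turn the token list back into a sentence
    sentence = detokenize(cur_output_tokens, vocab_file=vocab_file, vocab_dir=vocab_dir)

    return cur_output_tokens, log_prob, sentence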


# In[ ]:


# Test the function above. Try varying the temperature setting with values from 0 to 1.
# Run it several times with each setting and see how often the output changes.
sampling_decode("I love languages.", model, temperature=0.0, vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR)


# In[ ]:


# BEGIN UNIT TEST
w1_unittest.test_sampling_decode(sampling_decode, model)
# END UNIT TEST


# We have set a default value of `0` to the temperature setting in our implementation of `sampling_decode()` above. As you may have noticed in the `logsoftmax_sample()` method, this setting will ultimately result in greedy decoding. As mentioned in the lectures, this algorithm generates the translation by getting the most probable word at each step. It gets the argmax of the output array of your model and then returns that index. See the testing function and sample inputs below. You'll notice that the output will remain the same each time you run it.

# In[ ]:


def greedy_decode_test(sentence, NMTAttn=None, vocab_file=None, vocab_dir=None):
    """Prints the input and output of our NMTAttn model using greedy decode

    Args:
        sentence (str): a custom string.
        NMTAttn (tl.Serial): An LSTM sequence-to-sequence model with attention.
        vocab_file (str): filename of the vocabulary
        vocab_dir (str): path to the vocabulary file

    Returns:
        str: the translated sentence
    """
    
    _,_, translated_sentence = sampling_decode(sentence, NMTAttn, vocab_file=vocab_file, vocab_dir=vocab_dir)
    
    print("English: ", sentence)
    print("German: ", translated_sentence)
    
    return translated_sentence


# In[ ]:


# put a custom string here
your_sentence = 'I love languages.'

greedy_decode_test(your_sentence, model, vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR);


# In[ ]:


greedy_decode_test('You are almost done with the assignment!', model, vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR);


# <a name="4.2"></a>
# ## 4.2  Minimum Bayes-Risk Decoding
# 
# As mentioned in the lectures, getting the most probable token at each step may not necessarily produce the best results. Another approach is to do Minimum Bayes Risk Decoding or MBR. The general steps to implement this are:
# 
# 1. take several random samples
# 2. score each sample against all other samples
# 3. select the one with the highest score
# 
# You will be building helper functions for these steps in the following sections.

# <a name='4.2.1'></a>
# ### 4.2.1 Generating samples
# 
# First, let's build a function to generate several samples. You can use the `sampling_decode()` function you developed earlier to do this easily. We want to record the token list and log probability for each sample as these will be needed in the next step.

# In[ ]:


def generate_samples(sentence, n_samples, NMTAttn=None, temperature=0.6, vocab_file=None, vocab_dir=None):
    """Generates samples using sampling_decode()

    Args:
        sentence (str): sentence to translate.
        n_samples (int): number of samples to generate
        NMTAttn (tl.Serial): An LSTM sequence-to-sequence model with attention.
        temperature (float): parameter for sampling ranging from 0.0 to 1.0.
            0.0: same as argmax, always pick the most probable token
            1.0: sampling from the distribution (can sometimes say random things)
        vocab_file (str): filename of the vocabulary
        vocab_dir (str): path to the vocabulary file
        
    Returns:
        tuple: (list, list)
            list of lists: token list per sample
            list of floats: log probability per sample
    """
    # define lists to contain samples and probabilities
    samples, log_probs = [], []

    # run a for loop to generate n samples
    for _ in range(n_samples):
        
        # get a sample using the sampling_decode() function
        sample, logp, _ = sampling_decode(sentence, NMTAttn, temperature, vocab_file=vocab_file, vocab_dir=vocab_dir)
        
        # append the token list to the samples list
        samples.append(sample)
        
        # append the log probability to the log_probs list
        log_probs.append(logp)
                
    return samples, log_probs


# In[ ]:


# generate 4 samples with the default temperature (0.6)
generate_samples('I love languages.', 4, model, vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR)


# ### 4.2.2 Comparing overlaps
# 
# Let us now build our functions to compare a sample against another. There are several metrics available as shown in the lectures and you can try experimenting with any one of these. For this assignment, we will be calculating scores for unigram overlaps. One of the more simple metrics is the [Jaccard similarity](https://en.wikipedia.org/wiki/Jaccard_index) which gets the intersection over union of two sets. We've already implemented it below for your perusal.

# In[ ]:


def jaccard_similarity(candidate, reference):
    """Returns the Jaccard similarity between two token lists

    Args:
        candidate (list of int): tokenized version of the candidate translation
        reference (list of int): tokenized version of the reference translation

    Returns:
        float: overlap between the two token lists
    """
    
    # convert the lists to a set to get the unique tokens
    can_unigram_set, ref_unigram_set = set(candidate), set(reference)  
    
    # get the set of tokens common to both candidate and reference
    joint_elems = can_unigram_set.intersection(ref_unigram_set)
    
    # get the set of all tokens found in either candidate or reference
    all_elems = can_unigram_set.union(ref_unigram_set)
    
    # divide the number of joint elements by the number of all elements
    overlap = len(joint_elems) / len(all_elems)
    
    return overlap


# In[ ]:


# let's try using the function. remember the result here and compare with the next function below.
jaccard_similarity([1, 2, 3], [1, 2, 3, 4])


# One of the more commonly used metrics in machine translation is the ROUGE score. For unigrams, this is called ROUGE-1 and as shown in class, you can output the scores for both precision and recall when comparing two samples. To get the final score, you will want to compute the F1-score as given by:
# 
# $$score = 2* \frac{(precision * recall)}{(precision + recall)}$$
# 
# <a name="ex08"></a>
# ### Exercise 08
# 
# **Instructions**: Implement the `rouge1_similarity()` function.

# In[ ]:


# UNQ_C8
# GRADED FUNCTION

# for making a frequency table easily
from collections import Counter

def rouge1_similarity(system, reference):
    """Returns the ROUGE-1 score between two token lists

    Args:
        system (list of int): tokenized version of the system translation
        reference (list of int): tokenized version of the reference translation

    Returns:
        float: overlap between the two token lists
    """    
    
    ### START CODE HERE (REPLACE INSTANCES OF `None` WITH YOUR CODE) ###
    
    # make a frequency table of the system tokens (hint: use the Counter class)
    sys_counter = None
    
    # make a frequency table of the reference tokens (hint: use the Counter class)
    ref_counter = None
    
    # initialize overlap to 0
    overlap = None
    
    # run a for loop over the sys_counter object (can be treated as a dictionary)
    for token in sys_counter:
        
        # lookup the value of the token in the sys_counter dictionary (hint: use the get() method)
        token_count_sys = None
        
        # lookup the value of the token in the ref_counter dictionary (hint: use the get() method)
        token_count_ref = None
        
        # update the overlap by getting the smaller number between the two token counts above
        overlap += None
    
    # get the precision (i.e. number of overlapping tokens / number of system tokens)
    precision = None
    
    # get the recall (i.e. number of overlapping tokens / number of reference tokens)
    recall = None
    
    if precision + recall != 0:
        # compute the f1-score
        rouge1_score = None
    else:
        rouge1_score = 0 
    ### END CODE HERE ###
    
    return rouge1_score
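

# A possible completion of `rouge1_similarity()` (a sketch, not the graded solution),
# using Counter to build the frequency tables described in the comments above.

# In[ ]:


def rouge1_similarity_sketch(system, reference):
    # unigram frequency tables for each token list
    sys_counter = Counter(system)
    ref_counter = Counter(reference)

    # clipped count of overlapping unigrams
    overlap = 0
    for token in sys_counter:
        token_count_sys = sys_counter.get(token, 0)
        token_count_ref = ref_counter.get(token, 0)
        overlap += min(token_count_sys, token_count_ref)

    # precision: overlap relative to the system tokens
    precision = overlap / len(system)

    # recall: overlap relative to the reference tokens
    recall = overlap / len(reference)

    # F1 score, guarding against a zero denominator
    if precision + recall != 0:
        return 2 * (precision * recall) / (precision + recall)
    return 0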
    


# In[ ]:


# notice that this produces a different value from the jaccard similarity earlier
rouge1_similarity([1, 2, 3], [1, 2, 3, 4])


# In[ ]:


# BEGIN UNIT TEST
w1_unittest.test_rouge1_similarity(rouge1_similarity)
# END UNIT TEST


# ### 4.2.3 Overall score
# 
# We will now build a function to generate the overall score for a particular sample. As mentioned earlier, we need to compare each sample with all other samples. For instance, if we generated 30 sentences, we will need to compare sentence 1 to sentences 2 to 30. Then, we compare sentence 2 to sentences 1 and 3 to 30, and so forth. At each step, we get the average score of all comparisons to get the overall score for a particular sample. To illustrate, these will be the steps to generate the scores of a 4-sample list.
# 
# 1. Get similarity score between sample 1 and sample 2
# 2. Get similarity score between sample 1 and sample 3
# 3. Get similarity score between sample 1 and sample 4
# 4. Get average score of the first 3 steps. This will be the overall score of sample 1.
# 5. Iterate and repeat until samples 1 to 4 have overall scores.
# 
# We will be storing the results in a dictionary for easy lookups.
# 
# <a name="ex09"></a>
# ### Exercise 09
# 
# **Instructions**: Implement the `average_overlap()` function.

# In[ ]:


# UNQ_C9
# GRADED FUNCTION
def average_overlap(similarity_fn, samples, *ignore_params):
    """Returns the arithmetic mean of each candidate sentence in the samples

    Args:
        similarity_fn (function): similarity function used to compute the overlap
        samples (list of lists): tokenized version of the translated sentences
        *ignore_params: additional parameters will be ignored

    Returns:
        dict: scores of each sample
            key: index of the sample
            value: score of the sample
    """  
    
    # initialize dictionary
    scores = {}
    
    # run a for loop for each sample
    for index_candidate, candidate in enumerate(samples):    
        
        ### START CODE HERE (REPLACE INSTANCES OF `None` WITH YOUR CODE) ###
        
        # initialize overlap to 0.0
        overlap = None
        
        # run a for loop for each sample
        for index_sample, sample in enumerate(samples): 

            # skip if the candidate index is the same as the sample index
            if index_candidate == index_sample:
                continue
                
            # get the overlap between candidate and sample using the similarity function
            sample_overlap = None
            
            # add the sample overlap to the total overlap
            overlap += None
            
        # get the score for the candidate by computing the average
        score = None
        
        # save the score in the dictionary. use index as the key.
        scores[index_candidate] = None
        
        ### END CODE HERE ###
    return scores
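

# A possible completion of `average_overlap()` (a sketch, not the graded solution).

# In[ ]:


def average_overlap_sketch(similarity_fn, samples, *ignore_params):
    scores = {}
    for index_candidate, candidate in enumerate(samples):
        overlap = 0.0
        for index_sample, sample in enumerate(samples):
            # don't compare a candidate with itself
            if index_candidate == index_sample:
                continue
            # accumulate the similarity with every other sample
            overlap += similarity_fn(candidate, sample)
        # average over the other len(samples) - 1 samples
        scores[index_candidate] = overlap / (len(samples) - 1)
    return scores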


# In[ ]:


average_overlap(jaccard_similarity, [[1, 2, 3], [1, 2, 4], [1, 2, 4, 5]], [0.4, 0.2, 0.5])


# In[ ]:


# BEGIN UNIT TEST
w1_unittest.test_average_overlap(average_overlap)
# END UNIT TEST


# In practice, it is also common to see the weighted mean being used to calculate the overall score instead of just the arithmetic mean. We have implemented it below and you can use it in your experiments to see which one gives better results.

# In[ ]:


def weighted_avg_overlap(similarity_fn, samples, log_probs):
    """Returns the weighted mean of each candidate sentence in the samples

    Args:
        similarity_fn (function): similarity function used to compute the overlap between a pair of samples
        samples (list of lists): tokenized version of the translated sentences
        log_probs (list of float): log probability of the translated sentences

    Returns:
        dict: scores of each sample
            key: index of the sample
            value: score of the sample
    """
    
    # initialize dictionary
    scores = {}
    
    # run a for loop for each sample
    for index_candidate, candidate in enumerate(samples):    
        
        # initialize overlap and weighted sum
        overlap, weight_sum = 0.0, 0.0
        
        # run a for loop for each sample
        for index_sample, (sample, logp) in enumerate(zip(samples, log_probs)):

            # skip if the candidate index is the same as the sample index            
            if index_candidate == index_sample:
                continue
                
            # convert log probability to linear scale
            sample_p = float(np.exp(logp))

            # update the weighted sum
            weight_sum += sample_p

            # get the unigram overlap between candidate and sample
            sample_overlap = similarity_fn(candidate, sample)
            
            # update the overlap
            overlap += sample_p * sample_overlap
            
        # get the score for the candidate
        score = overlap / weight_sum
        
        # save the score in the dictionary. use index as the key.
        scores[index_candidate] = score
    
    return scores


# In[ ]:


weighted_avg_overlap(jaccard_similarity, [[1, 2, 3], [1, 2, 4], [1, 2, 4, 5]], [0.4, 0.2, 0.5])


# ### 4.2.4 Putting it all together
# 
# We will now put everything together and develop the `mbr_decode()` function. Please use the helper functions you just developed to complete this. You will want to generate samples, get the score for each sample, get the highest score among all samples, then detokenize this sample to get the translated sentence.
# 
# <a name="ex10"></a>
# ### Exercise 10
# 
# **Instructions**: Implement the `mbr_decode()` function.

# In[ ]:


# UNQ_C10
# GRADED FUNCTION
def mbr_decode(sentence, n_samples, score_fn, similarity_fn, NMTAttn=None, temperature=0.6, vocab_file=None, vocab_dir=None):
    """Returns the translated sentence using Minimum Bayes Risk decoding

    Args:
        sentence (str): sentence to translate.
        n_samples (int): number of samples to generate
        score_fn (function): function that generates the score for each sample
        similarity_fn (function): function used to compute the overlap between a pair of samples
        NMTAttn (tl.Serial): An LSTM sequence-to-sequence model with attention.
        temperature (float): parameter for sampling ranging from 0.0 to 1.0.
            0.0: same as argmax, always pick the most probable token
            1.0: sampling from the distribution (can sometimes say random things)
        vocab_file (str): filename of the vocabulary
        vocab_dir (str): path to the vocabulary file

    Returns:
        str: the translated sentence
    """
    
    ### START CODE HERE (REPLACE INSTANCES OF `None` WITH YOUR CODE) ###
    # generate samples
    samples, log_probs = None
    
    # use the scoring function to get a dictionary of scores
    # pass in the relevant parameters as shown in the function definition of 
    # the mean methods you developed earlier
    scores = None
    
    # find the key with the highest score
    max_index = None
    
    # detokenize the token list associated with the max_index
    translated_sentence = None
    
    ### END CODE HERE ###
    return (translated_sentence, max_index, scores)
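

# A possible completion of `mbr_decode()` (a sketch, not the graded solution). It reuses
# generate_samples() from above and assumes the detokenize() helper from the earlier
# parts of this notebook.

# In[ ]:


def mbr_decode_sketch(sentence, n_samples, score_fn, similarity_fn, NMTAttn=None,
                      temperature=0.6, vocab_file=None, vocab_dir=None):
    # generate candidate translations and their log probabilities
    samples, log_probs = generate_samples(sentence, n_samples, NMTAttn, temperature,
                                          vocab_file=vocab_file, vocab_dir=vocab_dir)

    # score every candidate against the others
    scores = score_fn(similarity_fn, samples, log_probs)

    # the index of the candidate with the highest score
    max_index = max(scores, key=scores.get)

    # turn the winning token list back into a sentence
    translated_sentence = detokenize(samples[max_index],
                                     vocab_file=vocab_file, vocab_dir=vocab_dir)

    return translated_sentence, max_index, scores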


# In[ ]:


TEMPERATURE = 1.0

# put a custom string here
your_sentence = 'She speaks English and German.'


# In[ ]:


mbr_decode(your_sentence, 4, weighted_avg_overlap, jaccard_similarity, model, TEMPERATURE, vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR)[0]


# In[ ]:


mbr_decode('Congratulations!', 4, average_overlap, rouge1_similarity, model, TEMPERATURE, vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR)[0]


# In[ ]:


mbr_decode('You have completed the assignment!', 4, average_overlap, rouge1_similarity, model, TEMPERATURE, vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR)[0]


# **This unit test takes a while to run. Please be patient.**

# In[ ]:


# BEGIN UNIT TEST
w1_unittest.test_mbr_decode(mbr_decode, model)
# END UNIT TEST


# #### Congratulations! Next week, you'll dive deeper into attention models and study the Transformer architecture. You will build another network but without the recurrent part. It will show that attention is all you need! It should be fun!


Neural Machine Translation: Training the Model

Training Our Model

In the previous post we defined our model for machine translation. In this post we'll train the model on our data.

Doing supervised training in Trax is pretty straightforward (short example here). We will be instantiating three classes for this: TrainTask, EvalTask, and Loop. Let's take a closer look at each of these in the sections below.

Imports

# python
from collections import namedtuple
from contextlib import redirect_stdout
from functools import partial
from pathlib import Path

import sys

# pypi
from holoviews import opts
from trax import layers, optimizers
from trax.supervised import lr_schedules, training

import holoviews
import hvplot.pandas
import pandas

# this project
from neurotic.nlp.machine_translation import DataGenerator, NMTAttn

# related
from graeae import EmbedHoloviews, Timer

Set Up

train_batch_stream = DataGenerator().batch_generator
eval_batch_stream = DataGenerator(training=False).batch_generator
SLUG = "neural-machine-translation-training-the-model"
Embed = partial(EmbedHoloviews, folder_path=f"files/posts/nlp/{SLUG}")

Plot = namedtuple("Plot", ["width", "height", "fontscale", "tan", "blue", "red"])
PLOT = Plot(
    width=900,
    height=750,
    fontscale=2,
    tan="#ddb377",
    blue="#4687b7",
    red="#ce7b6d",
 )
TIMER = Timer()

Training

TrainTask

The TrainTask class allows us to define the labeled data to use for training and the feedback mechanisms to compute the loss and update the weights.

train_task = training.TrainTask(

    # use the train batch stream as labeled data
    labeled_data = train_batch_stream,

    # use the cross entropy loss
    loss_layer = layers.WeightedCategoryCrossEntropy(),

    # use the Adam optimizer with learning rate of 0.01
    optimizer = optimizers.Adam(0.01),

    # use the `trax.lr.warmup_and_rsqrt_decay` as the learning rate schedule
    # have 1000 warmup steps with a max value of 0.01
    lr_schedule = lr_schedules.warmup_and_rsqrt_decay(1000, 0.01),

    # have a checkpoint every 10 steps
    n_steps_per_checkpoint= 10,
)
def test_train_task(train_task):
    target = train_task
    success = 0
    fails = 0

    # Test the labeled data parameter
    try:
        strlabel = str(target._labeled_data)
        assert(strlabel.find("generator") and strlabel.find('add_loss_weights'))
        success += 1
    except:
        fails += 1
        print("Wrong labeled data parameter")

    # Test the cross entropy loss data parameter
    try:
        strlabel = str(target._loss_layer)
        assert(strlabel == "CrossEntropyLoss_in3")
        success += 1
    except:
        fails += 1
        print("Wrong loss functions. CrossEntropyLoss_in3 was expected")

     # Test the optimizer parameter
    try:
        assert(isinstance(target.optimizer, trax.optimizers.adam.Adam))
        success += 1
    except:
        fails += 1
        print("Wrong optimizer")

    # Test the schedule parameter
    try:
        assert(isinstance(target._lr_schedule,trax.supervised.lr_schedules._BodyAndTail))
        success += 1
    except:
        fails += 1
        print("Wrong learning rate schedule type")

    # Test the _n_steps_per_checkpoint parameter
    try:
        assert(target._n_steps_per_checkpoint==10)
        success += 1
    except:
        fails += 1
        print("Wrong checkpoint step frequency")

    if fails == 0:
        print("\033[92m All tests passed")
    else:
        print('\033[92m', success," Tests passed")
        print('\033[91m', fails, " Tests failed")
    return
test_train_task(train_task)
Wrong loss functions. CrossEntropyLoss_in3 was expected
Wrong optimizer
Wrong learning rate schedule type
[92m 2  Tests passed
[91m 3  Tests failed

The code has changed a bit since the test was written so it won't pass without updates.

EvalTask

The EvalTask on the other hand allows us to see how the model is doing while training. For our application, we want it to report the cross entropy loss and accuracy.

eval_task = training.EvalTask(

    ## use the eval batch stream as labeled data
    labeled_data=eval_batch_stream,

    ## use the cross entropy loss and accuracy as metrics
    metrics=[layers.WeightedCategoryCrossEntropy(), layers.Accuracy()],
)

Loop

The Loop class defines the model we will train as well as the train and eval tasks to execute. Its run() method allows us to execute the training for a specified number of steps.

output_dir = Path("~/models/machine_translation/").expanduser()

Define the training loop.

training_loop = training.Loop(NMTAttn(mode='train'),
                              train_task,
                              eval_tasks=[eval_task],
                              output_dir=output_dir)
train_steps = 1000

with TIMER, \
     open("/tmp/machine_translation_training.log", "w") as temp_file, \
     redirect_stdout(temp_file):
            training_loop.run(train_steps)
Started: 2021-03-09 18:31:58.844878
Ended: 2021-03-09 20:14:43.090358
Elapsed: 1:42:44.245480
frame = pandas.DataFrame(
    training_loop.history.get("eval", "metrics/WeightedCategoryCrossEntropy"),
    columns="Batch CrossEntropy".split())

minimum = frame.loc[frame.CrossEntropy.idxmin()]
vline = holoviews.VLine(minimum.Batch).opts(opts.VLine(color=PLOT.red))
hline = holoviews.HLine(minimum.CrossEntropy).opts(opts.HLine(color=PLOT.red))
line = frame.hvplot(x="Batch", y="CrossEntropy").opts(opts.Curve(color=PLOT.blue))

plot = (line * hline * vline).opts(
    width=PLOT.width, height=PLOT.height,
    title="Evaluation Batch Cross Entropy Loss",
                                   )
output = Embed(plot=plot, file_name="evaluation_cross_entropy")()
print(output)

Figure Missing

frame = pandas.DataFrame(
    training_loop.history.get("eval", "metrics/Accuracy"),
    columns="Batch Accuracy".split())

best = frame.loc[frame.Accuracy.idxmax()]
vline = holoviews.VLine(best.Batch).opts(opts.VLine(color=PLOT.red))
hline = holoviews.HLine(best.Accuracy).opts(opts.HLine(color=PLOT.red))
line = frame.hvplot(x="Batch", y="Accuracy").opts(opts.Curve(color=PLOT.blue))

plot = (line * hline * vline).opts(
    width=PLOT.width, height=PLOT.height,
    title="Evaluation Batch Accuracy",
                                   )
output = Embed(plot=plot, file_name="evaluation_accuracy")()
print(output)

Figure Missing

It seems to be stuck…

End

Now that we've trained the model in the next post we'll test our model to see how well it does. The overview post with links to all the posts in this series is here.

Raw

Neural Machine Translation: The Attention Model

<<imports>>

<<attention-model>>

Defining the Model

In the previous post we made some helper functions to prepare inputs for some of the layers in the model. In this post we'll define the model itself.

Attention Overview

The model we will be building uses an encoder-decoder architecture. This Recurrent Neural Network (RNN) takes in a tokenized version of a sentence in its encoder, then passes it on to the decoder for translation. Just using a regular sequence-to-sequence model with LSTMs will work effectively for short to medium sentences but will start to degrade for longer ones. You can picture it like the figure below, where all of the context of the input sentence is compressed into one vector that is passed into the decoder block. You can see how this becomes an issue for very long sentences (e.g. 100 tokens or more) because the context of the first parts of the input will have very little effect on the final vector passed to the decoder.

Adding an attention layer to this model avoids this problem by giving the decoder access to all parts of the input sentence. To illustrate, let's just use a 4-word input sentence as shown below. Remember that a hidden state is produced at each timestep of the encoder (represented by the orange rectangles). These are all passed to the attention layer and each is given a score based on the current activation (i.e. hidden state) of the decoder. For instance, let's consider the figure below where the first prediction "Wie" has already been made. To produce the next prediction, the attention layer will first receive all the encoder hidden states (i.e. orange rectangles) as well as the decoder hidden state when producing the word "Wie" (i.e. first green rectangle). Given this information, it will score each of the encoder hidden states to determine which one the decoder should focus on to produce the next word. Through training, the model might have learned that it should align to the second encoder hidden state and subsequently assign a high probability to the word "geht". If we are using greedy decoding, we will output that word as the next symbol, then restart the process to produce the next word until we reach an end-of-sentence prediction.

There are different ways to implement attention and the one we'll use is the Scaled Dot Product Attention which has the form:

\[ Attention(Q, K, V) = softmax \left(\frac{QK^T}{\sqrt{d_k}} \right)V \]

You can think of it as computing scores using queries (Q) and keys (K), followed by a multiplication of values (V) to get a context vector at a particular timestep of the decoder. This context vector is fed to the decoder RNN to get a set of probabilities for the next predicted word. The division by square root of the keys dimensionality (\(\sqrt{d_k}\)) is for improving model performance and you'll also learn more about it next week. For our machine translation application, the encoder activations (i.e. encoder hidden states) will be the keys and values, while the decoder activations (i.e. decoder hidden states) will be the queries.
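
To make the formula concrete, here is a small, self-contained numpy sketch of scaled dot product attention (toy shapes only; this illustrates the equation, not the Trax layer used later):

import numpy

def scaled_dot_product_attention(queries: numpy.ndarray,
                                 keys: numpy.ndarray,
                                 values: numpy.ndarray) -> numpy.ndarray:
    """queries: (m, d_k), keys: (n, d_k), values: (n, d_v) -> context: (m, d_v)"""
    scores = queries @ keys.T / numpy.sqrt(queries.shape[-1])
    weights = numpy.exp(scores)
    weights = weights / weights.sum(axis=-1, keepdims=True)  # softmax over the keys
    return weights @ values

queries = numpy.random.randn(2, 4)        # e.g. decoder hidden states
keys = values = numpy.random.randn(3, 4)  # e.g. encoder hidden states
print(scaled_dot_product_attention(queries, keys, values).shape)  # (2, 4)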

You will see in the upcoming sections that this complex architecture and mechanism can be implemented with just a few lines of code.

Imports

# pypi
from trax import layers

import trax

# this project
from neurotic.nlp.machine_translation import (
    NMTAttn)

Implementation

Overview

We are now ready to implement our sequence-to-sequence model with attention. This will be a Serial network and is illustrated in the diagram below. It shows the layers you'll be using in Trax, and you'll see that each step can be implemented quite easily with one-line commands. We've placed several links to the documentation for each relevant layer in the discussion after the figure below.

  • Step 0: Prepare the input encoder and pre-attention decoder branches. We've already defined this earlier as helper functions so it's just a matter of calling those functions and assigning it to variables.
  • Step 1: Create a Serial network. This will stack the layers in the next steps one after the other. As before, we'll use tl.Serial.
  • Step 2: Make a copy of the input and target tokens. As you see in the diagram above, the input and target tokens will be fed into different layers of the model. We'll use tl.Select layer to create copies of these tokens, arranging them as [input tokens, target tokens, input tokens, target tokens].
  • Step 3: Create a parallel branch to feed the input tokens to the input_encoder and the target tokens to the pre_attention_decoder. We'll use tl.Parallel to create these sublayers in parallel, remembering to pass the variables defined in Step 0 as parameters to this layer.
  • Step 4: Next, call the `prepare_attention_input` function to convert the encoder and pre-attention decoder activations to a format that the attention layer will accept. You can use tl.Fn to call this function. Note: Pass the prepare_attention_input function as the f parameter in tl.Fn without any arguments or parentheses.
  • Step 5: We will now feed the (queries, keys, values, and mask) to the tl.AttentionQKV layer. This computes the scaled dot-product attention and outputs the attention weights and mask. Take note that although it is a one-liner, this layer is actually composed of a deep network made up of several branches. The implementation is shown here (from the Trax source on GitHub) so you can see the different layers used.
def AttentionQKV(d_feature, n_heads=1, dropout=0.0, mode='train'):
  """Returns a layer that maps (q, k, v, mask) to (activations, mask).

  See `Attention` above for further context/details.

  Args:
    d_feature: Depth/dimensionality of feature embedding.
    n_heads: Number of attention heads.
    dropout: Probabilistic rate for internal dropout applied to attention
        activations (based on query-key pairs) before dotting them with values.
    mode: Either 'train' or 'eval'.
  """
  return cb.Serial(
      cb.Parallel(
          core.Dense(d_feature),
          core.Dense(d_feature),
          core.Dense(d_feature),
      ),
      PureAttention(  # pylint: disable=no-value-for-parameter
          n_heads=n_heads, dropout=dropout, mode=mode),
      core.Dense(d_feature),
  )

Having deep layers poses the risk of vanishing gradients during training and we would want to mitigate that. To improve the ability of the network to learn, we can insert a tl.Residual layer to add the output of AttentionQKV with the queries input. You can do this in trax by simply nesting the AttentionQKV layer inside the Residual layer. The library will take care of branching and adding for you.

  • Step 6: We will not need the mask for the model we're building so we can safely drop it. At this point in the network, the signal stack currently has [attention activations, mask, target tokens] and you can use tl.Select to output just [attention activations, target tokens].
  • Step 7: We can now feed the attention weighted output to the LSTM decoder. We can stack multiple tl.LSTM layers to improve the output so remember to append LSTMs equal to the number defined by n_decoder_layers parameter to the model.
  • Step 8: We want to determine the probabilities of each subword in the vocabulary and you can set this up easily with a tl.Dense layer by making its size equal to the size of our vocabulary.
  • Step 9: Normalize the output to log probabilities by passing the activations in Step 8 to a tl.LogSoftmax layer.

The Implementation

# pypi
from trax import layers

# this project
from .help_me import input_encoder as input_encoder_fn
from .help_me import pre_attention_decoder as pre_attention_decoder_fn
from .help_me import prepare_attention_input as prepare_attention_input_fn
def NMTAttn(input_vocab_size: int=33300,
            target_vocab_size: int=33300,
            d_model: int=1024,
            n_encoder_layers: int=2,
            n_decoder_layers: int=2,
            n_attention_heads: int=4,
            attention_dropout: float=0.0,
            mode: str='train') -> layers.Serial:
    """Returns an LSTM sequence-to-sequence model with attention.

    The input to the model is a pair (input tokens, target tokens), e.g.,
    an English sentence (tokenized) and its translation into German (tokenized).

    Args:
    input_vocab_size: int: vocab size of the input
    target_vocab_size: int: vocab size of the target
    d_model: int:  depth of embedding (n_units in the LSTM cell)
    n_encoder_layers: int: number of LSTM layers in the encoder
    n_decoder_layers: int: number of LSTM layers in the decoder after attention
    n_attention_heads: int: number of attention heads
    attention_dropout: float, dropout for the attention layer
    mode: str: 'train', 'eval' or 'predict', predict mode is for fast inference

    Returns:
    A LSTM sequence-to-sequence model with attention.
    """
    # Step 0: call the helper function to create layers for the input encoder
    input_encoder = input_encoder_fn(input_vocab_size, d_model, n_encoder_layers)

    # Step 0: call the helper function to create layers for the pre-attention decoder
    pre_attention_decoder = pre_attention_decoder_fn(mode, target_vocab_size, d_model)

    # Step 1: create a serial network
    model = layers.Serial( 

      # Step 2: copy input tokens and target tokens as they will be needed later.
      layers.Select([0, 1, 0, 1]),

      # Step 3: run input encoder on the input and pre-attention decoder on the target.
      layers.Parallel(input_encoder, pre_attention_decoder),

      # Step 4: prepare queries, keys, values and mask for attention.
      layers.Fn('PrepareAttentionInput', prepare_attention_input_fn, n_out=4),

      # Step 5: run the AttentionQKV layer
      # nest it inside a Residual layer to add to the pre-attention decoder activations(i.e. queries)
      layers.Residual(layers.AttentionQKV(d_model,
                                          n_heads=n_attention_heads,
                                          dropout=attention_dropout, mode=mode)),

      # Step 6: drop the attention mask, keeping only the activations and target tokens
      layers.Select([0, 2]),

      # Step 7: run the rest of the RNN decoder
      [layers.LSTM(d_model) for _ in range(n_decoder_layers)],

      # Step 8: prepare output by making it the right size
      layers.Dense(target_vocab_size),

      # Step 9: Log-softmax for output
      layers.LogSoftmax()
    )
    return model
def test_NMTAttn(NMTAttn):
    test_cases = [
                {
                    "name":"simple_test_check",
                    "expected":"Serial_in2_out2[\n  Select[0,1,0,1]_in2_out4\n  Parallel_in2_out2[\n    Serial[\n      Embedding_33300_1024\n      LSTM_1024\n      LSTM_1024\n    ]\n    Serial[\n      ShiftRight(1)\n      Embedding_33300_1024\n      LSTM_1024\n    ]\n  ]\n  PrepareAttentionInput_in3_out4\n  Serial_in4_out2[\n    Branch_in4_out3[\n      None\n      Serial_in4_out2[\n        Parallel_in3_out3[\n          Dense_1024\n          Dense_1024\n          Dense_1024\n        ]\n        PureAttention_in4_out2\n        Dense_1024\n      ]\n    ]\n    Add_in2\n  ]\n  Select[0,2]_in3_out2\n  LSTM_1024\n  LSTM_1024\n  Dense_33300\n  LogSoftmax\n]",
                    "error":"The NMTAttn is not defined properly."
                },
                {
                    "name":"layer_len_check",
                    "expected":9,
                    "error":"We found {} layers in your model. It should be 9.\nCheck the LSTM stack before the dense layer"
                },
                {
                    "name":"selection_layer_check",
                    "expected":["Select[0,1,0,1]_in2_out4", "Select[0,2]_in3_out2"],
                    "error":"Look at your selection layers."
                }
            ]

    success = 0
    fails = 0

    for test_case in test_cases:
        try:
            if test_case['name'] == "simple_test_check":
                assert test_case["expected"] == str(NMTAttn())
                success += 1
            if test_case['name'] == "layer_len_check":
                if test_case["expected"] == len(NMTAttn().sublayers):
                    success += 1
                else:
                    print(test_case["error"].format(len(NMTAttn().sublayers))) 
                    fails += 1
            if test_case['name'] == "selection_layer_check":
                model = NMTAttn()
                output = [str(model.sublayers[0]),str(model.sublayers[4])]
                check_count = 0
                for i in range(2):
                    if test_case["expected"][i] != output[i]:
                        print(test_case["error"])
                        fails += 1
                        break
                    else:
                        check_count += 1
                if check_count == 2:
                    success += 1
        except:
            print(test_case['error'])
            fails += 1

    if fails == 0:
        print("\033[92m All tests passed")
    else:
        print('\033[92m', success," Tests passed")
        print('\033[91m', fails, " Tests failed")
    return test_cases
test_cases = test_NMTAttn(NMTAttn)
The NMTAttn is not defined properly.
[92m 2  Tests passed
[91m 1  Tests failed
model = NMTAttn()
print(model)
Serial_in2_out2[
  Select[0,1,0,1]_in2_out4
  Parallel_in2_out2[
    Serial[
      Embedding_33300_1024
      LSTM_1024
      LSTM_1024
    ]
    Serial[
      Serial[
        ShiftRight(1)
      ]
      Embedding_33300_1024
      LSTM_1024
    ]
  ]
  PrepareAttentionInput_in3_out4
  Serial_in4_out2[
    Branch_in4_out3[
      None
      Serial_in4_out2[
        _in4_out4
        Serial_in4_out2[
          Parallel_in3_out3[
            Dense_1024
            Dense_1024
            Dense_1024
          ]
          PureAttention_in4_out2
          Dense_1024
        ]
        _in2_out2
      ]
    ]
    Add_in2
  ]
  Select[0,2]_in3_out2
  LSTM_1024
  LSTM_1024
  Dense_33300
  LogSoftmax
]

End

Now that we have the model defined, in the next post we'll train the model. The overview post with links to all the posts in this series is here.