16 Generative Adversarial Networks (GANs)
The idea of a generative adversarial network (GAN) is that two neural networks compete against each other in a “game”. One network creates data and tries to “trick” the other network into deciding that the generated data is real. The generator (similar to the decoder in autoencoders) creates new images from noise. The discriminator receives a mix of true images (from the data set) and artificially generated images from the generator. The loss of the generator rises when the discriminator identifies fakes as fakes (a simple binary cross entropy loss with 0/1 labels). The loss of the discriminator rises when fakes (class 0) are identified as real images or real images (class 1) are identified as fakes, again with binary cross entropy.
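Formally, this “game” can be written as a minimax objective (Goodfellow et al., 2014), where the discriminator \(D\) maximizes the value function and the generator \(G\) minimizes it: \[\underset{G}{\mathrm{min}}~\underset{D}{\mathrm{max}}~V(D, G) = \mathbb{E}_{x \sim p_{\mathrm{data}}}\left[\mathrm{log}~D(x)\right] + \mathbb{E}_{z \sim p_{z}}\left[\mathrm{log}\left(1 - D(G(z))\right)\right]\] The binary cross entropy losses used below are exactly the two terms of this objective, split between the two networks.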
Binary cross entropy: Entropy or Shannon entropy (named after Claude Shannon) \(\mathbf{H}\) (an uppercase “eta”) in the context of information theory is the expected value of the information content, i.e. the mean/average information content of an “event” over all possible outcomes. Encountering an event with low probability holds more information than encountering an event with high probability.
Cross entropy is a measure of the similarity of two (discrete) probability distributions \(A~(\mathrm{true~distribution})\) and \(B~(\mathrm{predicted~distribution})\) according to their inherent information.
It is not (!) symmetric, in general: \(\textbf{H}_{A}(B) \neq \textbf{H}_{B}(A)\). The minimum value depends on the distribution of \(A\) and is the entropy of \(A\): \[\mathrm{min}~\textbf{H}_{A}(B) = \underset{B}{\mathrm{min}}~\textbf{H}_{A}(B) = \textbf{H}_{A}(B = A) = \textbf{H}_{A}(A) = \textbf{H}(A)\]
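A quick numerical check of these properties, a small base R sketch with two made-up Bernoulli distributions:

# Two made-up discrete (Bernoulli) distributions:
A = c(0.8, 0.2)  # "true" distribution
B = c(0.5, 0.5)  # "predicted" distribution

entropy = function(p) -sum(p * log(p))          # H(A)
cross_entropy = function(p, q) -sum(p * log(q)) # H_A(B)

entropy(A)           # ~0.50, the lower bound
cross_entropy(A, B)  # ~0.69, >= H(A); equals H(A) only for B = A
cross_entropy(B, A)  # ~0.92, != H_A(B): cross entropy is not symmetric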
The setup:
- Outcomes \(y_{i} \in \{0, 1\}\) (labels).
- Predictions \(\hat{y}_{i} \in[0, 1]\) (probabilities).
The binary cross entropy or log loss of a system of outcomes/predictions is then defined as follows: \[ \textbf{H}_{A}(B) = -\frac{1}{N} \sum_{i = 1}^{N} \left[ y_{i} \cdot \mathrm{log} \left( p(y_{i}) \right) + (1 - y_{i}) \cdot \mathrm{log} \left( 1 - p(y_{i}) \right) \right] = -\frac{1}{N} \sum_{i = 1}^{N} \left[ y_{i} \cdot \mathrm{log} (\hat{y}_{i}) + (1 - y_{i}) \cdot \mathrm{log} \left( 1 - \hat{y}_{i} \right) \right] \] High predicted probabilities for data originally labeled 1 yield a low loss, as does predicting a low probability for data originally labeled 0. Mind the properties of probabilities and the logarithm.
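To make the formula concrete, here is a small sketch with made-up labels and predicted probabilities, computing the log loss by hand and with the Keras loss object that is used later in this chapter:

library(tensorflow)

y = c(1, 1, 0, 0)             # true labels
y_hat = c(0.9, 0.6, 0.2, 0.4) # predicted probabilities

# By hand, following the formula above:
-mean(y * log(y_hat) + (1 - y) * log(1 - y_hat))  # ~0.34

# The same with Keras:
bce = tf$keras$losses$BinaryCrossentropy()
bce(tf$constant(y, "float32"), tf$constant(y_hat, "float32"))$numpy()  # ~0.34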
A possible application of generative adversarial networks is to create pictures that look like real photographs, e.g. https://thispersondoesnotexist.com/. Visit that site (several times)! However, the applications of generative adversarial networks today go far beyond the creation of such images. For example, generative adversarial networks can also be used to “augment” data, i.e. to create new training data and thereby improve the fitted model.
16.1 MNIST - Generative Adversarial Networks Based on Deep Neural Networks
We will now explore this on the MNIST data set.
library(keras)
library(tensorflow)
set_random_seed(321L, disable_gpu = FALSE) # Already sets R's random seed.
rotate = function(x){ t(apply(x, 2, rev)) }
imgPlot = function(img, title = ""){
  col = grey.colors(255)
  image(rotate(img), col = col, xlab = "", ylab = "", axes = FALSE,
        main = paste0("Label: ", as.character(title)))
}
We don’t need the test set here.
data = dataset_mnist()
train = data$train
# Scale the pixel values to [-1, 1] to match the generator's tanh output.
train_x = array((train$x - 127.5)/127.5, c(dim(train$x)[1], 784L))
We create a TensorFlow dataset that provides batches of real images for the discriminator.
batch_size = 32L
dataset = tf$data$Dataset$from_tensor_slices(tf$constant(train_x, "float32"))
dataset = dataset$batch(batch_size)
Create a function that returns the generator model:
get_generator = function(){
  generator = keras_model_sequential()
  generator %>%
    layer_dense(units = 200L, input_shape = c(100L)) %>%
    layer_activation_leaky_relu() %>%
    layer_dense(units = 200L) %>%
    layer_activation_leaky_relu() %>%
    layer_dense(units = 784L, activation = "tanh")
  return(generator)
}
Test the generator:
generator = get_generator()
sample = tf$random$normal(c(1L, 100L))
imgPlot(array(generator(sample)$numpy(), c(28L, 28L)))
In the generator, noise (a random vector with 100 values) is passed through the network such that the output corresponds to the number of pixels of one MNIST image (784). The discriminator does the reverse: it takes an image (784 pixels) and returns the probability that it is real. We therefore define the discriminator function now.
get_discriminator = function(){
  discriminator = keras_model_sequential()
  discriminator %>%
    layer_dense(units = 200L, input_shape = c(784L)) %>%
    layer_activation_leaky_relu() %>%
    layer_dense(units = 100L) %>%
    layer_activation_leaky_relu() %>%
    layer_dense(units = 1L, activation = "sigmoid")
  return(discriminator)
}
And we also test the discriminator function.
discriminator = get_discriminator()
discriminator(generator(tf$random$normal(c(1L, 100L))))
tf.Tensor([[0.5089391]], shape=(1, 1), dtype=float32)
We also have to define the loss functions for both networks. We use the already known binary cross entropy. However, we have to encode the real and predicted values for the two networks individually.
The discriminator will get two losses - one for identifying fake images as fake, and one for identifying real MNIST images as real images.
The generator will just get one loss - was it able to deceive the discriminator?
# The discriminator ends in a sigmoid, so it outputs probabilities, not logits.
ce = tf$keras$losses$BinaryCrossentropy(from_logits = FALSE)

loss_discriminator = function(real, fake){
  real_loss = ce(tf$ones_like(real), real)
  fake_loss = ce(tf$zeros_like(fake), fake)
  return(real_loss + fake_loss)
}

loss_generator = function(fake){
  return(ce(tf$ones_like(fake), fake))
}
Each network will get its own optimizer (in a GAN the networks are treated independently):
gen_opt = tf$keras$optimizers$RMSprop(1e-4)
disc_opt = tf$keras$optimizers$RMSprop(1e-4)
We have to write our own training loop here (we cannot use the fit function). In each iteration (for each batch) we will do the following (the GradientTape records computations to do automatic differentiation):
- Sample noise.
- Generator creates images from the noise.
- Discriminator makes predictions for fake images and real images (response is a probability between [0,1]).
- Calculate loss for generator.
- Calculate loss for discriminator.
- Calculate gradients for weights and the loss.
- Update weights of generator.
- Update weights of discriminator.
- Return losses.
generator = get_generator()
discriminator = get_discriminator()

train_step = function(images){
  noise = tf$random$normal(c(128L, 100L))
  with(tf$GradientTape(persistent = TRUE) %as% tape, {
    gen_images = generator(noise)
    fake_output = discriminator(gen_images)
    real_output = discriminator(images)
    gen_loss = loss_generator(fake_output)
    disc_loss = loss_discriminator(real_output, fake_output)
  })
  gen_grads = tape$gradient(gen_loss, generator$weights)
  disc_grads = tape$gradient(disc_loss, discriminator$weights)
  rm(tape)
  gen_opt$apply_gradients(purrr::transpose(list(gen_grads, generator$weights)))
  disc_opt$apply_gradients(purrr::transpose(list(disc_grads, discriminator$weights)))
  return(c(gen_loss, disc_loss))
}
train_step = tf$`function`(reticulate::py_func(train_step))
Now we can finally train our networks in a training loop:
- Create networks.
- Get batch of images.
- Run train_step function.
- Print losses.
- Repeat step 2-4 for number of epochs.
batch_size = 128L
epochs = 20L
steps = as.integer(nrow(train_x)/batch_size)
counter = 1
gen_loss = c()
disc_loss = c()
dataset2 = dataset$prefetch(tf$data$AUTOTUNE)
for(e in 1:epochs){
  # The dataset was already batched above, so we only iterate over it here.
  dat = reticulate::as_iterator(dataset2)

  coro::loop(
    for(images in dat){
      losses = train_step(images)
      gen_loss = c(gen_loss, tf$reduce_sum(losses[[1]])$numpy())
      disc_loss = c(disc_loss, tf$reduce_sum(losses[[2]])$numpy())
    }
  )

  if(e %% 5 == 0){ # Print losses every 5 epochs.
    cat("Gen: ", mean(gen_loss), " Disc: ", mean(disc_loss), " \n")
  }
  noise = tf$random$normal(c(1L, 100L))
  if(e %% 10 == 0){ # Plot a generated image every 10 epochs.
    imgPlot(array(generator(noise)$numpy(), c(28L, 28L)), "Gen")
  }
}
Gen: 0.8095555 Disc: 1.10533
Gen: 0.8928918 Disc: 1.287504
Gen: 0.9071119 Disc: 1.314586
Gen: 0.9514963 Disc: 1.31548
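After training, we can sample as many new digits as we like from the generator. A small sketch (reusing the trained generator and the imgPlot helper from above) that draws nine noise vectors and plots the generated digits in a grid:

# Draw 9 noise vectors and plot the generated digits in a 3x3 grid.
noise = tf$random$normal(c(9L, 100L))
imgs = generator(noise)$numpy()
oldpar = par(mfrow = c(3, 3), mar = c(1, 1, 2, 1))
for(i in 1:9) imgPlot(array(imgs[i,], c(28L, 28L)), "Gen")
par(oldpar)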
16.2 Flower - GAN
We can now also do the same for the flower data set. We will write this completely on our own, following the steps we used for the MNIST data set.
library(keras)
library(tidyverse)
library(tensorflow)
library(EcoData)
data = EcoData::dataset_flower()
train = (data$train - 127.5)/127.5
test = (data$test - 127.5)/127.5
train_x = abind::abind(list(train, test), along = 1L)
dataset = tf$data$Dataset$from_tensor_slices(tf$constant(train_x, "float32"))
Define the generator model and test it:
get_generator = function(){
  generator = keras_model_sequential()
  generator %>%
    layer_dense(units = 20L*20L*128L, input_shape = c(100L),
use_bias = FALSE) %>%
layer_activation_leaky_relu() %>%
layer_reshape(c(20L, 20L, 128L)) %>%
layer_dropout(0.3) %>%
layer_conv_2d_transpose(filters = 256L, kernel_size = c(3L, 3L),
padding = "same", strides = c(1L, 1L),
use_bias = FALSE) %>%
layer_activation_leaky_relu() %>%
layer_dropout(0.3) %>%
layer_conv_2d_transpose(filters = 128L, kernel_size = c(5L, 5L),
padding = "same", strides = c(1L, 1L),
use_bias = FALSE) %>%
layer_activation_leaky_relu() %>%
layer_dropout(0.3) %>%
layer_conv_2d_transpose(filters = 64L, kernel_size = c(5L, 5L),
padding = "same", strides = c(2L, 2L),
use_bias = FALSE) %>%
layer_activation_leaky_relu() %>%
layer_dropout(0.3) %>%
layer_conv_2d_transpose(filters = 3L, kernel_size = c(5L, 5L),
padding = "same", strides = c(2L, 2L),
activation = "tanh", use_bias = FALSE)
return(generator)
}
generator = get_generator()
image = generator(tf$random$normal(c(1L, 100L)))$numpy()[1,,,]
image = scales::rescale(image, to = c(0, 255))
image %>%
  image_to_array() %>%
  `/`(., 255) %>%
  as.raster() %>%
  plot()
Define the discriminator and test it:
get_discriminator = function(){
  discriminator = keras_model_sequential()
  discriminator %>%
    layer_conv_2d(filters = 64L, kernel_size = c(5L, 5L),
strides = c(2L, 2L), padding = "same",
input_shape = c(80L, 80L, 3L)) %>%
layer_activation_leaky_relu() %>%
layer_dropout(0.3) %>%
layer_conv_2d(filters = 128L, kernel_size = c(5L, 5L),
strides = c(2L, 2L), padding = "same") %>%
layer_activation_leaky_relu() %>%
layer_dropout(0.3) %>%
layer_conv_2d(filters = 256L, kernel_size = c(3L, 3L),
strides = c(2L, 2L), padding = "same") %>%
layer_activation_leaky_relu() %>%
layer_dropout(0.3) %>%
layer_flatten() %>%
layer_dense(units = 1L, activation = "sigmoid")
return(discriminator)
}
discriminator = get_discriminator()
discriminator
discriminator(generator(tf$random$normal(c(1L, 100L))))
Model: "sequential_13"
__________________________________________________________________________________________
Layer (type) Output Shape Param #
==========================================================================================
conv2d_21 (Conv2D) (None, 40, 40, 64) 4864
leaky_re_lu_14 (LeakyReLU) (None, 40, 40, 64) 0
dropout_6 (Dropout) (None, 40, 40, 64) 0
conv2d_20 (Conv2D) (None, 20, 20, 128) 204928
leaky_re_lu_13 (LeakyReLU) (None, 20, 20, 128) 0
dropout_5 (Dropout) (None, 20, 20, 128) 0
conv2d_19 (Conv2D) (None, 10, 10, 256) 295168
leaky_re_lu_12 (LeakyReLU) (None, 10, 10, 256) 0
dropout_4 (Dropout) (None, 10, 10, 256) 0
flatten_3 (Flatten) (None, 25600) 0
dense_25 (Dense) (None, 1) 25601
==========================================================================================
Total params: 530,561
Trainable params: 530,561
Non-trainable params: 0
__________________________________________________________________________________________
tf.Tensor([[0.49996078]], shape=(1, 1), dtype=float32)
Loss functions (this time with label smoothing, which softens the hard 0/1 targets and can stabilize discriminator training):
ce = tf$keras$losses$BinaryCrossentropy(from_logits = FALSE,
                                        label_smoothing = 0.1)

loss_discriminator = function(real, fake){
  real_loss = ce(tf$ones_like(real), real)
  fake_loss = ce(tf$zeros_like(fake), fake)
  return(real_loss + fake_loss)
}

loss_generator = function(fake){
  return(ce(tf$ones_like(fake), fake))
}
Define the optimizers:
gen_opt = tf$keras$optimizers$RMSprop(1e-4)
disc_opt = tf$keras$optimizers$RMSprop(1e-4)
Instantiate the networks and define the training step:
generator = get_generator()
discriminator = get_discriminator()

train_step = function(images){
  noise = tf$random$normal(c(32L, 100L))

  with(tf$GradientTape(persistent = TRUE) %as% tape, {
    gen_images = generator(noise)

    real_output = discriminator(images)
    fake_output = discriminator(gen_images)

    gen_loss = loss_generator(fake_output)
    disc_loss = loss_discriminator(real_output, fake_output)
  })

  gen_grads = tape$gradient(gen_loss, generator$weights)
  disc_grads = tape$gradient(disc_loss, discriminator$weights)
  rm(tape)

  gen_opt$apply_gradients(purrr::transpose(list(gen_grads, generator$weights)))
  disc_opt$apply_gradients(purrr::transpose(list(disc_grads, discriminator$weights)))

  return(c(gen_loss, disc_loss))
}
train_step = tf$`function`(reticulate::py_func(train_step))
Do the training:
batch_size = 32L
epochs = 30L
steps = as.integer(dim(train_x)[1]/batch_size)
counter = 1
gen_loss = c()
disc_loss = c()
dataset = dataset$prefetch(tf$data$AUTOTUNE)
for(e in 1:epochs){
  dat = reticulate::as_iterator(dataset$batch(batch_size))

  coro::loop(
    for(images in dat){
      losses = train_step(images)
      gen_loss = c(gen_loss, tf$reduce_sum(losses[[1]])$numpy())
      disc_loss = c(disc_loss, tf$reduce_sum(losses[[2]])$numpy())
    }
  )

  noise = tf$random$normal(c(1L, 100L))
  image = generator(noise)$numpy()[1,,,]
  image = scales::rescale(image, to = c(0, 255))
  if(e %% 15 == 0){
    image %>%
      image_to_array() %>%
      `/`(., 255) %>%
      as.raster() %>%
      plot()
  }
  if(e %% 10 == 0) cat("Gen: ", mean(gen_loss), " Disc: ", mean(disc_loss), " \n")
}
Gen: 1.651127 Disc: 0.8720699
Gen: 1.303061 Disc: 1.037192
Gen: 1.168868 Disc: 1.100166
noise = tf$random$normal(c(1L, 100L))
image = generator(noise)$numpy()[1,,,]
image = scales::rescale(image, to = c(0, 255))
image %>%
  image_to_array() %>%
  `/`(., 255) %>%
  as.raster() %>%
  plot()
More images:
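A small sketch (reusing the trained generator and the plotting pipeline from above) that samples several noise vectors and plots the generated flowers in a grid:

# Sample 4 noise vectors and plot the generated flowers in a 2x2 grid.
oldpar = par(mfrow = c(2, 2), mar = c(1, 1, 1, 1))
for(i in 1:4){
  noise = tf$random$normal(c(1L, 100L))
  image = generator(noise)$numpy()[1,,,]
  image = scales::rescale(image, to = c(0, 1))  # as.raster expects [0, 1]
  plot(as.raster(image))
}
par(oldpar)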