15  Autoencoder

Autoencoders (AE) can be used for unsupervised learning. The idea is similar to data compression: The first part of the network (the encoder) compresses the data into a low-dimensional space (e.g. 2-4 dimensions) and the second part (the decoder) learns to reconstruct the original data from this compressed representation (think of an hourglass).

Why is this useful? The method is similar to a dimension reduction technique (e.g. PCA), but with the advantage that autoencoders can learn nonlinear mappings and we don't have to make distributional assumptions. For instance, we could first train an autoencoder on genomic expression data with thousands of features, compress them into 2-4 dimensions, and then use them for clustering.
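For comparison, here is a minimal PCA sketch; X is a placeholder for any numeric observations \(\times\) features matrix (e.g. expression data). The autoencoder's latent space plays the same role as the first principal components, but is learned by a nonlinear network:

# Minimal PCA sketch (X is a hypothetical observations x features matrix).
pca = prcomp(X, rank. = 2L)  # linear projection onto the first 2 components
embedding = pca$x            # n x 2 scores, analogous to the AE's latent space
plot(embedding[,1], embedding[,2])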

15.1 Autoencoder - Deep Neural Network MNIST

We will now write an autoencoder for the MNIST data set.

Let’s start with the (usual) MNIST example:

library(keras)
library(tensorflow)

data = keras::dataset_mnist()

We don’t need the labels here, our images will be the inputs and at the same time the outputs of our final autoencoder.

rotate = function(x){ t(apply(x, 2, rev)) }  # rotate a matrix so image() plots it upright

imgPlot = function(img, title = ""){
  col = grey.colors(255)
  if(title != ""){ main = paste0("Label: ", as.character(title)) }
  else{ main = "" }
  image(rotate(img), col = col, xlab = "", ylab = "", axes = FALSE, main = main)
}

train = data[[1]]
test = data[[2]]
train_x = array(train[[1]]/255, c(dim(train[[1]])[1], 784L))
test_x = array(test[[1]]/255, c(dim(test[[1]])[1], 784L))
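As a quick sanity check, we can plot the first training image together with its label, using the imgPlot helper from above:

imgPlot(train[[1]][1,,], title = train[[2]][1])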

Our encoder: image (784 dimensions) \(\rightarrow\) 2 dimensions

down_size_model = keras_model_sequential()
down_size_model %>% 
  layer_dense(units = 100L, input_shape = c(784L), activation = "relu") %>% 
  layer_dense(units = 20L, activation = "relu") %>% 
  layer_dense(units = 2L, activation = "linear")

Our decoder: 2 dimensions \(\rightarrow\) 784 dimensions (our image). The sigmoid activation in the last layer keeps the predicted pixel values in [0, 1], matching the scaled inputs.

up_size_model = keras_model_sequential()
up_size_model %>% 
  layer_dense(units = 20L, input_shape = c(2L), activation = "relu") %>% 
  layer_dense(units = 100L, activation = "relu") %>% 
  layer_dense(units = 784L, activation = "sigmoid")

We can use the functional (non-sequential) model type, keras_model, to connect the two models. (We did the same in the transfer learning chapter.)

autoencoder = keras_model(inputs = down_size_model$input, 
                          outputs = up_size_model(down_size_model$output))
autoencoder$compile(loss = loss_binary_crossentropy,
                    optimizer = optimizer_adamax(0.01))
summary(autoencoder)

Model:
__________________________________________________________________________________________
 Layer (type)                           Output Shape                        Param #       
==========================================================================================
 dense_2_input (InputLayer)             [(None, 784)]                       0             
 dense_2 (Dense)                        (None, 100)                         78500         
 dense_1 (Dense)                        (None, 20)                          2020          
 dense (Dense)                          (None, 2)                           42            
 sequential_1 (Sequential)              (None, 784)                         81344         
==========================================================================================
Total params: 161,906
Trainable params: 161,906
Non-trainable params: 0
__________________________________________________________________________________________    
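As a quick check, the parameter counts in the summary follow from inputs \(\times\) units + biases for each dense layer:

# Parameter counts of the three encoder layers (weights + biases):
784 * 100 + 100  # 78500
100 * 20 + 20    # 2020
20 * 2 + 2       # 42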
    

To see why training is necessary, we pass an image through the still untrained autoencoder and compare the reconstruction with the original.

reconstruction = autoencoder(train_x[1,,drop = FALSE])  # don't call this "image", that would mask the base plotting function
imgPlot(array(train_x[1,,drop = FALSE], c(28, 28)), title = "Before")
imgPlot(array(reconstruction$numpy(), c(28, 28)), title = "After")

Fit the autoencoder (inputs == outputs!):

library(tensorflow)
library(keras)
set_random_seed(123L, disable_gpu = FALSE)  # Already sets R's random seed.

autoencoder %>% 
  fit(x = train_x, y = train_x, epochs = 5L, batch_size = 128L)
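After training, we can repeat the before/after comparison from above; the reconstruction should now resemble the input:

reconstruction = autoencoder(train_x[1,,drop = FALSE])  # prediction of the trained network
imgPlot(array(train_x[1,,drop = FALSE], c(28, 28)), title = "Before")
imgPlot(array(reconstruction$numpy(), c(28, 28)), title = "After")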

Visualization of the latent variables:

pred_dim = down_size_model(test_x)       # 2-D latent representation of the test images
reconstr_pred = up_size_model(pred_dim)  # reconstructed images
imgPlot(array(reconstr_pred[10,]$numpy(), dim = c(28L, 28L)), title = "")

ownColors = c("limegreen", "purple", "yellow", "grey", "orange",
              "black", "red", "navy", "sienna", "springgreen")
oldpar = par(mfrow = c(1, 1))
plot(pred_dim$numpy()[,1], pred_dim$numpy()[,2], col = ownColors[test[[2]]+1L])

The plot above shows the 2-dimensional latent representation of the MNIST test images; the colors indicate which digit each point depicts.
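To connect this back to the motivation above (e.g. clustering genomic data), here is a minimal sketch that clusters the 2-dimensional embedding with k-means and compares the clusters with the true digit labels:

# Cluster the 2-D embedding; 10 centers for the 10 digits (sketch).
embedding = pred_dim$numpy()
clusters = kmeans(embedding, centers = 10L)
table(clusters$cluster, test[[2]])  # clusters vs. true digits

Any standard clustering method could be used here; the point is that it operates on 2 learned features instead of 784 raw pixels.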

15.2 Autoencoder - MNIST Convolutional Neural Networks

We can also use convolutional layers instead of (or in addition to) dense layers. First, prepare the data:

data = tf$keras$datasets$mnist$load_data()
train = data[[1]]
train_x = array(train[[1]]/255, c(dim(train[[1]]), 1L))
test_x = array(data[[2]][[1]]/255, c(dim(data[[2]][[1]]), 1L))

Then define the downsize model:

down_size_model = keras_model_sequential()
down_size_model %>% 
  layer_conv_2d(filters = 16L, activation = "relu", kernel_size = c(3L, 3L),
                input_shape = c(28L, 28L, 1L), padding = "same") %>% 
  layer_max_pooling_2d(padding = "same") %>%  # 28x28 -> 14x14
  layer_conv_2d(filters = 8L, activation = "relu", kernel_size = c(3L, 3L), padding = "same") %>% 
  layer_max_pooling_2d(padding = "same") %>%  # 14x14 -> 7x7
  layer_conv_2d(filters = 8L, activation = "relu", kernel_size = c(3L, 3L), padding = "same") %>% 
  layer_max_pooling_2d(padding = "same") %>%  # 7x7 -> 4x4
  layer_flatten() %>% 
  layer_dense(units = 2L, activation = "linear")

Define the upsize model:

up_size_model = keras_model_sequential()
up_size_model %>% 
  layer_dense(units = 128L, activation = "relu", input_shape = c(2L)) %>% 
  layer_reshape(target_shape = c(4L, 4L, 8L)) %>% 
  layer_conv_2d(filters = 8L, activation = "relu", kernel_size = c(3L, 3L), padding = "same") %>% 
  layer_upsampling_2d() %>%  # 4x4 -> 8x8
  layer_conv_2d(filters = 8L, activation = "relu", kernel_size = c(3L, 3L), padding = "same") %>% 
  layer_upsampling_2d() %>%  # 8x8 -> 16x16
  layer_conv_2d(filters = 16L, activation = "relu", kernel_size = c(3L, 3L)) %>%  # "valid" padding: 16x16 -> 14x14
  layer_upsampling_2d() %>%  # 14x14 -> 28x28
  layer_conv_2d(filters = 1L, activation = "sigmoid", kernel_size = c(3L, 3L), padding = "same")

Combine the two models and fit the autoencoder:

library(tensorflow)
library(keras)
set_random_seed(321L, disable_gpu = FALSE)  # Already sets R's random seed.

autoencoder = tf$keras$models$Model(inputs = down_size_model$input,
                                    outputs = up_size_model(down_size_model$output))

autoencoder %>% compile(loss = loss_binary_crossentropy,
                        optimizer = optimizer_rmsprop(0.001))

autoencoder %>% fit(x = tf$constant(train_x), y = tf$constant(train_x),
                    epochs = 50L, batch_size = 64L)

Test it:

pred_dim = down_size_model(tf$constant(test_x, "float32"))
reconstr_pred = autoencoder(tf$constant(test_x, "float32"))
imgPlot(reconstr_pred[10,,,]$numpy()[,,1])


ownColors = c("limegreen", "purple", "yellow", "grey", "orange",
              "black", "red", "navy", "sienna", "springgreen")
plot(pred_dim[,1]$numpy(), pred_dim[,2]$numpy(), col = ownColors[test[[2]]+1L])


## Generate new images!
new = matrix(c(10, 10), 1, 2)  # an arbitrary point in the 2-D latent space
imgPlot(array(up_size_model(new)$numpy(), c(28L, 28L)))


new = matrix(c(5, 5), 1, 2)
imgPlot(array(up_size_model(new)$numpy(), c(28L, 28L)))

Note that arbitrary points in the latent space are not guaranteed to decode into meaningful digits, because a plain autoencoder does not regularize its latent space. This is exactly what the variational autoencoder in the next section addresses.

15.3 Variational Autoencoder (VAE)

The difference between a variational and a normal autoencoder is that a variational autoencoder assumes a distribution for the latent variables (latent variables cannot be observed directly and are inferred from the observed variables) and the parameters of this distribution are learned. New objects can then be generated by feeding valid (!) seeds (with regard to the assumed distribution) to the decoder. To achieve the property that more or less randomly chosen points in the low-dimensional latent space are meaningful and yield suitable results after decoding, the latent space and the training process must be regularized: the input to the VAE is encoded to a distribution in the latent space rather than to a single point.
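Formally, the VAE minimizes a reconstruction term plus a regularization term, the Kullback-Leibler (KL) divergence between the encoded distribution \(q(z|x)\) and the assumed prior \(p(z)\):

\[ \mathcal{L} = \mathbb{E}_{q(z|x)}\left[-\log p(x|z)\right] + D_{KL}\left(q(z|x) \,\|\, p(z)\right) \]

This is exactly what the code below implements: the binary cross-entropy is the reconstruction term, and the KLDivergenceRegularizer adds the KL term.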

For building variational autoencoders, we will use TensorFlow Probability, but first, we need to prepare the data again.

library(tfprobability)

data = tf$keras$datasets$mnist$load_data()
train = data[[1]]
train_x = array(train[[1]]/255, c(dim(train[[1]]), 1L))

We will use TensorFlow Probability to define priors for our latent variables.

library(tfprobability)

tfp = reticulate::import("tensorflow_probability")

Build the two networks:

encoded = 2L  # dimension of the latent space
prior = tfd_independent(tfd_normal(c(0.0, 0.0), 1.0), 1L)  # standard normal prior over the 2-D latent space

up_size_model = keras_model_sequential()
up_size_model %>% 
  layer_dense(units = 128L, activation = "relu", input_shape = c(2L)) %>% 
  layer_reshape(target_shape = c(4L, 4L, 8L)) %>% 
  layer_conv_2d(filters = 8L, activation = "relu", kernel_size = c(3L,3L), padding = "same") %>% 
  layer_upsampling_2d() %>% 
  layer_conv_2d(filters = 8L, activation = "relu", kernel_size = c(3L,3L), padding = "same") %>% 
  layer_upsampling_2d() %>% 
  layer_conv_2d(filters = 16L, activation = "relu", kernel_size = c(3L,3L)) %>% 
  layer_upsampling_2d() %>% 
  layer_conv_2d(filters = 1, activation = "sigmoid", kernel_size = c(3L,3L), padding = "same")

down_size_model = keras_model_sequential()
down_size_model %>% 
  layer_conv_2d(filters = 16L, activation = "relu", kernel_size = c(3L, 3L),
                input_shape = c(28L, 28L, 1L), padding = "same") %>% 
  layer_max_pooling_2d(padding = "same") %>% 
  layer_conv_2d(filters = 8L, activation = "relu", kernel_size = c(3L, 3L), padding = "same") %>% 
  layer_max_pooling_2d(padding = "same") %>% 
  layer_conv_2d(filters = 8L, activation = "relu", kernel_size = c(3L, 3L), padding = "same") %>% 
  layer_max_pooling_2d(padding = "same") %>% 
  layer_flatten() %>% 
  layer_dense(units = 4L, activation = "linear") %>%  # 4 parameters = 2 means + 2 standard deviations
  layer_independent_normal(2L,
                           activity_regularizer =
                             tfp$layers$KLDivergenceRegularizer(distribution_b = prior))

VAE = keras_model(inputs = down_size_model$inputs,
                  outputs = up_size_model(down_size_model$outputs))

Compile and fit model:

library(tensorflow)
library(keras)
set_random_seed(321L, disable_gpu = FALSE)  # Already sets R's random seed.

loss_binary = function(true, pred){
  # loss_binary_crossentropy averages over the pixels; multiplying by 28*28
  # rescales it to a sum, so the reconstruction term is not dominated by the KL regularizer.
  return(loss_binary_crossentropy(true, pred) * 28.0 * 28.0)
}
VAE %>% compile(loss = loss_binary, optimizer = optimizer_adamax())

VAE %>% fit(train_x, train_x, epochs = 50L)

And show that it works:

dist = down_size_model(train_x[1:2000,,,,drop = FALSE])  # one latent distribution per image
images = up_size_model(dist$sample()[1:5,])              # decode sampled latent vectors

ownColors = c("limegreen", "purple", "yellow", "grey", "orange",
              "black", "red", "navy", "sienna", "springgreen")
oldpar = par(mfrow = c(1, 1))
imgPlot(images[1,,,1]$numpy())

plot(dist$mean()$numpy()[,1], dist$mean()$numpy()[,2], col = ownColors[train[[2]]+1L])

15.4 Exercise

Question

Read Section 15.3 on variational autoencoders and try to transfer the example from MNIST to our flower data set (i.e. from black-and-white to color images).

Split the data:

library(keras)
library(tensorflow)
library(tfprobability)
set_random_seed(321L, disable_gpu = FALSE)  # Already sets R's random seed.

data = EcoData::dataset_flower()
test = data$test/255
train = data$train/255
rm(data)

Build the variational autoencoder:

encoded = 10L
prior = tfp$distributions$Independent(
  tfp$distributions$Normal(loc=tf$zeros(encoded), scale = 1.),
  reinterpreted_batch_ndims = 1L
)

down_size_model = tf$keras$models$Sequential(list(
  tf$keras$layers$InputLayer(input_shape = c(80L, 80L, 3L)),
  tf$keras$layers$Conv2D(filters = 32L, activation = tf$nn$leaky_relu,
                         kernel_size = 5L, strides = 1L),
  tf$keras$layers$Conv2D(filters = 32L, activation = tf$nn$leaky_relu,
                         kernel_size = 5L, strides = 2L),
  tf$keras$layers$Conv2D(filters = 64L, activation = tf$nn$leaky_relu,
                         kernel_size = 5L, strides = 1L),
  tf$keras$layers$Conv2D(filters = 64L, activation = tf$nn$leaky_relu,
                         kernel_size = 5L, strides = 2L),
  tf$keras$layers$Conv2D(filters = 128L, activation = tf$nn$leaky_relu,
                         kernel_size = 7L, strides = 1L),
  tf$keras$layers$Flatten(),
  tf$keras$layers$Dense(units = tfp$layers$MultivariateNormalTriL$params_size(encoded),
                        activation = NULL),
  tfp$layers$MultivariateNormalTriL(
    encoded, 
    activity_regularizer = tfp$layers$KLDivergenceRegularizer(prior, weight = 0.0002)
  )
))
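A d-dimensional MultivariateNormalTriL output needs \(d\) means plus \(d(d+1)/2\) entries of the lower-triangular scale matrix. For \(d = 10\) that is 65 parameters, which is why the dense layer in front of it uses params_size and which we can check directly:

# Parameters required by a 10-dimensional MultivariateNormalTriL head:
tfp$layers$MultivariateNormalTriL$params_size(10L)  # 10 + 10*11/2 = 65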

up_size_model = tf$keras$models$Sequential(list(
  tf$keras$layers$InputLayer(input_shape = encoded),
  tf$keras$layers$Dense(units = 8192L, activation = "relu"),
  tf$keras$layers$Reshape(target_shape =  c(8L, 8L, 128L)),
  tf$keras$layers$Conv2DTranspose(filters = 128L, kernel_size = 7L,
                                  activation = tf$nn$leaky_relu, strides = 1L,
                                  use_bias = FALSE),
  tf$keras$layers$Conv2DTranspose(filters = 64L, kernel_size = 5L,
                                  activation = tf$nn$leaky_relu, strides = 2L,
                                  use_bias = FALSE),
  tf$keras$layers$Conv2DTranspose(filters = 64L, kernel_size = 5L,
                                  activation = tf$nn$leaky_relu, strides = 1L,
                                  use_bias = FALSE),
  tf$keras$layers$Conv2DTranspose(filters = 32L, kernel_size = 5L,
                                  activation = tf$nn$leaky_relu, strides = 2L,
                                  use_bias = FALSE),
  tf$keras$layers$Conv2DTranspose(filters = 32L, kernel_size = 5L,
                                  activation = tf$nn$leaky_relu, strides = 1L,
                                  use_bias = FALSE),
  tf$keras$layers$Conv2DTranspose(filters = 3L, kernel_size = c(4L, 4L),
                                  activation = "sigmoid", strides = c(1L, 1L),
                                  use_bias = FALSE)
))

VAE = tf$keras$models$Model(inputs = down_size_model$inputs, 
                            outputs = up_size_model(down_size_model$outputs))

summary(VAE)
Model: "model_3"
__________________________________________________________________________________________
 Layer (type)                           Output Shape                        Param #       
==========================================================================================
 input_1 (InputLayer)                   [(None, 80, 80, 3)]                 0             
 conv2d_14 (Conv2D)                     (None, 76, 76, 32)                  2432          
 conv2d_15 (Conv2D)                     (None, 36, 36, 32)                  25632         
 conv2d_16 (Conv2D)                     (None, 32, 32, 64)                  51264         
 conv2d_17 (Conv2D)                     (None, 14, 14, 64)                  102464        
 conv2d_18 (Conv2D)                     (None, 8, 8, 128)                   401536        
 flatten_2 (Flatten)                    (None, 8192)                        0             
 dense_10 (Dense)                       (None, 65)                          532545        
 multivariate_normal_tri_l (Multivariat  ((None, 10),                       0             
 eNormalTriL)                            (None, 10))                                      
 sequential_7 (Sequential)              (None, 80, 80, 3)                   1278464       
==========================================================================================
Total params: 2,394,337
Trainable params: 2,394,337
Non-trainable params: 0
__________________________________________________________________________________________    
    

Compile and train model:

be = function(true, pred){
  # As for MNIST: scale the pixel-averaged binary cross-entropy up to a sum over all 80*80 pixels.
  return(tf$losses$binary_crossentropy(true, pred) * 80.0 * 80.0)
}

VAE$compile(loss = be,
            optimizer = tf$keras$optimizers$Adamax(learning_rate = 0.0003))
VAE$fit(x = train, y = train, epochs = 50L, shuffle = TRUE, batch_size = 20L)

dist = down_size_model(train[1:10,,,])       # latent distributions of the first 10 images
images = up_size_model(dist$sample()[1:5,])  # decode sampled latent vectors

oldpar = par(mfrow = c(3, 1), mar = rep(1, 4))
for(i in 1:3){
  # Stretch each reconstructed image to [0, 1] and plot it as a raster.
  scales::rescale(images[i,,,]$numpy(), to = c(0, 1)) %>%
    as.raster() %>%
    plot()
}
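As with MNIST, we can also generate completely new flowers, now by sampling valid seeds from the prior (a minimal sketch, reusing the prior and up_size_model defined above):

# Generate new flowers: sample latent vectors from the prior and decode them.
z = prior$sample(3L)
new_flowers = up_size_model(z)
plot(as.raster(new_flowers[1,,,]$numpy()))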