Neural Style Transfer with TensorFlow Eager Execution

I’ll follow the Neural Style Transfer with Eager Execution tutorial, adding my own explanatory parts along the way to understand the code better. In this tutorial, VGG19 is used to extract features from images.

See my neural style transfer post for background on how neural style transfer works.

Import Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = (10,10)
mpl.rcParams['axes.grid'] = False
from PIL import Image

import time
import functools

import tensorflow as tf
import tensorflow.contrib.eager as tfe

from tensorflow.python.keras.preprocessing import image as kp_image
from tensorflow.python.keras import models
from tensorflow.python.keras import losses
from tensorflow.python.keras import layers
from tensorflow.python.keras import backend as K

Enable Eager Execution

tf.enable_eager_execution()
print("Eager execution: {}".format(tf.executing_eagerly()))
Eager execution: True

Set Path to Content and Style Image

content_path = 'content4.jpg'
style_path = 'style4.jpg'

Handling Images

img = Image.open(content_path)
img.size
(2560, 1600)
img = kp_image.img_to_array(img)
print(img.shape)
img
(1600, 2560, 3)

array([[[ 75.,  39.,   7.],
        [ 71.,  41.,   5.],
        [ 60.,  34.,   0.],
        ...,
        [175., 217., 255.],
        [175., 217., 255.],
        [175., 217., 255.]],

       [[ 81.,  47.,  12.],
        [ 79.,  51.,  14.],
        [ 69.,  45.,   7.],
        ...,
        [175., 217., 255.],
        [175., 217., 255.],
        [175., 217., 255.]],

       [[ 81.,  49.,  11.],
        [ 75.,  50.,  10.],
        [ 67.,  48.,   8.],
        ...,
        [175., 217., 255.],
        [175., 217., 255.],
        [174., 216., 254.]],

       ...,

       [[  3.,  39.,  55.],
        [  5.,  42.,  51.],
        [  2.,  36.,  46.],
        ...,
        [ 91., 116.,  35.],
        [ 75., 105.,  19.],
        [ 68., 100.,  17.]],

       [[  3.,  36.,  51.],
        [  1.,  34.,  49.],
        [  0.,  30.,  35.],
        ...,
        [ 84., 113.,  29.],
        [ 58.,  94.,   4.],
        [ 44.,  83.,   2.]],

       [[  5.,  32.,  49.],
        [  1.,  29.,  50.],
        [  0.,  31.,  33.],
        ...,
        [ 76., 120.,  31.],
        [ 44.,  95.,   3.],
        [ 22.,  77.,   0.]]], dtype=float32)
img = np.expand_dims(img, axis=0)
img.shape
(1, 1600, 2560, 3)

Loading Image

def load_img(path_to_img):
    max_dim = 1024
    img = Image.open(path_to_img)
    long = max(img.size)
    scale = max_dim/long
    img = img.resize((round(img.size[0]*scale), round(img.size[1]*scale)), Image.ANTIALIAS)

    img = kp_image.img_to_array(img)

    # We need to broadcast the image array such that it has a batch dimension
    img = np.expand_dims(img, axis=0)
    return img

Visualizing Image

def imshow(img, title=None):
    # Remove the batch dimension
    out = np.squeeze(img, axis=0)
    # Normalize for display
    out = out.astype('uint8')
    if title is not None:
        plt.title(title)
    plt.imshow(out)

Loading and Showing Content and Style Image

plt.figure(figsize=(10,10))

content = load_img(content_path).astype('uint8')
style = load_img(style_path).astype('uint8')

plt.subplot(1, 2, 1)
imshow(content, 'Content Image')

plt.subplot(1, 2, 2)
imshow(style, 'Style Image')
plt.show()

(figure: the content image and the style image side by side)

Preprocessing the Image with VGG19’s preprocess_input

VGG networks are trained on images with each channel normalized by the mean values [103.939, 116.779, 123.68], with channels in BGR order.

img = load_img('content4.jpg')
img = tf.keras.applications.vgg19.preprocess_input(img)
img
array([[[[ -95.939     ,  -71.779     ,  -49.68      ],
         [ -94.939     ,  -69.779     ,  -51.68      ],
         [ -92.939     ,  -66.779     ,  -49.68      ],
         ...,
         [ 151.061     ,  100.221     ,   55.32      ],
         [ 151.061     ,   99.221     ,   53.32      ],
         [ 151.061     ,  100.221     ,   51.32      ]],

        [[ -99.939     ,  -73.779     ,  -58.68      ],
         [ -96.939     ,  -75.779     ,  -62.68      ],
         [ -93.939     ,  -81.779     ,  -68.68      ],
         ...,
         [ 151.061     ,   99.221     ,   54.32      ],
         [ 151.061     ,   99.221     ,   52.32      ],
         [ 150.061     ,   99.221     ,   50.32      ]],

        [[ -99.939     ,  -84.779     ,  -70.68      ],
         [ -99.939     ,  -85.779     ,  -73.68      ],
         [ -97.939     ,  -84.779     ,  -74.68      ],
         ...,
         [ 151.061     ,   99.221     ,   54.32      ],
         [ 150.061     ,   98.221     ,   52.32      ],
         [ 150.061     ,   99.221     ,   50.32      ]],

        ...,

        [[ -57.939003  ,  -83.779     , -120.68      ],
         [ -61.939003  ,  -81.779     , -120.68      ],
         [ -68.939     ,  -77.779     , -120.68      ],
         ...,
         [ -39.939003  ,   -5.7789993 ,  -48.68      ],
         [ -43.939003  ,    0.22100067,  -38.68      ],
         [ -40.939003  ,   -2.7789993 ,  -42.68      ]],

        [[ -52.939003  ,  -83.779     , -121.68      ],
         [ -61.939003  ,  -80.779     , -121.68      ],
         [ -69.939     ,  -81.779     , -121.68      ],
         ...,
         [ -46.939003  ,  -15.778999  ,  -61.68      ],
         [ -44.939003  ,    0.22100067,  -34.68      ],
         [ -64.939     ,   -0.7789993 ,  -41.68      ]],

        [[ -56.939003  ,  -82.779     , -121.68      ],
         [ -66.939     ,  -81.779     , -122.68      ],
         [ -66.939     ,  -82.779     , -122.68      ],
         ...,
         [ -51.939003  ,  -13.778999  ,  -54.68      ],
         [ -52.939003  ,    2.2210007 ,  -30.68      ],
         [ -94.939     ,  -19.779     ,  -66.68      ]]]], dtype=float32)
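
For intuition, here is a minimal sketch of what preprocess_input does under the hood, assuming a float32 RGB array (manual_vgg19_preprocess is a hypothetical helper; the real function also handles batches and other modes):

def manual_vgg19_preprocess(img_rgb):
    # img_rgb: float32 array of shape (..., 3) in RGB order
    x = img_rgb[..., ::-1].copy()  # reverse the channel axis: RGB -> BGR
    x[..., 0] -= 103.939           # subtract blue-channel mean
    x[..., 1] -= 116.779           # subtract green-channel mean
    x[..., 2] -= 123.68            # subtract red-channel mean
    return x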

Loading and Preprocessing Image

def load_and_process_img(path_to_img):
    img = load_img(path_to_img)
    img = tf.keras.applications.vgg19.preprocess_input(img)
    return img

np.squeeze, np.clip, [::-1]

array = np.array([[[911, -3],[77, 42]]])
print(array.shape)
array
(1, 2, 2)

array([[[911,  -3],
        [ 77,  42]]])
array = np.squeeze(array, 0)
print(array.shape)
array
(2, 2)

array([[911,  -3],
       [ 77,  42]])
array = np.clip(array, 0, 255)
array
array([[255,   0],
       [ 77,  42]])
array[:, ::-1]
array([[  0, 255],
       [ 42,  77]])
array[::-1, :]
array([[ 77,  42],
       [255,   0]])

Deprocessing Image

def deprocess_img(processed_img):
    x = processed_img.copy()
    if len(x.shape) == 4:
        x = np.squeeze(x, 0)
        assert len(x.shape) == 3, ("Input to deprocess image must be an image of "
                             "dimension [1, height, width, channel] or [height, width, channel]")
    if len(x.shape) != 3:
        raise ValueError("Invalid input to deprocessing image")

    # perform the inverse of the preprocessing step
    x[:, :, 0] += 103.939
    x[:, :, 1] += 116.779
    x[:, :, 2] += 123.68
    x = x[:, :, ::-1]

    x = np.clip(x, 0, 255).astype('uint8')
    return x
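
As a quick sanity check (a sketch using the content image from earlier), deprocess_img should undo load_and_process_img and give back a displayable uint8 array:

processed = load_and_process_img(content_path)
restored = deprocess_img(processed)
print(restored.shape, restored.dtype)  # e.g. (640, 1024, 3) uint8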

What does VGG19 Architecture Look Like?

vgg = tf.keras.applications.vgg19.VGG19(include_top=False, weights='imagenet')
vgg.summary()
_________________________________________________________________
Layer (type)                 Output Shape              Param #
=================================================================
input_1 (InputLayer)         (None, None, None, 3)     0
_________________________________________________________________
block1_conv1 (Conv2D)        (None, None, None, 64)    1792
_________________________________________________________________
block1_conv2 (Conv2D)        (None, None, None, 64)    36928
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, None, None, 64)    0
_________________________________________________________________
block2_conv1 (Conv2D)        (None, None, None, 128)   73856
_________________________________________________________________
block2_conv2 (Conv2D)        (None, None, None, 128)   147584
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, None, None, 128)   0
_________________________________________________________________
block3_conv1 (Conv2D)        (None, None, None, 256)   295168
_________________________________________________________________
block3_conv2 (Conv2D)        (None, None, None, 256)   590080
_________________________________________________________________
block3_conv3 (Conv2D)        (None, None, None, 256)   590080
_________________________________________________________________
block3_conv4 (Conv2D)        (None, None, None, 256)   590080
_________________________________________________________________
block3_pool (MaxPooling2D)   (None, None, None, 256)   0
_________________________________________________________________
block4_conv1 (Conv2D)        (None, None, None, 512)   1180160
_________________________________________________________________
block4_conv2 (Conv2D)        (None, None, None, 512)   2359808
_________________________________________________________________
block4_conv3 (Conv2D)        (None, None, None, 512)   2359808
_________________________________________________________________
block4_conv4 (Conv2D)        (None, None, None, 512)   2359808
_________________________________________________________________
block4_pool (MaxPooling2D)   (None, None, None, 512)   0
_________________________________________________________________
block5_conv1 (Conv2D)        (None, None, None, 512)   2359808
_________________________________________________________________
block5_conv2 (Conv2D)        (None, None, None, 512)   2359808
_________________________________________________________________
block5_conv3 (Conv2D)        (None, None, None, 512)   2359808
_________________________________________________________________
block5_conv4 (Conv2D)        (None, None, None, 512)   2359808
_________________________________________________________________
block5_pool (MaxPooling2D)   (None, None, None, 512)   0
=================================================================
Total params: 20,024,384
Trainable params: 20,024,384
Non-trainable params: 0
_________________________________________________________________
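
As a sanity check on the Param # column: a Conv2D layer has kernel_h * kernel_w * in_channels * out_channels weights plus out_channels biases.

print(3*3*3*64 + 64)    # 1792  -> block1_conv1 (3 input channels, 64 filters)
print(3*3*64*64 + 64)   # 36928 -> block1_conv2 (64 input channels, 64 filters)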

Select the layers in VGG19 that we will use for content and for style

# Content layer where we will pull our feature maps
content_layers = ['block5_conv2']

# Style layer we are interested in
style_layers = ['block1_conv1',
                'block2_conv1',
                'block3_conv1',
                'block4_conv1',
                'block5_conv1'
               ]

num_content_layers = len(content_layers)
num_style_layers = len(style_layers)

Getting input and output of a certain layer from a model

vgg.input
<DeferredTensor 'input_1' shape=(?, ?, ?, 3) dtype=float32>
vgg.get_layer('block5_conv4').output
<DeferredTensor 'None' shape=(?, ?, ?, 512) dtype=float32>

Get a model that takes an image as input and returns the outputs of the layers we defined above

def get_model():
    """ Creates our model with access to intermediate layers.

    This function will load the VGG19 model and access the intermediate layers.
    These layers will then be used to create a new model that will take input image
    and return the outputs from these intermediate layers from the VGG model.

    Returns:
    returns a keras model that takes image inputs and outputs the style and
      content intermediate layers.
    """
    # Load our model. We load pretrained VGG, trained on imagenet data
    vgg = tf.keras.applications.vgg19.VGG19(include_top=False, weights='imagenet')
    vgg.trainable = False
    # Get output layers corresponding to style and content layers
    style_outputs = [vgg.get_layer(name).output for name in style_layers]
    content_outputs = [vgg.get_layer(name).output for name in content_layers]
    model_outputs = style_outputs + content_outputs
    # Build model
    return models.Model(vgg.input, model_outputs)
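
As a quick check (a sketch), the returned model should expose six outputs: the five style layers followed by the single content layer, in the order model_outputs was built.

model = get_model()
print(len(model.outputs))  # 6 = 5 style outputs + 1 content output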

Content Loss for each layer

\[J_{content}^{[l]}(C,G) = \frac{1}{2}||a^{[l](C)}-a^{[l](G)}||^2\]
def get_content_loss(base_content, target):
    return tf.reduce_mean(tf.square(base_content - target))

Note that the code takes the mean of the squared differences rather than the ½-scaled sum in the formula above; the two differ only by a constant factor, which the content weight absorbs.

Style Loss for each layer

\[G_{kk'}^{[l]} = \sum_{i=1}^{n_H^{[l]}} \sum_{j=1}^{n_W^{[l]}} a_{ijk}^{[l]} a_{ijk'}^{[l]}\] \[J_{style}^{[l]}(S,G) = \frac{1}{(2 n_H^{[l]} n_W^{[l]} n_C^{[l]})^2} \|G^{[l](S)} - G^{[l](G)}\|_F^2\]

How to calculate gram matrix?

print(img.shape)
channels = int(img.shape[-1])
a = tf.reshape(img, [-1, channels])
print(a.shape)
gram = tf.matmul(a, a, transpose_a=True) # transpose first matrix before matmul
gram
(1, 640, 1024, 3)
(655360, 3)

<tf.Tensor: id=452, shape=(3, 3), dtype=float32, numpy=
array([[5.1160253e+09, 2.8473692e+09, 2.4522580e+09],
       [2.8473692e+09, 3.6207872e+09, 2.9435433e+09],
       [2.4522580e+09, 2.9435433e+09, 3.7259351e+09]], dtype=float32)>
def gram_matrix(input_tensor):
    # Flatten the spatial dimensions so each row is one position's channel vector
    channels = int(input_tensor.shape[-1])
    a = tf.reshape(input_tensor, [-1, channels])
    # The gram matrix is the channels-by-channels matrix of inner products a^T a
    gram = tf.matmul(a, a, transpose_a=True)
    return gram
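
To verify gram_matrix by hand, here is a tiny made-up 1×2×2×2 example:

t = tf.constant([[[[1., 2.], [3., 4.]],
                  [[5., 6.], [7., 8.]]]])
print(gram_matrix(t))
# After the reshape, the rows are (1,2), (3,4), (5,6), (7,8), so
# gram = [[1+9+25+49, 2+12+30+56], [2+12+30+56, 4+16+36+64]] = [[84, 100], [100, 120]]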

def get_style_loss(base_style, gram_target):
    """Expects two images of dimension h, w, c"""
    # height, width, num filters of each layer
    height, width, channels = base_style.get_shape().as_list()
    gram_style = gram_matrix(base_style)

    # The normalization factor from the formula is left commented out here;
    # dropping it only rescales the loss, which the style weight absorbs.
    return tf.reduce_mean(tf.square(gram_style - gram_target))  # / (4. * (channels ** 2) * (width * height) ** 2)

Getting Feature Representations of Content and Style

def get_feature_representations(model, content_path, style_path):
    """Helper function to compute our content and style feature representations.

    This function will simply load and preprocess both the content and style
    images from their path. Then it will feed them through the network to obtain
    the outputs of the intermediate layers.

    Arguments:
    model: The model that we are using.
    content_path: The path to the content image.
    style_path: The path to the style image

    Returns:
    returns the style features and the content features.
    """
    # Load our images in
    content_image = load_and_process_img(content_path)
    style_image = load_and_process_img(style_path)

    # batch compute content and style features
    style_outputs = model(style_image)
    content_outputs = model(content_image)

    # Get the style and content feature representations from our model
    style_features = [style_layer[0] for style_layer in style_outputs[:num_style_layers]]
    content_features = [content_layer[0] for content_layer in content_outputs[num_style_layers:]]
    return style_features, content_features
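
A quick usage sketch, assuming the paths defined earlier: we should get one feature map per style layer and one per content layer.

model = get_model()
style_features, content_features = get_feature_representations(model, content_path, style_path)
print(len(style_features), len(content_features))  # 5 1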

Computing Total Loss

\[J_{content}(C,G) = \sum_l \lambda^{[l]} J_{content}^{[l]}(C,G)\] \[J_{style}(S,G) = \sum_l \lambda^{[l]} J_{style}^{[l]}(S,G)\] \[J(G) = \alpha J_{content}(C,G) + \beta J_{style}(S,G)\]
def compute_loss(model, loss_weights, init_image, gram_style_features, content_features):
    """This function will compute the total loss.

    Arguments:
    model: The model that will give us access to the intermediate layers
    loss_weights: The weights of each loss function's contribution
      (style weight and content weight).
    init_image: Our initial base image. This image is what we are updating with
      our optimization process. We apply the gradients wrt the loss we are
      calculating to this image.
    gram_style_features: Precomputed gram matrices corresponding to the
      defined style layers of interest.
    content_features: Precomputed outputs from defined content layers of
      interest.

    Returns:
    returns the total loss, the style loss, and the content loss
    """
    style_weight, content_weight = loss_weights

    # Feed our init image through our model. This will give us the content and
    # style representations at our desired layers. Since we're using eager
    # our model is callable just like any other function!
    model_outputs = model(init_image)

    style_output_features = model_outputs[:num_style_layers]
    content_output_features = model_outputs[num_style_layers:]

    style_score = 0
    content_score = 0

    # Accumulate style losses from all layers
    # Here, we equally weight each contribution of each loss layer
    weight_per_style_layer = 1.0 / float(num_style_layers)
    for target_style, comb_style in zip(gram_style_features, style_output_features):
        style_score += weight_per_style_layer * get_style_loss(comb_style[0], target_style)

    # Accumulate content losses from all layers
    weight_per_content_layer = 1.0 / float(num_content_layers)
    for target_content, comb_content in zip(content_features, content_output_features):
        content_score += weight_per_content_layer * get_content_loss(comb_content[0], target_content)

    style_score *= style_weight
    content_score *= content_weight

    # Get total loss
    loss = style_score + content_score
    return loss, style_score, content_score

Using tf.GradientTape() to compute gradients

x_1 = tf.Variable(3.0)
x_2 = tf.Variable(7.0)
with tf.GradientTape() as tape:
    y = x_1**3 + (3 * (x_2**2))
dy_dx = tape.gradient(y, (x_1,x_2))
print(dy_dx)
(<tf.Tensor: id=512, shape=(), dtype=float32, numpy=27.0>, <tf.Tensor: id=492, shape=(), dtype=float32, numpy=42.000004>)
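
One detail worth noting (a minimal sketch): the tape automatically records operations on tf.Variables, but plain tensors must be watched explicitly with tape.watch. This is why init_image is created as a variable in run_style_transfer below.

x = tf.constant(3.0)
with tf.GradientTape() as tape:
    tape.watch(x)  # plain tensors must be watched explicitly
    y = x * x
print(tape.gradient(y, x))  # 6.0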

What is *args and **kwargs?

The *args syntax collects any extra positional arguments into a tuple. The **kwargs syntax collects any extra keyword arguments (those that don’t match a formal parameter) into a dictionary.

def foo(*args):
    for a in args:
        print(a)

print('*args')
foo(1,2,3)

def bar(**kwargs):
    for a in kwargs:
        print(a, kwargs[a])

print('**kwargs')
bar(name='one', age=27)
*args
1
2
3
**kwargs
name one
age 27

It’s also worth noting that you can use * and ** when calling functions as well. This is a shortcut that allows you to pass multiple arguments to a function directly using either a list/tuple or a dictionary.

def foo(x,y,z):
    print("x=" + str(x))
    print("y=" + str(y))
    print("z=" + str(z))
mylist = [1,2,3]
mydict = {'x':1,'y':2,'z':3}
print('*args')
foo(*mylist)
print('**kwargs')
foo(**mydict)
*args
x=1
y=2
z=3
**kwargs
x=1
y=2
z=3

Computing Gradients with tf.GradientTape()

def compute_grads(cfg):
    with tf.GradientTape() as tape:
        all_loss = compute_loss(**cfg)
    # Compute gradients wrt input image
    total_loss = all_loss[0]
    return tape.gradient(total_loss, cfg['init_image']), all_loss

Running Style Transfer Algorithm

import IPython.display

def run_style_transfer(content_path,
                       style_path,
                       num_iterations=1000,
                       content_weight=0.9999999999999):
    # We don't need to (or want to) train any layers of our model, so we set their
    # trainable to false.
    model = get_model()
    for layer in model.layers:
        layer.trainable = False

    # Get the style and content feature representations (from our specified intermediate layers)
    style_features, content_features = get_feature_representations(model, content_path, style_path)
    gram_style_features = [gram_matrix(style_feature) for style_feature in style_features]

    # Set initial image
    init_image = load_and_process_img(content_path)
    init_image = tfe.Variable(init_image, dtype=tf.float32)
    # Create our optimizer
    opt = tf.train.AdamOptimizer(learning_rate=5, beta1=0.99, epsilon=1e-1)

    # For displaying intermediate images
    iter_count = 1

    # Store our best result
    best_loss, best_img = float('inf'), None

    # Create a nice config
    style_weight = 1 - content_weight
    loss_weights = (style_weight, content_weight)
    cfg = {
      'model': model,
      'loss_weights': loss_weights,
      'init_image': init_image,
      'gram_style_features': gram_style_features,
      'content_features': content_features
    }

    # For displaying
    num_rows = 5
    num_cols = 2
    display_interval = num_iterations/(num_rows*num_cols)
    global_start = time.time()

    # Clip in the preprocessed space so that deprocessed pixel values stay within [0, 255].
    norm_means = np.array([103.939, 116.779, 123.68])
    min_vals = -norm_means
    max_vals = 255 - norm_means

    imgs = []
    for i in range(num_iterations):
        start_time = time.time()
        grads, all_loss = compute_grads(cfg)
        loss, style_score, content_score = all_loss
        opt.apply_gradients([(grads, init_image)])
        clipped = tf.clip_by_value(init_image, min_vals, max_vals)
        init_image.assign(clipped)

        if loss < best_loss:
            # Update best loss and best image from total loss.
            best_loss = loss
            best_img = deprocess_img(init_image.numpy())

        if i % display_interval == 0:
            # Use the .numpy() method to get the concrete numpy array
            plot_img = init_image.numpy()
            plot_img = deprocess_img(plot_img)
            imgs.append(plot_img)
            IPython.display.clear_output(wait=True)
            IPython.display.display_png(Image.fromarray(plot_img))
            print('Iteration: {}'.format(i))
            print('Total loss: {:.4e}, '
                'style loss: {:.4e}, '
                'content loss: {:.4e}, '
                'time: {:.4f}s'.format(loss, style_score, content_score, time.time() - start_time))
    print('Total time: {:.4f}s'.format(time.time() - global_start))
    IPython.display.clear_output(wait=True)
    plt.figure(figsize=(14,20))
    for i,img in enumerate(imgs):
        plt.subplot(num_rows,num_cols,i+1)
        plt.imshow(img)
        plt.xticks([])
        plt.yticks([])

    return best_img, best_loss
best, best_loss = run_style_transfer(content_path,
                                     style_path, num_iterations=1000)

(figure: intermediate images saved during the optimization)

Show image with lowest loss

Image.fromarray(best)

(figure: the output image with the lowest total loss)

Show produced image with original content and style image

def show_results(best_img, content_path, style_path, show_large_final=True):
    plt.figure(figsize=(10, 5))
    content = load_img(content_path)
    style = load_img(style_path)

    plt.subplot(1, 2, 1)
    imshow(content, 'Content Image')

    plt.subplot(1, 2, 2)
    imshow(style, 'Style Image')

    if show_large_final:
        plt.figure(figsize=(10, 10))

        plt.imshow(best_img)
        plt.title('Output Image')
        plt.show()
show_results(best, content_path, style_path)

(figure: the content image and the style image side by side)

(figure: the final output image)
