= "https://upload.wikimedia.org/wikipedia/commons/thumb/e/ec/Mona_Lisa%2C_by_Leonardo_da_Vinci%2C_from_C2RMF_retouched.jpg/640px-Mona_Lisa%2C_by_Leonardo_da_Vinci%2C_from_C2RMF_retouched.jpg"
url = download_image(url)
img = 100, 100
xo, yo = img[:, xo : 256 + xo, yo : 256 + yo]
img 1, 2, 0)); show_image(img.permute(
Neural Style Transfer
Adapted from:
min(), img.max() img.
(tensor(0.), tensor(1.))
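The download_image and show_image helpers come from the notebook's supporting utilities and aren't shown in this section. A minimal download_image might look like the sketch below; the requests/PIL approach and the convention of returning a float (C, H, W) tensor in [0, 1] are assumptions inferred from how it is used above.

import io

import requests
from PIL import Image
from torchvision.transforms.functional import to_tensor

def download_image(url):
    # Hypothetical helper: fetch an image and return a float tensor in [0, 1], shape (C, H, W).
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    pil_img = Image.open(io.BytesIO(resp.content)).convert("RGB")
    return to_tensor(pil_img)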
We can optimize the raw pixels, like any other parameter.
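As a minimal sketch of that idea in plain PyTorch (a standalone toy, not the notebook's code), we can wrap a noise image in nn.Parameter and step it toward a target image with MSE; the Learner-based version below does the same thing, with callbacks so we can record intermediate images.

import torch
import torch.nn.functional as F

target = torch.rand(3, 256, 256)               # stand-in for the cropped image above
pixels = torch.nn.Parameter(torch.rand_like(target))
opt = torch.optim.Adam([pixels], lr=1e-1)

for _ in range(200):
    loss = F.mse_loss(pixels, target)          # compare raw pixels directly
    opt.zero_grad()
    loss.backward()
    opt.step()

# pixels.detach().clip(0, 1) is now close to target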
class DummyDataset:
    """Dataset that yields n iterations of dummy data"""
    def __init__(self, n):
        self.n = n
    def __len__(self):
        return self.n
    def __getitem__(self, idx):
        # The values are never used; the dataset length just sets
        # how many optimization steps we take.
        x, y = 0, 0
        return x, y

def get_dls(n):
    return DataLoaders(
        splits={"train": DummyDataset(n), "test": DummyDataset(1)}, nworkers=0
    )
class TensorModel(nn.Module):
    def __init__(self, tensor_):
        super().__init__()
        # Wrapping the tensor in nn.Parameter makes the "image" itself trainable.
        self.tensor_ = nn.Parameter(tensor_.clone())
    def forward(self, *args, **kwargs):
        return self.tensor_

m = TensorModel(tensor_=torch.rand_like(img))
m().shape

torch.Size([3, 256, 256])
class ImgOptCb(TrainCB):
    """Callback that treats the model's tensor as the prediction and compares it to a fixed target."""
    def __init__(self, target):
        self.target = target
        self.intermediates = []
    def predict(self, learn):
        learn.preds = learn.model()
        self.intermediates.append(learn.preds.clone())
    def get_loss(self, learn):
        learn.loss = learn.loss_func(learn.preds, self.target)
def optimize_noise_to_target(noise_model, n, target, lr=0.001):
    dls = get_dls(n)
    img_opt_cb = ImgOptCb(target)
    cbs = [img_opt_cb, ProgressCB(plot=True)]
    Learner(
        noise_model,
        dls,
        F.mse_loss,
        lr=lr,
        cbs=cbs,
        opt_func=torch.optim.Adam,
    ).fit(1)
    return noise_model.tensor_.clip(0, 1), img_opt_cb.intermediates

denoise, intermediates = optimize_noise_to_target(
    TensorModel(tensor_=torch.rand_like(img)),
    250,
    img,
    lr=1e-1,
)
show_images([denoise, img])

show_images([i.clip(0, 1) for i in intermediates], figsize=(6, 6))
This isn’t interesting on its own, but it becomes interesting once we incorporate a pretrained feature extractor such as VGG16, which gives us a richer representation to manipulate than the raw pixels.
Here is a classic article on the discriminative features that these models learn.
VGG16 is similar to the pre-resnet model we implemented for FashionMNIST.
= timm.create_model("vgg16", pretrained=True).to(def_device) vgg16
vgg16
VGG(
(features): Sequential(
(0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(1): ReLU(inplace=True)
(2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(3): ReLU(inplace=True)
(4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(6): ReLU(inplace=True)
(7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(8): ReLU(inplace=True)
(9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(11): ReLU(inplace=True)
(12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(13): ReLU(inplace=True)
(14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(15): ReLU(inplace=True)
(16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(17): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(18): ReLU(inplace=True)
(19): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(20): ReLU(inplace=True)
(21): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(22): ReLU(inplace=True)
(23): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(24): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(25): ReLU(inplace=True)
(26): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(27): ReLU(inplace=True)
(28): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(29): ReLU(inplace=True)
(30): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
)
(pre_logits): ConvMlp(
(fc1): Conv2d(512, 4096, kernel_size=(7, 7), stride=(1, 1))
(act1): ReLU(inplace=True)
(drop): Dropout(p=0.0, inplace=False)
(fc2): Conv2d(4096, 4096, kernel_size=(1, 1), stride=(1, 1))
(act2): ReLU(inplace=True)
)
(head): ClassifierHead(
(global_pool): SelectAdaptivePool2d(pool_type=avg, flatten=Flatten(start_dim=1, end_dim=-1))
(drop): Dropout(p=0.0, inplace=False)
(fc): Linear(in_features=4096, out_features=1000, bias=True)
(flatten): Identity()
)
)
To use VGG16, we need to normalize each color channel with the same constants the model was trained with.
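The imagenet_mean and imagenet_std tensors used in normalize_imagenet below aren't defined in this section; presumably they are the standard ImageNet normalization constants, defined elsewhere in the notebook roughly as:

# Standard ImageNet per-channel statistics (assumed definition, not shown in this section)
imagenet_mean = torch.tensor([0.485, 0.456, 0.406])
imagenet_std = torch.tensor([0.229, 0.224, 0.225])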
def normalize_imagenet(img):
    i = img.clone()
    i -= imagenet_mean[:, None, None]
    i /= imagenet_std[:, None, None]
    return i

img.shape

torch.Size([3, 256, 256])

nimg = normalize_imagenet(img)
show_image(nimg.clip(0, 1))
We can also use the PyTorch version:

show_image(pt_normalize_imagenet(img).clip(0, 1));
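pt_normalize_imagenet isn't shown here; it presumably wraps torchvision's normalization transform, along the lines of this sketch (the exact definition is an assumption):

from torchvision.transforms import Normalize

# Hypothetical equivalent: normalize a (C, H, W) tensor with the ImageNet statistics.
pt_normalize_imagenet = Normalize(
    mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
)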
So, how do we use this normalized image to extract features?
calculate_features_vgg16
calculate_features_vgg16 (vgg16, imgs, target_layers=(18, 25))
embeddings = calculate_features_vgg16(vgg16, nimg)
embeddings[0].shape
torch.Size([512, 32, 32])
# Homework: Implement a hook to capture intermediate representations
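As a sketch of that homework (not the notebook's calculate_features_vgg16), one way to capture intermediate representations is to register forward hooks on the chosen layers of vgg16.features; the function name and return convention here are hypothetical.

def features_via_hooks(vgg, img, target_layers=(18, 25)):
    # Record the output of each requested layer during one forward pass.
    captured, hooks = [], []
    for idx in target_layers:
        hook = vgg.features[idx].register_forward_hook(
            lambda module, inputs, output: captured.append(output[0])
        )
        hooks.append(hook)
    try:
        vgg(img[None].to(def_device))   # batch of one; we only need the side effects
    finally:
        for h in hooks:
            h.remove()
    return captured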
We can implement a loss function that computes the differences in feature space.
class ContentLoss:
    def __init__(
        self,
        target_img,
        target_layers=(18, 25),  # out of 30
        vgg=timm.create_model("vgg16", pretrained=True).to(def_device),
    ):
        self.featurize = partial(
            calculate_features_vgg16, vgg, target_layers=target_layers
        )
        with torch.no_grad():
            self.tgt = self.featurize(target_img)
    def __call__(self, img, _):
        # The second argument is the dummy target from DummyDataset; we ignore it.
        inp = self.featurize(img)
        return sum(F.mse_loss(f1, f2) for f1, f2 in zip(inp, self.tgt))
def optimize_features_to_target(
    noise_model,
    n,
    target,
    lr=0.001,
    epochs=5,
    target_layers=(18, 25),
):
    intermediates = []
    for _ in range(epochs):
        TrainLearner(
            noise_model,
            get_dls(n // epochs),
            ContentLoss(target, target_layers=target_layers),
            lr=lr,
            opt_func=torch.optim.Adam,
        ).fit(1)
        i = noise_model.tensor_.clone().detach().clip(0, 1)
        intermediates.append(i)
    return intermediates
nm = TensorModel(tensor_=torch.rand_like(img))
out = [torch.rand_like(img), *optimize_features_to_target(nm, 1000, nimg, 1e-2, 8)]
show_images(out, figsize=(6, 6))
nm = TensorModel(tensor_=torch.rand_like(img))
out = optimize_features_to_target(nm, 1000, nimg, 1e-2, 8, target_layers=(1,))
show_images([torch.rand_like(img), *out], figsize=(6, 6))
Gram Loss
We don’t want to just transfer the spatial feature maps. We want something more abstract: style.
This is where the Gram matrix comes in. Flatten an intermediate representation into a matrix \(X\) with one row per feature channel and one column per spatial position (pixel). Taking \(X \cdot X^T\) gives the (uncentered) covariance of the feature activations, which we call the Gram matrix; it records which features fire together while discarding where they fire.
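For a single (C, H, W) feature map this is just a flatten and a matrix product. A sketch (not the notebook's calculate_grams_vgg16, which presumably applies this per target layer; normalizing by the number of spatial positions is one common convention):

def gram_matrix(fmap):
    # fmap: (C, H, W) activations from one layer
    c, h, w = fmap.shape
    x = fmap.reshape(c, h * w)      # one row per channel, one column per pixel
    return (x @ x.T) / (h * w)      # (C, C) matrix of feature co-activations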
calculate_grams_vgg16
calculate_grams_vgg16 (vgg16, imgs, target_layers=(18, 25))
GramLoss

GramLoss (target_img, target_layers=(18, 25), vgg=VGG(...))

Initialize self. See help(type(self)) for accurate signature.

|  | Type | Default | Details |
|---|---|---|---|
| target_img |  |  |  |
| target_layers | tuple | (18, 25) | out of 30 |
| vgg | VGG | VGG(...) (the same module printed above) |  |
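The GramLoss implementation isn't included in this export. Following ContentLoss above, it presumably compares Gram matrices rather than raw feature maps; a sketch, assuming calculate_grams_vgg16 returns one Gram matrix per target layer:

class GramLossSketch:
    # Hypothetical reconstruction of GramLoss, mirroring ContentLoss above.
    def __init__(
        self,
        target_img,
        target_layers=(18, 25),
        vgg=timm.create_model("vgg16", pretrained=True).to(def_device),
    ):
        self.grammify = partial(calculate_grams_vgg16, vgg, target_layers=target_layers)
        with torch.no_grad():
            self.tgt = self.grammify(target_img)
    def __call__(self, img, _):
        inp = self.grammify(img)
        return sum(F.mse_loss(g1, g2) for g1, g2 in zip(inp, self.tgt))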
def optimize_gram_to_target(
    noise_model,
    n,
    target,
    lr=0.001,
    epochs=5,
    target_layers=(18, 25),
):
    intermediates = []
    for _ in range(epochs):
        TrainLearner(
            noise_model,
            get_dls(n // epochs),
            GramLoss(target, target_layers=target_layers),
            lr=lr,
            opt_func=torch.optim.Adam,
        ).fit(1)
        i = noise_model.tensor_.clone().detach().clip(0, 1)
        intermediates.append(i)
    return intermediates
= "https://insideecology.com/wp-content/uploads/2018/06/spider-web-with-water-beads-921039_1280-810x540.jpg"
spider_web show_image(download_image(spider_web))
def style_transfer(from_, to_):
    if isinstance(from_, str):
        from_ = pt_normalize_imagenet(download_image(from_))
    if isinstance(to_, str):
        to_ = pt_normalize_imagenet(download_image(to_))
    # Start from the content image and optimize its Gram matrices toward the style image's.
    nm = TensorModel(tensor_=from_)
    return optimize_gram_to_target(nm, 2000, to_, 0.1, 9, target_layers=(2, 18, 25))

show_images(style_transfer(nimg, spider_web), figsize=(6, 6))
= "https://sanctuarymentalhealth.org/wp-content/uploads/2021/03/The-Starry-Night-1200x630-1-979x514.jpg"
starry_night
show_image(download_image(starry_night))=(6, 6)) show_images(style_transfer(nimg, starry_night), figsize
Using the features from pretrained networks can be powerful.