unet skip connections being proven out in a number of works. add as an option for AttentionLayers as `unet_skips`
lucidrains committed Sep 11, 2024
1 parent 04075d7 commit 6db4d22
Showing 4 changed files with 94 additions and 2 deletions.
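For orientation before the diffs: a minimal usage sketch of the new `unet_skips` option, mirroring the test added in this commit (model sizes are illustrative):

```python
import torch
from x_transformers import TransformerWrapper, Encoder

# enable u-net style skip connections across the encoder blocks
model = TransformerWrapper(
    num_tokens = 20000,
    max_seq_len = 1024,
    attn_layers = Encoder(
        dim = 128,
        depth = 4,
        heads = 8,
        unet_skips = True
    )
)

x = torch.randint(0, 20000, (2, 1024))
mask = torch.ones(2, 1024).bool()

logits = model(x, mask = mask)  # (2, 1024, 20000)
```
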
11 changes: 11 additions & 0 deletions README.md
@@ -2198,4 +2198,15 @@ ids_out, num_out, is_number_mask = model.generate(start_ids, start_nums, 17)
}
```

```bibtex
@article{Bao2022AllAW,
title = {All are Worth Words: A ViT Backbone for Diffusion Models},
author = {Fan Bao and Shen Nie and Kaiwen Xue and Yue Cao and Chongxuan Li and Hang Su and Jun Zhu},
journal = {2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
year = {2022},
pages = {22669-22679},
url = {https://api.semanticscholar.org/CorpusID:253581703}
}
```

*solve intelligence... then use that to solve everything else.* - Demis Hassabis
2 changes: 1 addition & 1 deletion setup.py
@@ -3,7 +3,7 @@
setup(
name = 'x-transformers',
packages = find_packages(exclude=['examples']),
version = '1.34.1',
version = '1.35.0',
license='MIT',
description = 'X-Transformers - Pytorch',
author = 'Phil Wang',
20 changes: 20 additions & 0 deletions tests/test_x_transformers.py
@@ -1,3 +1,4 @@
import pytest
import torch

from x_transformers.x_transformers import (
@@ -218,3 +219,22 @@ def test_squeeze_logit_dim_one():
logits = model(x, mask = mask)

assert logits.shape == (2,)

@pytest.mark.parametrize('depth', (4, 5))
def test_unet_skip(depth):

model = TransformerWrapper(
num_tokens = 20000,
max_seq_len = 1024,
attn_layers = Encoder(
dim = 128,
depth = depth,
heads = 8,
unet_skips = True
)
)

x = torch.randint(0, 20000, (2, 1024))
mask = torch.randint(0, 2, (2, 1024)).bool()

model(x, mask = mask)
63 changes: 62 additions & 1 deletion x_transformers/x_transformers.py
@@ -810,6 +810,19 @@ def forward(self, x, *, condition, **kwargs):
out, *rest = out
return out * gamma, *rest

# skip connection combining

class ConcatCombine(Module):
def __init__(self, dim, prev_layer_ind):
super().__init__()
self.prev_layer_ind = prev_layer_ind
self.combine = nn.Linear(dim * 2, dim, bias = False)

def forward(self, x, prev_layers: list[Tensor]):
skip = prev_layers[self.prev_layer_ind]
concatted_skip = torch.cat((skip, x), dim = -1)
return self.combine(concatted_skip)

# feedforward

class GLU(Module):
@@ -1307,6 +1320,7 @@ def __init__(
disable_abs_pos_emb = None,
use_layerscale = False,
layerscale_init_value = 0.,
unet_skips = False,
**kwargs
):
super().__init__()
@@ -1468,6 +1482,8 @@ def __init__(

# calculate layer block order

len_default_block = 1

if exists(custom_layers):
layer_types = custom_layers
elif exists(par_ratio):
@@ -1487,6 +1503,7 @@ def __init__(
else:
assert exists(depth), '`depth` must be passed in for `Decoder` or `Encoder`'
layer_types = default_block * depth
len_default_block = len(default_block)

self.layer_types = layer_types
self.layers_execute_order = default(layers_execute_order, tuple(range(len(layer_types))))
@@ -1522,11 +1539,31 @@

self.final_norm = norm_fn() if pre_norm or resi_dual else nn.Identity()

# whether unet or not

self.unet_skips = unet_skips
num_skips = self.depth // len_default_block

assert not (unet_skips and num_skips == 0), 'must have depth of at least 2 for unet skip connections'

skip_indices = [i * len_default_block for i in range(num_skips)]

self.skip_combines = ModuleList([])

# iterate and construct layers

for ind, (layer_type, layer_shift_tokens) in enumerate(zip(self.layer_types, shift_tokens)):

# `ind` is the index of each module - attention, feedforward, cross attention
# but `block_ind` refers to the typical enumeration of a transformer block (attn + ff + [optional] cross attn)

block_begin = divisible_by(ind, len_default_block)
block_ind = ind // len_default_block

is_last_layer = ind == (len(self.layer_types) - 1)

# attention, cross attention, feedforward

if layer_type == 'a':
layer = Attention(dim, heads = heads, causal = causal, **attn_kwargs)
elif layer_type == 'c':
@@ -1548,6 +1585,14 @@ def __init__(
residual_fn = GRUGating if gate_residual else Residual
residual = residual_fn(dim, scale_residual = scale_residual, scale_residual_constant = scale_residual_constant)

# handle unet skip connection

skip_combine = None
is_latter_half = block_begin and block_ind >= (self.depth / 2)

if self.unet_skips and is_latter_half:
skip_combine = ConcatCombine(dim, skip_indices.pop())

# all normalizations of the layer

pre_branch_norm = norm_fn() if pre_norm else None
@@ -1560,6 +1605,8 @@ def __init__(
post_main_norm
])

self.skip_combines.append(skip_combine)

self.layers.append(ModuleList([
norms,
layer,
@@ -1670,6 +1717,7 @@ def forward(

layer_variables = (
self.layer_types,
self.skip_combines,
self.layers,
self.layer_dropouts
)
@@ -1680,11 +1728,24 @@

layer_variables = tuple(tuple(layer_variable[i] for i in layers_execute_order) for layer_variable in layer_variables)

# store all hiddens for skips

skip_hiddens = []

# go through the attention and feedforward layers

for ind, (layer_type, (norm, block, residual_fn), layer_dropout) in enumerate(zip(*layer_variables)):
for ind, (layer_type, skip_combine, (norm, block, residual_fn), layer_dropout) in enumerate(zip(*layer_variables)):
is_last = ind == (len(self.layers) - 1)

# handle skip connections

skip_hiddens.append(x)

if exists(skip_combine):
x = skip_combine(x, skip_hiddens)

# layer dropout

if self.training and layer_dropout > 0. and random() < layer_dropout:
continue

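To make the wiring above easier to follow in isolation, here is a self-contained sketch of the same U-Net-style pattern, stripped of the library's norm/residual machinery. The names `ToyUnetStack` and the simplified `ConcatCombine` signature are illustrative, not the library's API: hiddens entering the first half of the blocks are stashed, and each block in the latter half concatenates a mirrored early hidden with its input and projects back down with a bias-free linear, as the commit's `ConcatCombine` does.

```python
import torch
from torch import nn

class ConcatCombine(nn.Module):
    # concatenate the skipped hidden with the current one on the feature dim,
    # then project 2 * dim -> dim with a bias-free linear
    def __init__(self, dim):
        super().__init__()
        self.combine = nn.Linear(dim * 2, dim, bias = False)

    def forward(self, x, skip):
        return self.combine(torch.cat((skip, x), dim = -1))

class ToyUnetStack(nn.Module):
    # each "block" here is a stand-in for a full attention + feedforward block
    def __init__(self, dim, depth):
        super().__init__()
        self.depth = depth
        self.blocks = nn.ModuleList([
            nn.Sequential(nn.Linear(dim, dim), nn.GELU()) for _ in range(depth)
        ])

        # only blocks in the latter half receive a skip combiner
        self.skip_combines = nn.ModuleList([
            ConcatCombine(dim) if ind >= (depth / 2) else None
            for ind in range(depth)
        ])

    def forward(self, x):
        skip_hiddens = []

        for ind, (block, skip_combine) in enumerate(zip(self.blocks, self.skip_combines)):
            # the first floor(depth / 2) blocks stash their inputs
            if ind < (self.depth // 2):
                skip_hiddens.append(x)

            # latter-half blocks consume the stashed hiddens in mirrored order
            # (most recent first), echoing skip_indices.pop() in the commit
            if skip_combine is not None:
                x = skip_combine(x, skip_hiddens.pop())

            x = block(x) + x  # plain residual in place of the library's residual_fn

        return x

stack = ToyUnetStack(dim = 128, depth = 6)
out = stack(torch.randn(2, 16, 128))  # (2, 16, 128)
```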
