Review fixes 2 #8

Merged Nov 11, 2020 (6 commits)
237 changes: 163 additions & 74 deletions trax/models/research/funnel_transformer.py
@@ -73,55 +73,63 @@ def _Upsampler():
def _FunnelBlock(d_model, d_ff, n_heads,
dropout, dropout_shared_axes, mode, ff_activation,
pool_layer, pool_size, strides, separate_cls):
"""Internal funnel block. On input it takes (activations, masks).
"""Internal funnel block. Returns a list of layers implementing it.

The input is an activation tensor.

Args:
d_model: Final dimension of tensors at most points in the model, including
the initial embedding output.
d_ff: Size of special dense layer in the feed-forward part of each block.
n_heads: Number of attention heads.
dropout: Stochastic rate (probability) for dropping an activation value
when applying dropout within a block.
dropout_shared_axes: Tensor axes on which to share a dropout mask.
Sharing along batch and sequence axes (`dropout_shared_axes=(0,1)`) is
a useful way to save memory and apply consistent masks to activation
vectors at different sequence positions.
mode: If `'train'`, each block will include dropout; else, it will
pass all values through unaltered.
ff_activation: Type of activation function at the end of each block; must
be an activation-type subclass of `Layer`.
pool_size: Shape of window that gets reduced to a single vector value.
If the layer inputs are :math:`n`-dimensional arrays, then `pool_size`
must be a tuple of length :math:`n-2`.
strides: Offsets from the location of one window to the locations of
neighboring windows along each axis. If specified, must be a tuple of
the same length as `pool_size`. If None, then offsets of 1 along each
window axis, :math:`(1, ..., 1)`, will be used.
d_model: Final dimension of tensors at most points in the model, including
the initial embedding output.
d_ff: Size of special dense layer in the feed-forward part of each block.
n_heads: Number of attention heads.
dropout: Stochastic rate (probability) for dropping an activation value
when applying dropout within a block.
dropout_shared_axes: Tensor axes on which to share a dropout mask.
Sharing along batch and sequence axes (`dropout_shared_axes=(0,1)`) is
a useful way to save memory and apply consistent masks to activation
vectors at different sequence positions.
mode: If `'train'`, each block will include dropout; else, it will
pass all values through unaltered.
ff_activation: Type of activation function at the end of each block; must
be an activation-type subclass of `Layer`.
pool_layer: Type of pooling layer used for downsampling;
should be `tl.AvgPool` or `tl.MaxPool`.
pool_size: Shape of window that gets reduced to a single vector value.
If the layer inputs are :math:`n`-dimensional arrays, then `pool_size`
must be a tuple of length :math:`n-2`.
strides: Offsets from the location of one window to the locations of
neighboring windows along each axis. If specified, must be a tuple of
the same length as `pool_size`. If None, then offsets of 1 along each
window axis, :math:`(1, ..., 1)`, will be used.
separate_cls: If `True`, pooling in funnel blocks is not applied to
embeddings of the first token (`cls` from the BERT paper).
Returns:
A list of layers that maps (activations, mask) to (activations', mask).
"""
attention = tl.AttentionQKV(
d_feature=d_model, n_heads=n_heads, dropout=dropout, mode=mode)
feed_forward = _FeedForwardBlock(
d_model, d_ff, dropout, dropout_shared_axes, mode, ff_activation)
pooling = PoolLayer(pool_layer, pool_size, strides, separate_cls)
mask_pooling = MaskPool(pool_size, strides, separate_cls)

return tl.Serial( # h, mask
tl.Branch(pooling, None, None), # h', h, h, mask
tl.Dup(), # h', h', h, h, mask
tl.Parallel(
None,
attention
), # h', attention(...), mask
tl.Add(), # h'+attention(...), mask
tl.LayerNorm(), # funnel_activations, mask
tl.Parallel(
None,
mask_pooling
), # funnel_activations, mask'
feed_forward
)
attention = tl.AttentionQKV(d_model, n_heads=n_heads, dropout=dropout,
mode=mode)
hidden_dropout = tl.Dropout(
rate=dropout, shared_axes=dropout_shared_axes, mode=mode)

feed_forward = _FeedForwardBlock(
d_model, d_ff, dropout, dropout_shared_axes, mode, ff_activation)

return [ # h, mask
tl.LayerNorm(), # h, mask
tl.Branch(pooling, None), # h', h, mask
tl.Residual(
tl.Select([0, 1, 1, 2]), # h', h, h, mask
attention, # attn, mask
tl.Parallel(None, mask_pooling), # attn, mask'
hidden_dropout # attn, mask'
), # funnel_activations, mask'
tl.Residual(
feed_forward
)
]
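
A minimal sketch of how the returned block can be exercised, following the updated `test_funnel_block_forward_shape` later in this PR (the constants and the `tl.PaddingMask()` mask construction are taken from that test; treat this as an illustration, not part of the diff):

import numpy as np

from trax import layers as tl, shapes
from trax.models.research.funnel_transformer import _FunnelBlock

d_model = 8
# Dummy activations plus a padding mask for one sequence of even length 4.
x = np.ones((1, 4, d_model), dtype=np.float32)
mask = tl.PaddingMask()(np.ones((1, 4), dtype=np.int32))

block = tl.Serial(_FunnelBlock(d_model, 8, 2, 0.1, None, 'train', tl.Relu,
                               tl.AvgPool, (2,), (2,), separate_cls=True))
xs = [x, mask]
block.init(shapes.signature(xs))
# Pooling with pool_size=(2,) halves the sequence axis of both outputs.
y, y_mask = block(xs)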


def FunnelTransformerEncoder(vocab_size,
@@ -140,6 +148,62 @@ def FunnelTransformerEncoder(vocab_size,
strides=(2,),
separate_cls=True):
"""Returns a Funnel Encoder.

This model performs text categorization:

- input: rank 2 tensor representing a batch of text strings via token IDs
plus padding markers; shape is (batch_size, sequence_length). The tensor
elements are integers in `range(vocab_size)`, and `0` values mark padding
positions.

- output: rank 2 tensor representing a batch of log-probability
distributions over N categories; shape is (batch_size, `n_classes`).

Args:
vocab_size: Input vocabulary size -- each element of the input tensor
should be an integer in `range(vocab_size)`. These integers typically
represent token IDs from a vocabulary-based tokenizer.
n_classes: Final dimension of the output tensors, representing N-way
classification.
d_model: Final dimension of tensors at most points in the model, including
the initial embedding output.
d_ff: Size of special dense layer in the feed-forward part of each encoder
block.
encoder_segment_lengths: Tuple, where each element denotes the number of
transformer encoder blocks preceding a funnel transformer block.
There is no funnel block after the last sequence of encoder blocks;
therefore, the total number of blocks in the model is equal to
`sum(encoder_segment_lengths) + len(encoder_segment_lengths) - 1`.
n_heads: Number of attention heads.
max_len: Maximum symbol length for positional encoding.
dropout: Stochastic rate (probability) for dropping an activation value
when applying dropout within an encoder block.
dropout_shared_axes: Tensor axes on which to share a dropout mask.
Sharing along batch and sequence axes (`dropout_shared_axes=(0,1)`) is
a useful way to save memory and apply consistent masks to activation
vectors at different sequence positions.
mode: If `'train'`, each encoder block will include dropout; else, it will
pass all values through unaltered.
ff_activation: Type of activation function at the end of each encoder
block; must be an activation-type subclass of `Layer`.
pool_layer: Type of pooling layer used for downsampling in each of the
funnel blocks; should be `tl.AvgPool` or `tl.MaxPool`.
pool_size: Shape of window that gets reduced to a single vector value.
If the layer inputs are :math:`n`-dimensional arrays, then `pool_size`
must be a tuple of length :math:`n-2`.
strides: Offsets from the location of one window to the locations of
neighboring windows along each axis. If specified, must be a tuple of
the same length as `pool_size`. If None, then offsets of 1 along each
window axis, :math:`(1, ..., 1)`, will be used.
separate_cls: If `True`, pooling in funnel blocks is not applied to
embeddings of the first token (`cls` from the BERT paper), and only the final
embedding of this token is used for categorization; the rest are
discarded. If `False`, the first token is pooled like every other token,
and all final embeddings are averaged and mapped to output categories as in
the original `TransformerEncoder` model.
Returns:
A Transformer model that maps strings (conveyed via token IDs) to
probability-like activations over a range of output classes.
"""
assert encoder_segment_lengths

@@ -161,10 +225,11 @@ def FunnelTransformerEncoder(vocab_size,

# If not last segment, add funnel block
if i != n_encoder_segments - 1:
encoder_blocks.append(_FunnelBlock(d_model, d_ff, n_heads, dropout,
dropout_shared_axes, mode,
ff_activation, pool_layer, pool_size,
strides, separate_cls))
encoder_blocks.append(
_FunnelBlock(d_model, d_ff, n_heads, dropout,
dropout_shared_axes, mode,
ff_activation, pool_layer, pool_size,
strides, separate_cls))

cls_pooling = SelectFirst() if separate_cls else tl.Mean(axis=1)

@@ -184,34 +249,6 @@ def FunnelTransformerEncoder(vocab_size,
)
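
To illustrate the interface described by the new docstring, a small usage sketch (not part of the diff; the hyperparameter values are arbitrary and only chosen to keep the example tiny):

import numpy as np

from trax import shapes
from trax.models.research.funnel_transformer import FunnelTransformerEncoder

# Token IDs in, one log-probability distribution over n_classes per example out.
model = FunnelTransformerEncoder(vocab_size=16, n_classes=2, d_model=8, d_ff=8,
                                 encoder_segment_lengths=(1, 1), n_heads=2,
                                 max_len=8)
x = np.ones((3, 8), dtype=np.int32)  # (batch_size, sequence_length)
model.init(shapes.signature(x))
y = model(x)                         # expected shape: (3, n_classes) == (3, 2)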


def _FunnelResidualBlock(d_model, d_ff, n_heads,
dropout, dropout_shared_axes, mode, ff_activation,
pool_layer, pool_size, strides):
feed_forward = _FeedForwardBlock(
d_model, d_ff, dropout, dropout_shared_axes, mode, ff_activation)

dropout_ = tl.Dropout(
rate=dropout, shared_axes=dropout_shared_axes, mode=mode)

attn_ = tl.AttentionQKV(d_model, n_heads=n_heads, dropout=dropout, mode=mode)

pooling_ = PoolLayer(pool_layer, pool_size, strides)

return [
tl.Parallel(tl.Branch(pooling_, None), None),
tl.Residual(
tl.Parallel(tl.LayerNorm(), tl.LayerNorm()),
tl.Select([0, 1, 1, 2]),
attn_,
tl.Parallel(None, MaskPool()),
dropout_
),
tl.Residual(
feed_forward
)
]


def FunnelTransformer(vocab_size,
d_model=512,
d_ff=2048,
@@ -226,7 +263,57 @@ def FunnelTransformer(vocab_size,
pool_layer=tl.AvgPool,
pool_size=(2,),
separate_cls=True):
"""Returns a Full Funnel Transformer.
"""Returns a Full Funnel Transformer, that can be used for example for BERT.

This model outputs token-level categorical distributions over all vocab:

- input: rank 2 tensor representing a batch of text strings via token IDs
plus padding markers; shape is (batch_size, sequence_length). The tensor
elements are integers in `range(vocab_size)`, and `0` values mark padding
positions.

- output: rank 3 tensor representing a batch of log-probability
distributions over `vocab_size` categories for each token; shape is
(batch_size, sequence_length, vocab_size).


Args:
vocab_size: Input vocabulary size -- each element of the input tensor
should be an integer in `range(vocab_size)`. These integers typically
represent token IDs from a vocabulary-based tokenizer.
d_model: Final dimension of tensors at most points in the model, including
the initial embedding output.
d_ff: Size of special dense layer in the feed-forward part of each encoder
block.
encoder_segment_lengths: Tuple, where each element denotes the number of
transformer encoder blocks preceding a funnel transformer block.
There is no funnel block after the last sequence of encoder blocks;
therefore, the total number of blocks in the model is equal to
`sum(encoder_segment_lengths) + len(encoder_segment_lengths) - 1`.
n_decoder_blocks: Number of transformer blocks in the upsampling decoder.
n_heads: Number of attention heads.
max_len: Maximum symbol length for positional encoding.
dropout: Stochastic rate (probability) for dropping an activation value
when applying dropout within an encoder block.
dropout_shared_axes: Tensor axes on which to share a dropout mask.
Sharing along batch and sequence axes (`dropout_shared_axes=(0,1)`) is
a useful way to save memory and apply consistent masks to activation
vectors at different sequence positions.
mode: If `'train'`, each encoder block will include dropout; else, it will
pass all values through unaltered.
ff_activation: Type of activation function at the end of each encoder
block; must be an activation-type subclass of `Layer`.
pool_layer: Type of pooling layer used for downsampling in each of the
funnel blocks; should be `tl.AvgPool` or `tl.MaxPool`.
pool_size: Shape of window that gets reduced to a single vector value.
If the layer inputs are :math:`n`-dimensional arrays, then `pool_size`
must be a tuple of length :math:`n-2`.
separate_cls: If `True`, pooling in funnel blocks is not applied to
embeddings of the first token (`cls` from the BERT paper), and only the final
embedding of this token is used for categorization; the rest are
discarded. If `False`, the first token is pooled like every other token,
and all final embeddings are averaged and mapped to output categories as in
the original `TransformerEncoder` model.
"""
assert encoder_segment_lengths

@@ -281,4 +368,6 @@ def FunnelTransformer(vocab_size,
decoder_blocks,
tl.Select([0], n_in=2), # vecs
tl.LayerNorm(),
tl.Dense(vocab_size),
tl.LogSoftmax()
)
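
The updated shape test below exercises this new `tl.Dense(vocab_size)` + `tl.LogSoftmax()` head; condensed, the check looks roughly like this (batch size and sequence length are assumed values mirroring the test):

import numpy as np

from trax import shapes
from trax.models.research.funnel_transformer import FunnelTransformer

vocab_size, d_model, batch_size, n_tokens = 7, 8, 2, 8
model = FunnelTransformer(vocab_size, d_model=d_model, d_ff=8,
                          encoder_segment_lengths=(1, 1),
                          n_decoder_blocks=1, n_heads=2, max_len=8)
x = np.ones((batch_size, n_tokens), dtype=np.int32)
model.init(shapes.signature(x))
y = model(x)
# The decoder upsamples back to the input length, so there is one distribution per token.
assert y.shape == (batch_size, n_tokens, vocab_size)
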
11 changes: 6 additions & 5 deletions trax/models/research/funnel_transformer_test.py
@@ -22,7 +22,7 @@

from trax import layers as tl, shapes
from trax.models.research.funnel_transformer import PoolLayer, \
_FunnelResidualBlock, FunnelTransformerEncoder, FunnelTransformer, MaskPool
_FunnelBlock, FunnelTransformerEncoder, FunnelTransformer, MaskPool


class FunnelTransformerTest(parameterized.TestCase):
@@ -62,8 +62,8 @@ def test_funnel_block_forward_shape(self):
mask = masker(mask)

block = tl.Serial(
_FunnelResidualBlock(d_model, 8, 2, 0.1, None, 'train', tl.Relu,
tl.AvgPool, (2,), (2,)))
_FunnelBlock(d_model, 8, 2, 0.1, None, 'train', tl.Relu,
tl.AvgPool, (2,), (2,), separate_cls=True))

xs = [x, mask]
_, _ = block.init(shapes.signature(xs))
@@ -88,7 +88,8 @@ def test_funnel_transformer_encoder_forward_shape(self):

def test_funnel_transformer_forward_shape(self):
d_model = 8
model = FunnelTransformer(2, d_model=d_model, d_ff=8,
vocab_size = 7
model = FunnelTransformer(7, d_model=d_model, d_ff=8,
encoder_segment_lengths=(1, 1),
n_decoder_blocks=1, n_heads=2, max_len=8)

@@ -98,7 +99,7 @@ def test_funnel_transformer_forward_shape(self):
_ = model.init(shapes.signature(x))
y = model(x)

self.assertEqual(y.shape, (batch_size, n_tokens, d_model))
self.assertEqual(y.shape, (batch_size, n_tokens, vocab_size))


if __name__ == '__main__':