Review fixes 2 #8

Merged Nov 11, 2020 (6 commits)
237 changes: 163 additions & 74 deletions trax/models/research/funnel_transformer.py
@@ -73,55 +73,63 @@ def _Upsampler():
def _FunnelBlock(d_model, d_ff, n_heads,
dropout, dropout_shared_axes, mode, ff_activation,
pool_layer, pool_size, strides, separate_cls):
"""Internal funnel block. On input it takes (activations, masks).
"""Internal funnel block. Returns a list of layers implementing it.

The input is an activation tensor.

Args:
d_model: Final dimension of tensors at most points in the model, including
the initial embedding output.
d_ff: Size of special dense layer in the feed-forward part of each block.
n_heads: Number of attention heads.
dropout: Stochastic rate (probability) for dropping an activation value
when applying dropout within a block.
dropout_shared_axes: Tensor axes on which to share a dropout mask.
Sharing along batch and sequence axes (`dropout_shared_axes=(0,1)`) is
a useful way to save memory and apply consistent masks to activation
vectors at different sequence positions.
mode: If `'train'`, each block will include dropout; else, it will
pass all values through unaltered.
ff_activation: Type of activation function at the end of each block; must
be an activation-type subclass of `Layer`.
pool_size: Shape of window that gets reduced to a single vector value.
If the layer inputs are :math:`n`-dimensional arrays, then `pool_size`
must be a tuple of length :math:`n-2`.
strides: Offsets from the location of one window to the locations of
neighboring windows along each axis. If specified, must be a tuple of
the same length as `pool_size`. If None, then offsets of 1 along each
window axis, :math:`(1, ..., 1)`, will be used.
d_model: Final dimension of tensors at most points in the model, including
the initial embedding output.
d_ff: Size of special dense layer in the feed-forward part of each block.
n_heads: Number of attention heads.
dropout: Stochastic rate (probability) for dropping an activation value
when applying dropout within a block.
dropout_shared_axes: Tensor axes on which to share a dropout mask.
Sharing along batch and sequence axes (`dropout_shared_axes=(0,1)`) is
a useful way to save memory and apply consistent masks to activation
vectors at different sequence positions.
mode: If `'train'`, each block will include dropout; else, it will
pass all values through unaltered.
ff_activation: Type of activation function at the end of each block; must
be an activation-type subclass of `Layer`.
pool_layer: Type of pooling layer used for downsampling;
should be `tl.AvgPool` or `tl.MaxPool`.
pool_size: Shape of window that gets reduced to a single vector value.
If the layer inputs are :math:`n`-dimensional arrays, then `pool_size`
must be a tuple of length :math:`n-2`.
strides: Offsets from the location of one window to the locations of
neighboring windows along each axis. If specified, must be a tuple of
the same length as `pool_size`. If None, then offsets of 1 along each
window axis, :math:`(1, ..., 1)`, will be used.
separate_cls: If `True`, pooling in funnel blocks is not applied to
embeddings of the first token (`cls` from the BERT paper).
Returns:
A list of layers that maps (activations, mask) to (activations', mask).
"""
attention = tl.AttentionQKV(
d_feature=d_model, n_heads=n_heads, dropout=dropout, mode=mode)
feed_forward = _FeedForwardBlock(
d_model, d_ff, dropout, dropout_shared_axes, mode, ff_activation)
pooling = PoolLayer(pool_layer, pool_size, strides, separate_cls)
mask_pooling = MaskPool(pool_size, strides, separate_cls)

return tl.Serial( # h, mask
tl.Branch(pooling, None, None), # h', h, h, mask
tl.Dup(), # h', h', h, h, mask
tl.Parallel(
None,
attention
), # h', attention(...), mask
tl.Add(), # h'+attention(...), mask
tl.LayerNorm(), # funnel_activations, mask
tl.Parallel(
None,
mask_pooling
), # funnel_activations, mask'
feed_forward
)
attention = tl.AttentionQKV(d_model, n_heads=n_heads, dropout=dropout,
mode=mode)
hidden_dropout = tl.Dropout(
rate=dropout, shared_axes=dropout_shared_axes, mode=mode)

feed_forward = _FeedForwardBlock(
d_model, d_ff, dropout, dropout_shared_axes, mode, ff_activation)

return [ # h, mask
tl.LayerNorm(), # h, mask
tl.Branch(pooling, None), # h', h, mask
tl.Residual(
tl.Select([0, 1, 1, 2]), # h', h, h, mask
attention, # attn, mask
tl.Parallel(None, mask_pooling), # attn, mask'
hidden_dropout # attn, mask'
), # funnel_activations, mask'
tl.Residual(
feed_forward
)
]
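
A minimal sketch of how the returned block can be exercised, following the updated `test_funnel_block_forward_shape` later in this PR (the constants and the `tl.PaddingMask()` mask construction are taken from that test; treat this as an illustration, not part of the diff):

import numpy as np

from trax import layers as tl, shapes
from trax.models.research.funnel_transformer import _FunnelBlock

d_model = 8
# Dummy activations plus a padding mask for one sequence of even length 4.
x = np.ones((1, 4, d_model), dtype=np.float32)
mask = tl.PaddingMask()(np.ones((1, 4), dtype=np.int32))

block = tl.Serial(_FunnelBlock(d_model, 8, 2, 0.1, None, 'train', tl.Relu,
                               tl.AvgPool, (2,), (2,), separate_cls=True))
xs = [x, mask]
block.init(shapes.signature(xs))
# Pooling with pool_size=(2,) halves the sequence axis of both outputs.
y, y_mask = block(xs)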


def FunnelTransformerEncoder(vocab_size,
@@ -140,6 +148,62 @@ def FunnelTransformerEncoder(vocab_size,
strides=(2,),
separate_cls=True):
"""Returns a Funnel Encoder.

This model performs text categorization:

- input: rank 2 tensor representing a batch of text strings via token IDs
plus padding markers; shape is (batch_size, sequence_length). The tensor
elements are integers in `range(vocab_size)`, and `0` values mark padding
positions.

- output: rank 2 tensor representing a batch of log-probability
distributions over N categories; shape is (batch_size, `n_classes`).

Args:
vocab_size: Input vocabulary size -- each element of the input tensor
should be an integer in `range(vocab_size)`. These integers typically
represent token IDs from a vocabulary-based tokenizer.
n_classes: Final dimension of the output tensors, representing N-way
classification.
d_model: Final dimension of tensors at most points in the model, including
the initial embedding output.
d_ff: Size of special dense layer in the feed-forward part of each encoder
block.
encoder_segment_lengths: Tuple, where each element denotes the number of
transformer encoder blocks preceding a funnel transformer block.
There is no funnel block after the last sequence of encoder blocks;
therefore, the total number of blocks in the model is equal to
`sum(encoder_segment_lengths) + len(encoder_segment_lengths) - 1`.
n_heads: Number of attention heads.
max_len: Maximum symbol length for positional encoding.
dropout: Stochastic rate (probability) for dropping an activation value
when applying dropout within an encoder block.
dropout_shared_axes: Tensor axes on which to share a dropout mask.
Sharing along batch and sequence axes (`dropout_shared_axes=(0,1)`) is
a useful way to save memory and apply consistent masks to activation
vectors at different sequence positions.
mode: If `'train'`, each encoder block will include dropout; else, it will
pass all values through unaltered.
ff_activation: Type of activation function at the end of each encoder
block; must be an activation-type subclass of `Layer`.
pool_layer: Type of pooling layer used for downsampling in each of the
funnel blocks; should be `tl.AvgPool` or `tl.MaxPool`.
pool_size: Shape of window that gets reduced to a single vector value.
If the layer inputs are :math:`n`-dimensional arrays, then `pool_size`
must be a tuple of length :math:`n-2`.
strides: Offsets from the location of one window to the locations of
neighboring windows along each axis. If specified, must be a tuple of
the same length as `pool_size`. If None, then offsets of 1 along each
window axis, :math:`(1, ..., 1)`, will be used.
separate_cls: If `True`, pooling in funnel blocks is not applied to
embeddings of the first token (`cls` from the BERT paper), and only the final
embedding of this token is used for categorization; the rest are
discarded. If `False`, the first token is pooled like every other token,
and all final embeddings are averaged and mapped to output categories as in
the original `TransformerEncoder` model.
Returns:
A Transformer model that maps strings (conveyed via token IDs) to
probability-like activations over a range of output classes.
"""
assert encoder_segment_lengths

@@ -161,10 +225,11 @@ def FunnelTransformerEncoder(vocab_size,

# If not last segment, add funnel block
if i != n_encoder_segments - 1:
encoder_blocks.append(_FunnelBlock(d_model, d_ff, n_heads, dropout,
dropout_shared_axes, mode,
ff_activation, pool_layer, pool_size,
strides, separate_cls))
encoder_blocks.append(
_FunnelBlock(d_model, d_ff, n_heads, dropout,
dropout_shared_axes, mode,
ff_activation, pool_layer, pool_size,
strides, separate_cls))

cls_pooling = SelectFirst() if separate_cls else tl.Mean(axis=1)

@@ -184,34 +249,6 @@ def FunnelTransformerEncoder(vocab_size,
)
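
To illustrate the interface described by the new docstring, a small usage sketch (not part of the diff; the hyperparameter values are arbitrary and only chosen to keep the example tiny):

import numpy as np

from trax import shapes
from trax.models.research.funnel_transformer import FunnelTransformerEncoder

# Token IDs in, one log-probability distribution over n_classes per example out.
model = FunnelTransformerEncoder(vocab_size=16, n_classes=2, d_model=8, d_ff=8,
                                 encoder_segment_lengths=(1, 1), n_heads=2,
                                 max_len=8)
x = np.ones((3, 8), dtype=np.int32)  # (batch_size, sequence_length)
model.init(shapes.signature(x))
y = model(x)                         # expected shape: (3, n_classes) == (3, 2)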


def _FunnelResidualBlock(d_model, d_ff, n_heads,
dropout, dropout_shared_axes, mode, ff_activation,
pool_layer, pool_size, strides):
feed_forward = _FeedForwardBlock(
d_model, d_ff, dropout, dropout_shared_axes, mode, ff_activation)

dropout_ = tl.Dropout(
rate=dropout, shared_axes=dropout_shared_axes, mode=mode)

attn_ = tl.AttentionQKV(d_model, n_heads=n_heads, dropout=dropout, mode=mode)

pooling_ = PoolLayer(pool_layer, pool_size, strides)

return [
tl.Parallel(tl.Branch(pooling_, None), None),
tl.Residual(
tl.Parallel(tl.LayerNorm(), tl.LayerNorm()),
tl.Select([0, 1, 1, 2]),
attn_,
tl.Parallel(None, MaskPool()),
dropout_
),
tl.Residual(
feed_forward
)
]


def FunnelTransformer(vocab_size,
d_model=512,
d_ff=2048,
@@ -226,7 +263,57 @@ def FunnelTransformer(vocab_size,
pool_layer=tl.AvgPool,
pool_size=(2,),
separate_cls=True):
"""Returns a Full Funnel Transformer.
"""Returns a Full Funnel Transformer, that can be used for example for BERT.

This model outputs token-level categorical distributions over all vocab:

- input: rank 2 tensor representing a batch of text strings via token IDs
plus padding markers; shape is (batch_size, sequence_length). The tensor
elements are integers in `range(vocab_size)`, and `0` values mark padding
positions.

- output: rank 3 tensor representing a batch of log-probability
distributions over `vocab_size` categories for each token; shape is
(batch_size, sequence_length, vocab_size).


Args:
vocab_size: Input vocabulary size -- each element of the input tensor
should be an integer in `range(vocab_size)`. These integers typically
represent token IDs from a vocabulary-based tokenizer.
d_model: Final dimension of tensors at most points in the model, including
the initial embedding output.
d_ff: Size of special dense layer in the feed-forward part of each encoder
block.
encoder_segment_lengths: Tuple, where each element denotes the number of
transformer encoder blocks preceding a funnel transformer block.
There is no funnel block after the last sequence of encoder blocks;
therefore, the total number of blocks in the model is equal to
`sum(encoder_segment_lengths) + len(encoder_segment_lengths) - 1`.
n_decoder_blocks: Number of transformer blocks in the upsampling decoder.
n_heads: Number of attention heads.
max_len: Maximum symbol length for positional encoding.
dropout: Stochastic rate (probability) for dropping an activation value
when applying dropout within an encoder block.
dropout_shared_axes: Tensor axes on which to share a dropout mask.
Sharing along batch and sequence axes (`dropout_shared_axes=(0,1)`) is
a useful way to save memory and apply consistent masks to activation
vectors at different sequence positions.
mode: If `'train'`, each encoder block will include dropout; else, it will
pass all values through unaltered.
ff_activation: Type of activation function at the end of each encoder
block; must be an activation-type subclass of `Layer`.
pool_layer: Type of pooling layer used for downsampling in each of the
funnel blocks; should be `tl.AvgPool` or `tl.MaxPool`.
pool_size: Shape of window that gets reduced to a single vector value.
If the layer inputs are :math:`n`-dimensional arrays, then `pool_size`
must be a tuple of length :math:`n-2`.
separate_cls: If `True`, pooling in funnel blocks is not applied to
embeddings of the first token (`cls` from the BERT paper), and only the final
embedding of this token is used for categorization; the rest are
discarded. If `False`, the first token is pooled like every other token,
and all final embeddings are averaged and mapped to output categories as in
the original `TransformerEncoder` model.
"""
assert encoder_segment_lengths

@@ -281,4 +368,6 @@ def FunnelTransformer(vocab_size,
decoder_blocks,
tl.Select([0], n_in=2), # vecs
tl.LayerNorm(),
tl.Dense(vocab_size),
tl.LogSoftmax()
)
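
The updated shape test below exercises this new `tl.Dense(vocab_size)` + `tl.LogSoftmax()` head; condensed, the check looks roughly like this (batch size and sequence length are assumed values mirroring the test):

import numpy as np

from trax import shapes
from trax.models.research.funnel_transformer import FunnelTransformer

vocab_size, d_model, batch_size, n_tokens = 7, 8, 2, 8
model = FunnelTransformer(vocab_size, d_model=d_model, d_ff=8,
                          encoder_segment_lengths=(1, 1),
                          n_decoder_blocks=1, n_heads=2, max_len=8)
x = np.ones((batch_size, n_tokens), dtype=np.int32)
model.init(shapes.signature(x))
y = model(x)
# The decoder upsamples back to the input length, so there is one distribution per token.
assert y.shape == (batch_size, n_tokens, vocab_size)
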
11 changes: 6 additions & 5 deletions trax/models/research/funnel_transformer_test.py
@@ -22,7 +22,7 @@

from trax import layers as tl, shapes
from trax.models.research.funnel_transformer import PoolLayer, \
_FunnelResidualBlock, FunnelTransformerEncoder, FunnelTransformer, MaskPool
_FunnelBlock, FunnelTransformerEncoder, FunnelTransformer, MaskPool


class FunnelTransformerTest(parameterized.TestCase):
@@ -62,8 +62,8 @@ def test_funnel_block_forward_shape(self):
mask = masker(mask)

block = tl.Serial(
_FunnelResidualBlock(d_model, 8, 2, 0.1, None, 'train', tl.Relu,
tl.AvgPool, (2,), (2,)))
_FunnelBlock(d_model, 8, 2, 0.1, None, 'train', tl.Relu,
tl.AvgPool, (2,), (2,), separate_cls=True))

xs = [x, mask]
_, _ = block.init(shapes.signature(xs))
@@ -88,7 +88,8 @@ def test_funnel_transformer_encoder_forward_shape(self):

def test_funnel_transformer_forward_shape(self):
d_model = 8
model = FunnelTransformer(2, d_model=d_model, d_ff=8,
vocab_size = 7
model = FunnelTransformer(7, d_model=d_model, d_ff=8,
encoder_segment_lengths=(1, 1),
n_decoder_blocks=1, n_heads=2, max_len=8)

@@ -98,7 +99,7 @@ def test_funnel_transformer_forward_shape(self):
_ = model.init(shapes.signature(x))
y = model(x)

self.assertEqual(y.shape, (batch_size, n_tokens, d_model))
self.assertEqual(y.shape, (batch_size, n_tokens, vocab_size))


if __name__ == '__main__':