diff --git a/data/coco/README.md b/data/coco/README.md
new file mode 100644
index 00000000000..9d077035512
--- /dev/null
+++ b/data/coco/README.md
@@ -0,0 +1,24 @@
+For details about the Microsoft COCO ("Common Objects in Context") dataset [1],
+visit mscoco.org. This README provides instructions for downloading and
+installing the tools and dataset.
+
+1) Download and extract the COCO Python tools by running:
+
+    ./download_tools.sh
+
+2) Install the tools, and optionally download the data, by running:
+
+    cd coco/PythonAPI
+    python setup.py install  # follow prompts to download or skip data
+
+3) Download the train/val/test splits by running:
+
+    ./get_coco2014_aux.sh
+
+(or see the COCO README (tools/README) for more information).
+
+
+[1] Tsung-Yi Lin, Michael Maire, Serge Belongie, James Hays, Pietro Perona,
+    Deva Ramanan, Piotr Dollár, and C. Lawrence Zitnick.
+    "Microsoft COCO: Common Objects in Context."
+    arXiv preprint arXiv:1405.0312 (2014).
diff --git a/data/coco/download_eval_tools.sh b/data/coco/download_eval_tools.sh
new file mode 100755
index 00000000000..3ceadacd7f6
--- /dev/null
+++ b/data/coco/download_eval_tools.sh
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+
+# change to directory $DIR where this script is stored
+pushd .
+DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
+cd "$DIR"
+
+OUTFILE=coco_caption_eval.zip
+wget --no-check-certificate https://github.com/jeffdonahue/coco-caption/archive/master.zip -O $OUTFILE
+unzip $OUTFILE
+mv coco-caption-master coco-caption-eval
+
+# change back to original working directory
+popd
+
+echo "Downloaded COCO evaluation tools to: $DIR/coco-caption-eval"
diff --git a/data/coco/download_tools.sh b/data/coco/download_tools.sh
new file mode 100755
index 00000000000..4a1c4ccbe54
--- /dev/null
+++ b/data/coco/download_tools.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+
+# change to directory $DIR where this script is stored
+pushd .
+DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
+cd "$DIR"
+
+git clone https://github.com/pdollar/coco.git
+
+# change back to original working directory
+popd
+
+echo "Cloned COCO tools to: $DIR/coco"
+echo "To set up COCO tools (and optionally download data), run:"
+echo "    cd $DIR/coco"
+echo "    python setup.py install"
+echo "and follow the prompts."
diff --git a/data/coco/get_coco2014_aux.sh b/data/coco/get_coco2014_aux.sh
new file mode 100755
index 00000000000..6ab23612139
--- /dev/null
+++ b/data/coco/get_coco2014_aux.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+#
+# Downloads Andrej Karpathy's train/val/test splits of COCO2014 as text files.
+
+# change to directory $DIR where this script is stored
+pushd .
+DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
+cd "$DIR"
+
+FILENAME=coco2014_aux.tar.gz
+
+echo "Downloading..."
+
+wget http://dl.caffe.berkeleyvision.org/$FILENAME
+
+echo "Unzipping to $DIR"
+
+tar -xf $FILENAME && rm -f $FILENAME
+
+echo "Done."
+
+# change back to original working directory
+popd
diff --git a/data/coco/make_test.py b/data/coco/make_test.py
new file mode 100755
index 00000000000..7e546b575d4
--- /dev/null
+++ b/data/coco/make_test.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python
+
+# This file is only meant to be run as a script with 0 arguments,
+# and depends on steps 1-3 of README.md.
+#
+# It creates a dummy caption annotation file from the image filenames of the
+# test set (the test images have no ground truth captions).
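+#
+# A sketch of sanity-checking the generated file with the COCO tools from
+# step 2 of the README (illustrative only; the import path mirrors the one
+# used in examples/coco_caption/coco_to_hdf5_data.py):
+#   import sys
+#   sys.path.append('data/coco/coco/PythonAPI/build/lib/pycocotools')
+#   from coco import COCO
+#   anno = COCO('data/coco/coco/annotations/captions_test2014.json')
+#   print '%d images, %d dummy captions' % (len(anno.imgs), len(anno.anns))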
+
+import json
+import os
+import re
+
+# get path to directory where this script is
+script_dir = os.path.dirname(os.path.realpath(__file__))
+
+set_name = 'test2014'
+image_root = '%s/coco/images/%s' % (script_dir, set_name)
+out_filename = '%s/coco/annotations/captions_%s.json' % (script_dir, set_name)
+image_ext = 'jpg'
+imname_re = re.compile('COCO_%s_(?P<image_id>\d+)\.%s' % (set_name, image_ext))
+full_image_ext = '.%s' % image_ext
+image_filenames = filter(lambda f: f.endswith(full_image_ext), os.listdir(image_root))
+print 'Creating dummy annotation file for %d images at: %s' % \
+    (len(image_filenames), out_filename)
+
+out_data = {'type': 'captions', 'images': [], 'annotations': [],
+            'licenses': [], 'info': {}}
+for index, filename in enumerate(image_filenames):
+  match = imname_re.match(filename)
+  if match is None: raise Exception('Unsupported filename: %s' % filename)
+  image_id = int(match.group('image_id'))
+  out_data['images'].append({'file_name': filename, 'id': image_id})
+  for dummy_index in range(2):
+    # 2 * index + dummy_index gives each dummy annotation a unique id
+    annotation = {'caption': 'dummy caption %d' % dummy_index,
+                  'id': 2 * index + dummy_index, 'image_id': image_id}
+    out_data['annotations'].append(annotation)
+with open(out_filename, 'w') as out_file:
+  json.dump(out_data, out_file)
diff --git a/data/coco/make_trainval.py b/data/coco/make_trainval.py
new file mode 100755
index 00000000000..0dea42622e8
--- /dev/null
+++ b/data/coco/make_trainval.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python
+
+# This file is only meant to be run as a script with 0 arguments,
+# and depends on steps 1-3 of README.md.
+#
+# It creates a "trainval" set by combining the COCO 2014 train and val sets.
+# The trainval set is intended for use only when training a single final model
+# for submission of results on the test set to the COCO evaluation server.
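+#
+# A quick sanity check after running this script (a sketch; paths assume the
+# repository root as the working directory, matching the constants below):
+#   import json
+#   data = json.load(open('data/coco/coco/annotations/captions_trainval2014.json'))
+#   ids = open('data/coco/coco2014_cocoid.trainval.txt').read().split()
+#   assert len(ids) == len(data['images'])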
+ +import os +import json + +# get path to directory where this script is +script_dir = os.path.dirname(os.path.realpath(__file__)) + +anno_dir_path = '%s/coco/annotations' % script_dir +image_root = '%s/coco/images' % script_dir +abs_image_root = os.path.abspath(image_root) +out_coco_id_filename = '%s/coco2014_cocoid.trainval.txt' % script_dir +filename_pattern = 'captions_%s2014.json' +in_sets = ['train', 'val'] +out_set = 'trainval' +path_pattern = '%s/%s' % (anno_dir_path, filename_pattern) + +out_data = {} +for in_set in in_sets: + filename = path_pattern % in_set + print 'Loading input dataset from: %s' % filename + data = json.load(open(filename, 'r')) + for key, val in data.iteritems(): + if type(val) == list: + if key not in out_data: + out_data[key] = [] + out_data[key] += val + else: + if key not in out_data: + out_data[key] = val + assert out_data[key] == val +filename = path_pattern % out_set +print 'Dumping output dataset to: %s' % filename +json.dump(out_data, open(filename, 'w')) + +out_ids = [str(im['id']) for im in out_data['images']] +print 'Writing COCO IDs to: %s' % out_coco_id_filename +with open(out_coco_id_filename, 'w') as coco_id_file: + coco_id_file.write('\n'.join(out_ids) + '\n') + +# make a trainval dir with symlinks to all train+val images +out_dir = '%s/%s2014' % (image_root, out_set) +os.makedirs(out_dir) +print 'Writing image symlinks to: %s' % out_dir +for im in out_data['images']: + filename = im['file_name'] + set_name = None + for in_set in in_sets: + if in_set in filename: + set_name = in_set + break + assert set_name is not None + real_path = '%s/%s2014/%s' % (abs_image_root, set_name, filename) + link_path = '%s/%s' % (out_dir, filename) + os.symlink(real_path, link_path) diff --git a/examples/coco_caption/.gitignore b/examples/coco_caption/.gitignore new file mode 100644 index 00000000000..e040331b7f2 --- /dev/null +++ b/examples/coco_caption/.gitignore @@ -0,0 +1 @@ +h5_data/ diff --git a/examples/coco_caption/Caffe language model.ipynb b/examples/coco_caption/Caffe language model.ipynb new file mode 100644 index 00000000000..30d2c494a39 --- /dev/null +++ b/examples/coco_caption/Caffe language model.ipynb @@ -0,0 +1,617 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 132, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import random\n", + "\n", + "import sys\n", + "sys.path.append('./python')\n", + "import caffe\n", + "\n", + "sys.path.append('./examples/coco_caption')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r\n", + "a\r\n", + "on\r\n", + "of\r\n", + "the\r\n", + "in\r\n", + "with\r\n", + "and\r\n", + "is\r\n", + "man\r\n" + ] + } + ], + "source": [ + "!head examples/coco_caption/h5_data/buffer_100/vocabulary.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "8801\n" + ] + } + ], + "source": [ + "vocabulary = [''] + [line.strip() for line in\n", + " open('examples/coco_caption/h5_data/buffer_100/vocabulary.txt').readlines()]\n", + "print len(vocabulary)" + ] + }, + { + "cell_type": "code", + 
"execution_count": 59, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(1, 1, 8801)\n" + ] + } + ], + "source": [ + "iter_num = 110000\n", + "net = caffe.Net('./examples/coco_caption/lstm_lm.deploy.prototxt',\n", + " './examples/coco_caption/lstm_lm_iter_%d.caffemodel' % iter_num, caffe.TEST)\n", + "print net.blobs['probs'].data.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def predict_single_word(net, previous_word, output='probs'):\n", + " cont = 0 if previous_word == 0 else 1\n", + " cont_input = np.array([cont])\n", + " word_input = np.array([previous_word])\n", + " net.forward(cont_sentence=cont_input, input_sentence=word_input)\n", + " output_preds = net.blobs[output].data[0, 0, :]\n", + " return output_preds" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "first_word_dist = predict_single_word(net, 0)" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "top_preds = np.argsort(-1 * first_word_dist)" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 2 14 5 13 64 77 30 18 93 142]\n", + "['a', 'two', 'the', 'an', 'there', 'three', 'some', 'people', 'several', 'this']\n" + ] + } + ], + "source": [ + "print top_preds[:10]\n", + "print [vocabulary[index] for index in top_preds[:10]]" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['people', 'men', 'women', 'giraffes', 'zebras', 'young', 'cats', 'elephants', 'horses', 'children']\n" + ] + } + ], + "source": [ + "second_word_dist = predict_single_word(net, vocabulary.index('two'))\n", + "print [vocabulary[index] for index in np.argsort(-1 * second_word_dist)[:10]]" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['standing', 'are', 'in', 'stand', 'walking', 'and', 'eating', 'that', 'walk', 'with']\n" + ] + } + ], + "source": [ + "third_word_dist = predict_single_word(net, vocabulary.index('giraffes'))\n", + "print [vocabulary[index] for index in np.argsort(-1 * second_word_dist)[:10]]" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['leaves', 'from', 'grass', 'hay', 'out', 'some', 'in', 'food', 'off', 'a']\n" + ] + } + ], + "source": [ + "third_word_dist = predict_single_word(net, vocabulary.index('eating'))\n", + "print [vocabulary[index] for index in np.argsort(-1 * second_word_dist)[:10]]" + ] + }, + { + "cell_type": "code", + "execution_count": 136, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def softmax(softmax_inputs, temp):\n", + " shifted_inputs = softmax_inputs - softmax_inputs.max()\n", + " exp_outputs = np.exp(temp * shifted_inputs)\n", + " exp_outputs_sum = exp_outputs.sum()\n", + " if np.isnan(exp_outputs_sum):\n", + " return exp_outputs * float('nan')\n", + " assert exp_outputs_sum > 0\n", + " if 
np.isinf(exp_outputs_sum):\n", + " return np.zeros_like(exp_outputs)\n", + " eps_sum = 1e-20\n", + " return exp_outputs / max(exp_outputs_sum, eps_sum)\n", + "\n", + "def random_choice_from_probs(softmax_inputs, temp=1):\n", + " # temperature of infinity == take the max\n", + " if temp == float('inf'):\n", + " return np.argmax(softmax_inputs)\n", + " probs = softmax(softmax_inputs, temp)\n", + " r = random.random()\n", + " cum_sum = 0.\n", + " for i, p in enumerate(probs):\n", + " cum_sum += p\n", + " if cum_sum >= r: return i\n", + " return 1 # return UNK?" + ] + }, + { + "cell_type": "code", + "execution_count": 120, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def generate_sentence(net, temp=float('inf'), output='predict', max_words=50):\n", + " cont_input = np.array([0])\n", + " word_input = np.array([0])\n", + " sentence = []\n", + " while len(sentence) < max_words and (not sentence or sentence[-1] != 0):\n", + " net.forward(cont_sentence=cont_input, input_sentence=word_input)\n", + " output_preds = net.blobs[output].data[0, 0, :]\n", + " sentence.append(random_choice_from_probs(output_preds, temp=temp))\n", + " cont_input[0] = 1\n", + " word_input[0] = sentence[-1]\n", + " return sentence" + ] + }, + { + "cell_type": "code", + "execution_count": 121, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2, 10, 9, 16, 6, 2, 35, 7, 2, 118, 0]\n", + "['a', 'man', 'is', 'standing', 'in', 'a', 'field', 'with', 'a', 'frisbee', '']\n" + ] + } + ], + "source": [ + "sentence = generate_sentence(net)\n", + "print sentence\n", + "print [vocabulary[index] for index in sentence]" + ] + }, + { + "cell_type": "code", + "execution_count": 122, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2, 10, 9, 16, 6, 2, 35, 7, 2, 118, 0]\n", + "['a', 'man', 'is', 'standing', 'in', 'a', 'field', 'with', 'a', 'frisbee', '']\n" + ] + } + ], + "source": [ + "sentence = generate_sentence(net)\n", + "print sentence\n", + "print [vocabulary[index] for index in sentence]" + ] + }, + { + "cell_type": "code", + "execution_count": 137, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2, 22, 9, 294, 7, 2, 178, 113, 11, 87, 905, 0]\n", + "['a', 'woman', 'is', 'posing', 'with', 'a', 'cell', 'phone', 'to', 'her', 'ear', '']\n" + ] + } + ], + "source": [ + "sentence = generate_sentence(net, temp=1.0)\n", + "print sentence\n", + "print [vocabulary[index] for index in sentence]" + ] + }, + { + "cell_type": "code", + "execution_count": 138, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2, 28, 26, 2, 38, 209, 3, 2, 38, 152, 0]\n", + "['a', 'person', 'holding', 'a', 'tennis', 'racket', 'on', 'a', 'tennis', 'court', '']\n" + ] + } + ], + "source": [ + "sentence = generate_sentence(net, temp=1.0)\n", + "print sentence\n", + "print [vocabulary[index] for index in sentence]" + ] + }, + { + "cell_type": "code", + "execution_count": 139, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2, 10, 26, 2, 38, 363, 3, 2, 38, 152, 0]\n", + "['a', 'man', 'holding', 'a', 'tennis', 'racquet', 'on', 'a', 'tennis', 'court', '']\n" + ] + } + ], + "source": [ + "sentence = generate_sentence(net, temp=1.5)\n", + "print sentence\n", + "print 
[vocabulary[index] for index in sentence]" + ] + }, + { + "cell_type": "code", + "execution_count": 140, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2, 33, 4, 18, 12, 106, 2, 23, 7, 60, 0]\n", + "['a', 'group', 'of', 'people', 'sitting', 'around', 'a', 'table', 'with', 'food', '']\n" + ] + } + ], + "source": [ + "sentence = generate_sentence(net, temp=1.5)\n", + "print sentence\n", + "print [vocabulary[index] for index in sentence]" + ] + }, + { + "cell_type": "code", + "execution_count": 141, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2, 10, 6, 2, 261, 8, 217, 16, 6, 2, 43, 0]\n", + "['a', 'man', 'in', 'a', 'suit', 'and', 'tie', 'standing', 'in', 'a', 'room', '']\n" + ] + } + ], + "source": [ + "sentence = generate_sentence(net, temp=3.0)\n", + "print sentence\n", + "print [vocabulary[index] for index in sentence]" + ] + }, + { + "cell_type": "code", + "execution_count": 142, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2, 10, 26, 2, 38, 363, 3, 2, 38, 152, 0]\n", + "['a', 'man', 'holding', 'a', 'tennis', 'racquet', 'on', 'a', 'tennis', 'court', '']\n" + ] + } + ], + "source": [ + "sentence = generate_sentence(net, temp=3.0)\n", + "print sentence\n", + "print [vocabulary[index] for index in sentence]" + ] + }, + { + "cell_type": "code", + "execution_count": 143, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2, 10, 9, 16, 6, 2, 35, 7, 2, 118, 0]\n", + "['a', 'man', 'is', 'standing', 'in', 'a', 'field', 'with', 'a', 'frisbee', '']\n" + ] + } + ], + "source": [ + "sentence = generate_sentence(net, temp=10.0)\n", + "print sentence\n", + "print [vocabulary[index] for index in sentence]" + ] + }, + { + "cell_type": "code", + "execution_count": 144, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1993, 1074, 86, 6, 40, 4, 2, 126, 0]\n", + "['staircase', 'laid', 'out', 'in', 'front', 'of', 'a', 'window', '']\n" + ] + } + ], + "source": [ + "sentence = generate_sentence(net, temp=1.0)\n", + "print sentence\n", + "print [vocabulary[index] for index in sentence]" + ] + }, + { + "cell_type": "code", + "execution_count": 146, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2, 28, 3, 2, 113, 46, 2, 129, 0]\n", + "['a', 'person', 'on', 'a', 'phone', 'riding', 'a', 'car', '']\n" + ] + } + ], + "source": [ + "sentence = generate_sentence(net, temp=0.8)\n", + "print sentence\n", + "print [vocabulary[index] for index in sentence]" + ] + }, + { + "cell_type": "code", + "execution_count": 147, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2, 16, 60, 6, 136, 192, 7, 641, 16, 20, 11, 27, 0]\n", + "['a', 'standing', 'food', 'in', 'each', 'hand', 'with', 'cattle', 'standing', 'next', 'to', 'it', '']\n" + ] + } + ], + "source": [ + "sentence = generate_sentence(net, temp=0.8)\n", + "print sentence\n", + "print [vocabulary[index] for index in sentence]" + ] + }, + { + "cell_type": "code", + "execution_count": 148, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[28, 236, 1042, 7, 69, 1257, 487, 1769, 
0]\n", + "['person', 'taking', 'noodles', 'with', 'other', 'homemade', 'birthday', 'cereal', '']\n" + ] + } + ], + "source": [ + "sentence = generate_sentence(net, temp=0.6)\n", + "print sentence\n", + "print [vocabulary[index] for index in sentence]" + ] + }, + { + "cell_type": "code", + "execution_count": 145, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[5623, 1087, 15, 6888, 472, 361, 8634, 8, 7241, 3, 77, 299, 935, 1296, 15, 12, 5165, 2867, 3979, 743, 4991, 4470, 640, 9, 259, 2308, 4386, 2552, 3797, 2448, 15, 3617, 5364, 4267, 4549, 8086, 176, 2529, 6434, 5445, 370, 7959, 5672, 1742, 4041, 4258, 1153, 8, 610, 2044]\n", + "['chilli', 'frosting', ',', 'medley', 'salad', 'items', 'sideboard', 'and', 'garnishes', 'on', 'three', 'colorful', 'gold', 'desserts', ',', 'sitting', 'knifes', 'need', 'workspace', 'where', 'exchanging', 'hoses', 'left', 'is', 'pink', 'clearing', 'obstacles', 'vandalized', 'idly', 'afternoon', ',', 'halloween', 'rich', 'fixed', 'aid', 'advertise', 'light', 'times', 'delicate', 'dealership', 'like', 'snowsuits', 'florida', 'than', 'ornamental', 'dr', 'curtains', 'and', 'multiple', 'electrical']\n" + ] + } + ], + "source": [ + "sentence = generate_sentence(net, temp=0.5)\n", + "print sentence\n", + "print [vocabulary[index] for index in sentence]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.9" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/examples/coco_caption/captioner.py b/examples/coco_caption/captioner.py new file mode 100644 index 00000000000..cefa44da24c --- /dev/null +++ b/examples/coco_caption/captioner.py @@ -0,0 +1,402 @@ +#!/usr/bin/env python + +from collections import OrderedDict +import h5py +import math +import matplotlib.pyplot as plt +import numpy as np +import os +import random +import sys + +sys.path.append('./python/') +import caffe + +class Captioner(): + def __init__(self, weights_path, image_net_proto, lstm_net_proto, + vocab_path, device_id=-1): + if device_id >= 0: + caffe.set_mode_gpu() + caffe.set_device(device_id) + else: + caffe.set_mode_cpu() + # Setup image processing net. + phase = caffe.TEST + self.image_net = caffe.Net(image_net_proto, weights_path, phase) + image_data_shape = self.image_net.blobs['data'].data.shape + self.transformer = caffe.io.Transformer({'data': image_data_shape}) + channel_mean = np.zeros(image_data_shape[1:]) + channel_mean_values = [104, 117, 123] + assert channel_mean.shape[0] == len(channel_mean_values) + for channel_index, mean_val in enumerate(channel_mean_values): + channel_mean[channel_index, ...] = mean_val + self.transformer.set_mean('data', channel_mean) + self.transformer.set_channel_swap('data', (2, 1, 0)) + self.transformer.set_transpose('data', (2, 0, 1)) + # Setup sentence prediction net. 
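+    # (Both nets are initialized from the same weights_path .caffemodel;
+    # index 0 of the vocabulary below is the <EOS> token, which also serves
+    # as the sentence-start input.)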
+    self.lstm_net = caffe.Net(lstm_net_proto, weights_path, phase)
+    self.vocab = ['<EOS>']
+    with open(vocab_path, 'r') as vocab_file:
+      self.vocab += [word.strip() for word in vocab_file.readlines()]
+    net_vocab_size = self.lstm_net.blobs['predict'].data.shape[2]
+    if len(self.vocab) != net_vocab_size:
+      raise Exception('Invalid vocab file: contains %d words; '
+          'net expects vocab with %d words' % (len(self.vocab), net_vocab_size))
+
+  def set_image_batch_size(self, batch_size):
+    self.image_net.blobs['data'].reshape(batch_size,
+        *self.image_net.blobs['data'].data.shape[1:])
+
+  def caption_batch_size(self):
+    return self.lstm_net.blobs['cont_sentence'].data.shape[1]
+
+  def set_caption_batch_size(self, batch_size):
+    self.lstm_net.blobs['cont_sentence'].reshape(1, batch_size)
+    self.lstm_net.blobs['input_sentence'].reshape(1, batch_size)
+    self.lstm_net.blobs['image_features'].reshape(batch_size,
+        *self.lstm_net.blobs['image_features'].data.shape[1:])
+    self.lstm_net.reshape()
+
+  def preprocess_image(self, image, verbose=False):
+    if type(image) in (str, unicode):
+      image = plt.imread(image)
+    crop_edge_ratio = (256. - 227.) / 256. / 2
+    ch = int(image.shape[0] * crop_edge_ratio + 0.5)
+    cw = int(image.shape[1] * crop_edge_ratio + 0.5)
+    cropped_image = image[ch:-ch, cw:-cw]
+    if len(cropped_image.shape) == 2:
+      cropped_image = np.tile(cropped_image[:, :, np.newaxis], (1, 1, 3))
+    preprocessed_image = self.transformer.preprocess('data', cropped_image)
+    if verbose:
+      print 'Preprocessed image has shape %s, range (%f, %f)' % \
+          (preprocessed_image.shape,
+           preprocessed_image.min(),
+           preprocessed_image.max())
+    return preprocessed_image
+
+  def preprocessed_image_to_descriptor(self, image, output_name='fc8'):
+    net = self.image_net
+    if net.blobs['data'].data.shape[0] > 1:
+      batch = np.zeros_like(net.blobs['data'].data)
+      batch[0] = image[0]
+    else:
+      batch = image
+    net.forward(data=batch)
+    descriptor = net.blobs[output_name].data[0].copy()
+    return descriptor
+
+  def image_to_descriptor(self, image, output_name='fc8'):
+    return self.preprocessed_image_to_descriptor(self.preprocess_image(image),
+        output_name=output_name)
+
+  def predict_single_word(self, descriptor, previous_word, output='probs'):
+    net = self.lstm_net
+    cont = 0 if previous_word == 0 else 1
+    cont_input = np.array([cont])
+    word_input = np.array([previous_word])
+    image_features = np.zeros_like(net.blobs['image_features'].data)
+    image_features[:] = descriptor
+    net.forward(image_features=image_features, cont_sentence=cont_input,
+        input_sentence=word_input)
+    output_preds = net.blobs[output].data[0, 0, :]
+    return output_preds
+
+  def predict_single_word_from_all_previous(self, descriptor, previous_words):
+    for word in [0] + previous_words:
+      probs = self.predict_single_word(descriptor, word)
+    return probs
+
+  # Strategy must be either 'beam' or 'sample'.
+  # If 'beam', do a max likelihood beam search with beam size
+  # strategy['beam_size'].
+  # Otherwise, draw strategy['num'] samples with temperature strategy['temp'].
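+  # Usage sketch (illustrative, not part of the class): given a descriptor
+  # from compute_descriptors() or image_to_descriptor(),
+  #   beams, beam_probs = captioner.predict_caption(
+  #       descriptor, strategy={'type': 'beam', 'beam_size': 3})
+  #   samples, sample_probs = captioner.predict_caption(
+  #       descriptor, strategy={'type': 'sample', 'num': 5, 'temp': 2.0})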
+ def predict_caption(self, descriptor, strategy={'type': 'beam'}): + assert 'type' in strategy + assert strategy['type'] in ('beam', 'sample') + if strategy['type'] == 'beam': + return self.predict_caption_beam_search(descriptor, strategy) + num_samples = strategy['num'] if 'num' in strategy else 1 + samples = [] + sample_probs = [] + for _ in range(num_samples): + sample, sample_prob = self.sample_caption(descriptor, strategy) + samples.append(sample) + sample_probs.append(sample_prob) + return samples, sample_probs + + def sample_caption(self, descriptor, strategy, + net_output='predict', max_length=50): + sentence = [] + probs = [] + eps_prob = 1e-8 + temp = strategy['temp'] if 'temp' in strategy else 1.0 + if max_length < 0: max_length = float('inf') + while len(sentence) < max_length and (not sentence or sentence[-1] != 0): + previous_word = sentence[-1] if sentence else 0 + softmax_inputs = self.predict_single_word(descriptor, previous_word, + output=net_output) + word = random_choice_from_probs(softmax_inputs, temp) + sentence.append(word) + probs.append(softmax(softmax_inputs, 1.0)[word]) + return sentence, probs + + def predict_caption_beam_search(self, descriptor, strategy, max_length=50): + orig_batch_size = self.caption_batch_size() + if orig_batch_size != 1: self.set_caption_batch_size(1) + beam_size = strategy['beam_size'] if 'beam_size' in strategy else 1 + assert beam_size >= 1 + beams = [[]] + beams_complete = 0 + beam_probs = [[]] + beam_log_probs = [0.] + while beams_complete < len(beams): + expansions = [] + for beam_index, beam_log_prob, beam in \ + zip(range(len(beams)), beam_log_probs, beams): + if beam: + previous_word = beam[-1] + if len(beam) >= max_length or previous_word == 0: + exp = {'prefix_beam_index': beam_index, 'extension': [], + 'prob_extension': [], 'log_prob': beam_log_prob} + expansions.append(exp) + # Don't expand this beam; it was already ended with an EOS, + # or is the max length. + continue + else: + previous_word = 0 # EOS is first word + if beam_size == 1: + probs = self.predict_single_word(descriptor, previous_word) + else: + probs = self.predict_single_word_from_all_previous(descriptor, beam) + assert len(probs.shape) == 1 + assert probs.shape[0] == len(self.vocab) + expansion_inds = probs.argsort()[-beam_size:] + for ind in expansion_inds: + prob = probs[ind] + extended_beam_log_prob = beam_log_prob + math.log(prob) + exp = {'prefix_beam_index': beam_index, 'extension': [ind], + 'prob_extension': [prob], 'log_prob': extended_beam_log_prob} + expansions.append(exp) + # Sort expansions in decreasing order of probability. 
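+      # (Beams are ranked by summed log probability, which is equivalent to
+      # ranking by the product of word probabilities but avoids
+      # floating-point underflow on long sentences.)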
+ expansions.sort(key=lambda expansion: -1 * expansion['log_prob']) + expansions = expansions[:beam_size] + new_beams = \ + [beams[e['prefix_beam_index']] + e['extension'] for e in expansions] + new_beam_probs = \ + [beam_probs[e['prefix_beam_index']] + e['prob_extension'] for e in expansions] + beam_log_probs = [e['log_prob'] for e in expansions] + beams_complete = 0 + for beam in new_beams: + if beam[-1] == 0 or len(beam) >= max_length: beams_complete += 1 + beams, beam_probs = new_beams, new_beam_probs + if orig_batch_size != 1: self.set_caption_batch_size(orig_batch_size) + return beams, beam_probs + + def score_caption(self, descriptor, caption, is_gt=True, caption_source='gt'): + output = {} + output['caption'] = caption + output['gt'] = is_gt + output['source'] = caption_source + output['prob'] = [] + probs = self.predict_single_word(descriptor, 0) + for word in caption: + output['prob'].append(probs[word]) + probs = self.predict_single_word(descriptor, word) + return output + + def compute_descriptors(self, image_list, output_name='fc8'): + batch = np.zeros_like(self.image_net.blobs['data'].data) + batch_shape = batch.shape + batch_size = batch_shape[0] + descriptors_shape = (len(image_list), ) + \ + self.image_net.blobs[output_name].data.shape[1:] + descriptors = np.zeros(descriptors_shape) + for batch_start_index in range(0, len(image_list), batch_size): + batch_list = image_list[batch_start_index:(batch_start_index + batch_size)] + for batch_index, image_path in enumerate(batch_list): + batch[batch_index:(batch_index + 1)] = self.preprocess_image(image_path) + current_batch_size = min(batch_size, len(image_list) - batch_start_index) + print 'Computing descriptors for images %d-%d of %d' % \ + (batch_start_index, batch_start_index + current_batch_size - 1, + len(image_list)) + self.image_net.forward(data=batch) + descriptors[batch_start_index:(batch_start_index + current_batch_size)] = \ + self.image_net.blobs[output_name].data[:current_batch_size] + return descriptors + + def score_captions(self, descriptor, captions, + output_name='probs', caption_source='gt', verbose=True): + net = self.lstm_net + cont_input = np.zeros_like(net.blobs['cont_sentence'].data) + word_input = np.zeros_like(net.blobs['input_sentence'].data) + image_features = np.zeros_like(net.blobs['image_features'].data) + batch_size = image_features.shape[0] + assert descriptor.shape == image_features.shape[1:] + for index in range(batch_size): + image_features[index] = descriptor + outputs = [] + input_data_initialized = False + for batch_start_index in range(0, len(captions), batch_size): + caption_batch = captions[batch_start_index:(batch_start_index + batch_size)] + current_batch_size = len(caption_batch) + caption_index = 0 + probs_batch = [[] for b in range(current_batch_size)] + num_done = 0 + while num_done < current_batch_size: + if caption_index == 0: + cont_input[:] = 0 + elif caption_index == 1: + cont_input[:] = 1 + for index, caption in enumerate(caption_batch): + word_input[0, index] = \ + caption['caption'][caption_index - 1] if \ + 0 < caption_index < len(caption['caption']) else 0 + if input_data_initialized: + net.forward(start="embedding", input_sentence=word_input, + cont_sentence=cont_input, image_features=image_features) + else: + net.forward(input_sentence=word_input, cont_sentence=cont_input, + image_features=image_features) + input_data_initialized = True + output_probs = net.blobs[output_name].data + for index, probs, caption in \ + zip(range(current_batch_size), probs_batch, 
caption_batch):
+          if caption_index == len(caption['caption']) - 1:
+            num_done += 1
+          if caption_index < len(caption['caption']):
+            word = caption['caption'][caption_index]
+            probs.append(output_probs[0, index, word].reshape(-1)[0])
+        if verbose:
+          print 'Computed probs for word %d of captions %d-%d (%d done)' % \
+              (caption_index, batch_start_index,
+               batch_start_index + current_batch_size - 1, num_done)
+        caption_index += 1
+      for prob, caption in zip(probs_batch, caption_batch):
+        output = {}
+        output['caption'] = caption['caption']
+        output['prob'] = prob
+        output['gt'] = True
+        output['source'] = caption_source
+        outputs.append(output)
+    return outputs
+
+  def sample_captions(self, descriptor, prob_output_name='probs',
+                      pred_output_name='predict', temp=1, max_length=50):
+    descriptor = np.array(descriptor)
+    batch_size = descriptor.shape[0]
+    self.set_caption_batch_size(batch_size)
+    net = self.lstm_net
+    cont_input = np.zeros_like(net.blobs['cont_sentence'].data)
+    word_input = np.zeros_like(net.blobs['input_sentence'].data)
+    image_features = np.zeros_like(net.blobs['image_features'].data)
+    image_features[:] = descriptor
+    outputs = []
+    output_captions = [[] for b in range(batch_size)]
+    output_probs = [[] for b in range(batch_size)]
+    caption_index = 0
+    num_done = 0
+    while num_done < batch_size and caption_index < max_length:
+      if caption_index == 0:
+        cont_input[:] = 0
+      elif caption_index == 1:
+        cont_input[:] = 1
+      if caption_index == 0:
+        word_input[:] = 0
+      else:
+        for index in range(batch_size):
+          word_input[0, index] = \
+              output_captions[index][caption_index - 1] if \
+              caption_index <= len(output_captions[index]) else 0
+      net.forward(image_features=image_features, cont_sentence=cont_input,
+          input_sentence=word_input)
+      # the softmax probs are needed below to record the probability of each
+      # sampled word
+      net_output_probs = net.blobs[prob_output_name].data[0]
+      if temp == 1.0 or temp == float('inf'):
+        samples = [
+            random_choice_from_probs(dist, temp=temp, already_softmaxed=True)
+            for dist in net_output_probs
+        ]
+      else:
+        net_output_preds = net.blobs[pred_output_name].data[0]
+        samples = [
+            random_choice_from_probs(preds, temp=temp, already_softmaxed=False)
+            for preds in net_output_preds
+        ]
+      for index, next_word_sample in enumerate(samples):
+        # If the caption is empty, or non-empty but the last word isn't EOS,
+        # predict another word.
+        if not output_captions[index] or output_captions[index][-1] != 0:
+          output_captions[index].append(next_word_sample)
+          output_probs[index].append(net_output_probs[index, next_word_sample])
+          if next_word_sample == 0: num_done += 1
+      sys.stdout.write('\r%d/%d done after word %d' %
+          (num_done, batch_size, caption_index))
+      sys.stdout.flush()
+      caption_index += 1
+    sys.stdout.write('\n')
+    return output_captions, output_probs
+
+  def sentence(self, vocab_indices):
+    sentence = ' '.join([self.vocab[i] for i in vocab_indices])
+    if not sentence: return sentence
+    sentence = sentence[0].upper() + sentence[1:]
+    # If sentence ends with ' <EOS>', remove it and replace with '.'
+    # Otherwise (doesn't end with '<EOS>' -- maybe it hit the max length):
+    # append '...'
+    suffix = ' ' + self.vocab[0]
+    if sentence.endswith(suffix):
+      sentence = sentence[:-len(suffix)] + '.'
+    else:
+      sentence += '...'
+    return sentence
+
+def softmax(softmax_inputs, temp):
+  shifted_inputs = softmax_inputs - softmax_inputs.max()
+  exp_outputs = np.exp(temp * shifted_inputs)
+  exp_outputs_sum = exp_outputs.sum()
+  if math.isnan(exp_outputs_sum):
+    return exp_outputs * float('nan')
+  assert exp_outputs_sum > 0
+  if math.isinf(exp_outputs_sum):
+    return np.zeros_like(exp_outputs)
+  eps_sum = 1e-20
+  return exp_outputs / max(exp_outputs_sum, eps_sum)
+
+def random_choice_from_probs(softmax_inputs, temp=1, already_softmaxed=False):
+  # temperature of infinity == take the max
+  if temp == float('inf'):
+    return np.argmax(softmax_inputs)
+  if already_softmaxed:
+    probs = softmax_inputs
+    assert temp == 1
+  else:
+    probs = softmax(softmax_inputs, temp)
+  r = random.random()
+  cum_sum = 0.
+  for i, p in enumerate(probs):
+    cum_sum += p
+    if cum_sum >= r: return i
+  return 1  # return UNK?
+
+def gen_stats(prob, normalizer=None):
+  stats = {}
+  stats['length'] = len(prob)
+  stats['log_p'] = 0.0
+  eps = 1e-12
+  for p in prob:
+    assert 0.0 <= p <= 1.0
+    stats['log_p'] += math.log(max(eps, p))
+  stats['log_p_word'] = stats['log_p'] / stats['length']
+  stats['p'] = math.exp(stats['log_p'])
+  stats['p_word'] = math.exp(stats['log_p_word'])
+  try:
+    stats['perplex'] = math.exp(-stats['log_p'])
+  except OverflowError:
+    stats['perplex'] = float('inf')
+  try:
+    stats['perplex_word'] = math.exp(-stats['log_p_word'])
+  except OverflowError:
+    stats['perplex_word'] = float('inf')
+  if normalizer is not None:
+    norm_stats = gen_stats(normalizer)
+    stats['normed_perplex'] = stats['perplex'] / norm_stats['perplex']
+    stats['normed_perplex_word'] = \
+        stats['perplex_word'] / norm_stats['perplex_word']
+  return stats
diff --git a/examples/coco_caption/coco_to_hdf5_data.py b/examples/coco_caption/coco_to_hdf5_data.py
new file mode 100755
index 00000000000..300e4748061
--- /dev/null
+++ b/examples/coco_caption/coco_to_hdf5_data.py
@@ -0,0 +1,278 @@
+#!/usr/bin/env python
+
+from hashlib import sha1
+import os
+import random
+random.seed(3)
+import re
+import sys
+
+sys.path.append('./examples/coco_caption/')
+
+COCO_PATH = './data/coco/coco'
+COCO_TOOL_PATH = '%s/PythonAPI/build/lib/pycocotools' % COCO_PATH
+COCO_IMAGE_ROOT = '%s/images' % COCO_PATH
+
+MAX_HASH = 100000
+
+sys.path.append(COCO_TOOL_PATH)
+from coco import COCO
+
+from hdf5_sequence_generator import SequenceGenerator, HDF5SequenceWriter
+
+# UNK_IDENTIFIER is the word used to identify unknown words
+UNK_IDENTIFIER = '<unk>'
+
+SENTENCE_SPLIT_REGEX = re.compile(r'(\W+)')
+def split_sentence(sentence):
+  # break sentence into a list of words and punctuation
+  sentence = [s.lower() for s in SENTENCE_SPLIT_REGEX.split(sentence.strip()) if len(s.strip()) > 0]
+  # remove the '.'
from the end of the sentence + if sentence[-1] != '.': + # print "Warning: sentence doesn't end with '.'; ends with: %s" % sentence[-1] + return sentence + return sentence[:-1] + +MAX_WORDS = 20 + +class CocoSequenceGenerator(SequenceGenerator): + def __init__(self, coco, batch_num_streams, image_root, vocab=None, + max_words=MAX_WORDS, align=True, shuffle=True, gt_captions=True, + pad=True, truncate=True, split_ids=None): + self.max_words = max_words + num_empty_lines = 0 + self.images = [] + num_total = 0 + num_missing = 0 + num_captions = 0 + known_images = {} + self.coco = coco + if split_ids is None: + split_ids = coco.imgs.keys() + self.image_path_to_id = {} + for image_id in split_ids: + image_info = coco.imgs[image_id] + image_path = '%s/%s' % (image_root, image_info['file_name']) + self.image_path_to_id[image_path] = image_id + if os.path.isfile(image_path): + assert image_id not in known_images # no duplicates allowed + known_images[image_id] = {} + known_images[image_id]['path'] = image_path + if gt_captions: + known_images[image_id]['sentences'] = [split_sentence(anno['caption']) + for anno in coco.imgToAnns[image_id]] + num_captions += len(known_images[image_id]['sentences']) + else: + known_images[image_id]['sentences'] = [] + else: + num_missing += 1 + print 'Warning (#%d): image not found: %s' % (num_missing, image_path) + num_total += 1 + print '%d/%d images missing' % (num_missing, num_total) + if vocab is None: + self.init_vocabulary(known_images) + else: + self.vocabulary_inverted = vocab + self.vocabulary = {} + for index, word in enumerate(self.vocabulary_inverted): + self.vocabulary[word] = index + self.image_sentence_pairs = [] + num_no_sentences = 0 + for image_filename, metadata in known_images.iteritems(): + if not metadata['sentences']: + num_no_sentences += 1 + print 'Warning (#%d): image with no sentences: %s' % (num_no_sentences, image_filename) + for sentence in metadata['sentences']: + self.image_sentence_pairs.append((metadata['path'], sentence)) + self.index = 0 + self.num_resets = 0 + self.num_truncates = 0 + self.num_pads = 0 + self.num_outs = 0 + self.image_list = [] + SequenceGenerator.__init__(self) + self.batch_num_streams = batch_num_streams + # make the number of image/sentence pairs a multiple of the buffer size + # so each timestep of each batch is useful and we can align the images + if align: + num_pairs = len(self.image_sentence_pairs) + remainder = num_pairs % batch_num_streams + if remainder > 0: + num_needed = batch_num_streams - remainder + for i in range(num_needed): + choice = random.randint(0, num_pairs - 1) + self.image_sentence_pairs.append(self.image_sentence_pairs[choice]) + assert len(self.image_sentence_pairs) % batch_num_streams == 0 + if shuffle: + random.shuffle(self.image_sentence_pairs) + self.pad = pad + self.truncate = truncate + self.negative_one_padded_streams = frozenset(('input_sentence', 'target_sentence')) + + def streams_exhausted(self): + return self.num_resets > 0 + + def init_vocabulary(self, image_annotations, min_count=5): + words_to_count = {} + for image_id, annotations in image_annotations.iteritems(): + for annotation in annotations['sentences']: + for word in annotation: + word = word.strip() + if word not in words_to_count: + words_to_count[word] = 0 + words_to_count[word] += 1 + # Sort words by count, then alphabetically + words_by_count = sorted(words_to_count.keys(), key=lambda w: (-words_to_count[w], w)) + print 'Initialized vocabulary with %d words; top 10 words:' % len(words_by_count) + for word in 
words_by_count[:10]: + print '\t%s (%d)' % (word, words_to_count[word]) + # Add words to vocabulary + self.vocabulary = {UNK_IDENTIFIER: 0} + self.vocabulary_inverted = [UNK_IDENTIFIER] + for index, word in enumerate(words_by_count): + word = word.strip() + if words_to_count[word] < min_count: + break + self.vocabulary_inverted.append(word) + self.vocabulary[word] = index + 1 + print 'Final vocabulary (restricted to words with counts of %d+) has %d words' % \ + (min_count, len(self.vocabulary)) + + def dump_vocabulary(self, vocab_filename): + print 'Dumping vocabulary to file: %s' % vocab_filename + with open(vocab_filename, 'wb') as vocab_file: + for word in self.vocabulary_inverted: + vocab_file.write('%s\n' % word) + print 'Done.' + + def dump_image_file(self, image_filename, dummy_image_filename=None): + print 'Dumping image list to file: %s' % image_filename + with open(image_filename, 'wb') as image_file: + for image_path, _ in self.image_list: + image_file.write('%s\n' % image_path) + if dummy_image_filename is not None: + print 'Dumping image list with dummy labels to file: %s' % dummy_image_filename + with open(dummy_image_filename, 'wb') as image_file: + for path_and_hash in self.image_list: + image_file.write('%s %d\n' % path_and_hash) + print 'Done.' + + def next_line(self): + num_lines = float(len(self.image_sentence_pairs)) + self.index += 1 + if self.index == 1 or self.index == num_lines or self.index % 10000 == 0: + print 'Processed %d/%d (%f%%) lines' % (self.index, num_lines, + 100 * self.index / num_lines) + if self.index == num_lines: + self.index = 0 + self.num_resets += 1 + + def line_to_stream(self, sentence): + stream = [] + for word in sentence: + word = word.strip() + if word in self.vocabulary: + stream.append(self.vocabulary[word]) + else: # unknown word; append UNK + stream.append(self.vocabulary[UNK_IDENTIFIER]) + # increment the stream -- 0 will be the EOS character + stream = [s + 1 for s in stream] + return stream + + def get_pad_value(self, stream_name): + return -1 if stream_name in self.negative_one_padded_streams else 0 + + def get_streams(self): + image_filename, line = self.image_sentence_pairs[self.index] + stream = self.line_to_stream(line) + pad = self.max_words - (len(stream) + 1) if self.pad else 0 + if pad > 0: self.num_pads += 1 + self.num_outs += 1 + out = {} + out['stage_indicators'] = [1] * (len(stream) + 1) + [0] * pad + out['cont_sentence'] = [0] + [1] * len(stream) + [0] * pad + out['input_sentence'] = [0] + stream + [-1] * pad + out['target_sentence'] = stream + [0] + [-1] * pad + truncated = False + if self.truncate: + for key, val in out.iteritems(): + if len(val) > self.max_words: + out[key] = val[:self.max_words] + truncated = True + self.num_truncates += truncated + image_hash = self.image_hash(image_filename) + out['hashed_image_path'] = [image_hash] * len(out['input_sentence']) + self.image_list.append((image_filename, image_hash)) + self.next_line() + return out + + def image_hash(self, filename): + image_hash = int(sha1(filename).hexdigest(), 16) % MAX_HASH + assert image_hash == float(image_hash) + return image_hash + +COCO_ANNO_PATH = '%s/annotations/captions_%%s2014.json' % COCO_PATH +COCO_IMAGE_PATTERN = '%s/images/%%s2014' % COCO_PATH +COCO_IMAGE_ID_PATTERN = 'COCO_%s2014_%%012d.jpg' + +BUFFER_SIZE = 100 +OUTPUT_DIR = './examples/coco_caption/h5_data/buffer_%d' % BUFFER_SIZE +SPLITS_PATTERN = './data/coco/coco2014_cocoid.%s.txt' +OUTPUT_DIR_PATTERN = '%s/%%s_batches' % OUTPUT_DIR + +def process_dataset(split_name, 
coco_split_name, batch_stream_length, + vocab=None, aligned=True): + with open(SPLITS_PATTERN % split_name, 'r') as split_file: + split_image_ids = [int(line) for line in split_file.readlines()] + output_dataset_name = split_name + if aligned: + output_dataset_name += '_aligned_%d' % MAX_WORDS + else: + output_dataset_name += '_unaligned' + output_path = OUTPUT_DIR_PATTERN % output_dataset_name + coco = COCO(COCO_ANNO_PATH % coco_split_name) + image_root = COCO_IMAGE_PATTERN % coco_split_name + sg = CocoSequenceGenerator(coco, BUFFER_SIZE, image_root, + split_ids=split_image_ids, vocab=vocab, align=aligned, pad=aligned, + truncate=aligned) + sg.batch_stream_length = batch_stream_length + writer = HDF5SequenceWriter(sg, output_dir=output_path) + writer.write_to_exhaustion() + writer.write_filelists() + if vocab is None: + vocab_out_path = '%s/vocabulary.txt' % OUTPUT_DIR + sg.dump_vocabulary(vocab_out_path) + image_out_path = '%s/image_list.txt' % output_path + image_dummy_labels_out_path = '%s/image_list.with_dummy_labels.txt' % output_path + sg.dump_image_file(image_out_path, image_dummy_labels_out_path) + num_outs = sg.num_outs + num_pads = sg.num_pads + num_truncates = sg.num_truncates + print 'Padded %d/%d sequences; truncated %d/%d sequences' % \ + (num_pads, num_outs, num_truncates, num_outs) + return sg.vocabulary_inverted + +def process_coco(include_trainval=False): + vocab = None + datasets = [ + ('train', 'train', 100000, True), + ('val', 'val', 100000, True), + ('test', 'val', 100000, True), + # Write unaligned datasets as well: + ('train', 'train', 100000, False), + ('val', 'val', 100000, False), + ('test', 'val', 100000, False), + ] + # Also create a 'trainval' set if include_trainval is set. + # ./data/coco/make_trainval.py must have been run for this to work. + if include_trainval: + datasets += [ + ('trainval', 'trainval', 100000, True), + ('trainval', 'trainval', 100000, False), + ] + for split_name, coco_split_name, batch_stream_length, aligned in datasets: + vocab = process_dataset(split_name, coco_split_name, batch_stream_length, + vocab=vocab, aligned=aligned) + +if __name__ == "__main__": + process_coco(include_trainval=False) diff --git a/examples/coco_caption/finetune_lrcn.sh b/examples/coco_caption/finetune_lrcn.sh new file mode 100755 index 00000000000..0e948bc6726 --- /dev/null +++ b/examples/coco_caption/finetune_lrcn.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +GPU_ID=0 +WEIGHTS=\ +./examples/coco_caption/lrcn_iter_110000.caffemodel +DATA_DIR=./examples/coco_caption/h5_data/ +if [ ! -d $DATA_DIR ]; then + echo "Data directory not found: $DATA_DIR" + echo "First, download the COCO dataset (follow instructions in data/coco)" + echo "Then, run ./examples/coco_caption/coco_to_hdf5_data.py to create the Caffe input data" + exit 1 +fi + +./build/tools/caffe train \ + -solver ./examples/coco_caption/lrcn_finetune_solver.prototxt \ + -weights $WEIGHTS \ + -gpu $GPU_ID diff --git a/examples/coco_caption/finetune_lrcn.trainval.sh b/examples/coco_caption/finetune_lrcn.trainval.sh new file mode 100755 index 00000000000..4fd19b4763b --- /dev/null +++ b/examples/coco_caption/finetune_lrcn.trainval.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +GPU_ID=0 +WEIGHTS=\ +./examples/coco_caption/lrcn_finetune_iter_50000.caffemodel +DATA_DIR=./examples/coco_caption/h5_data/ +if [ ! 
-d $DATA_DIR ]; then
+  echo "Data directory not found: $DATA_DIR"
+  echo "First, download the COCO dataset (follow instructions in data/coco)"
+  echo "Then, run ./examples/coco_caption/coco_to_hdf5_data.py to create the Caffe input data"
+  exit 1
+fi
+
+./build/tools/caffe train \
+    -solver ./examples/coco_caption/lrcn_finetune_solver.trainval.prototxt \
+    -weights $WEIGHTS \
+    -gpu $GPU_ID
diff --git a/examples/coco_caption/finetune_lrcn.vgg.sh b/examples/coco_caption/finetune_lrcn.vgg.sh
new file mode 100755
index 00000000000..85c7b5ebfde
--- /dev/null
+++ b/examples/coco_caption/finetune_lrcn.vgg.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+
+GPU_ID=0
+WEIGHTS=\
+./examples/coco_caption/lrcn_vgg_iter_90000.caffemodel
+DATA_DIR=./examples/coco_caption/h5_data/
+if [ ! -d $DATA_DIR ]; then
+  echo "Data directory not found: $DATA_DIR"
+  echo "First, download the COCO dataset (follow instructions in data/coco)"
+  echo "Then, run ./examples/coco_caption/coco_to_hdf5_data.py to create the Caffe input data"
+  exit 1
+fi
+
+./build/tools/caffe train \
+    -solver ./examples/coco_caption/lrcn_finetune_solver.vgg.prototxt \
+    -weights $WEIGHTS \
+    -gpu $GPU_ID
diff --git a/examples/coco_caption/finetune_lrcn.vgg.trainval.sh b/examples/coco_caption/finetune_lrcn.vgg.trainval.sh
new file mode 100755
index 00000000000..8b230c908fe
--- /dev/null
+++ b/examples/coco_caption/finetune_lrcn.vgg.trainval.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+
+GPU_ID=0
+WEIGHTS=\
+./examples/coco_caption/lrcn_finetune_vgg_iter_50000.caffemodel
+DATA_DIR=./examples/coco_caption/h5_data/
+if [ ! -d $DATA_DIR ]; then
+  echo "Data directory not found: $DATA_DIR"
+  echo "First, download the COCO dataset (follow instructions in data/coco)"
+  echo "Then, run ./examples/coco_caption/coco_to_hdf5_data.py to create the Caffe input data"
+  exit 1
+fi
+
+./build/tools/caffe train \
+    -solver ./examples/coco_caption/lrcn_finetune_solver.vgg.trainval.prototxt \
+    -weights $WEIGHTS \
+    -gpu $GPU_ID
diff --git a/examples/coco_caption/hdf5_sequence_generator.py b/examples/coco_caption/hdf5_sequence_generator.py
new file mode 100644
index 00000000000..98d4657b6bf
--- /dev/null
+++ b/examples/coco_caption/hdf5_sequence_generator.py
@@ -0,0 +1,132 @@
+#!/usr/bin/env python
+
+import h5py
+import numpy as np
+import os
+import random
+import sys
+
+class SequenceGenerator():
+  def __init__(self):
+    self.dimension = 10
+    self.batch_stream_length = 2000
+    self.batch_num_streams = 8
+    self.min_stream_length = 13
+    self.max_stream_length = 17
+    self.substream_names = None
+    self.streams_initialized = False
+
+  def streams_exhausted(self):
+    return False
+
+  def init_streams(self):
+    self.streams = [None] * self.batch_num_streams
+    self.stream_indices = [0] * self.batch_num_streams
+    self.reset_stream(0)
+    self.streams_initialized = True
+
+  def reset_stream(self, stream_index):
+    streams = self.get_streams()
+    stream_names = sorted(streams.keys())
+    if self.substream_names is None:
+      assert len(stream_names) > 0
+      self.substream_names = stream_names
+    assert self.substream_names == stream_names
+    if self.streams[stream_index] is None:
+      self.streams[stream_index] = {}
+    stream_length = len(streams[stream_names[0]])
+    for k, v in streams.iteritems():
+      assert stream_length == len(v)
+      self.streams[stream_index][k] = v
+    self.stream_indices[stream_index] = 0
+
+  # Pad with zeroes by default -- override this to pad with something else
+  # for a particular stream
+  def get_pad_value(self, stream_name):
+    return 0
+
+  def get_next_batch(self, 
truncate_at_exhaustion=True): + if not self.streams_initialized: + self.init_streams() + batch_size = self.batch_num_streams * self.batch_stream_length + batch = {} + batch_indicators = np.zeros((self.batch_stream_length, self.batch_num_streams)) + for name in self.substream_names: + batch[name] = self.get_pad_value(name) * np.ones_like(batch_indicators) + exhausted = [False] * self.batch_num_streams + all_exhausted = False + reached_exhaustion = False + num_completed_streams = 0 + for t in range(self.batch_stream_length): + all_exhausted = True + for i in range(self.batch_num_streams): + if not exhausted[i]: + if self.streams[i] is None or \ + self.stream_indices[i] == len(self.streams[i][self.substream_names[0]]): + self.stream_indices[i] = 0 + reached_exhaustion = reached_exhaustion or self.streams_exhausted() + if reached_exhaustion: exhausted[i] = True + if not reached_exhaustion or not truncate_at_exhaustion: + self.reset_stream(i) + else: + continue + for name in self.substream_names: + batch[name][t, i] = self.streams[i][name][self.stream_indices[i]] + batch_indicators[t, i] = 0 if self.stream_indices[i] == 0 else 1 + self.stream_indices[i] += 1 + if self.stream_indices[i] == len(self.streams[i][self.substream_names[0]]): + num_completed_streams += 1 + if not exhausted[i]: all_exhausted = False + if all_exhausted and truncate_at_exhaustion: + print ('Exhausted all data; cutting off batch at timestep %d ' + + 'with %d streams completed') % (t, num_completed_streams) + for name in self.substream_names: + batch[name] = batch[name][:t, :] + batch_indicators = batch_indicators[:t, :] + break + return batch, batch_indicators + + def get_streams(self): + raise Exception('get_streams should be overridden to return a dict ' + + 'of equal-length iterables.') + +class HDF5SequenceWriter(): + def __init__(self, sequence_generator, output_dir=None, verbose=False): + self.generator = sequence_generator + assert output_dir is not None # required + self.output_dir = output_dir + if os.path.exists(output_dir): + raise Exception('Output directory already exists: ' + output_dir) + os.makedirs(output_dir) + self.verbose = verbose + self.filenames = [] + + def write_batch(self, stop_at_exhaustion=False): + batch_comps, cont_indicators = self.generator.get_next_batch() + batch_index = len(self.filenames) + filename = '%s/batch_%d.h5' % (self.output_dir, batch_index) + self.filenames.append(filename) + h5file = h5py.File(filename, 'w') + dataset = h5file.create_dataset('cont', shape=cont_indicators.shape, dtype=cont_indicators.dtype) + dataset[:] = cont_indicators + dataset = h5file.create_dataset('buffer_size', shape=(1,), dtype=np.int) + dataset[:] = self.generator.batch_num_streams + for key, batch in batch_comps.iteritems(): + if self.verbose: + for s in range(self.generator.batch_num_streams): + stream = np.array(self.generator.streams[s][key]) + print 'batch %d, stream %s, index %d: ' % (batch_index, key, s), stream + h5dataset = h5file.create_dataset(key, shape=batch.shape, dtype=batch.dtype) + h5dataset[:] = batch + h5file.close() + + def write_to_exhaustion(self): + while not self.generator.streams_exhausted(): + self.write_batch(stop_at_exhaustion=True) + + def write_filelists(self): + assert self.filenames is not None + filelist_filename = '%s/hdf5_chunk_list.txt' % self.output_dir + with open(filelist_filename, 'w') as listfile: + for filename in self.filenames: + listfile.write('%s\n' % filename) diff --git a/examples/coco_caption/lrcn.prototxt b/examples/coco_caption/lrcn.prototxt new file 
mode 100644 index 00000000000..5a8c55443e3 --- /dev/null +++ b/examples/coco_caption/lrcn.prototxt @@ -0,0 +1,808 @@ +# The network is used for the image captioning experiments of LRCN [1]. +# Please consider citing LRCN [1] if you use this example in your work. +# +# [1] J. Donahue, L. A. Hendricks, S. Guadarrama, M. Rohrbach, S. Venugopalan, +# K. Saenko, T. Darrell. "Long-term Recurrent Convolutional Networks for +# Visual Recognition and Description." arXiv preprint arXiv:1411.4389 (2014). + +name: "lrcn_caffenet_to_lstm" + +# train data layers +layer { + name: "data" + type: "ImageData" + top: "data" + top: "label" + include { phase: TRAIN not_stage: 'trainval' } + transform_param { + mirror: true + crop_size: 227 + mean_value: 104 + mean_value: 117 + mean_value: 123 + } + image_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/train_aligned_20_batches/image_list.with_dummy_labels.txt" + batch_size: 100 + new_height: 256 + new_width: 256 + } +} +layer { + name: "data" + type: "HDF5Data" + top: "cont_sentence" + top: "input_sentence" + top: "target_sentence" + include { phase: TRAIN not_stage: 'trainval' } + hdf5_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/train_aligned_20_batches/hdf5_chunk_list.txt" + batch_size: 20 + } +} + +# trainval data layers (for finetuning final model) +layer { + name: "data" + type: "ImageData" + top: "data" + top: "label" + include { phase: TRAIN stage: 'trainval' } + transform_param { + mirror: true + crop_size: 227 + mean_value: 104 + mean_value: 117 + mean_value: 123 + } + image_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/trainval_aligned_20_batches/image_list.with_dummy_labels.txt" + batch_size: 100 + new_height: 256 + new_width: 256 + } +} +layer { + name: "data" + type: "HDF5Data" + top: "cont_sentence" + top: "input_sentence" + top: "target_sentence" + include { phase: TRAIN stage: 'trainval' } + hdf5_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/trainval_aligned_20_batches/hdf5_chunk_list.txt" + batch_size: 20 + } +} + +# test on train data layers +layer { + name: "data" + type: "ImageData" + top: "data" + top: "label" + include { + phase: TEST + stage: "test-on-train" + } + transform_param { + mirror: true + crop_size: 227 + mean_value: 104 + mean_value: 117 + mean_value: 123 + } + image_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/train_aligned_20_batches/image_list.with_dummy_labels.txt" + batch_size: 100 + new_height: 256 + new_width: 256 + } +} +layer { + name: "data" + type: "HDF5Data" + top: "cont_sentence" + top: "input_sentence" + top: "target_sentence" + include { + phase: TEST + stage: "test-on-train" + } + hdf5_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/train_aligned_20_batches/hdf5_chunk_list.txt" + batch_size: 20 + } +} + +# test on val data layers +layer { + name: "data" + type: "ImageData" + top: "data" + top: "label" + include { + phase: TEST + stage: "test-on-val" + } + transform_param { + mirror: true + crop_size: 227 + mean_value: 104 + mean_value: 117 + mean_value: 123 + } + image_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/val_aligned_20_batches/image_list.with_dummy_labels.txt" + batch_size: 100 + new_height: 256 + new_width: 256 + } +} +layer { + name: "data" + type: "HDF5Data" + top: "cont_sentence" + top: "input_sentence" + top: "target_sentence" + include { + phase: TEST + stage: "test-on-val" + } + hdf5_data_param { + source: 
"./examples/coco_caption/h5_data/buffer_100/val_aligned_20_batches/hdf5_chunk_list.txt" + batch_size: 20 + } +} + +layer { + name: "silence" + type: "Silence" + bottom: "label" +} +layer { + name: "conv1" + type: "Convolution" + bottom: "data" + top: "conv1" + param { + lr_mult: 0 + } + param { + lr_mult: 0 + } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 96 + kernel_size: 11 + stride: 4 + } +} +layer { + name: "conv1" + type: "Convolution" + bottom: "data" + top: "conv1" + param { + lr_mult: 0.1 + decay_mult: 1 + } + param { + lr_mult: 0.2 + decay_mult: 0 + } + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 96 + kernel_size: 11 + stride: 4 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu1" + type: "ReLU" + bottom: "conv1" + top: "conv1" +} +layer { + name: "pool1" + type: "Pooling" + bottom: "conv1" + top: "pool1" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "norm1" + type: "LRN" + bottom: "pool1" + top: "norm1" + lrn_param { + local_size: 5 + alpha: 0.0001 + beta: 0.75 + } +} +layer { + name: "conv2" + type: "Convolution" + bottom: "norm1" + top: "conv2" + param { + lr_mult: 0 + } + param { + lr_mult: 0 + } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 256 + pad: 2 + kernel_size: 5 + group: 2 + } +} +layer { + name: "conv2" + type: "Convolution" + bottom: "norm1" + top: "conv2" + param { + lr_mult: 0.1 + decay_mult: 1 + } + param { + lr_mult: 0.2 + decay_mult: 0 + } + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 256 + pad: 2 + kernel_size: 5 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 1 + } + } +} +layer { + name: "relu2" + type: "ReLU" + bottom: "conv2" + top: "conv2" +} +layer { + name: "pool2" + type: "Pooling" + bottom: "conv2" + top: "pool2" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "norm2" + type: "LRN" + bottom: "pool2" + top: "norm2" + lrn_param { + local_size: 5 + alpha: 0.0001 + beta: 0.75 + } +} +layer { + name: "conv3" + type: "Convolution" + bottom: "norm2" + top: "conv3" + param { + lr_mult: 0 + } + param { + lr_mult: 0 + } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv3" + type: "Convolution" + bottom: "norm2" + top: "conv3" + param { + lr_mult: 0.1 + decay_mult: 1 + } + param { + lr_mult: 0.2 + decay_mult: 0 + } + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu3" + type: "ReLU" + bottom: "conv3" + top: "conv3" +} +layer { + name: "conv4" + type: "Convolution" + bottom: "conv3" + top: "conv4" + param { + lr_mult: 0 + } + param { + lr_mult: 0 + } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + group: 2 + } +} +layer { + name: "conv4" + type: "Convolution" + bottom: "conv3" + top: "conv4" + param { + lr_mult: 0.1 + decay_mult: 1 + } + param { + lr_mult: 0.2 + decay_mult: 0 + } + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 1 + } + } +} +layer { + name: 
"relu4" + type: "ReLU" + bottom: "conv4" + top: "conv4" +} +layer { + name: "conv5" + type: "Convolution" + bottom: "conv4" + top: "conv5" + param { + lr_mult: 0 + } + param { + lr_mult: 0 + } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + group: 2 + } +} +layer { + name: "conv5" + type: "Convolution" + bottom: "conv4" + top: "conv5" + param { + lr_mult: 0.1 + decay_mult: 1 + } + param { + lr_mult: 0.2 + decay_mult: 0 + } + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 1 + } + } +} +layer { + name: "relu5" + type: "ReLU" + bottom: "conv5" + top: "conv5" +} +layer { + name: "pool5" + type: "Pooling" + bottom: "conv5" + top: "pool5" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "fc6" + type: "InnerProduct" + bottom: "pool5" + top: "fc6" + param { + lr_mult: 0 + } + param { + lr_mult: 0 + } + include { stage: "freeze-convnet" } + inner_product_param { + num_output: 4096 + } +} +layer { + name: "fc6" + type: "InnerProduct" + bottom: "pool5" + top: "fc6" + param { + lr_mult: 0.1 + decay_mult: 1 + } + param { + lr_mult: 0.2 + decay_mult: 0 + } + exclude { stage: "freeze-convnet" } + inner_product_param { + num_output: 4096 + weight_filler { + type: "gaussian" + std: 0.005 + } + bias_filler { + type: "constant" + value: 1 + } + } +} +layer { + name: "relu6" + type: "ReLU" + bottom: "fc6" + top: "fc6" +} +layer { + name: "drop6" + type: "Dropout" + bottom: "fc6" + top: "fc6" + dropout_param { + dropout_ratio: 0.5 + } +} +layer { + name: "fc7" + type: "InnerProduct" + bottom: "fc6" + top: "fc7" + param { + lr_mult: 0 + } + param { + lr_mult: 0 + } + include { stage: "freeze-convnet" } + inner_product_param { + num_output: 4096 + } +} +layer { + name: "fc7" + type: "InnerProduct" + bottom: "fc6" + top: "fc7" + param { + lr_mult: 0.1 + decay_mult: 1 + } + param { + lr_mult: 0.2 + decay_mult: 0 + } + exclude { stage: "freeze-convnet" } + inner_product_param { + num_output: 4096 + weight_filler { + type: "gaussian" + std: 0.005 + } + bias_filler { + type: "constant" + value: 1 + } + } +} +layer { + name: "relu7" + type: "ReLU" + bottom: "fc7" + top: "fc7" +} +layer { + name: "drop7" + type: "Dropout" + bottom: "fc7" + top: "fc7" + dropout_param { + dropout_ratio: 0.5 + } +} +layer { + name: "fc8" + type: "InnerProduct" + bottom: "fc7" + top: "fc8" + param { + lr_mult: 0.1 + decay_mult: 1 + } + param { + lr_mult: 0.2 + decay_mult: 0 + } + inner_product_param { + num_output: 1000 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "embedding" + type: "Embed" + bottom: "input_sentence" + top: "embedded_input_sentence" + param { + lr_mult: 1 + } + embed_param { + bias_term: false + input_dim: 8801 + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + } +} +layer { + name: "lstm1" + type: "LSTM" + bottom: "embedded_input_sentence" + bottom: "cont_sentence" + bottom: "fc8" + top: "lstm1" + include { stage: "unfactored" } + recurrent_param { + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "lstm2" + type: "LSTM" + bottom: "lstm1" + bottom: "cont_sentence" + top: "lstm2" + include { + stage: "unfactored" + stage: "2-layer" + } + 
recurrent_param { + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "lstm1" + type: "LSTM" + bottom: "embedded_input_sentence" + bottom: "cont_sentence" + top: "lstm1" + include { stage: "factored" } + recurrent_param { + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "lstm2" + type: "LSTM" + bottom: "lstm1" + bottom: "cont_sentence" + bottom: "fc8" + top: "lstm2" + include { stage: "factored" } + recurrent_param { + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "predict" + type: "InnerProduct" + bottom: "lstm1" + top: "predict" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + exclude { stage: "2-layer" } + inner_product_param { + num_output: 8801 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + axis: 2 + } +} +layer { + name: "predict" + type: "InnerProduct" + bottom: "lstm2" + top: "predict" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + include { stage: "2-layer" } + inner_product_param { + num_output: 8801 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + axis: 2 + } +} +layer { + name: "cross_entropy_loss" + type: "SoftmaxWithLoss" + bottom: "predict" + bottom: "target_sentence" + top: "cross_entropy_loss" + loss_weight: 20 + loss_param { + ignore_label: -1 + } + softmax_param { + axis: 2 + } +} +layer { + name: "accuracy" + type: "Accuracy" + bottom: "predict" + bottom: "target_sentence" + top: "accuracy" + include { phase: TEST } + accuracy_param { + axis: 2 + ignore_label: -1 + } +} diff --git a/examples/coco_caption/lrcn.vgg.buffer_50.prototxt b/examples/coco_caption/lrcn.vgg.buffer_50.prototxt new file mode 100644 index 00000000000..4091a6f7785 --- /dev/null +++ b/examples/coco_caption/lrcn.vgg.buffer_50.prototxt @@ -0,0 +1,940 @@ +# The network is used for the image captioning experiments of LRCN [1]. +# Please consider citing LRCN [1] if you use this example in your work. +# +# [1] J. Donahue, L. A. Hendricks, S. Guadarrama, M. Rohrbach, S. Venugopalan, +# K. Saenko, T. Darrell. "Long-term Recurrent Convolutional Networks for +# Visual Recognition and Description." arXiv preprint arXiv:1411.4389 (2014). 
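lrcn.prototxt above is an "all-in-one" net: each layer carries include/exclude rules over named stages ('trainval', 'test-on-train', 'test-on-val', 'freeze-convnet', 'factored'/'unfactored', '2-layer'), and the solver's train_state/test_state pick one concrete architecture out of it. Caffe's Net::FilterNet applies these rules in C++; the sketch below restates the selection logic in Python (not the actual code path; level-based rules are omitted) so a stage combination can be checked against the prototxt:

    from caffe.proto import caffe_pb2
    from google.protobuf import text_format

    def rule_met(rule, phase, stages):
        # a NetStateRule matches if the phase agrees, every required stage is
        # present, and no excluded (not_stage) stage is present
        if rule.HasField('phase') and rule.phase != phase:
            return False
        if any(s not in stages for s in rule.stage):
            return False
        return not any(s in stages for s in rule.not_stage)

    def layer_kept(layer, phase, stages):
        # with include rules, keep the layer if any rule matches;
        # otherwise keep it unless an exclude rule matches
        if layer.include:
            return any(rule_met(r, phase, stages) for r in layer.include)
        return not any(rule_met(r, phase, stages) for r in layer.exclude)

    net = caffe_pb2.NetParameter()
    text_format.Merge(open('./examples/coco_caption/lrcn.prototxt').read(), net)
    stages = ['freeze-convnet', 'factored', '2-layer']
    kept = [l.name for l in net.layer if layer_kept(l, caffe_pb2.TRAIN, stages)]
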
+ +name: "lrcn_caffenet_to_lstm" + +# train data layers +layer { + name: "data" + type: "ImageData" + top: "data" + top: "label" + include { phase: TRAIN not_stage: 'trainval' } + transform_param { + mirror: true + crop_size: 224 + mean_value: 104 + mean_value: 117 + mean_value: 123 + } + image_data_param { + source: "./examples/coco_caption/h5_data/buffer_50/train_aligned_20_batches/image_list.with_dummy_labels.txt" + batch_size: 50 + new_height: 256 + new_width: 256 + } +} +layer { + name: "data" + type: "HDF5Data" + top: "cont_sentence" + top: "input_sentence" + top: "target_sentence" + include { phase: TRAIN not_stage: 'trainval' } + hdf5_data_param { + source: "./examples/coco_caption/h5_data/buffer_50/train_aligned_20_batches/hdf5_chunk_list.txt" + batch_size: 20 + } +} + +# trainval data layers (for finetuning final model) +layer { + name: "data" + type: "ImageData" + top: "data" + top: "label" + include { phase: TRAIN stage: 'trainval' } + transform_param { + mirror: true + crop_size: 224 + mean_value: 104 + mean_value: 117 + mean_value: 123 + } + image_data_param { + source: "./examples/coco_caption/h5_data/buffer_50/trainval_aligned_20_batches/image_list.with_dummy_labels.txt" + batch_size: 50 + new_height: 256 + new_width: 256 + } +} +layer { + name: "data" + type: "HDF5Data" + top: "cont_sentence" + top: "input_sentence" + top: "target_sentence" + include { phase: TRAIN stage: 'trainval' } + hdf5_data_param { + source: "./examples/coco_caption/h5_data/buffer_50/trainval_aligned_20_batches/hdf5_chunk_list.txt" + batch_size: 20 + } +} + +# test on train data layers +layer { + name: "data" + type: "ImageData" + top: "data" + top: "label" + include { + phase: TEST + stage: "test-on-train" + } + transform_param { + mirror: true + crop_size: 224 + mean_value: 104 + mean_value: 117 + mean_value: 123 + } + image_data_param { + source: "./examples/coco_caption/h5_data/buffer_50/train_aligned_20_batches/image_list.with_dummy_labels.txt" + batch_size: 50 + new_height: 256 + new_width: 256 + } +} +layer { + name: "data" + type: "HDF5Data" + top: "cont_sentence" + top: "input_sentence" + top: "target_sentence" + include { + phase: TEST + stage: "test-on-train" + } + hdf5_data_param { + source: "./examples/coco_caption/h5_data/buffer_50/train_aligned_20_batches/hdf5_chunk_list.txt" + batch_size: 20 + } +} + +# test on val data layers +layer { + name: "data" + type: "ImageData" + top: "data" + top: "label" + include { + phase: TEST + stage: "test-on-val" + } + transform_param { + mirror: true + crop_size: 224 + mean_value: 104 + mean_value: 117 + mean_value: 123 + } + image_data_param { + source: "./examples/coco_caption/h5_data/buffer_50/val_aligned_20_batches/image_list.with_dummy_labels.txt" + batch_size: 50 + new_height: 256 + new_width: 256 + } +} +layer { + name: "data" + type: "HDF5Data" + top: "cont_sentence" + top: "input_sentence" + top: "target_sentence" + include { + phase: TEST + stage: "test-on-val" + } + hdf5_data_param { + source: "./examples/coco_caption/h5_data/buffer_50/val_aligned_20_batches/hdf5_chunk_list.txt" + batch_size: 20 + } +} + +layer { + name: "silence" + type: "Silence" + bottom: "label" +} +layer { + name: "conv1_1" + type: "Convolution" + bottom: "data" + top: "conv1_1" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 64 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv1_1" + type: "Convolution" + bottom: "data" + top: "conv1_1" + param { lr_mult: 0.1 } + param { 
lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 64 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu1_1" + type: "ReLU" + bottom: "conv1_1" + top: "conv1_1" +} +layer { + name: "conv1_2" + type: "Convolution" + bottom: "conv1_1" + top: "conv1_2" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 64 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv1_2" + type: "Convolution" + bottom: "conv1_1" + top: "conv1_2" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 64 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu1_2" + type: "ReLU" + bottom: "conv1_2" + top: "conv1_2" +} +layer { + name: "pool1" + type: "Pooling" + bottom: "conv1_2" + top: "pool1" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv2_1" + type: "Convolution" + bottom: "pool1" + top: "conv2_1" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv2_1" + type: "Convolution" + bottom: "pool1" + top: "conv2_1" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu2_1" + type: "ReLU" + bottom: "conv2_1" + top: "conv2_1" +} +layer { + name: "conv2_2" + type: "Convolution" + bottom: "conv2_1" + top: "conv2_2" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv2_2" + type: "Convolution" + bottom: "conv2_1" + top: "conv2_2" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu2_2" + type: "ReLU" + bottom: "conv2_2" + top: "conv2_2" +} +layer { + name: "pool2" + type: "Pooling" + bottom: "conv2_2" + top: "pool2" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv3_1" + type: "Convolution" + bottom: "pool2" + top: "conv3_1" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv3_1" + type: "Convolution" + bottom: "pool2" + top: "conv3_1" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu3_1" + type: "ReLU" + bottom: "conv3_1" + top: "conv3_1" +} +layer { + name: "conv3_2" + type: "Convolution" + bottom: "conv3_1" + top: "conv3_2" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv3_2" + type: "Convolution" + bottom: "conv3_1" + top: "conv3_2" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu3_2" + type: "ReLU" + bottom: "conv3_2" + top: "conv3_2" +} +layer { + name: "conv3_3" + type: "Convolution" + bottom: 
"conv3_2" + top: "conv3_3" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv3_3" + type: "Convolution" + bottom: "conv3_2" + top: "conv3_3" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu3_3" + type: "ReLU" + bottom: "conv3_3" + top: "conv3_3" +} +layer { + name: "pool3" + type: "Pooling" + bottom: "conv3_3" + top: "pool3" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv4_1" + type: "Convolution" + bottom: "pool3" + top: "conv4_1" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv4_1" + type: "Convolution" + bottom: "pool3" + top: "conv4_1" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu4_1" + type: "ReLU" + bottom: "conv4_1" + top: "conv4_1" +} +layer { + name: "conv4_2" + type: "Convolution" + bottom: "conv4_1" + top: "conv4_2" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv4_2" + type: "Convolution" + bottom: "conv4_1" + top: "conv4_2" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu4_2" + type: "ReLU" + bottom: "conv4_2" + top: "conv4_2" +} +layer { + name: "conv4_3" + type: "Convolution" + bottom: "conv4_2" + top: "conv4_3" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv4_3" + type: "Convolution" + bottom: "conv4_2" + top: "conv4_3" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu4_3" + type: "ReLU" + bottom: "conv4_3" + top: "conv4_3" +} +layer { + name: "pool4" + type: "Pooling" + bottom: "conv4_3" + top: "pool4" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv5_1" + type: "Convolution" + bottom: "pool4" + top: "conv5_1" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv5_1" + type: "Convolution" + bottom: "pool4" + top: "conv5_1" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu5_1" + type: "ReLU" + bottom: "conv5_1" + top: "conv5_1" +} +layer { + name: "conv5_2" + type: "Convolution" + bottom: "conv5_1" + top: "conv5_2" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv5_2" + type: "Convolution" + bottom: "conv5_1" + top: 
"conv5_2" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu5_2" + type: "ReLU" + bottom: "conv5_2" + top: "conv5_2" +} +layer { + name: "conv5_3" + type: "Convolution" + bottom: "conv5_2" + top: "conv5_3" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv5_3" + type: "Convolution" + bottom: "conv5_2" + top: "conv5_3" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu5_3" + type: "ReLU" + bottom: "conv5_3" + top: "conv5_3" +} +layer { + name: "pool5" + type: "Pooling" + bottom: "conv5_3" + top: "pool5" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "fc6" + type: "InnerProduct" + bottom: "pool5" + top: "fc6" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + inner_product_param { + num_output: 4096 + } +} +layer { + name: "fc6" + type: "InnerProduct" + bottom: "pool5" + top: "fc6" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + inner_product_param { + num_output: 4096 + } +} +layer { + name: "relu6" + type: "ReLU" + bottom: "fc6" + top: "fc6" +} +layer { + name: "drop6" + type: "Dropout" + bottom: "fc6" + top: "fc6" + dropout_param { + dropout_ratio: 0.5 + } +} +layer { + name: "fc7" + type: "InnerProduct" + bottom: "fc6" + top: "fc7" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + inner_product_param { + num_output: 4096 + } +} +layer { + name: "fc7" + type: "InnerProduct" + bottom: "fc6" + top: "fc7" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + inner_product_param { + num_output: 4096 + } +} +layer { + name: "relu7" + type: "ReLU" + bottom: "fc7" + top: "fc7" +} +layer { + name: "drop7" + type: "Dropout" + bottom: "fc7" + top: "fc7" + dropout_param { + dropout_ratio: 0.5 + } +} +layer { + name: "fc8" + type: "InnerProduct" + bottom: "fc7" + top: "fc8" + param { + lr_mult: 0.1 + decay_mult: 1 + } + param { + lr_mult: 0.2 + decay_mult: 0 + } + inner_product_param { + num_output: 1000 + } +} +layer { + name: "embedding" + type: "Embed" + bottom: "input_sentence" + top: "embedded_input_sentence" + param { + lr_mult: 1 + } + embed_param { + bias_term: false + input_dim: 8801 + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + } +} +layer { + name: "lstm1" + type: "LSTM" + bottom: "embedded_input_sentence" + bottom: "cont_sentence" + bottom: "fc8" + top: "lstm1" + include { stage: "unfactored" } + recurrent_param { + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "lstm2" + type: "LSTM" + bottom: "lstm1" + bottom: "cont_sentence" + top: "lstm2" + include { + stage: "unfactored" + stage: "2-layer" + } + recurrent_param { + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "lstm1" + type: "LSTM" + bottom: "embedded_input_sentence" + bottom: "cont_sentence" + top: "lstm1" + include 
{ stage: "factored" } + recurrent_param { + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "lstm2" + type: "LSTM" + bottom: "lstm1" + bottom: "cont_sentence" + bottom: "fc8" + top: "lstm2" + include { stage: "factored" } + recurrent_param { + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "predict" + type: "InnerProduct" + bottom: "lstm1" + top: "predict" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + exclude { stage: "2-layer" } + inner_product_param { + num_output: 8801 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + axis: 2 + } +} +layer { + name: "predict" + type: "InnerProduct" + bottom: "lstm2" + top: "predict" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + include { stage: "2-layer" } + inner_product_param { + num_output: 8801 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + axis: 2 + } +} +layer { + name: "cross_entropy_loss" + type: "SoftmaxWithLoss" + bottom: "predict" + bottom: "target_sentence" + top: "cross_entropy_loss" + loss_weight: 20 + loss_param { + ignore_label: -1 + } + softmax_param { + axis: 2 + } +} +layer { + name: "accuracy" + type: "Accuracy" + bottom: "predict" + bottom: "target_sentence" + top: "accuracy" + include { phase: TEST } + accuracy_param { + axis: 2 + ignore_label: -1 + } +} diff --git a/examples/coco_caption/lrcn.vgg.prototxt b/examples/coco_caption/lrcn.vgg.prototxt new file mode 100644 index 00000000000..db914ff0888 --- /dev/null +++ b/examples/coco_caption/lrcn.vgg.prototxt @@ -0,0 +1,940 @@ +# The network is used for the image captioning experiments of LRCN [1]. +# Please consider citing LRCN [1] if you use this example in your work. +# +# [1] J. Donahue, L. A. Hendricks, S. Guadarrama, M. Rohrbach, S. Venugopalan, +# K. Saenko, T. Darrell. "Long-term Recurrent Convolutional Networks for +# Visual Recognition and Description." arXiv preprint arXiv:1411.4389 (2014). 
+ +name: "lrcn_caffenet_to_lstm" + +# train data layers +layer { + name: "data" + type: "ImageData" + top: "data" + top: "label" + include { phase: TRAIN not_stage: 'trainval' } + transform_param { + mirror: true + crop_size: 224 + mean_value: 104 + mean_value: 117 + mean_value: 123 + } + image_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/train_aligned_20_batches/image_list.with_dummy_labels.txt" + batch_size: 100 + new_height: 256 + new_width: 256 + } +} +layer { + name: "data" + type: "HDF5Data" + top: "cont_sentence" + top: "input_sentence" + top: "target_sentence" + include { phase: TRAIN not_stage: 'trainval' } + hdf5_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/train_aligned_20_batches/hdf5_chunk_list.txt" + batch_size: 20 + } +} + +# trainval data layers (for finetuning final model) +layer { + name: "data" + type: "ImageData" + top: "data" + top: "label" + include { phase: TRAIN stage: 'trainval' } + transform_param { + mirror: true + crop_size: 224 + mean_value: 104 + mean_value: 117 + mean_value: 123 + } + image_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/trainval_aligned_20_batches/image_list.with_dummy_labels.txt" + batch_size: 100 + new_height: 256 + new_width: 256 + } +} +layer { + name: "data" + type: "HDF5Data" + top: "cont_sentence" + top: "input_sentence" + top: "target_sentence" + include { phase: TRAIN stage: 'trainval' } + hdf5_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/trainval_aligned_20_batches/hdf5_chunk_list.txt" + batch_size: 20 + } +} + +# test on train data layers +layer { + name: "data" + type: "ImageData" + top: "data" + top: "label" + include { + phase: TEST + stage: "test-on-train" + } + transform_param { + mirror: true + crop_size: 224 + mean_value: 104 + mean_value: 117 + mean_value: 123 + } + image_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/train_aligned_20_batches/image_list.with_dummy_labels.txt" + batch_size: 100 + new_height: 256 + new_width: 256 + } +} +layer { + name: "data" + type: "HDF5Data" + top: "cont_sentence" + top: "input_sentence" + top: "target_sentence" + include { + phase: TEST + stage: "test-on-train" + } + hdf5_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/train_aligned_20_batches/hdf5_chunk_list.txt" + batch_size: 20 + } +} + +# test on val data layers +layer { + name: "data" + type: "ImageData" + top: "data" + top: "label" + include { + phase: TEST + stage: "test-on-val" + } + transform_param { + mirror: true + crop_size: 224 + mean_value: 104 + mean_value: 117 + mean_value: 123 + } + image_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/val_aligned_20_batches/image_list.with_dummy_labels.txt" + batch_size: 100 + new_height: 256 + new_width: 256 + } +} +layer { + name: "data" + type: "HDF5Data" + top: "cont_sentence" + top: "input_sentence" + top: "target_sentence" + include { + phase: TEST + stage: "test-on-val" + } + hdf5_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/val_aligned_20_batches/hdf5_chunk_list.txt" + batch_size: 20 + } +} + +layer { + name: "silence" + type: "Silence" + bottom: "label" +} +layer { + name: "conv1_1" + type: "Convolution" + bottom: "data" + top: "conv1_1" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 64 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv1_1" + type: "Convolution" + bottom: "data" + top: "conv1_1" + param { lr_mult: 0.1 } + 
param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 64 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu1_1" + type: "ReLU" + bottom: "conv1_1" + top: "conv1_1" +} +layer { + name: "conv1_2" + type: "Convolution" + bottom: "conv1_1" + top: "conv1_2" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 64 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv1_2" + type: "Convolution" + bottom: "conv1_1" + top: "conv1_2" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 64 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu1_2" + type: "ReLU" + bottom: "conv1_2" + top: "conv1_2" +} +layer { + name: "pool1" + type: "Pooling" + bottom: "conv1_2" + top: "pool1" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv2_1" + type: "Convolution" + bottom: "pool1" + top: "conv2_1" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv2_1" + type: "Convolution" + bottom: "pool1" + top: "conv2_1" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu2_1" + type: "ReLU" + bottom: "conv2_1" + top: "conv2_1" +} +layer { + name: "conv2_2" + type: "Convolution" + bottom: "conv2_1" + top: "conv2_2" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv2_2" + type: "Convolution" + bottom: "conv2_1" + top: "conv2_2" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu2_2" + type: "ReLU" + bottom: "conv2_2" + top: "conv2_2" +} +layer { + name: "pool2" + type: "Pooling" + bottom: "conv2_2" + top: "pool2" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv3_1" + type: "Convolution" + bottom: "pool2" + top: "conv3_1" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv3_1" + type: "Convolution" + bottom: "pool2" + top: "conv3_1" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu3_1" + type: "ReLU" + bottom: "conv3_1" + top: "conv3_1" +} +layer { + name: "conv3_2" + type: "Convolution" + bottom: "conv3_1" + top: "conv3_2" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv3_2" + type: "Convolution" + bottom: "conv3_1" + top: "conv3_2" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu3_2" + type: "ReLU" + bottom: "conv3_2" + top: "conv3_2" +} +layer { + name: "conv3_3" + type: "Convolution" + 
bottom: "conv3_2" + top: "conv3_3" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv3_3" + type: "Convolution" + bottom: "conv3_2" + top: "conv3_3" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu3_3" + type: "ReLU" + bottom: "conv3_3" + top: "conv3_3" +} +layer { + name: "pool3" + type: "Pooling" + bottom: "conv3_3" + top: "pool3" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv4_1" + type: "Convolution" + bottom: "pool3" + top: "conv4_1" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv4_1" + type: "Convolution" + bottom: "pool3" + top: "conv4_1" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu4_1" + type: "ReLU" + bottom: "conv4_1" + top: "conv4_1" +} +layer { + name: "conv4_2" + type: "Convolution" + bottom: "conv4_1" + top: "conv4_2" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv4_2" + type: "Convolution" + bottom: "conv4_1" + top: "conv4_2" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu4_2" + type: "ReLU" + bottom: "conv4_2" + top: "conv4_2" +} +layer { + name: "conv4_3" + type: "Convolution" + bottom: "conv4_2" + top: "conv4_3" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv4_3" + type: "Convolution" + bottom: "conv4_2" + top: "conv4_3" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu4_3" + type: "ReLU" + bottom: "conv4_3" + top: "conv4_3" +} +layer { + name: "pool4" + type: "Pooling" + bottom: "conv4_3" + top: "pool4" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv5_1" + type: "Convolution" + bottom: "pool4" + top: "conv5_1" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv5_1" + type: "Convolution" + bottom: "pool4" + top: "conv5_1" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu5_1" + type: "ReLU" + bottom: "conv5_1" + top: "conv5_1" +} +layer { + name: "conv5_2" + type: "Convolution" + bottom: "conv5_1" + top: "conv5_2" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv5_2" + type: "Convolution" + bottom: "conv5_1" + 
top: "conv5_2" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu5_2" + type: "ReLU" + bottom: "conv5_2" + top: "conv5_2" +} +layer { + name: "conv5_3" + type: "Convolution" + bottom: "conv5_2" + top: "conv5_3" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv5_3" + type: "Convolution" + bottom: "conv5_2" + top: "conv5_3" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu5_3" + type: "ReLU" + bottom: "conv5_3" + top: "conv5_3" +} +layer { + name: "pool5" + type: "Pooling" + bottom: "conv5_3" + top: "pool5" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "fc6" + type: "InnerProduct" + bottom: "pool5" + top: "fc6" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + inner_product_param { + num_output: 4096 + } +} +layer { + name: "fc6" + type: "InnerProduct" + bottom: "pool5" + top: "fc6" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + inner_product_param { + num_output: 4096 + } +} +layer { + name: "relu6" + type: "ReLU" + bottom: "fc6" + top: "fc6" +} +layer { + name: "drop6" + type: "Dropout" + bottom: "fc6" + top: "fc6" + dropout_param { + dropout_ratio: 0.5 + } +} +layer { + name: "fc7" + type: "InnerProduct" + bottom: "fc6" + top: "fc7" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + inner_product_param { + num_output: 4096 + } +} +layer { + name: "fc7" + type: "InnerProduct" + bottom: "fc6" + top: "fc7" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + inner_product_param { + num_output: 4096 + } +} +layer { + name: "relu7" + type: "ReLU" + bottom: "fc7" + top: "fc7" +} +layer { + name: "drop7" + type: "Dropout" + bottom: "fc7" + top: "fc7" + dropout_param { + dropout_ratio: 0.5 + } +} +layer { + name: "fc8" + type: "InnerProduct" + bottom: "fc7" + top: "fc8" + param { + lr_mult: 0.1 + decay_mult: 1 + } + param { + lr_mult: 0.2 + decay_mult: 0 + } + inner_product_param { + num_output: 1000 + } +} +layer { + name: "embedding" + type: "Embed" + bottom: "input_sentence" + top: "embedded_input_sentence" + param { + lr_mult: 1 + } + embed_param { + bias_term: false + input_dim: 8801 + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + } +} +layer { + name: "lstm1" + type: "LSTM" + bottom: "embedded_input_sentence" + bottom: "cont_sentence" + bottom: "fc8" + top: "lstm1" + include { stage: "unfactored" } + recurrent_param { + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "lstm2" + type: "LSTM" + bottom: "lstm1" + bottom: "cont_sentence" + top: "lstm2" + include { + stage: "unfactored" + stage: "2-layer" + } + recurrent_param { + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "lstm1" + type: "LSTM" + bottom: "embedded_input_sentence" + bottom: "cont_sentence" + top: "lstm1" + 
include { stage: "factored" } + recurrent_param { + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "lstm2" + type: "LSTM" + bottom: "lstm1" + bottom: "cont_sentence" + bottom: "fc8" + top: "lstm2" + include { stage: "factored" } + recurrent_param { + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "predict" + type: "InnerProduct" + bottom: "lstm1" + top: "predict" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + exclude { stage: "2-layer" } + inner_product_param { + num_output: 8801 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + axis: 2 + } +} +layer { + name: "predict" + type: "InnerProduct" + bottom: "lstm2" + top: "predict" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + include { stage: "2-layer" } + inner_product_param { + num_output: 8801 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + axis: 2 + } +} +layer { + name: "cross_entropy_loss" + type: "SoftmaxWithLoss" + bottom: "predict" + bottom: "target_sentence" + top: "cross_entropy_loss" + loss_weight: 20 + loss_param { + ignore_label: -1 + } + softmax_param { + axis: 2 + } +} +layer { + name: "accuracy" + type: "Accuracy" + bottom: "predict" + bottom: "target_sentence" + top: "accuracy" + include { phase: TEST } + accuracy_param { + axis: 2 + ignore_label: -1 + } +} diff --git a/examples/coco_caption/lrcn_finetune_solver.prototxt b/examples/coco_caption/lrcn_finetune_solver.prototxt new file mode 100644 index 00000000000..52dae5f6cf4 --- /dev/null +++ b/examples/coco_caption/lrcn_finetune_solver.prototxt @@ -0,0 +1,30 @@ +net: "./examples/coco_caption/lrcn.prototxt" + +# lrcn.prototxt supports three variants of the LRCN architecture: +# (1) stage: 'factored' stage: '2-layer' +# (2) stage: 'unfactored' stage: '1-layer' +# (3) stage: 'unfactored' stage: '2-layer' +# This solver uses variant (1). +# To use a different variant, modify the states (train_state, test_state) +# below as appropriate: + +train_state: { stage: 'factored' stage: '2-layer' } +test_iter: 25 +test_state: { stage: 'factored' stage: '2-layer' stage: 'test-on-train' } +test_iter: 25 +test_state: { stage: 'factored' stage: '2-layer' stage: 'test-on-val' } +test_interval: 1000 +base_lr: 0.001 +lr_policy: "step" +gamma: 0.5 +stepsize: 20000 +display: 1 +max_iter: 50000 +momentum: 0.9 +weight_decay: 0.0000 +snapshot: 5000 +snapshot_prefix: "./examples/coco_caption/lrcn_finetune" +solver_mode: GPU +random_seed: 1701 +average_loss: 100 +clip_gradients: 25 diff --git a/examples/coco_caption/lrcn_finetune_solver.trainval.prototxt b/examples/coco_caption/lrcn_finetune_solver.trainval.prototxt new file mode 100644 index 00000000000..bc2345ecdc5 --- /dev/null +++ b/examples/coco_caption/lrcn_finetune_solver.trainval.prototxt @@ -0,0 +1,28 @@ +net: "./examples/coco_caption/lrcn.prototxt" + +# lrcn.prototxt supports three variants of the LRCN architecture: +# (1) stage: 'factored' stage: '2-layer' +# (2) stage: 'unfactored' stage: '1-layer' +# (3) stage: 'unfactored' stage: '2-layer' +# This solver uses variant (1). 
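The finetune solvers above drop the 'freeze-convnet' stage from train_state and lower base_lr to 0.001, so the convnet is updated jointly with the LSTM; the .trainval variant additionally sets the 'trainval' stage, switching the data layers to the combined train+val chunks (with stepsize and max_iter doubled) for the single final model. A hedged pycaffe sketch of that last step, warm-starting from a train-only snapshot (filename illustrative):

    import caffe

    caffe.set_mode_gpu()
    solver = caffe.SGDSolver(
        './examples/coco_caption/lrcn_finetune_solver.trainval.prototxt')
    # warm-start from a train-only model, then continue on train+val
    solver.net.copy_from('./examples/coco_caption/lrcn_finetune_iter_50000.caffemodel')
    solver.solve()
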
+# To use a different variant, modify the states (train_state, test_state) +# below as appropriate: + +train_state: { stage: 'factored' stage: '2-layer' stage: 'trainval' } +test_iter: 25 +test_state: { stage: 'factored' stage: '2-layer' stage: 'test-on-val' } +test_interval: 1000 +base_lr: 0.001 +lr_policy: "step" +gamma: 0.5 +stepsize: 40000 +display: 1 +max_iter: 100000 +momentum: 0.9 +weight_decay: 0.0000 +snapshot: 5000 +snapshot_prefix: "./examples/coco_caption/lrcn_finetune_trainval" +solver_mode: GPU +random_seed: 1701 +average_loss: 100 +clip_gradients: 25 diff --git a/examples/coco_caption/lrcn_finetune_solver.vgg.prototxt b/examples/coco_caption/lrcn_finetune_solver.vgg.prototxt new file mode 100644 index 00000000000..6d1db08bfcb --- /dev/null +++ b/examples/coco_caption/lrcn_finetune_solver.vgg.prototxt @@ -0,0 +1,31 @@ +net: "./examples/coco_caption/lrcn.vgg.buffer_50.prototxt" + +# lrcn.prototxt supports three variants of the LRCN architecture: +# (1) stage: 'factored' stage: '2-layer' +# (2) stage: 'unfactored' stage: '1-layer' +# (3) stage: 'unfactored' stage: '2-layer' +# This solver uses variant (1). +# To use a different variant, modify the states (train_state, test_state) +# below as appropriate: + +train_state: { stage: 'factored' stage: '2-layer' } +# test_iter: 25 +# test_state: { stage: 'factored' stage: '2-layer' stage: 'test-on-train' } +# test_iter: 25 +# test_state: { stage: 'factored' stage: '2-layer' stage: 'test-on-val' } +test_interval: 1000 +base_lr: 0.001 +lr_policy: "step" +gamma: 0.5 +stepsize: 20000 +display: 1 +max_iter: 50000 +momentum: 0.9 +weight_decay: 0.0000 +snapshot: 2500 +snapshot_prefix: "./examples/coco_caption/lrcn_finetune_vgg" +solver_mode: GPU +random_seed: 1701 +average_loss: 100 +clip_gradients: 25 +iter_size: 2 diff --git a/examples/coco_caption/lrcn_finetune_solver.vgg.trainval.prototxt b/examples/coco_caption/lrcn_finetune_solver.vgg.trainval.prototxt new file mode 100644 index 00000000000..f9c4ee0d17a --- /dev/null +++ b/examples/coco_caption/lrcn_finetune_solver.vgg.trainval.prototxt @@ -0,0 +1,29 @@ +net: "./examples/coco_caption/lrcn.vgg.buffer_50.prototxt" + +# lrcn.prototxt supports three variants of the LRCN architecture: +# (1) stage: 'factored' stage: '2-layer' +# (2) stage: 'unfactored' stage: '1-layer' +# (3) stage: 'unfactored' stage: '2-layer' +# This solver uses variant (1). +# To use a different variant, modify the states (train_state, test_state) +# below as appropriate: + +train_state: { stage: 'factored' stage: '2-layer' stage: 'trainval' } +# test_iter: 25 +# test_state: { stage: 'factored' stage: '2-layer' stage: 'test-on-val' } +# test_interval: 1000 +base_lr: 0.001 +lr_policy: "step" +gamma: 0.5 +stepsize: 40000 +display: 1 +max_iter: 100000 +momentum: 0.9 +weight_decay: 0.0000 +snapshot: 5000 +snapshot_prefix: "./examples/coco_caption/lrcn_finetune_vgg_trainval" +solver_mode: GPU +random_seed: 1701 +average_loss: 100 +clip_gradients: 25 +iter_size: 2 diff --git a/examples/coco_caption/lrcn_solver.from_lm.prototxt b/examples/coco_caption/lrcn_solver.from_lm.prototxt new file mode 100644 index 00000000000..3fb37879c55 --- /dev/null +++ b/examples/coco_caption/lrcn_solver.from_lm.prototxt @@ -0,0 +1,30 @@ +net: "./examples/coco_caption/lrcn.prototxt" + +# lrcn.prototxt supports three variants of the LRCN architecture: +# (1) stage: 'factored' stage: '2-layer' +# (2) stage: 'unfactored' stage: '1-layer' +# (3) stage: 'unfactored' stage: '2-layer' +# This solver uses variant (1). 
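The from_lm solver trains the same frozen-convnet factored variant as lrcn_solver.prototxt; the name signals that the net is meant to be initialized from the separately trained LSTM language model (lstm_lm, defined later in this change) in addition to the CaffeNet weights. pycaffe's copy_from matches layers by name, so both donor nets can be applied in turn; a sketch under that assumption (filenames illustrative):

    import caffe

    caffe.set_mode_gpu()
    solver = caffe.SGDSolver('./examples/coco_caption/lrcn_solver.from_lm.prototxt')
    # convnet weights from CaffeNet, recurrent weights from the language model;
    # layers present in neither donor keep their random initialization
    solver.net.copy_from('./models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel')
    solver.net.copy_from('./examples/coco_caption/lstm_lm_iter_110000.caffemodel')
    solver.solve()
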
+# To use a different variant, modify the states (train_state, test_state) +# below as appropriate: + +train_state: { stage: 'freeze-convnet' stage: 'factored' stage: '2-layer' } +test_iter: 25 +test_state: { stage: 'freeze-convnet' stage: 'factored' stage: '2-layer' stage: 'test-on-train' } +test_iter: 25 +test_state: { stage: 'freeze-convnet' stage: 'factored' stage: '2-layer' stage: 'test-on-val' } +test_interval: 1000 +base_lr: 0.01 +lr_policy: "step" +gamma: 0.5 +stepsize: 20000 +display: 1 +max_iter: 110000 +momentum: 0.9 +weight_decay: 0.0000 +snapshot: 5000 +snapshot_prefix: "./examples/coco_caption/lrcn_from_lm" +solver_mode: GPU +random_seed: 1701 +average_loss: 100 +clip_gradients: 10 diff --git a/examples/coco_caption/lrcn_solver.prototxt b/examples/coco_caption/lrcn_solver.prototxt new file mode 100644 index 00000000000..65ca272b30c --- /dev/null +++ b/examples/coco_caption/lrcn_solver.prototxt @@ -0,0 +1,30 @@ +net: "./examples/coco_caption/lrcn.prototxt" + +# lrcn.prototxt supports three variants of the LRCN architecture: +# (1) stage: 'factored' stage: '2-layer' +# (2) stage: 'unfactored' stage: '1-layer' +# (3) stage: 'unfactored' stage: '2-layer' +# This solver uses variant (1). +# To use a different variant, modify the states (train_state, test_state) +# below as appropriate: + +train_state: { stage: 'freeze-convnet' stage: 'factored' stage: '2-layer' } +test_iter: 25 +test_state: { stage: 'freeze-convnet' stage: 'factored' stage: '2-layer' stage: 'test-on-train' } +test_iter: 25 +test_state: { stage: 'freeze-convnet' stage: 'factored' stage: '2-layer' stage: 'test-on-val' } +test_interval: 1000 +base_lr: 0.01 +lr_policy: "step" +gamma: 0.5 +stepsize: 20000 +display: 1 +max_iter: 110000 +momentum: 0.9 +weight_decay: 0.0000 +snapshot: 5000 +snapshot_prefix: "./examples/coco_caption/lrcn" +solver_mode: GPU +random_seed: 1701 +average_loss: 100 +clip_gradients: 10 diff --git a/examples/coco_caption/lrcn_solver.single_layer.prototxt b/examples/coco_caption/lrcn_solver.single_layer.prototxt new file mode 100644 index 00000000000..5a9073bed71 --- /dev/null +++ b/examples/coco_caption/lrcn_solver.single_layer.prototxt @@ -0,0 +1,30 @@ +net: "./examples/coco_caption/lrcn.prototxt" + +# lrcn.prototxt supports three variants of the LRCN architecture: +# (1) stage: 'factored' stage: '2-layer' +# (2) stage: 'unfactored' stage: '1-layer' +# (3) stage: 'unfactored' stage: '2-layer' +# This solver uses variant (2). 
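The three variants named in these solver comments differ only in where the fc8 image feature enters the recurrent stack; an LSTM layer's optional third bottom is a static (non-recurrent) input, written below as concatenation for brevity. With $E$ the word embedding, $w_{t-1}$ the previous word, and $\phi(I)$ the fc8 image code:

    (1) factored, 2-layer:   $h^1_t = \mathrm{LSTM}(E w_{t-1},\, h^1_{t-1}),\quad h^2_t = \mathrm{LSTM}([h^1_t;\, \phi(I)],\, h^2_{t-1}),\quad p_t = \mathrm{softmax}(W h^2_t)$
    (2) unfactored, 1-layer: $h^1_t = \mathrm{LSTM}([E w_{t-1};\, \phi(I)],\, h^1_{t-1}),\quad p_t = \mathrm{softmax}(W h^1_t)$
    (3) unfactored, 2-layer: $h^1_t = \mathrm{LSTM}([E w_{t-1};\, \phi(I)],\, h^1_{t-1}),\quad h^2_t = \mathrm{LSTM}(h^1_t,\, h^2_{t-1}),\quad p_t = \mathrm{softmax}(W h^2_t)$

In variant (1) the first LSTM is a pure language model, which is what makes the from_lm initialization above possible.
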
+# To use a different variant, modify the states (train_state, test_state) +# below as appropriate: + +train_state: { stage: 'freeze-convnet' stage: 'unfactored' stage: '1-layer' } +test_iter: 25 +test_state: { stage: 'freeze-convnet' stage: 'unfactored' stage: '1-layer' stage: 'test-on-train' } +test_iter: 25 +test_state: { stage: 'freeze-convnet' stage: 'unfactored' stage: '1-layer' stage: 'test-on-val' } +test_interval: 1000 +base_lr: 0.01 +lr_policy: "step" +gamma: 0.5 +stepsize: 20000 +display: 1 +max_iter: 110000 +momentum: 0.9 +weight_decay: 0.0000 +snapshot: 5000 +snapshot_prefix: "./examples/coco_caption/lrcn_single_layer" +solver_mode: GPU +random_seed: 1701 +average_loss: 100 +clip_gradients: 10 diff --git a/examples/coco_caption/lrcn_solver.unfactored.prototxt b/examples/coco_caption/lrcn_solver.unfactored.prototxt new file mode 100644 index 00000000000..2c83f37ca27 --- /dev/null +++ b/examples/coco_caption/lrcn_solver.unfactored.prototxt @@ -0,0 +1,30 @@ +net: "./examples/coco_caption/lrcn.prototxt" + +# lrcn.prototxt supports three variants of the LRCN architecture: +# (1) stage: 'factored' stage: '2-layer' +# (2) stage: 'unfactored' stage: '1-layer' +# (3) stage: 'unfactored' stage: '2-layer' +# This solver uses variant (3). +# To use a different variant, modify the states (train_state, test_state) +# below as appropriate: + +train_state: { stage: 'freeze-convnet' stage: 'unfactored' stage: '2-layer' } +test_iter: 25 +test_state: { stage: 'freeze-convnet' stage: 'unfactored' stage: '2-layer' stage: 'test-on-train' } +test_iter: 25 +test_state: { stage: 'freeze-convnet' stage: 'unfactored' stage: '2-layer' stage: 'test-on-val' } +test_interval: 1000 +base_lr: 0.01 +lr_policy: "step" +gamma: 0.5 +stepsize: 20000 +display: 1 +max_iter: 110000 +momentum: 0.9 +weight_decay: 0.0000 +snapshot: 5000 +snapshot_prefix: "./examples/coco_caption/lrcn_unfactored" +solver_mode: GPU +random_seed: 1701 +average_loss: 100 +clip_gradients: 10 diff --git a/examples/coco_caption/lrcn_solver.vgg.prototxt b/examples/coco_caption/lrcn_solver.vgg.prototxt new file mode 100644 index 00000000000..dada837a512 --- /dev/null +++ b/examples/coco_caption/lrcn_solver.vgg.prototxt @@ -0,0 +1,30 @@ +net: "./examples/coco_caption/lrcn.vgg.prototxt" + +# lrcn.prototxt supports three variants of the LRCN architecture: +# (1) stage: 'factored' stage: '2-layer' +# (2) stage: 'unfactored' stage: '1-layer' +# (3) stage: 'unfactored' stage: '2-layer' +# This solver uses variant (1). 
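All of these solvers smooth the reported loss with average_loss: 100 and cap gradient magnitude with clip_gradients (10 here, 25 in the finetuning solvers). Clipping rescales the entire gradient by a common factor whenever its global L2 norm exceeds the threshold; a numpy restatement of Caffe's SGDSolver::ClipGradients (a sketch, not the actual implementation):

    import numpy as np

    def clip_gradients(grads, threshold=10.0):
        # grads: list of gradient arrays, one per learnable parameter blob
        norm = np.sqrt(sum((g ** 2).sum() for g in grads))
        if norm > threshold:
            scale = threshold / norm
            grads = [g * scale for g in grads]
        return grads
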
+# To use a different variant, modify the states (train_state, test_state) +# below as appropriate: + +train_state: { stage: 'freeze-convnet' stage: 'factored' stage: '2-layer' } +# test_iter: 25 +# test_state: { stage: 'freeze-convnet' stage: 'factored' stage: '2-layer' stage: 'test-on-train' } +# test_iter: 25 +# test_state: { stage: 'freeze-convnet' stage: 'factored' stage: '2-layer' stage: 'test-on-val' } +# test_interval: 1000 +base_lr: 0.01 +lr_policy: "step" +gamma: 0.5 +stepsize: 20000 +display: 1 +max_iter: 110000 +momentum: 0.9 +weight_decay: 0.0000 +snapshot: 5000 +snapshot_prefix: "./examples/coco_caption/lrcn_vgg" +solver_mode: GPU +random_seed: 1701 +average_loss: 100 +clip_gradients: 10 diff --git a/examples/coco_caption/lrcn_word_to_preds.deploy.prototxt b/examples/coco_caption/lrcn_word_to_preds.deploy.prototxt new file mode 100644 index 00000000000..bfd6166add9 --- /dev/null +++ b/examples/coco_caption/lrcn_word_to_preds.deploy.prototxt @@ -0,0 +1,56 @@ +name: "caffenet_to_lstm" + +input: "cont_sentence" +input_shape { dim: 1 dim: 1000 } + +input: "input_sentence" +input_shape { dim: 1 dim: 1000 } + +input: "image_features" +input_shape { dim: 1000 dim: 1000 } + +layer { + name: "embedding" + type: "Embed" + bottom: "input_sentence" + top: "embedded_input_sentence" + embed_param { + input_dim: 8801 + num_output: 1000 + bias_term: false + } +} +layer { + name: "lstm1" + type: "LSTM" + bottom: "embedded_input_sentence" + bottom: "cont_sentence" + top: "lstm1" + recurrent_param { num_output: 1000 } +} +layer { + name: "lstm2" + type: "LSTM" + bottom: "lstm1" + bottom: "cont_sentence" + bottom: "image_features" + top: "lstm2" + recurrent_param { num_output: 1000 } +} +layer { + name: "predict" + type: "InnerProduct" + bottom: "lstm2" + top: "predict" + inner_product_param { + axis: 2 + num_output: 8801 + } +} +layer { + name: "probs" + type: "Softmax" + bottom: "predict" + top: "probs" + softmax_param { axis: 2 } +} diff --git a/examples/coco_caption/lstm_language_model.prototxt b/examples/coco_caption/lstm_language_model.prototxt new file mode 100644 index 00000000000..68fda5464fe --- /dev/null +++ b/examples/coco_caption/lstm_language_model.prototxt @@ -0,0 +1,150 @@ +name: "lstm_language_model" +layer { + name: "data" + type: "HDF5Data" + top: "cont_sentence" + top: "input_sentence" + top: "target_sentence" + include { phase: TRAIN } + hdf5_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/train_unaligned_batches/hdf5_chunk_list.txt" + batch_size: 20 + } +} +layer { + name: "data" + type: "HDF5Data" + top: "cont_sentence" + top: "input_sentence" + top: "target_sentence" + include { + phase: TEST + stage: "test-on-train" + } + hdf5_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/train_unaligned_batches/hdf5_chunk_list.txt" + batch_size: 20 + } +} +layer { + name: "data" + type: "HDF5Data" + top: "cont_sentence" + top: "input_sentence" + top: "target_sentence" + include { + phase: TEST + stage: "test-on-val" + } + hdf5_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/val_unaligned_batches/hdf5_chunk_list.txt" + batch_size: 20 + } +} +layer { + name: "embedding" + type: "Embed" + bottom: "input_sentence" + top: "embedded_input_sentence" + param { + lr_mult: 1 + } + embed_param { + bias_term: false + input_dim: 8801 # = vocab_size + 1 (for EOS) + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + } +} +layer { + name: "embed-drop" + type: "Dropout" + bottom: "embedded_input_sentence" + top: 
"embedded_input_sentence" + dropout_param { dropout_ratio: 0.5 } + include { stage: "embed-drop" } +} +layer { + name: "lstm1" + type: "LSTM" + bottom: "embedded_input_sentence" + bottom: "cont_sentence" + top: "lstm1" + recurrent_param { + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "lstm-drop" + type: "Dropout" + bottom: "lstm1" + top: "lstm1" + dropout_param { dropout_ratio: 0.5 } + include { stage: "lstm-drop" } +} +layer { + name: "predict" + type: "InnerProduct" + bottom: "lstm1" + top: "predict" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + inner_product_param { + num_output: 8801 # = vocab_size + 1 (+1 for EOS) + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + axis: 2 + } +} +layer { + name: "cross_entropy_loss" + type: "SoftmaxWithLoss" + bottom: "predict" + bottom: "target_sentence" + top: "cross_entropy_loss" + loss_weight: 20 + loss_param { + ignore_label: -1 + } + softmax_param { + axis: 2 + } +} +layer { + name: "accuracy" + type: "Accuracy" + bottom: "predict" + bottom: "target_sentence" + top: "accuracy" + include { phase: TEST } + accuracy_param { + axis: 2 + ignore_label: -1 + } +} diff --git a/examples/coco_caption/lstm_lm.deploy.prototxt b/examples/coco_caption/lstm_lm.deploy.prototxt new file mode 100644 index 00000000000..26b5f1b01eb --- /dev/null +++ b/examples/coco_caption/lstm_lm.deploy.prototxt @@ -0,0 +1,122 @@ +name: "lstm_language_model" + +input: "cont_sentence" +input_shape { dim: 1 dim: 1 } + +input: "input_sentence" +input_shape { dim: 1 dim: 1 } + +layer { + name: "embedding" + type: "Embed" + bottom: "input_sentence" + top: "embedded_input_sentence" + param { + lr_mult: 1 + } + embed_param { + bias_term: false + input_dim: 8801 # = vocab_size + 1 (for EOS) + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + } +} +layer { + name: "embed-drop" + type: "Dropout" + bottom: "embedded_input_sentence" + top: "embedded_input_sentence" + dropout_param { dropout_ratio: 0.5 } + include { stage: "embed-drop" } +} +layer { + name: "lstm1" + type: "LSTM" + bottom: "embedded_input_sentence" + bottom: "cont_sentence" + top: "lstm1" + recurrent_param { + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "lstm-drop" + type: "Dropout" + bottom: "lstm1" + top: "lstm1" + dropout_param { dropout_ratio: 0.5 } + include { stage: "lstm-drop" } +} +layer { + name: "predict" + type: "InnerProduct" + bottom: "lstm1" + top: "predict" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + inner_product_param { + num_output: 8801 # = vocab_size + 1 (+1 for EOS) + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + axis: 2 + } +} +layer { + name: "probs" + type: "Softmax" + bottom: "predict" + top: "probs" + softmax_param { axis: 2 } +} +# layer { +# name: "cross_entropy_loss" +# type: "SoftmaxWithLoss" +# bottom: "predict" +# bottom: "target_sentence" +# top: "cross_entropy_loss" +# loss_weight: 20 +# loss_param { +# ignore_label: -1 +# } +# softmax_param { +# axis: 2 +# } +# } +# layer { +# name: "accuracy" +# type: "Accuracy" +# bottom: "predict" +# bottom: "target_sentence" +# top: "accuracy" +# include { phase: TEST } +# 
accuracy_param { +# axis: 2 +# ignore_label: -1 +# } +# } diff --git a/examples/coco_caption/lstm_lm_solver.prototxt b/examples/coco_caption/lstm_lm_solver.prototxt new file mode 100644 index 00000000000..fb36ad15a5b --- /dev/null +++ b/examples/coco_caption/lstm_lm_solver.prototxt @@ -0,0 +1,21 @@ +net: "./examples/coco_caption/lstm_language_model.prototxt" +train_state: { stage: 'embed-drop' stage: 'lstm-drop' } +test_iter: 25 +test_state: { stage: 'test-on-train' } +test_iter: 25 +test_state: { stage: 'test-on-val' } +test_interval: 1000 +base_lr: 0.1 +lr_policy: "step" +gamma: 0.5 +stepsize: 20000 +display: 1 +max_iter: 110000 +momentum: 0.9 +weight_decay: 0.0000 +snapshot: 5000 +snapshot_prefix: "./examples/coco_caption/lstm_lm" +solver_mode: GPU +random_seed: 1701 +average_loss: 100 +clip_gradients: 10 diff --git a/examples/coco_caption/retrieval_experiment.py b/examples/coco_caption/retrieval_experiment.py new file mode 100755 index 00000000000..178937bdb85 --- /dev/null +++ b/examples/coco_caption/retrieval_experiment.py @@ -0,0 +1,362 @@ +#!/usr/bin/env python + +from collections import OrderedDict +import json +import numpy as np +import pprint +import cPickle as pickle +import string +import sys + +# seed the RNG so we evaluate on the same subset each time +np.random.seed(seed=0) + +from coco_to_hdf5_data import * +from captioner import Captioner + +COCO_EVAL_PATH = './data/coco/coco-caption-eval' +sys.path.append(COCO_EVAL_PATH) +from pycocoevalcap.eval import COCOEvalCap + +class CaptionExperiment(): + # captioner is an initialized Captioner (captioner.py) + # dataset is a dict: image path -> [caption1, caption2, ...] + def __init__(self, captioner, dataset, dataset_cache_dir, cache_dir, sg): + self.captioner = captioner + self.sg = sg + self.dataset_cache_dir = dataset_cache_dir + self.cache_dir = cache_dir + for d in [dataset_cache_dir, cache_dir]: + if not os.path.exists(d): os.makedirs(d) + self.dataset = dataset + self.images = dataset.keys() + self.init_caption_list(dataset) + self.caption_scores = [None] * len(self.images) + print 'Initialized caption experiment: %d images, %d captions' % \ + (len(self.images), len(self.captions)) + + def init_caption_list(self, dataset): + self.captions = [] + for image, captions in dataset.iteritems(): + for caption, _ in captions: + self.captions.append({'source_image': image, 'caption': caption}) + # Sort by length for performance. 
+ self.captions.sort(key=lambda c: len(c['caption'])) + + def compute_descriptors(self): + descriptor_filename = '%s/descriptors.npz' % self.dataset_cache_dir + if os.path.exists(descriptor_filename): + self.descriptors = np.load(descriptor_filename)['descriptors'] + else: + self.descriptors = self.captioner.compute_descriptors(self.images) + np.savez_compressed(descriptor_filename, descriptors=self.descriptors) + + def score_captions(self, image_index, output_name='probs'): + assert image_index < len(self.images) + caption_scores_dir = '%s/caption_scores' % self.cache_dir + if not os.path.exists(caption_scores_dir): + os.makedirs(caption_scores_dir) + caption_scores_filename = '%s/scores_image_%06d.pkl' % \ + (caption_scores_dir, image_index) + if os.path.exists(caption_scores_filename): + with open(caption_scores_filename, 'rb') as caption_scores_file: + outputs = pickle.load(caption_scores_file) + else: + outputs = self.captioner.score_captions(self.descriptors[image_index], + self.captions, output_name=output_name, caption_source='gt', + verbose=False) + self.caption_stats(image_index, outputs) + with open(caption_scores_filename, 'wb') as caption_scores_file: + pickle.dump(outputs, caption_scores_file) + self.caption_scores[image_index] = outputs + + def caption_stats(self, image_index, caption_scores): + image_path = self.images[image_index] + for caption, score in zip(self.captions, caption_scores): + assert caption['caption'] == score['caption'] + score['stats'] = gen_stats(score['prob']) + score['correct'] = (image_path == caption['source_image']) + + def eval_image_to_caption(self, image_index, methods=None): + scores = self.caption_scores[image_index] + return self.eval_recall(scores, methods=methods) + + def eval_caption_to_image(self, caption_index, methods=None): + scores = [s[caption_index] for s in self.caption_scores] + return self.eval_recall(scores, methods=methods) + + def normalize_caption_scores(self, caption_index, stats=['log_p', 'log_p_word']): + scores = [s[caption_index] for s in self.caption_scores] + for stat in stats: + log_stat_scores = np.array([score['stats'][stat] for score in scores]) + stat_scores = np.exp(log_stat_scores) + mean_stat_score = np.mean(stat_scores) + log_mean_stat_score = np.log(mean_stat_score) + for log_stat_score, score in zip(log_stat_scores, scores): + score['stats']['normalized_' + stat] = log_stat_score - log_mean_stat_score + + def eval_recall(self, scores, methods=None, neg_prefix='negative_'): + if methods is None: + # rank on all stats, and all their inverses + methods = scores[0]['stats'].keys() + methods += [neg_prefix + method for method in methods] + correct_ranks = {} + for method in methods: + if method.startswith(neg_prefix): + multiplier = -1 + method_key = method[len(neg_prefix):] + else: + multiplier = 1 + method_key = method + sort_key = lambda s: multiplier * s['stats'][method_key] + ranked_scores = sorted(scores, key=sort_key) + for index, score in enumerate(ranked_scores): + if score['correct']: + correct_ranks[method] = index + break + return correct_ranks + + def recall_results(self, correct_ranks, recall_ranks=[]): + num_instances = float(len(correct_ranks)) + assert num_instances > 0 + methods = correct_ranks[0].keys() + results = {} + for method in methods: + method_correct_ranks = \ + np.array([correct_rank[method] for correct_rank in correct_ranks]) + r = OrderedDict() + r['mean'] = np.mean(method_correct_ranks) + r['median'] = np.median(method_correct_ranks) + r['mean (1-indexed)'] = r['mean'] + 1 + 
r['median (1-indexed)'] = r['median'] + 1 + for recall_rank in recall_ranks: + r['R@%d' % recall_rank] = \ + np.where(method_correct_ranks < recall_rank)[0].shape[0] / num_instances + results[method] = r + return results + + def print_recall_results(self, results): + for method, result in results.iteritems(): + print 'Ranking method:', method + for metric_name_and_value in result.iteritems(): + print ' %s: %f' % metric_name_and_value + + def retrieval_experiment(self): + # Compute image descriptors. + print 'Computing image descriptors' + self.compute_descriptors() + + num_images, num_captions = len(self.images), len(self.captions) + + # For each image, score all captions. + for image_index in xrange(num_images): + sys.stdout.write("\rScoring captions for image %d/%d" % + (image_index, num_images)) + sys.stdout.flush() + self.score_captions(image_index) + sys.stdout.write('\n') + + # Compute global caption statistics for normalization. + for caption_index in xrange(num_captions): + self.normalize_caption_scores(caption_index) + + recall_ranks = [1, 5, 10, 50] + + eval_methods = ['negative_normalized_log_p'] + # Evaluate caption-to-image retrieval task. + self.caption_to_image_ranks = [None] * num_captions + for caption_index in xrange(num_captions): + sys.stdout.write("\rCaption-to-image evaluation: " + "computing recall for caption %d/%d" % + (caption_index, num_captions)) + sys.stdout.flush() + self.caption_to_image_ranks[caption_index] = \ + self.eval_caption_to_image(caption_index, methods=eval_methods) + sys.stdout.write('\n') + self.caption_to_image_recall = \ + self.recall_results(self.caption_to_image_ranks, recall_ranks) + print 'Caption-to-image retrieval results:' + self.print_recall_results(self.caption_to_image_recall) + + # Evaluate image-to-caption retrieval task. + self.image_to_caption_ranks = [None] * num_images + for image_index in xrange(num_images): + sys.stdout.write("\rImage-to-caption evaluation: " + "computing recall for image %d/%d" % + (image_index, num_images)) + sys.stdout.flush() + self.image_to_caption_ranks[image_index] = \ + self.eval_image_to_caption(image_index, methods=eval_methods) + sys.stdout.write('\n') + self.image_to_caption_recall = \ + self.recall_results(self.image_to_caption_ranks, recall_ranks) + print 'Image-to-caption retrieval results:' + self.print_recall_results(self.image_to_caption_recall) + + def generation_experiment(self, strategy, max_batch_size=1000): + # Compute image descriptors. + print 'Computing image descriptors' + self.compute_descriptors() + + do_batches = (strategy['type'] == 'beam' and strategy['beam_size'] == 1) or \ + (strategy['type'] == 'sample' and + ('temp' not in strategy or strategy['temp'] in (1, float('inf'))) and + ('num' not in strategy or strategy['num'] == 1)) + + num_images = len(self.images) + batch_size = min(max_batch_size, num_images) if do_batches else 1 + + # Generate captions for all images. 
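+    # (When do_batches is set -- beam search with beam size 1, or plain temperature sampling of a single caption -- whole batches of descriptors are decoded at once below; otherwise each image is decoded individually with the full generation strategy.)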
+ all_captions = [None] * num_images + for image_index in xrange(0, num_images, batch_size): + batch_end_index = min(image_index + batch_size, num_images) + sys.stdout.write("\rGenerating captions for image %d/%d" % + (image_index, num_images)) + sys.stdout.flush() + if do_batches: + if strategy['type'] == 'beam' or \ + ('temp' in strategy and strategy['temp'] == float('inf')): + temp = float('inf') + else: + temp = strategy['temp'] if 'temp' in strategy else 1 + output_captions, output_probs = self.captioner.sample_captions( + self.descriptors[image_index:batch_end_index], temp=temp) + for batch_index, output in zip(range(image_index, batch_end_index), + output_captions): + all_captions[batch_index] = output + else: + for batch_image_index in xrange(image_index, batch_end_index): + captions, caption_probs = self.captioner.predict_caption( + self.descriptors[batch_image_index], strategy=strategy) + best_caption, max_log_prob = None, None + for caption, probs in zip(captions, caption_probs): + log_prob = gen_stats(probs)['log_p'] + if best_caption is None or \ + (best_caption is not None and log_prob > max_log_prob): + best_caption, max_log_prob = caption, log_prob + all_captions[batch_image_index] = best_caption + sys.stdout.write('\n') + + # Compute the number of reference files as the maximum number of ground + # truth captions of any image in the dataset. + num_reference_files = 0 + for captions in self.dataset.values(): + if len(captions) > num_reference_files: + num_reference_files = len(captions) + if num_reference_files <= 0: + raise Exception('No reference captions.') + + # Collect model/reference captions, formatting the model's captions and + # each set of reference captions as a list of len(self.images) strings. + exp_dir = '%s/generation' % self.cache_dir + if not os.path.exists(exp_dir): + os.makedirs(exp_dir) + # For each image, write out the highest probability caption. 
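+    # (reference_captions is laid out as one list per reference "file": entry r holds the r-th ground-truth caption of every image, the arrangement used to build the inputs for the downstream COCO evaluation.)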
+ model_captions = [''] * len(self.images) + reference_captions = [([''] * len(self.images)) for _ in xrange(num_reference_files)] + for image_index, image in enumerate(self.images): + caption = self.captioner.sentence(all_captions[image_index]) + model_captions[image_index] = caption + for reference_index, (_, caption) in enumerate(self.dataset[image]): + caption = ' '.join(caption) + reference_captions[reference_index][image_index] = caption + + coco_image_ids = [self.sg.image_path_to_id[image_path] + for image_path in self.images] + generation_result = [{ + 'image_id': self.sg.image_path_to_id[image_path], + 'caption': model_captions[image_index] + } for (image_index, image_path) in enumerate(self.images)] + json_filename = '%s/generation_result.json' % self.cache_dir + print 'Dumping result to file: %s' % json_filename + with open(json_filename, 'w') as json_file: + json.dump(generation_result, json_file) + generation_result = self.sg.coco.loadRes(json_filename) + coco_evaluator = COCOEvalCap(self.sg.coco, generation_result) + coco_evaluator.params['image_id'] = coco_image_ids + coco_evaluator.evaluate() + +def gen_stats(prob): + stats = {} + stats['length'] = len(prob) + stats['log_p'] = 0.0 + eps = 1e-12 + for p in prob: + assert 0.0 <= p <= 1.0 + stats['log_p'] += np.log(max(eps, p)) + stats['log_p_word'] = stats['log_p'] / stats['length'] + try: + stats['perplex'] = np.exp(-stats['log_p']) + except OverflowError: + stats['perplex'] = float('inf') + try: + stats['perplex_word'] = np.exp(-stats['log_p_word']) + except OverflowError: + stats['perplex_word'] = float('inf') + return stats + +def main(): + MAX_IMAGES = -1 # -1 to use all images + TAG = 'coco_2layer_factored' + if MAX_IMAGES >= 0: + TAG += '_%dimages' % MAX_IMAGES + eval_on_test = False + if eval_on_test: + ITER = 100000 + MODEL_FILENAME = 'lrcn_finetune_trainval_stepsize40k_iter_%d' % ITER + DATASET_NAME = 'test' + else: # eval on val + ITER = 50000 + MODEL_FILENAME = 'lrcn_finetune_iter_%d' % ITER + DATASET_NAME = 'val' + TAG += '_%s' % DATASET_NAME + MODEL_DIR = './examples/coco_caption' + MODEL_FILE = '%s/%s.caffemodel' % (MODEL_DIR, MODEL_FILENAME) + IMAGE_NET_FILE = './models/bvlc_reference_caffenet/deploy.prototxt' + LSTM_NET_FILE = './examples/coco_caption/lrcn_word_to_preds.deploy.prototxt' + NET_TAG = '%s_%s' % (TAG, MODEL_FILENAME) + DATASET_SUBDIR = '%s/%s_ims' % (DATASET_NAME, + str(MAX_IMAGES) if MAX_IMAGES >= 0 else 'all') + DATASET_CACHE_DIR = './retrieval_cache/%s/%s' % (DATASET_SUBDIR, MODEL_FILENAME) + VOCAB_FILE = './examples/coco_caption/h5_data/buffer_100/vocabulary.txt' + DEVICE_ID = 0 + with open(VOCAB_FILE, 'r') as vocab_file: + vocab = [line.strip() for line in vocab_file.readlines()] + coco = COCO(COCO_ANNO_PATH % DATASET_NAME) + image_root = COCO_IMAGE_PATTERN % DATASET_NAME + sg = CocoSequenceGenerator(coco, BUFFER_SIZE, image_root, vocab=vocab, + align=False, shuffle=False) + dataset = {} + for image_path, sentence in sg.image_sentence_pairs: + if image_path not in dataset: + dataset[image_path] = [] + dataset[image_path].append((sg.line_to_stream(sentence), sentence)) + print 'Original dataset contains %d images' % len(dataset.keys()) + if 0 <= MAX_IMAGES < len(dataset.keys()): + all_keys = dataset.keys() + perm = np.random.permutation(len(all_keys))[:MAX_IMAGES] + chosen_keys = set([all_keys[p] for p in perm]) + for key in all_keys: + if key not in chosen_keys: + del dataset[key] + print 'Reduced dataset to %d images' % len(dataset.keys()) + if MAX_IMAGES < 0: MAX_IMAGES = 
len(dataset.keys()) + captioner = Captioner(MODEL_FILE, IMAGE_NET_FILE, LSTM_NET_FILE, VOCAB_FILE, + device_id=DEVICE_ID) + beam_size = 1 + generation_strategy = {'type': 'beam', 'beam_size': beam_size} + if generation_strategy['type'] == 'beam': + strategy_name = 'beam%d' % generation_strategy['beam_size'] + elif generation_strategy['type'] == 'sample': + strategy_name = 'sample%f' % generation_strategy['temp'] + else: + raise Exception('Unknown generation strategy type: %s' % generation_strategy['type']) + CACHE_DIR = '%s/%s' % (DATASET_CACHE_DIR, strategy_name) + experimenter = CaptionExperiment(captioner, dataset, DATASET_CACHE_DIR, CACHE_DIR, sg) + captioner.set_image_batch_size(min(100, MAX_IMAGES)) + experimenter.generation_experiment(generation_strategy) + captioner.set_caption_batch_size(min(MAX_IMAGES * 5, 1000)) + experimenter.retrieval_experiment() + +if __name__ == "__main__": + main() diff --git a/examples/coco_caption/train_language_model.sh b/examples/coco_caption/train_language_model.sh new file mode 100755 index 00000000000..6e8a8c47b37 --- /dev/null +++ b/examples/coco_caption/train_language_model.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +GPU_ID=0 +DATA_DIR=./examples/coco_caption/h5_data/ +if [ ! -d $DATA_DIR ]; then + echo "Data directory not found: $DATA_DIR" + echo "First, download the COCO dataset (follow instructions in data/coco)" + echo "Then, run ./examples/coco_caption/coco_to_hdf5_data.py to create the Caffe input data" + exit 1 +fi + +./build/tools/caffe train \ + -solver ./examples/coco_caption/lstm_lm_solver.prototxt \ + -gpu $GPU_ID diff --git a/examples/coco_caption/train_lrcn.from_lm.sh b/examples/coco_caption/train_lrcn.from_lm.sh new file mode 100755 index 00000000000..332f50c9179 --- /dev/null +++ b/examples/coco_caption/train_lrcn.from_lm.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash + +GPU_ID=0 +WEIGHTS=\ +./models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel\ +,\ +./examples/coco_caption/lstm_lm_iter_110000.caffemodel +DATA_DIR=./examples/coco_caption/h5_data/ +if [ ! -d $DATA_DIR ]; then + echo "Data directory not found: $DATA_DIR" + echo "First, download the COCO dataset (follow instructions in data/coco)" + echo "Then, run ./examples/coco_caption/coco_to_hdf5_data.py to create the Caffe input data" + exit 1 +fi + +./build/tools/caffe train \ + -solver ./examples/coco_caption/lrcn_solver.from_lm.prototxt \ + -weights $WEIGHTS \ + -gpu $GPU_ID diff --git a/examples/coco_caption/train_lrcn.sh b/examples/coco_caption/train_lrcn.sh new file mode 100755 index 00000000000..5099e762ccd --- /dev/null +++ b/examples/coco_caption/train_lrcn.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +GPU_ID=0 +WEIGHTS=\ +./models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel +DATA_DIR=./examples/coco_caption/h5_data/ +if [ ! 
-d $DATA_DIR ]; then + echo "Data directory not found: $DATA_DIR" + echo "First, download the COCO dataset (follow instructions in data/coco)" + echo "Then, run ./examples/coco_caption/coco_to_hdf5_data.py to create the Caffe input data" + exit 1 +fi + +./build/tools/caffe train \ + -solver ./examples/coco_caption/lrcn_solver.prototxt \ + -weights $WEIGHTS \ + -gpu $GPU_ID diff --git a/examples/coco_caption/train_lrcn.single_layer.sh b/examples/coco_caption/train_lrcn.single_layer.sh new file mode 100755 index 00000000000..f99c09865b7 --- /dev/null +++ b/examples/coco_caption/train_lrcn.single_layer.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +GPU_ID=0 +WEIGHTS=\ +./models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel +DATA_DIR=./examples/coco_caption/h5_data/ +if [ ! -d $DATA_DIR ]; then + echo "Data directory not found: $DATA_DIR" + echo "First, download the COCO dataset (follow instructions in data/coco)" + echo "Then, run ./examples/coco_caption/coco_to_hdf5_data.py to create the Caffe input data" + exit 1 +fi + +./build/tools/caffe train \ + -solver ./examples/coco_caption/lrcn_solver.single_layer.prototxt \ + -weights $WEIGHTS \ + -gpu $GPU_ID diff --git a/examples/coco_caption/train_lrcn.unfactored.sh b/examples/coco_caption/train_lrcn.unfactored.sh new file mode 100644 index 00000000000..a579783c5fb --- /dev/null +++ b/examples/coco_caption/train_lrcn.unfactored.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +GPU_ID=0 +WEIGHTS=\ +./models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel +DATA_DIR=./examples/coco_caption/h5_data/ +if [ ! -d $DATA_DIR ]; then + echo "Data directory not found: $DATA_DIR" + echo "First, download the COCO dataset (follow instructions in data/coco)" + echo "Then, run ./examples/coco_caption/coco_to_hdf5_data.py to create the Caffe input data" + exit 1 +fi + +./build/tools/caffe train \ + -solver ./examples/coco_caption/lrcn_solver.unfactored.prototxt \ + -weights $WEIGHTS \ + -gpu $GPU_ID diff --git a/examples/coco_caption/train_lrcn.vgg.sh b/examples/coco_caption/train_lrcn.vgg.sh new file mode 100755 index 00000000000..c0b873a4b41 --- /dev/null +++ b/examples/coco_caption/train_lrcn.vgg.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +GPU_ID=0 +WEIGHTS=\ +./models/vgg_16layers/VGG_ILSVRC_16_layers.caffemodel +DATA_DIR=./examples/coco_caption/h5_data/ +if [ ! -d $DATA_DIR ]; then + echo "Data directory not found: $DATA_DIR" + echo "First, download the COCO dataset (follow instructions in data/coco)" + echo "Then, run ./examples/coco_caption/coco_to_hdf5_data.py to create the Caffe input data" + exit 1 +fi + +./build/tools/caffe train \ + -solver ./examples/coco_caption/lrcn_solver.vgg.prototxt \ + -weights $WEIGHTS \ + -gpu $GPU_ID diff --git a/include/caffe/common_layers.hpp b/include/caffe/common_layers.hpp index 89bab8d6f3a..fdeab11bbd1 100644 --- a/include/caffe/common_layers.hpp +++ b/include/caffe/common_layers.hpp @@ -511,6 +511,59 @@ class SilenceLayer : public Layer<Dtype> { const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom); }; +/** + * @brief Computes a product of two input Blobs, with the shape of the + * latter Blob "broadcast" to match the shape of the former. + * Equivalent to tiling the latter Blob, then computing the elementwise + * product.
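+ * + * For example (an illustrative sketch, not part of the API): with axis == 1, + * a first factor of shape 2 x 3 x 4 x 5 and a second factor of shape 3 x 4, + * each scalar y[i][j] multiplies the entire sub-array x[n][i][j][...] for + * every n, exactly as if y had first been tiled to shape 2 x 3 x 4 x 5.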
+ */ +template <typename Dtype> +class ScalarLayer: public Layer<Dtype> { + public: + explicit ScalarLayer(const LayerParameter& param) + : Layer<Dtype>(param) {} + virtual void Reshape(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + + virtual inline const char* type() const { return "Scalar"; } + virtual inline int ExactNumBottomBlobs() const { return 2; } + virtual inline int ExactNumTopBlobs() const { return 1; } + + protected: + /** + * In the below shape specifications, @f$ i @f$ denotes the value of the + * `axis` field given by `this->layer_param_.scalar_param().axis()`, after + * canonicalization (i.e., conversion from negative to positive index, + * if applicable). + * + * @param bottom input Blob vector (length 2) + * -# @f$ (d_0 \times ... \times + * d_i \times ... \times d_j \times ... \times d_n) @f$ + * the first factor @f$ x @f$ + * -# @f$ (d_i \times ... \times d_j) @f$ + * the second factor @f$ y @f$ + * @param top output Blob vector (length 1) + * -# @f$ (d_0 \times ... \times + * d_i \times ... \times d_j \times ... \times d_n) @f$ + * the product @f$ z = x y @f$ computed after "broadcasting" y. + * Equivalent to tiling @f$ y @f$ to have the same shape as @f$ x @f$, + * then computing the elementwise product. + */ + virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual void Backward_cpu(const vector<Blob<Dtype>*>& top, + const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom); + virtual void Backward_gpu(const vector<Blob<Dtype>*>& top, + const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom); + + Blob<Dtype> sum_multiplier_; + Blob<Dtype> sum_result_; + int axis_; + int outer_dim_, scalar_dim_, inner_dim_; +}; + /** * @brief Computes the softmax function. * diff --git a/include/caffe/net.hpp b/include/caffe/net.hpp index 1bf07d28d13..bed241d2a6c 100644 --- a/include/caffe/net.hpp +++ b/include/caffe/net.hpp @@ -179,6 +179,9 @@ class Net { return param_names_index_; } inline const vector<int>& param_owners() const { return param_owners_; } + inline const vector<string>& param_display_names() const { + return param_display_names_; + } /// @brief Input and output blob numbers inline int num_inputs() const { return net_input_blobs_.size(); } inline int num_outputs() const { return net_output_blobs_.size(); } diff --git a/include/caffe/sequence_layers.hpp b/include/caffe/sequence_layers.hpp new file mode 100644 index 00000000000..03225082496 --- /dev/null +++ b/include/caffe/sequence_layers.hpp @@ -0,0 +1,321 @@ +#ifndef CAFFE_SEQUENCE_LAYERS_HPP_ +#define CAFFE_SEQUENCE_LAYERS_HPP_ + +#include <string> +#include <utility> +#include <vector> + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/layer.hpp" +#include "caffe/net.hpp" +#include "caffe/proto/caffe.pb.h" + +namespace caffe { + +template <typename Dtype> class RecurrentLayer; + +/** + * @brief An abstract class for implementing recurrent behavior inside of an + * unrolled network. This Layer type cannot be instantiated -- instead, + * you should use one of its implementations which defines the recurrent + * architecture, such as RNNLayer or LSTMLayer.
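+ * + * A minimal prototxt sketch of a concrete subclass in use (blob names here are illustrative; the bottoms are the @f$ T \times N \times ... @f$ input and the @f$ T \times N @f$ sequence continuation indicators documented below): + * layer { name: "lstm1" type: "LSTM" bottom: "data" bottom: "cont" + * top: "lstm1" recurrent_param { num_output: 1000 } }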
+ */ +template <typename Dtype> +class RecurrentLayer : public Layer<Dtype> { + public: + explicit RecurrentLayer(const LayerParameter& param) + : Layer<Dtype>(param) {} + virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual void Reshape(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual void Reset(); + + virtual inline const char* type() const { return "Recurrent"; } + virtual inline int MinBottomBlobs() const { return 2; } + virtual inline int MaxBottomBlobs() const { return 3; } + virtual inline int ExactNumTopBlobs() const { return 1; } + + virtual inline bool AllowForceBackward(const int bottom_index) const { + // Can't propagate to sequence continuation indicators. + return bottom_index != 1; + } + + protected: + /** + * @brief Fills net_param with the recurrent network architecture. Subclasses + * should define this -- see RNNLayer and LSTMLayer for examples. + */ + virtual void FillUnrolledNet(NetParameter* net_param) const = 0; + + /** + * @brief Fills names with the names of the 0th timestep recurrent input + * Blob&s. Subclasses should define this -- see RNNLayer and LSTMLayer + * for examples. + */ + virtual void RecurrentInputBlobNames(vector<string>* names) const = 0; + + /** + * @brief Fills shapes with the shapes of the recurrent input Blob&s. + * Subclasses should define this -- see RNNLayer and LSTMLayer + * for examples. + */ + virtual void RecurrentInputShapes(vector<BlobShape>* shapes) const = 0; + + /** + * @brief Fills names with the names of the Tth timestep recurrent output + * Blob&s. Subclasses should define this -- see RNNLayer and LSTMLayer + * for examples. + */ + virtual void RecurrentOutputBlobNames(vector<string>* names) const = 0; + + /** + * @brief Fills names with the names of the output blobs, concatenated across + * all timesteps. Should return a name for each top Blob. + * Subclasses should define this -- see RNNLayer and LSTMLayer for + * examples. + */ + virtual void OutputBlobNames(vector<string>* names) const = 0; + + /** + * @param bottom input Blob vector (length 2-3) + * + * -# @f$ (T \times N \times ...) @f$ + * the time-varying input @f$ x @f$. After the first two axes, whose + * dimensions must correspond to the number of timesteps @f$ T @f$ and + * the number of independent streams @f$ N @f$, respectively, its + * dimensions may be arbitrary. Note that the ordering of dimensions -- + * @f$ (T \times N \times ...) @f$, rather than + * @f$ (N \times T \times ...) @f$ -- means that the @f$ N @f$ + * independent input streams must be "interleaved". + * + * -# @f$ (T \times N) @f$ + * the sequence continuation indicators @f$ \delta @f$. + * These inputs should be binary (0 or 1) indicators, where + * @f$ \delta_{t,n} = 0 @f$ means that timestep @f$ t @f$ of stream + * @f$ n @f$ is the beginning of a new sequence, and hence the previous + * hidden state @f$ h_{t-1} @f$ is multiplied by @f$ \delta_t = 0 @f$ + * and has no effect on the cell's output at timestep @f$ t @f$, and + * a value of @f$ \delta_{t,n} = 1 @f$ means that timestep @f$ t @f$ of + * stream @f$ n @f$ is a continuation from the previous timestep + * @f$ t-1 @f$, and the previous hidden state @f$ h_{t-1} @f$ affects the + * updated hidden state and output. + * + * -# @f$ (N \times ...) @f$ (optional) + * the static (non-time-varying) input @f$ x_{static} @f$. + * After the first axis, whose dimension must be the number of + * independent streams, its dimensions may be arbitrary. + * This is mathematically equivalent to using a time-varying input of + * @f$ x'_t = [x_t; x_{static}] @f$ -- i.e., tiling the static input + * across the @f$ T @f$ timesteps and concatenating with the time-varying + * input. Note that if this input is used, all timesteps in a single + * batch within a particular one of the @f$ N @f$ streams must share the + * same static input, even if the sequence continuation indicators + * suggest that different sequences are ending and beginning within a + * single batch. This may require padding and/or truncation for uniform + * length. + * + * @param top output Blob vector (length 1) + * -# @f$ (T \times N \times D) @f$ + * the time-varying output @f$ y @f$, where @f$ D @f$ is + * recurrent_param.num_output(). + * Refer to documentation for particular RecurrentLayer implementations + * (such as RNNLayer and LSTMLayer) for the definition of @f$ y @f$. + */ + virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual void Backward_cpu(const vector<Blob<Dtype>*>& top, + const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom); + + /// @brief A helper function, useful for stringifying timestep indices. + virtual string int_to_str(const int t) const; + + /// @brief A Net to implement the Recurrent functionality. + shared_ptr<Net<Dtype> > unrolled_net_; + + /// @brief The number of independent streams to process simultaneously. + int N_; + + /** + * @brief The number of timesteps in the layer's input, and the number of + * timesteps over which to backpropagate through time. + */ + int T_; + + /// @brief Whether the layer has a "static" input copied across all timesteps. + bool static_input_; + + vector<Blob<Dtype>* > recur_input_blobs_; + vector<Blob<Dtype>* > recur_output_blobs_; + vector<Blob<Dtype>* > output_blobs_; + Blob<Dtype>* x_input_blob_; + Blob<Dtype>* x_static_input_blob_; + Blob<Dtype>* cont_input_blob_; +}; + +/** + * @brief Processes sequential inputs using a "Long Short-Term Memory" (LSTM) + * [1] style recurrent neural network (RNN). Implemented as a network + * unrolling the LSTM computation in time. + * + * The specific architecture used in this implementation is as described in + * "Learning to Execute" [2], reproduced below: + * i_t := \sigmoid[ W_{hi} * h_{t-1} + W_{xi} * x_t + b_i ] + * f_t := \sigmoid[ W_{hf} * h_{t-1} + W_{xf} * x_t + b_f ] + * o_t := \sigmoid[ W_{ho} * h_{t-1} + W_{xo} * x_t + b_o ] + * g_t := \tanh[ W_{hg} * h_{t-1} + W_{xg} * x_t + b_g ] + * c_t := (f_t .* c_{t-1}) + (i_t .* g_t) + * h_t := o_t .* \tanh[c_t] + * In the implementation, the i, f, o, and g computations are performed as a + * single inner product. + * + * Notably, this implementation lacks the "diagonal" gates, as used in the + * LSTM architectures described by Alex Graves [3] and others. + * + * [1] Hochreiter, Sepp, and Schmidhuber, Jürgen. "Long short-term memory." + * Neural Computation 9, no. 8 (1997): 1735-1780. + * + * [2] Zaremba, Wojciech, and Sutskever, Ilya. "Learning to execute." + * arXiv preprint arXiv:1410.4615 (2014). + * + * [3] Graves, Alex. "Generating sequences with recurrent neural networks." + * arXiv preprint arXiv:1308.0850 (2013).
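+ * + * (A sketch of how these equations map onto the unrolled net built by FillUnrolledNet in lstm_layer.cpp: the stacked gate pre-activations @f$ [i_t'; f_t'; o_t'; g_t'] @f$ appear as the blob gate_input_t = W_xc * x_t + b_c + W_hc * h_conted_{t-1}, produced by the "x_transform" and per-timestep "transform" InnerProduct layers plus an Eltwise sum; an LSTMUnit layer then applies the nonlinearities above to yield c_t and h_t.)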
+ */ +template <typename Dtype> +class LSTMLayer : public RecurrentLayer<Dtype> { + public: + explicit LSTMLayer(const LayerParameter& param) + : RecurrentLayer<Dtype>(param) {} + + virtual inline const char* type() const { return "LSTM"; } + + protected: + virtual void FillUnrolledNet(NetParameter* net_param) const; + virtual void RecurrentInputBlobNames(vector<string>* names) const; + virtual void RecurrentOutputBlobNames(vector<string>* names) const; + virtual void RecurrentInputShapes(vector<BlobShape>* shapes) const; + virtual void OutputBlobNames(vector<string>* names) const; +}; + +/** + * @brief A helper for LSTMLayer: computes a single timestep of the + * non-linearity of the LSTM, producing the updated cell and hidden + * states. + */ +template <typename Dtype> +class LSTMUnitLayer : public Layer<Dtype> { + public: + explicit LSTMUnitLayer(const LayerParameter& param) + : Layer<Dtype>(param) {} + virtual void Reshape(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + + virtual inline const char* type() const { return "LSTMUnit"; } + virtual inline int ExactNumBottomBlobs() const { return 3; } + virtual inline int ExactNumTopBlobs() const { return 2; } + + virtual inline bool AllowForceBackward(const int bottom_index) const { + // Can't propagate to sequence continuation indicators. + return bottom_index != 2; + } + + protected: + /** + * @param bottom input Blob vector (length 3) + * -# @f$ (1 \times N \times D) @f$ + * the previous timestep cell state @f$ c_{t-1} @f$ + * -# @f$ (1 \times N \times 4D) @f$ + * the "gate inputs" @f$ [i_t', f_t', o_t', g_t'] @f$ + * -# @f$ (1 \times N) @f$ + * the sequence continuation indicators @f$ \delta_t @f$ + * @param top output Blob vector (length 2) + * -# @f$ (1 \times N \times D) @f$ + * the updated cell state @f$ c_t @f$, computed as: + * i_t := \sigmoid[i_t'] + * f_t := \sigmoid[f_t'] + * o_t := \sigmoid[o_t'] + * g_t := \tanh[g_t'] + * c_t := cont_t * (f_t .* c_{t-1}) + (i_t .* g_t) + * -# @f$ (1 \times N \times D) @f$ + * the updated hidden state @f$ h_t @f$, computed as: + * h_t := o_t .* \tanh[c_t] + */ + virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + + /** + * @brief Computes the error gradient w.r.t. the LSTMUnit inputs. + * + * @param top output Blob vector (length 2), providing the error gradient with + * respect to the outputs + * -# @f$ (1 \times N \times D) @f$: + * containing error gradients @f$ \frac{\partial E}{\partial c_t} @f$ + * with respect to the updated cell state @f$ c_t @f$ + * -# @f$ (1 \times N \times D) @f$: + * containing error gradients @f$ \frac{\partial E}{\partial h_t} @f$ + * with respect to the updated hidden state @f$ h_t @f$ + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length 3), into which the error gradients + * with respect to the LSTMUnit inputs @f$ c_{t-1} @f$ and the gate + * inputs are computed. Computation of the error gradients w.r.t. + * the sequence indicators is not implemented. + * -# @f$ (1 \times N \times D) @f$ + * the error gradient w.r.t. the previous timestep cell state + * @f$ c_{t-1} @f$ + * -# @f$ (1 \times N \times 4D) @f$ + * the error gradient w.r.t. the "gate inputs" + * @f$ [ + * \frac{\partial E}{\partial i_t} + * \frac{\partial E}{\partial f_t} + * \frac{\partial E}{\partial o_t} + * \frac{\partial E}{\partial g_t} + * ] @f$ + * -# @f$ (1 \times 1 \times N) @f$ + * the gradient w.r.t. the sequence continuation indicators + * @f$ \delta_t @f$ is currently not computed.
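+ * + * (The gradient computations rely on the elementwise identities + * \sigmoid'[x] = \sigmoid[x] (1 - \sigmoid[x]) and \tanh'[x] = 1 - \tanh[x]^2. + * For example, with c_term := dE/dc_t + dE/dh_t .* o_t .* (1 - \tanh[c_t]^2), + * the gradient w.r.t. the input gate pre-activation is + * dE/di_t' = c_term .* g_t .* i_t .* (1 - i_t), as in the CPU implementation.)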
+ */ + virtual void Backward_cpu(const vector<Blob<Dtype>*>& top, + const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom); + virtual void Backward_gpu(const vector<Blob<Dtype>*>& top, + const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom); + + /// @brief The hidden and output dimension. + int hidden_dim_; + Blob<Dtype> X_acts_; +}; + +/** + * @brief Processes time-varying inputs using a simple recurrent neural network + * (RNN). Implemented as a network unrolling the RNN computation in time. + * + * Given time-varying inputs @f$ x_t @f$, computes hidden state @f$ + * h_t := \tanh[ W_{hh} h_{t-1} + W_{xh} x_t + b_h ] + * @f$, and outputs @f$ + * o_t := \tanh[ W_{ho} h_t + b_o ] + * @f$. + */ +template <typename Dtype> +class RNNLayer : public RecurrentLayer<Dtype> { + public: + explicit RNNLayer(const LayerParameter& param) + : RecurrentLayer<Dtype>(param) {} + + virtual inline const char* type() const { return "RNN"; } + + protected: + virtual void FillUnrolledNet(NetParameter* net_param) const; + virtual void RecurrentInputBlobNames(vector<string>* names) const; + virtual void RecurrentOutputBlobNames(vector<string>* names) const; + virtual void RecurrentInputShapes(vector<BlobShape>* shapes) const; + virtual void OutputBlobNames(vector<string>* names) const; +}; + +} // namespace caffe + +#endif // CAFFE_SEQUENCE_LAYERS_HPP_ diff --git a/src/caffe/layers/lstm_layer.cpp b/src/caffe/layers/lstm_layer.cpp new file mode 100644 index 00000000000..d5e0923a3d4 --- /dev/null +++ b/src/caffe/layers/lstm_layer.cpp @@ -0,0 +1,237 @@ +#include <string> +#include <vector> + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/layer.hpp" +#include "caffe/sequence_layers.hpp" +#include "caffe/util/math_functions.hpp" + +namespace caffe { + +template <typename Dtype> +void LSTMLayer<Dtype>::RecurrentInputBlobNames(vector<string>* names) const { + names->resize(2); + (*names)[0] = "h_0"; + (*names)[1] = "c_0"; +} + +template <typename Dtype> +void LSTMLayer<Dtype>::RecurrentOutputBlobNames(vector<string>* names) const { + names->resize(2); + (*names)[0] = "h_" + this->int_to_str(this->T_); + (*names)[1] = "c_T"; +} + +template <typename Dtype> +void LSTMLayer<Dtype>::RecurrentInputShapes(vector<BlobShape>* shapes) const { + const int num_output = this->layer_param_.recurrent_param().num_output(); + const int num_blobs = 2; + shapes->resize(num_blobs); + for (int i = 0; i < num_blobs; ++i) { + (*shapes)[i].Clear(); + (*shapes)[i].add_dim(1); // a single timestep + (*shapes)[i].add_dim(this->N_); + (*shapes)[i].add_dim(num_output); + } +} + +template <typename Dtype> +void LSTMLayer<Dtype>::OutputBlobNames(vector<string>* names) const { + names->resize(1); + (*names)[0] = "h"; +} + +template <typename Dtype> +void LSTMLayer<Dtype>::FillUnrolledNet(NetParameter* net_param) const { + const int num_output = this->layer_param_.recurrent_param().num_output(); + CHECK_GT(num_output, 0) << "num_output must be positive"; + const FillerParameter& weight_filler = + this->layer_param_.recurrent_param().weight_filler(); + const FillerParameter& bias_filler = + this->layer_param_.recurrent_param().bias_filler(); + + // Add generic LayerParameter's (without bottoms/tops) of layer types we'll + // use to save redundant code.
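+ // (hidden_param below is an InnerProduct along axis 2 producing all + // 4 * num_output gate pre-activations [i', f', o', g'] in one product; + // biased_hidden_param is the same transform with the additive bias b_c.)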
+ LayerParameter hidden_param; + hidden_param.set_type("InnerProduct"); + hidden_param.mutable_inner_product_param()->set_num_output(num_output * 4); + hidden_param.mutable_inner_product_param()->set_bias_term(false); + hidden_param.mutable_inner_product_param()->set_axis(2); + hidden_param.mutable_inner_product_param()-> + mutable_weight_filler()->CopyFrom(weight_filler); + + LayerParameter biased_hidden_param(hidden_param); + biased_hidden_param.mutable_inner_product_param()->set_bias_term(true); + biased_hidden_param.mutable_inner_product_param()-> + mutable_bias_filler()->CopyFrom(bias_filler); + + LayerParameter sum_param; + sum_param.set_type("Eltwise"); + sum_param.mutable_eltwise_param()->set_operation( + EltwiseParameter_EltwiseOp_SUM); + + LayerParameter scalar_param; + scalar_param.set_type("Scalar"); + scalar_param.mutable_scalar_param()->set_axis(0); + + LayerParameter slice_param; + slice_param.set_type("Slice"); + slice_param.mutable_slice_param()->set_axis(0); + + LayerParameter split_param; + split_param.set_type("Split"); + + vector<BlobShape> input_shapes; + RecurrentInputShapes(&input_shapes); + CHECK_EQ(2, input_shapes.size()); + + net_param->add_input("c_0"); + net_param->add_input_shape()->CopyFrom(input_shapes[0]); + + net_param->add_input("h_0"); + net_param->add_input_shape()->CopyFrom(input_shapes[1]); + + LayerParameter* cont_slice_param = net_param->add_layer(); + cont_slice_param->CopyFrom(slice_param); + cont_slice_param->set_name("cont_slice"); + cont_slice_param->add_bottom("cont"); + cont_slice_param->mutable_slice_param()->set_axis(0); + + // Add layer to transform all timesteps of x to the hidden state dimension. + // W_xc_x = W_xc * x + b_c + { + LayerParameter* x_transform_param = net_param->add_layer(); + x_transform_param->CopyFrom(biased_hidden_param); + x_transform_param->set_name("x_transform"); + x_transform_param->add_param()->set_name("W_xc"); + x_transform_param->add_param()->set_name("b_c"); + x_transform_param->add_bottom("x"); + x_transform_param->add_top("W_xc_x"); + } + + if (this->static_input_) { + // Add layer to transform x_static to the gate dimension. + // W_xc_x_static = W_xc_static * x_static + LayerParameter* x_static_transform_param = net_param->add_layer(); + x_static_transform_param->CopyFrom(hidden_param); + x_static_transform_param->mutable_inner_product_param()->set_axis(1); + x_static_transform_param->set_name("W_xc_x_static"); + x_static_transform_param->add_param()->set_name("W_xc_static"); + x_static_transform_param->add_bottom("x_static"); + x_static_transform_param->add_top("W_xc_x_static_preshape"); + + LayerParameter* reshape_param = net_param->add_layer(); + reshape_param->set_type("Reshape"); + BlobShape* new_shape = + reshape_param->mutable_reshape_param()->mutable_shape(); + new_shape->add_dim(1); // One timestep. + // Should infer this->N as the dimension so we can reshape on batch size.
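+ // (In Caffe's ReshapeParameter, a dim of -1 means "infer this dimension + // from the blob's total count", so the stream count N is not hard-coded.)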
+ new_shape->add_dim(-1); + new_shape->add_dim( + x_static_transform_param->inner_product_param().num_output()); + reshape_param->add_bottom("W_xc_x_static_preshape"); + reshape_param->add_top("W_xc_x_static"); + } + + LayerParameter* x_slice_param = net_param->add_layer(); + x_slice_param->CopyFrom(slice_param); + x_slice_param->add_bottom("W_xc_x"); + x_slice_param->set_name("W_xc_x_slice"); + + LayerParameter output_concat_layer; + output_concat_layer.set_name("h_concat"); + output_concat_layer.set_type("Concat"); + output_concat_layer.add_top("h"); + output_concat_layer.mutable_concat_param()->set_axis(0); + + for (int t = 1; t <= this->T_; ++t) { + string tm1s = this->int_to_str(t - 1); + string ts = this->int_to_str(t); + + cont_slice_param->add_top("cont_" + ts); + x_slice_param->add_top("W_xc_x_" + ts); + + // Add layers to flush the hidden state when beginning a new + // sequence, as indicated by cont_t. + // h_conted_{t-1} := cont_t * h_{t-1} + // + // Normally, cont_t is binary (i.e., 0 or 1), so: + // h_conted_{t-1} := h_{t-1} if cont_t == 1 + // 0 otherwise + { + LayerParameter* cont_h_param = net_param->add_layer(); + cont_h_param->CopyFrom(scalar_param); + cont_h_param->set_name("h_conted_" + tm1s); + cont_h_param->add_bottom("h_" + tm1s); + cont_h_param->add_bottom("cont_" + ts); + cont_h_param->add_top("h_conted_" + tm1s); + } + + // Add layer to compute + // W_hc_h_{t-1} := W_hc * h_conted_{t-1} + { + LayerParameter* w_param = net_param->add_layer(); + w_param->CopyFrom(hidden_param); + w_param->set_name("transform_" + ts); + w_param->add_param()->set_name("W_hc"); + w_param->add_bottom("h_conted_" + tm1s); + w_param->add_top("W_hc_h_" + tm1s); + w_param->mutable_inner_product_param()->set_axis(2); + } + + // Add the outputs of the linear transformations to compute the gate input. + // gate_input_t := W_hc * h_conted_{t-1} + W_xc * x_t + b_c + // = W_hc_h_{t-1} + W_xc_x_t + b_c + { + LayerParameter* input_sum_layer = net_param->add_layer(); + input_sum_layer->CopyFrom(sum_param); + input_sum_layer->set_name("gate_input_" + ts); + input_sum_layer->add_bottom("W_hc_h_" + tm1s); + input_sum_layer->add_bottom("W_xc_x_" + ts); + if (this->static_input_) { + input_sum_layer->add_bottom("W_xc_x_static"); + } + input_sum_layer->add_top("gate_input_" + ts); + } + + // Add LSTMUnit layer to compute the cell & hidden vectors c_t and h_t. 
+ // Inputs: c_{t-1}, gate_input_t = (i_t, f_t, o_t, g_t), cont_t + // Outputs: c_t, h_t + // [ i_t' ] + // [ f_t' ] := gate_input_t + // [ o_t' ] + // [ g_t' ] + // i_t := \sigmoid[i_t'] + // f_t := \sigmoid[f_t'] + // o_t := \sigmoid[o_t'] + // g_t := \tanh[g_t'] + // c_t := cont_t * (f_t .* c_{t-1}) + (i_t .* g_t) + // h_t := o_t .* \tanh[c_t] + { + LayerParameter* lstm_unit_param = net_param->add_layer(); + lstm_unit_param->set_type("LSTMUnit"); + lstm_unit_param->add_bottom("c_" + tm1s); + lstm_unit_param->add_bottom("gate_input_" + ts); + lstm_unit_param->add_bottom("cont_" + ts); + lstm_unit_param->add_top("c_" + ts); + lstm_unit_param->add_top("h_" + ts); + lstm_unit_param->set_name("unit_" + ts); + } + output_concat_layer.add_bottom("h_" + ts); + } // for (int t = 1; t <= this->T_; ++t) + + { + LayerParameter* c_T_copy_param = net_param->add_layer(); + c_T_copy_param->CopyFrom(split_param); + c_T_copy_param->add_bottom("c_" + this->int_to_str(this->T_)); + c_T_copy_param->add_top("c_T"); + } + net_param->add_layer()->CopyFrom(output_concat_layer); +} + +INSTANTIATE_CLASS(LSTMLayer); +REGISTER_LAYER_CLASS(LSTM); + +} // namespace caffe diff --git a/src/caffe/layers/lstm_unit_layer.cpp b/src/caffe/layers/lstm_unit_layer.cpp new file mode 100644 index 00000000000..fd777f8adc3 --- /dev/null +++ b/src/caffe/layers/lstm_unit_layer.cpp @@ -0,0 +1,131 @@ +#include <algorithm> +#include <cmath> +#include <vector> + +#include "caffe/layer.hpp" +#include "caffe/sequence_layers.hpp" + +namespace caffe { + +template <typename Dtype> +inline Dtype sigmoid(Dtype x) { + return 1. / (1. + exp(-x)); +} + +template <typename Dtype> +inline Dtype tanh(Dtype x) { + return 2. * sigmoid(2. * x) - 1.; +} + +template <typename Dtype> +void LSTMUnitLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top) { + const int num_instances = bottom[0]->shape(1); + for (int i = 0; i < bottom.size(); ++i) { + if (i == 2) { + CHECK_EQ(2, bottom[i]->num_axes()); + } else { + CHECK_EQ(3, bottom[i]->num_axes()); + } + CHECK_EQ(1, bottom[i]->shape(0)); + CHECK_EQ(num_instances, bottom[i]->shape(1)); + } + hidden_dim_ = bottom[0]->shape(2); + CHECK_EQ(num_instances, bottom[1]->shape(1)); + CHECK_EQ(4 * hidden_dim_, bottom[1]->shape(2)); + top[0]->ReshapeLike(*bottom[0]); + top[1]->ReshapeLike(*bottom[0]); + X_acts_.ReshapeLike(*bottom[1]); +} + +template <typename Dtype> +void LSTMUnitLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top) { + const int num = bottom[0]->shape(1); + const int x_dim = hidden_dim_ * 4; + const Dtype* C_prev = bottom[0]->cpu_data(); + const Dtype* X = bottom[1]->cpu_data(); + const Dtype* flush = bottom[2]->cpu_data(); + Dtype* C = top[0]->mutable_cpu_data(); + Dtype* H = top[1]->mutable_cpu_data(); + for (int n = 0; n < num; ++n) { + for (int d = 0; d < hidden_dim_; ++d) { + const Dtype i = sigmoid(X[d]); + const Dtype f = (*flush == 0) ? 0 : + (*flush * sigmoid(X[1 * hidden_dim_ + d])); + const Dtype o = sigmoid(X[2 * hidden_dim_ + d]); + const Dtype g = tanh(X[3 * hidden_dim_ + d]); + const Dtype c_prev = C_prev[d]; + const Dtype c = f * c_prev + i * g; + C[d] = c; + const Dtype tanh_c = tanh(c); + H[d] = o * tanh_c; + } + C_prev += hidden_dim_; + X += x_dim; + C += hidden_dim_; + H += hidden_dim_; + ++flush; + } +} + +template <typename Dtype> +void LSTMUnitLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top, + const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) { + CHECK(!propagate_down[2]) << "Cannot backpropagate to sequence indicators."; + if (!propagate_down[0] && !propagate_down[1]) { return; } + + const int num = bottom[0]->shape(1); + const int x_dim = hidden_dim_ * 4; + const Dtype* C_prev = bottom[0]->cpu_data(); + const Dtype* X = bottom[1]->cpu_data(); + const Dtype* flush = bottom[2]->cpu_data(); + const Dtype* C = top[0]->cpu_data(); + const Dtype* H = top[1]->cpu_data(); + const Dtype* C_diff = top[0]->cpu_diff(); + const Dtype* H_diff = top[1]->cpu_diff(); + Dtype* C_prev_diff = bottom[0]->mutable_cpu_diff(); + Dtype* X_diff = bottom[1]->mutable_cpu_diff(); + for (int n = 0; n < num; ++n) { + for (int d = 0; d < hidden_dim_; ++d) { + const Dtype i = sigmoid(X[d]); + const Dtype f = (*flush == 0) ? 0 : + (*flush * sigmoid(X[1 * hidden_dim_ + d])); + const Dtype o = sigmoid(X[2 * hidden_dim_ + d]); + const Dtype g = tanh(X[3 * hidden_dim_ + d]); + const Dtype c_prev = C_prev[d]; + const Dtype c = C[d]; + const Dtype tanh_c = tanh(c); + Dtype* c_prev_diff = C_prev_diff + d; + Dtype* i_diff = X_diff + d; + Dtype* f_diff = X_diff + 1 * hidden_dim_ + d; + Dtype* o_diff = X_diff + 2 * hidden_dim_ + d; + Dtype* g_diff = X_diff + 3 * hidden_dim_ + d; + const Dtype c_term_diff = + C_diff[d] + H_diff[d] * o * (1 - tanh_c * tanh_c); + *c_prev_diff = c_term_diff * f; + *i_diff = c_term_diff * g * i * (1 - i); + *f_diff = c_term_diff * c_prev * f * (1 - f); + *o_diff = H_diff[d] * tanh_c * o * (1 - o); + *g_diff = c_term_diff * i * (1 - g * g); + } + C_prev += hidden_dim_; + X += x_dim; + C += hidden_dim_; + H += hidden_dim_; + C_diff += hidden_dim_; + H_diff += hidden_dim_; + X_diff += x_dim; + C_prev_diff += hidden_dim_; + ++flush; + } +} + +#ifdef CPU_ONLY +STUB_GPU(LSTMUnitLayer); +#endif + +INSTANTIATE_CLASS(LSTMUnitLayer); +REGISTER_LAYER_CLASS(LSTMUnit); + +} // namespace caffe diff --git a/src/caffe/layers/lstm_unit_layer.cu b/src/caffe/layers/lstm_unit_layer.cu new file mode 100644 index 00000000000..d6bf85071f5 --- /dev/null +++ b/src/caffe/layers/lstm_unit_layer.cu @@ -0,0 +1,154 @@ +#include <algorithm> +#include <cmath> +#include <vector> + +#include "caffe/layer.hpp" +#include "caffe/sequence_layers.hpp" + +namespace caffe { + +template <typename Dtype> +__device__ Dtype sigmoid(const Dtype x) { + return Dtype(1) / (Dtype(1) + exp(-x)); +} + +template <typename Dtype> +__device__ Dtype tanh(const Dtype x) { + return Dtype(2) * sigmoid(Dtype(2) * x) - Dtype(1); +} + +template <typename Dtype> +__global__ void LSTMActsForward(const int nthreads, const int dim, + const Dtype* X, Dtype* X_acts) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int x_dim = 4 * dim; + const int d = index % x_dim; + if (d < 3 * dim) { + X_acts[index] = sigmoid(X[index]); + } else { + X_acts[index] = tanh(X[index]); + } + } +} + +template <typename Dtype> +__global__ void LSTMUnitForward(const int nthreads, const int dim, + const Dtype* C_prev, const Dtype* X, const Dtype* flush, + Dtype* C, Dtype* H) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int n = index / dim; + const int d = index % dim; + const Dtype* X_offset = X + 4 * dim * n; + const Dtype i = X_offset[d]; + const Dtype f = X_offset[1 * dim + d]; + const Dtype o = X_offset[2 * dim + d]; + const Dtype g = X_offset[3 * dim + d]; + const Dtype c_prev = C_prev[index]; + const Dtype c = flush[n] * f * c_prev + i * g; + C[index] = c; + const Dtype tanh_c = tanh(c); + H[index] = o * tanh_c; + } +} + +template <typename Dtype> +void LSTMUnitLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top) { + const int count = top[1]->count(); + const Dtype* C_prev = bottom[0]->gpu_data(); + const Dtype* X = bottom[1]->gpu_data(); + const Dtype* flush = bottom[2]->gpu_data(); + Dtype* X_acts = X_acts_.mutable_gpu_data(); + Dtype* C = top[0]->mutable_gpu_data(); + Dtype* H = top[1]->mutable_gpu_data(); + const int X_count = bottom[1]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + LSTMActsForward<Dtype><<<CAFFE_GET_BLOCKS(X_count), CAFFE_CUDA_NUM_THREADS>>>( + X_count, hidden_dim_, X, X_acts); + CUDA_POST_KERNEL_CHECK; + // NOLINT_NEXT_LINE(whitespace/operators) + LSTMUnitForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>( + count, hidden_dim_, C_prev, X_acts, flush, C, H); + CUDA_POST_KERNEL_CHECK; +} + +template <typename Dtype> +__global__ void LSTMUnitBackward(const int nthreads, const int dim, + const Dtype* C_prev, const Dtype* X, const Dtype* C, const Dtype* H, + const Dtype* flush, const Dtype* C_diff, const Dtype* H_diff, + Dtype* C_prev_diff, Dtype* X_diff) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int n = index / dim; + const int d = index % dim; + const Dtype* X_offset = X + 4 * dim * n; + const Dtype i = X_offset[d]; + const Dtype f = X_offset[1 * dim + d]; + const Dtype o = X_offset[2 * dim + d]; + const Dtype g = X_offset[3 * dim + d]; + const Dtype c_prev = C_prev[index]; + const Dtype c = C[index]; + const Dtype tanh_c = tanh(c); + Dtype* c_prev_diff = C_prev_diff + index; + Dtype* X_diff_offset = X_diff + 4 * dim * n; + Dtype* i_diff = X_diff_offset + d; + Dtype* f_diff = X_diff_offset + 1 * dim + d; + Dtype* o_diff = X_diff_offset + 2 * dim + d; + Dtype* g_diff = X_diff_offset + 3 * dim + d; + const Dtype c_term_diff = + C_diff[index] + H_diff[index] * o * (1 - tanh_c * tanh_c); + const Dtype flush_n = flush[n]; + *c_prev_diff = flush_n * c_term_diff * f; + *i_diff = c_term_diff * g; + *f_diff = flush_n * c_term_diff * c_prev; + *o_diff = H_diff[index] * tanh_c; + *g_diff = c_term_diff * i; + } +} + +template <typename Dtype> +__global__ void LSTMActsBackward(const int nthreads, const int dim, + const Dtype* X_acts, const Dtype* X_acts_diff, Dtype* X_diff) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int x_dim = 4 * dim; + const int d = index % x_dim; + const Dtype X_act = X_acts[index]; + if (d < 3 * dim) { + X_diff[index] = X_acts_diff[index] * X_act * (Dtype(1) - X_act); + } else { + X_diff[index] = X_acts_diff[index] * (Dtype(1) - X_act * X_act); + } + } +} + +template <typename Dtype> +void LSTMUnitLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top, + const vector<bool>& propagate_down, + const vector<Blob<Dtype>*>& bottom) { + CHECK(!propagate_down[2]) << "Cannot backpropagate to sequence indicators."; + if (!propagate_down[0] && !propagate_down[1]) { return; } + + const int count = top[1]->count(); + const Dtype* C_prev = bottom[0]->gpu_data(); + const Dtype* X_acts = X_acts_.gpu_data(); + const Dtype* flush = bottom[2]->gpu_data(); + const Dtype* C = top[0]->gpu_data(); + const Dtype* H = top[1]->gpu_data(); + const Dtype* C_diff = top[0]->gpu_diff(); + const Dtype* H_diff = top[1]->gpu_diff(); + Dtype* C_prev_diff = bottom[0]->mutable_gpu_diff(); + Dtype* X_acts_diff = X_acts_.mutable_gpu_diff(); + LSTMUnitBackward<Dtype> // NOLINT_NEXT_LINE(whitespace/operators) + <<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(count, hidden_dim_, + C_prev, X_acts, C, H, flush, C_diff, H_diff,
C_prev_diff, X_acts_diff); + CUDA_POST_KERNEL_CHECK; + const int X_count = bottom[1]->count(); + Dtype* X_diff = bottom[1]->mutable_gpu_diff(); + LSTMActsBackward<Dtype> // NOLINT_NEXT_LINE(whitespace/operators) + <<<CAFFE_GET_BLOCKS(X_count), CAFFE_CUDA_NUM_THREADS>>>( + X_count, hidden_dim_, X_acts, X_acts_diff, X_diff); + CUDA_POST_KERNEL_CHECK; +} + +INSTANTIATE_LAYER_GPU_FUNCS(LSTMUnitLayer); + +} // namespace caffe diff --git a/src/caffe/layers/recurrent_layer.cpp b/src/caffe/layers/recurrent_layer.cpp new file mode 100644 index 00000000000..89256229b54 --- /dev/null +++ b/src/caffe/layers/recurrent_layer.cpp @@ -0,0 +1,240 @@ +#include <string> +#include <vector> + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/layer.hpp" +#include "caffe/sequence_layers.hpp" +#include "caffe/util/math_functions.hpp" + +namespace caffe { + +template <typename Dtype> +string RecurrentLayer<Dtype>::int_to_str(const int t) const { + ostringstream num; + num << t; + return num.str(); +} + +template <typename Dtype> +void RecurrentLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top) { + CHECK_GE(bottom[0]->num_axes(), 2) + << "bottom[0] must have at least 2 axes -- (#timesteps, #streams, ...)"; + T_ = bottom[0]->shape(0); + N_ = bottom[0]->shape(1); + LOG(INFO) << "Initializing recurrent layer: assuming input batch contains " + << T_ << " timesteps of " << N_ << " independent streams."; + + CHECK_EQ(bottom[1]->num_axes(), 2) + << "bottom[1] must have exactly 2 axes -- (#timesteps, #streams)"; + CHECK_EQ(T_, bottom[1]->shape(0)); + CHECK_EQ(N_, bottom[1]->shape(1)); + + // If provided, bottom[2] is a static input to the recurrent net. + static_input_ = (bottom.size() > 2); + if (static_input_) { + CHECK_GE(bottom[2]->num_axes(), 1); + CHECK_EQ(N_, bottom[2]->shape(0)); + } + + // Create a NetParameter; setup the inputs that aren't unique to particular + // recurrent architectures. + NetParameter net_param; + net_param.set_force_backward(true); + + net_param.add_input("x"); + BlobShape input_shape; + for (int i = 0; i < bottom[0]->num_axes(); ++i) { + input_shape.add_dim(bottom[0]->shape(i)); + } + net_param.add_input_shape()->CopyFrom(input_shape); + + input_shape.Clear(); + for (int i = 0; i < bottom[1]->num_axes(); ++i) { + input_shape.add_dim(bottom[1]->shape(i)); + } + net_param.add_input("cont"); + net_param.add_input_shape()->CopyFrom(input_shape); + + if (static_input_) { + input_shape.Clear(); + for (int i = 0; i < bottom[2]->num_axes(); ++i) { + input_shape.add_dim(bottom[2]->shape(i)); + } + net_param.add_input("x_static"); + net_param.add_input_shape()->CopyFrom(input_shape); + } + + // Call the child's FillUnrolledNet implementation to specify the unrolled + // recurrent architecture. + this->FillUnrolledNet(&net_param); + + // Prepend this layer's name to the names of each layer in the unrolled net. + const string& layer_name = this->layer_param_.name(); + if (layer_name.size() > 0) { + for (int i = 0; i < net_param.layer_size(); ++i) { + LayerParameter* layer = net_param.mutable_layer(i); + layer->set_name(layer_name + "_" + layer->name()); + } + } + + // Create the unrolled net. + unrolled_net_.reset(new Net<Dtype>(net_param)); + unrolled_net_->set_debug_info( + this->layer_param_.recurrent_param().debug_info()); + + // Setup pointers to the inputs.
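+ // (blob_by_name returns a shared_ptr; CHECK_NOTNULL on .get() fails fast + // if the unrolled net is missing an expected input blob.)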
+ x_input_blob_ = CHECK_NOTNULL(unrolled_net_->blob_by_name("x").get()); + cont_input_blob_ = CHECK_NOTNULL(unrolled_net_->blob_by_name("cont").get()); + if (static_input_) { + x_static_input_blob_ = + CHECK_NOTNULL(unrolled_net_->blob_by_name("x_static").get()); + } + + // Setup pointers to paired recurrent inputs/outputs. + vector<string> recur_input_names; + RecurrentInputBlobNames(&recur_input_names); + vector<string> recur_output_names; + RecurrentOutputBlobNames(&recur_output_names); + const int num_recur_blobs = recur_input_names.size(); + CHECK_EQ(num_recur_blobs, recur_output_names.size()); + recur_input_blobs_.resize(num_recur_blobs); + recur_output_blobs_.resize(num_recur_blobs); + for (int i = 0; i < recur_input_names.size(); ++i) { + recur_input_blobs_[i] = + CHECK_NOTNULL(unrolled_net_->blob_by_name(recur_input_names[i]).get()); + recur_output_blobs_[i] = + CHECK_NOTNULL(unrolled_net_->blob_by_name(recur_output_names[i]).get()); + } + + // Setup pointers to outputs. + vector<string> output_names; + OutputBlobNames(&output_names); + CHECK_EQ(top.size(), output_names.size()) + << "OutputBlobNames must provide an output blob name for each top."; + output_blobs_.resize(output_names.size()); + for (int i = 0; i < output_names.size(); ++i) { + output_blobs_[i] = + CHECK_NOTNULL(unrolled_net_->blob_by_name(output_names[i]).get()); + } + + // We should have 2 inputs (x and cont), plus a number of recurrent inputs, + // plus maybe a static input. + CHECK_EQ(2 + num_recur_blobs + static_input_, + unrolled_net_->input_blobs().size()); + + // This layer's parameters are any parameters in the layers of the unrolled + // net. We only want one copy of each parameter, so check that the parameter + // is "owned" by the layer, rather than shared with another. + this->blobs_.clear(); + for (int i = 0; i < unrolled_net_->params().size(); ++i) { + if (unrolled_net_->param_owners()[i] == -1) { + LOG(INFO) << "Adding parameter " << i << ": " + << unrolled_net_->param_display_names()[i]; + this->blobs_.push_back(unrolled_net_->params()[i]); + } + } + // Check that param_propagate_down is set for all of the parameters in the + // unrolled net; set param_propagate_down to true in this layer. + for (int i = 0; i < unrolled_net_->layers().size(); ++i) { + for (int j = 0; j < unrolled_net_->layers()[i]->blobs().size(); ++j) { + CHECK(unrolled_net_->layers()[i]->param_propagate_down(j)) + << "param_propagate_down not set for layer " << i << ", param " << j; + } + } + this->param_propagate_down_.clear(); + this->param_propagate_down_.resize(this->blobs_.size(), true); + + // Set the diffs of recurrent outputs to 0 -- we can't backpropagate across + // batches.
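+ // (Forward copies each recurrent output blob into the matching recurrent + // input blob of the next batch; the corresponding gradient from that future + // batch is never available, so its contribution is fixed at zero here.)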
+  for (int i = 0; i < recur_output_blobs_.size(); ++i) {
+    caffe_set(recur_output_blobs_[i]->count(), Dtype(0),
+              recur_output_blobs_[i]->mutable_cpu_diff());
+  }
+}
+
+template <typename Dtype>
+void RecurrentLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+  CHECK_GE(bottom[0]->num_axes(), 2)
+      << "bottom[0] must have at least 2 axes -- (#timesteps, #streams, ...)";
+  CHECK_EQ(T_, bottom[0]->shape(0)) << "input number of timesteps changed";
+  N_ = bottom[0]->shape(1);
+  CHECK_EQ(bottom[1]->num_axes(), 2)
+      << "bottom[1] must have exactly 2 axes -- (#timesteps, #streams)";
+  CHECK_EQ(T_, bottom[1]->shape(0));
+  CHECK_EQ(N_, bottom[1]->shape(1));
+  CHECK_EQ(top.size(), output_blobs_.size());
+  x_input_blob_->ReshapeLike(*bottom[0]);
+  vector<int> cont_shape = bottom[1]->shape();
+  cont_input_blob_->Reshape(cont_shape);
+  if (static_input_) {
+    x_static_input_blob_->ReshapeLike(*bottom[2]);
+  }
+  vector<BlobShape> recur_input_shapes;
+  RecurrentInputShapes(&recur_input_shapes);
+  CHECK_EQ(recur_input_shapes.size(), recur_input_blobs_.size());
+  for (int i = 0; i < recur_input_shapes.size(); ++i) {
+    recur_input_blobs_[i]->Reshape(recur_input_shapes[i]);
+  }
+  unrolled_net_->Reshape();
+  x_input_blob_->ShareData(*bottom[0]);
+  x_input_blob_->ShareDiff(*bottom[0]);
+  cont_input_blob_->ShareData(*bottom[1]);
+  if (static_input_) {
+    x_static_input_blob_->ShareData(*bottom[2]);
+    x_static_input_blob_->ShareDiff(*bottom[2]);
+  }
+  for (int i = 0; i < top.size(); ++i) {
+    top[i]->ReshapeLike(*output_blobs_[i]);
+    top[i]->ShareData(*output_blobs_[i]);
+    top[i]->ShareDiff(*output_blobs_[i]);
+  }
+}
+
+template <typename Dtype>
+void RecurrentLayer<Dtype>::Reset() {
+  // "Reset" the hidden state of the net by zeroing out all recurrent outputs.
+  for (int i = 0; i < recur_output_blobs_.size(); ++i) {
+    caffe_set(recur_output_blobs_[i]->count(), Dtype(0),
+              recur_output_blobs_[i]->mutable_cpu_data());
+  }
+}
+
+template <typename Dtype>
+void RecurrentLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top) {
+  DCHECK_EQ(recur_input_blobs_.size(), recur_output_blobs_.size());
+  for (int i = 0; i < recur_input_blobs_.size(); ++i) {
+    const int count = recur_input_blobs_[i]->count();
+    DCHECK_EQ(count, recur_output_blobs_[i]->count());
+    const Dtype* timestep_T_data = recur_output_blobs_[i]->cpu_data();
+    Dtype* timestep_0_data = recur_input_blobs_[i]->mutable_cpu_data();
+    caffe_copy(count, timestep_T_data, timestep_0_data);
+  }
+
+  unrolled_net_->ForwardPrefilled();
+}
+
+template <typename Dtype>
+void RecurrentLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  CHECK(!propagate_down[1]) << "Cannot backpropagate to sequence indicators.";
+
+  // TODO: skip backpropagation to inputs and parameters inside the unrolled
+  // net according to propagate_down[0] and propagate_down[2].  For now just
+  // backprop to inputs and parameters unconditionally, as either the inputs
+  // or the parameters do need backward (or Net would have set
+  // layer_needs_backward_[i] == false for this layer).
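+  //
+  // Because the unrolled timesteps share their parameters (tied by name via
+  // ParamSpec), Backward() below accumulates each shared parameter's
+  // gradient over all T timesteps -- i.e., standard backpropagation through
+  // time within the batch.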
+  unrolled_net_->Backward();
+}
+
+#ifdef CPU_ONLY
+STUB_GPU_FORWARD(RecurrentLayer, Forward);
+#endif
+
+INSTANTIATE_CLASS(RecurrentLayer);
+
+}  // namespace caffe
diff --git a/src/caffe/layers/recurrent_layer.cu b/src/caffe/layers/recurrent_layer.cu
new file mode 100644
index 00000000000..fa06b8add5e
--- /dev/null
+++ b/src/caffe/layers/recurrent_layer.cu
@@ -0,0 +1,35 @@
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/filler.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/sequence_layers.hpp"
+#include "caffe/util/math_functions.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+void RecurrentLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top) {
+  // Hacky fix for test time... reshare all the shared blobs.
+  // TODO: somehow make this work non-hackily.
+  if (this->phase_ == TEST) {
+    unrolled_net_->ShareWeights();
+  }
+
+  DCHECK_EQ(recur_input_blobs_.size(), recur_output_blobs_.size());
+  for (int i = 0; i < recur_input_blobs_.size(); ++i) {
+    const int count = recur_input_blobs_[i]->count();
+    DCHECK_EQ(count, recur_output_blobs_[i]->count());
+    const Dtype* timestep_T_data = recur_output_blobs_[i]->gpu_data();
+    Dtype* timestep_0_data = recur_input_blobs_[i]->mutable_gpu_data();
+    caffe_copy(count, timestep_T_data, timestep_0_data);
+  }
+
+  unrolled_net_->ForwardPrefilled();
+}
+
+INSTANTIATE_LAYER_GPU_FORWARD(RecurrentLayer);
+
+}  // namespace caffe
diff --git a/src/caffe/layers/rnn_layer.cpp b/src/caffe/layers/rnn_layer.cpp
new file mode 100644
index 00000000000..88ec92179cc
--- /dev/null
+++ b/src/caffe/layers/rnn_layer.cpp
@@ -0,0 +1,229 @@
+#include <string>
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/filler.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/sequence_layers.hpp"
+#include "caffe/util/math_functions.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+void RNNLayer<Dtype>::RecurrentInputBlobNames(vector<string>* names) const {
+  names->resize(1);
+  (*names)[0] = "h_0";
+}
+
+template <typename Dtype>
+void RNNLayer<Dtype>::RecurrentOutputBlobNames(vector<string>* names) const {
+  names->resize(1);
+  (*names)[0] = "h_" + this->int_to_str(this->T_);
+}
+
+template <typename Dtype>
+void RNNLayer<Dtype>::RecurrentInputShapes(vector<BlobShape>* shapes) const {
+  const int num_output = this->layer_param_.recurrent_param().num_output();
+  shapes->resize(1);
+  (*shapes)[0].Clear();
+  (*shapes)[0].add_dim(1);  // a single timestep
+  (*shapes)[0].add_dim(this->N_);
+  (*shapes)[0].add_dim(num_output);
+}
+
+template <typename Dtype>
+void RNNLayer<Dtype>::OutputBlobNames(vector<string>* names) const {
+  names->resize(1);
+  (*names)[0] = "o";
+}
+
+template <typename Dtype>
+void RNNLayer<Dtype>::FillUnrolledNet(NetParameter* net_param) const {
+  const int num_output = this->layer_param_.recurrent_param().num_output();
+  CHECK_GT(num_output, 0) << "num_output must be positive";
+  const FillerParameter& weight_filler =
+      this->layer_param_.recurrent_param().weight_filler();
+  const FillerParameter& bias_filler =
+      this->layer_param_.recurrent_param().bias_filler();
+
+  // Add generic LayerParameter's (without bottoms/tops) of layer types we'll
+  // use to save redundant code.
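+  //
+  // hidden_param / biased_hidden_param: InnerProduct prototypes (without and
+  // with a bias term) reused for the W_xh, W_hh, and W_ho transformations;
+  // sum_param, tanh_param, scalar_param, and slice_param: prototypes for the
+  // Eltwise SUM, TanH, Scalar (sequence flushing), and Slice (per-timestep
+  // splitting) layers instantiated below.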
+  LayerParameter hidden_param;
+  hidden_param.set_type("InnerProduct");
+  hidden_param.mutable_inner_product_param()->set_num_output(num_output);
+  hidden_param.mutable_inner_product_param()->set_bias_term(false);
+  hidden_param.mutable_inner_product_param()->set_axis(2);
+  hidden_param.mutable_inner_product_param()->
+      mutable_weight_filler()->CopyFrom(weight_filler);
+
+  LayerParameter biased_hidden_param(hidden_param);
+  biased_hidden_param.mutable_inner_product_param()->set_bias_term(true);
+  biased_hidden_param.mutable_inner_product_param()->
+      mutable_bias_filler()->CopyFrom(bias_filler);
+
+  LayerParameter sum_param;
+  sum_param.set_type("Eltwise");
+  sum_param.mutable_eltwise_param()->set_operation(
+      EltwiseParameter_EltwiseOp_SUM);
+
+  LayerParameter tanh_param;
+  tanh_param.set_type("TanH");
+
+  LayerParameter scalar_param;
+  scalar_param.set_type("Scalar");
+  scalar_param.mutable_scalar_param()->set_axis(0);
+
+  LayerParameter slice_param;
+  slice_param.set_type("Slice");
+  slice_param.mutable_slice_param()->set_axis(0);
+
+  vector<BlobShape> input_shapes;
+  RecurrentInputShapes(&input_shapes);
+  CHECK_EQ(1, input_shapes.size());
+  net_param->add_input("h_0");
+  net_param->add_input_shape()->CopyFrom(input_shapes[0]);
+
+  LayerParameter* cont_slice_param = net_param->add_layer();
+  cont_slice_param->CopyFrom(slice_param);
+  cont_slice_param->set_name("cont_slice");
+  cont_slice_param->add_bottom("cont");
+  cont_slice_param->mutable_slice_param()->set_axis(0);
+
+  // Add layer to transform all timesteps of x to the hidden state dimension.
+  //     W_xh_x = W_xh * x + b_h
+  {
+    LayerParameter* x_transform_param = net_param->add_layer();
+    x_transform_param->CopyFrom(biased_hidden_param);
+    x_transform_param->set_name("x_transform");
+    x_transform_param->add_param()->set_name("W_xh");
+    x_transform_param->add_param()->set_name("b_h");
+    x_transform_param->add_bottom("x");
+    x_transform_param->add_top("W_xh_x");
+  }
+
+  if (this->static_input_) {
+    // Add layer to transform x_static to the hidden state dimension.
+    //     W_xh_x_static = W_xh_static * x_static
+    LayerParameter* x_static_transform_param = net_param->add_layer();
+    x_static_transform_param->CopyFrom(hidden_param);
+    x_static_transform_param->mutable_inner_product_param()->set_axis(1);
+    x_static_transform_param->set_name("W_xh_x_static");
+    x_static_transform_param->add_param()->set_name("W_xh_static");
+    x_static_transform_param->add_bottom("x_static");
+    x_static_transform_param->add_top("W_xh_x_static");
+
+    LayerParameter* reshape_param = net_param->add_layer();
+    reshape_param->set_type("Reshape");
+    BlobShape* new_shape =
+        reshape_param->mutable_reshape_param()->mutable_shape();
+    new_shape->add_dim(1);  // One timestep.
+    new_shape->add_dim(this->N_);
+    new_shape->add_dim(
+        x_static_transform_param->inner_product_param().num_output());
+    reshape_param->set_name("W_xh_x_static_reshape");
+    reshape_param->add_bottom("W_xh_x_static");
+    reshape_param->add_top("W_xh_x_static");
+  }
+
+  LayerParameter* x_slice_param = net_param->add_layer();
+  x_slice_param->CopyFrom(slice_param);
+  x_slice_param->set_name("W_xh_x_slice");
+  x_slice_param->add_bottom("W_xh_x");
+
+  LayerParameter output_concat_layer;
+  output_concat_layer.set_name("o_concat");
+  output_concat_layer.set_type("Concat");
+  output_concat_layer.add_top("o");
+  output_concat_layer.mutable_concat_param()->set_axis(0);
+
+  for (int t = 1; t <= this->T_; ++t) {
+    string tm1s = this->int_to_str(t - 1);
+    string ts = this->int_to_str(t);
+
+    cont_slice_param->add_top("cont_" + ts);
+    x_slice_param->add_top("W_xh_x_" + ts);
+
+    // Add layer to flush the hidden state when beginning a new sequence,
+    // as indicated by cont_t.
+    //     h_conted_{t-1} := cont_t * h_{t-1}
+    //
+    // Normally, cont_t is binary (i.e., 0 or 1), so:
+    //     h_conted_{t-1} := h_{t-1} if cont_t == 1
+    //                       0       otherwise
+    {
+      LayerParameter* cont_h_param = net_param->add_layer();
+      cont_h_param->CopyFrom(scalar_param);
+      cont_h_param->set_name("h_conted_" + tm1s);
+      cont_h_param->add_bottom("h_" + tm1s);
+      cont_h_param->add_bottom("cont_" + ts);
+      cont_h_param->add_top("h_conted_" + tm1s);
+    }
+
+    // Add layer to compute
+    //     W_hh_h_{t-1} := W_hh * h_conted_{t-1}
+    {
+      LayerParameter* w_param = net_param->add_layer();
+      w_param->CopyFrom(hidden_param);
+      w_param->set_name("W_hh_h_" + tm1s);
+      w_param->add_param()->set_name("W_hh");
+      w_param->add_bottom("h_conted_" + tm1s);
+      w_param->add_top("W_hh_h_" + tm1s);
+      w_param->mutable_inner_product_param()->set_axis(2);
+    }
+
+    // Add layers to compute
+    //     h_t := \tanh( W_hh * h_conted_{t-1} + W_xh * x_t + b_h )
+    //          = \tanh( W_hh_h_{t-1} + W_xh_t )
+    {
+      LayerParameter* h_input_sum_param = net_param->add_layer();
+      h_input_sum_param->CopyFrom(sum_param);
+      h_input_sum_param->set_name("h_input_sum_" + ts);
+      h_input_sum_param->add_bottom("W_hh_h_" + tm1s);
+      h_input_sum_param->add_bottom("W_xh_x_" + ts);
+      if (this->static_input_) {
+        h_input_sum_param->add_bottom("W_xh_x_static");
+      }
+      h_input_sum_param->add_top("h_neuron_input_" + ts);
+    }
+    {
+      LayerParameter* h_neuron_param = net_param->add_layer();
+      h_neuron_param->CopyFrom(tanh_param);
+      h_neuron_param->set_name("h_neuron_" + ts);
+      h_neuron_param->add_bottom("h_neuron_input_" + ts);
+      h_neuron_param->add_top("h_" + ts);
+    }
+
+    // Add layer to compute
+    //     W_ho_h_t := W_ho * h_t + b_o
+    {
+      LayerParameter* w_param = net_param->add_layer();
+      w_param->CopyFrom(biased_hidden_param);
+      w_param->set_name("W_ho_h_" + ts);
+      w_param->add_param()->set_name("W_ho");
+      w_param->add_param()->set_name("b_o");
+      w_param->add_bottom("h_" + ts);
+      w_param->add_top("W_ho_h_" + ts);
+      w_param->mutable_inner_product_param()->set_axis(2);
+    }
+
+    // Add layers to compute
+    //     o_t := \tanh( W_ho * h_t + b_o )
+    //          = \tanh( W_ho_h_t )
+    {
+      LayerParameter* o_neuron_param = net_param->add_layer();
+      o_neuron_param->CopyFrom(tanh_param);
+      o_neuron_param->set_name("o_neuron_" + ts);
+      o_neuron_param->add_bottom("W_ho_h_" + ts);
+      o_neuron_param->add_top("o_" + ts);
+    }
+    output_concat_layer.add_bottom("o_" + ts);
+  }  // for (int t = 1; t <= this->T_; ++t)
+
+  net_param->add_layer()->CopyFrom(output_concat_layer);
+}
+
+INSTANTIATE_CLASS(RNNLayer);
+REGISTER_LAYER_CLASS(RNN);
+
+}  // namespace caffe
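For reference, the unrolled architecture that FillUnrolledNet generates looks
like this (layer names as in the code above; an illustration only, not part of
the patch):

    cont_slice (Slice):            cont -> cont_1, ..., cont_T
    x_transform (InnerProduct):    x -> W_xh_x                  [W_xh, b_h]
    W_xh_x_slice (Slice):          W_xh_x -> W_xh_x_1, ..., W_xh_x_T
    for each timestep t = 1..T:
      h_conted_{t-1} (Scalar):     h_{t-1} * cont_t -> h_conted_{t-1}
      W_hh_h_{t-1} (InnerProduct): h_conted_{t-1} -> W_hh_h_{t-1}  [W_hh]
      h_input_sum_t (Eltwise SUM): W_hh_h_{t-1} + W_xh_x_t [+ W_xh_x_static]
      h_neuron_t (TanH):           h_neuron_input_t -> h_t
      W_ho_h_t (InnerProduct):     h_t -> W_ho_h_t              [W_ho, b_o]
      o_neuron_t (TanH):           W_ho_h_t -> o_t
    o_concat (Concat):             o_1, ..., o_T -> o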
diff --git a/src/caffe/layers/scalar_layer.cpp b/src/caffe/layers/scalar_layer.cpp
new file mode 100644
index 00000000000..5a4fac1aaee
--- /dev/null
+++ b/src/caffe/layers/scalar_layer.cpp
@@ -0,0 +1,119 @@
+#include <algorithm>
+#include <vector>
+
+#include "caffe/common_layers.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/util/math_functions.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+void ScalarLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+  // TODO: make ScalarLayer usable in-place.
+  // Currently, in-place computation is broken during Backward with
+  // propagate_down[0] && propagate_down[1], as bottom[0]'s diff is used for
+  // temporary storage of an intermediate result, overwriting top[0]'s diff
+  // if using in-place computation.
+  CHECK_NE(bottom[0], top[0]) << "ScalarLayer cannot be used in-place";
+  axis_ =
+      bottom[0]->CanonicalAxisIndex(this->layer_param_.scalar_param().axis());
+  CHECK_GE(bottom[0]->num_axes(), axis_ + bottom[1]->num_axes())
+      << "bottom[1]'s shape extends past bottom[0]'s shape when applied "
+      << "starting with bottom[0] axis = " << axis_;
+  for (int i = 0; i < bottom[1]->num_axes(); ++i) {
+    CHECK_EQ(bottom[0]->shape(axis_ + i), bottom[1]->shape(i))
+        << "dimension mismatch between bottom[0]->shape(" << axis_ + i
+        << ") and bottom[1]->shape(" << i << ")";
+  }
+  outer_dim_ = bottom[0]->count(0, axis_);
+  scalar_dim_ = bottom[1]->count();
+  inner_dim_ = bottom[0]->count(axis_ + bottom[1]->num_axes());
+  top[0]->ReshapeLike(*bottom[0]);
+  sum_result_.Reshape(vector<int>(1, outer_dim_ * scalar_dim_));
+  const int sum_mult_size = std::max(outer_dim_, inner_dim_);
+  sum_multiplier_.Reshape(vector<int>(1, sum_mult_size));
+  if (sum_multiplier_.cpu_data()[sum_mult_size - 1] != Dtype(1)) {
+    caffe_set(sum_mult_size, Dtype(1), sum_multiplier_.mutable_cpu_data());
+  }
+}
+
+template <typename Dtype>
+void ScalarLayer<Dtype>::Forward_cpu(
+    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->cpu_data();
+  const Dtype* scalar_data = bottom[1]->cpu_data();
+  Dtype* top_data = top[0]->mutable_cpu_data();
+  for (int n = 0; n < outer_dim_; ++n) {
+    for (int d = 0; d < scalar_dim_; ++d) {
+      const Dtype factor = scalar_data[d];
+      caffe_cpu_scale(inner_dim_, factor, bottom_data, top_data);
+      bottom_data += inner_dim_;
+      top_data += inner_dim_;
+    }
+  }
+}
+
+template <typename Dtype>
+void ScalarLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (propagate_down[1]) {
+    const Dtype* top_diff = top[0]->cpu_diff();
+    const Dtype* bottom_data = bottom[0]->cpu_data();
+    // Hack: store big eltwise product in bottom[0] diff, except in the special
+    // case where this layer itself does the eltwise product, in which case we
+    // can store it directly in the scalar diff, and we're done.
+    const bool is_eltwise = (inner_dim_ == 1 && outer_dim_ == 1);
+    Dtype* product = is_eltwise ?
+        bottom[1]->mutable_cpu_diff() : bottom[0]->mutable_cpu_diff();
+    caffe_mul(top[0]->count(), top_diff, bottom_data, product);
+    if (!is_eltwise) {
+      Dtype* sum_result = NULL;
+      if (inner_dim_ == 1) {
+        sum_result = product;
+      } else if (sum_result_.count() == 1) {
+        const Dtype* sum_mult = sum_multiplier_.cpu_data();
+        Dtype* scalar_diff = bottom[1]->mutable_cpu_diff();
+        *scalar_diff = caffe_cpu_dot(inner_dim_, product, sum_mult);
+      } else {
+        const Dtype* sum_mult = sum_multiplier_.cpu_data();
+        sum_result = (outer_dim_ == 1) ?
+            bottom[1]->mutable_cpu_diff() : sum_result_.mutable_cpu_data();
+        caffe_cpu_gemv(CblasNoTrans, sum_result_.count(), inner_dim_,
+                       Dtype(1), product, sum_mult, Dtype(0), sum_result);
+      }
+      if (outer_dim_ != 1) {
+        const Dtype* sum_mult = sum_multiplier_.cpu_data();
+        Dtype* scalar_diff = bottom[1]->mutable_cpu_diff();
+        if (scalar_dim_ == 1) {
+          *scalar_diff = caffe_cpu_dot(outer_dim_, sum_mult, sum_result);
+        } else {
+          caffe_cpu_gemv(CblasTrans, outer_dim_, scalar_dim_,
+                         Dtype(1), sum_result, sum_mult, Dtype(0),
+                         scalar_diff);
+        }
+      }
+    }
+  }
+  if (propagate_down[0]) {
+    const Dtype* top_diff = top[0]->cpu_diff();
+    const Dtype* scalar_data = bottom[1]->cpu_data();
+    Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+    for (int n = 0; n < outer_dim_; ++n) {
+      for (int d = 0; d < scalar_dim_; ++d) {
+        const Dtype factor = scalar_data[d];
+        caffe_cpu_scale(inner_dim_, factor, top_diff, bottom_diff);
+        bottom_diff += inner_dim_;
+        top_diff += inner_dim_;
+      }
+    }
+  }
+}
+
+#ifdef CPU_ONLY
+STUB_GPU(ScalarLayer);
+#endif
+
+INSTANTIATE_CLASS(ScalarLayer);
+REGISTER_LAYER_CLASS(Scalar);
+
+}  // namespace caffe
diff --git a/src/caffe/layers/scalar_layer.cu b/src/caffe/layers/scalar_layer.cu
new file mode 100644
index 00000000000..2711540048a
--- /dev/null
+++ b/src/caffe/layers/scalar_layer.cu
@@ -0,0 +1,86 @@
+#include <cfloat>
+#include <vector>
+
+#include "caffe/common_layers.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/util/math_functions.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+__global__ void ScalarForward(const int n, const Dtype* in,
+    const Dtype* scalars, const int scalar_dim, const int inner_dim,
+    Dtype* out) {
+  CUDA_KERNEL_LOOP(index, n) {
+    const int scalar_index = (index / inner_dim) % scalar_dim;
+    out[index] = in[index] * scalars[scalar_index];
+  }
+}
+
+template <typename Dtype>
+void ScalarLayer<Dtype>::Forward_gpu(
+    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+  const int count = top[0]->count();
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  const Dtype* scalar_data = bottom[1]->gpu_data();
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  ScalarForward<Dtype>  // NOLINT_NEXT_LINE(whitespace/operators)
+      <<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
+      count, bottom_data, scalar_data, scalar_dim_, inner_dim_, top_data);
+}
+
+template <typename Dtype>
+void ScalarLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (propagate_down[1]) {
+    const Dtype* top_diff = top[0]->gpu_diff();
+    const Dtype* bottom_data = bottom[0]->gpu_data();
+    // Hack: store big eltwise product in bottom[0] diff, except in the special
+    // case where this layer itself does the eltwise product, in which case we
+    // can store it directly in the scalar diff, and we're done.
+    const bool is_eltwise = (inner_dim_ == 1 && outer_dim_ == 1);
+    Dtype* product = is_eltwise ?
+        bottom[1]->mutable_gpu_diff() : bottom[0]->mutable_gpu_diff();
+    caffe_gpu_mul(top[0]->count(), top_diff, bottom_data, product);
+    if (!is_eltwise) {
+      Dtype* sum_result = NULL;
+      if (inner_dim_ == 1) {
+        sum_result = product;
+      } else if (sum_result_.count() == 1) {
+        const Dtype* sum_mult = sum_multiplier_.gpu_data();
+        Dtype* scalar_diff = bottom[1]->mutable_cpu_diff();
+        caffe_gpu_dot(inner_dim_, product, sum_mult, scalar_diff);
+      } else {
+        const Dtype* sum_mult = sum_multiplier_.gpu_data();
+        sum_result = (outer_dim_ == 1) ?
+            bottom[1]->mutable_gpu_diff() : sum_result_.mutable_gpu_data();
+        caffe_gpu_gemv(CblasNoTrans, sum_result_.count(), inner_dim_,
+                       Dtype(1), product, sum_mult, Dtype(0), sum_result);
+      }
+      if (outer_dim_ != 1) {
+        const Dtype* sum_mult = sum_multiplier_.gpu_data();
+        if (scalar_dim_ == 1) {
+          Dtype* scalar_diff = bottom[1]->mutable_cpu_diff();
+          caffe_gpu_dot(outer_dim_, sum_mult, sum_result, scalar_diff);
+        } else {
+          Dtype* scalar_diff = bottom[1]->mutable_gpu_diff();
+          caffe_gpu_gemv(CblasTrans, outer_dim_, scalar_dim_,
+                         Dtype(1), sum_result, sum_mult, Dtype(0),
+                         scalar_diff);
+        }
+      }
+    }
+  }
+  if (propagate_down[0]) {
+    const int count = top[0]->count();
+    const Dtype* top_diff = top[0]->gpu_diff();
+    const Dtype* scalar_data = bottom[1]->gpu_data();
+    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+    ScalarForward<Dtype>  // NOLINT_NEXT_LINE(whitespace/operators)
+        <<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
+        count, top_diff, scalar_data, scalar_dim_, inner_dim_, bottom_diff);
+  }
+}
+
+INSTANTIATE_LAYER_GPU_FUNCS(ScalarLayer);
+
+}  // namespace caffe
diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto
index aa299f8660b..7c7d6074184 100644
--- a/src/caffe/proto/caffe.proto
+++ b/src/caffe/proto/caffe.proto
@@ -301,7 +301,7 @@ message ParamSpec {
 // NOTE
 // Update the next available ID when you add a new LayerParameter field.
 //
-// LayerParameter next available layer-specific ID: 139 (last added: tile_param)
+// LayerParameter next available layer-specific ID: 141 (last added: recurrent_param)
 message LayerParameter {
   optional string name = 1; // the layer name
   optional string type = 2; // the layer type
@@ -374,9 +374,11 @@ message LayerParameter {
   optional PowerParameter power_param = 122;
   optional PReLUParameter prelu_param = 131;
   optional PythonParameter python_param = 130;
+  optional RecurrentParameter recurrent_param = 140;
   optional ReductionParameter reduction_param = 136;
   optional ReLUParameter relu_param = 123;
   optional ReshapeParameter reshape_param = 133;
+  optional ScalarParameter scalar_param = 139;
   optional SigmoidParameter sigmoid_param = 124;
   optional SoftmaxParameter softmax_param = 125;
   optional SPPParameter spp_param = 132;
@@ -767,6 +769,19 @@ message PythonParameter {
   optional bool share_in_parallel = 4 [default = false];
 }
 
+// Message that stores parameters used by RecurrentLayer
+message RecurrentParameter {
+  // The dimension of the output (and usually hidden state) representation --
+  // must be explicitly set to non-zero.
+  optional uint32 num_output = 1 [default = 0];
+
+  optional FillerParameter weight_filler = 2; // The filler for the weight
+  optional FillerParameter bias_filler = 3; // The filler for the bias
+
+  // Whether to enable displaying debug_info in the unrolled recurrent net.
+  optional bool debug_info = 4 [default = false];
+}
+
 // Message that stores parameters used by ReductionLayer
 message ReductionParameter {
   enum ReductionOp {
@@ -876,6 +891,23 @@ message ReshapeParameter {
   optional int32 num_axes = 3 [default = -1];
 }
 
+message ScalarParameter {
+  // The first axis of bottom[0] (the first input Blob) along which to apply
+  // bottom[1] (the second input Blob). May be negative to index from the end
+  // (e.g., -1 for the last axis).
+ // + // For example, if bottom[0] is 4D with shape 100x3x224x224, the output + // top[0] will have the same shape, and bottom[1] may have any of the + // following shapes (for the given value of axis): + // (axis == 0 == -4) 100; 100x3; 100x3x224; 100x3x224x224 + // (axis == 1 == -3) 3; 3x224; 3x224x224 + // (axis == 2 == -2) 224; 224x224 + // (axis == 3 == -1) 224 + // Furthermore, bottom[1] may have the empty shape (regardless of the value of + // "axis") -- a literal scalar. + optional int32 axis = 1 [default = 0]; +} + message SigmoidParameter { enum Engine { DEFAULT = 0; diff --git a/src/caffe/test/test_lstm_layer.cpp b/src/caffe/test/test_lstm_layer.cpp new file mode 100644 index 00000000000..1fdc2fd2041 --- /dev/null +++ b/src/caffe/test/test_lstm_layer.cpp @@ -0,0 +1,266 @@ +#include +#include + +#include "gtest/gtest.h" + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/sequence_layers.hpp" + +#include "caffe/test/test_caffe_main.hpp" +#include "caffe/test/test_gradient_check_util.hpp" + +namespace caffe { + +template +class LSTMLayerTest : public MultiDeviceTest { + typedef typename TypeParam::Dtype Dtype; + + protected: + LSTMLayerTest() : num_output_(7) { + blob_bottom_vec_.push_back(&blob_bottom_); + blob_bottom_vec_.push_back(&blob_bottom_flush_); + blob_top_vec_.push_back(&blob_top_); + unit_blob_bottom_vec_.push_back(&unit_blob_bottom_c_prev_); + unit_blob_bottom_vec_.push_back(&unit_blob_bottom_x_); + unit_blob_bottom_vec_.push_back(&unit_blob_bottom_flush_); + unit_blob_top_vec_.push_back(&unit_blob_top_c_); + unit_blob_top_vec_.push_back(&unit_blob_top_h_); + + ReshapeBlobs(1, 3); + + layer_param_.mutable_recurrent_param()->set_num_output(num_output_); + FillerParameter* weight_filler = + layer_param_.mutable_recurrent_param()->mutable_weight_filler(); + weight_filler->set_type("gaussian"); + weight_filler->set_std(0.2); + FillerParameter* bias_filler = + layer_param_.mutable_recurrent_param()->mutable_bias_filler(); + bias_filler->set_type("gaussian"); + bias_filler->set_std(0.1); + + layer_param_.set_phase(TEST); + } + + void ReshapeBlobs(int num_timesteps, int num_instances) { + blob_bottom_.Reshape(num_timesteps, num_instances, 3, 2); + vector shape(2); + shape[0] = num_timesteps; + shape[1] = num_instances; + blob_bottom_flush_.Reshape(shape); + shape.push_back(num_output_); + + shape[0] = 1; shape[1] = num_instances; shape[2] = 4 * num_output_; + unit_blob_bottom_x_.Reshape(shape); + shape[0] = 1; shape[1] = num_instances; shape[2] = num_output_; + unit_blob_bottom_c_prev_.Reshape(shape); + shape.resize(2); + shape[0] = 1; shape[1] = num_instances; + unit_blob_bottom_flush_.Reshape(shape); + + FillerParameter filler_param; + filler_param.set_min(-1); + filler_param.set_max(1); + UniformFiller filler(filler_param); + filler.Fill(&blob_bottom_); + filler.Fill(&unit_blob_bottom_c_prev_); + filler.Fill(&unit_blob_bottom_x_); + } + + int num_output_; + LayerParameter layer_param_; + Blob blob_bottom_; + Blob blob_bottom_flush_; + Blob blob_top_; + vector*> blob_bottom_vec_; + vector*> blob_top_vec_; + + Blob unit_blob_bottom_flush_; + Blob unit_blob_bottom_c_prev_; + Blob unit_blob_bottom_x_; + Blob unit_blob_top_c_; + Blob unit_blob_top_h_; + vector*> unit_blob_bottom_vec_; + vector*> unit_blob_top_vec_; +}; + +TYPED_TEST_CASE(LSTMLayerTest, TestDtypesAndDevices); + +TYPED_TEST(LSTMLayerTest, TestSetUp) { + typedef typename TypeParam::Dtype Dtype; + LSTMLayer layer(this->layer_param_); + 
layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + vector expected_top_shape = this->blob_bottom_.shape(); + expected_top_shape.resize(3); + expected_top_shape[2] = this->num_output_; + EXPECT_TRUE(this->blob_top_.shape() == expected_top_shape); +} + +TYPED_TEST(LSTMLayerTest, TestForward) { + typedef typename TypeParam::Dtype Dtype; + const int kNumTimesteps = 3; + const int num = this->blob_bottom_.shape(1); + this->ReshapeBlobs(kNumTimesteps, num); + + // Fill the flush blob with <0, 1, 1, ..., 1>, + // indicating a sequence that begins at the first timestep + // then continues for the rest of the sequence. + for (int t = 0; t < kNumTimesteps; ++t) { + for (int n = 0; n < num; ++n) { + this->blob_bottom_flush_.mutable_cpu_data()[t * num + n] = t > 0; + } + } + + // Process the full sequence in a single batch. + FillerParameter filler_param; + filler_param.set_mean(0); + filler_param.set_std(1); + GaussianFiller sequence_filler(filler_param); + Caffe::set_random_seed(1); + sequence_filler.Fill(&this->blob_bottom_); + shared_ptr > layer(new LSTMLayer(this->layer_param_)); + Caffe::set_random_seed(1701); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + LOG(INFO) << "Calling forward for full sequence LSTM"; + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + + // Copy the inputs and outputs to reuse/check them later. + Blob bottom_copy(this->blob_bottom_.shape()); + bottom_copy.CopyFrom(this->blob_bottom_); + Blob top_copy(this->blob_top_.shape()); + top_copy.CopyFrom(this->blob_top_); + + // Process the batch one timestep at a time; + // check that we get the same result. + this->ReshapeBlobs(1, num); + layer.reset(new LSTMLayer(this->layer_param_)); + Caffe::set_random_seed(1701); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + const int bottom_count = this->blob_bottom_.count(); + const int top_count = this->blob_top_.count(); + const Dtype kEpsilon = 1e-5; + for (int t = 0; t < kNumTimesteps; ++t) { + caffe_copy(bottom_count, bottom_copy.cpu_data() + t * bottom_count, + this->blob_bottom_.mutable_cpu_data()); + for (int n = 0; n < num; ++n) { + this->blob_bottom_flush_.mutable_cpu_data()[n] = t > 0; + } + LOG(INFO) << "Calling forward for LSTM timestep " << t; + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + for (int i = 0; i < top_count; ++i) { + ASSERT_LT(t * top_count + i, top_copy.count()); + EXPECT_NEAR(this->blob_top_.cpu_data()[i], + top_copy.cpu_data()[t * top_count + i], kEpsilon) + << "t = " << t << "; i = " << i; + } + } + + // Process the batch one timestep at a time with all flush blobs set to 0. + // Check that we get a different result, except in the first timestep. 
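+  // With flush == 0 everywhere, the previous hidden and cell states are
+  // zeroed before every timestep, so each step behaves like the start of a
+  // new sequence; only t == 0 (where the reference run also had flush == 0)
+  // can reproduce the full-sequence outputs.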
+ Caffe::set_random_seed(1701); + layer.reset(new LSTMLayer(this->layer_param_)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + for (int t = 0; t < kNumTimesteps; ++t) { + caffe_copy(bottom_count, bottom_copy.cpu_data() + t * bottom_count, + this->blob_bottom_.mutable_cpu_data()); + for (int n = 0; n < num; ++n) { + this->blob_bottom_flush_.mutable_cpu_data()[n] = 0; + } + LOG(INFO) << "Calling forward for LSTM timestep " << t; + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + for (int i = 0; i < top_count; ++i) { + if (t == 0) { + EXPECT_NEAR(this->blob_top_.cpu_data()[i], + top_copy.cpu_data()[t * top_count + i], kEpsilon) + << "t = " << t << "; i = " << i; + } else { + EXPECT_NE(this->blob_top_.cpu_data()[i], + top_copy.cpu_data()[t * top_count + i]) + << "t = " << t << "; i = " << i; + } + } + } +} + +TYPED_TEST(LSTMLayerTest, TestLSTMUnitSetUp) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + LSTMUnitLayer layer(layer_param); + layer.SetUp(this->unit_blob_bottom_vec_, this->unit_blob_top_vec_); + const int num_axes = this->unit_blob_bottom_c_prev_.num_axes(); + ASSERT_EQ(num_axes, this->unit_blob_top_c_.num_axes()); + ASSERT_EQ(num_axes, this->unit_blob_top_h_.num_axes()); + for (int i = 0; i < num_axes; ++i) { + EXPECT_EQ(this->unit_blob_bottom_c_prev_.shape(i), + this->unit_blob_top_c_.shape(i)); + EXPECT_EQ(this->unit_blob_bottom_c_prev_.shape(i), + this->unit_blob_top_h_.shape(i)); + } +} + +TYPED_TEST(LSTMLayerTest, TestLSTMUnitGradient) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + LSTMUnitLayer layer(layer_param); + GradientChecker checker(1e-2, 1e-3); + Dtype* flush_data = this->blob_bottom_flush_.mutable_cpu_data(); + flush_data[0] = 0; + flush_data[1] = 0; + flush_data[2] = 0; + checker.CheckGradientExhaustive(&layer, this->unit_blob_bottom_vec_, + this->unit_blob_top_vec_, 0); + checker.CheckGradientExhaustive(&layer, this->unit_blob_bottom_vec_, + this->unit_blob_top_vec_, 1); +} + +TYPED_TEST(LSTMLayerTest, TestLSTMUnitGradientNonZeroFlush) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + LSTMUnitLayer layer(layer_param); + GradientChecker checker(1e-2, 1e-3); + Dtype* flush_data = this->blob_bottom_flush_.mutable_cpu_data(); + flush_data[0] = 1; + flush_data[1] = 0; + flush_data[2] = 1; + checker.CheckGradientExhaustive(&layer, this->unit_blob_bottom_vec_, + this->unit_blob_top_vec_, 0); + checker.CheckGradientExhaustive(&layer, this->unit_blob_bottom_vec_, + this->unit_blob_top_vec_, 1); +} + +TYPED_TEST(LSTMLayerTest, TestGradient) { + typedef typename TypeParam::Dtype Dtype; + LSTMLayer layer(this->layer_param_); + GradientChecker checker(1e-2, 1e-3); + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 0); +} + +TYPED_TEST(LSTMLayerTest, TestGradientNonZeroFlush) { + typedef typename TypeParam::Dtype Dtype; + LSTMLayer layer(this->layer_param_); + GradientChecker checker(1e-2, 1e-3); + for (int i = 0; i < this->blob_bottom_flush_.count(); ++i) { + this->blob_bottom_flush_.mutable_cpu_data()[i] = i > 2; + } + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 0); +} + +TYPED_TEST(LSTMLayerTest, TestGradientNonZeroFlushBufferSize2) { + typedef typename TypeParam::Dtype Dtype; + this->ReshapeBlobs(2, 2); + FillerParameter filler_param; + UniformFiller filler(filler_param); + filler.Fill(&this->blob_bottom_); + LSTMLayer layer(this->layer_param_); + GradientChecker checker(1e-2, 
1e-3); + for (int i = 0; i < this->blob_bottom_flush_.count(); ++i) { + this->blob_bottom_flush_.mutable_cpu_data()[i] = i > 2; + } + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 0); +} + +} // namespace caffe diff --git a/src/caffe/test/test_rnn_layer.cpp b/src/caffe/test/test_rnn_layer.cpp new file mode 100644 index 00000000000..eab9269ce77 --- /dev/null +++ b/src/caffe/test/test_rnn_layer.cpp @@ -0,0 +1,196 @@ +#include +#include + +#include "gtest/gtest.h" + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/sequence_layers.hpp" + +#include "caffe/test/test_caffe_main.hpp" +#include "caffe/test/test_gradient_check_util.hpp" + +namespace caffe { + +template +class RNNLayerTest : public MultiDeviceTest { + typedef typename TypeParam::Dtype Dtype; + + protected: + RNNLayerTest() : num_output_(7) { + blob_bottom_vec_.push_back(&blob_bottom_); + blob_bottom_vec_.push_back(&blob_bottom_flush_); + blob_top_vec_.push_back(&blob_top_); + + ReshapeBlobs(1, 3); + + layer_param_.mutable_recurrent_param()->set_num_output(num_output_); + FillerParameter* weight_filler = + layer_param_.mutable_recurrent_param()->mutable_weight_filler(); + weight_filler->set_type("gaussian"); + weight_filler->set_std(0.2); + FillerParameter* bias_filler = + layer_param_.mutable_recurrent_param()->mutable_bias_filler(); + bias_filler->set_type("gaussian"); + bias_filler->set_std(0.1); + + layer_param_.set_phase(TEST); + } + + void ReshapeBlobs(int num_timesteps, int num_instances) { + blob_bottom_.Reshape(num_timesteps, num_instances, 3, 2); + vector shape(2); + shape[0] = num_timesteps; + shape[1] = num_instances; + blob_bottom_flush_.Reshape(shape); + + FillerParameter filler_param; + filler_param.set_min(-1); + filler_param.set_max(1); + UniformFiller filler(filler_param); + filler.Fill(&blob_bottom_); + } + + int num_output_; + LayerParameter layer_param_; + Blob blob_bottom_; + Blob blob_bottom_flush_; + Blob blob_top_; + vector*> blob_bottom_vec_; + vector*> blob_top_vec_; +}; + +TYPED_TEST_CASE(RNNLayerTest, TestDtypesAndDevices); + +TYPED_TEST(RNNLayerTest, TestSetUp) { + typedef typename TypeParam::Dtype Dtype; + RNNLayer layer(this->layer_param_); + layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + vector expected_top_shape = this->blob_bottom_.shape(); + expected_top_shape.resize(3); + expected_top_shape[2] = this->num_output_; + EXPECT_TRUE(this->blob_top_.shape() == expected_top_shape); +} + +TYPED_TEST(RNNLayerTest, TestForward) { + typedef typename TypeParam::Dtype Dtype; + const int kNumTimesteps = 3; + const int num = this->blob_bottom_.shape(1); + this->ReshapeBlobs(kNumTimesteps, num); + + // Fill the flush blob with <0, 1, 1, ..., 1>, + // indicating a sequence that begins at the first timestep + // then continues for the rest of the sequence. + for (int t = 0; t < kNumTimesteps; ++t) { + for (int n = 0; n < num; ++n) { + this->blob_bottom_flush_.mutable_cpu_data()[t * num + n] = t > 0; + } + } + + // Process the full sequence in a single batch. 
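+  // (Same protocol as the LSTM test above: one full-sequence pass provides
+  // the reference outputs, which the stepwise passes below are checked
+  // against.)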
+ FillerParameter filler_param; + filler_param.set_mean(0); + filler_param.set_std(1); + GaussianFiller sequence_filler(filler_param); + sequence_filler.Fill(&this->blob_bottom_); + shared_ptr > layer(new RNNLayer(this->layer_param_)); + Caffe::set_random_seed(1701); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + LOG(INFO) << "Calling forward for full sequence RNN"; + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + + // Copy the inputs and outputs to reuse/check them later. + Blob bottom_copy(this->blob_bottom_.shape()); + bottom_copy.CopyFrom(this->blob_bottom_); + Blob top_copy(this->blob_top_.shape()); + top_copy.CopyFrom(this->blob_top_); + + // Process the batch one timestep at a time; + // check that we get the same result. + this->ReshapeBlobs(1, num); + layer.reset(new RNNLayer(this->layer_param_)); + Caffe::set_random_seed(1701); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + const int bottom_count = this->blob_bottom_.count(); + const int top_count = this->blob_top_.count(); + const Dtype kEpsilon = 1e-5; + for (int t = 0; t < kNumTimesteps; ++t) { + caffe_copy(bottom_count, bottom_copy.cpu_data() + t * bottom_count, + this->blob_bottom_.mutable_cpu_data()); + for (int n = 0; n < num; ++n) { + this->blob_bottom_flush_.mutable_cpu_data()[n] = t > 0; + } + LOG(INFO) << "Calling forward for RNN timestep " << t; + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + for (int i = 0; i < top_count; ++i) { + ASSERT_LT(t * top_count + i, top_copy.count()); + EXPECT_NEAR(this->blob_top_.cpu_data()[i], + top_copy.cpu_data()[t * top_count + i], kEpsilon) + << "t = " << t << "; i = " << i; + } + } + + // Process the batch one timestep at a time with all flush blobs set to 0. + // Check that we get a different result, except in the first timestep. 
+ Caffe::set_random_seed(1701); + layer.reset(new RNNLayer(this->layer_param_)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + for (int t = 0; t < kNumTimesteps; ++t) { + caffe_copy(bottom_count, bottom_copy.cpu_data() + t * bottom_count, + this->blob_bottom_.mutable_cpu_data()); + for (int n = 0; n < num; ++n) { + this->blob_bottom_flush_.mutable_cpu_data()[n] = 0; + } + LOG(INFO) << "Calling forward for RNN timestep " << t; + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + for (int i = 0; i < top_count; ++i) { + if (t == 0) { + EXPECT_NEAR(this->blob_top_.cpu_data()[i], + top_copy.cpu_data()[t * top_count + i], kEpsilon) + << "t = " << t << "; i = " << i; + } else { + EXPECT_NE(this->blob_top_.cpu_data()[i], + top_copy.cpu_data()[t * top_count + i]) + << "t = " << t << "; i = " << i; + } + } + } +} + +TYPED_TEST(RNNLayerTest, TestGradient) { + typedef typename TypeParam::Dtype Dtype; + RNNLayer layer(this->layer_param_); + GradientChecker checker(1e-2, 1e-3); + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 0); +} + +TYPED_TEST(RNNLayerTest, TestGradientNonZeroFlush) { + typedef typename TypeParam::Dtype Dtype; + RNNLayer layer(this->layer_param_); + GradientChecker checker(1e-2, 1e-3); + for (int i = 0; i < this->blob_bottom_flush_.count(); ++i) { + this->blob_bottom_flush_.mutable_cpu_data()[i] = i > 2; + } + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 0); +} + +TYPED_TEST(RNNLayerTest, TestGradientNonZeroFlushBufferSize2) { + typedef typename TypeParam::Dtype Dtype; + this->ReshapeBlobs(2, 2); + // fill the values + FillerParameter filler_param; + UniformFiller filler(filler_param); + filler.Fill(&this->blob_bottom_); + RNNLayer layer(this->layer_param_); + GradientChecker checker(1e-2, 1e-3); + for (int i = 0; i < this->blob_bottom_flush_.count(); ++i) { + this->blob_bottom_flush_.mutable_cpu_data()[i] = i > 2; + } + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 0); +} + +} // namespace caffe diff --git a/src/caffe/test/test_scalar_layer.cpp b/src/caffe/test/test_scalar_layer.cpp new file mode 100644 index 00000000000..d823f1ef3ce --- /dev/null +++ b/src/caffe/test/test_scalar_layer.cpp @@ -0,0 +1,258 @@ +#include +#include + +#include "gtest/gtest.h" + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/vision_layers.hpp" + +#include "caffe/test/test_caffe_main.hpp" +#include "caffe/test/test_gradient_check_util.hpp" + +namespace caffe { + +template +class ScalarLayerTest : public MultiDeviceTest { + typedef typename TypeParam::Dtype Dtype; + + protected: + ScalarLayerTest() + : blob_bottom_(new Blob(2, 3, 4, 5)), + blob_bottom_eltwise_(new Blob(2, 3, 4, 5)), + blob_bottom_broadcast_0_(new Blob()), + blob_bottom_broadcast_1_(new Blob()), + blob_bottom_broadcast_2_(new Blob()), + blob_bottom_scalar_(new Blob(vector())), + blob_top_(new Blob()) { + Caffe::set_random_seed(1701); + vector broadcast_shape(2); + broadcast_shape[0] = 2; broadcast_shape[1] = 3; + this->blob_bottom_broadcast_0_->Reshape(broadcast_shape); + broadcast_shape[0] = 3; broadcast_shape[1] = 4; + this->blob_bottom_broadcast_1_->Reshape(broadcast_shape); + broadcast_shape[0] = 4; broadcast_shape[1] = 5; + this->blob_bottom_broadcast_2_->Reshape(broadcast_shape); + FillerParameter filler_param; + filler_param.set_min(1); + filler_param.set_max(10); + UniformFiller filler(filler_param); + 
filler.Fill(this->blob_bottom_); + filler.Fill(this->blob_bottom_eltwise_); + filler.Fill(this->blob_bottom_broadcast_0_); + filler.Fill(this->blob_bottom_broadcast_1_); + filler.Fill(this->blob_bottom_broadcast_2_); + filler.Fill(this->blob_bottom_scalar_); + blob_bottom_vec_.push_back(blob_bottom_); + blob_top_vec_.push_back(blob_top_); + } + virtual ~ScalarLayerTest() { + delete blob_bottom_; + delete blob_bottom_eltwise_; + delete blob_bottom_broadcast_0_; + delete blob_bottom_broadcast_1_; + delete blob_bottom_broadcast_2_; + delete blob_bottom_scalar_; + delete blob_top_; + } + Blob* const blob_bottom_; + Blob* const blob_bottom_eltwise_; + Blob* const blob_bottom_broadcast_0_; + Blob* const blob_bottom_broadcast_1_; + Blob* const blob_bottom_broadcast_2_; + Blob* const blob_bottom_scalar_; + Blob* const blob_top_; + vector*> blob_bottom_vec_; + vector*> blob_top_vec_; +}; + +TYPED_TEST_CASE(ScalarLayerTest, TestDtypesAndDevices); + +TYPED_TEST(ScalarLayerTest, TestForwardEltwise) { + typedef typename TypeParam::Dtype Dtype; + this->blob_bottom_vec_.push_back(this->blob_bottom_eltwise_); + LayerParameter layer_param; + shared_ptr > layer( + new ScalarLayer(layer_param)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape()); + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + const Dtype* data = this->blob_top_->cpu_data(); + const int count = this->blob_top_->count(); + const Dtype* in_data_a = this->blob_bottom_->cpu_data(); + const Dtype* in_data_b = this->blob_bottom_eltwise_->cpu_data(); + for (int i = 0; i < count; ++i) { + EXPECT_EQ(data[i], in_data_a[i] * in_data_b[i]); + } +} + +TYPED_TEST(ScalarLayerTest, TestForwardBroadcastBegin) { + typedef typename TypeParam::Dtype Dtype; + this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_0_); + LayerParameter layer_param; + shared_ptr > layer( + new ScalarLayer(layer_param)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape()); + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + for (int n = 0; n < this->blob_bottom_->num(); ++n) { + for (int c = 0; c < this->blob_bottom_->channels(); ++c) { + for (int h = 0; h < this->blob_bottom_->height(); ++h) { + for (int w = 0; w < this->blob_bottom_->width(); ++w) { + EXPECT_EQ(this->blob_top_->data_at(n, c, h, w), + this->blob_bottom_->data_at(n, c, h, w) * + this->blob_bottom_broadcast_0_->data_at(n, c, 0, 0)); + } + } + } + } +} + +TYPED_TEST(ScalarLayerTest, TestForwardBroadcastMiddle) { + typedef typename TypeParam::Dtype Dtype; + this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_1_); + LayerParameter layer_param; + layer_param.mutable_scalar_param()->set_axis(1); + shared_ptr > layer( + new ScalarLayer(layer_param)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape()); + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + for (int n = 0; n < this->blob_bottom_->num(); ++n) { + for (int c = 0; c < this->blob_bottom_->channels(); ++c) { + for (int h = 0; h < this->blob_bottom_->height(); ++h) { + for (int w = 0; w < this->blob_bottom_->width(); ++w) { + EXPECT_EQ(this->blob_top_->data_at(n, c, h, w), + this->blob_bottom_->data_at(n, c, h, w) * + this->blob_bottom_broadcast_1_->data_at(c, h, 0, 0)); + } + } + } + } +} + +TYPED_TEST(ScalarLayerTest, TestForwardBroadcastEnd) { + typedef typename TypeParam::Dtype 
Dtype; + this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_2_); + LayerParameter layer_param; + layer_param.mutable_scalar_param()->set_axis(2); + shared_ptr > layer( + new ScalarLayer(layer_param)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape()); + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + for (int n = 0; n < this->blob_bottom_->num(); ++n) { + for (int c = 0; c < this->blob_bottom_->channels(); ++c) { + for (int h = 0; h < this->blob_bottom_->height(); ++h) { + for (int w = 0; w < this->blob_bottom_->width(); ++w) { + EXPECT_EQ(this->blob_top_->data_at(n, c, h, w), + this->blob_bottom_->data_at(n, c, h, w) * + this->blob_bottom_broadcast_2_->data_at(h, w, 0, 0)); + } + } + } + } +} + +TYPED_TEST(ScalarLayerTest, TestForwardScalar) { + typedef typename TypeParam::Dtype Dtype; + this->blob_bottom_vec_.push_back(this->blob_bottom_scalar_); + LayerParameter layer_param; + shared_ptr > layer( + new ScalarLayer(layer_param)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape()); + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + const Dtype* data = this->blob_top_->cpu_data(); + const int count = this->blob_top_->count(); + const Dtype* in_data = this->blob_bottom_->cpu_data(); + const Dtype scalar = *this->blob_bottom_scalar_->cpu_data(); + for (int i = 0; i < count; ++i) { + EXPECT_EQ(data[i], in_data[i] * scalar); + } +} + +TYPED_TEST(ScalarLayerTest, TestForwardScalarAxis2) { + typedef typename TypeParam::Dtype Dtype; + this->blob_bottom_vec_.push_back(this->blob_bottom_scalar_); + LayerParameter layer_param; + layer_param.mutable_scalar_param()->set_axis(2); + shared_ptr > layer( + new ScalarLayer(layer_param)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape()); + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + const Dtype* data = this->blob_top_->cpu_data(); + const int count = this->blob_top_->count(); + const Dtype* in_data = this->blob_bottom_->cpu_data(); + const Dtype scalar = *this->blob_bottom_scalar_->cpu_data(); + for (int i = 0; i < count; ++i) { + EXPECT_EQ(data[i], in_data[i] * scalar); + } +} + +TYPED_TEST(ScalarLayerTest, TestGradientEltwise) { + typedef typename TypeParam::Dtype Dtype; + this->blob_bottom_vec_.push_back(this->blob_bottom_eltwise_); + LayerParameter layer_param; + ScalarLayer layer(layer_param); + GradientChecker checker(1e-2, 1e-3); + checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_, + this->blob_top_vec_); +} + +TYPED_TEST(ScalarLayerTest, TestGradientBroadcastBegin) { + typedef typename TypeParam::Dtype Dtype; + this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_0_); + LayerParameter layer_param; + ScalarLayer layer(layer_param); + GradientChecker checker(1e-2, 1e-3); + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_); +} + +TYPED_TEST(ScalarLayerTest, TestGradientBroadcastMiddle) { + typedef typename TypeParam::Dtype Dtype; + this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_1_); + LayerParameter layer_param; + layer_param.mutable_scalar_param()->set_axis(1); + ScalarLayer layer(layer_param); + GradientChecker checker(1e-2, 1e-3); + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_); +} + +TYPED_TEST(ScalarLayerTest, TestGradientBroadcastEnd) { + typedef 
typename TypeParam::Dtype Dtype; + this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_2_); + LayerParameter layer_param; + layer_param.mutable_scalar_param()->set_axis(2); + ScalarLayer layer(layer_param); + GradientChecker checker(1e-2, 1e-3); + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_); +} + +TYPED_TEST(ScalarLayerTest, TestGradientScalar) { + typedef typename TypeParam::Dtype Dtype; + this->blob_bottom_vec_.push_back(this->blob_bottom_scalar_); + LayerParameter layer_param; + ScalarLayer layer(layer_param); + GradientChecker checker(1e-2, 1e-3); + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_); +} + +TYPED_TEST(ScalarLayerTest, TestGradientScalarAxis2) { + typedef typename TypeParam::Dtype Dtype; + this->blob_bottom_vec_.push_back(this->blob_bottom_scalar_); + LayerParameter layer_param; + layer_param.mutable_scalar_param()->set_axis(2); + ScalarLayer layer(layer_param); + GradientChecker checker(1e-2, 1e-3); + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_); +} + +} // namespace caffe
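To make the intended usage concrete, here is a minimal sketch of driving the
new recurrent layers through the layer factory. It is illustrative only and
not part of the patch: the main() wrapper, shapes, dummy data, and filler
choice are invented for the example; everything else follows the layer
contracts established above.

    // Illustrative sketch only -- assumes Caffe is built with this patch.
    #include <vector>

    #include "caffe/blob.hpp"
    #include "caffe/common.hpp"
    #include "caffe/layer.hpp"
    #include "caffe/layer_factory.hpp"
    #include "caffe/proto/caffe.pb.h"
    #include "caffe/util/math_functions.hpp"

    using namespace caffe;  // brevity only

    int main() {
      const int T = 4, N = 2, D = 3;  // timesteps, streams, input dimension
      vector<int> x_shape(3);
      x_shape[0] = T; x_shape[1] = N; x_shape[2] = D;
      vector<int> cont_shape(2);
      cont_shape[0] = T; cont_shape[1] = N;
      Blob<float> x(x_shape), cont(cont_shape), h;
      caffe_set(x.count(), 1.0f, x.mutable_cpu_data());  // dummy input
      // cont is 0 at the first timestep of each sequence, 1 afterwards.
      for (int t = 0; t < T; ++t) {
        for (int n = 0; n < N; ++n) {
          cont.mutable_cpu_data()[t * N + n] = (t > 0);
        }
      }

      LayerParameter param;
      param.set_type("LSTM");  // or "RNN"
      param.mutable_recurrent_param()->set_num_output(5);
      param.mutable_recurrent_param()->
          mutable_weight_filler()->set_type("uniform");
      shared_ptr<Layer<float> > lstm =
          LayerRegistry<float>::CreateLayer(param);

      vector<Blob<float>*> bottom, top;
      bottom.push_back(&x);
      bottom.push_back(&cont);
      top.push_back(&h);
      lstm->SetUp(bottom, top);    // h becomes T x N x 5
      lstm->Forward(bottom, top);  // h now holds the hidden-state sequence
      return 0;
    }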