diff --git a/data/coco/README.md b/data/coco/README.md
new file mode 100644
index 00000000000..9d077035512
--- /dev/null
+++ b/data/coco/README.md
@@ -0,0 +1,24 @@
+For details about the Microsoft COCO ("Common Objects in Context") dataset [1],
+visit mscoco.org. This README provides instructions for downloading and
+installing the tools and dataset.
+
+1) Download and extract the COCO Python tools by running:
+
+    ./download_tools.sh
+
+2) Install the tools, and optionally download the data, by running:
+
+    cd coco/PythonAPI
+    python setup.py install  # follow prompts to download or skip data
+
+3) Download the train/val/test splits by running:
+
+    ./get_coco2014_aux.sh
+
+(or see the COCO README (tools/README) for more information).
+
+
+[1] Tsung-Yi Lin, Michael Maire, Serge Belongie, James Hays, Pietro Perona,
+    Deva Ramanan, Piotr Dollár, and C. Lawrence Zitnick.
+    "Microsoft COCO: Common Objects in Context."
+    arXiv preprint arXiv:1405.0312 (2014).
diff --git a/data/coco/download_eval_tools.sh b/data/coco/download_eval_tools.sh
new file mode 100755
index 00000000000..3ceadacd7f6
--- /dev/null
+++ b/data/coco/download_eval_tools.sh
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+
+# change to directory $DIR where this script is stored
+pushd .
+DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
+cd "$DIR"
+
+OUTFILE=coco_caption_eval.zip
+wget --no-check-certificate https://github.com/jeffdonahue/coco-caption/archive/master.zip -O $OUTFILE
+unzip $OUTFILE
+mv coco-caption-master coco-caption-eval
+
+# change back to original working directory
+popd
+
+echo "Downloaded COCO evaluation tools to: $DIR/coco-caption-eval"
diff --git a/data/coco/download_tools.sh b/data/coco/download_tools.sh
new file mode 100755
index 00000000000..4a1c4ccbe54
--- /dev/null
+++ b/data/coco/download_tools.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+
+# change to directory $DIR where this script is stored
+pushd .
+DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
+cd "$DIR"
+
+git clone https://github.com/pdollar/coco.git
+
+# change back to original working directory
+popd
+
+echo "Cloned COCO tools to: $DIR/coco"
+echo "To set up COCO tools (and optionally download data), run:"
+echo "    cd $DIR/coco"
+echo "    python setup.py install"
+echo "and follow the prompts."
diff --git a/data/coco/get_coco2014_aux.sh b/data/coco/get_coco2014_aux.sh
new file mode 100755
index 00000000000..6ab23612139
--- /dev/null
+++ b/data/coco/get_coco2014_aux.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+#
+# Downloads Andrej Karpathy's train/val/test splits of COCO2014 as text files.
+
+# change to directory $DIR where this script is stored
+pushd .
+DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
+cd "$DIR"
+
+FILENAME=coco2014_aux.tar.gz
+
+echo "Downloading..."
+
+wget http://dl.caffe.berkeleyvision.org/$FILENAME
+
+echo "Unzipping to $DIR"
+
+tar -xf $FILENAME && rm -f $FILENAME
+
+echo "Done."
+
+# change back to original working directory
+popd
diff --git a/data/coco/make_test.py b/data/coco/make_test.py
new file mode 100755
index 00000000000..7e546b575d4
--- /dev/null
+++ b/data/coco/make_test.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python
+
+# This file is only meant to be run as a script with 0 arguments,
+# and depends on steps 1-3 of README.md.
+#
+# It creates a dummy caption annotation file from the image filenames of the
+# test set (the test images have no ground truth captions).
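+#
+# A sketch of sanity-checking the generated file with the COCO tools from
+# step 2 of the README (illustrative only; the import path mirrors the one
+# used in examples/coco_caption/coco_to_hdf5_data.py):
+#   import sys
+#   sys.path.append('data/coco/coco/PythonAPI/build/lib/pycocotools')
+#   from coco import COCO
+#   anno = COCO('data/coco/coco/annotations/captions_test2014.json')
+#   print '%d images, %d dummy captions' % (len(anno.imgs), len(anno.anns))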
+
+import json
+import os
+import re
+
+# get path to directory where this script is
+script_dir = os.path.dirname(os.path.realpath(__file__))
+
+set_name = 'test2014'
+image_root = '%s/coco/images/%s' % (script_dir, set_name)
+out_filename = '%s/coco/annotations/captions_%s.json' % (script_dir, set_name)
+image_ext = 'jpg'
+imname_re = re.compile('COCO_%s_(?P<image_id>\d+)\.%s' % (set_name, image_ext))
+full_image_ext = '.%s' % image_ext
+image_filenames = filter(lambda f: f.endswith(full_image_ext), os.listdir(image_root))
+print 'Creating dummy annotation file for %d images at: %s' % \
+    (len(image_filenames), out_filename)
+
+out_data = {'type': 'captions', 'images': [], 'annotations': [],
+            'licenses': [], 'info': {}}
+for index, filename in enumerate(image_filenames):
+  match = imname_re.match(filename)
+  if match is None: raise Exception('Unsupported filename: %s' % filename)
+  image_id = int(match.group('image_id'))
+  out_data['images'].append({'file_name': filename, 'id': image_id})
+  for dummy_index in range(2):
+    # 2 * index + dummy_index gives each dummy annotation a unique id
+    annotation = {'caption': 'dummy caption %d' % dummy_index,
+                  'id': 2 * index + dummy_index, 'image_id': image_id}
+    out_data['annotations'].append(annotation)
+with open(out_filename, 'w') as out_file:
+  json.dump(out_data, out_file)
diff --git a/data/coco/make_trainval.py b/data/coco/make_trainval.py
new file mode 100755
index 00000000000..0dea42622e8
--- /dev/null
+++ b/data/coco/make_trainval.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python
+
+# This file is only meant to be run as a script with 0 arguments,
+# and depends on steps 1-3 of README.md.
+#
+# It creates a "trainval" set by combining the COCO 2014 train and val sets.
+# The trainval set is intended for use only when training a single final model
+# for submission of results on the test set to the COCO evaluation server.
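+#
+# A quick sanity check after running this script (a sketch; paths assume the
+# repository root as the working directory, matching the constants below):
+#   import json
+#   data = json.load(open('data/coco/coco/annotations/captions_trainval2014.json'))
+#   ids = open('data/coco/coco2014_cocoid.trainval.txt').read().split()
+#   assert len(ids) == len(data['images'])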
+ +import os +import json + +# get path to directory where this script is +script_dir = os.path.dirname(os.path.realpath(__file__)) + +anno_dir_path = '%s/coco/annotations' % script_dir +image_root = '%s/coco/images' % script_dir +abs_image_root = os.path.abspath(image_root) +out_coco_id_filename = '%s/coco2014_cocoid.trainval.txt' % script_dir +filename_pattern = 'captions_%s2014.json' +in_sets = ['train', 'val'] +out_set = 'trainval' +path_pattern = '%s/%s' % (anno_dir_path, filename_pattern) + +out_data = {} +for in_set in in_sets: + filename = path_pattern % in_set + print 'Loading input dataset from: %s' % filename + data = json.load(open(filename, 'r')) + for key, val in data.iteritems(): + if type(val) == list: + if key not in out_data: + out_data[key] = [] + out_data[key] += val + else: + if key not in out_data: + out_data[key] = val + assert out_data[key] == val +filename = path_pattern % out_set +print 'Dumping output dataset to: %s' % filename +json.dump(out_data, open(filename, 'w')) + +out_ids = [str(im['id']) for im in out_data['images']] +print 'Writing COCO IDs to: %s' % out_coco_id_filename +with open(out_coco_id_filename, 'w') as coco_id_file: + coco_id_file.write('\n'.join(out_ids) + '\n') + +# make a trainval dir with symlinks to all train+val images +out_dir = '%s/%s2014' % (image_root, out_set) +os.makedirs(out_dir) +print 'Writing image symlinks to: %s' % out_dir +for im in out_data['images']: + filename = im['file_name'] + set_name = None + for in_set in in_sets: + if in_set in filename: + set_name = in_set + break + assert set_name is not None + real_path = '%s/%s2014/%s' % (abs_image_root, set_name, filename) + link_path = '%s/%s' % (out_dir, filename) + os.symlink(real_path, link_path) diff --git a/examples/coco_caption/.gitignore b/examples/coco_caption/.gitignore new file mode 100644 index 00000000000..e040331b7f2 --- /dev/null +++ b/examples/coco_caption/.gitignore @@ -0,0 +1 @@ +h5_data/ diff --git a/examples/coco_caption/Caffe language model.ipynb b/examples/coco_caption/Caffe language model.ipynb new file mode 100644 index 00000000000..30d2c494a39 --- /dev/null +++ b/examples/coco_caption/Caffe language model.ipynb @@ -0,0 +1,617 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 132, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import random\n", + "\n", + "import sys\n", + "sys.path.append('./python')\n", + "import caffe\n", + "\n", + "sys.path.append('./examples/coco_caption')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r\n", + "a\r\n", + "on\r\n", + "of\r\n", + "the\r\n", + "in\r\n", + "with\r\n", + "and\r\n", + "is\r\n", + "man\r\n" + ] + } + ], + "source": [ + "!head examples/coco_caption/h5_data/buffer_100/vocabulary.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "8801\n" + ] + } + ], + "source": [ + "vocabulary = [''] + [line.strip() for line in\n", + " open('examples/coco_caption/h5_data/buffer_100/vocabulary.txt').readlines()]\n", + "print len(vocabulary)" + ] + }, + { + "cell_type": "code", + 
"execution_count": 59, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(1, 1, 8801)\n" + ] + } + ], + "source": [ + "iter_num = 110000\n", + "net = caffe.Net('./examples/coco_caption/lstm_lm.deploy.prototxt',\n", + " './examples/coco_caption/lstm_lm_iter_%d.caffemodel' % iter_num, caffe.TEST)\n", + "print net.blobs['probs'].data.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def predict_single_word(net, previous_word, output='probs'):\n", + " cont = 0 if previous_word == 0 else 1\n", + " cont_input = np.array([cont])\n", + " word_input = np.array([previous_word])\n", + " net.forward(cont_sentence=cont_input, input_sentence=word_input)\n", + " output_preds = net.blobs[output].data[0, 0, :]\n", + " return output_preds" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "first_word_dist = predict_single_word(net, 0)" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "top_preds = np.argsort(-1 * first_word_dist)" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 2 14 5 13 64 77 30 18 93 142]\n", + "['a', 'two', 'the', 'an', 'there', 'three', 'some', 'people', 'several', 'this']\n" + ] + } + ], + "source": [ + "print top_preds[:10]\n", + "print [vocabulary[index] for index in top_preds[:10]]" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['people', 'men', 'women', 'giraffes', 'zebras', 'young', 'cats', 'elephants', 'horses', 'children']\n" + ] + } + ], + "source": [ + "second_word_dist = predict_single_word(net, vocabulary.index('two'))\n", + "print [vocabulary[index] for index in np.argsort(-1 * second_word_dist)[:10]]" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['standing', 'are', 'in', 'stand', 'walking', 'and', 'eating', 'that', 'walk', 'with']\n" + ] + } + ], + "source": [ + "third_word_dist = predict_single_word(net, vocabulary.index('giraffes'))\n", + "print [vocabulary[index] for index in np.argsort(-1 * second_word_dist)[:10]]" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['leaves', 'from', 'grass', 'hay', 'out', 'some', 'in', 'food', 'off', 'a']\n" + ] + } + ], + "source": [ + "third_word_dist = predict_single_word(net, vocabulary.index('eating'))\n", + "print [vocabulary[index] for index in np.argsort(-1 * second_word_dist)[:10]]" + ] + }, + { + "cell_type": "code", + "execution_count": 136, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def softmax(softmax_inputs, temp):\n", + " shifted_inputs = softmax_inputs - softmax_inputs.max()\n", + " exp_outputs = np.exp(temp * shifted_inputs)\n", + " exp_outputs_sum = exp_outputs.sum()\n", + " if np.isnan(exp_outputs_sum):\n", + " return exp_outputs * float('nan')\n", + " assert exp_outputs_sum > 0\n", + " if 
np.isinf(exp_outputs_sum):\n", + " return np.zeros_like(exp_outputs)\n", + " eps_sum = 1e-20\n", + " return exp_outputs / max(exp_outputs_sum, eps_sum)\n", + "\n", + "def random_choice_from_probs(softmax_inputs, temp=1):\n", + " # temperature of infinity == take the max\n", + " if temp == float('inf'):\n", + " return np.argmax(softmax_inputs)\n", + " probs = softmax(softmax_inputs, temp)\n", + " r = random.random()\n", + " cum_sum = 0.\n", + " for i, p in enumerate(probs):\n", + " cum_sum += p\n", + " if cum_sum >= r: return i\n", + " return 1 # return UNK?" + ] + }, + { + "cell_type": "code", + "execution_count": 120, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def generate_sentence(net, temp=float('inf'), output='predict', max_words=50):\n", + " cont_input = np.array([0])\n", + " word_input = np.array([0])\n", + " sentence = []\n", + " while len(sentence) < max_words and (not sentence or sentence[-1] != 0):\n", + " net.forward(cont_sentence=cont_input, input_sentence=word_input)\n", + " output_preds = net.blobs[output].data[0, 0, :]\n", + " sentence.append(random_choice_from_probs(output_preds, temp=temp))\n", + " cont_input[0] = 1\n", + " word_input[0] = sentence[-1]\n", + " return sentence" + ] + }, + { + "cell_type": "code", + "execution_count": 121, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2, 10, 9, 16, 6, 2, 35, 7, 2, 118, 0]\n", + "['a', 'man', 'is', 'standing', 'in', 'a', 'field', 'with', 'a', 'frisbee', '']\n" + ] + } + ], + "source": [ + "sentence = generate_sentence(net)\n", + "print sentence\n", + "print [vocabulary[index] for index in sentence]" + ] + }, + { + "cell_type": "code", + "execution_count": 122, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2, 10, 9, 16, 6, 2, 35, 7, 2, 118, 0]\n", + "['a', 'man', 'is', 'standing', 'in', 'a', 'field', 'with', 'a', 'frisbee', '']\n" + ] + } + ], + "source": [ + "sentence = generate_sentence(net)\n", + "print sentence\n", + "print [vocabulary[index] for index in sentence]" + ] + }, + { + "cell_type": "code", + "execution_count": 137, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2, 22, 9, 294, 7, 2, 178, 113, 11, 87, 905, 0]\n", + "['a', 'woman', 'is', 'posing', 'with', 'a', 'cell', 'phone', 'to', 'her', 'ear', '']\n" + ] + } + ], + "source": [ + "sentence = generate_sentence(net, temp=1.0)\n", + "print sentence\n", + "print [vocabulary[index] for index in sentence]" + ] + }, + { + "cell_type": "code", + "execution_count": 138, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2, 28, 26, 2, 38, 209, 3, 2, 38, 152, 0]\n", + "['a', 'person', 'holding', 'a', 'tennis', 'racket', 'on', 'a', 'tennis', 'court', '']\n" + ] + } + ], + "source": [ + "sentence = generate_sentence(net, temp=1.0)\n", + "print sentence\n", + "print [vocabulary[index] for index in sentence]" + ] + }, + { + "cell_type": "code", + "execution_count": 139, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2, 10, 26, 2, 38, 363, 3, 2, 38, 152, 0]\n", + "['a', 'man', 'holding', 'a', 'tennis', 'racquet', 'on', 'a', 'tennis', 'court', '']\n" + ] + } + ], + "source": [ + "sentence = generate_sentence(net, temp=1.5)\n", + "print sentence\n", + "print 
[vocabulary[index] for index in sentence]" + ] + }, + { + "cell_type": "code", + "execution_count": 140, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2, 33, 4, 18, 12, 106, 2, 23, 7, 60, 0]\n", + "['a', 'group', 'of', 'people', 'sitting', 'around', 'a', 'table', 'with', 'food', '']\n" + ] + } + ], + "source": [ + "sentence = generate_sentence(net, temp=1.5)\n", + "print sentence\n", + "print [vocabulary[index] for index in sentence]" + ] + }, + { + "cell_type": "code", + "execution_count": 141, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2, 10, 6, 2, 261, 8, 217, 16, 6, 2, 43, 0]\n", + "['a', 'man', 'in', 'a', 'suit', 'and', 'tie', 'standing', 'in', 'a', 'room', '']\n" + ] + } + ], + "source": [ + "sentence = generate_sentence(net, temp=3.0)\n", + "print sentence\n", + "print [vocabulary[index] for index in sentence]" + ] + }, + { + "cell_type": "code", + "execution_count": 142, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2, 10, 26, 2, 38, 363, 3, 2, 38, 152, 0]\n", + "['a', 'man', 'holding', 'a', 'tennis', 'racquet', 'on', 'a', 'tennis', 'court', '']\n" + ] + } + ], + "source": [ + "sentence = generate_sentence(net, temp=3.0)\n", + "print sentence\n", + "print [vocabulary[index] for index in sentence]" + ] + }, + { + "cell_type": "code", + "execution_count": 143, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2, 10, 9, 16, 6, 2, 35, 7, 2, 118, 0]\n", + "['a', 'man', 'is', 'standing', 'in', 'a', 'field', 'with', 'a', 'frisbee', '']\n" + ] + } + ], + "source": [ + "sentence = generate_sentence(net, temp=10.0)\n", + "print sentence\n", + "print [vocabulary[index] for index in sentence]" + ] + }, + { + "cell_type": "code", + "execution_count": 144, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1993, 1074, 86, 6, 40, 4, 2, 126, 0]\n", + "['staircase', 'laid', 'out', 'in', 'front', 'of', 'a', 'window', '']\n" + ] + } + ], + "source": [ + "sentence = generate_sentence(net, temp=1.0)\n", + "print sentence\n", + "print [vocabulary[index] for index in sentence]" + ] + }, + { + "cell_type": "code", + "execution_count": 146, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2, 28, 3, 2, 113, 46, 2, 129, 0]\n", + "['a', 'person', 'on', 'a', 'phone', 'riding', 'a', 'car', '']\n" + ] + } + ], + "source": [ + "sentence = generate_sentence(net, temp=0.8)\n", + "print sentence\n", + "print [vocabulary[index] for index in sentence]" + ] + }, + { + "cell_type": "code", + "execution_count": 147, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2, 16, 60, 6, 136, 192, 7, 641, 16, 20, 11, 27, 0]\n", + "['a', 'standing', 'food', 'in', 'each', 'hand', 'with', 'cattle', 'standing', 'next', 'to', 'it', '']\n" + ] + } + ], + "source": [ + "sentence = generate_sentence(net, temp=0.8)\n", + "print sentence\n", + "print [vocabulary[index] for index in sentence]" + ] + }, + { + "cell_type": "code", + "execution_count": 148, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[28, 236, 1042, 7, 69, 1257, 487, 1769, 
0]\n", + "['person', 'taking', 'noodles', 'with', 'other', 'homemade', 'birthday', 'cereal', '']\n" + ] + } + ], + "source": [ + "sentence = generate_sentence(net, temp=0.6)\n", + "print sentence\n", + "print [vocabulary[index] for index in sentence]" + ] + }, + { + "cell_type": "code", + "execution_count": 145, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[5623, 1087, 15, 6888, 472, 361, 8634, 8, 7241, 3, 77, 299, 935, 1296, 15, 12, 5165, 2867, 3979, 743, 4991, 4470, 640, 9, 259, 2308, 4386, 2552, 3797, 2448, 15, 3617, 5364, 4267, 4549, 8086, 176, 2529, 6434, 5445, 370, 7959, 5672, 1742, 4041, 4258, 1153, 8, 610, 2044]\n", + "['chilli', 'frosting', ',', 'medley', 'salad', 'items', 'sideboard', 'and', 'garnishes', 'on', 'three', 'colorful', 'gold', 'desserts', ',', 'sitting', 'knifes', 'need', 'workspace', 'where', 'exchanging', 'hoses', 'left', 'is', 'pink', 'clearing', 'obstacles', 'vandalized', 'idly', 'afternoon', ',', 'halloween', 'rich', 'fixed', 'aid', 'advertise', 'light', 'times', 'delicate', 'dealership', 'like', 'snowsuits', 'florida', 'than', 'ornamental', 'dr', 'curtains', 'and', 'multiple', 'electrical']\n" + ] + } + ], + "source": [ + "sentence = generate_sentence(net, temp=0.5)\n", + "print sentence\n", + "print [vocabulary[index] for index in sentence]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.9" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/examples/coco_caption/captioner.py b/examples/coco_caption/captioner.py new file mode 100644 index 00000000000..cefa44da24c --- /dev/null +++ b/examples/coco_caption/captioner.py @@ -0,0 +1,402 @@ +#!/usr/bin/env python + +from collections import OrderedDict +import h5py +import math +import matplotlib.pyplot as plt +import numpy as np +import os +import random +import sys + +sys.path.append('./python/') +import caffe + +class Captioner(): + def __init__(self, weights_path, image_net_proto, lstm_net_proto, + vocab_path, device_id=-1): + if device_id >= 0: + caffe.set_mode_gpu() + caffe.set_device(device_id) + else: + caffe.set_mode_cpu() + # Setup image processing net. + phase = caffe.TEST + self.image_net = caffe.Net(image_net_proto, weights_path, phase) + image_data_shape = self.image_net.blobs['data'].data.shape + self.transformer = caffe.io.Transformer({'data': image_data_shape}) + channel_mean = np.zeros(image_data_shape[1:]) + channel_mean_values = [104, 117, 123] + assert channel_mean.shape[0] == len(channel_mean_values) + for channel_index, mean_val in enumerate(channel_mean_values): + channel_mean[channel_index, ...] = mean_val + self.transformer.set_mean('data', channel_mean) + self.transformer.set_channel_swap('data', (2, 1, 0)) + self.transformer.set_transpose('data', (2, 0, 1)) + # Setup sentence prediction net. 
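+    # (Both nets are initialized from the same weights_path .caffemodel;
+    # index 0 of the vocabulary below is the <EOS> token, which also serves
+    # as the sentence-start input.)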
+    self.lstm_net = caffe.Net(lstm_net_proto, weights_path, phase)
+    self.vocab = ['<EOS>']
+    with open(vocab_path, 'r') as vocab_file:
+      self.vocab += [word.strip() for word in vocab_file.readlines()]
+    net_vocab_size = self.lstm_net.blobs['predict'].data.shape[2]
+    if len(self.vocab) != net_vocab_size:
+      raise Exception('Invalid vocab file: contains %d words; '
+          'net expects vocab with %d words' % (len(self.vocab), net_vocab_size))
+
+  def set_image_batch_size(self, batch_size):
+    self.image_net.blobs['data'].reshape(batch_size,
+        *self.image_net.blobs['data'].data.shape[1:])
+
+  def caption_batch_size(self):
+    return self.lstm_net.blobs['cont_sentence'].data.shape[1]
+
+  def set_caption_batch_size(self, batch_size):
+    self.lstm_net.blobs['cont_sentence'].reshape(1, batch_size)
+    self.lstm_net.blobs['input_sentence'].reshape(1, batch_size)
+    self.lstm_net.blobs['image_features'].reshape(batch_size,
+        *self.lstm_net.blobs['image_features'].data.shape[1:])
+    self.lstm_net.reshape()
+
+  def preprocess_image(self, image, verbose=False):
+    if type(image) in (str, unicode):
+      image = plt.imread(image)
+    crop_edge_ratio = (256. - 227.) / 256. / 2
+    ch = int(image.shape[0] * crop_edge_ratio + 0.5)
+    cw = int(image.shape[1] * crop_edge_ratio + 0.5)
+    cropped_image = image[ch:-ch, cw:-cw]
+    if len(cropped_image.shape) == 2:
+      cropped_image = np.tile(cropped_image[:, :, np.newaxis], (1, 1, 3))
+    preprocessed_image = self.transformer.preprocess('data', cropped_image)
+    if verbose:
+      print 'Preprocessed image has shape %s, range (%f, %f)' % \
+          (preprocessed_image.shape,
+           preprocessed_image.min(),
+           preprocessed_image.max())
+    return preprocessed_image
+
+  def preprocessed_image_to_descriptor(self, image, output_name='fc8'):
+    net = self.image_net
+    if net.blobs['data'].data.shape[0] > 1:
+      batch = np.zeros_like(net.blobs['data'].data)
+      batch[0] = image[0]
+    else:
+      batch = image
+    net.forward(data=batch)
+    descriptor = net.blobs[output_name].data[0].copy()
+    return descriptor
+
+  def image_to_descriptor(self, image, output_name='fc8'):
+    return self.preprocessed_image_to_descriptor(self.preprocess_image(image),
+        output_name=output_name)
+
+  def predict_single_word(self, descriptor, previous_word, output='probs'):
+    net = self.lstm_net
+    cont = 0 if previous_word == 0 else 1
+    cont_input = np.array([cont])
+    word_input = np.array([previous_word])
+    image_features = np.zeros_like(net.blobs['image_features'].data)
+    image_features[:] = descriptor
+    net.forward(image_features=image_features, cont_sentence=cont_input,
+        input_sentence=word_input)
+    output_preds = net.blobs[output].data[0, 0, :]
+    return output_preds
+
+  def predict_single_word_from_all_previous(self, descriptor, previous_words):
+    for word in [0] + previous_words:
+      probs = self.predict_single_word(descriptor, word)
+    return probs
+
+  # Strategy must be either 'beam' or 'sample'.
+  # If 'beam', do a max likelihood beam search with beam size
+  # strategy['beam_size'].
+  # Otherwise, draw strategy['num'] samples with temperature strategy['temp'].
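+  # Usage sketch (illustrative, not part of the class): given a descriptor
+  # from compute_descriptors() or image_to_descriptor(),
+  #   beams, beam_probs = captioner.predict_caption(
+  #       descriptor, strategy={'type': 'beam', 'beam_size': 3})
+  #   samples, sample_probs = captioner.predict_caption(
+  #       descriptor, strategy={'type': 'sample', 'num': 5, 'temp': 2.0})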
+ def predict_caption(self, descriptor, strategy={'type': 'beam'}): + assert 'type' in strategy + assert strategy['type'] in ('beam', 'sample') + if strategy['type'] == 'beam': + return self.predict_caption_beam_search(descriptor, strategy) + num_samples = strategy['num'] if 'num' in strategy else 1 + samples = [] + sample_probs = [] + for _ in range(num_samples): + sample, sample_prob = self.sample_caption(descriptor, strategy) + samples.append(sample) + sample_probs.append(sample_prob) + return samples, sample_probs + + def sample_caption(self, descriptor, strategy, + net_output='predict', max_length=50): + sentence = [] + probs = [] + eps_prob = 1e-8 + temp = strategy['temp'] if 'temp' in strategy else 1.0 + if max_length < 0: max_length = float('inf') + while len(sentence) < max_length and (not sentence or sentence[-1] != 0): + previous_word = sentence[-1] if sentence else 0 + softmax_inputs = self.predict_single_word(descriptor, previous_word, + output=net_output) + word = random_choice_from_probs(softmax_inputs, temp) + sentence.append(word) + probs.append(softmax(softmax_inputs, 1.0)[word]) + return sentence, probs + + def predict_caption_beam_search(self, descriptor, strategy, max_length=50): + orig_batch_size = self.caption_batch_size() + if orig_batch_size != 1: self.set_caption_batch_size(1) + beam_size = strategy['beam_size'] if 'beam_size' in strategy else 1 + assert beam_size >= 1 + beams = [[]] + beams_complete = 0 + beam_probs = [[]] + beam_log_probs = [0.] + while beams_complete < len(beams): + expansions = [] + for beam_index, beam_log_prob, beam in \ + zip(range(len(beams)), beam_log_probs, beams): + if beam: + previous_word = beam[-1] + if len(beam) >= max_length or previous_word == 0: + exp = {'prefix_beam_index': beam_index, 'extension': [], + 'prob_extension': [], 'log_prob': beam_log_prob} + expansions.append(exp) + # Don't expand this beam; it was already ended with an EOS, + # or is the max length. + continue + else: + previous_word = 0 # EOS is first word + if beam_size == 1: + probs = self.predict_single_word(descriptor, previous_word) + else: + probs = self.predict_single_word_from_all_previous(descriptor, beam) + assert len(probs.shape) == 1 + assert probs.shape[0] == len(self.vocab) + expansion_inds = probs.argsort()[-beam_size:] + for ind in expansion_inds: + prob = probs[ind] + extended_beam_log_prob = beam_log_prob + math.log(prob) + exp = {'prefix_beam_index': beam_index, 'extension': [ind], + 'prob_extension': [prob], 'log_prob': extended_beam_log_prob} + expansions.append(exp) + # Sort expansions in decreasing order of probability. 
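+      # (Beams are ranked by summed log probability, which is equivalent to
+      # ranking by the product of word probabilities but avoids
+      # floating-point underflow on long sentences.)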
+ expansions.sort(key=lambda expansion: -1 * expansion['log_prob']) + expansions = expansions[:beam_size] + new_beams = \ + [beams[e['prefix_beam_index']] + e['extension'] for e in expansions] + new_beam_probs = \ + [beam_probs[e['prefix_beam_index']] + e['prob_extension'] for e in expansions] + beam_log_probs = [e['log_prob'] for e in expansions] + beams_complete = 0 + for beam in new_beams: + if beam[-1] == 0 or len(beam) >= max_length: beams_complete += 1 + beams, beam_probs = new_beams, new_beam_probs + if orig_batch_size != 1: self.set_caption_batch_size(orig_batch_size) + return beams, beam_probs + + def score_caption(self, descriptor, caption, is_gt=True, caption_source='gt'): + output = {} + output['caption'] = caption + output['gt'] = is_gt + output['source'] = caption_source + output['prob'] = [] + probs = self.predict_single_word(descriptor, 0) + for word in caption: + output['prob'].append(probs[word]) + probs = self.predict_single_word(descriptor, word) + return output + + def compute_descriptors(self, image_list, output_name='fc8'): + batch = np.zeros_like(self.image_net.blobs['data'].data) + batch_shape = batch.shape + batch_size = batch_shape[0] + descriptors_shape = (len(image_list), ) + \ + self.image_net.blobs[output_name].data.shape[1:] + descriptors = np.zeros(descriptors_shape) + for batch_start_index in range(0, len(image_list), batch_size): + batch_list = image_list[batch_start_index:(batch_start_index + batch_size)] + for batch_index, image_path in enumerate(batch_list): + batch[batch_index:(batch_index + 1)] = self.preprocess_image(image_path) + current_batch_size = min(batch_size, len(image_list) - batch_start_index) + print 'Computing descriptors for images %d-%d of %d' % \ + (batch_start_index, batch_start_index + current_batch_size - 1, + len(image_list)) + self.image_net.forward(data=batch) + descriptors[batch_start_index:(batch_start_index + current_batch_size)] = \ + self.image_net.blobs[output_name].data[:current_batch_size] + return descriptors + + def score_captions(self, descriptor, captions, + output_name='probs', caption_source='gt', verbose=True): + net = self.lstm_net + cont_input = np.zeros_like(net.blobs['cont_sentence'].data) + word_input = np.zeros_like(net.blobs['input_sentence'].data) + image_features = np.zeros_like(net.blobs['image_features'].data) + batch_size = image_features.shape[0] + assert descriptor.shape == image_features.shape[1:] + for index in range(batch_size): + image_features[index] = descriptor + outputs = [] + input_data_initialized = False + for batch_start_index in range(0, len(captions), batch_size): + caption_batch = captions[batch_start_index:(batch_start_index + batch_size)] + current_batch_size = len(caption_batch) + caption_index = 0 + probs_batch = [[] for b in range(current_batch_size)] + num_done = 0 + while num_done < current_batch_size: + if caption_index == 0: + cont_input[:] = 0 + elif caption_index == 1: + cont_input[:] = 1 + for index, caption in enumerate(caption_batch): + word_input[0, index] = \ + caption['caption'][caption_index - 1] if \ + 0 < caption_index < len(caption['caption']) else 0 + if input_data_initialized: + net.forward(start="embedding", input_sentence=word_input, + cont_sentence=cont_input, image_features=image_features) + else: + net.forward(input_sentence=word_input, cont_sentence=cont_input, + image_features=image_features) + input_data_initialized = True + output_probs = net.blobs[output_name].data + for index, probs, caption in \ + zip(range(current_batch_size), probs_batch, 
caption_batch):
+          if caption_index == len(caption['caption']) - 1:
+            num_done += 1
+          if caption_index < len(caption['caption']):
+            word = caption['caption'][caption_index]
+            probs.append(output_probs[0, index, word].reshape(-1)[0])
+        if verbose:
+          print 'Computed probs for word %d of captions %d-%d (%d done)' % \
+              (caption_index, batch_start_index,
+               batch_start_index + current_batch_size - 1, num_done)
+        caption_index += 1
+      for prob, caption in zip(probs_batch, caption_batch):
+        output = {}
+        output['caption'] = caption['caption']
+        output['prob'] = prob
+        output['gt'] = True
+        output['source'] = caption_source
+        outputs.append(output)
+    return outputs
+
+  def sample_captions(self, descriptor, prob_output_name='probs',
+                      pred_output_name='predict', temp=1, max_length=50):
+    descriptor = np.array(descriptor)
+    batch_size = descriptor.shape[0]
+    self.set_caption_batch_size(batch_size)
+    net = self.lstm_net
+    cont_input = np.zeros_like(net.blobs['cont_sentence'].data)
+    word_input = np.zeros_like(net.blobs['input_sentence'].data)
+    image_features = np.zeros_like(net.blobs['image_features'].data)
+    image_features[:] = descriptor
+    outputs = []
+    output_captions = [[] for b in range(batch_size)]
+    output_probs = [[] for b in range(batch_size)]
+    caption_index = 0
+    num_done = 0
+    while num_done < batch_size and caption_index < max_length:
+      if caption_index == 0:
+        cont_input[:] = 0
+      elif caption_index == 1:
+        cont_input[:] = 1
+      if caption_index == 0:
+        word_input[:] = 0
+      else:
+        for index in range(batch_size):
+          word_input[0, index] = \
+              output_captions[index][caption_index - 1] if \
+              caption_index <= len(output_captions[index]) else 0
+      net.forward(image_features=image_features, cont_sentence=cont_input,
+          input_sentence=word_input)
+      # the softmax probs are needed below to record the probability of each
+      # sampled word
+      net_output_probs = net.blobs[prob_output_name].data[0]
+      if temp == 1.0 or temp == float('inf'):
+        samples = [
+            random_choice_from_probs(dist, temp=temp, already_softmaxed=True)
+            for dist in net_output_probs
+        ]
+      else:
+        net_output_preds = net.blobs[pred_output_name].data[0]
+        samples = [
+            random_choice_from_probs(preds, temp=temp, already_softmaxed=False)
+            for preds in net_output_preds
+        ]
+      for index, next_word_sample in enumerate(samples):
+        # If the caption is empty, or non-empty but the last word isn't EOS,
+        # predict another word.
+        if not output_captions[index] or output_captions[index][-1] != 0:
+          output_captions[index].append(next_word_sample)
+          output_probs[index].append(net_output_probs[index, next_word_sample])
+          if next_word_sample == 0: num_done += 1
+      sys.stdout.write('\r%d/%d done after word %d' %
+          (num_done, batch_size, caption_index))
+      sys.stdout.flush()
+      caption_index += 1
+    sys.stdout.write('\n')
+    return output_captions, output_probs
+
+  def sentence(self, vocab_indices):
+    sentence = ' '.join([self.vocab[i] for i in vocab_indices])
+    if not sentence: return sentence
+    sentence = sentence[0].upper() + sentence[1:]
+    # If sentence ends with ' <EOS>', remove it and replace with '.'
+    # Otherwise (doesn't end with '<EOS>' -- maybe it hit the max length):
+    # append '...'
+    suffix = ' ' + self.vocab[0]
+    if sentence.endswith(suffix):
+      sentence = sentence[:-len(suffix)] + '.'
+    else:
+      sentence += '...'
+    return sentence
+
+def softmax(softmax_inputs, temp):
+  shifted_inputs = softmax_inputs - softmax_inputs.max()
+  exp_outputs = np.exp(temp * shifted_inputs)
+  exp_outputs_sum = exp_outputs.sum()
+  if math.isnan(exp_outputs_sum):
+    return exp_outputs * float('nan')
+  assert exp_outputs_sum > 0
+  if math.isinf(exp_outputs_sum):
+    return np.zeros_like(exp_outputs)
+  eps_sum = 1e-20
+  return exp_outputs / max(exp_outputs_sum, eps_sum)
+
+def random_choice_from_probs(softmax_inputs, temp=1, already_softmaxed=False):
+  # temperature of infinity == take the max
+  if temp == float('inf'):
+    return np.argmax(softmax_inputs)
+  if already_softmaxed:
+    probs = softmax_inputs
+    assert temp == 1
+  else:
+    probs = softmax(softmax_inputs, temp)
+  r = random.random()
+  cum_sum = 0.
+  for i, p in enumerate(probs):
+    cum_sum += p
+    if cum_sum >= r: return i
+  return 1  # return UNK?
+
+def gen_stats(prob, normalizer=None):
+  stats = {}
+  stats['length'] = len(prob)
+  stats['log_p'] = 0.0
+  eps = 1e-12
+  for p in prob:
+    assert 0.0 <= p <= 1.0
+    stats['log_p'] += math.log(max(eps, p))
+  stats['log_p_word'] = stats['log_p'] / stats['length']
+  stats['p'] = math.exp(stats['log_p'])
+  stats['p_word'] = math.exp(stats['log_p_word'])
+  try:
+    stats['perplex'] = math.exp(-stats['log_p'])
+  except OverflowError:
+    stats['perplex'] = float('inf')
+  try:
+    stats['perplex_word'] = math.exp(-stats['log_p_word'])
+  except OverflowError:
+    stats['perplex_word'] = float('inf')
+  if normalizer is not None:
+    norm_stats = gen_stats(normalizer)
+    stats['normed_perplex'] = stats['perplex'] / norm_stats['perplex']
+    stats['normed_perplex_word'] = \
+        stats['perplex_word'] / norm_stats['perplex_word']
+  return stats
diff --git a/examples/coco_caption/coco_to_hdf5_data.py b/examples/coco_caption/coco_to_hdf5_data.py
new file mode 100755
index 00000000000..300e4748061
--- /dev/null
+++ b/examples/coco_caption/coco_to_hdf5_data.py
@@ -0,0 +1,278 @@
+#!/usr/bin/env python
+
+from hashlib import sha1
+import os
+import random
+random.seed(3)
+import re
+import sys
+
+sys.path.append('./examples/coco_caption/')
+
+COCO_PATH = './data/coco/coco'
+COCO_TOOL_PATH = '%s/PythonAPI/build/lib/pycocotools' % COCO_PATH
+COCO_IMAGE_ROOT = '%s/images' % COCO_PATH
+
+MAX_HASH = 100000
+
+sys.path.append(COCO_TOOL_PATH)
+from coco import COCO
+
+from hdf5_sequence_generator import SequenceGenerator, HDF5SequenceWriter
+
+# UNK_IDENTIFIER is the word used to identify unknown words
+UNK_IDENTIFIER = '<unk>'
+
+SENTENCE_SPLIT_REGEX = re.compile(r'(\W+)')
+def split_sentence(sentence):
+  # break sentence into a list of words and punctuation
+  sentence = [s.lower() for s in SENTENCE_SPLIT_REGEX.split(sentence.strip()) if len(s.strip()) > 0]
+  # remove the '.'
from the end of the sentence + if sentence[-1] != '.': + # print "Warning: sentence doesn't end with '.'; ends with: %s" % sentence[-1] + return sentence + return sentence[:-1] + +MAX_WORDS = 20 + +class CocoSequenceGenerator(SequenceGenerator): + def __init__(self, coco, batch_num_streams, image_root, vocab=None, + max_words=MAX_WORDS, align=True, shuffle=True, gt_captions=True, + pad=True, truncate=True, split_ids=None): + self.max_words = max_words + num_empty_lines = 0 + self.images = [] + num_total = 0 + num_missing = 0 + num_captions = 0 + known_images = {} + self.coco = coco + if split_ids is None: + split_ids = coco.imgs.keys() + self.image_path_to_id = {} + for image_id in split_ids: + image_info = coco.imgs[image_id] + image_path = '%s/%s' % (image_root, image_info['file_name']) + self.image_path_to_id[image_path] = image_id + if os.path.isfile(image_path): + assert image_id not in known_images # no duplicates allowed + known_images[image_id] = {} + known_images[image_id]['path'] = image_path + if gt_captions: + known_images[image_id]['sentences'] = [split_sentence(anno['caption']) + for anno in coco.imgToAnns[image_id]] + num_captions += len(known_images[image_id]['sentences']) + else: + known_images[image_id]['sentences'] = [] + else: + num_missing += 1 + print 'Warning (#%d): image not found: %s' % (num_missing, image_path) + num_total += 1 + print '%d/%d images missing' % (num_missing, num_total) + if vocab is None: + self.init_vocabulary(known_images) + else: + self.vocabulary_inverted = vocab + self.vocabulary = {} + for index, word in enumerate(self.vocabulary_inverted): + self.vocabulary[word] = index + self.image_sentence_pairs = [] + num_no_sentences = 0 + for image_filename, metadata in known_images.iteritems(): + if not metadata['sentences']: + num_no_sentences += 1 + print 'Warning (#%d): image with no sentences: %s' % (num_no_sentences, image_filename) + for sentence in metadata['sentences']: + self.image_sentence_pairs.append((metadata['path'], sentence)) + self.index = 0 + self.num_resets = 0 + self.num_truncates = 0 + self.num_pads = 0 + self.num_outs = 0 + self.image_list = [] + SequenceGenerator.__init__(self) + self.batch_num_streams = batch_num_streams + # make the number of image/sentence pairs a multiple of the buffer size + # so each timestep of each batch is useful and we can align the images + if align: + num_pairs = len(self.image_sentence_pairs) + remainder = num_pairs % batch_num_streams + if remainder > 0: + num_needed = batch_num_streams - remainder + for i in range(num_needed): + choice = random.randint(0, num_pairs - 1) + self.image_sentence_pairs.append(self.image_sentence_pairs[choice]) + assert len(self.image_sentence_pairs) % batch_num_streams == 0 + if shuffle: + random.shuffle(self.image_sentence_pairs) + self.pad = pad + self.truncate = truncate + self.negative_one_padded_streams = frozenset(('input_sentence', 'target_sentence')) + + def streams_exhausted(self): + return self.num_resets > 0 + + def init_vocabulary(self, image_annotations, min_count=5): + words_to_count = {} + for image_id, annotations in image_annotations.iteritems(): + for annotation in annotations['sentences']: + for word in annotation: + word = word.strip() + if word not in words_to_count: + words_to_count[word] = 0 + words_to_count[word] += 1 + # Sort words by count, then alphabetically + words_by_count = sorted(words_to_count.keys(), key=lambda w: (-words_to_count[w], w)) + print 'Initialized vocabulary with %d words; top 10 words:' % len(words_by_count) + for word in 
words_by_count[:10]: + print '\t%s (%d)' % (word, words_to_count[word]) + # Add words to vocabulary + self.vocabulary = {UNK_IDENTIFIER: 0} + self.vocabulary_inverted = [UNK_IDENTIFIER] + for index, word in enumerate(words_by_count): + word = word.strip() + if words_to_count[word] < min_count: + break + self.vocabulary_inverted.append(word) + self.vocabulary[word] = index + 1 + print 'Final vocabulary (restricted to words with counts of %d+) has %d words' % \ + (min_count, len(self.vocabulary)) + + def dump_vocabulary(self, vocab_filename): + print 'Dumping vocabulary to file: %s' % vocab_filename + with open(vocab_filename, 'wb') as vocab_file: + for word in self.vocabulary_inverted: + vocab_file.write('%s\n' % word) + print 'Done.' + + def dump_image_file(self, image_filename, dummy_image_filename=None): + print 'Dumping image list to file: %s' % image_filename + with open(image_filename, 'wb') as image_file: + for image_path, _ in self.image_list: + image_file.write('%s\n' % image_path) + if dummy_image_filename is not None: + print 'Dumping image list with dummy labels to file: %s' % dummy_image_filename + with open(dummy_image_filename, 'wb') as image_file: + for path_and_hash in self.image_list: + image_file.write('%s %d\n' % path_and_hash) + print 'Done.' + + def next_line(self): + num_lines = float(len(self.image_sentence_pairs)) + self.index += 1 + if self.index == 1 or self.index == num_lines or self.index % 10000 == 0: + print 'Processed %d/%d (%f%%) lines' % (self.index, num_lines, + 100 * self.index / num_lines) + if self.index == num_lines: + self.index = 0 + self.num_resets += 1 + + def line_to_stream(self, sentence): + stream = [] + for word in sentence: + word = word.strip() + if word in self.vocabulary: + stream.append(self.vocabulary[word]) + else: # unknown word; append UNK + stream.append(self.vocabulary[UNK_IDENTIFIER]) + # increment the stream -- 0 will be the EOS character + stream = [s + 1 for s in stream] + return stream + + def get_pad_value(self, stream_name): + return -1 if stream_name in self.negative_one_padded_streams else 0 + + def get_streams(self): + image_filename, line = self.image_sentence_pairs[self.index] + stream = self.line_to_stream(line) + pad = self.max_words - (len(stream) + 1) if self.pad else 0 + if pad > 0: self.num_pads += 1 + self.num_outs += 1 + out = {} + out['stage_indicators'] = [1] * (len(stream) + 1) + [0] * pad + out['cont_sentence'] = [0] + [1] * len(stream) + [0] * pad + out['input_sentence'] = [0] + stream + [-1] * pad + out['target_sentence'] = stream + [0] + [-1] * pad + truncated = False + if self.truncate: + for key, val in out.iteritems(): + if len(val) > self.max_words: + out[key] = val[:self.max_words] + truncated = True + self.num_truncates += truncated + image_hash = self.image_hash(image_filename) + out['hashed_image_path'] = [image_hash] * len(out['input_sentence']) + self.image_list.append((image_filename, image_hash)) + self.next_line() + return out + + def image_hash(self, filename): + image_hash = int(sha1(filename).hexdigest(), 16) % MAX_HASH + assert image_hash == float(image_hash) + return image_hash + +COCO_ANNO_PATH = '%s/annotations/captions_%%s2014.json' % COCO_PATH +COCO_IMAGE_PATTERN = '%s/images/%%s2014' % COCO_PATH +COCO_IMAGE_ID_PATTERN = 'COCO_%s2014_%%012d.jpg' + +BUFFER_SIZE = 100 +OUTPUT_DIR = './examples/coco_caption/h5_data/buffer_%d' % BUFFER_SIZE +SPLITS_PATTERN = './data/coco/coco2014_cocoid.%s.txt' +OUTPUT_DIR_PATTERN = '%s/%%s_batches' % OUTPUT_DIR + +def process_dataset(split_name, 
coco_split_name, batch_stream_length, + vocab=None, aligned=True): + with open(SPLITS_PATTERN % split_name, 'r') as split_file: + split_image_ids = [int(line) for line in split_file.readlines()] + output_dataset_name = split_name + if aligned: + output_dataset_name += '_aligned_%d' % MAX_WORDS + else: + output_dataset_name += '_unaligned' + output_path = OUTPUT_DIR_PATTERN % output_dataset_name + coco = COCO(COCO_ANNO_PATH % coco_split_name) + image_root = COCO_IMAGE_PATTERN % coco_split_name + sg = CocoSequenceGenerator(coco, BUFFER_SIZE, image_root, + split_ids=split_image_ids, vocab=vocab, align=aligned, pad=aligned, + truncate=aligned) + sg.batch_stream_length = batch_stream_length + writer = HDF5SequenceWriter(sg, output_dir=output_path) + writer.write_to_exhaustion() + writer.write_filelists() + if vocab is None: + vocab_out_path = '%s/vocabulary.txt' % OUTPUT_DIR + sg.dump_vocabulary(vocab_out_path) + image_out_path = '%s/image_list.txt' % output_path + image_dummy_labels_out_path = '%s/image_list.with_dummy_labels.txt' % output_path + sg.dump_image_file(image_out_path, image_dummy_labels_out_path) + num_outs = sg.num_outs + num_pads = sg.num_pads + num_truncates = sg.num_truncates + print 'Padded %d/%d sequences; truncated %d/%d sequences' % \ + (num_pads, num_outs, num_truncates, num_outs) + return sg.vocabulary_inverted + +def process_coco(include_trainval=False): + vocab = None + datasets = [ + ('train', 'train', 100000, True), + ('val', 'val', 100000, True), + ('test', 'val', 100000, True), + # Write unaligned datasets as well: + ('train', 'train', 100000, False), + ('val', 'val', 100000, False), + ('test', 'val', 100000, False), + ] + # Also create a 'trainval' set if include_trainval is set. + # ./data/coco/make_trainval.py must have been run for this to work. + if include_trainval: + datasets += [ + ('trainval', 'trainval', 100000, True), + ('trainval', 'trainval', 100000, False), + ] + for split_name, coco_split_name, batch_stream_length, aligned in datasets: + vocab = process_dataset(split_name, coco_split_name, batch_stream_length, + vocab=vocab, aligned=aligned) + +if __name__ == "__main__": + process_coco(include_trainval=False) diff --git a/examples/coco_caption/finetune_lrcn.sh b/examples/coco_caption/finetune_lrcn.sh new file mode 100755 index 00000000000..0e948bc6726 --- /dev/null +++ b/examples/coco_caption/finetune_lrcn.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +GPU_ID=0 +WEIGHTS=\ +./examples/coco_caption/lrcn_iter_110000.caffemodel +DATA_DIR=./examples/coco_caption/h5_data/ +if [ ! -d $DATA_DIR ]; then + echo "Data directory not found: $DATA_DIR" + echo "First, download the COCO dataset (follow instructions in data/coco)" + echo "Then, run ./examples/coco_caption/coco_to_hdf5_data.py to create the Caffe input data" + exit 1 +fi + +./build/tools/caffe train \ + -solver ./examples/coco_caption/lrcn_finetune_solver.prototxt \ + -weights $WEIGHTS \ + -gpu $GPU_ID diff --git a/examples/coco_caption/finetune_lrcn.trainval.sh b/examples/coco_caption/finetune_lrcn.trainval.sh new file mode 100755 index 00000000000..4fd19b4763b --- /dev/null +++ b/examples/coco_caption/finetune_lrcn.trainval.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +GPU_ID=0 +WEIGHTS=\ +./examples/coco_caption/lrcn_finetune_iter_50000.caffemodel +DATA_DIR=./examples/coco_caption/h5_data/ +if [ ! 
-d $DATA_DIR ]; then
+  echo "Data directory not found: $DATA_DIR"
+  echo "First, download the COCO dataset (follow instructions in data/coco)"
+  echo "Then, run ./examples/coco_caption/coco_to_hdf5_data.py to create the Caffe input data"
+  exit 1
+fi
+
+./build/tools/caffe train \
+    -solver ./examples/coco_caption/lrcn_finetune_solver.trainval.prototxt \
+    -weights $WEIGHTS \
+    -gpu $GPU_ID
diff --git a/examples/coco_caption/finetune_lrcn.vgg.sh b/examples/coco_caption/finetune_lrcn.vgg.sh
new file mode 100755
index 00000000000..85c7b5ebfde
--- /dev/null
+++ b/examples/coco_caption/finetune_lrcn.vgg.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+
+GPU_ID=0
+WEIGHTS=\
+./examples/coco_caption/lrcn_vgg_iter_90000.caffemodel
+DATA_DIR=./examples/coco_caption/h5_data/
+if [ ! -d $DATA_DIR ]; then
+  echo "Data directory not found: $DATA_DIR"
+  echo "First, download the COCO dataset (follow instructions in data/coco)"
+  echo "Then, run ./examples/coco_caption/coco_to_hdf5_data.py to create the Caffe input data"
+  exit 1
+fi
+
+./build/tools/caffe train \
+    -solver ./examples/coco_caption/lrcn_finetune_solver.vgg.prototxt \
+    -weights $WEIGHTS \
+    -gpu $GPU_ID
diff --git a/examples/coco_caption/finetune_lrcn.vgg.trainval.sh b/examples/coco_caption/finetune_lrcn.vgg.trainval.sh
new file mode 100755
index 00000000000..8b230c908fe
--- /dev/null
+++ b/examples/coco_caption/finetune_lrcn.vgg.trainval.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+
+GPU_ID=0
+WEIGHTS=\
+./examples/coco_caption/lrcn_finetune_vgg_iter_50000.caffemodel
+DATA_DIR=./examples/coco_caption/h5_data/
+if [ ! -d $DATA_DIR ]; then
+  echo "Data directory not found: $DATA_DIR"
+  echo "First, download the COCO dataset (follow instructions in data/coco)"
+  echo "Then, run ./examples/coco_caption/coco_to_hdf5_data.py to create the Caffe input data"
+  exit 1
+fi
+
+./build/tools/caffe train \
+    -solver ./examples/coco_caption/lrcn_finetune_solver.vgg.trainval.prototxt \
+    -weights $WEIGHTS \
+    -gpu $GPU_ID
diff --git a/examples/coco_caption/hdf5_sequence_generator.py b/examples/coco_caption/hdf5_sequence_generator.py
new file mode 100644
index 00000000000..98d4657b6bf
--- /dev/null
+++ b/examples/coco_caption/hdf5_sequence_generator.py
@@ -0,0 +1,132 @@
+#!/usr/bin/env python
+
+import h5py
+import numpy as np
+import os
+import random
+import sys
+
+class SequenceGenerator():
+  def __init__(self):
+    self.dimension = 10
+    self.batch_stream_length = 2000
+    self.batch_num_streams = 8
+    self.min_stream_length = 13
+    self.max_stream_length = 17
+    self.substream_names = None
+    self.streams_initialized = False
+
+  def streams_exhausted(self):
+    return False
+
+  def init_streams(self):
+    self.streams = [None] * self.batch_num_streams
+    self.stream_indices = [0] * self.batch_num_streams
+    self.reset_stream(0)
+    self.streams_initialized = True
+
+  def reset_stream(self, stream_index):
+    streams = self.get_streams()
+    stream_names = sorted(streams.keys())
+    if self.substream_names is None:
+      assert len(stream_names) > 0
+      self.substream_names = stream_names
+    assert self.substream_names == stream_names
+    if self.streams[stream_index] is None:
+      self.streams[stream_index] = {}
+    stream_length = len(streams[stream_names[0]])
+    for k, v in streams.iteritems():
+      assert stream_length == len(v)
+      self.streams[stream_index][k] = v
+    self.stream_indices[stream_index] = 0
+
+  # Pad with zeroes by default -- override this to pad with something else
+  # for a particular stream
+  def get_pad_value(self, stream_name):
+    return 0
+
+  def get_next_batch(self, 
truncate_at_exhaustion=True): + if not self.streams_initialized: + self.init_streams() + batch_size = self.batch_num_streams * self.batch_stream_length + batch = {} + batch_indicators = np.zeros((self.batch_stream_length, self.batch_num_streams)) + for name in self.substream_names: + batch[name] = self.get_pad_value(name) * np.ones_like(batch_indicators) + exhausted = [False] * self.batch_num_streams + all_exhausted = False + reached_exhaustion = False + num_completed_streams = 0 + for t in range(self.batch_stream_length): + all_exhausted = True + for i in range(self.batch_num_streams): + if not exhausted[i]: + if self.streams[i] is None or \ + self.stream_indices[i] == len(self.streams[i][self.substream_names[0]]): + self.stream_indices[i] = 0 + reached_exhaustion = reached_exhaustion or self.streams_exhausted() + if reached_exhaustion: exhausted[i] = True + if not reached_exhaustion or not truncate_at_exhaustion: + self.reset_stream(i) + else: + continue + for name in self.substream_names: + batch[name][t, i] = self.streams[i][name][self.stream_indices[i]] + batch_indicators[t, i] = 0 if self.stream_indices[i] == 0 else 1 + self.stream_indices[i] += 1 + if self.stream_indices[i] == len(self.streams[i][self.substream_names[0]]): + num_completed_streams += 1 + if not exhausted[i]: all_exhausted = False + if all_exhausted and truncate_at_exhaustion: + print ('Exhausted all data; cutting off batch at timestep %d ' + + 'with %d streams completed') % (t, num_completed_streams) + for name in self.substream_names: + batch[name] = batch[name][:t, :] + batch_indicators = batch_indicators[:t, :] + break + return batch, batch_indicators + + def get_streams(self): + raise Exception('get_streams should be overridden to return a dict ' + + 'of equal-length iterables.') + +class HDF5SequenceWriter(): + def __init__(self, sequence_generator, output_dir=None, verbose=False): + self.generator = sequence_generator + assert output_dir is not None # required + self.output_dir = output_dir + if os.path.exists(output_dir): + raise Exception('Output directory already exists: ' + output_dir) + os.makedirs(output_dir) + self.verbose = verbose + self.filenames = [] + + def write_batch(self, stop_at_exhaustion=False): + batch_comps, cont_indicators = self.generator.get_next_batch() + batch_index = len(self.filenames) + filename = '%s/batch_%d.h5' % (self.output_dir, batch_index) + self.filenames.append(filename) + h5file = h5py.File(filename, 'w') + dataset = h5file.create_dataset('cont', shape=cont_indicators.shape, dtype=cont_indicators.dtype) + dataset[:] = cont_indicators + dataset = h5file.create_dataset('buffer_size', shape=(1,), dtype=np.int) + dataset[:] = self.generator.batch_num_streams + for key, batch in batch_comps.iteritems(): + if self.verbose: + for s in range(self.generator.batch_num_streams): + stream = np.array(self.generator.streams[s][key]) + print 'batch %d, stream %s, index %d: ' % (batch_index, key, s), stream + h5dataset = h5file.create_dataset(key, shape=batch.shape, dtype=batch.dtype) + h5dataset[:] = batch + h5file.close() + + def write_to_exhaustion(self): + while not self.generator.streams_exhausted(): + self.write_batch(stop_at_exhaustion=True) + + def write_filelists(self): + assert self.filenames is not None + filelist_filename = '%s/hdf5_chunk_list.txt' % self.output_dir + with open(filelist_filename, 'w') as listfile: + for filename in self.filenames: + listfile.write('%s\n' % filename) diff --git a/examples/coco_caption/lrcn.prototxt b/examples/coco_caption/lrcn.prototxt new file 
mode 100644 index 00000000000..5a8c55443e3 --- /dev/null +++ b/examples/coco_caption/lrcn.prototxt @@ -0,0 +1,808 @@ +# The network is used for the image captioning experiments of LRCN [1]. +# Please consider citing LRCN [1] if you use this example in your work. +# +# [1] J. Donahue, L. A. Hendricks, S. Guadarrama, M. Rohrbach, S. Venugopalan, +# K. Saenko, T. Darrell. "Long-term Recurrent Convolutional Networks for +# Visual Recognition and Description." arXiv preprint arXiv:1411.4389 (2014). + +name: "lrcn_caffenet_to_lstm" + +# train data layers +layer { + name: "data" + type: "ImageData" + top: "data" + top: "label" + include { phase: TRAIN not_stage: 'trainval' } + transform_param { + mirror: true + crop_size: 227 + mean_value: 104 + mean_value: 117 + mean_value: 123 + } + image_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/train_aligned_20_batches/image_list.with_dummy_labels.txt" + batch_size: 100 + new_height: 256 + new_width: 256 + } +} +layer { + name: "data" + type: "HDF5Data" + top: "cont_sentence" + top: "input_sentence" + top: "target_sentence" + include { phase: TRAIN not_stage: 'trainval' } + hdf5_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/train_aligned_20_batches/hdf5_chunk_list.txt" + batch_size: 20 + } +} + +# trainval data layers (for finetuning final model) +layer { + name: "data" + type: "ImageData" + top: "data" + top: "label" + include { phase: TRAIN stage: 'trainval' } + transform_param { + mirror: true + crop_size: 227 + mean_value: 104 + mean_value: 117 + mean_value: 123 + } + image_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/trainval_aligned_20_batches/image_list.with_dummy_labels.txt" + batch_size: 100 + new_height: 256 + new_width: 256 + } +} +layer { + name: "data" + type: "HDF5Data" + top: "cont_sentence" + top: "input_sentence" + top: "target_sentence" + include { phase: TRAIN stage: 'trainval' } + hdf5_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/trainval_aligned_20_batches/hdf5_chunk_list.txt" + batch_size: 20 + } +} + +# test on train data layers +layer { + name: "data" + type: "ImageData" + top: "data" + top: "label" + include { + phase: TEST + stage: "test-on-train" + } + transform_param { + mirror: true + crop_size: 227 + mean_value: 104 + mean_value: 117 + mean_value: 123 + } + image_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/train_aligned_20_batches/image_list.with_dummy_labels.txt" + batch_size: 100 + new_height: 256 + new_width: 256 + } +} +layer { + name: "data" + type: "HDF5Data" + top: "cont_sentence" + top: "input_sentence" + top: "target_sentence" + include { + phase: TEST + stage: "test-on-train" + } + hdf5_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/train_aligned_20_batches/hdf5_chunk_list.txt" + batch_size: 20 + } +} + +# test on val data layers +layer { + name: "data" + type: "ImageData" + top: "data" + top: "label" + include { + phase: TEST + stage: "test-on-val" + } + transform_param { + mirror: true + crop_size: 227 + mean_value: 104 + mean_value: 117 + mean_value: 123 + } + image_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/val_aligned_20_batches/image_list.with_dummy_labels.txt" + batch_size: 100 + new_height: 256 + new_width: 256 + } +} +layer { + name: "data" + type: "HDF5Data" + top: "cont_sentence" + top: "input_sentence" + top: "target_sentence" + include { + phase: TEST + stage: "test-on-val" + } + hdf5_data_param { + source: 
"./examples/coco_caption/h5_data/buffer_100/val_aligned_20_batches/hdf5_chunk_list.txt" + batch_size: 20 + } +} + +layer { + name: "silence" + type: "Silence" + bottom: "label" +} +layer { + name: "conv1" + type: "Convolution" + bottom: "data" + top: "conv1" + param { + lr_mult: 0 + } + param { + lr_mult: 0 + } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 96 + kernel_size: 11 + stride: 4 + } +} +layer { + name: "conv1" + type: "Convolution" + bottom: "data" + top: "conv1" + param { + lr_mult: 0.1 + decay_mult: 1 + } + param { + lr_mult: 0.2 + decay_mult: 0 + } + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 96 + kernel_size: 11 + stride: 4 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu1" + type: "ReLU" + bottom: "conv1" + top: "conv1" +} +layer { + name: "pool1" + type: "Pooling" + bottom: "conv1" + top: "pool1" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "norm1" + type: "LRN" + bottom: "pool1" + top: "norm1" + lrn_param { + local_size: 5 + alpha: 0.0001 + beta: 0.75 + } +} +layer { + name: "conv2" + type: "Convolution" + bottom: "norm1" + top: "conv2" + param { + lr_mult: 0 + } + param { + lr_mult: 0 + } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 256 + pad: 2 + kernel_size: 5 + group: 2 + } +} +layer { + name: "conv2" + type: "Convolution" + bottom: "norm1" + top: "conv2" + param { + lr_mult: 0.1 + decay_mult: 1 + } + param { + lr_mult: 0.2 + decay_mult: 0 + } + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 256 + pad: 2 + kernel_size: 5 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 1 + } + } +} +layer { + name: "relu2" + type: "ReLU" + bottom: "conv2" + top: "conv2" +} +layer { + name: "pool2" + type: "Pooling" + bottom: "conv2" + top: "pool2" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "norm2" + type: "LRN" + bottom: "pool2" + top: "norm2" + lrn_param { + local_size: 5 + alpha: 0.0001 + beta: 0.75 + } +} +layer { + name: "conv3" + type: "Convolution" + bottom: "norm2" + top: "conv3" + param { + lr_mult: 0 + } + param { + lr_mult: 0 + } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv3" + type: "Convolution" + bottom: "norm2" + top: "conv3" + param { + lr_mult: 0.1 + decay_mult: 1 + } + param { + lr_mult: 0.2 + decay_mult: 0 + } + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu3" + type: "ReLU" + bottom: "conv3" + top: "conv3" +} +layer { + name: "conv4" + type: "Convolution" + bottom: "conv3" + top: "conv4" + param { + lr_mult: 0 + } + param { + lr_mult: 0 + } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + group: 2 + } +} +layer { + name: "conv4" + type: "Convolution" + bottom: "conv3" + top: "conv4" + param { + lr_mult: 0.1 + decay_mult: 1 + } + param { + lr_mult: 0.2 + decay_mult: 0 + } + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 1 + } + } +} +layer { + name: 
"relu4" + type: "ReLU" + bottom: "conv4" + top: "conv4" +} +layer { + name: "conv5" + type: "Convolution" + bottom: "conv4" + top: "conv5" + param { + lr_mult: 0 + } + param { + lr_mult: 0 + } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + group: 2 + } +} +layer { + name: "conv5" + type: "Convolution" + bottom: "conv4" + top: "conv5" + param { + lr_mult: 0.1 + decay_mult: 1 + } + param { + lr_mult: 0.2 + decay_mult: 0 + } + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 1 + } + } +} +layer { + name: "relu5" + type: "ReLU" + bottom: "conv5" + top: "conv5" +} +layer { + name: "pool5" + type: "Pooling" + bottom: "conv5" + top: "pool5" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "fc6" + type: "InnerProduct" + bottom: "pool5" + top: "fc6" + param { + lr_mult: 0 + } + param { + lr_mult: 0 + } + include { stage: "freeze-convnet" } + inner_product_param { + num_output: 4096 + } +} +layer { + name: "fc6" + type: "InnerProduct" + bottom: "pool5" + top: "fc6" + param { + lr_mult: 0.1 + decay_mult: 1 + } + param { + lr_mult: 0.2 + decay_mult: 0 + } + exclude { stage: "freeze-convnet" } + inner_product_param { + num_output: 4096 + weight_filler { + type: "gaussian" + std: 0.005 + } + bias_filler { + type: "constant" + value: 1 + } + } +} +layer { + name: "relu6" + type: "ReLU" + bottom: "fc6" + top: "fc6" +} +layer { + name: "drop6" + type: "Dropout" + bottom: "fc6" + top: "fc6" + dropout_param { + dropout_ratio: 0.5 + } +} +layer { + name: "fc7" + type: "InnerProduct" + bottom: "fc6" + top: "fc7" + param { + lr_mult: 0 + } + param { + lr_mult: 0 + } + include { stage: "freeze-convnet" } + inner_product_param { + num_output: 4096 + } +} +layer { + name: "fc7" + type: "InnerProduct" + bottom: "fc6" + top: "fc7" + param { + lr_mult: 0.1 + decay_mult: 1 + } + param { + lr_mult: 0.2 + decay_mult: 0 + } + exclude { stage: "freeze-convnet" } + inner_product_param { + num_output: 4096 + weight_filler { + type: "gaussian" + std: 0.005 + } + bias_filler { + type: "constant" + value: 1 + } + } +} +layer { + name: "relu7" + type: "ReLU" + bottom: "fc7" + top: "fc7" +} +layer { + name: "drop7" + type: "Dropout" + bottom: "fc7" + top: "fc7" + dropout_param { + dropout_ratio: 0.5 + } +} +layer { + name: "fc8" + type: "InnerProduct" + bottom: "fc7" + top: "fc8" + param { + lr_mult: 0.1 + decay_mult: 1 + } + param { + lr_mult: 0.2 + decay_mult: 0 + } + inner_product_param { + num_output: 1000 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "embedding" + type: "Embed" + bottom: "input_sentence" + top: "embedded_input_sentence" + param { + lr_mult: 1 + } + embed_param { + bias_term: false + input_dim: 8801 + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + } +} +layer { + name: "lstm1" + type: "LSTM" + bottom: "embedded_input_sentence" + bottom: "cont_sentence" + bottom: "fc8" + top: "lstm1" + include { stage: "unfactored" } + recurrent_param { + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "lstm2" + type: "LSTM" + bottom: "lstm1" + bottom: "cont_sentence" + top: "lstm2" + include { + stage: "unfactored" + stage: "2-layer" + } + 
recurrent_param { + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "lstm1" + type: "LSTM" + bottom: "embedded_input_sentence" + bottom: "cont_sentence" + top: "lstm1" + include { stage: "factored" } + recurrent_param { + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "lstm2" + type: "LSTM" + bottom: "lstm1" + bottom: "cont_sentence" + bottom: "fc8" + top: "lstm2" + include { stage: "factored" } + recurrent_param { + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "predict" + type: "InnerProduct" + bottom: "lstm1" + top: "predict" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + exclude { stage: "2-layer" } + inner_product_param { + num_output: 8801 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + axis: 2 + } +} +layer { + name: "predict" + type: "InnerProduct" + bottom: "lstm2" + top: "predict" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + include { stage: "2-layer" } + inner_product_param { + num_output: 8801 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + axis: 2 + } +} +layer { + name: "cross_entropy_loss" + type: "SoftmaxWithLoss" + bottom: "predict" + bottom: "target_sentence" + top: "cross_entropy_loss" + loss_weight: 20 + loss_param { + ignore_label: -1 + } + softmax_param { + axis: 2 + } +} +layer { + name: "accuracy" + type: "Accuracy" + bottom: "predict" + bottom: "target_sentence" + top: "accuracy" + include { phase: TEST } + accuracy_param { + axis: 2 + ignore_label: -1 + } +} diff --git a/examples/coco_caption/lrcn.vgg.buffer_50.prototxt b/examples/coco_caption/lrcn.vgg.buffer_50.prototxt new file mode 100644 index 00000000000..4091a6f7785 --- /dev/null +++ b/examples/coco_caption/lrcn.vgg.buffer_50.prototxt @@ -0,0 +1,940 @@ +# The network is used for the image captioning experiments of LRCN [1]. +# Please consider citing LRCN [1] if you use this example in your work. +# +# [1] J. Donahue, L. A. Hendricks, S. Guadarrama, M. Rohrbach, S. Venugopalan, +# K. Saenko, T. Darrell. "Long-term Recurrent Convolutional Networks for +# Visual Recognition and Description." arXiv preprint arXiv:1411.4389 (2014). 
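lrcn.prototxt above is an "all-in-one" net: each layer carries include/exclude rules over named stages ('trainval', 'test-on-train', 'test-on-val', 'freeze-convnet', 'factored'/'unfactored', '2-layer'), and the solver's train_state/test_state pick one concrete architecture out of it. Caffe's Net::FilterNet applies these rules in C++; the sketch below restates the selection logic in Python (not the actual code path; level-based rules are omitted) so a stage combination can be checked against the prototxt:

    from caffe.proto import caffe_pb2
    from google.protobuf import text_format

    def rule_met(rule, phase, stages):
        # a NetStateRule matches if the phase agrees, every required stage is
        # present, and no excluded (not_stage) stage is present
        if rule.HasField('phase') and rule.phase != phase:
            return False
        if any(s not in stages for s in rule.stage):
            return False
        return not any(s in stages for s in rule.not_stage)

    def layer_kept(layer, phase, stages):
        # with include rules, keep the layer if any rule matches;
        # otherwise keep it unless an exclude rule matches
        if layer.include:
            return any(rule_met(r, phase, stages) for r in layer.include)
        return not any(rule_met(r, phase, stages) for r in layer.exclude)

    net = caffe_pb2.NetParameter()
    text_format.Merge(open('./examples/coco_caption/lrcn.prototxt').read(), net)
    stages = ['freeze-convnet', 'factored', '2-layer']
    kept = [l.name for l in net.layer if layer_kept(l, caffe_pb2.TRAIN, stages)]
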
+ +name: "lrcn_caffenet_to_lstm" + +# train data layers +layer { + name: "data" + type: "ImageData" + top: "data" + top: "label" + include { phase: TRAIN not_stage: 'trainval' } + transform_param { + mirror: true + crop_size: 224 + mean_value: 104 + mean_value: 117 + mean_value: 123 + } + image_data_param { + source: "./examples/coco_caption/h5_data/buffer_50/train_aligned_20_batches/image_list.with_dummy_labels.txt" + batch_size: 50 + new_height: 256 + new_width: 256 + } +} +layer { + name: "data" + type: "HDF5Data" + top: "cont_sentence" + top: "input_sentence" + top: "target_sentence" + include { phase: TRAIN not_stage: 'trainval' } + hdf5_data_param { + source: "./examples/coco_caption/h5_data/buffer_50/train_aligned_20_batches/hdf5_chunk_list.txt" + batch_size: 20 + } +} + +# trainval data layers (for finetuning final model) +layer { + name: "data" + type: "ImageData" + top: "data" + top: "label" + include { phase: TRAIN stage: 'trainval' } + transform_param { + mirror: true + crop_size: 224 + mean_value: 104 + mean_value: 117 + mean_value: 123 + } + image_data_param { + source: "./examples/coco_caption/h5_data/buffer_50/trainval_aligned_20_batches/image_list.with_dummy_labels.txt" + batch_size: 50 + new_height: 256 + new_width: 256 + } +} +layer { + name: "data" + type: "HDF5Data" + top: "cont_sentence" + top: "input_sentence" + top: "target_sentence" + include { phase: TRAIN stage: 'trainval' } + hdf5_data_param { + source: "./examples/coco_caption/h5_data/buffer_50/trainval_aligned_20_batches/hdf5_chunk_list.txt" + batch_size: 20 + } +} + +# test on train data layers +layer { + name: "data" + type: "ImageData" + top: "data" + top: "label" + include { + phase: TEST + stage: "test-on-train" + } + transform_param { + mirror: true + crop_size: 224 + mean_value: 104 + mean_value: 117 + mean_value: 123 + } + image_data_param { + source: "./examples/coco_caption/h5_data/buffer_50/train_aligned_20_batches/image_list.with_dummy_labels.txt" + batch_size: 50 + new_height: 256 + new_width: 256 + } +} +layer { + name: "data" + type: "HDF5Data" + top: "cont_sentence" + top: "input_sentence" + top: "target_sentence" + include { + phase: TEST + stage: "test-on-train" + } + hdf5_data_param { + source: "./examples/coco_caption/h5_data/buffer_50/train_aligned_20_batches/hdf5_chunk_list.txt" + batch_size: 20 + } +} + +# test on val data layers +layer { + name: "data" + type: "ImageData" + top: "data" + top: "label" + include { + phase: TEST + stage: "test-on-val" + } + transform_param { + mirror: true + crop_size: 224 + mean_value: 104 + mean_value: 117 + mean_value: 123 + } + image_data_param { + source: "./examples/coco_caption/h5_data/buffer_50/val_aligned_20_batches/image_list.with_dummy_labels.txt" + batch_size: 50 + new_height: 256 + new_width: 256 + } +} +layer { + name: "data" + type: "HDF5Data" + top: "cont_sentence" + top: "input_sentence" + top: "target_sentence" + include { + phase: TEST + stage: "test-on-val" + } + hdf5_data_param { + source: "./examples/coco_caption/h5_data/buffer_50/val_aligned_20_batches/hdf5_chunk_list.txt" + batch_size: 20 + } +} + +layer { + name: "silence" + type: "Silence" + bottom: "label" +} +layer { + name: "conv1_1" + type: "Convolution" + bottom: "data" + top: "conv1_1" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 64 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv1_1" + type: "Convolution" + bottom: "data" + top: "conv1_1" + param { lr_mult: 0.1 } + param { 
lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 64 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu1_1" + type: "ReLU" + bottom: "conv1_1" + top: "conv1_1" +} +layer { + name: "conv1_2" + type: "Convolution" + bottom: "conv1_1" + top: "conv1_2" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 64 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv1_2" + type: "Convolution" + bottom: "conv1_1" + top: "conv1_2" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 64 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu1_2" + type: "ReLU" + bottom: "conv1_2" + top: "conv1_2" +} +layer { + name: "pool1" + type: "Pooling" + bottom: "conv1_2" + top: "pool1" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv2_1" + type: "Convolution" + bottom: "pool1" + top: "conv2_1" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv2_1" + type: "Convolution" + bottom: "pool1" + top: "conv2_1" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu2_1" + type: "ReLU" + bottom: "conv2_1" + top: "conv2_1" +} +layer { + name: "conv2_2" + type: "Convolution" + bottom: "conv2_1" + top: "conv2_2" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv2_2" + type: "Convolution" + bottom: "conv2_1" + top: "conv2_2" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu2_2" + type: "ReLU" + bottom: "conv2_2" + top: "conv2_2" +} +layer { + name: "pool2" + type: "Pooling" + bottom: "conv2_2" + top: "pool2" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv3_1" + type: "Convolution" + bottom: "pool2" + top: "conv3_1" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv3_1" + type: "Convolution" + bottom: "pool2" + top: "conv3_1" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu3_1" + type: "ReLU" + bottom: "conv3_1" + top: "conv3_1" +} +layer { + name: "conv3_2" + type: "Convolution" + bottom: "conv3_1" + top: "conv3_2" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv3_2" + type: "Convolution" + bottom: "conv3_1" + top: "conv3_2" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu3_2" + type: "ReLU" + bottom: "conv3_2" + top: "conv3_2" +} +layer { + name: "conv3_3" + type: "Convolution" + bottom: 
"conv3_2" + top: "conv3_3" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv3_3" + type: "Convolution" + bottom: "conv3_2" + top: "conv3_3" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu3_3" + type: "ReLU" + bottom: "conv3_3" + top: "conv3_3" +} +layer { + name: "pool3" + type: "Pooling" + bottom: "conv3_3" + top: "pool3" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv4_1" + type: "Convolution" + bottom: "pool3" + top: "conv4_1" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv4_1" + type: "Convolution" + bottom: "pool3" + top: "conv4_1" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu4_1" + type: "ReLU" + bottom: "conv4_1" + top: "conv4_1" +} +layer { + name: "conv4_2" + type: "Convolution" + bottom: "conv4_1" + top: "conv4_2" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv4_2" + type: "Convolution" + bottom: "conv4_1" + top: "conv4_2" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu4_2" + type: "ReLU" + bottom: "conv4_2" + top: "conv4_2" +} +layer { + name: "conv4_3" + type: "Convolution" + bottom: "conv4_2" + top: "conv4_3" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv4_3" + type: "Convolution" + bottom: "conv4_2" + top: "conv4_3" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu4_3" + type: "ReLU" + bottom: "conv4_3" + top: "conv4_3" +} +layer { + name: "pool4" + type: "Pooling" + bottom: "conv4_3" + top: "pool4" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv5_1" + type: "Convolution" + bottom: "pool4" + top: "conv5_1" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv5_1" + type: "Convolution" + bottom: "pool4" + top: "conv5_1" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu5_1" + type: "ReLU" + bottom: "conv5_1" + top: "conv5_1" +} +layer { + name: "conv5_2" + type: "Convolution" + bottom: "conv5_1" + top: "conv5_2" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv5_2" + type: "Convolution" + bottom: "conv5_1" + top: 
"conv5_2" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu5_2" + type: "ReLU" + bottom: "conv5_2" + top: "conv5_2" +} +layer { + name: "conv5_3" + type: "Convolution" + bottom: "conv5_2" + top: "conv5_3" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv5_3" + type: "Convolution" + bottom: "conv5_2" + top: "conv5_3" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu5_3" + type: "ReLU" + bottom: "conv5_3" + top: "conv5_3" +} +layer { + name: "pool5" + type: "Pooling" + bottom: "conv5_3" + top: "pool5" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "fc6" + type: "InnerProduct" + bottom: "pool5" + top: "fc6" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + inner_product_param { + num_output: 4096 + } +} +layer { + name: "fc6" + type: "InnerProduct" + bottom: "pool5" + top: "fc6" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + inner_product_param { + num_output: 4096 + } +} +layer { + name: "relu6" + type: "ReLU" + bottom: "fc6" + top: "fc6" +} +layer { + name: "drop6" + type: "Dropout" + bottom: "fc6" + top: "fc6" + dropout_param { + dropout_ratio: 0.5 + } +} +layer { + name: "fc7" + type: "InnerProduct" + bottom: "fc6" + top: "fc7" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + inner_product_param { + num_output: 4096 + } +} +layer { + name: "fc7" + type: "InnerProduct" + bottom: "fc6" + top: "fc7" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + inner_product_param { + num_output: 4096 + } +} +layer { + name: "relu7" + type: "ReLU" + bottom: "fc7" + top: "fc7" +} +layer { + name: "drop7" + type: "Dropout" + bottom: "fc7" + top: "fc7" + dropout_param { + dropout_ratio: 0.5 + } +} +layer { + name: "fc8" + type: "InnerProduct" + bottom: "fc7" + top: "fc8" + param { + lr_mult: 0.1 + decay_mult: 1 + } + param { + lr_mult: 0.2 + decay_mult: 0 + } + inner_product_param { + num_output: 1000 + } +} +layer { + name: "embedding" + type: "Embed" + bottom: "input_sentence" + top: "embedded_input_sentence" + param { + lr_mult: 1 + } + embed_param { + bias_term: false + input_dim: 8801 + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + } +} +layer { + name: "lstm1" + type: "LSTM" + bottom: "embedded_input_sentence" + bottom: "cont_sentence" + bottom: "fc8" + top: "lstm1" + include { stage: "unfactored" } + recurrent_param { + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "lstm2" + type: "LSTM" + bottom: "lstm1" + bottom: "cont_sentence" + top: "lstm2" + include { + stage: "unfactored" + stage: "2-layer" + } + recurrent_param { + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "lstm1" + type: "LSTM" + bottom: "embedded_input_sentence" + bottom: "cont_sentence" + top: "lstm1" + include 
{ stage: "factored" } + recurrent_param { + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "lstm2" + type: "LSTM" + bottom: "lstm1" + bottom: "cont_sentence" + bottom: "fc8" + top: "lstm2" + include { stage: "factored" } + recurrent_param { + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "predict" + type: "InnerProduct" + bottom: "lstm1" + top: "predict" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + exclude { stage: "2-layer" } + inner_product_param { + num_output: 8801 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + axis: 2 + } +} +layer { + name: "predict" + type: "InnerProduct" + bottom: "lstm2" + top: "predict" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + include { stage: "2-layer" } + inner_product_param { + num_output: 8801 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + axis: 2 + } +} +layer { + name: "cross_entropy_loss" + type: "SoftmaxWithLoss" + bottom: "predict" + bottom: "target_sentence" + top: "cross_entropy_loss" + loss_weight: 20 + loss_param { + ignore_label: -1 + } + softmax_param { + axis: 2 + } +} +layer { + name: "accuracy" + type: "Accuracy" + bottom: "predict" + bottom: "target_sentence" + top: "accuracy" + include { phase: TEST } + accuracy_param { + axis: 2 + ignore_label: -1 + } +} diff --git a/examples/coco_caption/lrcn.vgg.prototxt b/examples/coco_caption/lrcn.vgg.prototxt new file mode 100644 index 00000000000..db914ff0888 --- /dev/null +++ b/examples/coco_caption/lrcn.vgg.prototxt @@ -0,0 +1,940 @@ +# The network is used for the image captioning experiments of LRCN [1]. +# Please consider citing LRCN [1] if you use this example in your work. +# +# [1] J. Donahue, L. A. Hendricks, S. Guadarrama, M. Rohrbach, S. Venugopalan, +# K. Saenko, T. Darrell. "Long-term Recurrent Convolutional Networks for +# Visual Recognition and Description." arXiv preprint arXiv:1411.4389 (2014). 
+ +name: "lrcn_caffenet_to_lstm" + +# train data layers +layer { + name: "data" + type: "ImageData" + top: "data" + top: "label" + include { phase: TRAIN not_stage: 'trainval' } + transform_param { + mirror: true + crop_size: 224 + mean_value: 104 + mean_value: 117 + mean_value: 123 + } + image_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/train_aligned_20_batches/image_list.with_dummy_labels.txt" + batch_size: 100 + new_height: 256 + new_width: 256 + } +} +layer { + name: "data" + type: "HDF5Data" + top: "cont_sentence" + top: "input_sentence" + top: "target_sentence" + include { phase: TRAIN not_stage: 'trainval' } + hdf5_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/train_aligned_20_batches/hdf5_chunk_list.txt" + batch_size: 20 + } +} + +# trainval data layers (for finetuning final model) +layer { + name: "data" + type: "ImageData" + top: "data" + top: "label" + include { phase: TRAIN stage: 'trainval' } + transform_param { + mirror: true + crop_size: 224 + mean_value: 104 + mean_value: 117 + mean_value: 123 + } + image_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/trainval_aligned_20_batches/image_list.with_dummy_labels.txt" + batch_size: 100 + new_height: 256 + new_width: 256 + } +} +layer { + name: "data" + type: "HDF5Data" + top: "cont_sentence" + top: "input_sentence" + top: "target_sentence" + include { phase: TRAIN stage: 'trainval' } + hdf5_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/trainval_aligned_20_batches/hdf5_chunk_list.txt" + batch_size: 20 + } +} + +# test on train data layers +layer { + name: "data" + type: "ImageData" + top: "data" + top: "label" + include { + phase: TEST + stage: "test-on-train" + } + transform_param { + mirror: true + crop_size: 224 + mean_value: 104 + mean_value: 117 + mean_value: 123 + } + image_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/train_aligned_20_batches/image_list.with_dummy_labels.txt" + batch_size: 100 + new_height: 256 + new_width: 256 + } +} +layer { + name: "data" + type: "HDF5Data" + top: "cont_sentence" + top: "input_sentence" + top: "target_sentence" + include { + phase: TEST + stage: "test-on-train" + } + hdf5_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/train_aligned_20_batches/hdf5_chunk_list.txt" + batch_size: 20 + } +} + +# test on val data layers +layer { + name: "data" + type: "ImageData" + top: "data" + top: "label" + include { + phase: TEST + stage: "test-on-val" + } + transform_param { + mirror: true + crop_size: 224 + mean_value: 104 + mean_value: 117 + mean_value: 123 + } + image_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/val_aligned_20_batches/image_list.with_dummy_labels.txt" + batch_size: 100 + new_height: 256 + new_width: 256 + } +} +layer { + name: "data" + type: "HDF5Data" + top: "cont_sentence" + top: "input_sentence" + top: "target_sentence" + include { + phase: TEST + stage: "test-on-val" + } + hdf5_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/val_aligned_20_batches/hdf5_chunk_list.txt" + batch_size: 20 + } +} + +layer { + name: "silence" + type: "Silence" + bottom: "label" +} +layer { + name: "conv1_1" + type: "Convolution" + bottom: "data" + top: "conv1_1" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 64 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv1_1" + type: "Convolution" + bottom: "data" + top: "conv1_1" + param { lr_mult: 0.1 } + 
param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 64 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu1_1" + type: "ReLU" + bottom: "conv1_1" + top: "conv1_1" +} +layer { + name: "conv1_2" + type: "Convolution" + bottom: "conv1_1" + top: "conv1_2" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 64 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv1_2" + type: "Convolution" + bottom: "conv1_1" + top: "conv1_2" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 64 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu1_2" + type: "ReLU" + bottom: "conv1_2" + top: "conv1_2" +} +layer { + name: "pool1" + type: "Pooling" + bottom: "conv1_2" + top: "pool1" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv2_1" + type: "Convolution" + bottom: "pool1" + top: "conv2_1" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv2_1" + type: "Convolution" + bottom: "pool1" + top: "conv2_1" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu2_1" + type: "ReLU" + bottom: "conv2_1" + top: "conv2_1" +} +layer { + name: "conv2_2" + type: "Convolution" + bottom: "conv2_1" + top: "conv2_2" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv2_2" + type: "Convolution" + bottom: "conv2_1" + top: "conv2_2" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu2_2" + type: "ReLU" + bottom: "conv2_2" + top: "conv2_2" +} +layer { + name: "pool2" + type: "Pooling" + bottom: "conv2_2" + top: "pool2" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv3_1" + type: "Convolution" + bottom: "pool2" + top: "conv3_1" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv3_1" + type: "Convolution" + bottom: "pool2" + top: "conv3_1" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu3_1" + type: "ReLU" + bottom: "conv3_1" + top: "conv3_1" +} +layer { + name: "conv3_2" + type: "Convolution" + bottom: "conv3_1" + top: "conv3_2" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv3_2" + type: "Convolution" + bottom: "conv3_1" + top: "conv3_2" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu3_2" + type: "ReLU" + bottom: "conv3_2" + top: "conv3_2" +} +layer { + name: "conv3_3" + type: "Convolution" + 
bottom: "conv3_2" + top: "conv3_3" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv3_3" + type: "Convolution" + bottom: "conv3_2" + top: "conv3_3" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu3_3" + type: "ReLU" + bottom: "conv3_3" + top: "conv3_3" +} +layer { + name: "pool3" + type: "Pooling" + bottom: "conv3_3" + top: "pool3" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv4_1" + type: "Convolution" + bottom: "pool3" + top: "conv4_1" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv4_1" + type: "Convolution" + bottom: "pool3" + top: "conv4_1" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu4_1" + type: "ReLU" + bottom: "conv4_1" + top: "conv4_1" +} +layer { + name: "conv4_2" + type: "Convolution" + bottom: "conv4_1" + top: "conv4_2" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv4_2" + type: "Convolution" + bottom: "conv4_1" + top: "conv4_2" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu4_2" + type: "ReLU" + bottom: "conv4_2" + top: "conv4_2" +} +layer { + name: "conv4_3" + type: "Convolution" + bottom: "conv4_2" + top: "conv4_3" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv4_3" + type: "Convolution" + bottom: "conv4_2" + top: "conv4_3" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu4_3" + type: "ReLU" + bottom: "conv4_3" + top: "conv4_3" +} +layer { + name: "pool4" + type: "Pooling" + bottom: "conv4_3" + top: "pool4" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv5_1" + type: "Convolution" + bottom: "pool4" + top: "conv5_1" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv5_1" + type: "Convolution" + bottom: "pool4" + top: "conv5_1" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu5_1" + type: "ReLU" + bottom: "conv5_1" + top: "conv5_1" +} +layer { + name: "conv5_2" + type: "Convolution" + bottom: "conv5_1" + top: "conv5_2" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv5_2" + type: "Convolution" + bottom: "conv5_1" + 
top: "conv5_2" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu5_2" + type: "ReLU" + bottom: "conv5_2" + top: "conv5_2" +} +layer { + name: "conv5_3" + type: "Convolution" + bottom: "conv5_2" + top: "conv5_3" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv5_3" + type: "Convolution" + bottom: "conv5_2" + top: "conv5_3" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "relu5_3" + type: "ReLU" + bottom: "conv5_3" + top: "conv5_3" +} +layer { + name: "pool5" + type: "Pooling" + bottom: "conv5_3" + top: "pool5" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "fc6" + type: "InnerProduct" + bottom: "pool5" + top: "fc6" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + inner_product_param { + num_output: 4096 + } +} +layer { + name: "fc6" + type: "InnerProduct" + bottom: "pool5" + top: "fc6" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + inner_product_param { + num_output: 4096 + } +} +layer { + name: "relu6" + type: "ReLU" + bottom: "fc6" + top: "fc6" +} +layer { + name: "drop6" + type: "Dropout" + bottom: "fc6" + top: "fc6" + dropout_param { + dropout_ratio: 0.5 + } +} +layer { + name: "fc7" + type: "InnerProduct" + bottom: "fc6" + top: "fc7" + param { lr_mult: 0 } + param { lr_mult: 0 decay_mult: 0 } + include { stage: "freeze-convnet" } + inner_product_param { + num_output: 4096 + } +} +layer { + name: "fc7" + type: "InnerProduct" + bottom: "fc6" + top: "fc7" + param { lr_mult: 0.1 } + param { lr_mult: 0.2 decay_mult: 0} + exclude { stage: "freeze-convnet" } + inner_product_param { + num_output: 4096 + } +} +layer { + name: "relu7" + type: "ReLU" + bottom: "fc7" + top: "fc7" +} +layer { + name: "drop7" + type: "Dropout" + bottom: "fc7" + top: "fc7" + dropout_param { + dropout_ratio: 0.5 + } +} +layer { + name: "fc8" + type: "InnerProduct" + bottom: "fc7" + top: "fc8" + param { + lr_mult: 0.1 + decay_mult: 1 + } + param { + lr_mult: 0.2 + decay_mult: 0 + } + inner_product_param { + num_output: 1000 + } +} +layer { + name: "embedding" + type: "Embed" + bottom: "input_sentence" + top: "embedded_input_sentence" + param { + lr_mult: 1 + } + embed_param { + bias_term: false + input_dim: 8801 + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + } +} +layer { + name: "lstm1" + type: "LSTM" + bottom: "embedded_input_sentence" + bottom: "cont_sentence" + bottom: "fc8" + top: "lstm1" + include { stage: "unfactored" } + recurrent_param { + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "lstm2" + type: "LSTM" + bottom: "lstm1" + bottom: "cont_sentence" + top: "lstm2" + include { + stage: "unfactored" + stage: "2-layer" + } + recurrent_param { + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "lstm1" + type: "LSTM" + bottom: "embedded_input_sentence" + bottom: "cont_sentence" + top: "lstm1" + 
include { stage: "factored" } + recurrent_param { + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "lstm2" + type: "LSTM" + bottom: "lstm1" + bottom: "cont_sentence" + bottom: "fc8" + top: "lstm2" + include { stage: "factored" } + recurrent_param { + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "predict" + type: "InnerProduct" + bottom: "lstm1" + top: "predict" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + exclude { stage: "2-layer" } + inner_product_param { + num_output: 8801 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + axis: 2 + } +} +layer { + name: "predict" + type: "InnerProduct" + bottom: "lstm2" + top: "predict" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + include { stage: "2-layer" } + inner_product_param { + num_output: 8801 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + axis: 2 + } +} +layer { + name: "cross_entropy_loss" + type: "SoftmaxWithLoss" + bottom: "predict" + bottom: "target_sentence" + top: "cross_entropy_loss" + loss_weight: 20 + loss_param { + ignore_label: -1 + } + softmax_param { + axis: 2 + } +} +layer { + name: "accuracy" + type: "Accuracy" + bottom: "predict" + bottom: "target_sentence" + top: "accuracy" + include { phase: TEST } + accuracy_param { + axis: 2 + ignore_label: -1 + } +} diff --git a/examples/coco_caption/lrcn_finetune_solver.prototxt b/examples/coco_caption/lrcn_finetune_solver.prototxt new file mode 100644 index 00000000000..52dae5f6cf4 --- /dev/null +++ b/examples/coco_caption/lrcn_finetune_solver.prototxt @@ -0,0 +1,30 @@ +net: "./examples/coco_caption/lrcn.prototxt" + +# lrcn.prototxt supports three variants of the LRCN architecture: +# (1) stage: 'factored' stage: '2-layer' +# (2) stage: 'unfactored' stage: '1-layer' +# (3) stage: 'unfactored' stage: '2-layer' +# This solver uses variant (1). +# To use a different variant, modify the states (train_state, test_state) +# below as appropriate: + +train_state: { stage: 'factored' stage: '2-layer' } +test_iter: 25 +test_state: { stage: 'factored' stage: '2-layer' stage: 'test-on-train' } +test_iter: 25 +test_state: { stage: 'factored' stage: '2-layer' stage: 'test-on-val' } +test_interval: 1000 +base_lr: 0.001 +lr_policy: "step" +gamma: 0.5 +stepsize: 20000 +display: 1 +max_iter: 50000 +momentum: 0.9 +weight_decay: 0.0000 +snapshot: 5000 +snapshot_prefix: "./examples/coco_caption/lrcn_finetune" +solver_mode: GPU +random_seed: 1701 +average_loss: 100 +clip_gradients: 25 diff --git a/examples/coco_caption/lrcn_finetune_solver.trainval.prototxt b/examples/coco_caption/lrcn_finetune_solver.trainval.prototxt new file mode 100644 index 00000000000..bc2345ecdc5 --- /dev/null +++ b/examples/coco_caption/lrcn_finetune_solver.trainval.prototxt @@ -0,0 +1,28 @@ +net: "./examples/coco_caption/lrcn.prototxt" + +# lrcn.prototxt supports three variants of the LRCN architecture: +# (1) stage: 'factored' stage: '2-layer' +# (2) stage: 'unfactored' stage: '1-layer' +# (3) stage: 'unfactored' stage: '2-layer' +# This solver uses variant (1). 
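The finetune solvers above drop the 'freeze-convnet' stage from train_state and lower base_lr to 0.001, so the convnet is updated jointly with the LSTM; the .trainval variant additionally sets the 'trainval' stage, switching the data layers to the combined train+val chunks (with stepsize and max_iter doubled) for the single final model. A hedged pycaffe sketch of that last step, warm-starting from a train-only snapshot (filename illustrative):

    import caffe

    caffe.set_mode_gpu()
    solver = caffe.SGDSolver(
        './examples/coco_caption/lrcn_finetune_solver.trainval.prototxt')
    # warm-start from a train-only model, then continue on train+val
    solver.net.copy_from('./examples/coco_caption/lrcn_finetune_iter_50000.caffemodel')
    solver.solve()
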
+# To use a different variant, modify the states (train_state, test_state) +# below as appropriate: + +train_state: { stage: 'factored' stage: '2-layer' stage: 'trainval' } +test_iter: 25 +test_state: { stage: 'factored' stage: '2-layer' stage: 'test-on-val' } +test_interval: 1000 +base_lr: 0.001 +lr_policy: "step" +gamma: 0.5 +stepsize: 40000 +display: 1 +max_iter: 100000 +momentum: 0.9 +weight_decay: 0.0000 +snapshot: 5000 +snapshot_prefix: "./examples/coco_caption/lrcn_finetune_trainval" +solver_mode: GPU +random_seed: 1701 +average_loss: 100 +clip_gradients: 25 diff --git a/examples/coco_caption/lrcn_finetune_solver.vgg.prototxt b/examples/coco_caption/lrcn_finetune_solver.vgg.prototxt new file mode 100644 index 00000000000..6d1db08bfcb --- /dev/null +++ b/examples/coco_caption/lrcn_finetune_solver.vgg.prototxt @@ -0,0 +1,31 @@ +net: "./examples/coco_caption/lrcn.vgg.buffer_50.prototxt" + +# lrcn.prototxt supports three variants of the LRCN architecture: +# (1) stage: 'factored' stage: '2-layer' +# (2) stage: 'unfactored' stage: '1-layer' +# (3) stage: 'unfactored' stage: '2-layer' +# This solver uses variant (1). +# To use a different variant, modify the states (train_state, test_state) +# below as appropriate: + +train_state: { stage: 'factored' stage: '2-layer' } +# test_iter: 25 +# test_state: { stage: 'factored' stage: '2-layer' stage: 'test-on-train' } +# test_iter: 25 +# test_state: { stage: 'factored' stage: '2-layer' stage: 'test-on-val' } +test_interval: 1000 +base_lr: 0.001 +lr_policy: "step" +gamma: 0.5 +stepsize: 20000 +display: 1 +max_iter: 50000 +momentum: 0.9 +weight_decay: 0.0000 +snapshot: 2500 +snapshot_prefix: "./examples/coco_caption/lrcn_finetune_vgg" +solver_mode: GPU +random_seed: 1701 +average_loss: 100 +clip_gradients: 25 +iter_size: 2 diff --git a/examples/coco_caption/lrcn_finetune_solver.vgg.trainval.prototxt b/examples/coco_caption/lrcn_finetune_solver.vgg.trainval.prototxt new file mode 100644 index 00000000000..f9c4ee0d17a --- /dev/null +++ b/examples/coco_caption/lrcn_finetune_solver.vgg.trainval.prototxt @@ -0,0 +1,29 @@ +net: "./examples/coco_caption/lrcn.vgg.buffer_50.prototxt" + +# lrcn.prototxt supports three variants of the LRCN architecture: +# (1) stage: 'factored' stage: '2-layer' +# (2) stage: 'unfactored' stage: '1-layer' +# (3) stage: 'unfactored' stage: '2-layer' +# This solver uses variant (1). +# To use a different variant, modify the states (train_state, test_state) +# below as appropriate: + +train_state: { stage: 'factored' stage: '2-layer' stage: 'trainval' } +# test_iter: 25 +# test_state: { stage: 'factored' stage: '2-layer' stage: 'test-on-val' } +# test_interval: 1000 +base_lr: 0.001 +lr_policy: "step" +gamma: 0.5 +stepsize: 40000 +display: 1 +max_iter: 100000 +momentum: 0.9 +weight_decay: 0.0000 +snapshot: 5000 +snapshot_prefix: "./examples/coco_caption/lrcn_finetune_vgg_trainval" +solver_mode: GPU +random_seed: 1701 +average_loss: 100 +clip_gradients: 25 +iter_size: 2 diff --git a/examples/coco_caption/lrcn_solver.from_lm.prototxt b/examples/coco_caption/lrcn_solver.from_lm.prototxt new file mode 100644 index 00000000000..3fb37879c55 --- /dev/null +++ b/examples/coco_caption/lrcn_solver.from_lm.prototxt @@ -0,0 +1,30 @@ +net: "./examples/coco_caption/lrcn.prototxt" + +# lrcn.prototxt supports three variants of the LRCN architecture: +# (1) stage: 'factored' stage: '2-layer' +# (2) stage: 'unfactored' stage: '1-layer' +# (3) stage: 'unfactored' stage: '2-layer' +# This solver uses variant (1). 
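The from_lm solver trains the same frozen-convnet factored variant as lrcn_solver.prototxt; the name signals that the net is meant to be initialized from the separately trained LSTM language model (lstm_lm, defined later in this change) in addition to the CaffeNet weights. pycaffe's copy_from matches layers by name, so both donor nets can be applied in turn; a sketch under that assumption (filenames illustrative):

    import caffe

    caffe.set_mode_gpu()
    solver = caffe.SGDSolver('./examples/coco_caption/lrcn_solver.from_lm.prototxt')
    # convnet weights from CaffeNet, recurrent weights from the language model;
    # layers present in neither donor keep their random initialization
    solver.net.copy_from('./models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel')
    solver.net.copy_from('./examples/coco_caption/lstm_lm_iter_110000.caffemodel')
    solver.solve()
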
+# To use a different variant, modify the states (train_state, test_state) +# below as appropriate: + +train_state: { stage: 'freeze-convnet' stage: 'factored' stage: '2-layer' } +test_iter: 25 +test_state: { stage: 'freeze-convnet' stage: 'factored' stage: '2-layer' stage: 'test-on-train' } +test_iter: 25 +test_state: { stage: 'freeze-convnet' stage: 'factored' stage: '2-layer' stage: 'test-on-val' } +test_interval: 1000 +base_lr: 0.01 +lr_policy: "step" +gamma: 0.5 +stepsize: 20000 +display: 1 +max_iter: 110000 +momentum: 0.9 +weight_decay: 0.0000 +snapshot: 5000 +snapshot_prefix: "./examples/coco_caption/lrcn_from_lm" +solver_mode: GPU +random_seed: 1701 +average_loss: 100 +clip_gradients: 10 diff --git a/examples/coco_caption/lrcn_solver.prototxt b/examples/coco_caption/lrcn_solver.prototxt new file mode 100644 index 00000000000..65ca272b30c --- /dev/null +++ b/examples/coco_caption/lrcn_solver.prototxt @@ -0,0 +1,30 @@ +net: "./examples/coco_caption/lrcn.prototxt" + +# lrcn.prototxt supports three variants of the LRCN architecture: +# (1) stage: 'factored' stage: '2-layer' +# (2) stage: 'unfactored' stage: '1-layer' +# (3) stage: 'unfactored' stage: '2-layer' +# This solver uses variant (1). +# To use a different variant, modify the states (train_state, test_state) +# below as appropriate: + +train_state: { stage: 'freeze-convnet' stage: 'factored' stage: '2-layer' } +test_iter: 25 +test_state: { stage: 'freeze-convnet' stage: 'factored' stage: '2-layer' stage: 'test-on-train' } +test_iter: 25 +test_state: { stage: 'freeze-convnet' stage: 'factored' stage: '2-layer' stage: 'test-on-val' } +test_interval: 1000 +base_lr: 0.01 +lr_policy: "step" +gamma: 0.5 +stepsize: 20000 +display: 1 +max_iter: 110000 +momentum: 0.9 +weight_decay: 0.0000 +snapshot: 5000 +snapshot_prefix: "./examples/coco_caption/lrcn" +solver_mode: GPU +random_seed: 1701 +average_loss: 100 +clip_gradients: 10 diff --git a/examples/coco_caption/lrcn_solver.single_layer.prototxt b/examples/coco_caption/lrcn_solver.single_layer.prototxt new file mode 100644 index 00000000000..5a9073bed71 --- /dev/null +++ b/examples/coco_caption/lrcn_solver.single_layer.prototxt @@ -0,0 +1,30 @@ +net: "./examples/coco_caption/lrcn.prototxt" + +# lrcn.prototxt supports three variants of the LRCN architecture: +# (1) stage: 'factored' stage: '2-layer' +# (2) stage: 'unfactored' stage: '1-layer' +# (3) stage: 'unfactored' stage: '2-layer' +# This solver uses variant (2). 
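The three variants named in these solver comments differ only in where the fc8 image feature enters the recurrent stack; an LSTM layer's optional third bottom is a static (non-recurrent) input, written below as concatenation for brevity. With $E$ the word embedding, $w_{t-1}$ the previous word, and $\phi(I)$ the fc8 image code:

    (1) factored, 2-layer:   $h^1_t = \mathrm{LSTM}(E w_{t-1},\, h^1_{t-1}),\quad h^2_t = \mathrm{LSTM}([h^1_t;\, \phi(I)],\, h^2_{t-1}),\quad p_t = \mathrm{softmax}(W h^2_t)$
    (2) unfactored, 1-layer: $h^1_t = \mathrm{LSTM}([E w_{t-1};\, \phi(I)],\, h^1_{t-1}),\quad p_t = \mathrm{softmax}(W h^1_t)$
    (3) unfactored, 2-layer: $h^1_t = \mathrm{LSTM}([E w_{t-1};\, \phi(I)],\, h^1_{t-1}),\quad h^2_t = \mathrm{LSTM}(h^1_t,\, h^2_{t-1}),\quad p_t = \mathrm{softmax}(W h^2_t)$

In variant (1) the first LSTM is a pure language model, which is what makes the from_lm initialization above possible.
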
+# To use a different variant, modify the states (train_state, test_state) +# below as appropriate: + +train_state: { stage: 'freeze-convnet' stage: 'unfactored' stage: '1-layer' } +test_iter: 25 +test_state: { stage: 'freeze-convnet' stage: 'unfactored' stage: '1-layer' stage: 'test-on-train' } +test_iter: 25 +test_state: { stage: 'freeze-convnet' stage: 'unfactored' stage: '1-layer' stage: 'test-on-val' } +test_interval: 1000 +base_lr: 0.01 +lr_policy: "step" +gamma: 0.5 +stepsize: 20000 +display: 1 +max_iter: 110000 +momentum: 0.9 +weight_decay: 0.0000 +snapshot: 5000 +snapshot_prefix: "./examples/coco_caption/lrcn_single_layer" +solver_mode: GPU +random_seed: 1701 +average_loss: 100 +clip_gradients: 10 diff --git a/examples/coco_caption/lrcn_solver.unfactored.prototxt b/examples/coco_caption/lrcn_solver.unfactored.prototxt new file mode 100644 index 00000000000..2c83f37ca27 --- /dev/null +++ b/examples/coco_caption/lrcn_solver.unfactored.prototxt @@ -0,0 +1,30 @@ +net: "./examples/coco_caption/lrcn.prototxt" + +# lrcn.prototxt supports three variants of the LRCN architecture: +# (1) stage: 'factored' stage: '2-layer' +# (2) stage: 'unfactored' stage: '1-layer' +# (3) stage: 'unfactored' stage: '2-layer' +# This solver uses variant (3). +# To use a different variant, modify the states (train_state, test_state) +# below as appropriate: + +train_state: { stage: 'freeze-convnet' stage: 'unfactored' stage: '2-layer' } +test_iter: 25 +test_state: { stage: 'freeze-convnet' stage: 'unfactored' stage: '2-layer' stage: 'test-on-train' } +test_iter: 25 +test_state: { stage: 'freeze-convnet' stage: 'unfactored' stage: '2-layer' stage: 'test-on-val' } +test_interval: 1000 +base_lr: 0.01 +lr_policy: "step" +gamma: 0.5 +stepsize: 20000 +display: 1 +max_iter: 110000 +momentum: 0.9 +weight_decay: 0.0000 +snapshot: 5000 +snapshot_prefix: "./examples/coco_caption/lrcn_unfactored" +solver_mode: GPU +random_seed: 1701 +average_loss: 100 +clip_gradients: 10 diff --git a/examples/coco_caption/lrcn_solver.vgg.prototxt b/examples/coco_caption/lrcn_solver.vgg.prototxt new file mode 100644 index 00000000000..dada837a512 --- /dev/null +++ b/examples/coco_caption/lrcn_solver.vgg.prototxt @@ -0,0 +1,30 @@ +net: "./examples/coco_caption/lrcn.vgg.prototxt" + +# lrcn.prototxt supports three variants of the LRCN architecture: +# (1) stage: 'factored' stage: '2-layer' +# (2) stage: 'unfactored' stage: '1-layer' +# (3) stage: 'unfactored' stage: '2-layer' +# This solver uses variant (1). 
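All of these solvers smooth the reported loss with average_loss: 100 and cap gradient magnitude with clip_gradients (10 here, 25 in the finetuning solvers). Clipping rescales the entire gradient by a common factor whenever its global L2 norm exceeds the threshold; a numpy restatement of Caffe's SGDSolver::ClipGradients (a sketch, not the actual implementation):

    import numpy as np

    def clip_gradients(grads, threshold=10.0):
        # grads: list of gradient arrays, one per learnable parameter blob
        norm = np.sqrt(sum((g ** 2).sum() for g in grads))
        if norm > threshold:
            scale = threshold / norm
            grads = [g * scale for g in grads]
        return grads
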
+# To use a different variant, modify the states (train_state, test_state) +# below as appropriate: + +train_state: { stage: 'freeze-convnet' stage: 'factored' stage: '2-layer' } +# test_iter: 25 +# test_state: { stage: 'freeze-convnet' stage: 'factored' stage: '2-layer' stage: 'test-on-train' } +# test_iter: 25 +# test_state: { stage: 'freeze-convnet' stage: 'factored' stage: '2-layer' stage: 'test-on-val' } +# test_interval: 1000 +base_lr: 0.01 +lr_policy: "step" +gamma: 0.5 +stepsize: 20000 +display: 1 +max_iter: 110000 +momentum: 0.9 +weight_decay: 0.0000 +snapshot: 5000 +snapshot_prefix: "./examples/coco_caption/lrcn_vgg" +solver_mode: GPU +random_seed: 1701 +average_loss: 100 +clip_gradients: 10 diff --git a/examples/coco_caption/lrcn_word_to_preds.deploy.prototxt b/examples/coco_caption/lrcn_word_to_preds.deploy.prototxt new file mode 100644 index 00000000000..bfd6166add9 --- /dev/null +++ b/examples/coco_caption/lrcn_word_to_preds.deploy.prototxt @@ -0,0 +1,56 @@ +name: "caffenet_to_lstm" + +input: "cont_sentence" +input_shape { dim: 1 dim: 1000 } + +input: "input_sentence" +input_shape { dim: 1 dim: 1000 } + +input: "image_features" +input_shape { dim: 1000 dim: 1000 } + +layer { + name: "embedding" + type: "Embed" + bottom: "input_sentence" + top: "embedded_input_sentence" + embed_param { + input_dim: 8801 + num_output: 1000 + bias_term: false + } +} +layer { + name: "lstm1" + type: "LSTM" + bottom: "embedded_input_sentence" + bottom: "cont_sentence" + top: "lstm1" + recurrent_param { num_output: 1000 } +} +layer { + name: "lstm2" + type: "LSTM" + bottom: "lstm1" + bottom: "cont_sentence" + bottom: "image_features" + top: "lstm2" + recurrent_param { num_output: 1000 } +} +layer { + name: "predict" + type: "InnerProduct" + bottom: "lstm2" + top: "predict" + inner_product_param { + axis: 2 + num_output: 8801 + } +} +layer { + name: "probs" + type: "Softmax" + bottom: "predict" + top: "probs" + softmax_param { axis: 2 } +} diff --git a/examples/coco_caption/lstm_language_model.prototxt b/examples/coco_caption/lstm_language_model.prototxt new file mode 100644 index 00000000000..68fda5464fe --- /dev/null +++ b/examples/coco_caption/lstm_language_model.prototxt @@ -0,0 +1,150 @@ +name: "lstm_language_model" +layer { + name: "data" + type: "HDF5Data" + top: "cont_sentence" + top: "input_sentence" + top: "target_sentence" + include { phase: TRAIN } + hdf5_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/train_unaligned_batches/hdf5_chunk_list.txt" + batch_size: 20 + } +} +layer { + name: "data" + type: "HDF5Data" + top: "cont_sentence" + top: "input_sentence" + top: "target_sentence" + include { + phase: TEST + stage: "test-on-train" + } + hdf5_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/train_unaligned_batches/hdf5_chunk_list.txt" + batch_size: 20 + } +} +layer { + name: "data" + type: "HDF5Data" + top: "cont_sentence" + top: "input_sentence" + top: "target_sentence" + include { + phase: TEST + stage: "test-on-val" + } + hdf5_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/val_unaligned_batches/hdf5_chunk_list.txt" + batch_size: 20 + } +} +layer { + name: "embedding" + type: "Embed" + bottom: "input_sentence" + top: "embedded_input_sentence" + param { + lr_mult: 1 + } + embed_param { + bias_term: false + input_dim: 8801 # = vocab_size + 1 (for EOS) + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + } +} +layer { + name: "embed-drop" + type: "Dropout" + bottom: "embedded_input_sentence" + top: 
"embedded_input_sentence" + dropout_param { dropout_ratio: 0.5 } + include { stage: "embed-drop" } +} +layer { + name: "lstm1" + type: "LSTM" + bottom: "embedded_input_sentence" + bottom: "cont_sentence" + top: "lstm1" + recurrent_param { + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "lstm-drop" + type: "Dropout" + bottom: "lstm1" + top: "lstm1" + dropout_param { dropout_ratio: 0.5 } + include { stage: "lstm-drop" } +} +layer { + name: "predict" + type: "InnerProduct" + bottom: "lstm1" + top: "predict" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + inner_product_param { + num_output: 8801 # = vocab_size + 1 (+1 for EOS) + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + axis: 2 + } +} +layer { + name: "cross_entropy_loss" + type: "SoftmaxWithLoss" + bottom: "predict" + bottom: "target_sentence" + top: "cross_entropy_loss" + loss_weight: 20 + loss_param { + ignore_label: -1 + } + softmax_param { + axis: 2 + } +} +layer { + name: "accuracy" + type: "Accuracy" + bottom: "predict" + bottom: "target_sentence" + top: "accuracy" + include { phase: TEST } + accuracy_param { + axis: 2 + ignore_label: -1 + } +} diff --git a/examples/coco_caption/lstm_lm.deploy.prototxt b/examples/coco_caption/lstm_lm.deploy.prototxt new file mode 100644 index 00000000000..26b5f1b01eb --- /dev/null +++ b/examples/coco_caption/lstm_lm.deploy.prototxt @@ -0,0 +1,122 @@ +name: "lstm_language_model" + +input: "cont_sentence" +input_shape { dim: 1 dim: 1 } + +input: "input_sentence" +input_shape { dim: 1 dim: 1 } + +layer { + name: "embedding" + type: "Embed" + bottom: "input_sentence" + top: "embedded_input_sentence" + param { + lr_mult: 1 + } + embed_param { + bias_term: false + input_dim: 8801 # = vocab_size + 1 (for EOS) + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + } +} +layer { + name: "embed-drop" + type: "Dropout" + bottom: "embedded_input_sentence" + top: "embedded_input_sentence" + dropout_param { dropout_ratio: 0.5 } + include { stage: "embed-drop" } +} +layer { + name: "lstm1" + type: "LSTM" + bottom: "embedded_input_sentence" + bottom: "cont_sentence" + top: "lstm1" + recurrent_param { + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "lstm-drop" + type: "Dropout" + bottom: "lstm1" + top: "lstm1" + dropout_param { dropout_ratio: 0.5 } + include { stage: "lstm-drop" } +} +layer { + name: "predict" + type: "InnerProduct" + bottom: "lstm1" + top: "predict" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + inner_product_param { + num_output: 8801 # = vocab_size + 1 (+1 for EOS) + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + axis: 2 + } +} +layer { + name: "probs" + type: "Softmax" + bottom: "predict" + top: "probs" + softmax_param { axis: 2 } +} +# layer { +# name: "cross_entropy_loss" +# type: "SoftmaxWithLoss" +# bottom: "predict" +# bottom: "target_sentence" +# top: "cross_entropy_loss" +# loss_weight: 20 +# loss_param { +# ignore_label: -1 +# } +# softmax_param { +# axis: 2 +# } +# } +# layer { +# name: "accuracy" +# type: "Accuracy" +# bottom: "predict" +# bottom: "target_sentence" +# top: "accuracy" +# include { phase: TEST } +# 
accuracy_param { +# axis: 2 +# ignore_label: -1 +# } +# } diff --git a/examples/coco_caption/lstm_lm_solver.prototxt b/examples/coco_caption/lstm_lm_solver.prototxt new file mode 100644 index 00000000000..fb36ad15a5b --- /dev/null +++ b/examples/coco_caption/lstm_lm_solver.prototxt @@ -0,0 +1,21 @@ +net: "./examples/coco_caption/lstm_language_model.prototxt" +train_state: { stage: 'embed-drop' stage: 'lstm-drop' } +test_iter: 25 +test_state: { stage: 'test-on-train' } +test_iter: 25 +test_state: { stage: 'test-on-val' } +test_interval: 1000 +base_lr: 0.1 +lr_policy: "step" +gamma: 0.5 +stepsize: 20000 +display: 1 +max_iter: 110000 +momentum: 0.9 +weight_decay: 0.0000 +snapshot: 5000 +snapshot_prefix: "./examples/coco_caption/lstm_lm" +solver_mode: GPU +random_seed: 1701 +average_loss: 100 +clip_gradients: 10 diff --git a/examples/coco_caption/retrieval_experiment.py b/examples/coco_caption/retrieval_experiment.py new file mode 100755 index 00000000000..178937bdb85 --- /dev/null +++ b/examples/coco_caption/retrieval_experiment.py @@ -0,0 +1,362 @@ +#!/usr/bin/env python + +from collections import OrderedDict +import json +import numpy as np +import pprint +import cPickle as pickle +import string +import sys + +# seed the RNG so we evaluate on the same subset each time +np.random.seed(seed=0) + +from coco_to_hdf5_data import * +from captioner import Captioner + +COCO_EVAL_PATH = './data/coco/coco-caption-eval' +sys.path.append(COCO_EVAL_PATH) +from pycocoevalcap.eval import COCOEvalCap + +class CaptionExperiment(): + # captioner is an initialized Captioner (captioner.py) + # dataset is a dict: image path -> [caption1, caption2, ...] + def __init__(self, captioner, dataset, dataset_cache_dir, cache_dir, sg): + self.captioner = captioner + self.sg = sg + self.dataset_cache_dir = dataset_cache_dir + self.cache_dir = cache_dir + for d in [dataset_cache_dir, cache_dir]: + if not os.path.exists(d): os.makedirs(d) + self.dataset = dataset + self.images = dataset.keys() + self.init_caption_list(dataset) + self.caption_scores = [None] * len(self.images) + print 'Initialized caption experiment: %d images, %d captions' % \ + (len(self.images), len(self.captions)) + + def init_caption_list(self, dataset): + self.captions = [] + for image, captions in dataset.iteritems(): + for caption, _ in captions: + self.captions.append({'source_image': image, 'caption': caption}) + # Sort by length for performance. 
+ self.captions.sort(key=lambda c: len(c['caption'])) + + def compute_descriptors(self): + descriptor_filename = '%s/descriptors.npz' % self.dataset_cache_dir + if os.path.exists(descriptor_filename): + self.descriptors = np.load(descriptor_filename)['descriptors'] + else: + self.descriptors = self.captioner.compute_descriptors(self.images) + np.savez_compressed(descriptor_filename, descriptors=self.descriptors) + + def score_captions(self, image_index, output_name='probs'): + assert image_index < len(self.images) + caption_scores_dir = '%s/caption_scores' % self.cache_dir + if not os.path.exists(caption_scores_dir): + os.makedirs(caption_scores_dir) + caption_scores_filename = '%s/scores_image_%06d.pkl' % \ + (caption_scores_dir, image_index) + if os.path.exists(caption_scores_filename): + with open(caption_scores_filename, 'rb') as caption_scores_file: + outputs = pickle.load(caption_scores_file) + else: + outputs = self.captioner.score_captions(self.descriptors[image_index], + self.captions, output_name=output_name, caption_source='gt', + verbose=False) + self.caption_stats(image_index, outputs) + with open(caption_scores_filename, 'wb') as caption_scores_file: + pickle.dump(outputs, caption_scores_file) + self.caption_scores[image_index] = outputs + + def caption_stats(self, image_index, caption_scores): + image_path = self.images[image_index] + for caption, score in zip(self.captions, caption_scores): + assert caption['caption'] == score['caption'] + score['stats'] = gen_stats(score['prob']) + score['correct'] = (image_path == caption['source_image']) + + def eval_image_to_caption(self, image_index, methods=None): + scores = self.caption_scores[image_index] + return self.eval_recall(scores, methods=methods) + + def eval_caption_to_image(self, caption_index, methods=None): + scores = [s[caption_index] for s in self.caption_scores] + return self.eval_recall(scores, methods=methods) + + def normalize_caption_scores(self, caption_index, stats=['log_p', 'log_p_word']): + scores = [s[caption_index] for s in self.caption_scores] + for stat in stats: + log_stat_scores = np.array([score['stats'][stat] for score in scores]) + stat_scores = np.exp(log_stat_scores) + mean_stat_score = np.mean(stat_scores) + log_mean_stat_score = np.log(mean_stat_score) + for log_stat_score, score in zip(log_stat_scores, scores): + score['stats']['normalized_' + stat] = log_stat_score - log_mean_stat_score + + def eval_recall(self, scores, methods=None, neg_prefix='negative_'): + if methods is None: + # rank on all stats, and all their inverses + methods = scores[0]['stats'].keys() + methods += [neg_prefix + method for method in methods] + correct_ranks = {} + for method in methods: + if method.startswith(neg_prefix): + multiplier = -1 + method_key = method[len(neg_prefix):] + else: + multiplier = 1 + method_key = method + sort_key = lambda s: multiplier * s['stats'][method_key] + ranked_scores = sorted(scores, key=sort_key) + for index, score in enumerate(ranked_scores): + if score['correct']: + correct_ranks[method] = index + break + return correct_ranks + + def recall_results(self, correct_ranks, recall_ranks=[]): + num_instances = float(len(correct_ranks)) + assert num_instances > 0 + methods = correct_ranks[0].keys() + results = {} + for method in methods: + method_correct_ranks = \ + np.array([correct_rank[method] for correct_rank in correct_ranks]) + r = OrderedDict() + r['mean'] = np.mean(method_correct_ranks) + r['median'] = np.median(method_correct_ranks) + r['mean (1-indexed)'] = r['mean'] + 1 + 
r['median (1-indexed)'] = r['median'] + 1 + for recall_rank in recall_ranks: + r['R@%d' % recall_rank] = \ + np.where(method_correct_ranks < recall_rank)[0].shape[0] / num_instances + results[method] = r + return results + + def print_recall_results(self, results): + for method, result in results.iteritems(): + print 'Ranking method:', method + for metric_name_and_value in result.iteritems(): + print ' %s: %f' % metric_name_and_value + + def retrieval_experiment(self): + # Compute image descriptors. + print 'Computing image descriptors' + self.compute_descriptors() + + num_images, num_captions = len(self.images), len(self.captions) + + # For each image, score all captions. + for image_index in xrange(num_images): + sys.stdout.write("\rScoring captions for image %d/%d" % + (image_index, num_images)) + sys.stdout.flush() + self.score_captions(image_index) + sys.stdout.write('\n') + + # Compute global caption statistics for normalization. + for caption_index in xrange(num_captions): + self.normalize_caption_scores(caption_index) + + recall_ranks = [1, 5, 10, 50] + + eval_methods = ['negative_normalized_log_p'] + # Evaluate caption-to-image retrieval task. + self.caption_to_image_ranks = [None] * num_captions + for caption_index in xrange(num_captions): + sys.stdout.write("\rCaption-to-image evaluation: " + "computing recall for caption %d/%d" % + (caption_index, num_captions)) + sys.stdout.flush() + self.caption_to_image_ranks[caption_index] = \ + self.eval_caption_to_image(caption_index, methods=eval_methods) + sys.stdout.write('\n') + self.caption_to_image_recall = \ + self.recall_results(self.caption_to_image_ranks, recall_ranks) + print 'Caption-to-image retrieval results:' + self.print_recall_results(self.caption_to_image_recall) + + # Evaluate image-to-caption retrieval task. + self.image_to_caption_ranks = [None] * num_images + for image_index in xrange(num_images): + sys.stdout.write("\rImage-to-caption evaluation: " + "computing recall for image %d/%d" % + (image_index, num_images)) + sys.stdout.flush() + self.image_to_caption_ranks[image_index] = \ + self.eval_image_to_caption(image_index, methods=eval_methods) + sys.stdout.write('\n') + self.image_to_caption_recall = \ + self.recall_results(self.image_to_caption_ranks, recall_ranks) + print 'Image-to-caption retrieval results:' + self.print_recall_results(self.image_to_caption_recall) + + def generation_experiment(self, strategy, max_batch_size=1000): + # Compute image descriptors. + print 'Computing image descriptors' + self.compute_descriptors() + + do_batches = (strategy['type'] == 'beam' and strategy['beam_size'] == 1) or \ + (strategy['type'] == 'sample' and + ('temp' not in strategy or strategy['temp'] in (1, float('inf'))) and + ('num' not in strategy or strategy['num'] == 1)) + + num_images = len(self.images) + batch_size = min(max_batch_size, num_images) if do_batches else 1 + + # Generate captions for all images. 
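+    # (When do_batches is set -- beam search with beam size 1, or plain temperature sampling of a single caption -- whole batches of descriptors are decoded at once below; otherwise each image is decoded individually with the full generation strategy.)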
+ all_captions = [None] * num_images + for image_index in xrange(0, num_images, batch_size): + batch_end_index = min(image_index + batch_size, num_images) + sys.stdout.write("\rGenerating captions for image %d/%d" % + (image_index, num_images)) + sys.stdout.flush() + if do_batches: + if strategy['type'] == 'beam' or \ + ('temp' in strategy and strategy['temp'] == float('inf')): + temp = float('inf') + else: + temp = strategy['temp'] if 'temp' in strategy else 1 + output_captions, output_probs = self.captioner.sample_captions( + self.descriptors[image_index:batch_end_index], temp=temp) + for batch_index, output in zip(range(image_index, batch_end_index), + output_captions): + all_captions[batch_index] = output + else: + for batch_image_index in xrange(image_index, batch_end_index): + captions, caption_probs = self.captioner.predict_caption( + self.descriptors[batch_image_index], strategy=strategy) + best_caption, max_log_prob = None, None + for caption, probs in zip(captions, caption_probs): + log_prob = gen_stats(probs)['log_p'] + if best_caption is None or \ + (best_caption is not None and log_prob > max_log_prob): + best_caption, max_log_prob = caption, log_prob + all_captions[batch_image_index] = best_caption + sys.stdout.write('\n') + + # Compute the number of reference files as the maximum number of ground + # truth captions of any image in the dataset. + num_reference_files = 0 + for captions in self.dataset.values(): + if len(captions) > num_reference_files: + num_reference_files = len(captions) + if num_reference_files <= 0: + raise Exception('No reference captions.') + + # Collect model/reference captions, formatting the model's captions and + # each set of reference captions as a list of len(self.images) strings. + exp_dir = '%s/generation' % self.cache_dir + if not os.path.exists(exp_dir): + os.makedirs(exp_dir) + # For each image, write out the highest probability caption. 
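+    # (reference_captions is laid out as one list per reference "file": entry r holds the r-th ground-truth caption of every image, the arrangement used to build the inputs for the downstream COCO evaluation.)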
+ model_captions = [''] * len(self.images) + reference_captions = [([''] * len(self.images)) for _ in xrange(num_reference_files)] + for image_index, image in enumerate(self.images): + caption = self.captioner.sentence(all_captions[image_index]) + model_captions[image_index] = caption + for reference_index, (_, caption) in enumerate(self.dataset[image]): + caption = ' '.join(caption) + reference_captions[reference_index][image_index] = caption + + coco_image_ids = [self.sg.image_path_to_id[image_path] + for image_path in self.images] + generation_result = [{ + 'image_id': self.sg.image_path_to_id[image_path], + 'caption': model_captions[image_index] + } for (image_index, image_path) in enumerate(self.images)] + json_filename = '%s/generation_result.json' % self.cache_dir + print 'Dumping result to file: %s' % json_filename + with open(json_filename, 'w') as json_file: + json.dump(generation_result, json_file) + generation_result = self.sg.coco.loadRes(json_filename) + coco_evaluator = COCOEvalCap(self.sg.coco, generation_result) + coco_evaluator.params['image_id'] = coco_image_ids + coco_evaluator.evaluate() + +def gen_stats(prob): + stats = {} + stats['length'] = len(prob) + stats['log_p'] = 0.0 + eps = 1e-12 + for p in prob: + assert 0.0 <= p <= 1.0 + stats['log_p'] += np.log(max(eps, p)) + stats['log_p_word'] = stats['log_p'] / stats['length'] + try: + stats['perplex'] = np.exp(-stats['log_p']) + except OverflowError: + stats['perplex'] = float('inf') + try: + stats['perplex_word'] = np.exp(-stats['log_p_word']) + except OverflowError: + stats['perplex_word'] = float('inf') + return stats + +def main(): + MAX_IMAGES = -1 # -1 to use all images + TAG = 'coco_2layer_factored' + if MAX_IMAGES >= 0: + TAG += '_%dimages' % MAX_IMAGES + eval_on_test = False + if eval_on_test: + ITER = 100000 + MODEL_FILENAME = 'lrcn_finetune_trainval_stepsize40k_iter_%d' % ITER + DATASET_NAME = 'test' + else: # eval on val + ITER = 50000 + MODEL_FILENAME = 'lrcn_finetune_iter_%d' % ITER + DATASET_NAME = 'val' + TAG += '_%s' % DATASET_NAME + MODEL_DIR = './examples/coco_caption' + MODEL_FILE = '%s/%s.caffemodel' % (MODEL_DIR, MODEL_FILENAME) + IMAGE_NET_FILE = './models/bvlc_reference_caffenet/deploy.prototxt' + LSTM_NET_FILE = './examples/coco_caption/lrcn_word_to_preds.deploy.prototxt' + NET_TAG = '%s_%s' % (TAG, MODEL_FILENAME) + DATASET_SUBDIR = '%s/%s_ims' % (DATASET_NAME, + str(MAX_IMAGES) if MAX_IMAGES >= 0 else 'all') + DATASET_CACHE_DIR = './retrieval_cache/%s/%s' % (DATASET_SUBDIR, MODEL_FILENAME) + VOCAB_FILE = './examples/coco_caption/h5_data/buffer_100/vocabulary.txt' + DEVICE_ID = 0 + with open(VOCAB_FILE, 'r') as vocab_file: + vocab = [line.strip() for line in vocab_file.readlines()] + coco = COCO(COCO_ANNO_PATH % DATASET_NAME) + image_root = COCO_IMAGE_PATTERN % DATASET_NAME + sg = CocoSequenceGenerator(coco, BUFFER_SIZE, image_root, vocab=vocab, + align=False, shuffle=False) + dataset = {} + for image_path, sentence in sg.image_sentence_pairs: + if image_path not in dataset: + dataset[image_path] = [] + dataset[image_path].append((sg.line_to_stream(sentence), sentence)) + print 'Original dataset contains %d images' % len(dataset.keys()) + if 0 <= MAX_IMAGES < len(dataset.keys()): + all_keys = dataset.keys() + perm = np.random.permutation(len(all_keys))[:MAX_IMAGES] + chosen_keys = set([all_keys[p] for p in perm]) + for key in all_keys: + if key not in chosen_keys: + del dataset[key] + print 'Reduced dataset to %d images' % len(dataset.keys()) + if MAX_IMAGES < 0: MAX_IMAGES = 
len(dataset.keys()) + captioner = Captioner(MODEL_FILE, IMAGE_NET_FILE, LSTM_NET_FILE, VOCAB_FILE, + device_id=DEVICE_ID) + beam_size = 1 + generation_strategy = {'type': 'beam', 'beam_size': beam_size} + if generation_strategy['type'] == 'beam': + strategy_name = 'beam%d' % generation_strategy['beam_size'] + elif generation_strategy['type'] == 'sample': + strategy_name = 'sample%f' % generation_strategy['temp'] + else: + raise Exception('Unknown generation strategy type: %s' % generation_strategy['type']) + CACHE_DIR = '%s/%s' % (DATASET_CACHE_DIR, strategy_name) + experimenter = CaptionExperiment(captioner, dataset, DATASET_CACHE_DIR, CACHE_DIR, sg) + captioner.set_image_batch_size(min(100, MAX_IMAGES)) + experimenter.generation_experiment(generation_strategy) + captioner.set_caption_batch_size(min(MAX_IMAGES * 5, 1000)) + experimenter.retrieval_experiment() + +if __name__ == "__main__": + main() diff --git a/examples/coco_caption/train_language_model.sh b/examples/coco_caption/train_language_model.sh new file mode 100755 index 00000000000..6e8a8c47b37 --- /dev/null +++ b/examples/coco_caption/train_language_model.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +GPU_ID=0 +DATA_DIR=./examples/coco_caption/h5_data/ +if [ ! -d $DATA_DIR ]; then + echo "Data directory not found: $DATA_DIR" + echo "First, download the COCO dataset (follow instructions in data/coco)" + echo "Then, run ./examples/coco_caption/coco_to_hdf5_data.py to create the Caffe input data" + exit 1 +fi + +./build/tools/caffe train \ + -solver ./examples/coco_caption/lstm_lm_solver.prototxt \ + -gpu $GPU_ID diff --git a/examples/coco_caption/train_lrcn.from_lm.sh b/examples/coco_caption/train_lrcn.from_lm.sh new file mode 100755 index 00000000000..332f50c9179 --- /dev/null +++ b/examples/coco_caption/train_lrcn.from_lm.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash + +GPU_ID=0 +WEIGHTS=\ +./models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel\ +,\ +./examples/coco_caption/lstm_lm_iter_110000.caffemodel +DATA_DIR=./examples/coco_caption/h5_data/ +if [ ! -d $DATA_DIR ]; then + echo "Data directory not found: $DATA_DIR" + echo "First, download the COCO dataset (follow instructions in data/coco)" + echo "Then, run ./examples/coco_caption/coco_to_hdf5_data.py to create the Caffe input data" + exit 1 +fi + +./build/tools/caffe train \ + -solver ./examples/coco_caption/lrcn_solver.from_lm.prototxt \ + -weights $WEIGHTS \ + -gpu $GPU_ID diff --git a/examples/coco_caption/train_lrcn.sh b/examples/coco_caption/train_lrcn.sh new file mode 100755 index 00000000000..5099e762ccd --- /dev/null +++ b/examples/coco_caption/train_lrcn.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +GPU_ID=0 +WEIGHTS=\ +./models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel +DATA_DIR=./examples/coco_caption/h5_data/ +if [ ! 
-d $DATA_DIR ]; then + echo "Data directory not found: $DATA_DIR" + echo "First, download the COCO dataset (follow instructions in data/coco)" + echo "Then, run ./examples/coco_caption/coco_to_hdf5_data.py to create the Caffe input data" + exit 1 +fi + +./build/tools/caffe train \ + -solver ./examples/coco_caption/lrcn_solver.prototxt \ + -weights $WEIGHTS \ + -gpu $GPU_ID diff --git a/examples/coco_caption/train_lrcn.single_layer.sh b/examples/coco_caption/train_lrcn.single_layer.sh new file mode 100755 index 00000000000..f99c09865b7 --- /dev/null +++ b/examples/coco_caption/train_lrcn.single_layer.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +GPU_ID=0 +WEIGHTS=\ +./models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel +DATA_DIR=./examples/coco_caption/h5_data/ +if [ ! -d $DATA_DIR ]; then + echo "Data directory not found: $DATA_DIR" + echo "First, download the COCO dataset (follow instructions in data/coco)" + echo "Then, run ./examples/coco_caption/coco_to_hdf5_data.py to create the Caffe input data" + exit 1 +fi + +./build/tools/caffe train \ + -solver ./examples/coco_caption/lrcn_solver.single_layer.prototxt \ + -weights $WEIGHTS \ + -gpu $GPU_ID diff --git a/examples/coco_caption/train_lrcn.unfactored.sh b/examples/coco_caption/train_lrcn.unfactored.sh new file mode 100644 index 00000000000..a579783c5fb --- /dev/null +++ b/examples/coco_caption/train_lrcn.unfactored.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +GPU_ID=0 +WEIGHTS=\ +./models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel +DATA_DIR=./examples/coco_caption/h5_data/ +if [ ! -d $DATA_DIR ]; then + echo "Data directory not found: $DATA_DIR" + echo "First, download the COCO dataset (follow instructions in data/coco)" + echo "Then, run ./examples/coco_caption/coco_to_hdf5_data.py to create the Caffe input data" + exit 1 +fi + +./build/tools/caffe train \ + -solver ./examples/coco_caption/lrcn_solver.unfactored.prototxt \ + -weights $WEIGHTS \ + -gpu $GPU_ID diff --git a/examples/coco_caption/train_lrcn.vgg.sh b/examples/coco_caption/train_lrcn.vgg.sh new file mode 100755 index 00000000000..c0b873a4b41 --- /dev/null +++ b/examples/coco_caption/train_lrcn.vgg.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +GPU_ID=0 +WEIGHTS=\ +./models/vgg_16layers/VGG_ILSVRC_16_layers.caffemodel +DATA_DIR=./examples/coco_caption/h5_data/ +if [ ! -d $DATA_DIR ]; then + echo "Data directory not found: $DATA_DIR" + echo "First, download the COCO dataset (follow instructions in data/coco)" + echo "Then, run ./examples/coco_caption/coco_to_hdf5_data.py to create the Caffe input data" + exit 1 +fi + +./build/tools/caffe train \ + -solver ./examples/coco_caption/lrcn_solver.vgg.prototxt \ + -weights $WEIGHTS \ + -gpu $GPU_ID diff --git a/include/caffe/common_layers.hpp b/include/caffe/common_layers.hpp index 89bab8d6f3a..fdeab11bbd1 100644 --- a/include/caffe/common_layers.hpp +++ b/include/caffe/common_layers.hpp @@ -511,6 +511,59 @@ class SilenceLayer : public Layer<Dtype> { const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom); }; +/** + * @brief Computes a product of two input Blobs, with the shape of the + * latter Blob "broadcast" to match the shape of the former. + * Equivalent to tiling the latter Blob, then computing the elementwise + * product.
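+ * + * For example (an illustrative sketch, not part of the API): with axis == 1, + * a first factor of shape 2 x 3 x 4 x 5 and a second factor of shape 3 x 4, + * each scalar y[i][j] multiplies the entire sub-array x[n][i][j][...] for + * every n, exactly as if y had first been tiled to shape 2 x 3 x 4 x 5.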
+ */ +template <typename Dtype> +class ScalarLayer: public Layer<Dtype> { + public: + explicit ScalarLayer(const LayerParameter& param) + : Layer<Dtype>(param) {} + virtual void Reshape(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + + virtual inline const char* type() const { return "Scalar"; } + virtual inline int ExactNumBottomBlobs() const { return 2; } + virtual inline int ExactNumTopBlobs() const { return 1; } + + protected: + /** + * In the below shape specifications, @f$ i @f$ denotes the value of the + * `axis` field given by `this->layer_param_.scalar_param().axis()`, after + * canonicalization (i.e., conversion from negative to positive index, + * if applicable). + * + * @param bottom input Blob vector (length 2) + * -# @f$ (d_0 \times ... \times + * d_i \times ... \times d_j \times ... \times d_n) @f$ + * the first factor @f$ x @f$ + * -# @f$ (d_i \times ... \times d_j) @f$ + * the second factor @f$ y @f$ + * @param top output Blob vector (length 1) + * -# @f$ (d_0 \times ... \times + * d_i \times ... \times d_j \times ... \times d_n) @f$ + * the product @f$ z = x y @f$ computed after "broadcasting" y. + * Equivalent to tiling @f$ y @f$ to have the same shape as @f$ x @f$, + * then computing the elementwise product. + */ + virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual void Backward_cpu(const vector<Blob<Dtype>*>& top, + const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom); + virtual void Backward_gpu(const vector<Blob<Dtype>*>& top, + const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom); + + Blob<Dtype> sum_multiplier_; + Blob<Dtype> sum_result_; + int axis_; + int outer_dim_, scalar_dim_, inner_dim_; +}; + /** * @brief Computes the softmax function. * diff --git a/include/caffe/net.hpp b/include/caffe/net.hpp index 1bf07d28d13..bed241d2a6c 100644 --- a/include/caffe/net.hpp +++ b/include/caffe/net.hpp @@ -179,6 +179,9 @@ class Net { return param_names_index_; } inline const vector<int>& param_owners() const { return param_owners_; } + inline const vector<string>& param_display_names() const { + return param_display_names_; + } /// @brief Input and output blob numbers inline int num_inputs() const { return net_input_blobs_.size(); } inline int num_outputs() const { return net_output_blobs_.size(); } diff --git a/include/caffe/sequence_layers.hpp b/include/caffe/sequence_layers.hpp new file mode 100644 index 00000000000..03225082496 --- /dev/null +++ b/include/caffe/sequence_layers.hpp @@ -0,0 +1,321 @@ +#ifndef CAFFE_SEQUENCE_LAYERS_HPP_ +#define CAFFE_SEQUENCE_LAYERS_HPP_ + +#include <string> +#include <utility> +#include <vector> + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/layer.hpp" +#include "caffe/net.hpp" +#include "caffe/proto/caffe.pb.h" + +namespace caffe { + +template <typename Dtype> class RecurrentLayer; + +/** + * @brief An abstract class for implementing recurrent behavior inside of an + * unrolled network. This Layer type cannot be instantiated -- instead, + * you should use one of its implementations which defines the recurrent + * architecture, such as RNNLayer or LSTMLayer.
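+ * + * A minimal prototxt sketch of a concrete subclass in use (blob names here are illustrative; the bottoms are the @f$ T \times N \times ... @f$ input and the @f$ T \times N @f$ sequence continuation indicators documented below): + * layer { name: "lstm1" type: "LSTM" bottom: "data" bottom: "cont" + * top: "lstm1" recurrent_param { num_output: 1000 } }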
+ */ +template <typename Dtype> +class RecurrentLayer : public Layer<Dtype> { + public: + explicit RecurrentLayer(const LayerParameter& param) + : Layer<Dtype>(param) {} + virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual void Reshape(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual void Reset(); + + virtual inline const char* type() const { return "Recurrent"; } + virtual inline int MinBottomBlobs() const { return 2; } + virtual inline int MaxBottomBlobs() const { return 3; } + virtual inline int ExactNumTopBlobs() const { return 1; } + + virtual inline bool AllowForceBackward(const int bottom_index) const { + // Can't propagate to sequence continuation indicators. + return bottom_index != 1; + } + + protected: + /** + * @brief Fills net_param with the recurrent network architecture. Subclasses + * should define this -- see RNNLayer and LSTMLayer for examples. + */ + virtual void FillUnrolledNet(NetParameter* net_param) const = 0; + + /** + * @brief Fills names with the names of the 0th timestep recurrent input + * Blob&s. Subclasses should define this -- see RNNLayer and LSTMLayer + * for examples. + */ + virtual void RecurrentInputBlobNames(vector<string>* names) const = 0; + + /** + * @brief Fills shapes with the shapes of the recurrent input Blob&s. + * Subclasses should define this -- see RNNLayer and LSTMLayer + * for examples. + */ + virtual void RecurrentInputShapes(vector<BlobShape>* shapes) const = 0; + + /** + * @brief Fills names with the names of the Tth timestep recurrent output + * Blob&s. Subclasses should define this -- see RNNLayer and LSTMLayer + * for examples. + */ + virtual void RecurrentOutputBlobNames(vector<string>* names) const = 0; + + /** + * @brief Fills names with the names of the output blobs, concatenated across + * all timesteps. Should return a name for each top Blob. + * Subclasses should define this -- see RNNLayer and LSTMLayer for + * examples. + */ + virtual void OutputBlobNames(vector<string>* names) const = 0; + + /** + * @param bottom input Blob vector (length 2-3) + * + * -# @f$ (T \times N \times ...) @f$ + * the time-varying input @f$ x @f$. After the first two axes, whose + * dimensions must correspond to the number of timesteps @f$ T @f$ and + * the number of independent streams @f$ N @f$, respectively, its + * dimensions may be arbitrary. Note that the ordering of dimensions -- + * @f$ (T \times N \times ...) @f$, rather than + * @f$ (N \times T \times ...) @f$ -- means that the @f$ N @f$ + * independent input streams must be "interleaved". + * + * -# @f$ (T \times N) @f$ + * the sequence continuation indicators @f$ \delta @f$. + * These inputs should be binary (0 or 1) indicators, where + * @f$ \delta_{t,n} = 0 @f$ means that timestep @f$ t @f$ of stream + * @f$ n @f$ is the beginning of a new sequence, and hence the previous + * hidden state @f$ h_{t-1} @f$ is multiplied by @f$ \delta_t = 0 @f$ + * and has no effect on the cell's output at timestep @f$ t @f$, and + * a value of @f$ \delta_{t,n} = 1 @f$ means that timestep @f$ t @f$ of + * stream @f$ n @f$ is a continuation from the previous timestep + * @f$ t-1 @f$, and the previous hidden state @f$ h_{t-1} @f$ affects the + * updated hidden state and output. + * + * -# @f$ (N \times ...) @f$ (optional) + * the static (non-time-varying) input @f$ x_{static} @f$. + * After the first axis, whose dimension must be the number of + * independent streams, its dimensions may be arbitrary. + * This is mathematically equivalent to using a time-varying input of + * @f$ x'_t = [x_t; x_{static}] @f$ -- i.e., tiling the static input + * across the @f$ T @f$ timesteps and concatenating with the time-varying + * input. Note that if this input is used, all timesteps in a single + * batch within a particular one of the @f$ N @f$ streams must share the + * same static input, even if the sequence continuation indicators + * suggest that different sequences are ending and beginning within a + * single batch. This may require padding and/or truncation for uniform + * length. + * + * @param top output Blob vector (length 1) + * -# @f$ (T \times N \times D) @f$ + * the time-varying output @f$ y @f$, where @f$ D @f$ is + * recurrent_param.num_output(). + * Refer to documentation for particular RecurrentLayer implementations + * (such as RNNLayer and LSTMLayer) for the definition of @f$ y @f$. + */ + virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual void Backward_cpu(const vector<Blob<Dtype>*>& top, + const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom); + + /// @brief A helper function, useful for stringifying timestep indices. + virtual string int_to_str(const int t) const; + + /// @brief A Net to implement the Recurrent functionality. + shared_ptr<Net<Dtype> > unrolled_net_; + + /// @brief The number of independent streams to process simultaneously. + int N_; + + /** + * @brief The number of timesteps in the layer's input, and the number of + * timesteps over which to backpropagate through time. + */ + int T_; + + /// @brief Whether the layer has a "static" input copied across all timesteps. + bool static_input_; + + vector<Blob<Dtype>* > recur_input_blobs_; + vector<Blob<Dtype>* > recur_output_blobs_; + vector<Blob<Dtype>* > output_blobs_; + Blob<Dtype>* x_input_blob_; + Blob<Dtype>* x_static_input_blob_; + Blob<Dtype>* cont_input_blob_; +}; + +/** + * @brief Processes sequential inputs using a "Long Short-Term Memory" (LSTM) + * [1] style recurrent neural network (RNN). Implemented as a network + * unrolling the LSTM computation in time. + * + * The specific architecture used in this implementation is as described in + * "Learning to Execute" [2], reproduced below: + * i_t := \sigmoid[ W_{hi} * h_{t-1} + W_{xi} * x_t + b_i ] + * f_t := \sigmoid[ W_{hf} * h_{t-1} + W_{xf} * x_t + b_f ] + * o_t := \sigmoid[ W_{ho} * h_{t-1} + W_{xo} * x_t + b_o ] + * g_t := \tanh[ W_{hg} * h_{t-1} + W_{xg} * x_t + b_g ] + * c_t := (f_t .* c_{t-1}) + (i_t .* g_t) + * h_t := o_t .* \tanh[c_t] + * In the implementation, the i, f, o, and g computations are performed as a + * single inner product. + * + * Notably, this implementation lacks the "diagonal" gates, as used in the + * LSTM architectures described by Alex Graves [3] and others. + * + * [1] Hochreiter, Sepp, and Schmidhuber, Jürgen. "Long short-term memory." + * Neural Computation 9, no. 8 (1997): 1735-1780. + * + * [2] Zaremba, Wojciech, and Sutskever, Ilya. "Learning to execute." + * arXiv preprint arXiv:1410.4615 (2014). + * + * [3] Graves, Alex. "Generating sequences with recurrent neural networks." + * arXiv preprint arXiv:1308.0850 (2013).
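+ * + * (A sketch of how these equations map onto the unrolled net built by FillUnrolledNet in lstm_layer.cpp: the stacked gate pre-activations @f$ [i_t'; f_t'; o_t'; g_t'] @f$ appear as the blob gate_input_t = W_xc * x_t + b_c + W_hc * h_conted_{t-1}, produced by the "x_transform" and per-timestep "transform" InnerProduct layers plus an Eltwise sum; an LSTMUnit layer then applies the nonlinearities above to yield c_t and h_t.)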
+ */ +template <typename Dtype> +class LSTMLayer : public RecurrentLayer<Dtype> { + public: + explicit LSTMLayer(const LayerParameter& param) + : RecurrentLayer<Dtype>(param) {} + + virtual inline const char* type() const { return "LSTM"; } + + protected: + virtual void FillUnrolledNet(NetParameter* net_param) const; + virtual void RecurrentInputBlobNames(vector<string>* names) const; + virtual void RecurrentOutputBlobNames(vector<string>* names) const; + virtual void RecurrentInputShapes(vector<BlobShape>* shapes) const; + virtual void OutputBlobNames(vector<string>* names) const; +}; + +/** + * @brief A helper for LSTMLayer: computes a single timestep of the + * non-linearity of the LSTM, producing the updated cell and hidden + * states. + */ +template <typename Dtype> +class LSTMUnitLayer : public Layer<Dtype> { + public: + explicit LSTMUnitLayer(const LayerParameter& param) + : Layer<Dtype>(param) {} + virtual void Reshape(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + + virtual inline const char* type() const { return "LSTMUnit"; } + virtual inline int ExactNumBottomBlobs() const { return 3; } + virtual inline int ExactNumTopBlobs() const { return 2; } + + virtual inline bool AllowForceBackward(const int bottom_index) const { + // Can't propagate to sequence continuation indicators. + return bottom_index != 2; + } + + protected: + /** + * @param bottom input Blob vector (length 3) + * -# @f$ (1 \times N \times D) @f$ + * the previous timestep cell state @f$ c_{t-1} @f$ + * -# @f$ (1 \times N \times 4D) @f$ + * the "gate inputs" @f$ [i_t', f_t', o_t', g_t'] @f$ + * -# @f$ (1 \times N) @f$ + * the sequence continuation indicators @f$ \delta_t @f$ + * @param top output Blob vector (length 2) + * -# @f$ (1 \times N \times D) @f$ + * the updated cell state @f$ c_t @f$, computed as: + * i_t := \sigmoid[i_t'] + * f_t := \sigmoid[f_t'] + * o_t := \sigmoid[o_t'] + * g_t := \tanh[g_t'] + * c_t := cont_t * (f_t .* c_{t-1}) + (i_t .* g_t) + * -# @f$ (1 \times N \times D) @f$ + * the updated hidden state @f$ h_t @f$, computed as: + * h_t := o_t .* \tanh[c_t] + */ + virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top); + + /** + * @brief Computes the error gradient w.r.t. the LSTMUnit inputs. + * + * @param top output Blob vector (length 2), providing the error gradient with + * respect to the outputs + * -# @f$ (1 \times N \times D) @f$: + * containing error gradients @f$ \frac{\partial E}{\partial c_t} @f$ + * with respect to the updated cell state @f$ c_t @f$ + * -# @f$ (1 \times N \times D) @f$: + * containing error gradients @f$ \frac{\partial E}{\partial h_t} @f$ + * with respect to the updated hidden state @f$ h_t @f$ + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length 3), into which the error gradients + * with respect to the LSTMUnit inputs @f$ c_{t-1} @f$ and the gate + * inputs are computed. Computation of the error gradients w.r.t. + * the sequence indicators is not implemented. + * -# @f$ (1 \times N \times D) @f$ + * the error gradient w.r.t. the previous timestep cell state + * @f$ c_{t-1} @f$ + * -# @f$ (1 \times N \times 4D) @f$ + * the error gradient w.r.t. the "gate inputs" + * @f$ [ + * \frac{\partial E}{\partial i_t} + * \frac{\partial E}{\partial f_t} + * \frac{\partial E}{\partial o_t} + * \frac{\partial E}{\partial g_t} + * ] @f$ + * -# @f$ (1 \times 1 \times N) @f$ + * the gradient w.r.t. the sequence continuation indicators + * @f$ \delta_t @f$ is currently not computed.
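+ * + * (The gradient computations rely on the elementwise identities + * \sigmoid'[x] = \sigmoid[x] (1 - \sigmoid[x]) and \tanh'[x] = 1 - \tanh[x]^2. + * For example, with c_term := dE/dc_t + dE/dh_t .* o_t .* (1 - \tanh[c_t]^2), + * the gradient w.r.t. the input gate pre-activation is + * dE/di_t' = c_term .* g_t .* i_t .* (1 - i_t), as in the CPU implementation.)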
+ */ + virtual void Backward_cpu(const vector<Blob<Dtype>*>& top, + const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom); + virtual void Backward_gpu(const vector<Blob<Dtype>*>& top, + const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom); + + /// @brief The hidden and output dimension. + int hidden_dim_; + Blob<Dtype> X_acts_; +}; + +/** + * @brief Processes time-varying inputs using a simple recurrent neural network + * (RNN). Implemented as a network unrolling the RNN computation in time. + * + * Given time-varying inputs @f$ x_t @f$, computes hidden state @f$ + * h_t := \tanh[ W_{hh} h_{t-1} + W_{xh} x_t + b_h ] + * @f$, and outputs @f$ + * o_t := \tanh[ W_{ho} h_t + b_o ] + * @f$. + */ +template <typename Dtype> +class RNNLayer : public RecurrentLayer<Dtype> { + public: + explicit RNNLayer(const LayerParameter& param) + : RecurrentLayer<Dtype>(param) {} + + virtual inline const char* type() const { return "RNN"; } + + protected: + virtual void FillUnrolledNet(NetParameter* net_param) const; + virtual void RecurrentInputBlobNames(vector<string>* names) const; + virtual void RecurrentOutputBlobNames(vector<string>* names) const; + virtual void RecurrentInputShapes(vector<BlobShape>* shapes) const; + virtual void OutputBlobNames(vector<string>* names) const; +}; + +} // namespace caffe + +#endif // CAFFE_SEQUENCE_LAYERS_HPP_ diff --git a/src/caffe/layers/lstm_layer.cpp b/src/caffe/layers/lstm_layer.cpp new file mode 100644 index 00000000000..d5e0923a3d4 --- /dev/null +++ b/src/caffe/layers/lstm_layer.cpp @@ -0,0 +1,237 @@ +#include <string> +#include <vector> + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/layer.hpp" +#include "caffe/sequence_layers.hpp" +#include "caffe/util/math_functions.hpp" + +namespace caffe { + +template <typename Dtype> +void LSTMLayer<Dtype>::RecurrentInputBlobNames(vector<string>* names) const { + names->resize(2); + (*names)[0] = "h_0"; + (*names)[1] = "c_0"; +} + +template <typename Dtype> +void LSTMLayer<Dtype>::RecurrentOutputBlobNames(vector<string>* names) const { + names->resize(2); + (*names)[0] = "h_" + this->int_to_str(this->T_); + (*names)[1] = "c_T"; +} + +template <typename Dtype> +void LSTMLayer<Dtype>::RecurrentInputShapes(vector<BlobShape>* shapes) const { + const int num_output = this->layer_param_.recurrent_param().num_output(); + const int num_blobs = 2; + shapes->resize(num_blobs); + for (int i = 0; i < num_blobs; ++i) { + (*shapes)[i].Clear(); + (*shapes)[i].add_dim(1); // a single timestep + (*shapes)[i].add_dim(this->N_); + (*shapes)[i].add_dim(num_output); + } +} + +template <typename Dtype> +void LSTMLayer<Dtype>::OutputBlobNames(vector<string>* names) const { + names->resize(1); + (*names)[0] = "h"; +} + +template <typename Dtype> +void LSTMLayer<Dtype>::FillUnrolledNet(NetParameter* net_param) const { + const int num_output = this->layer_param_.recurrent_param().num_output(); + CHECK_GT(num_output, 0) << "num_output must be positive"; + const FillerParameter& weight_filler = + this->layer_param_.recurrent_param().weight_filler(); + const FillerParameter& bias_filler = + this->layer_param_.recurrent_param().bias_filler(); + + // Add generic LayerParameter's (without bottoms/tops) of layer types we'll + // use to save redundant code.
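+ // (hidden_param below is an InnerProduct along axis 2 producing all + // 4 * num_output gate pre-activations [i', f', o', g'] in one product; + // biased_hidden_param is the same transform with the additive bias b_c.)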
+ LayerParameter hidden_param; + hidden_param.set_type("InnerProduct"); + hidden_param.mutable_inner_product_param()->set_num_output(num_output * 4); + hidden_param.mutable_inner_product_param()->set_bias_term(false); + hidden_param.mutable_inner_product_param()->set_axis(2); + hidden_param.mutable_inner_product_param()-> + mutable_weight_filler()->CopyFrom(weight_filler); + + LayerParameter biased_hidden_param(hidden_param); + biased_hidden_param.mutable_inner_product_param()->set_bias_term(true); + biased_hidden_param.mutable_inner_product_param()-> + mutable_bias_filler()->CopyFrom(bias_filler); + + LayerParameter sum_param; + sum_param.set_type("Eltwise"); + sum_param.mutable_eltwise_param()->set_operation( + EltwiseParameter_EltwiseOp_SUM); + + LayerParameter scalar_param; + scalar_param.set_type("Scalar"); + scalar_param.mutable_scalar_param()->set_axis(0); + + LayerParameter slice_param; + slice_param.set_type("Slice"); + slice_param.mutable_slice_param()->set_axis(0); + + LayerParameter split_param; + split_param.set_type("Split"); + + vector<BlobShape> input_shapes; + RecurrentInputShapes(&input_shapes); + CHECK_EQ(2, input_shapes.size()); + + net_param->add_input("c_0"); + net_param->add_input_shape()->CopyFrom(input_shapes[0]); + + net_param->add_input("h_0"); + net_param->add_input_shape()->CopyFrom(input_shapes[1]); + + LayerParameter* cont_slice_param = net_param->add_layer(); + cont_slice_param->CopyFrom(slice_param); + cont_slice_param->set_name("cont_slice"); + cont_slice_param->add_bottom("cont"); + cont_slice_param->mutable_slice_param()->set_axis(0); + + // Add layer to transform all timesteps of x to the hidden state dimension. + // W_xc_x = W_xc * x + b_c + { + LayerParameter* x_transform_param = net_param->add_layer(); + x_transform_param->CopyFrom(biased_hidden_param); + x_transform_param->set_name("x_transform"); + x_transform_param->add_param()->set_name("W_xc"); + x_transform_param->add_param()->set_name("b_c"); + x_transform_param->add_bottom("x"); + x_transform_param->add_top("W_xc_x"); + } + + if (this->static_input_) { + // Add layer to transform x_static to the gate dimension. + // W_xc_x_static = W_xc_static * x_static + LayerParameter* x_static_transform_param = net_param->add_layer(); + x_static_transform_param->CopyFrom(hidden_param); + x_static_transform_param->mutable_inner_product_param()->set_axis(1); + x_static_transform_param->set_name("W_xc_x_static"); + x_static_transform_param->add_param()->set_name("W_xc_static"); + x_static_transform_param->add_bottom("x_static"); + x_static_transform_param->add_top("W_xc_x_static_preshape"); + + LayerParameter* reshape_param = net_param->add_layer(); + reshape_param->set_type("Reshape"); + BlobShape* new_shape = + reshape_param->mutable_reshape_param()->mutable_shape(); + new_shape->add_dim(1); // One timestep. + // Should infer this->N as the dimension so we can reshape on batch size.
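+ // (In Caffe's ReshapeParameter, a dim of -1 means "infer this dimension + // from the blob's total count", so the stream count N is not hard-coded.)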
+ new_shape->add_dim(-1); + new_shape->add_dim( + x_static_transform_param->inner_product_param().num_output()); + reshape_param->add_bottom("W_xc_x_static_preshape"); + reshape_param->add_top("W_xc_x_static"); + } + + LayerParameter* x_slice_param = net_param->add_layer(); + x_slice_param->CopyFrom(slice_param); + x_slice_param->add_bottom("W_xc_x"); + x_slice_param->set_name("W_xc_x_slice"); + + LayerParameter output_concat_layer; + output_concat_layer.set_name("h_concat"); + output_concat_layer.set_type("Concat"); + output_concat_layer.add_top("h"); + output_concat_layer.mutable_concat_param()->set_axis(0); + + for (int t = 1; t <= this->T_; ++t) { + string tm1s = this->int_to_str(t - 1); + string ts = this->int_to_str(t); + + cont_slice_param->add_top("cont_" + ts); + x_slice_param->add_top("W_xc_x_" + ts); + + // Add layers to flush the hidden state when beginning a new + // sequence, as indicated by cont_t. + // h_conted_{t-1} := cont_t * h_{t-1} + // + // Normally, cont_t is binary (i.e., 0 or 1), so: + // h_conted_{t-1} := h_{t-1} if cont_t == 1 + // 0 otherwise + { + LayerParameter* cont_h_param = net_param->add_layer(); + cont_h_param->CopyFrom(scalar_param); + cont_h_param->set_name("h_conted_" + tm1s); + cont_h_param->add_bottom("h_" + tm1s); + cont_h_param->add_bottom("cont_" + ts); + cont_h_param->add_top("h_conted_" + tm1s); + } + + // Add layer to compute + // W_hc_h_{t-1} := W_hc * h_conted_{t-1} + { + LayerParameter* w_param = net_param->add_layer(); + w_param->CopyFrom(hidden_param); + w_param->set_name("transform_" + ts); + w_param->add_param()->set_name("W_hc"); + w_param->add_bottom("h_conted_" + tm1s); + w_param->add_top("W_hc_h_" + tm1s); + w_param->mutable_inner_product_param()->set_axis(2); + } + + // Add the outputs of the linear transformations to compute the gate input. + // gate_input_t := W_hc * h_conted_{t-1} + W_xc * x_t + b_c + // = W_hc_h_{t-1} + W_xc_x_t + b_c + { + LayerParameter* input_sum_layer = net_param->add_layer(); + input_sum_layer->CopyFrom(sum_param); + input_sum_layer->set_name("gate_input_" + ts); + input_sum_layer->add_bottom("W_hc_h_" + tm1s); + input_sum_layer->add_bottom("W_xc_x_" + ts); + if (this->static_input_) { + input_sum_layer->add_bottom("W_xc_x_static"); + } + input_sum_layer->add_top("gate_input_" + ts); + } + + // Add LSTMUnit layer to compute the cell & hidden vectors c_t and h_t. 
+ // Inputs: c_{t-1}, gate_input_t = (i_t, f_t, o_t, g_t), cont_t + // Outputs: c_t, h_t + // [ i_t' ] + // [ f_t' ] := gate_input_t + // [ o_t' ] + // [ g_t' ] + // i_t := \sigmoid[i_t'] + // f_t := \sigmoid[f_t'] + // o_t := \sigmoid[o_t'] + // g_t := \tanh[g_t'] + // c_t := cont_t * (f_t .* c_{t-1}) + (i_t .* g_t) + // h_t := o_t .* \tanh[c_t] + { + LayerParameter* lstm_unit_param = net_param->add_layer(); + lstm_unit_param->set_type("LSTMUnit"); + lstm_unit_param->add_bottom("c_" + tm1s); + lstm_unit_param->add_bottom("gate_input_" + ts); + lstm_unit_param->add_bottom("cont_" + ts); + lstm_unit_param->add_top("c_" + ts); + lstm_unit_param->add_top("h_" + ts); + lstm_unit_param->set_name("unit_" + ts); + } + output_concat_layer.add_bottom("h_" + ts); + } // for (int t = 1; t <= this->T_; ++t) + + { + LayerParameter* c_T_copy_param = net_param->add_layer(); + c_T_copy_param->CopyFrom(split_param); + c_T_copy_param->add_bottom("c_" + this->int_to_str(this->T_)); + c_T_copy_param->add_top("c_T"); + } + net_param->add_layer()->CopyFrom(output_concat_layer); +} + +INSTANTIATE_CLASS(LSTMLayer); +REGISTER_LAYER_CLASS(LSTM); + +} // namespace caffe diff --git a/src/caffe/layers/lstm_unit_layer.cpp b/src/caffe/layers/lstm_unit_layer.cpp new file mode 100644 index 00000000000..fd777f8adc3 --- /dev/null +++ b/src/caffe/layers/lstm_unit_layer.cpp @@ -0,0 +1,131 @@ +#include <algorithm> +#include <cmath> +#include <vector> + +#include "caffe/layer.hpp" +#include "caffe/sequence_layers.hpp" + +namespace caffe { + +template <typename Dtype> +inline Dtype sigmoid(Dtype x) { + return 1. / (1. + exp(-x)); +} + +template <typename Dtype> +inline Dtype tanh(Dtype x) { + return 2. * sigmoid(2. * x) - 1.; +} + +template <typename Dtype> +void LSTMUnitLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top) { + const int num_instances = bottom[0]->shape(1); + for (int i = 0; i < bottom.size(); ++i) { + if (i == 2) { + CHECK_EQ(2, bottom[i]->num_axes()); + } else { + CHECK_EQ(3, bottom[i]->num_axes()); + } + CHECK_EQ(1, bottom[i]->shape(0)); + CHECK_EQ(num_instances, bottom[i]->shape(1)); + } + hidden_dim_ = bottom[0]->shape(2); + CHECK_EQ(num_instances, bottom[1]->shape(1)); + CHECK_EQ(4 * hidden_dim_, bottom[1]->shape(2)); + top[0]->ReshapeLike(*bottom[0]); + top[1]->ReshapeLike(*bottom[0]); + X_acts_.ReshapeLike(*bottom[1]); +} + +template <typename Dtype> +void LSTMUnitLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top) { + const int num = bottom[0]->shape(1); + const int x_dim = hidden_dim_ * 4; + const Dtype* C_prev = bottom[0]->cpu_data(); + const Dtype* X = bottom[1]->cpu_data(); + const Dtype* flush = bottom[2]->cpu_data(); + Dtype* C = top[0]->mutable_cpu_data(); + Dtype* H = top[1]->mutable_cpu_data(); + for (int n = 0; n < num; ++n) { + for (int d = 0; d < hidden_dim_; ++d) { + const Dtype i = sigmoid(X[d]); + const Dtype f = (*flush == 0) ? 0 : + (*flush * sigmoid(X[1 * hidden_dim_ + d])); + const Dtype o = sigmoid(X[2 * hidden_dim_ + d]); + const Dtype g = tanh(X[3 * hidden_dim_ + d]); + const Dtype c_prev = C_prev[d]; + const Dtype c = f * c_prev + i * g; + C[d] = c; + const Dtype tanh_c = tanh(c); + H[d] = o * tanh_c; + } + C_prev += hidden_dim_; + X += x_dim; + C += hidden_dim_; + H += hidden_dim_; + ++flush; + } +} + +template <typename Dtype> +void LSTMUnitLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top, + const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) { + CHECK(!propagate_down[2]) << "Cannot backpropagate to sequence indicators."; + if (!propagate_down[0] && !propagate_down[1]) { return; } + + const int num = bottom[0]->shape(1); + const int x_dim = hidden_dim_ * 4; + const Dtype* C_prev = bottom[0]->cpu_data(); + const Dtype* X = bottom[1]->cpu_data(); + const Dtype* flush = bottom[2]->cpu_data(); + const Dtype* C = top[0]->cpu_data(); + const Dtype* H = top[1]->cpu_data(); + const Dtype* C_diff = top[0]->cpu_diff(); + const Dtype* H_diff = top[1]->cpu_diff(); + Dtype* C_prev_diff = bottom[0]->mutable_cpu_diff(); + Dtype* X_diff = bottom[1]->mutable_cpu_diff(); + for (int n = 0; n < num; ++n) { + for (int d = 0; d < hidden_dim_; ++d) { + const Dtype i = sigmoid(X[d]); + const Dtype f = (*flush == 0) ? 0 : + (*flush * sigmoid(X[1 * hidden_dim_ + d])); + const Dtype o = sigmoid(X[2 * hidden_dim_ + d]); + const Dtype g = tanh(X[3 * hidden_dim_ + d]); + const Dtype c_prev = C_prev[d]; + const Dtype c = C[d]; + const Dtype tanh_c = tanh(c); + Dtype* c_prev_diff = C_prev_diff + d; + Dtype* i_diff = X_diff + d; + Dtype* f_diff = X_diff + 1 * hidden_dim_ + d; + Dtype* o_diff = X_diff + 2 * hidden_dim_ + d; + Dtype* g_diff = X_diff + 3 * hidden_dim_ + d; + const Dtype c_term_diff = + C_diff[d] + H_diff[d] * o * (1 - tanh_c * tanh_c); + *c_prev_diff = c_term_diff * f; + *i_diff = c_term_diff * g * i * (1 - i); + *f_diff = c_term_diff * c_prev * f * (1 - f); + *o_diff = H_diff[d] * tanh_c * o * (1 - o); + *g_diff = c_term_diff * i * (1 - g * g); + } + C_prev += hidden_dim_; + X += x_dim; + C += hidden_dim_; + H += hidden_dim_; + C_diff += hidden_dim_; + H_diff += hidden_dim_; + X_diff += x_dim; + C_prev_diff += hidden_dim_; + ++flush; + } +} + +#ifdef CPU_ONLY +STUB_GPU(LSTMUnitLayer); +#endif + +INSTANTIATE_CLASS(LSTMUnitLayer); +REGISTER_LAYER_CLASS(LSTMUnit); + +} // namespace caffe diff --git a/src/caffe/layers/lstm_unit_layer.cu b/src/caffe/layers/lstm_unit_layer.cu new file mode 100644 index 00000000000..d6bf85071f5 --- /dev/null +++ b/src/caffe/layers/lstm_unit_layer.cu @@ -0,0 +1,154 @@ +#include <algorithm> +#include <cmath> +#include <vector> + +#include "caffe/layer.hpp" +#include "caffe/sequence_layers.hpp" + +namespace caffe { + +template <typename Dtype> +__device__ Dtype sigmoid(const Dtype x) { + return Dtype(1) / (Dtype(1) + exp(-x)); +} + +template <typename Dtype> +__device__ Dtype tanh(const Dtype x) { + return Dtype(2) * sigmoid(Dtype(2) * x) - Dtype(1); +} + +template <typename Dtype> +__global__ void LSTMActsForward(const int nthreads, const int dim, + const Dtype* X, Dtype* X_acts) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int x_dim = 4 * dim; + const int d = index % x_dim; + if (d < 3 * dim) { + X_acts[index] = sigmoid(X[index]); + } else { + X_acts[index] = tanh(X[index]); + } + } +} + +template <typename Dtype> +__global__ void LSTMUnitForward(const int nthreads, const int dim, + const Dtype* C_prev, const Dtype* X, const Dtype* flush, + Dtype* C, Dtype* H) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int n = index / dim; + const int d = index % dim; + const Dtype* X_offset = X + 4 * dim * n; + const Dtype i = X_offset[d]; + const Dtype f = X_offset[1 * dim + d]; + const Dtype o = X_offset[2 * dim + d]; + const Dtype g = X_offset[3 * dim + d]; + const Dtype c_prev = C_prev[index]; + const Dtype c = flush[n] * f * c_prev + i * g; + C[index] = c; + const Dtype tanh_c = tanh(c); + H[index] = o * tanh_c; + } +} + +template <typename Dtype> +void LSTMUnitLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top) { + const int count = top[1]->count(); + const Dtype* C_prev = bottom[0]->gpu_data(); + const Dtype* X = bottom[1]->gpu_data(); + const Dtype* flush = bottom[2]->gpu_data(); + Dtype* X_acts = X_acts_.mutable_gpu_data(); + Dtype* C = top[0]->mutable_gpu_data(); + Dtype* H = top[1]->mutable_gpu_data(); + const int X_count = bottom[1]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + LSTMActsForward<Dtype><<<CAFFE_GET_BLOCKS(X_count), CAFFE_CUDA_NUM_THREADS>>>( + X_count, hidden_dim_, X, X_acts); + CUDA_POST_KERNEL_CHECK; + // NOLINT_NEXT_LINE(whitespace/operators) + LSTMUnitForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>( + count, hidden_dim_, C_prev, X_acts, flush, C, H); + CUDA_POST_KERNEL_CHECK; +} + +template <typename Dtype> +__global__ void LSTMUnitBackward(const int nthreads, const int dim, + const Dtype* C_prev, const Dtype* X, const Dtype* C, const Dtype* H, + const Dtype* flush, const Dtype* C_diff, const Dtype* H_diff, + Dtype* C_prev_diff, Dtype* X_diff) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int n = index / dim; + const int d = index % dim; + const Dtype* X_offset = X + 4 * dim * n; + const Dtype i = X_offset[d]; + const Dtype f = X_offset[1 * dim + d]; + const Dtype o = X_offset[2 * dim + d]; + const Dtype g = X_offset[3 * dim + d]; + const Dtype c_prev = C_prev[index]; + const Dtype c = C[index]; + const Dtype tanh_c = tanh(c); + Dtype* c_prev_diff = C_prev_diff + index; + Dtype* X_diff_offset = X_diff + 4 * dim * n; + Dtype* i_diff = X_diff_offset + d; + Dtype* f_diff = X_diff_offset + 1 * dim + d; + Dtype* o_diff = X_diff_offset + 2 * dim + d; + Dtype* g_diff = X_diff_offset + 3 * dim + d; + const Dtype c_term_diff = + C_diff[index] + H_diff[index] * o * (1 - tanh_c * tanh_c); + const Dtype flush_n = flush[n]; + *c_prev_diff = flush_n * c_term_diff * f; + *i_diff = c_term_diff * g; + *f_diff = flush_n * c_term_diff * c_prev; + *o_diff = H_diff[index] * tanh_c; + *g_diff = c_term_diff * i; + } +} + +template <typename Dtype> +__global__ void LSTMActsBackward(const int nthreads, const int dim, + const Dtype* X_acts, const Dtype* X_acts_diff, Dtype* X_diff) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int x_dim = 4 * dim; + const int d = index % x_dim; + const Dtype X_act = X_acts[index]; + if (d < 3 * dim) { + X_diff[index] = X_acts_diff[index] * X_act * (Dtype(1) - X_act); + } else { + X_diff[index] = X_acts_diff[index] * (Dtype(1) - X_act * X_act); + } + } +} + +template <typename Dtype> +void LSTMUnitLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top, + const vector<bool>& propagate_down, + const vector<Blob<Dtype>*>& bottom) { + CHECK(!propagate_down[2]) << "Cannot backpropagate to sequence indicators."; + if (!propagate_down[0] && !propagate_down[1]) { return; } + + const int count = top[1]->count(); + const Dtype* C_prev = bottom[0]->gpu_data(); + const Dtype* X_acts = X_acts_.gpu_data(); + const Dtype* flush = bottom[2]->gpu_data(); + const Dtype* C = top[0]->gpu_data(); + const Dtype* H = top[1]->gpu_data(); + const Dtype* C_diff = top[0]->gpu_diff(); + const Dtype* H_diff = top[1]->gpu_diff(); + Dtype* C_prev_diff = bottom[0]->mutable_gpu_diff(); + Dtype* X_acts_diff = X_acts_.mutable_gpu_diff(); + LSTMUnitBackward<Dtype> // NOLINT_NEXT_LINE(whitespace/operators) + <<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(count, hidden_dim_, + C_prev, X_acts, C, H, flush, C_diff, H_diff,
C_prev_diff, X_acts_diff); + CUDA_POST_KERNEL_CHECK; + const int X_count = bottom[1]->count(); + Dtype* X_diff = bottom[1]->mutable_gpu_diff(); + LSTMActsBackward<Dtype> // NOLINT_NEXT_LINE(whitespace/operators) + <<<CAFFE_GET_BLOCKS(X_count), CAFFE_CUDA_NUM_THREADS>>>( + X_count, hidden_dim_, X_acts, X_acts_diff, X_diff); + CUDA_POST_KERNEL_CHECK; +} + +INSTANTIATE_LAYER_GPU_FUNCS(LSTMUnitLayer); + +} // namespace caffe diff --git a/src/caffe/layers/recurrent_layer.cpp b/src/caffe/layers/recurrent_layer.cpp new file mode 100644 index 00000000000..89256229b54 --- /dev/null +++ b/src/caffe/layers/recurrent_layer.cpp @@ -0,0 +1,240 @@ +#include <string> +#include <vector> + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/layer.hpp" +#include "caffe/sequence_layers.hpp" +#include "caffe/util/math_functions.hpp" + +namespace caffe { + +template <typename Dtype> +string RecurrentLayer<Dtype>::int_to_str(const int t) const { + ostringstream num; + num << t; + return num.str(); +} + +template <typename Dtype> +void RecurrentLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom, + const vector<Blob<Dtype>*>& top) { + CHECK_GE(bottom[0]->num_axes(), 2) + << "bottom[0] must have at least 2 axes -- (#timesteps, #streams, ...)"; + T_ = bottom[0]->shape(0); + N_ = bottom[0]->shape(1); + LOG(INFO) << "Initializing recurrent layer: assuming input batch contains " + << T_ << " timesteps of " << N_ << " independent streams."; + + CHECK_EQ(bottom[1]->num_axes(), 2) + << "bottom[1] must have exactly 2 axes -- (#timesteps, #streams)"; + CHECK_EQ(T_, bottom[1]->shape(0)); + CHECK_EQ(N_, bottom[1]->shape(1)); + + // If provided, bottom[2] is a static input to the recurrent net. + static_input_ = (bottom.size() > 2); + if (static_input_) { + CHECK_GE(bottom[2]->num_axes(), 1); + CHECK_EQ(N_, bottom[2]->shape(0)); + } + + // Create a NetParameter; setup the inputs that aren't unique to particular + // recurrent architectures. + NetParameter net_param; + net_param.set_force_backward(true); + + net_param.add_input("x"); + BlobShape input_shape; + for (int i = 0; i < bottom[0]->num_axes(); ++i) { + input_shape.add_dim(bottom[0]->shape(i)); + } + net_param.add_input_shape()->CopyFrom(input_shape); + + input_shape.Clear(); + for (int i = 0; i < bottom[1]->num_axes(); ++i) { + input_shape.add_dim(bottom[1]->shape(i)); + } + net_param.add_input("cont"); + net_param.add_input_shape()->CopyFrom(input_shape); + + if (static_input_) { + input_shape.Clear(); + for (int i = 0; i < bottom[2]->num_axes(); ++i) { + input_shape.add_dim(bottom[2]->shape(i)); + } + net_param.add_input("x_static"); + net_param.add_input_shape()->CopyFrom(input_shape); + } + + // Call the child's FillUnrolledNet implementation to specify the unrolled + // recurrent architecture. + this->FillUnrolledNet(&net_param); + + // Prepend this layer's name to the names of each layer in the unrolled net. + const string& layer_name = this->layer_param_.name(); + if (layer_name.size() > 0) { + for (int i = 0; i < net_param.layer_size(); ++i) { + LayerParameter* layer = net_param.mutable_layer(i); + layer->set_name(layer_name + "_" + layer->name()); + } + } + + // Create the unrolled net. + unrolled_net_.reset(new Net<Dtype>(net_param)); + unrolled_net_->set_debug_info( + this->layer_param_.recurrent_param().debug_info()); + + // Setup pointers to the inputs.
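+ // (blob_by_name returns a shared_ptr; CHECK_NOTNULL on .get() fails fast + // if the unrolled net is missing an expected input blob.)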
+ x_input_blob_ = CHECK_NOTNULL(unrolled_net_->blob_by_name("x").get()); + cont_input_blob_ = CHECK_NOTNULL(unrolled_net_->blob_by_name("cont").get()); + if (static_input_) { + x_static_input_blob_ = + CHECK_NOTNULL(unrolled_net_->blob_by_name("x_static").get()); + } + + // Setup pointers to paired recurrent inputs/outputs. + vector<string> recur_input_names; + RecurrentInputBlobNames(&recur_input_names); + vector<string> recur_output_names; + RecurrentOutputBlobNames(&recur_output_names); + const int num_recur_blobs = recur_input_names.size(); + CHECK_EQ(num_recur_blobs, recur_output_names.size()); + recur_input_blobs_.resize(num_recur_blobs); + recur_output_blobs_.resize(num_recur_blobs); + for (int i = 0; i < recur_input_names.size(); ++i) { + recur_input_blobs_[i] = + CHECK_NOTNULL(unrolled_net_->blob_by_name(recur_input_names[i]).get()); + recur_output_blobs_[i] = + CHECK_NOTNULL(unrolled_net_->blob_by_name(recur_output_names[i]).get()); + } + + // Setup pointers to outputs. + vector<string> output_names; + OutputBlobNames(&output_names); + CHECK_EQ(top.size(), output_names.size()) + << "OutputBlobNames must provide an output blob name for each top."; + output_blobs_.resize(output_names.size()); + for (int i = 0; i < output_names.size(); ++i) { + output_blobs_[i] = + CHECK_NOTNULL(unrolled_net_->blob_by_name(output_names[i]).get()); + } + + // We should have 2 inputs (x and cont), plus a number of recurrent inputs, + // plus maybe a static input. + CHECK_EQ(2 + num_recur_blobs + static_input_, + unrolled_net_->input_blobs().size()); + + // This layer's parameters are any parameters in the layers of the unrolled + // net. We only want one copy of each parameter, so check that the parameter + // is "owned" by the layer, rather than shared with another. + this->blobs_.clear(); + for (int i = 0; i < unrolled_net_->params().size(); ++i) { + if (unrolled_net_->param_owners()[i] == -1) { + LOG(INFO) << "Adding parameter " << i << ": " + << unrolled_net_->param_display_names()[i]; + this->blobs_.push_back(unrolled_net_->params()[i]); + } + } + // Check that param_propagate_down is set for all of the parameters in the + // unrolled net; set param_propagate_down to true in this layer. + for (int i = 0; i < unrolled_net_->layers().size(); ++i) { + for (int j = 0; j < unrolled_net_->layers()[i]->blobs().size(); ++j) { + CHECK(unrolled_net_->layers()[i]->param_propagate_down(j)) + << "param_propagate_down not set for layer " << i << ", param " << j; + } + } + this->param_propagate_down_.clear(); + this->param_propagate_down_.resize(this->blobs_.size(), true); + + // Set the diffs of recurrent outputs to 0 -- we can't backpropagate across + // batches.
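+ // (Forward copies each recurrent output blob into the matching recurrent + // input blob of the next batch; the corresponding gradient from that future + // batch is never available, so its contribution is fixed at zero here.)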
+  for (int i = 0; i < recur_output_blobs_.size(); ++i) {
+    caffe_set(recur_output_blobs_[i]->count(), Dtype(0),
+              recur_output_blobs_[i]->mutable_cpu_diff());
+  }
+}
+
+template <typename Dtype>
+void RecurrentLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+  CHECK_GE(bottom[0]->num_axes(), 2)
+      << "bottom[0] must have at least 2 axes -- (#timesteps, #streams, ...)";
+  CHECK_EQ(T_, bottom[0]->shape(0)) << "input number of timesteps changed";
+  N_ = bottom[0]->shape(1);
+  CHECK_EQ(bottom[1]->num_axes(), 2)
+      << "bottom[1] must have exactly 2 axes -- (#timesteps, #streams)";
+  CHECK_EQ(T_, bottom[1]->shape(0));
+  CHECK_EQ(N_, bottom[1]->shape(1));
+  CHECK_EQ(top.size(), output_blobs_.size());
+  x_input_blob_->ReshapeLike(*bottom[0]);
+  vector<int> cont_shape = bottom[1]->shape();
+  cont_input_blob_->Reshape(cont_shape);
+  if (static_input_) {
+    x_static_input_blob_->ReshapeLike(*bottom[2]);
+  }
+  vector<BlobShape> recur_input_shapes;
+  RecurrentInputShapes(&recur_input_shapes);
+  CHECK_EQ(recur_input_shapes.size(), recur_input_blobs_.size());
+  for (int i = 0; i < recur_input_shapes.size(); ++i) {
+    recur_input_blobs_[i]->Reshape(recur_input_shapes[i]);
+  }
+  unrolled_net_->Reshape();
+  x_input_blob_->ShareData(*bottom[0]);
+  x_input_blob_->ShareDiff(*bottom[0]);
+  cont_input_blob_->ShareData(*bottom[1]);
+  if (static_input_) {
+    x_static_input_blob_->ShareData(*bottom[2]);
+    x_static_input_blob_->ShareDiff(*bottom[2]);
+  }
+  for (int i = 0; i < top.size(); ++i) {
+    top[i]->ReshapeLike(*output_blobs_[i]);
+    top[i]->ShareData(*output_blobs_[i]);
+    top[i]->ShareDiff(*output_blobs_[i]);
+  }
+}
+
+template <typename Dtype>
+void RecurrentLayer<Dtype>::Reset() {
+  // "Reset" the hidden state of the net by zeroing out all recurrent outputs.
+  for (int i = 0; i < recur_output_blobs_.size(); ++i) {
+    caffe_set(recur_output_blobs_[i]->count(), Dtype(0),
+              recur_output_blobs_[i]->mutable_cpu_data());
+  }
+}
+
+template <typename Dtype>
+void RecurrentLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top) {
+  DCHECK_EQ(recur_input_blobs_.size(), recur_output_blobs_.size());
+  for (int i = 0; i < recur_input_blobs_.size(); ++i) {
+    const int count = recur_input_blobs_[i]->count();
+    DCHECK_EQ(count, recur_output_blobs_[i]->count());
+    const Dtype* timestep_T_data = recur_output_blobs_[i]->cpu_data();
+    Dtype* timestep_0_data = recur_input_blobs_[i]->mutable_cpu_data();
+    caffe_copy(count, timestep_T_data, timestep_0_data);
+  }
+
+  unrolled_net_->ForwardPrefilled();
+}
+
+template <typename Dtype>
+void RecurrentLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  CHECK(!propagate_down[1]) << "Cannot backpropagate to sequence indicators.";
+
+  // TODO: skip backpropagation to inputs and parameters inside the unrolled
+  // net according to propagate_down[0] and propagate_down[2].  For now just
+  // backprop to inputs and parameters unconditionally, as either the inputs
+  // or the parameters do need backward (or Net would have set
+  // layer_needs_backward_[i] == false for this layer).
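+  //
+  // Because the unrolled timesteps share their parameters (tied by name via
+  // ParamSpec), Backward() below accumulates each shared parameter's
+  // gradient over all T timesteps -- i.e., standard backpropagation through
+  // time within the batch.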
+  unrolled_net_->Backward();
+}
+
+#ifdef CPU_ONLY
+STUB_GPU_FORWARD(RecurrentLayer, Forward);
+#endif
+
+INSTANTIATE_CLASS(RecurrentLayer);
+
+}  // namespace caffe
diff --git a/src/caffe/layers/recurrent_layer.cu b/src/caffe/layers/recurrent_layer.cu
new file mode 100644
index 00000000000..fa06b8add5e
--- /dev/null
+++ b/src/caffe/layers/recurrent_layer.cu
@@ -0,0 +1,35 @@
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/filler.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/sequence_layers.hpp"
+#include "caffe/util/math_functions.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+void RecurrentLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top) {
+  // Hacky fix for test time... reshare all the shared blobs.
+  // TODO: somehow make this work non-hackily.
+  if (this->phase_ == TEST) {
+    unrolled_net_->ShareWeights();
+  }
+
+  DCHECK_EQ(recur_input_blobs_.size(), recur_output_blobs_.size());
+  for (int i = 0; i < recur_input_blobs_.size(); ++i) {
+    const int count = recur_input_blobs_[i]->count();
+    DCHECK_EQ(count, recur_output_blobs_[i]->count());
+    const Dtype* timestep_T_data = recur_output_blobs_[i]->gpu_data();
+    Dtype* timestep_0_data = recur_input_blobs_[i]->mutable_gpu_data();
+    caffe_copy(count, timestep_T_data, timestep_0_data);
+  }
+
+  unrolled_net_->ForwardPrefilled();
+}
+
+INSTANTIATE_LAYER_GPU_FORWARD(RecurrentLayer);
+
+}  // namespace caffe
diff --git a/src/caffe/layers/rnn_layer.cpp b/src/caffe/layers/rnn_layer.cpp
new file mode 100644
index 00000000000..88ec92179cc
--- /dev/null
+++ b/src/caffe/layers/rnn_layer.cpp
@@ -0,0 +1,229 @@
+#include <string>
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/filler.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/sequence_layers.hpp"
+#include "caffe/util/math_functions.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+void RNNLayer<Dtype>::RecurrentInputBlobNames(vector<string>* names) const {
+  names->resize(1);
+  (*names)[0] = "h_0";
+}
+
+template <typename Dtype>
+void RNNLayer<Dtype>::RecurrentOutputBlobNames(vector<string>* names) const {
+  names->resize(1);
+  (*names)[0] = "h_" + this->int_to_str(this->T_);
+}
+
+template <typename Dtype>
+void RNNLayer<Dtype>::RecurrentInputShapes(vector<BlobShape>* shapes) const {
+  const int num_output = this->layer_param_.recurrent_param().num_output();
+  shapes->resize(1);
+  (*shapes)[0].Clear();
+  (*shapes)[0].add_dim(1);  // a single timestep
+  (*shapes)[0].add_dim(this->N_);
+  (*shapes)[0].add_dim(num_output);
+}
+
+template <typename Dtype>
+void RNNLayer<Dtype>::OutputBlobNames(vector<string>* names) const {
+  names->resize(1);
+  (*names)[0] = "o";
+}
+
+template <typename Dtype>
+void RNNLayer<Dtype>::FillUnrolledNet(NetParameter* net_param) const {
+  const int num_output = this->layer_param_.recurrent_param().num_output();
+  CHECK_GT(num_output, 0) << "num_output must be positive";
+  const FillerParameter& weight_filler =
+      this->layer_param_.recurrent_param().weight_filler();
+  const FillerParameter& bias_filler =
+      this->layer_param_.recurrent_param().bias_filler();
+
+  // Add generic LayerParameter's (without bottoms/tops) of layer types we'll
+  // use to save redundant code.
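+  //
+  // hidden_param / biased_hidden_param: InnerProduct prototypes (without and
+  // with a bias term) reused for the W_xh, W_hh, and W_ho transformations;
+  // sum_param, tanh_param, scalar_param, and slice_param: prototypes for the
+  // Eltwise SUM, TanH, Scalar (sequence flushing), and Slice (per-timestep
+  // splitting) layers instantiated below.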
+  LayerParameter hidden_param;
+  hidden_param.set_type("InnerProduct");
+  hidden_param.mutable_inner_product_param()->set_num_output(num_output);
+  hidden_param.mutable_inner_product_param()->set_bias_term(false);
+  hidden_param.mutable_inner_product_param()->set_axis(2);
+  hidden_param.mutable_inner_product_param()->
+      mutable_weight_filler()->CopyFrom(weight_filler);
+
+  LayerParameter biased_hidden_param(hidden_param);
+  biased_hidden_param.mutable_inner_product_param()->set_bias_term(true);
+  biased_hidden_param.mutable_inner_product_param()->
+      mutable_bias_filler()->CopyFrom(bias_filler);
+
+  LayerParameter sum_param;
+  sum_param.set_type("Eltwise");
+  sum_param.mutable_eltwise_param()->set_operation(
+      EltwiseParameter_EltwiseOp_SUM);
+
+  LayerParameter tanh_param;
+  tanh_param.set_type("TanH");
+
+  LayerParameter scalar_param;
+  scalar_param.set_type("Scalar");
+  scalar_param.mutable_scalar_param()->set_axis(0);
+
+  LayerParameter slice_param;
+  slice_param.set_type("Slice");
+  slice_param.mutable_slice_param()->set_axis(0);
+
+  vector<BlobShape> input_shapes;
+  RecurrentInputShapes(&input_shapes);
+  CHECK_EQ(1, input_shapes.size());
+  net_param->add_input("h_0");
+  net_param->add_input_shape()->CopyFrom(input_shapes[0]);
+
+  LayerParameter* cont_slice_param = net_param->add_layer();
+  cont_slice_param->CopyFrom(slice_param);
+  cont_slice_param->set_name("cont_slice");
+  cont_slice_param->add_bottom("cont");
+  cont_slice_param->mutable_slice_param()->set_axis(0);
+
+  // Add layer to transform all timesteps of x to the hidden state dimension.
+  //     W_xh_x = W_xh * x + b_h
+  {
+    LayerParameter* x_transform_param = net_param->add_layer();
+    x_transform_param->CopyFrom(biased_hidden_param);
+    x_transform_param->set_name("x_transform");
+    x_transform_param->add_param()->set_name("W_xh");
+    x_transform_param->add_param()->set_name("b_h");
+    x_transform_param->add_bottom("x");
+    x_transform_param->add_top("W_xh_x");
+  }
+
+  if (this->static_input_) {
+    // Add layer to transform x_static to the hidden state dimension.
+    //     W_xh_x_static = W_xh_static * x_static
+    LayerParameter* x_static_transform_param = net_param->add_layer();
+    x_static_transform_param->CopyFrom(hidden_param);
+    x_static_transform_param->mutable_inner_product_param()->set_axis(1);
+    x_static_transform_param->set_name("W_xh_x_static");
+    x_static_transform_param->add_param()->set_name("W_xh_static");
+    x_static_transform_param->add_bottom("x_static");
+    x_static_transform_param->add_top("W_xh_x_static");
+
+    LayerParameter* reshape_param = net_param->add_layer();
+    reshape_param->set_type("Reshape");
+    BlobShape* new_shape =
+        reshape_param->mutable_reshape_param()->mutable_shape();
+    new_shape->add_dim(1);  // One timestep.
+    new_shape->add_dim(this->N_);
+    new_shape->add_dim(
+        x_static_transform_param->inner_product_param().num_output());
+    reshape_param->set_name("W_xh_x_static_reshape");
+    reshape_param->add_bottom("W_xh_x_static");
+    reshape_param->add_top("W_xh_x_static");
+  }
+
+  LayerParameter* x_slice_param = net_param->add_layer();
+  x_slice_param->CopyFrom(slice_param);
+  x_slice_param->set_name("W_xh_x_slice");
+  x_slice_param->add_bottom("W_xh_x");
+
+  LayerParameter output_concat_layer;
+  output_concat_layer.set_name("o_concat");
+  output_concat_layer.set_type("Concat");
+  output_concat_layer.add_top("o");
+  output_concat_layer.mutable_concat_param()->set_axis(0);
+
+  for (int t = 1; t <= this->T_; ++t) {
+    string tm1s = this->int_to_str(t - 1);
+    string ts = this->int_to_str(t);
+
+    cont_slice_param->add_top("cont_" + ts);
+    x_slice_param->add_top("W_xh_x_" + ts);
+
+    // Add layer to flush the hidden state when beginning a new sequence,
+    // as indicated by cont_t.
+    //     h_conted_{t-1} := cont_t * h_{t-1}
+    //
+    // Normally, cont_t is binary (i.e., 0 or 1), so:
+    //     h_conted_{t-1} := h_{t-1} if cont_t == 1
+    //                       0       otherwise
+    {
+      LayerParameter* cont_h_param = net_param->add_layer();
+      cont_h_param->CopyFrom(scalar_param);
+      cont_h_param->set_name("h_conted_" + tm1s);
+      cont_h_param->add_bottom("h_" + tm1s);
+      cont_h_param->add_bottom("cont_" + ts);
+      cont_h_param->add_top("h_conted_" + tm1s);
+    }
+
+    // Add layer to compute
+    //     W_hh_h_{t-1} := W_hh * h_conted_{t-1}
+    {
+      LayerParameter* w_param = net_param->add_layer();
+      w_param->CopyFrom(hidden_param);
+      w_param->set_name("W_hh_h_" + tm1s);
+      w_param->add_param()->set_name("W_hh");
+      w_param->add_bottom("h_conted_" + tm1s);
+      w_param->add_top("W_hh_h_" + tm1s);
+      w_param->mutable_inner_product_param()->set_axis(2);
+    }
+
+    // Add layers to compute
+    //     h_t := \tanh( W_hh * h_conted_{t-1} + W_xh * x_t + b_h )
+    //          = \tanh( W_hh_h_{t-1} + W_xh_t )
+    {
+      LayerParameter* h_input_sum_param = net_param->add_layer();
+      h_input_sum_param->CopyFrom(sum_param);
+      h_input_sum_param->set_name("h_input_sum_" + ts);
+      h_input_sum_param->add_bottom("W_hh_h_" + tm1s);
+      h_input_sum_param->add_bottom("W_xh_x_" + ts);
+      if (this->static_input_) {
+        h_input_sum_param->add_bottom("W_xh_x_static");
+      }
+      h_input_sum_param->add_top("h_neuron_input_" + ts);
+    }
+    {
+      LayerParameter* h_neuron_param = net_param->add_layer();
+      h_neuron_param->CopyFrom(tanh_param);
+      h_neuron_param->set_name("h_neuron_" + ts);
+      h_neuron_param->add_bottom("h_neuron_input_" + ts);
+      h_neuron_param->add_top("h_" + ts);
+    }
+
+    // Add layer to compute
+    //     W_ho_h_t := W_ho * h_t + b_o
+    {
+      LayerParameter* w_param = net_param->add_layer();
+      w_param->CopyFrom(biased_hidden_param);
+      w_param->set_name("W_ho_h_" + ts);
+      w_param->add_param()->set_name("W_ho");
+      w_param->add_param()->set_name("b_o");
+      w_param->add_bottom("h_" + ts);
+      w_param->add_top("W_ho_h_" + ts);
+      w_param->mutable_inner_product_param()->set_axis(2);
+    }
+
+    // Add layers to compute
+    //     o_t := \tanh( W_ho * h_t + b_o )
+    //          = \tanh( W_ho_h_t )
+    {
+      LayerParameter* o_neuron_param = net_param->add_layer();
+      o_neuron_param->CopyFrom(tanh_param);
+      o_neuron_param->set_name("o_neuron_" + ts);
+      o_neuron_param->add_bottom("W_ho_h_" + ts);
+      o_neuron_param->add_top("o_" + ts);
+    }
+    output_concat_layer.add_bottom("o_" + ts);
+  }  // for (int t = 1; t <= this->T_; ++t)
+
+  net_param->add_layer()->CopyFrom(output_concat_layer);
+}
+
+INSTANTIATE_CLASS(RNNLayer);
+REGISTER_LAYER_CLASS(RNN);
+
+}  // namespace caffe
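For reference, the unrolled architecture that FillUnrolledNet generates looks
like this (layer names as in the code above; an illustration only, not part of
the patch):

    cont_slice (Slice):            cont -> cont_1, ..., cont_T
    x_transform (InnerProduct):    x -> W_xh_x                  [W_xh, b_h]
    W_xh_x_slice (Slice):          W_xh_x -> W_xh_x_1, ..., W_xh_x_T
    for each timestep t = 1..T:
      h_conted_{t-1} (Scalar):     h_{t-1} * cont_t -> h_conted_{t-1}
      W_hh_h_{t-1} (InnerProduct): h_conted_{t-1} -> W_hh_h_{t-1}  [W_hh]
      h_input_sum_t (Eltwise SUM): W_hh_h_{t-1} + W_xh_x_t [+ W_xh_x_static]
      h_neuron_t (TanH):           h_neuron_input_t -> h_t
      W_ho_h_t (InnerProduct):     h_t -> W_ho_h_t              [W_ho, b_o]
      o_neuron_t (TanH):           W_ho_h_t -> o_t
    o_concat (Concat):             o_1, ..., o_T -> o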
diff --git a/src/caffe/layers/scalar_layer.cpp b/src/caffe/layers/scalar_layer.cpp
new file mode 100644
index 00000000000..5a4fac1aaee
--- /dev/null
+++ b/src/caffe/layers/scalar_layer.cpp
@@ -0,0 +1,119 @@
+#include <algorithm>
+#include <vector>
+
+#include "caffe/common_layers.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/util/math_functions.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+void ScalarLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+  // TODO: make ScalarLayer usable in-place.
+  // Currently, in-place computation is broken during Backward with
+  // propagate_down[0] && propagate_down[1], as bottom[0]'s diff is used for
+  // temporary storage of an intermediate result, overwriting top[0]'s diff
+  // if using in-place computation.
+  CHECK_NE(bottom[0], top[0]) << "ScalarLayer cannot be used in-place";
+  axis_ =
+      bottom[0]->CanonicalAxisIndex(this->layer_param_.scalar_param().axis());
+  CHECK_GE(bottom[0]->num_axes(), axis_ + bottom[1]->num_axes())
+      << "bottom[1]'s shape extends past bottom[0]'s shape when applied "
+      << "starting with bottom[0] axis = " << axis_;
+  for (int i = 0; i < bottom[1]->num_axes(); ++i) {
+    CHECK_EQ(bottom[0]->shape(axis_ + i), bottom[1]->shape(i))
+        << "dimension mismatch between bottom[0]->shape(" << axis_ + i
+        << ") and bottom[1]->shape(" << i << ")";
+  }
+  outer_dim_ = bottom[0]->count(0, axis_);
+  scalar_dim_ = bottom[1]->count();
+  inner_dim_ = bottom[0]->count(axis_ + bottom[1]->num_axes());
+  top[0]->ReshapeLike(*bottom[0]);
+  sum_result_.Reshape(vector<int>(1, outer_dim_ * scalar_dim_));
+  const int sum_mult_size = std::max(outer_dim_, inner_dim_);
+  sum_multiplier_.Reshape(vector<int>(1, sum_mult_size));
+  if (sum_multiplier_.cpu_data()[sum_mult_size - 1] != Dtype(1)) {
+    caffe_set(sum_mult_size, Dtype(1), sum_multiplier_.mutable_cpu_data());
+  }
+}
+
+template <typename Dtype>
+void ScalarLayer<Dtype>::Forward_cpu(
+    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->cpu_data();
+  const Dtype* scalar_data = bottom[1]->cpu_data();
+  Dtype* top_data = top[0]->mutable_cpu_data();
+  for (int n = 0; n < outer_dim_; ++n) {
+    for (int d = 0; d < scalar_dim_; ++d) {
+      const Dtype factor = scalar_data[d];
+      caffe_cpu_scale(inner_dim_, factor, bottom_data, top_data);
+      bottom_data += inner_dim_;
+      top_data += inner_dim_;
+    }
+  }
+}
+
+template <typename Dtype>
+void ScalarLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (propagate_down[1]) {
+    const Dtype* top_diff = top[0]->cpu_diff();
+    const Dtype* bottom_data = bottom[0]->cpu_data();
+    // Hack: store big eltwise product in bottom[0] diff, except in the special
+    // case where this layer itself does the eltwise product, in which case we
+    // can store it directly in the scalar diff, and we're done.
+    const bool is_eltwise = (inner_dim_ == 1 && outer_dim_ == 1);
+    Dtype* product = is_eltwise ?
+        bottom[1]->mutable_cpu_diff() : bottom[0]->mutable_cpu_diff();
+    caffe_mul(top[0]->count(), top_diff, bottom_data, product);
+    if (!is_eltwise) {
+      Dtype* sum_result = NULL;
+      if (inner_dim_ == 1) {
+        sum_result = product;
+      } else if (sum_result_.count() == 1) {
+        const Dtype* sum_mult = sum_multiplier_.cpu_data();
+        Dtype* scalar_diff = bottom[1]->mutable_cpu_diff();
+        *scalar_diff = caffe_cpu_dot(inner_dim_, product, sum_mult);
+      } else {
+        const Dtype* sum_mult = sum_multiplier_.cpu_data();
+        sum_result = (outer_dim_ == 1) ?
+            bottom[1]->mutable_cpu_diff() : sum_result_.mutable_cpu_data();
+        caffe_cpu_gemv(CblasNoTrans, sum_result_.count(), inner_dim_,
+                       Dtype(1), product, sum_mult, Dtype(0), sum_result);
+      }
+      if (outer_dim_ != 1) {
+        const Dtype* sum_mult = sum_multiplier_.cpu_data();
+        Dtype* scalar_diff = bottom[1]->mutable_cpu_diff();
+        if (scalar_dim_ == 1) {
+          *scalar_diff = caffe_cpu_dot(outer_dim_, sum_mult, sum_result);
+        } else {
+          caffe_cpu_gemv(CblasTrans, outer_dim_, scalar_dim_,
+                         Dtype(1), sum_result, sum_mult, Dtype(0),
+                         scalar_diff);
+        }
+      }
+    }
+  }
+  if (propagate_down[0]) {
+    const Dtype* top_diff = top[0]->cpu_diff();
+    const Dtype* scalar_data = bottom[1]->cpu_data();
+    Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+    for (int n = 0; n < outer_dim_; ++n) {
+      for (int d = 0; d < scalar_dim_; ++d) {
+        const Dtype factor = scalar_data[d];
+        caffe_cpu_scale(inner_dim_, factor, top_diff, bottom_diff);
+        bottom_diff += inner_dim_;
+        top_diff += inner_dim_;
+      }
+    }
+  }
+}
+
+#ifdef CPU_ONLY
+STUB_GPU(ScalarLayer);
+#endif
+
+INSTANTIATE_CLASS(ScalarLayer);
+REGISTER_LAYER_CLASS(Scalar);
+
+}  // namespace caffe
diff --git a/src/caffe/layers/scalar_layer.cu b/src/caffe/layers/scalar_layer.cu
new file mode 100644
index 00000000000..2711540048a
--- /dev/null
+++ b/src/caffe/layers/scalar_layer.cu
@@ -0,0 +1,86 @@
+#include <cfloat>
+#include <vector>
+
+#include "caffe/common_layers.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/util/math_functions.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+__global__ void ScalarForward(const int n, const Dtype* in,
+    const Dtype* scalars, const int scalar_dim, const int inner_dim,
+    Dtype* out) {
+  CUDA_KERNEL_LOOP(index, n) {
+    const int scalar_index = (index / inner_dim) % scalar_dim;
+    out[index] = in[index] * scalars[scalar_index];
+  }
+}
+
+template <typename Dtype>
+void ScalarLayer<Dtype>::Forward_gpu(
+    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+  const int count = top[0]->count();
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  const Dtype* scalar_data = bottom[1]->gpu_data();
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  ScalarForward<Dtype>  // NOLINT_NEXT_LINE(whitespace/operators)
+      <<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
+      count, bottom_data, scalar_data, scalar_dim_, inner_dim_, top_data);
+}
+
+template <typename Dtype>
+void ScalarLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (propagate_down[1]) {
+    const Dtype* top_diff = top[0]->gpu_diff();
+    const Dtype* bottom_data = bottom[0]->gpu_data();
+    // Hack: store big eltwise product in bottom[0] diff, except in the special
+    // case where this layer itself does the eltwise product, in which case we
+    // can store it directly in the scalar diff, and we're done.
+    const bool is_eltwise = (inner_dim_ == 1 && outer_dim_ == 1);
+    Dtype* product = is_eltwise ?
+        bottom[1]->mutable_gpu_diff() : bottom[0]->mutable_gpu_diff();
+    caffe_gpu_mul(top[0]->count(), top_diff, bottom_data, product);
+    if (!is_eltwise) {
+      Dtype* sum_result = NULL;
+      if (inner_dim_ == 1) {
+        sum_result = product;
+      } else if (sum_result_.count() == 1) {
+        const Dtype* sum_mult = sum_multiplier_.gpu_data();
+        Dtype* scalar_diff = bottom[1]->mutable_cpu_diff();
+        caffe_gpu_dot(inner_dim_, product, sum_mult, scalar_diff);
+      } else {
+        const Dtype* sum_mult = sum_multiplier_.gpu_data();
+        sum_result = (outer_dim_ == 1) ?
+            bottom[1]->mutable_gpu_diff() : sum_result_.mutable_gpu_data();
+        caffe_gpu_gemv(CblasNoTrans, sum_result_.count(), inner_dim_,
+                       Dtype(1), product, sum_mult, Dtype(0), sum_result);
+      }
+      if (outer_dim_ != 1) {
+        const Dtype* sum_mult = sum_multiplier_.gpu_data();
+        if (scalar_dim_ == 1) {
+          Dtype* scalar_diff = bottom[1]->mutable_cpu_diff();
+          caffe_gpu_dot(outer_dim_, sum_mult, sum_result, scalar_diff);
+        } else {
+          Dtype* scalar_diff = bottom[1]->mutable_gpu_diff();
+          caffe_gpu_gemv(CblasTrans, outer_dim_, scalar_dim_,
+                         Dtype(1), sum_result, sum_mult, Dtype(0),
+                         scalar_diff);
+        }
+      }
+    }
+  }
+  if (propagate_down[0]) {
+    const int count = top[0]->count();
+    const Dtype* top_diff = top[0]->gpu_diff();
+    const Dtype* scalar_data = bottom[1]->gpu_data();
+    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+    ScalarForward<Dtype>  // NOLINT_NEXT_LINE(whitespace/operators)
+        <<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
+        count, top_diff, scalar_data, scalar_dim_, inner_dim_, bottom_diff);
+  }
+}
+
+INSTANTIATE_LAYER_GPU_FUNCS(ScalarLayer);
+
+}  // namespace caffe
diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto
index aa299f8660b..7c7d6074184 100644
--- a/src/caffe/proto/caffe.proto
+++ b/src/caffe/proto/caffe.proto
@@ -301,7 +301,7 @@ message ParamSpec {
 // NOTE
 // Update the next available ID when you add a new LayerParameter field.
 //
-// LayerParameter next available layer-specific ID: 139 (last added: tile_param)
+// LayerParameter next available layer-specific ID: 141 (last added: recurrent_param)
 message LayerParameter {
   optional string name = 1; // the layer name
   optional string type = 2; // the layer type
@@ -374,9 +374,11 @@ message LayerParameter {
   optional PowerParameter power_param = 122;
   optional PReLUParameter prelu_param = 131;
   optional PythonParameter python_param = 130;
+  optional RecurrentParameter recurrent_param = 140;
   optional ReductionParameter reduction_param = 136;
   optional ReLUParameter relu_param = 123;
   optional ReshapeParameter reshape_param = 133;
+  optional ScalarParameter scalar_param = 139;
   optional SigmoidParameter sigmoid_param = 124;
   optional SoftmaxParameter softmax_param = 125;
   optional SPPParameter spp_param = 132;
@@ -767,6 +769,19 @@ message PythonParameter {
   optional bool share_in_parallel = 4 [default = false];
 }
 
+// Message that stores parameters used by RecurrentLayer
+message RecurrentParameter {
+  // The dimension of the output (and usually hidden state) representation --
+  // must be explicitly set to non-zero.
+  optional uint32 num_output = 1 [default = 0];
+
+  optional FillerParameter weight_filler = 2; // The filler for the weight
+  optional FillerParameter bias_filler = 3; // The filler for the bias
+
+  // Whether to enable displaying debug_info in the unrolled recurrent net.
+  optional bool debug_info = 4 [default = false];
+}
+
 // Message that stores parameters used by ReductionLayer
 message ReductionParameter {
   enum ReductionOp {
@@ -876,6 +891,23 @@ message ReshapeParameter {
   optional int32 num_axes = 3 [default = -1];
 }
 
+message ScalarParameter {
+  // The first axis of bottom[0] (the first input Blob) along which to apply
+  // bottom[1] (the second input Blob). May be negative to index from the end
+  // (e.g., -1 for the last axis).
+ // + // For example, if bottom[0] is 4D with shape 100x3x224x224, the output + // top[0] will have the same shape, and bottom[1] may have any of the + // following shapes (for the given value of axis): + // (axis == 0 == -4) 100; 100x3; 100x3x224; 100x3x224x224 + // (axis == 1 == -3) 3; 3x224; 3x224x224 + // (axis == 2 == -2) 224; 224x224 + // (axis == 3 == -1) 224 + // Furthermore, bottom[1] may have the empty shape (regardless of the value of + // "axis") -- a literal scalar. + optional int32 axis = 1 [default = 0]; +} + message SigmoidParameter { enum Engine { DEFAULT = 0; diff --git a/src/caffe/test/test_lstm_layer.cpp b/src/caffe/test/test_lstm_layer.cpp new file mode 100644 index 00000000000..1fdc2fd2041 --- /dev/null +++ b/src/caffe/test/test_lstm_layer.cpp @@ -0,0 +1,266 @@ +#include +#include + +#include "gtest/gtest.h" + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/sequence_layers.hpp" + +#include "caffe/test/test_caffe_main.hpp" +#include "caffe/test/test_gradient_check_util.hpp" + +namespace caffe { + +template +class LSTMLayerTest : public MultiDeviceTest { + typedef typename TypeParam::Dtype Dtype; + + protected: + LSTMLayerTest() : num_output_(7) { + blob_bottom_vec_.push_back(&blob_bottom_); + blob_bottom_vec_.push_back(&blob_bottom_flush_); + blob_top_vec_.push_back(&blob_top_); + unit_blob_bottom_vec_.push_back(&unit_blob_bottom_c_prev_); + unit_blob_bottom_vec_.push_back(&unit_blob_bottom_x_); + unit_blob_bottom_vec_.push_back(&unit_blob_bottom_flush_); + unit_blob_top_vec_.push_back(&unit_blob_top_c_); + unit_blob_top_vec_.push_back(&unit_blob_top_h_); + + ReshapeBlobs(1, 3); + + layer_param_.mutable_recurrent_param()->set_num_output(num_output_); + FillerParameter* weight_filler = + layer_param_.mutable_recurrent_param()->mutable_weight_filler(); + weight_filler->set_type("gaussian"); + weight_filler->set_std(0.2); + FillerParameter* bias_filler = + layer_param_.mutable_recurrent_param()->mutable_bias_filler(); + bias_filler->set_type("gaussian"); + bias_filler->set_std(0.1); + + layer_param_.set_phase(TEST); + } + + void ReshapeBlobs(int num_timesteps, int num_instances) { + blob_bottom_.Reshape(num_timesteps, num_instances, 3, 2); + vector shape(2); + shape[0] = num_timesteps; + shape[1] = num_instances; + blob_bottom_flush_.Reshape(shape); + shape.push_back(num_output_); + + shape[0] = 1; shape[1] = num_instances; shape[2] = 4 * num_output_; + unit_blob_bottom_x_.Reshape(shape); + shape[0] = 1; shape[1] = num_instances; shape[2] = num_output_; + unit_blob_bottom_c_prev_.Reshape(shape); + shape.resize(2); + shape[0] = 1; shape[1] = num_instances; + unit_blob_bottom_flush_.Reshape(shape); + + FillerParameter filler_param; + filler_param.set_min(-1); + filler_param.set_max(1); + UniformFiller filler(filler_param); + filler.Fill(&blob_bottom_); + filler.Fill(&unit_blob_bottom_c_prev_); + filler.Fill(&unit_blob_bottom_x_); + } + + int num_output_; + LayerParameter layer_param_; + Blob blob_bottom_; + Blob blob_bottom_flush_; + Blob blob_top_; + vector*> blob_bottom_vec_; + vector*> blob_top_vec_; + + Blob unit_blob_bottom_flush_; + Blob unit_blob_bottom_c_prev_; + Blob unit_blob_bottom_x_; + Blob unit_blob_top_c_; + Blob unit_blob_top_h_; + vector*> unit_blob_bottom_vec_; + vector*> unit_blob_top_vec_; +}; + +TYPED_TEST_CASE(LSTMLayerTest, TestDtypesAndDevices); + +TYPED_TEST(LSTMLayerTest, TestSetUp) { + typedef typename TypeParam::Dtype Dtype; + LSTMLayer layer(this->layer_param_); + 
layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + vector expected_top_shape = this->blob_bottom_.shape(); + expected_top_shape.resize(3); + expected_top_shape[2] = this->num_output_; + EXPECT_TRUE(this->blob_top_.shape() == expected_top_shape); +} + +TYPED_TEST(LSTMLayerTest, TestForward) { + typedef typename TypeParam::Dtype Dtype; + const int kNumTimesteps = 3; + const int num = this->blob_bottom_.shape(1); + this->ReshapeBlobs(kNumTimesteps, num); + + // Fill the flush blob with <0, 1, 1, ..., 1>, + // indicating a sequence that begins at the first timestep + // then continues for the rest of the sequence. + for (int t = 0; t < kNumTimesteps; ++t) { + for (int n = 0; n < num; ++n) { + this->blob_bottom_flush_.mutable_cpu_data()[t * num + n] = t > 0; + } + } + + // Process the full sequence in a single batch. + FillerParameter filler_param; + filler_param.set_mean(0); + filler_param.set_std(1); + GaussianFiller sequence_filler(filler_param); + Caffe::set_random_seed(1); + sequence_filler.Fill(&this->blob_bottom_); + shared_ptr > layer(new LSTMLayer(this->layer_param_)); + Caffe::set_random_seed(1701); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + LOG(INFO) << "Calling forward for full sequence LSTM"; + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + + // Copy the inputs and outputs to reuse/check them later. + Blob bottom_copy(this->blob_bottom_.shape()); + bottom_copy.CopyFrom(this->blob_bottom_); + Blob top_copy(this->blob_top_.shape()); + top_copy.CopyFrom(this->blob_top_); + + // Process the batch one timestep at a time; + // check that we get the same result. + this->ReshapeBlobs(1, num); + layer.reset(new LSTMLayer(this->layer_param_)); + Caffe::set_random_seed(1701); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + const int bottom_count = this->blob_bottom_.count(); + const int top_count = this->blob_top_.count(); + const Dtype kEpsilon = 1e-5; + for (int t = 0; t < kNumTimesteps; ++t) { + caffe_copy(bottom_count, bottom_copy.cpu_data() + t * bottom_count, + this->blob_bottom_.mutable_cpu_data()); + for (int n = 0; n < num; ++n) { + this->blob_bottom_flush_.mutable_cpu_data()[n] = t > 0; + } + LOG(INFO) << "Calling forward for LSTM timestep " << t; + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + for (int i = 0; i < top_count; ++i) { + ASSERT_LT(t * top_count + i, top_copy.count()); + EXPECT_NEAR(this->blob_top_.cpu_data()[i], + top_copy.cpu_data()[t * top_count + i], kEpsilon) + << "t = " << t << "; i = " << i; + } + } + + // Process the batch one timestep at a time with all flush blobs set to 0. + // Check that we get a different result, except in the first timestep. 
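+  // With flush == 0 everywhere, the previous hidden and cell states are
+  // zeroed before every timestep, so each step behaves like the start of a
+  // new sequence; only t == 0 (where the reference run also had flush == 0)
+  // can reproduce the full-sequence outputs.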
+ Caffe::set_random_seed(1701); + layer.reset(new LSTMLayer(this->layer_param_)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + for (int t = 0; t < kNumTimesteps; ++t) { + caffe_copy(bottom_count, bottom_copy.cpu_data() + t * bottom_count, + this->blob_bottom_.mutable_cpu_data()); + for (int n = 0; n < num; ++n) { + this->blob_bottom_flush_.mutable_cpu_data()[n] = 0; + } + LOG(INFO) << "Calling forward for LSTM timestep " << t; + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + for (int i = 0; i < top_count; ++i) { + if (t == 0) { + EXPECT_NEAR(this->blob_top_.cpu_data()[i], + top_copy.cpu_data()[t * top_count + i], kEpsilon) + << "t = " << t << "; i = " << i; + } else { + EXPECT_NE(this->blob_top_.cpu_data()[i], + top_copy.cpu_data()[t * top_count + i]) + << "t = " << t << "; i = " << i; + } + } + } +} + +TYPED_TEST(LSTMLayerTest, TestLSTMUnitSetUp) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + LSTMUnitLayer layer(layer_param); + layer.SetUp(this->unit_blob_bottom_vec_, this->unit_blob_top_vec_); + const int num_axes = this->unit_blob_bottom_c_prev_.num_axes(); + ASSERT_EQ(num_axes, this->unit_blob_top_c_.num_axes()); + ASSERT_EQ(num_axes, this->unit_blob_top_h_.num_axes()); + for (int i = 0; i < num_axes; ++i) { + EXPECT_EQ(this->unit_blob_bottom_c_prev_.shape(i), + this->unit_blob_top_c_.shape(i)); + EXPECT_EQ(this->unit_blob_bottom_c_prev_.shape(i), + this->unit_blob_top_h_.shape(i)); + } +} + +TYPED_TEST(LSTMLayerTest, TestLSTMUnitGradient) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + LSTMUnitLayer layer(layer_param); + GradientChecker checker(1e-2, 1e-3); + Dtype* flush_data = this->blob_bottom_flush_.mutable_cpu_data(); + flush_data[0] = 0; + flush_data[1] = 0; + flush_data[2] = 0; + checker.CheckGradientExhaustive(&layer, this->unit_blob_bottom_vec_, + this->unit_blob_top_vec_, 0); + checker.CheckGradientExhaustive(&layer, this->unit_blob_bottom_vec_, + this->unit_blob_top_vec_, 1); +} + +TYPED_TEST(LSTMLayerTest, TestLSTMUnitGradientNonZeroFlush) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + LSTMUnitLayer layer(layer_param); + GradientChecker checker(1e-2, 1e-3); + Dtype* flush_data = this->blob_bottom_flush_.mutable_cpu_data(); + flush_data[0] = 1; + flush_data[1] = 0; + flush_data[2] = 1; + checker.CheckGradientExhaustive(&layer, this->unit_blob_bottom_vec_, + this->unit_blob_top_vec_, 0); + checker.CheckGradientExhaustive(&layer, this->unit_blob_bottom_vec_, + this->unit_blob_top_vec_, 1); +} + +TYPED_TEST(LSTMLayerTest, TestGradient) { + typedef typename TypeParam::Dtype Dtype; + LSTMLayer layer(this->layer_param_); + GradientChecker checker(1e-2, 1e-3); + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 0); +} + +TYPED_TEST(LSTMLayerTest, TestGradientNonZeroFlush) { + typedef typename TypeParam::Dtype Dtype; + LSTMLayer layer(this->layer_param_); + GradientChecker checker(1e-2, 1e-3); + for (int i = 0; i < this->blob_bottom_flush_.count(); ++i) { + this->blob_bottom_flush_.mutable_cpu_data()[i] = i > 2; + } + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 0); +} + +TYPED_TEST(LSTMLayerTest, TestGradientNonZeroFlushBufferSize2) { + typedef typename TypeParam::Dtype Dtype; + this->ReshapeBlobs(2, 2); + FillerParameter filler_param; + UniformFiller filler(filler_param); + filler.Fill(&this->blob_bottom_); + LSTMLayer layer(this->layer_param_); + GradientChecker checker(1e-2, 
1e-3); + for (int i = 0; i < this->blob_bottom_flush_.count(); ++i) { + this->blob_bottom_flush_.mutable_cpu_data()[i] = i > 2; + } + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 0); +} + +} // namespace caffe diff --git a/src/caffe/test/test_rnn_layer.cpp b/src/caffe/test/test_rnn_layer.cpp new file mode 100644 index 00000000000..eab9269ce77 --- /dev/null +++ b/src/caffe/test/test_rnn_layer.cpp @@ -0,0 +1,196 @@ +#include +#include + +#include "gtest/gtest.h" + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/sequence_layers.hpp" + +#include "caffe/test/test_caffe_main.hpp" +#include "caffe/test/test_gradient_check_util.hpp" + +namespace caffe { + +template +class RNNLayerTest : public MultiDeviceTest { + typedef typename TypeParam::Dtype Dtype; + + protected: + RNNLayerTest() : num_output_(7) { + blob_bottom_vec_.push_back(&blob_bottom_); + blob_bottom_vec_.push_back(&blob_bottom_flush_); + blob_top_vec_.push_back(&blob_top_); + + ReshapeBlobs(1, 3); + + layer_param_.mutable_recurrent_param()->set_num_output(num_output_); + FillerParameter* weight_filler = + layer_param_.mutable_recurrent_param()->mutable_weight_filler(); + weight_filler->set_type("gaussian"); + weight_filler->set_std(0.2); + FillerParameter* bias_filler = + layer_param_.mutable_recurrent_param()->mutable_bias_filler(); + bias_filler->set_type("gaussian"); + bias_filler->set_std(0.1); + + layer_param_.set_phase(TEST); + } + + void ReshapeBlobs(int num_timesteps, int num_instances) { + blob_bottom_.Reshape(num_timesteps, num_instances, 3, 2); + vector shape(2); + shape[0] = num_timesteps; + shape[1] = num_instances; + blob_bottom_flush_.Reshape(shape); + + FillerParameter filler_param; + filler_param.set_min(-1); + filler_param.set_max(1); + UniformFiller filler(filler_param); + filler.Fill(&blob_bottom_); + } + + int num_output_; + LayerParameter layer_param_; + Blob blob_bottom_; + Blob blob_bottom_flush_; + Blob blob_top_; + vector*> blob_bottom_vec_; + vector*> blob_top_vec_; +}; + +TYPED_TEST_CASE(RNNLayerTest, TestDtypesAndDevices); + +TYPED_TEST(RNNLayerTest, TestSetUp) { + typedef typename TypeParam::Dtype Dtype; + RNNLayer layer(this->layer_param_); + layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + vector expected_top_shape = this->blob_bottom_.shape(); + expected_top_shape.resize(3); + expected_top_shape[2] = this->num_output_; + EXPECT_TRUE(this->blob_top_.shape() == expected_top_shape); +} + +TYPED_TEST(RNNLayerTest, TestForward) { + typedef typename TypeParam::Dtype Dtype; + const int kNumTimesteps = 3; + const int num = this->blob_bottom_.shape(1); + this->ReshapeBlobs(kNumTimesteps, num); + + // Fill the flush blob with <0, 1, 1, ..., 1>, + // indicating a sequence that begins at the first timestep + // then continues for the rest of the sequence. + for (int t = 0; t < kNumTimesteps; ++t) { + for (int n = 0; n < num; ++n) { + this->blob_bottom_flush_.mutable_cpu_data()[t * num + n] = t > 0; + } + } + + // Process the full sequence in a single batch. 
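+  // (Same protocol as the LSTM test above: one full-sequence pass provides
+  // the reference outputs, which the stepwise passes below are checked
+  // against.)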
+ FillerParameter filler_param; + filler_param.set_mean(0); + filler_param.set_std(1); + GaussianFiller sequence_filler(filler_param); + sequence_filler.Fill(&this->blob_bottom_); + shared_ptr > layer(new RNNLayer(this->layer_param_)); + Caffe::set_random_seed(1701); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + LOG(INFO) << "Calling forward for full sequence RNN"; + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + + // Copy the inputs and outputs to reuse/check them later. + Blob bottom_copy(this->blob_bottom_.shape()); + bottom_copy.CopyFrom(this->blob_bottom_); + Blob top_copy(this->blob_top_.shape()); + top_copy.CopyFrom(this->blob_top_); + + // Process the batch one timestep at a time; + // check that we get the same result. + this->ReshapeBlobs(1, num); + layer.reset(new RNNLayer(this->layer_param_)); + Caffe::set_random_seed(1701); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + const int bottom_count = this->blob_bottom_.count(); + const int top_count = this->blob_top_.count(); + const Dtype kEpsilon = 1e-5; + for (int t = 0; t < kNumTimesteps; ++t) { + caffe_copy(bottom_count, bottom_copy.cpu_data() + t * bottom_count, + this->blob_bottom_.mutable_cpu_data()); + for (int n = 0; n < num; ++n) { + this->blob_bottom_flush_.mutable_cpu_data()[n] = t > 0; + } + LOG(INFO) << "Calling forward for RNN timestep " << t; + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + for (int i = 0; i < top_count; ++i) { + ASSERT_LT(t * top_count + i, top_copy.count()); + EXPECT_NEAR(this->blob_top_.cpu_data()[i], + top_copy.cpu_data()[t * top_count + i], kEpsilon) + << "t = " << t << "; i = " << i; + } + } + + // Process the batch one timestep at a time with all flush blobs set to 0. + // Check that we get a different result, except in the first timestep. 
+ Caffe::set_random_seed(1701); + layer.reset(new RNNLayer(this->layer_param_)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + for (int t = 0; t < kNumTimesteps; ++t) { + caffe_copy(bottom_count, bottom_copy.cpu_data() + t * bottom_count, + this->blob_bottom_.mutable_cpu_data()); + for (int n = 0; n < num; ++n) { + this->blob_bottom_flush_.mutable_cpu_data()[n] = 0; + } + LOG(INFO) << "Calling forward for RNN timestep " << t; + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + for (int i = 0; i < top_count; ++i) { + if (t == 0) { + EXPECT_NEAR(this->blob_top_.cpu_data()[i], + top_copy.cpu_data()[t * top_count + i], kEpsilon) + << "t = " << t << "; i = " << i; + } else { + EXPECT_NE(this->blob_top_.cpu_data()[i], + top_copy.cpu_data()[t * top_count + i]) + << "t = " << t << "; i = " << i; + } + } + } +} + +TYPED_TEST(RNNLayerTest, TestGradient) { + typedef typename TypeParam::Dtype Dtype; + RNNLayer layer(this->layer_param_); + GradientChecker checker(1e-2, 1e-3); + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 0); +} + +TYPED_TEST(RNNLayerTest, TestGradientNonZeroFlush) { + typedef typename TypeParam::Dtype Dtype; + RNNLayer layer(this->layer_param_); + GradientChecker checker(1e-2, 1e-3); + for (int i = 0; i < this->blob_bottom_flush_.count(); ++i) { + this->blob_bottom_flush_.mutable_cpu_data()[i] = i > 2; + } + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 0); +} + +TYPED_TEST(RNNLayerTest, TestGradientNonZeroFlushBufferSize2) { + typedef typename TypeParam::Dtype Dtype; + this->ReshapeBlobs(2, 2); + // fill the values + FillerParameter filler_param; + UniformFiller filler(filler_param); + filler.Fill(&this->blob_bottom_); + RNNLayer layer(this->layer_param_); + GradientChecker checker(1e-2, 1e-3); + for (int i = 0; i < this->blob_bottom_flush_.count(); ++i) { + this->blob_bottom_flush_.mutable_cpu_data()[i] = i > 2; + } + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 0); +} + +} // namespace caffe diff --git a/src/caffe/test/test_scalar_layer.cpp b/src/caffe/test/test_scalar_layer.cpp new file mode 100644 index 00000000000..d823f1ef3ce --- /dev/null +++ b/src/caffe/test/test_scalar_layer.cpp @@ -0,0 +1,258 @@ +#include +#include + +#include "gtest/gtest.h" + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/vision_layers.hpp" + +#include "caffe/test/test_caffe_main.hpp" +#include "caffe/test/test_gradient_check_util.hpp" + +namespace caffe { + +template +class ScalarLayerTest : public MultiDeviceTest { + typedef typename TypeParam::Dtype Dtype; + + protected: + ScalarLayerTest() + : blob_bottom_(new Blob(2, 3, 4, 5)), + blob_bottom_eltwise_(new Blob(2, 3, 4, 5)), + blob_bottom_broadcast_0_(new Blob()), + blob_bottom_broadcast_1_(new Blob()), + blob_bottom_broadcast_2_(new Blob()), + blob_bottom_scalar_(new Blob(vector())), + blob_top_(new Blob()) { + Caffe::set_random_seed(1701); + vector broadcast_shape(2); + broadcast_shape[0] = 2; broadcast_shape[1] = 3; + this->blob_bottom_broadcast_0_->Reshape(broadcast_shape); + broadcast_shape[0] = 3; broadcast_shape[1] = 4; + this->blob_bottom_broadcast_1_->Reshape(broadcast_shape); + broadcast_shape[0] = 4; broadcast_shape[1] = 5; + this->blob_bottom_broadcast_2_->Reshape(broadcast_shape); + FillerParameter filler_param; + filler_param.set_min(1); + filler_param.set_max(10); + UniformFiller filler(filler_param); + 
filler.Fill(this->blob_bottom_); + filler.Fill(this->blob_bottom_eltwise_); + filler.Fill(this->blob_bottom_broadcast_0_); + filler.Fill(this->blob_bottom_broadcast_1_); + filler.Fill(this->blob_bottom_broadcast_2_); + filler.Fill(this->blob_bottom_scalar_); + blob_bottom_vec_.push_back(blob_bottom_); + blob_top_vec_.push_back(blob_top_); + } + virtual ~ScalarLayerTest() { + delete blob_bottom_; + delete blob_bottom_eltwise_; + delete blob_bottom_broadcast_0_; + delete blob_bottom_broadcast_1_; + delete blob_bottom_broadcast_2_; + delete blob_bottom_scalar_; + delete blob_top_; + } + Blob* const blob_bottom_; + Blob* const blob_bottom_eltwise_; + Blob* const blob_bottom_broadcast_0_; + Blob* const blob_bottom_broadcast_1_; + Blob* const blob_bottom_broadcast_2_; + Blob* const blob_bottom_scalar_; + Blob* const blob_top_; + vector*> blob_bottom_vec_; + vector*> blob_top_vec_; +}; + +TYPED_TEST_CASE(ScalarLayerTest, TestDtypesAndDevices); + +TYPED_TEST(ScalarLayerTest, TestForwardEltwise) { + typedef typename TypeParam::Dtype Dtype; + this->blob_bottom_vec_.push_back(this->blob_bottom_eltwise_); + LayerParameter layer_param; + shared_ptr > layer( + new ScalarLayer(layer_param)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape()); + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + const Dtype* data = this->blob_top_->cpu_data(); + const int count = this->blob_top_->count(); + const Dtype* in_data_a = this->blob_bottom_->cpu_data(); + const Dtype* in_data_b = this->blob_bottom_eltwise_->cpu_data(); + for (int i = 0; i < count; ++i) { + EXPECT_EQ(data[i], in_data_a[i] * in_data_b[i]); + } +} + +TYPED_TEST(ScalarLayerTest, TestForwardBroadcastBegin) { + typedef typename TypeParam::Dtype Dtype; + this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_0_); + LayerParameter layer_param; + shared_ptr > layer( + new ScalarLayer(layer_param)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape()); + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + for (int n = 0; n < this->blob_bottom_->num(); ++n) { + for (int c = 0; c < this->blob_bottom_->channels(); ++c) { + for (int h = 0; h < this->blob_bottom_->height(); ++h) { + for (int w = 0; w < this->blob_bottom_->width(); ++w) { + EXPECT_EQ(this->blob_top_->data_at(n, c, h, w), + this->blob_bottom_->data_at(n, c, h, w) * + this->blob_bottom_broadcast_0_->data_at(n, c, 0, 0)); + } + } + } + } +} + +TYPED_TEST(ScalarLayerTest, TestForwardBroadcastMiddle) { + typedef typename TypeParam::Dtype Dtype; + this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_1_); + LayerParameter layer_param; + layer_param.mutable_scalar_param()->set_axis(1); + shared_ptr > layer( + new ScalarLayer(layer_param)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape()); + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + for (int n = 0; n < this->blob_bottom_->num(); ++n) { + for (int c = 0; c < this->blob_bottom_->channels(); ++c) { + for (int h = 0; h < this->blob_bottom_->height(); ++h) { + for (int w = 0; w < this->blob_bottom_->width(); ++w) { + EXPECT_EQ(this->blob_top_->data_at(n, c, h, w), + this->blob_bottom_->data_at(n, c, h, w) * + this->blob_bottom_broadcast_1_->data_at(c, h, 0, 0)); + } + } + } + } +} + +TYPED_TEST(ScalarLayerTest, TestForwardBroadcastEnd) { + typedef typename TypeParam::Dtype 
Dtype; + this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_2_); + LayerParameter layer_param; + layer_param.mutable_scalar_param()->set_axis(2); + shared_ptr > layer( + new ScalarLayer(layer_param)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape()); + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + for (int n = 0; n < this->blob_bottom_->num(); ++n) { + for (int c = 0; c < this->blob_bottom_->channels(); ++c) { + for (int h = 0; h < this->blob_bottom_->height(); ++h) { + for (int w = 0; w < this->blob_bottom_->width(); ++w) { + EXPECT_EQ(this->blob_top_->data_at(n, c, h, w), + this->blob_bottom_->data_at(n, c, h, w) * + this->blob_bottom_broadcast_2_->data_at(h, w, 0, 0)); + } + } + } + } +} + +TYPED_TEST(ScalarLayerTest, TestForwardScalar) { + typedef typename TypeParam::Dtype Dtype; + this->blob_bottom_vec_.push_back(this->blob_bottom_scalar_); + LayerParameter layer_param; + shared_ptr > layer( + new ScalarLayer(layer_param)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape()); + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + const Dtype* data = this->blob_top_->cpu_data(); + const int count = this->blob_top_->count(); + const Dtype* in_data = this->blob_bottom_->cpu_data(); + const Dtype scalar = *this->blob_bottom_scalar_->cpu_data(); + for (int i = 0; i < count; ++i) { + EXPECT_EQ(data[i], in_data[i] * scalar); + } +} + +TYPED_TEST(ScalarLayerTest, TestForwardScalarAxis2) { + typedef typename TypeParam::Dtype Dtype; + this->blob_bottom_vec_.push_back(this->blob_bottom_scalar_); + LayerParameter layer_param; + layer_param.mutable_scalar_param()->set_axis(2); + shared_ptr > layer( + new ScalarLayer(layer_param)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape()); + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + const Dtype* data = this->blob_top_->cpu_data(); + const int count = this->blob_top_->count(); + const Dtype* in_data = this->blob_bottom_->cpu_data(); + const Dtype scalar = *this->blob_bottom_scalar_->cpu_data(); + for (int i = 0; i < count; ++i) { + EXPECT_EQ(data[i], in_data[i] * scalar); + } +} + +TYPED_TEST(ScalarLayerTest, TestGradientEltwise) { + typedef typename TypeParam::Dtype Dtype; + this->blob_bottom_vec_.push_back(this->blob_bottom_eltwise_); + LayerParameter layer_param; + ScalarLayer layer(layer_param); + GradientChecker checker(1e-2, 1e-3); + checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_, + this->blob_top_vec_); +} + +TYPED_TEST(ScalarLayerTest, TestGradientBroadcastBegin) { + typedef typename TypeParam::Dtype Dtype; + this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_0_); + LayerParameter layer_param; + ScalarLayer layer(layer_param); + GradientChecker checker(1e-2, 1e-3); + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_); +} + +TYPED_TEST(ScalarLayerTest, TestGradientBroadcastMiddle) { + typedef typename TypeParam::Dtype Dtype; + this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_1_); + LayerParameter layer_param; + layer_param.mutable_scalar_param()->set_axis(1); + ScalarLayer layer(layer_param); + GradientChecker checker(1e-2, 1e-3); + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_); +} + +TYPED_TEST(ScalarLayerTest, TestGradientBroadcastEnd) { + typedef 
typename TypeParam::Dtype Dtype; + this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_2_); + LayerParameter layer_param; + layer_param.mutable_scalar_param()->set_axis(2); + ScalarLayer layer(layer_param); + GradientChecker checker(1e-2, 1e-3); + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_); +} + +TYPED_TEST(ScalarLayerTest, TestGradientScalar) { + typedef typename TypeParam::Dtype Dtype; + this->blob_bottom_vec_.push_back(this->blob_bottom_scalar_); + LayerParameter layer_param; + ScalarLayer layer(layer_param); + GradientChecker checker(1e-2, 1e-3); + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_); +} + +TYPED_TEST(ScalarLayerTest, TestGradientScalarAxis2) { + typedef typename TypeParam::Dtype Dtype; + this->blob_bottom_vec_.push_back(this->blob_bottom_scalar_); + LayerParameter layer_param; + layer_param.mutable_scalar_param()->set_axis(2); + ScalarLayer layer(layer_param); + GradientChecker checker(1e-2, 1e-3); + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_); +} + +} // namespace caffe
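To make the intended usage concrete, here is a minimal sketch of driving the
new recurrent layers through the layer factory. It is illustrative only and
not part of the patch: the main() wrapper, shapes, dummy data, and filler
choice are invented for the example; everything else follows the layer
contracts established above.

    // Illustrative sketch only -- assumes Caffe is built with this patch.
    #include <vector>

    #include "caffe/blob.hpp"
    #include "caffe/common.hpp"
    #include "caffe/layer.hpp"
    #include "caffe/layer_factory.hpp"
    #include "caffe/proto/caffe.pb.h"
    #include "caffe/util/math_functions.hpp"

    using namespace caffe;  // brevity only

    int main() {
      const int T = 4, N = 2, D = 3;  // timesteps, streams, input dimension
      vector<int> x_shape(3);
      x_shape[0] = T; x_shape[1] = N; x_shape[2] = D;
      vector<int> cont_shape(2);
      cont_shape[0] = T; cont_shape[1] = N;
      Blob<float> x(x_shape), cont(cont_shape), h;
      caffe_set(x.count(), 1.0f, x.mutable_cpu_data());  // dummy input
      // cont is 0 at the first timestep of each sequence, 1 afterwards.
      for (int t = 0; t < T; ++t) {
        for (int n = 0; n < N; ++n) {
          cont.mutable_cpu_data()[t * N + n] = (t > 0);
        }
      }

      LayerParameter param;
      param.set_type("LSTM");  // or "RNN"
      param.mutable_recurrent_param()->set_num_output(5);
      param.mutable_recurrent_param()->
          mutable_weight_filler()->set_type("uniform");
      shared_ptr<Layer<float> > lstm =
          LayerRegistry<float>::CreateLayer(param);

      vector<Blob<float>*> bottom, top;
      bottom.push_back(&x);
      bottom.push_back(&cont);
      top.push_back(&h);
      lstm->SetUp(bottom, top);    // h becomes T x N x 5
      lstm->Forward(bottom, top);  // h now holds the hidden-state sequence
      return 0;
    }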