From 2f869e7426e47d5123ee399704ff386bfdee8387 Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Sat, 24 Jan 2015 18:27:15 -0800 Subject: [PATCH 01/86] clarify draw_net.py usage: net prototxt, not caffemodel --- python/draw_net.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/draw_net.py b/python/draw_net.py index ba488294275..abf701572a2 100755 --- a/python/draw_net.py +++ b/python/draw_net.py @@ -11,7 +11,7 @@ def main(argv): if len(argv) != 3: - print 'Usage: %s input_net_proto_file output_image_file' % \ + print 'Usage: %s input_net_prototxt output_image_file' % \ os.path.basename(sys.argv[0]) else: net = caffe_pb2.NetParameter() From 61c63f6d1ed4cd6a3c4b4d9229497fc89c5ef662 Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Sat, 24 Jan 2015 18:28:46 -0800 Subject: [PATCH 02/86] [docs] ask install + hardware questions on caffe-users --- docs/installation.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/installation.md b/docs/installation.md index a42ef5b309c..16e319b4392 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -11,6 +11,8 @@ We have installed Caffe on Ubuntu 14.04, Ubuntu 12.04, OS X 10.9, and OS X 10.8. - [Compilation](#compilation) - [Hardware questions](#hardware_questions) +Ask installation questions on the [caffe-users](https://groups.google.com/forum/#!forum/caffe-users) mailing list. + ## Prerequisites Caffe depends on several software packages. @@ -299,4 +301,4 @@ As a workaround, if you are using Ubuntu 12.04 you can try the following steps t Once installed, check your times against our [reference performance numbers](performance_hardware.html) to make sure everything is configured properly. -Refer to the project's issue tracker for [hardware/compatibility](https://github.com/BVLC/caffe/issues?labels=hardware%2Fcompatibility&page=1&state=open). +Ask hardware questions on the [caffe-users](https://groups.google.com/forum/#!forum/caffe-users) mailing list. From 4cc8195d82f13391545b90c9c61baaca39b6df84 Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Thu, 29 Jan 2015 09:26:12 -0800 Subject: [PATCH 03/86] [docs] send API link to class list --- docs/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/index.md b/docs/index.md index ccc8f750eef..bf1d9c3c78b 100644 --- a/docs/index.md +++ b/docs/index.md @@ -46,7 +46,7 @@ Tested on Ubuntu, Red Hat, OS X. BVLC suggests a standard distribution format for Caffe models, and provides trained models. * [Developing & Contributing](/development.html)
Guidelines for development and contributing to Caffe. -* [API Documentation](/doxygen/)
+* [API Documentation](/doxygen/annotated.html)
Developer documentation automagically generated from code comments. ### Examples From 1f7c3dea034ef19acd1addf5aa8f4c2b94bc358c Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Thu, 29 Jan 2015 10:53:44 -0800 Subject: [PATCH 04/86] [docs] add check mode hint to CPU-only mode error --- include/caffe/util/device_alternate.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/caffe/util/device_alternate.hpp b/include/caffe/util/device_alternate.hpp index 3df28a49ac3..1a33b947991 100644 --- a/include/caffe/util/device_alternate.hpp +++ b/include/caffe/util/device_alternate.hpp @@ -7,7 +7,7 @@ // Stub out GPU calls as unavailable. -#define NO_GPU LOG(FATAL) << "CPU-only Mode: cannot make GPU call." +#define NO_GPU LOG(FATAL) << "Cannot make GPU call in CPU-only mode: check mode setting." #define STUB_GPU(classname) \ template \ From 8b9647223346a2528804cb5bc8943f2065846d7d Mon Sep 17 00:00:00 2001 From: Andre Ambrosio Boechat Date: Fri, 30 Jan 2015 13:57:31 -0200 Subject: [PATCH 05/86] Brief explanation of SLICE layer's attributes * A sample code was added. * `slice_dim` and `slice_point` attributes were explained. --- docs/tutorial/layers.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/docs/tutorial/layers.md b/docs/tutorial/layers.md index 5f8f519cdc4..34bb48050e8 100644 --- a/docs/tutorial/layers.md +++ b/docs/tutorial/layers.md @@ -451,6 +451,26 @@ The `CONCAT` layer is a utility layer that concatenates its multiple input blobs The `SLICE` layer is a utility layer that slices an input layer to multiple output layers along a given dimension (currently num or channel only) with given slice indices. +* Sample + + layers { + name: "slicer_label" + type: SLICE + bottom: "label" + ## Example of label with a shape N x 3 x 1 x 1 + top: "label1" + top: "label2" + top: "label3" + slice_param { + slice_dim: 1 + slice_point: 1 + slice_point: 2 + } + } + +`slice_dim` indicates the target dimension and can assume only two values: 0 for num or 1 for channel; `slice_point` indicates indexes in the selected dimension (the number of indexes must be equal to the number of top blobs minus one). + + #### Elementwise Operations `ELTWISE` From 75d0e16be912a8dd23eddd8756ee0d278c66d6ab Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Fri, 30 Jan 2015 11:25:31 -0800 Subject: [PATCH 06/86] lint 1f7c3de --- include/caffe/util/device_alternate.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/caffe/util/device_alternate.hpp b/include/caffe/util/device_alternate.hpp index 1a33b947991..4d731e26199 100644 --- a/include/caffe/util/device_alternate.hpp +++ b/include/caffe/util/device_alternate.hpp @@ -7,7 +7,7 @@ // Stub out GPU calls as unavailable. -#define NO_GPU LOG(FATAL) << "Cannot make GPU call in CPU-only mode: check mode setting." +#define NO_GPU LOG(FATAL) << "Cannot use GPU in CPU-only Caffe: check mode." 
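// Illustrative note: the STUB_GPU* macros below expand to Forward_gpu /
// Backward_gpu definitions whose bodies are simply NO_GPU, so a CPU_ONLY
// build aborts with the message above if a GPU code path is ever reached.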
#define STUB_GPU(classname) \ template \ From 1e0d49a39d1e38c8de2a4c24ab0bff3a71da21ff Mon Sep 17 00:00:00 2001 From: Brandon Amos Date: Mon, 16 Feb 2015 15:09:24 -0500 Subject: [PATCH 07/86] Correct 'epochs' to 'iterations' See https://github.com/BVLC/caffe/blob/master/models/bvlc_reference_caffenet/solver.prototxt --- examples/imagenet/readme.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/imagenet/readme.md b/examples/imagenet/readme.md index 41384f9475b..c2dd62ec963 100644 --- a/examples/imagenet/readme.md +++ b/examples/imagenet/readme.md @@ -67,7 +67,7 @@ We will also lay out a protocol buffer for running the solver. Let's make a few * We will run in batches of 256, and run a total of 450,000 iterations (about 90 epochs). * For every 1,000 iterations, we test the learned net on the validation data. * We set the initial learning rate to 0.01, and decrease it every 100,000 iterations (about 20 epochs). -* Information will be displayed every 20 epochs. +* Information will be displayed every 20 iterations. * The network will be trained with momentum 0.9 and a weight decay of 0.0005. * For every 10,000 iterations, we will take a snapshot of the current status. From af01b9c7354b36620881b0d2e608df83ebcedfd5 Mon Sep 17 00:00:00 2001 From: e3 Date: Thu, 19 Feb 2015 22:07:27 -0800 Subject: [PATCH 08/86] Updated the path for get_ilsvrc_aux.sh to match what is found in the current project --- examples/imagenet/readme.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/imagenet/readme.md b/examples/imagenet/readme.md index c2dd62ec963..a6bdf49ca4d 100644 --- a/examples/imagenet/readme.md +++ b/examples/imagenet/readme.md @@ -26,7 +26,7 @@ We assume that you already have downloaded the ImageNet training data and valida You will first need to prepare some auxiliary data for training. This data can be downloaded by: - ./data/get_ilsvrc_aux.sh + ./data/ilsvrc12/get_ilsvrc_aux.sh The training and validation input are described in `train.txt` and `val.txt` as text listing all the files and their labels. Note that we use a different indexing for labels than the ILSVRC devkit: we sort the synset names in their ASCII order, and then label them from 0 to 999. See `synset_words.txt` for the synset/name mapping. From eabbccd4d3f4ba5e1f7b37bf1e5ae3f7b67992cc Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Fri, 20 Feb 2015 11:18:47 -0800 Subject: [PATCH 09/86] [build] fix dynamic linking of tools set the right rpath for tools and examples respectively thanks for the report @mees! 
--- Makefile | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 29827270baf..2a75d66e02a 100644 --- a/Makefile +++ b/Makefile @@ -537,7 +537,12 @@ $(TOOL_BUILD_DIR)/%: $(TOOL_BUILD_DIR)/%.bin | $(TOOL_BUILD_DIR) @ $(RM) $@ @ ln -s $(abspath $<) $@ -$(TOOL_BINS) $(EXAMPLE_BINS): %.bin : %.o | $(DYNAMIC_NAME) +$(TOOL_BINS): %.bin : %.o | $(DYNAMIC_NAME) + @ echo CXX/LD -o $@ + $(Q)$(CXX) $< -o $@ $(LINKFLAGS) -l$(PROJECT) $(LDFLAGS) \ + -Wl,-rpath,$(ORIGIN)/../lib + +$(EXAMPLE_BINS): %.bin : %.o | $(DYNAMIC_NAME) @ echo CXX/LD -o $@ $(Q)$(CXX) $< -o $@ $(LINKFLAGS) -l$(PROJECT) $(LDFLAGS) \ -Wl,-rpath,$(ORIGIN)/../../lib From 5a2633370f460b6a4d57c7564a38b14311420ab3 Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Fri, 20 Feb 2015 20:29:53 -0800 Subject: [PATCH 10/86] check caffe tool runs in runtest --- Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile b/Makefile index 2a75d66e02a..642bde3571d 100644 --- a/Makefile +++ b/Makefile @@ -450,6 +450,7 @@ $(MAT$(PROJECT)_SO): $(MAT$(PROJECT)_SRC) $(STATIC_NAME) CXXLIBS="\$$CXXLIBS $(STATIC_LINK_COMMAND) $(LDFLAGS)" -output $@ runtest: $(TEST_ALL_BIN) + $(TOOL_BUILD_DIR)/caffe $(TEST_ALL_BIN) $(TEST_GPUID) --gtest_shuffle $(TEST_FILTER) pytest: py From a1e951dcf9f534796de9a0f73f4869b3df33ab58 Mon Sep 17 00:00:00 2001 From: Anatoly Baksheev Date: Sun, 22 Feb 2015 18:58:12 +0300 Subject: [PATCH 11/86] ignore pycharm files --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 73bba6cb364..28f2aca854b 100644 --- a/.gitignore +++ b/.gitignore @@ -44,6 +44,9 @@ # QtCreator files *.user +# PyCharm files +.idea + # OSX dir files .DS_Store From fca05c34c67701368245410e6f7c5e118e84a09f Mon Sep 17 00:00:00 2001 From: Anatoly Baksheev Date: Sun, 22 Feb 2015 19:03:47 +0300 Subject: [PATCH 12/86] set proper CMAKE_INSTALL_RPATH for _caffe.so and tools --- cmake/Misc.cmake | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cmake/Misc.cmake b/cmake/Misc.cmake index 608a5f13a79..39569eaf996 100644 --- a/cmake/Misc.cmake +++ b/cmake/Misc.cmake @@ -32,6 +32,11 @@ endif() set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE CACHE BOOLEAN "Use link paths for shared library rpath") set(CMAKE_MACOSX_RPATH TRUE) +list(FIND CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES ${CMAKE_INSTALL_PREFIX}/lib __is_systtem_dir) +if(${__is_systtem_dir} STREQUAL -1) + set(CMAKE_INSTALL_RPATH ${CMAKE_INSTALL_PREFIX}/lib) +endif() + # ---[ Funny target if(UNIX OR APPLE) add_custom_target(symlink_to_build COMMAND "ln" "-sf" "${PROJECT_BINARY_DIR}" "${PROJECT_SOURCE_DIR}/build" From 645aa03207b24dc5f092686f399736c9334bc096 Mon Sep 17 00:00:00 2001 From: Anatoly Baksheev Date: Sun, 22 Feb 2015 19:04:22 +0300 Subject: [PATCH 13/86] fixed bug in install-tree: _caffe.so installed by install(TARGET ...) was overwritten with symlink created at build time and installed with install(DIRECTORY ...) 
--- python/CMakeLists.txt | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 6afed4fa183..a2f82089cac 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -4,7 +4,7 @@ if(NOT HAVE_PYTHON) endif() include_directories(${PYTHON_INCLUDE_DIRS} ${NUMPY_INCLUDE_DIR} ${Boost_INCLUDE_DIRS}) -file(GLOB_RECURSE python_srcs ${PROJECT_SOURCE_DIR}/python/*.cpp) +file(GLOB_RECURSE python_srcs ${CMAKE_SOURCE_DIR}/python/*.cpp) add_library(pycaffe SHARED ${python_srcs}) target_link_libraries(pycaffe ${Caffe_LINK} ${PYTHON_LIBRARIES} ${Boost_LIBRARIES}) @@ -22,9 +22,13 @@ if(UNIX OR APPLE) endif() # ---[ Install -file(GLOB files *.py requirements.txt) -install(FILES ${files} DESTINATION python) -install(DIRECTORY caffe DESTINATION python) -install(TARGETS pycaffe DESTINATION python/caffe) +file(GLOB files1 *.py requirements.txt) +install(FILES ${files1} DESTINATION python) + +file(GLOB files2 caffe/*.py) +install(FILES ${files2} DESTINATION python/caffe) +install(TARGETS pycaffe DESTINATION python/caffe) +install(DIRECTORY caffe/imagenet caffe/proto caffe/test DESTINATION python/caffe) + From 5e06d16d14d8a46d3c7a3f82497c6cb1401e160f Mon Sep 17 00:00:00 2001 From: Anatoly Baksheev Date: Sun, 22 Feb 2015 20:14:40 +0300 Subject: [PATCH 14/86] minor cmake sumamry log fix --- cmake/Summary.cmake | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index 3f7dff6b6e0..32931942846 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -107,8 +107,9 @@ function(caffe_print_configuration_summary) caffe_status(" C++ compiler : ${CMAKE_CXX_COMPILER}") caffe_status(" Release CXX flags : ${__flags_rel}") caffe_status(" Debug CXX flags : ${__flags_deb}") - caffe_status(" BUILD_SHARED_LIBS : ${BUILD_SHARED_LIBS}") caffe_status(" Build type : ${CMAKE_BUILD_TYPE}") + caffe_status("") + caffe_status(" BUILD_SHARED_LIBS : ${BUILD_SHARED_LIBS}") caffe_status(" BUILD_python : ${BUILD_python}") caffe_status(" BUILD_matlab : ${BUILD_matlab}") caffe_status(" BUILD_docs : ${BUILD_docs}") @@ -116,8 +117,9 @@ function(caffe_print_configuration_summary) caffe_status("") caffe_status("Dependencies:") caffe_status(" BLAS : " APPLE THEN "Yes (vecLib)" ELSE "Yes (${BLAS})") + caffe_status(" Boost : Yes (ver. ${Boost_MAJOR_VERSION}.${Boost_MINOR_VERSION})") caffe_status(" glog : Yes") - caffe_status(" gflags : Yes") + caffe_status(" gflags : Yes") caffe_status(" protobuf : " PROTOBUF_FOUND THEN "Yes (ver. ${PROTOBUF_VERSION})" ELSE "No" ) caffe_status(" lmdb : " LMDB_FOUND THEN "Yes (ver. ${LMDB_VERSION})" ELSE "No") caffe_status(" Snappy : " SNAPPY_FOUND THEN "Yes (ver. ${Snappy_VERSION})" ELSE "No" ) From 569ae01cc309fc3e14352479735a137ae53cfb62 Mon Sep 17 00:00:00 2001 From: James Supancic III Date: Sun, 22 Feb 2015 10:16:45 -0800 Subject: [PATCH 15/86] cpp_lint.py fails silently with Python3 (which is the default on some systems). This commit specifies Python2 with which cpp_lint.py works :-) --- scripts/cpp_lint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/cpp_lint.py b/scripts/cpp_lint.py index 1b7c6c0536c..f750489f4f9 100755 --- a/scripts/cpp_lint.py +++ b/scripts/cpp_lint.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python2 # # Copyright (c) 2009 Google Inc. All rights reserved. 
# From 845f9eac2b873e6d018ab83d5a32100ab443a35e Mon Sep 17 00:00:00 2001 From: spmallick Date: Mon, 23 Feb 2015 16:13:20 -0800 Subject: [PATCH 16/86] APPLE was misspelled. in Line 27 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index adea37be565..2a48e062eca 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,7 +24,7 @@ caffe_option(BUILD_docs "Build documentation" ON IF UNIX OR APPLE) include(cmake/Dependencies.cmake) # ---[ Flags -if(UNIX OR APLE) +if(UNIX OR APPLE) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -Wall") endif() From 54037d3942647ff3659028e1d7437f2e8f7cf911 Mon Sep 17 00:00:00 2001 From: philkr Date: Tue, 17 Feb 2015 10:50:12 -0800 Subject: [PATCH 17/86] Making python3 work with cmake and the new python wrapper --- CMakeLists.txt | 1 + cmake/Dependencies.cmake | 39 +++++++++++++++++++++++++++++++++------ docs/installation.md | 4 ++-- python/caffe/_caffe.cpp | 4 +++- python/caffe/io.py | 9 ++++++++- python/caffe/pycaffe.py | 5 ++++- python/classify.py | 6 +++--- python/detect.py | 2 +- python/draw_net.py | 2 +- 9 files changed, 56 insertions(+), 16 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index adea37be565..626d5b44459 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,6 +17,7 @@ caffe_option(CPU_ONLY "Build Caffe wihtout CUDA support" OFF) # TODO: rename to caffe_option(USE_CUDNN "Build Caffe with cuDNN libary support" ON IF NOT CPU_ONLY) caffe_option(BUILD_SHARED_LIBS "Build shared libraries" ON) caffe_option(BUILD_python "Build Python wrapper" ON) +set(python_version "2" CACHE STRING "Specify which python version to use") caffe_option(BUILD_matlab "Build Matlab wrapper" OFF IF UNIX OR APPLE) caffe_option(BUILD_docs "Build documentation" ON IF UNIX OR APPLE) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index aa2dcbe1d0d..b1ac96c6777 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -92,12 +92,39 @@ endif() # ---[ Python if(BUILD_python) - # disable Python 3 search - find_package(PythonInterp 2.7) - find_package(PythonLibs 2.7) - find_package(NumPy 1.7.1) - find_package(Boost 1.46 COMPONENTS python) - + if(NOT "${python_version}" VERSION_LESS "3.0.0") + # use python3 + find_package(PythonInterp 3.0) + find_package(PythonLibs 3.0) + find_package(NumPy 1.7.1) + # Find the matching boost python implementation + set(version ${PYTHONLIBS_VERSION_STRING}) + + STRING( REPLACE "." "" boost_py_version ${version} ) + find_package(Boost 1.46 COMPONENTS "python-py${boost_py_version}") + set(Boost_PYTHON_FOUND ${Boost_PYTHON-PY${boost_py_version}_FOUND}) + + while(NOT "${version}" STREQUAL "" AND NOT Boost_PYTHON_FOUND) + STRING( REGEX REPLACE "([0-9.]+).[0-9]+" "\\1" version ${version} ) + STRING( REGEX MATCHALL "([0-9.]+).[0-9]+" has_more_version ${version} ) + if("${has_more_version}" STREQUAL "") + break() + endif() + + STRING( REPLACE "." 
"" boost_py_version ${version} ) + find_package(Boost 1.46 COMPONENTS "python-py${boost_py_version}") + set(Boost_PYTHON_FOUND ${Boost_PYTHON-PY${boost_py_version}_FOUND}) + endwhile() + if(NOT Boost_PYTHON_FOUND) + find_package(Boost 1.46 COMPONENTS python) + endif() + else() + # disable Python 3 search + find_package(PythonInterp 2.7) + find_package(PythonLibs 2.7) + find_package(NumPy 1.7.1) + find_package(Boost 1.46 COMPONENTS python) + endif() if(PYTHONLIBS_FOUND AND NUMPY_FOUND AND Boost_PYTHON_FOUND) set(HAVE_PYTHON TRUE) endif() diff --git a/docs/installation.md b/docs/installation.md index 16575b54029..144e6a34f67 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -30,7 +30,7 @@ Caffe has several dependencies. Pycaffe and Matcaffe interfaces have their own natural needs. -* For Python Caffe: `Python 2.7`, `numpy (>= 1.7)`, boost-provided `boost.python` +* For Python Caffe: `Python 2.7` or `Python 3.3+`, `numpy (>= 1.7)`, boost-provided `boost.python` * For MATLAB Caffe: MATLAB with the `mex` compiler. **cuDNN Caffe**: for fastest operation Caffe is accelerated by drop-in integration of [NVIDIA cuDNN](https://developer.nvidia.com/cudnn). To speed up your Caffe models, install cuDNN then uncomment the `USE_CUDNN := 1` flag in `Makefile.config` when installing Caffe. Acceleration is automatic. For now cuDNN v1 is integrated but see [PR #1731](https://github.com/BVLC/caffe/pull/1731) for v2. @@ -69,7 +69,7 @@ but we suggest first installing the [Anaconda](https://store.continuum.io/cshop/ To import the `caffe` Python module after completing the installation, add the module directory to your `$PYTHONPATH` by `export PYTHONPATH=/path/to/caffe/python:$PYTHONPATH` or the like. You should not import the module in the `caffe/python/caffe` directory! -*Caffe's Python interface works with Python 2.7. Python 3 or earlier Pythons are your own adventure.* +*Caffe's Python interface works with Python 2.7. Python 3.3+ should work out of the box without protobuf support. For protobuf support please install protobuf 3.0 alpha (https://developers.google.com/protocol-buffers/). Earlier Pythons are your own adventure.* #### MATLAB diff --git a/python/caffe/_caffe.cpp b/python/caffe/_caffe.cpp index a5d0e64605e..03967a21029 100644 --- a/python/caffe/_caffe.cpp +++ b/python/caffe/_caffe.cpp @@ -275,7 +275,9 @@ BOOST_PYTHON_MODULE(_caffe) { bp::class_ >("BoolVec") .def(bp::vector_indexing_suite >()); - import_array(); + // boost python expects a void (missing) return value, while import_array + // returns NULL for python3. import_array1() forces a void return value. 
+ import_array1(); } } // namespace caffe diff --git a/python/caffe/io.py b/python/caffe/io.py index 0ce9ecfeeed..f51e3a64d36 100644 --- a/python/caffe/io.py +++ b/python/caffe/io.py @@ -3,7 +3,14 @@ from scipy.ndimage import zoom from skimage.transform import resize -from caffe.proto import caffe_pb2 +try: + # Python3 will most likely not be able to load protobuf + from caffe.proto import caffe_pb2 +except: + if sys.version_info >= (3,0): + print("Failed to include caffe_pb2, things might go wrong!") + else: + raise ## proto / datum / ndarray conversion diff --git a/python/caffe/pycaffe.py b/python/caffe/pycaffe.py index 31c145d77a5..d662d6cc282 100644 --- a/python/caffe/pycaffe.py +++ b/python/caffe/pycaffe.py @@ -4,7 +4,10 @@ """ from collections import OrderedDict -from itertools import izip_longest +try: + from itertools import izip_longest +except: + from itertools import zip_longest as izip_longest import numpy as np from ._caffe import Net, SGDSolver diff --git a/python/classify.py b/python/classify.py index d435a572266..81d06369341 100755 --- a/python/classify.py +++ b/python/classify.py @@ -103,7 +103,7 @@ def main(argv): channel_swap=channel_swap) if args.gpu: - print 'GPU mode' + print('GPU mode') # Load numpy array (.npy), directory glob (*.jpg), or image file. args.input_file = os.path.expanduser(args.input_file) @@ -115,12 +115,12 @@ def main(argv): else: inputs = [caffe.io.load_image(args.input_file)] - print "Classifying %d inputs." % len(inputs) + print("Classifying %d inputs." % len(inputs)) # Classify. start = time.time() predictions = classifier.predict(inputs, not args.center_only) - print "Done in %.2f s." % (time.time() - start) + print("Done in %.2f s." % (time.time() - start)) # Save np.save(args.output_file, predictions) diff --git a/python/detect.py b/python/detect.py index cb0c2645761..d395bd97abf 100755 --- a/python/detect.py +++ b/python/detect.py @@ -115,7 +115,7 @@ def main(argv): context_pad=args.context_pad) if args.gpu: - print 'GPU mode' + print('GPU mode') # Load input. t = time.time() diff --git a/python/draw_net.py b/python/draw_net.py index 4457b793e86..6320f775ef7 100755 --- a/python/draw_net.py +++ b/python/draw_net.py @@ -36,7 +36,7 @@ def main(): args = parse_args() net = caffe_pb2.NetParameter() text_format.Merge(open(args.input_net_proto_file).read(), net) - print 'Drawing net to %s' % args.output_image_file + print('Drawing net to %s' % args.output_image_file) caffe.draw.draw_net_to_file(net, args.output_image_file, args.rankdir) From 2cf5089d273e0e46f51c1b4b7aa018cbf3b983fe Mon Sep 17 00:00:00 2001 From: philkr Date: Tue, 24 Feb 2015 16:02:06 -0800 Subject: [PATCH 18/86] Decoding the datum before feeding it into the reshaping data layer --- src/caffe/layers/data_layer.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/caffe/layers/data_layer.cpp b/src/caffe/layers/data_layer.cpp index 8877caf89c8..1861090f953 100644 --- a/src/caffe/layers/data_layer.cpp +++ b/src/caffe/layers/data_layer.cpp @@ -89,9 +89,17 @@ void DataLayer::InternalThreadEntry() { // Reshape on single input batches for inputs of varying dimension. 
const int batch_size = this->layer_param_.data_param().batch_size(); const int crop_size = this->layer_param_.transform_param().crop_size(); + bool force_color = this->layer_param_.data_param().force_encoded_color(); if (batch_size == 1 && crop_size == 0) { Datum datum; datum.ParseFromString(cursor_->value()); + if (datum.encoded()) { + if (force_color) { + DecodeDatum(&datum, true); + } else { + DecodeDatumNative(&datum); + } + } this->prefetch_data_.Reshape(1, datum.channels(), datum.height(), datum.width()); this->transformed_data_.Reshape(1, datum.channels(), @@ -104,7 +112,6 @@ void DataLayer::InternalThreadEntry() { if (this->output_labels_) { top_label = this->prefetch_label_.mutable_cpu_data(); } - bool force_color = this->layer_param_.data_param().force_encoded_color(); for (int item_id = 0; item_id < batch_size; ++item_id) { timer.Start(); // get a blob From 4a3887ab1791f7a0ea6d17cafb861f53398a42d5 Mon Sep 17 00:00:00 2001 From: forresti Date: Tue, 24 Feb 2015 16:51:56 -0800 Subject: [PATCH 19/86] fixed matcaffe printout to specify num of args (now including train/test phase) --- matlab/caffe/matcaffe.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/matlab/caffe/matcaffe.cpp b/matlab/caffe/matcaffe.cpp index 996d3d2149c..da37d920b20 100644 --- a/matlab/caffe/matcaffe.cpp +++ b/matlab/caffe/matcaffe.cpp @@ -272,7 +272,7 @@ static void get_init_key(MEX_ARGS) { static void init(MEX_ARGS) { if (nrhs != 3) { ostringstream error_msg; - error_msg << "Expected 2 arguments, got " << nrhs; + error_msg << "Expected 3 arguments, got " << nrhs; mex_error(error_msg.str()); } From d2beb8ab20b55aebd25a2fb1f3eed3cf2923efaa Mon Sep 17 00:00:00 2001 From: Gustav Larsson Date: Tue, 24 Feb 2015 23:22:15 -0600 Subject: [PATCH 20/86] Replaced illegal tab in Makefile with spaces. Commands, such as $(error ...), are not allowed to be indented with tabs outside of targets, throwing an error instead of outputting the actual error. The solution is to use innocuous spaces instead. Ideally, spaces should be used everywhere outside targets, but since make does not mind it if variable assignments are tab-indented outside targets, a complete overhaul is not necessary. However, if more errors are added, it might make more sense to be consistent. Also, make will already add a period so I removed it. --- Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 642bde3571d..033473ac45b 100644 --- a/Makefile +++ b/Makefile @@ -261,7 +261,8 @@ ifneq (,$(findstring clang++,$(CXX))) else ifneq (,$(findstring g++,$(CXX))) STATIC_LINK_COMMAND := -Wl,--whole-archive $(STATIC_NAME) -Wl,--no-whole-archive else - $(error Cannot static link with the $(CXX) compiler.) 
+ # The following line must not be indented with a tab, since we are not inside a target + $(error Cannot static link with the $(CXX) compiler) endif # Debugging From 1377e1be4ee70e9f57f8cef53d767603124ee84f Mon Sep 17 00:00:00 2001 From: Sergey Karayev Date: Tue, 24 Feb 2015 21:52:32 -0800 Subject: [PATCH 21/86] Makefile fix for OS X 10.10 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 642bde3571d..933e9be93f3 100644 --- a/Makefile +++ b/Makefile @@ -319,7 +319,7 @@ else # 10.10 has accelerate while 10.9 has veclib XCODE_CLT_VER := $(shell pkgutil --pkg-info=com.apple.pkg.CLTools_Executables | grep -o 'version: 6') ifneq (,$(findstring version: 6,$(XCODE_CLT_VER))) - BLAS_INCLUDE ?= /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.10.sdk/System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks/vecLib.framework/Headers/ + BLAS_INCLUDE ?= /System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks/vecLib.framework/Headers/ LDFLAGS += -framework Accelerate else BLAS_INCLUDE ?= /System/Library/Frameworks/vecLib.framework/Versions/Current/Headers/ From 76519702a415467a61a3c1e9ce706f41647c7faf Mon Sep 17 00:00:00 2001 From: philkr Date: Wed, 25 Feb 2015 08:17:06 -0800 Subject: [PATCH 22/86] Fixing two bugs related to python3 and PROJECT_SOURCE_DIR --- python/CMakeLists.txt | 2 +- python/caffe/pycaffe.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index a2f82089cac..df0401daa1c 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -4,7 +4,7 @@ if(NOT HAVE_PYTHON) endif() include_directories(${PYTHON_INCLUDE_DIRS} ${NUMPY_INCLUDE_DIR} ${Boost_INCLUDE_DIRS}) -file(GLOB_RECURSE python_srcs ${CMAKE_SOURCE_DIR}/python/*.cpp) +file(GLOB_RECURSE python_srcs ${PROJECT_SOURCE_DIR}/python/*.cpp) add_library(pycaffe SHARED ${python_srcs}) target_link_libraries(pycaffe ${Caffe_LINK} ${PYTHON_LIBRARIES} ${Boost_LIBRARIES}) diff --git a/python/caffe/pycaffe.py b/python/caffe/pycaffe.py index d662d6cc282..9cd894a576d 100644 --- a/python/caffe/pycaffe.py +++ b/python/caffe/pycaffe.py @@ -41,12 +41,12 @@ def _Net_params(self): @property def _Net_inputs(self): - return [self.blobs.keys()[i] for i in self._inputs] + return [list(self.blobs.keys())[i] for i in self._inputs] @property def _Net_outputs(self): - return [self.blobs.keys()[i] for i in self._outputs] + return [list(self.blobs.keys())[i] for i in self._outputs] def _Net_forward(self, blobs=None, start=None, end=None, **kwargs): From 25cdd35c9ab1447f06040720243526fc8d0343c5 Mon Sep 17 00:00:00 2001 From: Andre Ambrosio Boechat Date: Wed, 25 Feb 2015 14:56:18 -0300 Subject: [PATCH 23/86] Small fix (visualization) on SLICE layer's documentation The sample was missing some additional spaces to be correctly rendered on the HTML. The mistake was mine. 
--- docs/tutorial/layers.md | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/docs/tutorial/layers.md b/docs/tutorial/layers.md index 34bb48050e8..839939f5ad6 100644 --- a/docs/tutorial/layers.md +++ b/docs/tutorial/layers.md @@ -453,20 +453,20 @@ The `SLICE` layer is a utility layer that slices an input layer to multiple outp * Sample - layers { - name: "slicer_label" - type: SLICE - bottom: "label" - ## Example of label with a shape N x 3 x 1 x 1 - top: "label1" - top: "label2" - top: "label3" - slice_param { - slice_dim: 1 - slice_point: 1 - slice_point: 2 - } - } + layers { + name: "slicer_label" + type: SLICE + bottom: "label" + ## Example of label with a shape N x 3 x 1 x 1 + top: "label1" + top: "label2" + top: "label3" + slice_param { + slice_dim: 1 + slice_point: 1 + slice_point: 2 + } + } `slice_dim` indicates the target dimension and can assume only two values: 0 for num or 1 for channel; `slice_point` indicates indexes in the selected dimension (the number of indexes must be equal to the number of top blobs minus one). From 1434e87a7410835500a8feb2e7b2f96431b0c00a Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Tue, 25 Nov 2014 18:17:45 -0800 Subject: [PATCH 24/86] Blobs are ND arrays (for N not necessarily equals 4). vector shape_ instead of (num, channels, height, width). --- include/caffe/blob.hpp | 146 ++++++++++++++++++++++++++++++----- src/caffe/blob.cpp | 92 +++++++++++++++++----- src/caffe/net.cpp | 25 +----- src/caffe/proto/caffe.proto | 7 +- src/caffe/solver.cpp | 14 +--- src/caffe/test/test_blob.cpp | 5 +- 6 files changed, 213 insertions(+), 76 deletions(-) diff --git a/include/caffe/blob.hpp b/include/caffe/blob.hpp index 42e4420408c..2d4df34ec6b 100644 --- a/include/caffe/blob.hpp +++ b/include/caffe/blob.hpp @@ -1,11 +1,17 @@ #ifndef CAFFE_BLOB_HPP_ #define CAFFE_BLOB_HPP_ +#include +#include +#include + #include "caffe/common.hpp" #include "caffe/proto/caffe.pb.h" #include "caffe/syncedmem.hpp" #include "caffe/util/math_functions.hpp" +const int kMaxBlobAxes = INT_MAX; + namespace caffe { /** @@ -19,10 +25,16 @@ template class Blob { public: Blob() - : data_(), diff_(), num_(0), channels_(0), height_(0), width_(0), - count_(0), capacity_(0) {} + : data_(), diff_(), count_(0), capacity_(0) {} + + /// @brief Deprecated; use Blob(const vector& shape). explicit Blob(const int num, const int channels, const int height, - const int width); + const int width); + explicit Blob(const vector& shape); + + /// @brief Deprecated; use Reshape(const vector& shape). + void Reshape(const int num, const int channels, const int height, + const int width); /** * @brief Change the dimensions of the blob, allocating new memory if * necessary. @@ -37,25 +49,118 @@ class Blob { * an error; either Net::Forward or Net::Reshape need to be called to * propagate the new input shape to higher layers. 
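 *
 * Illustrative example: reshaping a Blob from (2, 3, 4, 5) to (2, 60) keeps
 * all 120 values; the underlying memory is reallocated only when the new
 * count() exceeds the capacity allocated so far.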
*/ - void Reshape(const int num, const int channels, const int height, - const int width); + void Reshape(const vector& shape); void ReshapeLike(const Blob& other); - inline int num() const { return num_; } - inline int channels() const { return channels_; } - inline int height() const { return height_; } - inline int width() const { return width_; } + inline string shape_string() const { + ostringstream stream; + for (int i = 0; i < shape_.size(); ++i) { + stream << shape_[i] << " "; + } + stream << "(" << count_ << ")"; + return stream.str(); + } + inline const vector& shape() const { return shape_; } + /** + * @brief Returns the dimension of the index-th axis (or the negative index-th + * axis from the end, if index is negative). + * + * @param index the axis index, which may be negative as it will be + * "canonicalized" using CanonicalAxisIndex. + * Dies on out of range index. + */ + inline int shape(int index) const { + return shape_[CanonicalAxisIndex(index)]; + } + inline int num_axes() const { return shape_.size(); } inline int count() const { return count_; } + + /** + * @brief Compute the volume of a slice; i.e., the product of dimensions + * among a range of axes. + * + * @param start_axis The first axis to include in the slice. + * + * @param end_axis The first axis to exclude from the slice. + */ + inline int count(int start_axis, int end_axis) const { + CHECK_LE(start_axis, end_axis); + CHECK_GE(start_axis, 0); + CHECK_GE(end_axis, 0); + CHECK_LE(start_axis, num_axes()); + CHECK_LE(end_axis, num_axes()); + int count = 1; + for (int i = start_axis; i < end_axis; ++i) { + count *= shape(i); + } + return count; + } + /** + * @brief Compute the volume of a slice spanning from a particular first + * axis to the final axis. + * + * @param start_axis The first axis to include in the slice. + */ + inline int count(int start_axis) const { + return count(start_axis, num_axes()); + } + + /** + * @brief Returns the 'canonical' version of a (usually) user-specified axis, + * allowing for negative indexing (e.g., -1 for the last axis). + * + * @param index the axis index. + * If 0 <= index < num_axes(), return index. + * If -num_axes <= index <= -1, return (num_axes() - (-index)), + * e.g., the last axis index (num_axes() - 1) if index == -1, + * the second to last if index == -2, etc. + * Dies on out of range index. + */ + inline int CanonicalAxisIndex(int axis_index) const { + CHECK_GE(axis_index, -num_axes()) + << "axis " << axis_index << " out of range for " << num_axes() + << "-D Blob with shape " << shape_string(); + CHECK_LT(axis_index, num_axes()) + << "axis " << axis_index << " out of range for " << num_axes() + << "-D Blob with shape " << shape_string(); + if (axis_index < 0) { + return axis_index + num_axes(); + } + return axis_index; + } + + /// @brief Deprecated legacy shape accessor num: use shape(0) instead. + inline int num() const { return LegacyShape(0); } + /// @brief Deprecated legacy shape accessor channels: use shape(1) instead. + inline int channels() const { return LegacyShape(1); } + /// @brief Deprecated legacy shape accessor height: use shape(2) instead. + inline int height() const { return LegacyShape(2); } + /// @brief Deprecated legacy shape accessor width: use shape(3) instead. 
+ inline int width() const { return LegacyShape(3); } + inline int LegacyShape(int index) const { + CHECK_LE(num_axes(), 4) + << "Cannot use legacy accessors on Blobs with > 4 axes."; + CHECK_LT(index, 4); + CHECK_GE(index, -4); + if (index >= num_axes() || index < -num_axes()) { + // Axis is out of range, but still in [0, 3] (or [-4, -1] for reverse + // indexing) -- this special case simulates the one-padding used to fill + // extraneous axes of legacy blobs. + return 1; + } + return shape(index); + } + inline int offset(const int n, const int c = 0, const int h = 0, const int w = 0) const { CHECK_GE(n, 0); - CHECK_LE(n, num_); - CHECK_GE(channels_, 0); - CHECK_LE(c, channels_); - CHECK_GE(height_, 0); - CHECK_LE(h, height_); - CHECK_GE(width_, 0); - CHECK_LE(w, width_); - return ((n * channels_ + c) * height_ + h) * width_ + w; + CHECK_LE(n, num()); + CHECK_GE(channels(), 0); + CHECK_LE(c, channels()); + CHECK_GE(height(), 0); + CHECK_LE(h, height()); + CHECK_GE(width(), 0); + CHECK_LE(w, width()); + return ((n * channels() + c) * height() + h) * width() + w; } /** * @brief Copy from a source Blob. @@ -135,13 +240,12 @@ class Blob { */ void ShareDiff(const Blob& other); + bool ShapeEquals(const BlobProto& other); + protected: shared_ptr data_; shared_ptr diff_; - int num_; - int channels_; - int height_; - int width_; + vector shape_; int count_; int capacity_; diff --git a/src/caffe/blob.cpp b/src/caffe/blob.cpp index fbc1361a19d..4a6332f382d 100644 --- a/src/caffe/blob.cpp +++ b/src/caffe/blob.cpp @@ -1,3 +1,6 @@ +#include +#include + #include "caffe/blob.hpp" #include "caffe/common.hpp" #include "caffe/syncedmem.hpp" @@ -8,15 +11,24 @@ namespace caffe { template void Blob::Reshape(const int num, const int channels, const int height, const int width) { - CHECK_GE(num, 0); - CHECK_GE(channels, 0); - CHECK_GE(height, 0); - CHECK_GE(width, 0); - num_ = num; - channels_ = channels; - height_ = height; - width_ = width; - count_ = num_ * channels_ * height_ * width_; + vector shape(4); + shape[0] = num; + shape[1] = channels; + shape[2] = height; + shape[3] = width; + Reshape(shape); +} + +template +void Blob::Reshape(const vector& shape) { + CHECK_LE(shape.size(), kMaxBlobAxes); + count_ = 1; + shape_.resize(shape.size()); + for (int i = 0; i < shape.size(); ++i) { + CHECK_GE(shape[i], 0); + count_ *= shape[i]; + shape_[i] = shape[i]; + } if (count_ > capacity_) { capacity_ = count_; data_.reset(new SyncedMemory(capacity_ * sizeof(Dtype))); @@ -26,7 +38,7 @@ void Blob::Reshape(const int num, const int channels, const int height, template void Blob::ReshapeLike(const Blob& other) { - Reshape(other.num(), other.channels(), other.height(), other.width()); + Reshape(other.shape()); } template @@ -37,6 +49,13 @@ Blob::Blob(const int num, const int channels, const int height, Reshape(num, channels, height, width); } +template +Blob::Blob(const vector& shape) + // capacity_ must be initialized before calling Reshape + : capacity_(0) { + Reshape(shape); +} + template const Dtype* Blob::cpu_data() const { CHECK(data_); @@ -345,12 +364,34 @@ void Blob::scale_diff(Dtype scale_factor) { } } +template +bool Blob::ShapeEquals(const BlobProto& other) { + if (other.has_num() || other.has_channels() || + other.has_height() || other.has_width()) { + // Using deprecated 4D Blob dimensions -- + // shape is (num, channels, height, width). + // Note: we do not use the normal Blob::num(), Blob::channels(), etc. 
+ // methods as these index from the beginning of the blob shape, where legacy + // parameter blobs were indexed from the end of the blob shape (e.g., bias + // Blob shape (1 x 1 x 1 x N), IP layer weight Blob shape (1 x 1 x M x N)). + return shape_.size() <= 4 && + LegacyShape(-4) == other.num() && + LegacyShape(-3) == other.channels() && + LegacyShape(-2) == other.height() && + LegacyShape(-1) == other.width(); + } + vector other_shape(other.dim_size()); + for (int i = 0; i < other.dim_size(); ++i) { + other_shape[i] = other.dim(i); + } + return shape_ == other_shape; +} + template void Blob::CopyFrom(const Blob& source, bool copy_diff, bool reshape) { - if (num_ != source.num() || channels_ != source.channels() || - height_ != source.height() || width_ != source.width()) { + if (source.count() != count_ || source.shape() != shape_) { if (reshape) { - Reshape(source.num(), source.channels(), source.height(), source.width()); + ReshapeLike(source); } else { LOG(FATAL) << "Trying to copy blobs of different sizes."; } @@ -381,7 +422,23 @@ void Blob::CopyFrom(const Blob& source, bool copy_diff, bool reshape) { template void Blob::FromProto(const BlobProto& proto) { - Reshape(proto.num(), proto.channels(), proto.height(), proto.width()); + vector shape; + if (proto.has_num() || proto.has_channels() || + proto.has_height() || proto.has_width()) { + // Using deprecated 4D Blob dimensions -- + // shape is (num, channels, height, width). + shape.resize(4); + shape[0] = proto.num(); + shape[1] = proto.channels(); + shape[2] = proto.height(); + shape[3] = proto.width(); + } else { + shape.resize(proto.dim_size()); + for (int i = 0; i < proto.dim_size(); ++i) { + shape[i] = proto.dim(i); + } + } + Reshape(shape); // copy data Dtype* data_vec = mutable_cpu_data(); for (int i = 0; i < count_; ++i) { @@ -397,10 +454,9 @@ void Blob::FromProto(const BlobProto& proto) { template void Blob::ToProto(BlobProto* proto, bool write_diff) const { - proto->set_num(num_); - proto->set_channels(channels_); - proto->set_height(height_); - proto->set_width(width_); + for (int i = 0; i < shape_.size(); ++i) { + proto->add_dim(shape_[i]); + } proto->clear_data(); proto->clear_diff(); const Dtype* data_vec = cpu_data(); diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp index c359be9b575..a6aa917b556 100644 --- a/src/caffe/net.cpp +++ b/src/caffe/net.cpp @@ -109,11 +109,7 @@ void Net::Init(const NetParameter& in_param) { blob_loss_weights_.resize(top_id_vecs_[layer_id][top_id] + 1, Dtype(0)); } blob_loss_weights_[top_id_vecs_[layer_id][top_id]] = layer->loss(top_id); - LOG(INFO) << "Top shape: " << top_vecs_[layer_id][top_id]->num() << " " - << top_vecs_[layer_id][top_id]->channels() << " " - << top_vecs_[layer_id][top_id]->height() << " " - << top_vecs_[layer_id][top_id]->width() << " (" - << top_vecs_[layer_id][top_id]->count() << ")"; + LOG(INFO) << "Top shape: " << top_vecs_[layer_id][top_id]->shape_string(); if (layer->loss(top_id)) { LOG(INFO) << " with loss weight " << layer->loss(top_id); } @@ -427,14 +423,7 @@ void Net::AppendParam(const NetParameter& param, const int layer_id, << "Shared parameter blobs must have the same count."; } else { // Strict dimension checking -- all dims must be the same. 
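      // Illustrative example: a legacy (1 x 1 x M x N) blob and a 2-D (M x N)
      // blob can share under the permissive count-only check above, but not
      // here, where the full shape vectors must match exactly.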
- CHECK_EQ(this_blob->num(), owner_blob->num()) - << "Shared parameter blobs must have the same num."; - CHECK_EQ(this_blob->channels(), owner_blob->channels()) - << "Shared parameter blobs must have the same channels."; - CHECK_EQ(this_blob->height(), owner_blob->height()) - << "Shared parameter blobs must have the same height."; - CHECK_EQ(this_blob->width(), owner_blob->width()) - << "Shared parameter blobs must have the same width."; + CHECK(this_blob->shape() == owner_blob->shape()); } layers_[layer_id]->blobs()[param_id]->ShareData( *layers_[owner_layer_id]->blobs()[owner_param_id]); @@ -640,10 +629,7 @@ void Net::ShareTrainedLayersWith(const Net* other) { << "Incompatible number of blobs for layer " << source_layer_name; for (int j = 0; j < target_blobs.size(); ++j) { Blob* source_blob = source_layer->blobs()[j].get(); - CHECK_EQ(target_blobs[j]->num(), source_blob->num()); - CHECK_EQ(target_blobs[j]->channels(), source_blob->channels()); - CHECK_EQ(target_blobs[j]->height(), source_blob->height()); - CHECK_EQ(target_blobs[j]->width(), source_blob->width()); + CHECK(target_blobs[j]->shape() == source_blob->shape()); target_blobs[j]->ShareData(*source_blob); } } @@ -707,10 +693,7 @@ void Net::CopyTrainedLayersFrom(const NetParameter& param) { CHECK_EQ(target_blobs.size(), source_layer.blobs_size()) << "Incompatible number of blobs for layer " << source_layer_name; for (int j = 0; j < target_blobs.size(); ++j) { - CHECK_EQ(target_blobs[j]->num(), source_layer.blobs(j).num()); - CHECK_EQ(target_blobs[j]->channels(), source_layer.blobs(j).channels()); - CHECK_EQ(target_blobs[j]->height(), source_layer.blobs(j).height()); - CHECK_EQ(target_blobs[j]->width(), source_layer.blobs(j).width()); + CHECK(target_blobs[j]->ShapeEquals(source_layer.blobs(j))); target_blobs[j]->FromProto(source_layer.blobs(j)); } } diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index 84b475ce3cd..e82b75d281b 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -3,12 +3,15 @@ syntax = "proto2"; package caffe; message BlobProto { + repeated int32 dim = 7 [packed = true]; + repeated float data = 5 [packed = true]; + repeated float diff = 6 [packed = true]; + + // 4D dimensions -- deprecated. Use "dim" instead. 
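  // Illustrative example: a (3 x 2) blob matches either "dim: 3, dim: 2" or
  // the legacy fields below with num: 1, channels: 1, height: 3, width: 2.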
optional int32 num = 1 [default = 0]; optional int32 channels = 2 [default = 0]; optional int32 height = 3 [default = 0]; optional int32 width = 4 [default = 0]; - repeated float data = 5 [packed = true]; - repeated float diff = 6 [packed = true]; } // The BlobProtoVector is simply a way to pass multiple blobproto instances diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp index 8ed8aec2fc8..034390e6824 100644 --- a/src/caffe/solver.cpp +++ b/src/caffe/solver.cpp @@ -420,16 +420,10 @@ void SGDSolver::PreSolve() { update_.clear(); temp_.clear(); for (int i = 0; i < net_params.size(); ++i) { - const Blob* net_param = net_params[i].get(); - history_.push_back(shared_ptr >(new Blob( - net_param->num(), net_param->channels(), net_param->height(), - net_param->width()))); - update_.push_back(shared_ptr >(new Blob( - net_param->num(), net_param->channels(), net_param->height(), - net_param->width()))); - temp_.push_back(shared_ptr >(new Blob( - net_param->num(), net_param->channels(), net_param->height(), - net_param->width()))); + const vector& shape = net_params[i]->shape(); + history_.push_back(shared_ptr >(new Blob(shape))); + update_.push_back(shared_ptr >(new Blob(shape))); + temp_.push_back(shared_ptr >(new Blob(shape))); } } diff --git a/src/caffe/test/test_blob.cpp b/src/caffe/test/test_blob.cpp index e0678061173..a654896c654 100644 --- a/src/caffe/test/test_blob.cpp +++ b/src/caffe/test/test_blob.cpp @@ -31,10 +31,7 @@ TYPED_TEST(BlobSimpleTest, TestInitialization) { EXPECT_EQ(this->blob_preshaped_->height(), 4); EXPECT_EQ(this->blob_preshaped_->width(), 5); EXPECT_EQ(this->blob_preshaped_->count(), 120); - EXPECT_EQ(this->blob_->num(), 0); - EXPECT_EQ(this->blob_->channels(), 0); - EXPECT_EQ(this->blob_->height(), 0); - EXPECT_EQ(this->blob_->width(), 0); + EXPECT_EQ(this->blob_->num_axes(), 0); EXPECT_EQ(this->blob_->count(), 0); } From 5407f82a184a0d35cc1a9265fc7cfa61dae22517 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Wed, 31 Dec 2014 16:05:52 -0800 Subject: [PATCH 25/86] Add BlobShape message; use for Net input shapes --- include/caffe/blob.hpp | 1 + src/caffe/blob.cpp | 25 ++++++++++++++++++------- src/caffe/net.cpp | 24 ++++++++++++++++++------ src/caffe/proto/caffe.proto | 16 +++++++++++++--- 4 files changed, 50 insertions(+), 16 deletions(-) diff --git a/include/caffe/blob.hpp b/include/caffe/blob.hpp index 2d4df34ec6b..07b996ee392 100644 --- a/include/caffe/blob.hpp +++ b/include/caffe/blob.hpp @@ -50,6 +50,7 @@ class Blob { * propagate the new input shape to higher layers. 
*/ void Reshape(const vector& shape); + void Reshape(const BlobShape& shape); void ReshapeLike(const Blob& other); inline string shape_string() const { ostringstream stream; diff --git a/src/caffe/blob.cpp b/src/caffe/blob.cpp index 4a6332f382d..c65dc2dd1ac 100644 --- a/src/caffe/blob.cpp +++ b/src/caffe/blob.cpp @@ -36,6 +36,16 @@ void Blob::Reshape(const vector& shape) { } } +template +void Blob::Reshape(const BlobShape& shape) { + CHECK_LE(shape.dim_size(), kMaxBlobAxes); + vector shape_vec(shape.dim_size()); + for (int i = 0; i < shape.dim_size(); ++i) { + shape_vec[i] = shape.dim(i); + } + Reshape(shape_vec); +} + template void Blob::ReshapeLike(const Blob& other) { Reshape(other.shape()); @@ -380,9 +390,9 @@ bool Blob::ShapeEquals(const BlobProto& other) { LegacyShape(-2) == other.height() && LegacyShape(-1) == other.width(); } - vector other_shape(other.dim_size()); - for (int i = 0; i < other.dim_size(); ++i) { - other_shape[i] = other.dim(i); + vector other_shape(other.shape().dim_size()); + for (int i = 0; i < other.shape().dim_size(); ++i) { + other_shape[i] = other.shape().dim(i); } return shape_ == other_shape; } @@ -433,9 +443,9 @@ void Blob::FromProto(const BlobProto& proto) { shape[2] = proto.height(); shape[3] = proto.width(); } else { - shape.resize(proto.dim_size()); - for (int i = 0; i < proto.dim_size(); ++i) { - shape[i] = proto.dim(i); + shape.resize(proto.shape().dim_size()); + for (int i = 0; i < proto.shape().dim_size(); ++i) { + shape[i] = proto.shape().dim(i); } } Reshape(shape); @@ -454,8 +464,9 @@ void Blob::FromProto(const BlobProto& proto) { template void Blob::ToProto(BlobProto* proto, bool write_diff) const { + proto->clear_shape(); for (int i = 0; i < shape_.size(); ++i) { - proto->add_dim(shape_[i]); + proto->mutable_shape()->add_dim(shape_[i]); } proto->clear_data(); proto->clear_diff(); diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp index a6aa917b556..60f387b92e9 100644 --- a/src/caffe/net.cpp +++ b/src/caffe/net.cpp @@ -48,8 +48,16 @@ void Net::Init(const NetParameter& in_param) { name_ = param.name(); map blob_name_to_idx; set available_blobs; - CHECK_EQ(param.input_size() * 4, param.input_dim_size()) - << "Incorrect input blob dimension specifications."; + CHECK(param.input_dim_size() == 0 || param.input_shape_size() == 0) + << "Must specify either input_shape OR deprecated input_dim, not both."; + if (param.input_dim_size() > 0) { + // Deprecated 4D dimensions. + CHECK_EQ(param.input_size() * 4, param.input_dim_size()) + << "Incorrect input blob dimension specifications."; + } else { + CHECK_EQ(param.input_size(), param.input_shape_size()) + << "Exactly one input_shape must be specified per input."; + } memory_used_ = 0; // set the input blobs for (int input_id = 0; input_id < param.input_size(); ++input_id) { @@ -339,10 +347,14 @@ void Net::AppendTop(const NetParameter& param, const int layer_id, if (blob_name_to_idx) { (*blob_name_to_idx)[blob_name] = blob_id; } if (layer_id == -1) { // Set the (explicitly specified) dimensions of the input blob. 
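        // Illustrative prototxt for the new form (assumed field layout):
        //   input: "data"
        //   input_shape { dim: 10 dim: 3 dim: 227 dim: 227 }
        // which replaces the four flat input_dim values handled below.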
- blob_pointer->Reshape(param.input_dim(top_id * 4), - param.input_dim(top_id * 4 + 1), - param.input_dim(top_id * 4 + 2), - param.input_dim(top_id * 4 + 3)); + if (param.input_dim_size() > 0) { + blob_pointer->Reshape(param.input_dim(top_id * 4), + param.input_dim(top_id * 4 + 1), + param.input_dim(top_id * 4 + 2), + param.input_dim(top_id * 4 + 3)); + } else { + blob_pointer->Reshape(param.input_shape(top_id)); + } net_input_blob_indices_.push_back(blob_id); net_input_blobs_.push_back(blob_pointer.get()); } else { diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index e82b75d281b..fdcb57366ad 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -2,12 +2,17 @@ syntax = "proto2"; package caffe; +// Specifies the shape (dimensions) of a Blob. +message BlobShape { + repeated int64 dim = 1 [packed = true]; +} + message BlobProto { - repeated int32 dim = 7 [packed = true]; + optional BlobShape shape = 7; repeated float data = 5 [packed = true]; repeated float diff = 6 [packed = true]; - // 4D dimensions -- deprecated. Use "dim" instead. + // 4D dimensions -- deprecated. Use "shape" instead. optional int32 num = 1 [default = 0]; optional int32 channels = 2 [default = 0]; optional int32 height = 3 [default = 0]; @@ -50,10 +55,15 @@ message NetParameter { optional string name = 1; // consider giving the network a name // The input blobs to the network. repeated string input = 3; - // The dim of the input blobs. For each input blob there should be four + // The shape of the input blobs. + repeated BlobShape input_shape = 8; + + // 4D input dimensions -- deprecated. Use "shape" instead. + // If specified, for each input blob there should be four // values specifying the num, channels, height and width of the input blob. // Thus, there should be a total of (4 * #input) numbers. repeated int32 input_dim = 4; + // Whether the network will force every layer to carry out backward operation. // If set False, then whether to carry out backward is determined // automatically according to the net structure and learning rates. From 119a1c6699b7d97b39699a5764fa231f15bd6dfe Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Wed, 4 Feb 2015 15:00:03 -0800 Subject: [PATCH 26/86] add offset, {data,diff}_at nd blob accessors --- include/caffe/blob.hpp | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/include/caffe/blob.hpp b/include/caffe/blob.hpp index 07b996ee392..82ca95a5f0f 100644 --- a/include/caffe/blob.hpp +++ b/include/caffe/blob.hpp @@ -163,6 +163,20 @@ class Blob { CHECK_LE(w, width()); return ((n * channels() + c) * height() + h) * width() + w; } + + inline int offset(const vector& indices) const { + CHECK_LE(indices.size(), num_axes()); + int offset = 0; + for (int i = 0; i < num_axes(); ++i) { + offset *= shape(i); + if (indices.size() > i) { + CHECK_GE(indices[i], 0); + CHECK_LT(indices[i], shape(i)); + offset += indices[i]; + } + } + return offset; + } /** * @brief Copy from a source Blob. 
* @@ -177,12 +191,20 @@ class Blob { inline Dtype data_at(const int n, const int c, const int h, const int w) const { - return *(cpu_data() + offset(n, c, h, w)); + return cpu_data()[offset(n, c, h, w)]; } inline Dtype diff_at(const int n, const int c, const int h, const int w) const { - return *(cpu_diff() + offset(n, c, h, w)); + return cpu_diff()[offset(n, c, h, w)]; + } + + inline Dtype data_at(const vector& index) const { + return cpu_data()[offset(index)]; + } + + inline Dtype diff_at(const vector& index) const { + return cpu_diff()[offset(index)]; } inline const shared_ptr& data() const { From c4e9ec40427693f3a3848932519e175d11b1738b Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Sat, 29 Nov 2014 16:11:07 -0800 Subject: [PATCH 27/86] TestBlob: test that legacy BlobProtos are correctly handled by ShapeEquals --- src/caffe/test/test_blob.cpp | 54 ++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/src/caffe/test/test_blob.cpp b/src/caffe/test/test_blob.cpp index a654896c654..7da6423b67c 100644 --- a/src/caffe/test/test_blob.cpp +++ b/src/caffe/test/test_blob.cpp @@ -1,4 +1,5 @@ #include +#include #include "gtest/gtest.h" @@ -51,6 +52,59 @@ TYPED_TEST(BlobSimpleTest, TestReshape) { EXPECT_EQ(this->blob_->count(), 120); } +TYPED_TEST(BlobSimpleTest, TestLegacyBlobProtoShapeEquals) { + BlobProto blob_proto; + + // Reshape to (3 x 2). + vector shape(2); + shape[0] = 3; + shape[1] = 2; + this->blob_->Reshape(shape); + + // (3 x 2) blob == (1 x 1 x 3 x 2) legacy blob + blob_proto.set_num(1); + blob_proto.set_channels(1); + blob_proto.set_height(3); + blob_proto.set_width(2); + EXPECT_TRUE(this->blob_->ShapeEquals(blob_proto)); + + // (3 x 2) blob != (0 x 1 x 3 x 2) legacy blob + blob_proto.set_num(0); + blob_proto.set_channels(1); + blob_proto.set_height(3); + blob_proto.set_width(2); + EXPECT_FALSE(this->blob_->ShapeEquals(blob_proto)); + + // (3 x 2) blob != (3 x 1 x 3 x 2) legacy blob + blob_proto.set_num(3); + blob_proto.set_channels(1); + blob_proto.set_height(3); + blob_proto.set_width(2); + EXPECT_FALSE(this->blob_->ShapeEquals(blob_proto)); + + // Reshape to (1 x 3 x 2). + shape.insert(shape.begin(), 1); + this->blob_->Reshape(shape); + + // (1 x 3 x 2) blob == (1 x 1 x 3 x 2) legacy blob + blob_proto.set_num(1); + blob_proto.set_channels(1); + blob_proto.set_height(3); + blob_proto.set_width(2); + EXPECT_TRUE(this->blob_->ShapeEquals(blob_proto)); + + // Reshape to (2 x 3 x 2). 
+ shape[0] = 2; + this->blob_->Reshape(shape); + + // (2 x 3 x 2) blob != (1 x 1 x 3 x 2) legacy blob + blob_proto.set_num(1); + blob_proto.set_channels(1); + blob_proto.set_height(3); + blob_proto.set_width(2); + EXPECT_FALSE(this->blob_->ShapeEquals(blob_proto)); +} + template class BlobMathTest : public MultiDeviceTest { typedef typename TypeParam::Dtype Dtype; From 559ff3ae05677801d4904f103de6dc4d88398e07 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Tue, 25 Nov 2014 22:48:03 -0800 Subject: [PATCH 28/86] InnerProductLayer weights are 2D; biases are 1D --- src/caffe/layers/inner_product_layer.cpp | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/caffe/layers/inner_product_layer.cpp b/src/caffe/layers/inner_product_layer.cpp index b1ec6cb25c0..60a547c5b49 100644 --- a/src/caffe/layers/inner_product_layer.cpp +++ b/src/caffe/layers/inner_product_layer.cpp @@ -26,14 +26,18 @@ void InnerProductLayer::LayerSetUp(const vector*>& bottom, this->blobs_.resize(1); } // Intialize the weight - this->blobs_[0].reset(new Blob(1, 1, N_, K_)); + vector weight_shape(2); + weight_shape[0] = N_; + weight_shape[1] = K_; + this->blobs_[0].reset(new Blob(weight_shape)); // fill the weights shared_ptr > weight_filler(GetFiller( this->layer_param_.inner_product_param().weight_filler())); weight_filler->Fill(this->blobs_[0].get()); // If necessary, intiialize and fill the bias term if (bias_term_) { - this->blobs_[1].reset(new Blob(1, 1, 1, N_)); + vector bias_shape(1, N_); + this->blobs_[1].reset(new Blob(bias_shape)); shared_ptr > bias_filler(GetFiller( this->layer_param_.inner_product_param().bias_filler())); bias_filler->Fill(this->blobs_[1].get()); @@ -49,10 +53,14 @@ void InnerProductLayer::Reshape(const vector*>& bottom, M_ = bottom[0]->num(); CHECK_EQ(bottom[0]->count() / bottom[0]->num(), K_) << "Input size " "incompatible with inner product parameters."; - top[0]->Reshape(bottom[0]->num(), N_, 1, 1); + vector top_shape(2); + top_shape[0] = M_; + top_shape[1] = N_; + top[0]->Reshape(top_shape); // Set up the bias multiplier if (bias_term_) { - bias_multiplier_.Reshape(1, 1, 1, M_); + vector bias_shape(1, M_); + bias_multiplier_.Reshape(bias_shape); caffe_set(M_, Dtype(1), bias_multiplier_.mutable_cpu_data()); } } From cf9fdda4403996326b6a240aa94165df73135625 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Mon, 16 Feb 2015 01:29:17 -0800 Subject: [PATCH 29/86] Fix sparse GaussianFiller for new IPLayer weight axes --- include/caffe/filler.hpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/caffe/filler.hpp b/include/caffe/filler.hpp index eebf565b1d5..bb18e8e1e28 100644 --- a/include/caffe/filler.hpp +++ b/include/caffe/filler.hpp @@ -79,9 +79,8 @@ class GaussianFiller : public Filler { // These have num == channels == 1; width is number of inputs; height is // number of outputs. The 'sparse' variable specifies the mean number // of non-zero input weights for a given output. 
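  // Illustrative note: with the 2-D inner-product weights introduced earlier
  // in this series (num_output x fan-in), shape(0) is the number of outputs,
  // so only the first axis is assumed below.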
- CHECK_EQ(blob->num(), 1); - CHECK_EQ(blob->channels(), 1); - int num_outputs = blob->height(); + CHECK_GE(blob->num_axes(), 1); + const int num_outputs = blob->shape(0); Dtype non_zero_probability = Dtype(sparse) / Dtype(num_outputs); rand_vec_.reset(new SyncedMemory(blob->count() * sizeof(int))); int* mask = reinterpret_cast(rand_vec_->mutable_cpu_data()); From 29581e6a4a2c9b88c2399a85628fcd0e9fa94383 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Sat, 29 Nov 2014 14:26:48 -0800 Subject: [PATCH 30/86] InnerProductLayer can multiply along any axis --- src/caffe/layers/inner_product_layer.cpp | 26 +++++++++++++++++------- src/caffe/proto/caffe.proto | 5 +++++ 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/src/caffe/layers/inner_product_layer.cpp b/src/caffe/layers/inner_product_layer.cpp index 60a547c5b49..6b88724491b 100644 --- a/src/caffe/layers/inner_product_layer.cpp +++ b/src/caffe/layers/inner_product_layer.cpp @@ -15,7 +15,12 @@ void InnerProductLayer::LayerSetUp(const vector*>& bottom, const int num_output = this->layer_param_.inner_product_param().num_output(); bias_term_ = this->layer_param_.inner_product_param().bias_term(); N_ = num_output; - K_ = bottom[0]->count() / bottom[0]->num(); + const int axis = bottom[0]->CanonicalAxisIndex( + this->layer_param_.inner_product_param().axis()); + // Dimensions starting from "axis" are "flattened" into a single + // length K_ vector. For example, if bottom[0]'s shape is (N, C, H, W), + // N inner products with dimension CHW are performed. + K_ = bottom[0]->count(axis); // Check if we need to set up the weights if (this->blobs_.size() > 0) { LOG(INFO) << "Skipping parameter initialization"; @@ -50,12 +55,19 @@ template void InnerProductLayer::Reshape(const vector*>& bottom, const vector*>& top) { // Figure out the dimensions - M_ = bottom[0]->num(); - CHECK_EQ(bottom[0]->count() / bottom[0]->num(), K_) << "Input size " - "incompatible with inner product parameters."; - vector top_shape(2); - top_shape[0] = M_; - top_shape[1] = N_; + const int axis = bottom[0]->CanonicalAxisIndex( + this->layer_param_.inner_product_param().axis()); + const int new_K = bottom[0]->count(axis); + CHECK_EQ(K_, new_K) + << "Input size incompatible with inner product parameters."; + // The first "axis" dimensions are independent inner products; the total + // number of these is M_, the product over these dimensions. + M_ = bottom[0]->count(0, axis); + // The top shape will be the bottom shape with the flattened axes dropped, + // and replaced by a single axis with dimension num_output (N_). + vector top_shape = bottom[0]->shape(); + top_shape.resize(axis + 1); + top_shape[axis] = N_; top[0]->Reshape(top_shape); // Set up the bias multiplier if (bias_term_) { diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index fdcb57366ad..e8a134865bb 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -561,6 +561,11 @@ message InnerProductParameter { optional bool bias_term = 2 [default = true]; // whether to have bias terms optional FillerParameter weight_filler = 3; // The filler for the weight optional FillerParameter bias_filler = 4; // The filler for the bias + + // The first axis to be lumped into a single inner product computation; + // all preceding axes are retained in the output. + // May be negative to index from the end (e.g., -1 for the last axis). 
+ optional int32 axis = 5 [default = 1]; } // Message that stores parameters used by LRNLayer From 94179cc242a02017538362842125f929e5acc2be Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Sat, 29 Nov 2014 17:02:18 -0800 Subject: [PATCH 31/86] ConvLayer biases are 1D --- src/caffe/layers/base_conv_layer.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp index dccd5170c11..d4ab5e429b5 100644 --- a/src/caffe/layers/base_conv_layer.cpp +++ b/src/caffe/layers/base_conv_layer.cpp @@ -85,10 +85,10 @@ void BaseConvolutionLayer::LayerSetUp(const vector*>& bottom, shared_ptr > weight_filler(GetFiller( this->layer_param_.convolution_param().weight_filler())); weight_filler->Fill(this->blobs_[0].get()); - // If necessary, initialize and fill the biases: - // 1 x 1 x 1 x output channels + // If necessary, initialize and fill the biases. if (bias_term_) { - this->blobs_[1].reset(new Blob(1, 1, 1, num_output_)); + vector bias_shape(1, num_output_); + this->blobs_[1].reset(new Blob(bias_shape)); shared_ptr > bias_filler(GetFiller( this->layer_param_.convolution_param().bias_filler())); bias_filler->Fill(this->blobs_[1].get()); @@ -144,7 +144,8 @@ void BaseConvolutionLayer::Reshape(const vector*>& bottom, } // Set up the all ones "bias multiplier" for adding biases by BLAS if (bias_term_) { - bias_multiplier_.Reshape(1, 1, 1, height_out_ * width_out_); + vector bias_multiplier_shape(1, height_out_ * width_out_); + bias_multiplier_.Reshape(bias_multiplier_shape); caffe_set(bias_multiplier_.count(), Dtype(1), bias_multiplier_.mutable_cpu_data()); } From a0fa2a9535bda318008c8a2a9edc9ca6218d5c41 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Tue, 25 Nov 2014 23:46:57 -0800 Subject: [PATCH 32/86] LossLayer output is 0D (scalar) --- src/caffe/layers/loss_layer.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/caffe/layers/loss_layer.cpp b/src/caffe/layers/loss_layer.cpp index a5b6d11b065..3496a5c2a8a 100644 --- a/src/caffe/layers/loss_layer.cpp +++ b/src/caffe/layers/loss_layer.cpp @@ -24,7 +24,8 @@ void LossLayer::Reshape( const vector*>& bottom, const vector*>& top) { CHECK_EQ(bottom[0]->num(), bottom[1]->num()) << "The data and label should have the same number."; - top[0]->Reshape(1, 1, 1, 1); + vector loss_shape(0); // Loss layers output a scalar; 0 axes. 
+ top[0]->Reshape(loss_shape); } INSTANTIATE_CLASS(LossLayer); From d8c6aeb3d213ead88328ec8e859ae91585c545f9 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Sat, 29 Nov 2014 16:48:32 -0800 Subject: [PATCH 33/86] AccuracyLayer output is 0D (scalar) --- src/caffe/layers/accuracy_layer.cpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/caffe/layers/accuracy_layer.cpp b/src/caffe/layers/accuracy_layer.cpp index 3e8df34c0d6..7f28324541a 100644 --- a/src/caffe/layers/accuracy_layer.cpp +++ b/src/caffe/layers/accuracy_layer.cpp @@ -19,14 +19,15 @@ void AccuracyLayer::LayerSetUp( template void AccuracyLayer::Reshape( const vector*>& bottom, const vector*>& top) { - CHECK_EQ(bottom[0]->num(), bottom[1]->num()) - << "The data and label should have the same number."; - CHECK_LE(top_k_, bottom[0]->count() / bottom[0]->num()) + CHECK_LE(top_k_, bottom[0]->count() / bottom[1]->count()) << "top_k must be less than or equal to the number of classes."; - CHECK_EQ(bottom[1]->channels(), 1); - CHECK_EQ(bottom[1]->height(), 1); - CHECK_EQ(bottom[1]->width(), 1); - top[0]->Reshape(1, 1, 1, 1); + CHECK_GE(bottom[0]->num_axes(), bottom[1]->num_axes()); + for (int i = 0; i < bottom[1]->num_axes(); ++i) { + CHECK_LE(bottom[0]->shape(i), bottom[1]->shape(i)) + << "Dimension mismatch between predictions and label."; + } + vector top_shape(0); // Accuracy is a scalar; 0 axes. + top[0]->Reshape(top_shape); } template From 6b8a765864e853397e15c8ef57c8110d6a1b4332 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Fri, 30 Jan 2015 23:23:27 -0800 Subject: [PATCH 34/86] AccuracyLayer generalized to N instance axes --- src/caffe/layers/accuracy_layer.cpp | 4 ++-- src/caffe/test/test_accuracy_layer.cpp | 10 ++++++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/caffe/layers/accuracy_layer.cpp b/src/caffe/layers/accuracy_layer.cpp index 7f28324541a..186f9f8632c 100644 --- a/src/caffe/layers/accuracy_layer.cpp +++ b/src/caffe/layers/accuracy_layer.cpp @@ -36,8 +36,8 @@ void AccuracyLayer::Forward_cpu(const vector*>& bottom, Dtype accuracy = 0; const Dtype* bottom_data = bottom[0]->cpu_data(); const Dtype* bottom_label = bottom[1]->cpu_data(); - int num = bottom[0]->num(); - int dim = bottom[0]->count() / bottom[0]->num(); + int num = bottom[0]->count(0, bottom[1]->num_axes()); + int dim = bottom[0]->count() / num; vector maxval(top_k_+1); vector max_id(top_k_+1); for (int i = 0; i < num; ++i) { diff --git a/src/caffe/test/test_accuracy_layer.cpp b/src/caffe/test/test_accuracy_layer.cpp index fa59fab1e8a..1c58b767bfc 100644 --- a/src/caffe/test/test_accuracy_layer.cpp +++ b/src/caffe/test/test_accuracy_layer.cpp @@ -19,10 +19,16 @@ template class AccuracyLayerTest : public ::testing::Test { protected: AccuracyLayerTest() - : blob_bottom_data_(new Blob(100, 10, 1, 1)), - blob_bottom_label_(new Blob(100, 1, 1, 1)), + : blob_bottom_data_(new Blob()), + blob_bottom_label_(new Blob()), blob_top_(new Blob()), top_k_(3) { + vector shape(2); + shape[0] = 100; + shape[1] = 10; + blob_bottom_data_->Reshape(shape); + shape.resize(1); + blob_bottom_label_->Reshape(shape); // fill the probability values FillerParameter filler_param; GaussianFiller filler(filler_param); From 8e96445ba57890a982f6e8f5570e3aac5eaf64e6 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Fri, 13 Feb 2015 15:52:39 -0800 Subject: [PATCH 35/86] Test{Net,Solver} fixes for AccuracyLayer generalization --- src/caffe/test/test_net.cpp | 17 +++++++++-------- src/caffe/test/test_solver.cpp | 17 
+++++++++-------- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/src/caffe/test/test_net.cpp b/src/caffe/test/test_net.cpp index 1680a3f28d5..08106e79274 100644 --- a/src/caffe/test/test_net.cpp +++ b/src/caffe/test/test_net.cpp @@ -63,18 +63,19 @@ class NetTest : public MultiDeviceTest { " name: 'data' " " type: 'DummyData' " " dummy_data_param { " - " num: 5 " - " channels: 2 " - " height: 3 " - " width: 4 " - " num: 5 " - " channels: 1 " - " height: 1 " - " width: 1 " + " shape { " + " dim: 5 " + " dim: 2 " + " dim: 3 " + " dim: 4 " + " } " " data_filler { " " type: 'gaussian' " " std: 0.01 " " } " + " shape { " + " dim: 5 " + " } " " data_filler { " " type: 'constant' " " value: 0 " diff --git a/src/caffe/test/test_solver.cpp b/src/caffe/test/test_solver.cpp index 1c2c9bbb740..ceabc9cdd2c 100644 --- a/src/caffe/test/test_solver.cpp +++ b/src/caffe/test/test_solver.cpp @@ -55,14 +55,15 @@ TYPED_TEST(SolverTest, TestInitTrainTestNets) { " name: 'data' " " type: 'DummyData' " " dummy_data_param { " - " num: 5 " - " channels: 3 " - " height: 10 " - " width: 10 " - " num: 5 " - " channels: 1 " - " height: 1 " - " width: 1 " + " shape { " + " dim: 5 " + " dim: 2 " + " dim: 3 " + " dim: 4 " + " } " + " shape { " + " dim: 5 " + " } " " } " " top: 'data' " " top: 'label' " From bf73cb18982712d03f096418e2aa33074457ae8c Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Tue, 25 Nov 2014 23:47:10 -0800 Subject: [PATCH 36/86] EltwiseLayer need not assume old 4D dim names --- src/caffe/layers/eltwise_layer.cpp | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/src/caffe/layers/eltwise_layer.cpp b/src/caffe/layers/eltwise_layer.cpp index bbc34449588..a80700736bd 100644 --- a/src/caffe/layers/eltwise_layer.cpp +++ b/src/caffe/layers/eltwise_layer.cpp @@ -31,21 +31,14 @@ void EltwiseLayer::LayerSetUp(const vector*>& bottom, template void EltwiseLayer::Reshape(const vector*>& bottom, const vector*>& top) { - const int num = bottom[0]->num(); - const int channels = bottom[0]->channels(); - const int height = bottom[0]->height(); - const int width = bottom[0]->width(); for (int i = 1; i < bottom.size(); ++i) { - CHECK_EQ(num, bottom[i]->num()); - CHECK_EQ(channels, bottom[i]->channels()); - CHECK_EQ(height, bottom[i]->height()); - CHECK_EQ(width, bottom[i]->width()); + CHECK(bottom[i]->shape() == bottom[0]->shape()); } - top[0]->Reshape(num, channels, height, width); + top[0]->ReshapeLike(*bottom[0]); // If max operation, we will initialize the vector index part. 
if (this->layer_param_.eltwise_param().operation() == EltwiseParameter_EltwiseOp_MAX && top.size() == 1) { - max_idx_.Reshape(bottom[0]->num(), channels, height, width); + max_idx_.Reshape(bottom[0]->shape()); } } From 1b97c06b16cae847578fb2d7130f347862b1a262 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Wed, 26 Nov 2014 00:11:06 -0800 Subject: [PATCH 37/86] FlattenLayer: generalized Blob axes --- include/caffe/common_layers.hpp | 6 ------ src/caffe/layers/flatten_layer.cpp | 15 +++++---------- src/caffe/layers/flatten_layer.cu | 23 ----------------------- 3 files changed, 5 insertions(+), 39 deletions(-) delete mode 100644 src/caffe/layers/flatten_layer.cu diff --git a/include/caffe/common_layers.hpp b/include/caffe/common_layers.hpp index c67822c3738..a9c774f1485 100644 --- a/include/caffe/common_layers.hpp +++ b/include/caffe/common_layers.hpp @@ -216,8 +216,6 @@ class FlattenLayer : public Layer { */ virtual void Forward_cpu(const vector*>& bottom, const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); /** * @brief Computes the error gradient w.r.t. the concatenate inputs. @@ -230,10 +228,6 @@ class FlattenLayer : public Layer { */ virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - int count_; }; /** diff --git a/src/caffe/layers/flatten_layer.cpp b/src/caffe/layers/flatten_layer.cpp index eb7b42bc10b..745f271ea45 100644 --- a/src/caffe/layers/flatten_layer.cpp +++ b/src/caffe/layers/flatten_layer.cpp @@ -9,12 +9,11 @@ namespace caffe { template void FlattenLayer::Reshape(const vector*>& bottom, const vector*>& top) { - int channels_out = bottom[0]->channels() * bottom[0]->height() - * bottom[0]->width(); - top[0]->Reshape(bottom[0]->num(), channels_out, 1, 1); - count_ = bottom[0]->num() * channels_out; - CHECK_EQ(count_, bottom[0]->count()); - CHECK_EQ(count_, top[0]->count()); + vector top_shape(2); + top_shape[0] = bottom[0]->num(); + top_shape[1] = bottom[0]->count() / bottom[0]->num(); + top[0]->Reshape(top_shape); + CHECK_EQ(top[0]->count(), bottom[0]->count()); } template @@ -29,10 +28,6 @@ void FlattenLayer::Backward_cpu(const vector*>& top, bottom[0]->ShareDiff(*top[0]); } -#ifdef CPU_ONLY -STUB_GPU(FlattenLayer); -#endif - INSTANTIATE_CLASS(FlattenLayer); REGISTER_LAYER_CLASS(Flatten); diff --git a/src/caffe/layers/flatten_layer.cu b/src/caffe/layers/flatten_layer.cu deleted file mode 100644 index 42abdad4499..00000000000 --- a/src/caffe/layers/flatten_layer.cu +++ /dev/null @@ -1,23 +0,0 @@ -#include - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void FlattenLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - top[0]->ShareData(*bottom[0]); -} - -template -void FlattenLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - bottom[0]->ShareDiff(*top[0]); -} - -INSTANTIATE_LAYER_GPU_FUNCS(FlattenLayer); - -} // namespace caffe From fb9caeef09e75dbb6196977f2cfa592eb80c37e1 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Wed, 26 Nov 2014 03:23:42 -0800 Subject: [PATCH 38/86] common_layers.hpp: remove unused "Blob col_bob_" --- include/caffe/common_layers.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/caffe/common_layers.hpp b/include/caffe/common_layers.hpp index a9c774f1485..e9fe7cdeb9f 100644 --- 
a/include/caffe/common_layers.hpp +++ b/include/caffe/common_layers.hpp @@ -137,7 +137,6 @@ class ConcatLayer : public Layer { virtual void Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - Blob col_bob_; int count_; int num_; int channels_; @@ -452,7 +451,6 @@ class SliceLayer : public Layer { virtual void Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - Blob col_bob_; int count_; int num_; int channels_; From 704e524f6ec1e927db8cef57434002f6c34a30b7 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Wed, 26 Nov 2014 02:12:09 -0800 Subject: [PATCH 39/86] TestConcatLayer: fix style errors --- src/caffe/test/test_concat_layer.cpp | 68 ++++++++++++++-------------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/src/caffe/test/test_concat_layer.cpp b/src/caffe/test/test_concat_layer.cpp index f14f1d2fa4f..3fc507ee24e 100644 --- a/src/caffe/test/test_concat_layer.cpp +++ b/src/caffe/test/test_concat_layer.cpp @@ -19,9 +19,9 @@ class ConcatLayerTest : public MultiDeviceTest { protected: ConcatLayerTest() - : blob_bottom_0(new Blob(2, 3, 6, 5)), - blob_bottom_1(new Blob(2, 5, 6, 5)), - blob_bottom_2(new Blob(5, 3, 6, 5)), + : blob_bottom_0_(new Blob(2, 3, 6, 5)), + blob_bottom_1_(new Blob(2, 5, 6, 5)), + blob_bottom_2_(new Blob(5, 3, 6, 5)), blob_top_(new Blob()) {} virtual void SetUp() { // fill the values @@ -29,30 +29,30 @@ class ConcatLayerTest : public MultiDeviceTest { FillerParameter filler_param; filler_param.set_value(1.); filler.reset(new ConstantFiller(filler_param)); - filler->Fill(this->blob_bottom_0); + filler->Fill(this->blob_bottom_0_); filler_param.set_value(2.); filler.reset(new ConstantFiller(filler_param)); - filler->Fill(this->blob_bottom_1); + filler->Fill(this->blob_bottom_1_); filler_param.set_value(3.); filler.reset(new ConstantFiller(filler_param)); - filler->Fill(this->blob_bottom_2); - blob_bottom_vec_0.push_back(blob_bottom_0); - blob_bottom_vec_0.push_back(blob_bottom_1); - blob_bottom_vec_1.push_back(blob_bottom_0); - blob_bottom_vec_1.push_back(blob_bottom_2); + filler->Fill(this->blob_bottom_2_); + blob_bottom_vec_0_.push_back(blob_bottom_0_); + blob_bottom_vec_0_.push_back(blob_bottom_1_); + blob_bottom_vec_1_.push_back(blob_bottom_0_); + blob_bottom_vec_1_.push_back(blob_bottom_2_); blob_top_vec_.push_back(blob_top_); } virtual ~ConcatLayerTest() { - delete blob_bottom_0; delete blob_bottom_1; - delete blob_bottom_2; delete blob_top_; + delete blob_bottom_0_; delete blob_bottom_1_; + delete blob_bottom_2_; delete blob_top_; } - Blob* const blob_bottom_0; - Blob* const blob_bottom_1; - Blob* const blob_bottom_2; + Blob* const blob_bottom_0_; + Blob* const blob_bottom_1_; + Blob* const blob_bottom_2_; Blob* const blob_top_; - vector*> blob_bottom_vec_0, blob_bottom_vec_1; + vector*> blob_bottom_vec_0_, blob_bottom_vec_1_; vector*> blob_top_vec_; }; @@ -63,24 +63,24 @@ TYPED_TEST(ConcatLayerTest, TestSetupNum) { LayerParameter layer_param; layer_param.mutable_concat_param()->set_concat_dim(0); ConcatLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_1, this->blob_top_vec_); + layer.SetUp(this->blob_bottom_vec_1_, this->blob_top_vec_); EXPECT_EQ(this->blob_top_->num(), - this->blob_bottom_0->num() + this->blob_bottom_2->num()); - EXPECT_EQ(this->blob_top_->channels(), this->blob_bottom_0->channels()); - EXPECT_EQ(this->blob_top_->height(), this->blob_bottom_0->height()); - EXPECT_EQ(this->blob_top_->width(), this->blob_bottom_0->width()); + 
this->blob_bottom_0_->num() + this->blob_bottom_2_->num()); + EXPECT_EQ(this->blob_top_->channels(), this->blob_bottom_0_->channels()); + EXPECT_EQ(this->blob_top_->height(), this->blob_bottom_0_->height()); + EXPECT_EQ(this->blob_top_->width(), this->blob_bottom_0_->width()); } TYPED_TEST(ConcatLayerTest, TestSetupChannels) { typedef typename TypeParam::Dtype Dtype; LayerParameter layer_param; ConcatLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_0, this->blob_top_vec_); - EXPECT_EQ(this->blob_top_->num(), this->blob_bottom_0->num()); + layer.SetUp(this->blob_bottom_vec_0_, this->blob_top_vec_); + EXPECT_EQ(this->blob_top_->num(), this->blob_bottom_0_->num()); EXPECT_EQ(this->blob_top_->channels(), - this->blob_bottom_0->channels()+this->blob_bottom_1->channels()); - EXPECT_EQ(this->blob_top_->height(), this->blob_bottom_0->height()); - EXPECT_EQ(this->blob_top_->width(), this->blob_bottom_0->width()); + this->blob_bottom_0_->channels() + this->blob_bottom_1_->channels()); + EXPECT_EQ(this->blob_top_->height(), this->blob_bottom_0_->height()); + EXPECT_EQ(this->blob_top_->width(), this->blob_bottom_0_->width()); } @@ -88,22 +88,22 @@ TYPED_TEST(ConcatLayerTest, TestNum) { typedef typename TypeParam::Dtype Dtype; LayerParameter layer_param; ConcatLayer layer(layer_param); - layer.SetUp(this->blob_bottom_vec_0, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_0, this->blob_top_vec_); + layer.SetUp(this->blob_bottom_vec_0_, this->blob_top_vec_); + layer.Forward(this->blob_bottom_vec_0_, this->blob_top_vec_); for (int n = 0; n < this->blob_top_->num(); ++n) { - for (int c = 0; c < this->blob_bottom_0->channels(); ++c) { + for (int c = 0; c < this->blob_bottom_0_->channels(); ++c) { for (int h = 0; h < this->blob_top_->height(); ++h) { for (int w = 0; w < this->blob_top_->width(); ++w) { EXPECT_EQ(this->blob_top_->data_at(n, c, h, w), - this->blob_bottom_vec_0[0]->data_at(n, c, h, w)); + this->blob_bottom_vec_0_[0]->data_at(n, c, h, w)); } } } - for (int c = 0; c < this->blob_bottom_1->channels(); ++c) { + for (int c = 0; c < this->blob_bottom_1_->channels(); ++c) { for (int h = 0; h < this->blob_top_->height(); ++h) { for (int w = 0; w < this->blob_top_->width(); ++w) { - EXPECT_EQ(this->blob_top_->data_at(n, c+3, h, w), - this->blob_bottom_vec_0[1]->data_at(n, c, h, w)); + EXPECT_EQ(this->blob_top_->data_at(n, c + 3, h, w), + this->blob_bottom_vec_0_[1]->data_at(n, c, h, w)); } } } @@ -115,7 +115,7 @@ TYPED_TEST(ConcatLayerTest, TestGradient) { LayerParameter layer_param; ConcatLayer layer(layer_param); GradientChecker checker(1e-2, 1e-2); - checker.CheckGradient(&layer, this->blob_bottom_vec_0, + checker.CheckGradient(&layer, this->blob_bottom_vec_0_, this->blob_top_vec_); } From d52e9a811bd0afa12602248cb377ed9234be1a33 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Wed, 26 Nov 2014 02:24:41 -0800 Subject: [PATCH 40/86] TestConcatLayer: add forward/gradient tests for concatenation along num --- src/caffe/test/test_concat_layer.cpp | 42 ++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/src/caffe/test/test_concat_layer.cpp b/src/caffe/test/test_concat_layer.cpp index 3fc507ee24e..875468faa49 100644 --- a/src/caffe/test/test_concat_layer.cpp +++ b/src/caffe/test/test_concat_layer.cpp @@ -83,8 +83,36 @@ TYPED_TEST(ConcatLayerTest, TestSetupChannels) { EXPECT_EQ(this->blob_top_->width(), this->blob_bottom_0_->width()); } +TYPED_TEST(ConcatLayerTest, TestForwardNum) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter 
layer_param; + layer_param.mutable_concat_param()->set_concat_dim(0); + ConcatLayer layer(layer_param); + layer.SetUp(this->blob_bottom_vec_1_, this->blob_top_vec_); + layer.Forward(this->blob_bottom_vec_1_, this->blob_top_vec_); + for (int n = 0; n < this->blob_bottom_vec_1_[0]->num(); ++n) { + for (int c = 0; c < this->blob_top_->channels(); ++c) { + for (int h = 0; h < this->blob_top_->height(); ++h) { + for (int w = 0; w < this->blob_top_->width(); ++w) { + EXPECT_EQ(this->blob_top_->data_at(n, c, h, w), + this->blob_bottom_vec_1_[0]->data_at(n, c, h, w)); + } + } + } + } + for (int n = 0; n < this->blob_bottom_vec_1_[1]->num(); ++n) { + for (int c = 0; c < this->blob_top_->channels(); ++c) { + for (int h = 0; h < this->blob_top_->height(); ++h) { + for (int w = 0; w < this->blob_top_->width(); ++w) { + EXPECT_EQ(this->blob_top_->data_at(n + 2, c, h, w), + this->blob_bottom_vec_1_[1]->data_at(n, c, h, w)); + } + } + } + } +} -TYPED_TEST(ConcatLayerTest, TestNum) { +TYPED_TEST(ConcatLayerTest, TestForwardChannels) { typedef typename TypeParam::Dtype Dtype; LayerParameter layer_param; ConcatLayer layer(layer_param); @@ -110,7 +138,17 @@ TYPED_TEST(ConcatLayerTest, TestNum) { } } -TYPED_TEST(ConcatLayerTest, TestGradient) { +TYPED_TEST(ConcatLayerTest, TestGradientNum) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + layer_param.mutable_concat_param()->set_concat_dim(0); + ConcatLayer layer(layer_param); + GradientChecker checker(1e-2, 1e-2); + checker.CheckGradient(&layer, this->blob_bottom_vec_1_, + this->blob_top_vec_); +} + +TYPED_TEST(ConcatLayerTest, TestGradientChannels) { typedef typename TypeParam::Dtype Dtype; LayerParameter layer_param; ConcatLayer layer(layer_param); From 8afdcd02f06923a1ea61fff7bd35e72fb5c57ab4 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Wed, 26 Nov 2014 00:03:36 -0800 Subject: [PATCH 41/86] ConcatLayer: generalized Blob axes --- include/caffe/common_layers.hpp | 16 ++-- src/caffe/layers/concat_layer.cpp | 118 ++++++++++++--------------- src/caffe/layers/concat_layer.cu | 71 +++++----------- src/caffe/proto/caffe.proto | 10 ++- src/caffe/test/test_concat_layer.cpp | 22 ++++- 5 files changed, 108 insertions(+), 129 deletions(-) diff --git a/include/caffe/common_layers.hpp b/include/caffe/common_layers.hpp index e9fe7cdeb9f..114f24a2b75 100644 --- a/include/caffe/common_layers.hpp +++ b/include/caffe/common_layers.hpp @@ -99,8 +99,8 @@ class ConcatLayer : public Layer { * - K @f$ (N \times C \times H \times W) @f$ * the inputs @f$ x_K @f$ * @param top output Blob vector (length 1) - * -# @f$ (KN \times C \times H \times W) @f$ if concat_dim == 0, or - * @f$ (N \times KC \times H \times W) @f$ if concat_dim == 1: + * -# @f$ (KN \times C \times H \times W) @f$ if axis == 0, or + * @f$ (N \times KC \times H \times W) @f$ if axis == 1: * the concatenated output @f$ * y = [\begin{array}{cccc} x_1 & x_2 & ... & x_K \end{array}] * @f$ @@ -115,8 +115,8 @@ class ConcatLayer : public Layer { * * @param top output Blob vector (length 1), providing the error gradient with * respect to the outputs - * -# @f$ (KN \times C \times H \times W) @f$ if concat_dim == 0, or - * @f$ (N \times KC \times H \times W) @f$ if concat_dim == 1: + * -# @f$ (KN \times C \times H \times W) @f$ if axis == 0, or + * @f$ (N \times KC \times H \times W) @f$ if axis == 1: * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ * with respect to concatenated outputs @f$ y @f$ * @param propagate_down see Layer::Backward. 
@@ -138,11 +138,9 @@ class ConcatLayer : public Layer { const vector& propagate_down, const vector*>& bottom); int count_; - int num_; - int channels_; - int height_; - int width_; - int concat_dim_; + int num_concats_; + int concat_input_size_; + int concat_axis_; }; /** diff --git a/src/caffe/layers/concat_layer.cpp b/src/caffe/layers/concat_layer.cpp index fc88433c42b..1cac8fc3387 100644 --- a/src/caffe/layers/concat_layer.cpp +++ b/src/caffe/layers/concat_layer.cpp @@ -9,62 +9,63 @@ namespace caffe { template void ConcatLayer::LayerSetUp(const vector*>& bottom, const vector*>& top) { - concat_dim_ = this->layer_param_.concat_param().concat_dim(); - CHECK_GE(concat_dim_, 0) << - "concat_dim should be >= 0"; - CHECK_LE(concat_dim_, 1) << - "For now concat_dim <=1, it can only concat num and channels"; + const ConcatParameter& concat_param = this->layer_param_.concat_param(); + CHECK(!(concat_param.has_axis() && concat_param.has_concat_dim())) + << "Either axis or concat_dim should be specified; not both."; } template void ConcatLayer::Reshape(const vector*>& bottom, const vector*>& top) { + const int num_axes = bottom[0]->num_axes(); + const ConcatParameter& concat_param = this->layer_param_.concat_param(); + if (concat_param.has_concat_dim()) { + concat_axis_ = static_cast(concat_param.concat_dim()); + // Don't allow negative indexing for concat_dim, a uint32 -- almost + // certainly unintended. + CHECK_GE(concat_axis_, 0) << "casting concat_dim from uint32 to int32 " + << "produced negative result; concat_dim must satisfy " + << "0 <= concat_dim < " << kMaxBlobAxes; + CHECK_LT(concat_axis_, num_axes) << "concat_dim out of range."; + } else { + concat_axis_ = bottom[0]->CanonicalAxisIndex(concat_param.axis()); + } // Initialize with the first blob. 
- count_ = bottom[0]->count(); - num_ = bottom[0]->num(); - channels_ = bottom[0]->channels(); - height_ = bottom[0]->height(); - width_ = bottom[0]->width(); + vector top_shape = bottom[0]->shape(); + num_concats_ = bottom[0]->count(0, concat_axis_); + concat_input_size_ = bottom[0]->count(concat_axis_ + 1); + int bottom_count_sum = bottom[0]->count(); for (int i = 1; i < bottom.size(); ++i) { - count_ += bottom[i]->count(); - if (concat_dim_== 0) { - num_ += bottom[i]->num(); - } else if (concat_dim_ == 1) { - channels_ += bottom[i]->channels(); - } else if (concat_dim_ == 2) { - height_ += bottom[i]->height(); - } else if (concat_dim_ == 3) { - width_ += bottom[i]->width(); + CHECK_EQ(num_axes, bottom[i]->num_axes()) + << "All inputs must have the same #axes."; + for (int j = 0; j < num_axes; ++j) { + if (j == concat_axis_) { continue; } + CHECK_EQ(top_shape[j], bottom[i]->shape(j)) + << "All inputs must have the same shape, except at concat_axis."; } + bottom_count_sum += bottom[i]->count(); + top_shape[concat_axis_] += bottom[i]->shape(concat_axis_); } - top[0]->Reshape(num_, channels_, height_, width_); - CHECK_EQ(count_, top[0]->count()); + top[0]->Reshape(top_shape); + CHECK_EQ(bottom_count_sum, top[0]->count()); } template void ConcatLayer::Forward_cpu(const vector*>& bottom, const vector*>& top) { Dtype* top_data = top[0]->mutable_cpu_data(); - if (concat_dim_== 0) { - int offset_num = 0; - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->cpu_data(); - int num_elem = bottom[i]->count(); - caffe_copy(num_elem, bottom_data, top_data+top[0]->offset(offset_num)); - offset_num += bottom[i]->num(); + int offset_concat_axis = 0; + const int top_concat_axis = top[0]->shape(concat_axis_); + for (int i = 0; i < bottom.size(); ++i) { + const Dtype* bottom_data = bottom[i]->cpu_data(); + const int bottom_concat_axis = bottom[i]->shape(concat_axis_); + for (int n = 0; n < num_concats_; ++n) { + caffe_copy(bottom_concat_axis * concat_input_size_, + bottom_data + n * bottom_concat_axis * concat_input_size_, + top_data + (n * top_concat_axis + offset_concat_axis) + * concat_input_size_); } - } else if (concat_dim_ == 1) { - int offset_channel = 0; - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->cpu_data(); - int num_elem = - bottom[i]->channels()*bottom[i]->height()*bottom[i]->width(); - for (int n = 0; n < num_; ++n) { - caffe_copy(num_elem, bottom_data+bottom[i]->offset(n), - top_data+top[0]->offset(n, offset_channel)); - } - offset_channel += bottom[i]->channels(); - } // concat_dim_ is guaranteed to be 0 or 1 by LayerSetUp. 
+ offset_concat_axis += bottom_concat_axis; } } @@ -72,32 +73,19 @@ template void ConcatLayer::Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { const Dtype* top_diff = top[0]->cpu_diff(); - if (concat_dim_ == 0) { - int offset_num = 0; - for (int i = 0; i < bottom.size(); ++i) { - Blob* blob = bottom[i]; - if (propagate_down[i]) { - Dtype* bottom_diff = blob->mutable_cpu_diff(); - caffe_copy(blob->count(), top_diff + top[0]->offset(offset_num), - bottom_diff); - } - offset_num += blob->num(); - } - } else if (concat_dim_ == 1) { - int offset_channel = 0; - for (int i = 0; i < bottom.size(); ++i) { - Blob* blob = bottom[i]; - if (propagate_down[i]) { - Dtype* bottom_diff = blob->mutable_cpu_diff(); - int num_elem = blob->channels()*blob->height()*blob->width(); - for (int n = 0; n < num_; ++n) { - caffe_copy(num_elem, top_diff + top[0]->offset(n, offset_channel), - bottom_diff + blob->offset(n)); - } - } - offset_channel += blob->channels(); + int offset_concat_axis = 0; + const int top_concat_axis = top[0]->shape(concat_axis_); + for (int i = 0; i < bottom.size(); ++i) { + if (!propagate_down[i]) { continue; } + Dtype* bottom_diff = bottom[i]->mutable_cpu_diff(); + const int bottom_concat_axis = bottom[i]->shape(concat_axis_); + for (int n = 0; n < num_concats_; ++n) { + caffe_copy(bottom_concat_axis * concat_input_size_, top_diff + + (n * top_concat_axis + offset_concat_axis) * concat_input_size_, + bottom_diff + n * bottom_concat_axis * concat_input_size_); } - } // concat_dim_ is guaranteed to be 0 or 1 by LayerSetUp. + offset_concat_axis += bottom_concat_axis; + } } #ifdef CPU_ONLY diff --git a/src/caffe/layers/concat_layer.cu b/src/caffe/layers/concat_layer.cu index 88fc090025f..dbadb5aeb30 100644 --- a/src/caffe/layers/concat_layer.cu +++ b/src/caffe/layers/concat_layer.cu @@ -10,29 +10,18 @@ template void ConcatLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { Dtype* top_data = top[0]->mutable_gpu_data(); - if (concat_dim_ == 0) { - int offset_num = 0; - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - caffe_copy(bottom[i]->count(), bottom_data, - top_data + top[0]->offset(offset_num)); - offset_num += bottom[i]->num(); + int offset_concat_axis = 0; + const int top_concat_axis = top[0]->shape(concat_axis_); + for (int i = 0; i < bottom.size(); ++i) { + const Dtype* bottom_data = bottom[i]->gpu_data(); + const int bottom_concat_axis = bottom[i]->shape(concat_axis_); + for (int n = 0; n < num_concats_; ++n) { + caffe_copy(bottom_concat_axis * concat_input_size_, + bottom_data + n * bottom_concat_axis * concat_input_size_, + top_data + (n * top_concat_axis + offset_concat_axis) + * concat_input_size_); } - } else if (concat_dim_ == 1) { - int offset_channel = 0; - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - int num_elem = - bottom[i]->channels() * bottom[i]->height() * bottom[i]->width(); - for (int n = 0; n < num_; ++n) { - caffe_copy(num_elem, bottom_data+bottom[i]->offset(n), - top_data + top[0]->offset(n, offset_channel)); - } - offset_channel += bottom[i]->channels(); - } - } else { - LOG(FATAL) << "concat_dim along dim" << concat_dim_ << - " not implemented yet"; + offset_concat_axis += bottom_concat_axis; } } @@ -40,34 +29,18 @@ template void ConcatLayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { const Dtype* top_diff = top[0]->gpu_diff(); - if (concat_dim_ == 0) { - int 
offset_num = 0; - for (int i = 0; i < bottom.size(); ++i) { - Blob* blob = bottom[i]; - if (propagate_down[i]) { - Dtype* bottom_diff = blob->mutable_gpu_diff(); - caffe_copy(blob->count(), top_diff + top[0]->offset(offset_num), - bottom_diff); - } - offset_num += blob->num(); - } - } else if (concat_dim_ == 1) { - int offset_channel = 0; - for (int i = 0; i < bottom.size(); ++i) { - Blob* blob = bottom[i]; - if (propagate_down[i]) { - Dtype* bottom_diff = blob->mutable_gpu_diff(); - int num_elem = blob->channels()*blob->height()*blob->width(); - for (int n = 0; n < num_; ++n) { - caffe_copy(num_elem, top_diff + top[0]->offset(n, offset_channel), - bottom_diff + blob->offset(n)); - } - } - offset_channel += blob->channels(); + int offset_concat_axis = 0; + const int top_concat_axis = top[0]->shape(concat_axis_); + for (int i = 0; i < bottom.size(); ++i) { + if (!propagate_down[i]) { continue; } + Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); + const int bottom_concat_axis = bottom[i]->shape(concat_axis_); + for (int n = 0; n < num_concats_; ++n) { + caffe_copy(bottom_concat_axis * concat_input_size_, top_diff + + (n * top_concat_axis + offset_concat_axis) * concat_input_size_, + bottom_diff + n * bottom_concat_axis * concat_input_size_); } - } else { - LOG(FATAL) << "concat_dim along dim" << concat_dim_ << - " not implemented yet"; + offset_concat_axis += bottom_concat_axis; } } diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index e8a134865bb..7a4ecf93c7a 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -378,9 +378,13 @@ message ArgMaxParameter { // Message that stores parameters used by ConcatLayer message ConcatParameter { - // Concat Layer needs to specify the dimension along the concat will happen, - // the other dimensions must be the same for all the bottom blobs - // By default it will concatenate blobs along channels dimension + // The axis along which to concatenate -- may be negative to index from the + // end (e.g., -1 for the last axis). Other axes must have the + // same dimension for all the bottom blobs. + // By default, ConcatLayer concatenates blobs along the "channels" axis (1). + optional int32 axis = 2 [default = 1]; + + // DEPRECATED: alias for "axis" -- does not support negative indexing. optional uint32 concat_dim = 1 [default = 1]; } diff --git a/src/caffe/test/test_concat_layer.cpp b/src/caffe/test/test_concat_layer.cpp index 875468faa49..662a50fa23b 100644 --- a/src/caffe/test/test_concat_layer.cpp +++ b/src/caffe/test/test_concat_layer.cpp @@ -61,7 +61,7 @@ TYPED_TEST_CASE(ConcatLayerTest, TestDtypesAndDevices); TYPED_TEST(ConcatLayerTest, TestSetupNum) { typedef typename TypeParam::Dtype Dtype; LayerParameter layer_param; - layer_param.mutable_concat_param()->set_concat_dim(0); + layer_param.mutable_concat_param()->set_axis(0); ConcatLayer layer(layer_param); layer.SetUp(this->blob_bottom_vec_1_, this->blob_top_vec_); EXPECT_EQ(this->blob_top_->num(), @@ -83,10 +83,26 @@ TYPED_TEST(ConcatLayerTest, TestSetupChannels) { EXPECT_EQ(this->blob_top_->width(), this->blob_bottom_0_->width()); } +TYPED_TEST(ConcatLayerTest, TestSetupChannelsNegativeIndexing) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + ConcatLayer layer(layer_param); + // "channels" index is the third one from the end -- test negative indexing + // by setting axis to -3 and checking that we get the same results as above in + // TestSetupChannels. 
+ layer_param.mutable_concat_param()->set_axis(-3); + layer.SetUp(this->blob_bottom_vec_0_, this->blob_top_vec_); + EXPECT_EQ(this->blob_top_->num(), this->blob_bottom_0_->num()); + EXPECT_EQ(this->blob_top_->channels(), + this->blob_bottom_0_->channels() + this->blob_bottom_1_->channels()); + EXPECT_EQ(this->blob_top_->height(), this->blob_bottom_0_->height()); + EXPECT_EQ(this->blob_top_->width(), this->blob_bottom_0_->width()); +} + TYPED_TEST(ConcatLayerTest, TestForwardNum) { typedef typename TypeParam::Dtype Dtype; LayerParameter layer_param; - layer_param.mutable_concat_param()->set_concat_dim(0); + layer_param.mutable_concat_param()->set_axis(0); ConcatLayer layer(layer_param); layer.SetUp(this->blob_bottom_vec_1_, this->blob_top_vec_); layer.Forward(this->blob_bottom_vec_1_, this->blob_top_vec_); @@ -141,7 +157,7 @@ TYPED_TEST(ConcatLayerTest, TestForwardChannels) { TYPED_TEST(ConcatLayerTest, TestGradientNum) { typedef typename TypeParam::Dtype Dtype; LayerParameter layer_param; - layer_param.mutable_concat_param()->set_concat_dim(0); + layer_param.mutable_concat_param()->set_axis(0); ConcatLayer layer(layer_param); GradientChecker checker(1e-2, 1e-2); checker.CheckGradient(&layer, this->blob_bottom_vec_1_, From b86891635dbb24f70d5634a679150070caf776e4 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Wed, 26 Nov 2014 03:22:59 -0800 Subject: [PATCH 42/86] SliceLayer: generalized Blob axes --- include/caffe/common_layers.hpp | 8 +- src/caffe/layers/slice_layer.cpp | 142 ++++++++++++---------------- src/caffe/layers/slice_layer.cu | 70 ++++++-------- src/caffe/proto/caffe.proto | 12 ++- src/caffe/test/test_slice_layer.cpp | 6 +- 5 files changed, 100 insertions(+), 138 deletions(-) diff --git a/include/caffe/common_layers.hpp b/include/caffe/common_layers.hpp index 114f24a2b75..4e47e55d5b6 100644 --- a/include/caffe/common_layers.hpp +++ b/include/caffe/common_layers.hpp @@ -450,11 +450,9 @@ class SliceLayer : public Layer { const vector& propagate_down, const vector*>& bottom); int count_; - int num_; - int channels_; - int height_; - int width_; - int slice_dim_; + int num_slices_; + int slice_size_; + int slice_axis_; vector slice_point_; }; diff --git a/src/caffe/layers/slice_layer.cpp b/src/caffe/layers/slice_layer.cpp index 46c3acd6513..e4418c9cf9c 100644 --- a/src/caffe/layers/slice_layer.cpp +++ b/src/caffe/layers/slice_layer.cpp @@ -11,9 +11,8 @@ template void SliceLayer::LayerSetUp(const vector*>& bottom, const vector*>& top) { const SliceParameter& slice_param = this->layer_param_.slice_param(); - slice_dim_ = slice_param.slice_dim(); - CHECK_GE(slice_dim_, 0); - CHECK_LE(slice_dim_, 1) << "Can only slice num and channels"; + CHECK(!(slice_param.has_axis() && slice_param.has_slice_dim())) + << "Either axis or slice_dim should be specified; not both."; slice_point_.clear(); std::copy(slice_param.slice_point().begin(), slice_param.slice_point().end(), @@ -23,18 +22,27 @@ void SliceLayer::LayerSetUp(const vector*>& bottom, template void SliceLayer::Reshape(const vector*>& bottom, const vector*>& top) { - count_ = 0; - num_ = bottom[0]->num(); - channels_ = bottom[0]->channels(); - height_ = bottom[0]->height(); - width_ = bottom[0]->width(); + const int num_axes = bottom[0]->num_axes(); + const SliceParameter& slice_param = this->layer_param_.slice_param(); + if (slice_param.has_slice_dim()) { + slice_axis_ = static_cast(slice_param.slice_dim()); + // Don't allow negative indexing for slice_dim, a uint32 -- almost + // certainly unintended. 
+ CHECK_GE(slice_axis_, 0) << "casting slice_dim from uint32 to int32 " + << "produced negative result; slice_dim must satisfy " + << "0 <= slice_dim < " << kMaxBlobAxes; + CHECK_LT(slice_axis_, num_axes) << "slice_dim out of range."; + } else { + slice_axis_ = bottom[0]->CanonicalAxisIndex(slice_param.axis()); + } + vector top_shape = bottom[0]->shape(); + const int bottom_slice_axis = bottom[0]->shape(slice_axis_); + num_slices_ = bottom[0]->count(0, slice_axis_); + slice_size_ = bottom[0]->count(slice_axis_ + 1); + int count = 0; if (slice_point_.size() != 0) { CHECK_EQ(slice_point_.size(), top.size() - 1); - if (slice_dim_ == 0) { - CHECK_LE(top.size(), num_); - } else { - CHECK_LE(top.size(), channels_); - } + CHECK_LE(top.size(), bottom_slice_axis); int prev = 0; vector slices; for (int i = 0; i < slice_point_.size(); ++i) { @@ -42,94 +50,64 @@ void SliceLayer::Reshape(const vector*>& bottom, slices.push_back(slice_point_[i] - prev); prev = slice_point_[i]; } - if (slice_dim_ == 0) { - slices.push_back(num_ - prev); - for (int i = 0; i < top.size(); ++i) { - top[i]->Reshape(slices[i], channels_, height_, width_); - count_ += top[i]->count(); - } - } else { - slices.push_back(channels_ - prev); - for (int i = 0; i < top.size(); ++i) { - top[i]->Reshape(num_, slices[i], height_, width_); - count_ += top[i]->count(); - } + slices.push_back(bottom_slice_axis - prev); + for (int i = 0; i < top.size(); ++i) { + top_shape[slice_axis_] = slices[i]; + top[i]->Reshape(top_shape); + count += top[i]->count(); } } else { - if (slice_dim_ == 0) { - CHECK_EQ(num_ % top.size(), 0) - << "Number of top blobs (" << top.size() << ") " - << "should evenly divide input num ( " << num_ << ")"; - num_ = num_ / top.size(); - } else { - CHECK_EQ(channels_ % top.size(), 0) - << "Number of top blobs (" << top.size() << ") " - << "should evenly divide input channels ( " << channels_ << ")"; - channels_ = channels_ / top.size(); - } + CHECK_EQ(bottom_slice_axis % top.size(), 0) + << "Number of top blobs (" << top.size() << ") should evenly " + << "divide input slice axis (" << bottom_slice_axis << ")"; + top_shape[slice_axis_] = bottom_slice_axis / top.size(); for (int i = 0; i < top.size(); ++i) { - top[i]->Reshape(num_, channels_, height_, width_); - count_ += top[i]->count(); + top[i]->Reshape(top_shape); + count += top[i]->count(); } } - CHECK_EQ(count_, bottom[0]->count()); + CHECK_EQ(count, bottom[0]->count()); } template void SliceLayer::Forward_cpu(const vector*>& bottom, const vector*>& top) { - const Dtype* bottom_data = bottom[0]->mutable_cpu_data(); - if (slice_dim_ == 0) { - int offset_num = 0; - for (int i = 0; i < top.size(); ++i) { - Blob* blob = top[i]; - Dtype* top_data = blob->mutable_cpu_data(); - caffe_copy(blob->count(), bottom_data + bottom[0]->offset(offset_num), - top_data); - offset_num += blob->num(); + int offset_slice_axis = 0; + const Dtype* bottom_data = bottom[0]->cpu_data(); + const int bottom_slice_axis = bottom[0]->shape(slice_axis_); + for (int i = 0; i < top.size(); ++i) { + Dtype* top_data = top[i]->mutable_cpu_data(); + const int top_slice_axis = top[i]->shape(slice_axis_); + for (int n = 0; n < num_slices_; ++n) { + const int top_offset = n * top_slice_axis * slice_size_; + const int bottom_offset = + (n * bottom_slice_axis + offset_slice_axis) * slice_size_; + caffe_copy(top_slice_axis * slice_size_, + bottom_data + bottom_offset, top_data + top_offset); } - } else if (slice_dim_ == 1) { - int offset_channel = 0; - for (int i = 0; i < top.size(); ++i) { - Blob* blob = 
top[i]; - Dtype* top_data = blob->mutable_cpu_data(); - const int num_elem = blob->channels() * blob->height() * blob->width(); - for (int n = 0; n < num_; ++n) { - caffe_copy(num_elem, bottom_data + bottom[0]->offset(n, offset_channel), - top_data + blob->offset(n)); - } - offset_channel += blob->channels(); - } - } // slice_dim_ is guaranteed to be 0 or 1 by SetUp. + offset_slice_axis += top_slice_axis; + } } template void SliceLayer::Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { if (!propagate_down[0]) { return; } + int offset_slice_axis = 0; Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - if (slice_dim_ == 0) { - int offset_num = 0; - for (int i = 0; i < top.size(); ++i) { - Blob* blob = top[i]; - const Dtype* top_diff = blob->cpu_diff(); - caffe_copy(blob->count(), top_diff, - bottom_diff + bottom[0]->offset(offset_num)); - offset_num += blob->num(); + const int bottom_slice_axis = bottom[0]->shape(slice_axis_); + for (int i = 0; i < top.size(); ++i) { + const Dtype* top_diff = top[i]->cpu_diff(); + const int top_slice_axis = top[i]->shape(slice_axis_); + for (int n = 0; n < num_slices_; ++n) { + const int top_offset = n * top_slice_axis * slice_size_; + const int bottom_offset = + (n * bottom_slice_axis + offset_slice_axis) * slice_size_; + caffe_copy(top_slice_axis * slice_size_, + top_diff + top_offset, bottom_diff + bottom_offset); } - } else if (slice_dim_ == 1) { - int offset_channel = 0; - for (int i = 0; i < top.size(); ++i) { - Blob* blob = top[i]; - const Dtype* top_diff = blob->cpu_diff(); - const int num_elem = blob->channels() * blob->height() * blob->width(); - for (int n = 0; n < num_; ++n) { - caffe_copy(num_elem, top_diff + blob->offset(n), - bottom_diff + bottom[0]->offset(n, offset_channel)); - } - offset_channel += blob->channels(); - } - } // slice_dim_ is guaranteed to be 0 or 1 by SetUp. 
+ offset_slice_axis += top_slice_axis; + } } #ifdef CPU_ONLY diff --git a/src/caffe/layers/slice_layer.cu b/src/caffe/layers/slice_layer.cu index b5c5e61533f..e6e65677bd8 100644 --- a/src/caffe/layers/slice_layer.cu +++ b/src/caffe/layers/slice_layer.cu @@ -9,58 +9,42 @@ namespace caffe { template void SliceLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { - const Dtype* bottom_data = bottom[0]->mutable_gpu_data(); - if (slice_dim_ == 0) { - int offset_num = 0; - for (int i = 0; i < top.size(); ++i) { - Blob* blob = top[i]; - Dtype* top_data = blob->mutable_gpu_data(); - caffe_copy(blob->count(), bottom_data + bottom[0]->offset(offset_num), - top_data); - offset_num += blob->num(); + int offset_slice_axis = 0; + const Dtype* bottom_data = bottom[0]->gpu_data(); + const int bottom_slice_axis = bottom[0]->shape(slice_axis_); + for (int i = 0; i < top.size(); ++i) { + Dtype* top_data = top[i]->mutable_gpu_data(); + const int top_slice_axis = top[i]->shape(slice_axis_); + for (int n = 0; n < num_slices_; ++n) { + const int top_offset = n * top_slice_axis * slice_size_; + const int bottom_offset = + (n * bottom_slice_axis + offset_slice_axis) * slice_size_; + caffe_copy(top_slice_axis * slice_size_, + bottom_data + bottom_offset, top_data + top_offset); } - } else if (slice_dim_ == 1) { - int offset_channel = 0; - for (int i = 0; i < top.size(); ++i) { - Blob* blob = top[i]; - Dtype* top_data = blob->mutable_gpu_data(); - const int num_elem = blob->channels() * blob->height() * blob->width(); - for (int n = 0; n < num_; ++n) { - caffe_copy(num_elem, bottom_data + bottom[0]->offset(n, offset_channel), - top_data + blob->offset(n)); - } - offset_channel += blob->channels(); - } - } // slice_dim_ is guaranteed to be 0 or 1 by SetUp. + offset_slice_axis += top_slice_axis; + } } template void SliceLayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { if (!propagate_down[0]) { return; } + int offset_slice_axis = 0; Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - if (slice_dim_ == 0) { - int offset_num = 0; - for (int i = 0; i < top.size(); ++i) { - Blob* blob = top[i]; - const Dtype* top_diff = blob->gpu_diff(); - caffe_copy(blob->count(), top_diff, - bottom_diff + bottom[0]->offset(offset_num)); - offset_num += blob->num(); - } - } else if (slice_dim_ == 1) { - int offset_channel = 0; - for (int i = 0; i < top.size(); ++i) { - Blob* blob = top[i]; - const Dtype* top_diff = blob->gpu_diff(); - const int num_elem = blob->channels() * blob->height() * blob->width(); - for (int n = 0; n < num_; ++n) { - caffe_copy(num_elem, top_diff + blob->offset(n), - bottom_diff + bottom[0]->offset(n, offset_channel)); - } - offset_channel += blob->channels(); + const int bottom_slice_axis = bottom[0]->shape(slice_axis_); + for (int i = 0; i < top.size(); ++i) { + const Dtype* top_diff = top[i]->gpu_diff(); + const int top_slice_axis = top[i]->shape(slice_axis_); + for (int n = 0; n < num_slices_; ++n) { + const int top_offset = n * top_slice_axis * slice_size_; + const int bottom_offset = + (n * bottom_slice_axis + offset_slice_axis) * slice_size_; + caffe_copy(top_slice_axis * slice_size_, + top_diff + top_offset, bottom_diff + bottom_offset); } - } // slice_dim_ is guaranteed to be 0 or 1 by SetUp. 
+ offset_slice_axis += top_slice_axis; + } } INSTANTIATE_LAYER_GPU_FUNCS(SliceLayer); diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index 7a4ecf93c7a..7783a783dd7 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -674,12 +674,14 @@ message SigmoidParameter { // Message that stores parameters used by SliceLayer message SliceParameter { - // SliceLayer needs to know which dimension to slice across. - // Currently, SliceLayer only supports slicing across num (dim 0) - // and channels (dim 1). - // By default, SliceLayer slices across channels. - optional uint32 slice_dim = 1 [default = 1]; + // The axis along which to slice -- may be negative to index from the end + // (e.g., -1 for the last axis). + // By default, SliceLayer concatenates blobs along the "channels" axis (1). + optional int32 axis = 3 [default = 1]; repeated uint32 slice_point = 2; + + // DEPRECATED: alias for "axis" -- does not support negative indexing. + optional uint32 slice_dim = 1 [default = 1]; } // Message that stores parameters used by SoftmaxLayer, SoftmaxWithLossLayer diff --git a/src/caffe/test/test_slice_layer.cpp b/src/caffe/test/test_slice_layer.cpp index 395be280089..ccd03646d19 100644 --- a/src/caffe/test/test_slice_layer.cpp +++ b/src/caffe/test/test_slice_layer.cpp @@ -62,7 +62,7 @@ TYPED_TEST_CASE(SliceLayerTest, TestDtypesAndDevices); TYPED_TEST(SliceLayerTest, TestSetupNum) { typedef typename TypeParam::Dtype Dtype; LayerParameter layer_param; - layer_param.mutable_slice_param()->set_slice_dim(0); + layer_param.mutable_slice_param()->set_axis(0); SliceLayer layer(layer_param); layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_1_); EXPECT_EQ(this->blob_bottom_->num(), 3 * this->blob_top_0_->num()); @@ -91,7 +91,7 @@ TYPED_TEST(SliceLayerTest, TestSetupChannels) { TYPED_TEST(SliceLayerTest, TestSliceAcrossNum) { typedef typename TypeParam::Dtype Dtype; LayerParameter layer_param; - layer_param.mutable_slice_param()->set_slice_dim(0); + layer_param.mutable_slice_param()->set_axis(0); SliceLayer layer(layer_param); layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_0_); const int top_num = this->blob_bottom_->num() / 2; @@ -166,7 +166,7 @@ TYPED_TEST(SliceLayerTest, TestGradientAcrossNum) { // Gradient checks are slow; reduce blob size. 
this->ReduceBottomBlobSize(); LayerParameter layer_param; - layer_param.mutable_slice_param()->set_slice_dim(0); + layer_param.mutable_slice_param()->set_axis(0); SliceLayer layer(layer_param); GradientChecker checker(1e-2, 1e-3); checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, From abec30252ced89d9e2550ca47fca569f563479f6 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Sun, 15 Feb 2015 13:26:36 -0800 Subject: [PATCH 43/86] SoftmaxLayer: generalized Blob axes --- include/caffe/common_layers.hpp | 3 ++ src/caffe/layers/softmax_layer.cpp | 62 +++++++++++++++--------------- src/caffe/layers/softmax_layer.cu | 35 ++++++++--------- src/caffe/proto/caffe.proto | 5 +++ 4 files changed, 54 insertions(+), 51 deletions(-) diff --git a/include/caffe/common_layers.hpp b/include/caffe/common_layers.hpp index 4e47e55d5b6..b1ac3a93eff 100644 --- a/include/caffe/common_layers.hpp +++ b/include/caffe/common_layers.hpp @@ -353,6 +353,9 @@ class SoftmaxLayer : public Layer { virtual void Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); + int outer_num_; + int inner_num_; + int softmax_axis_; /// sum_multiplier is used to carry out sum using BLAS Blob sum_multiplier_; /// scale is an intermediate Blob to hold temporary results. diff --git a/src/caffe/layers/softmax_layer.cpp b/src/caffe/layers/softmax_layer.cpp index 25142fdec53..04712c9e653 100644 --- a/src/caffe/layers/softmax_layer.cpp +++ b/src/caffe/layers/softmax_layer.cpp @@ -10,14 +10,18 @@ namespace caffe { template void SoftmaxLayer::Reshape(const vector*>& bottom, const vector*>& top) { - top[0]->Reshape(bottom[0]->num(), bottom[0]->channels(), - bottom[0]->height(), bottom[0]->width()); - sum_multiplier_.Reshape(1, bottom[0]->channels(), 1, 1); + softmax_axis_ = + bottom[0]->CanonicalAxisIndex(this->layer_param_.softmax_param().axis()); + top[0]->ReshapeLike(*bottom[0]); + vector mult_dims(1, bottom[0]->shape(softmax_axis_)); + sum_multiplier_.Reshape(mult_dims); Dtype* multiplier_data = sum_multiplier_.mutable_cpu_data(); - for (int i = 0; i < sum_multiplier_.count(); ++i) { - multiplier_data[i] = 1.; - } - scale_.Reshape(bottom[0]->num(), 1, bottom[0]->height(), bottom[0]->width()); + caffe_set(sum_multiplier_.count(), Dtype(1), multiplier_data); + outer_num_ = bottom[0]->count(0, softmax_axis_); + inner_num_ = bottom[0]->count(softmax_axis_ + 1); + vector scale_dims = bottom[0]->shape(); + scale_dims[softmax_axis_] = 1; + scale_.Reshape(scale_dims); } template @@ -26,34 +30,32 @@ void SoftmaxLayer::Forward_cpu(const vector*>& bottom, const Dtype* bottom_data = bottom[0]->cpu_data(); Dtype* top_data = top[0]->mutable_cpu_data(); Dtype* scale_data = scale_.mutable_cpu_data(); - int num = bottom[0]->num(); - int channels = bottom[0]->channels(); - int dim = bottom[0]->count() / bottom[0]->num(); - int spatial_dim = bottom[0]->height() * bottom[0]->width(); + int channels = bottom[0]->shape(softmax_axis_); + int dim = bottom[0]->count() / outer_num_; caffe_copy(bottom[0]->count(), bottom_data, top_data); // We need to subtract the max to avoid numerical issues, compute the exp, // and then normalize. 
- for (int i = 0; i < num; ++i) { + for (int i = 0; i < outer_num_; ++i) { // initialize scale_data to the first plane - caffe_copy(spatial_dim, bottom_data + i * dim, scale_data); + caffe_copy(inner_num_, bottom_data + i * dim, scale_data); for (int j = 0; j < channels; j++) { - for (int k = 0; k < spatial_dim; k++) { + for (int k = 0; k < inner_num_; k++) { scale_data[k] = std::max(scale_data[k], - bottom_data[i * dim + j * spatial_dim + k]); + bottom_data[i * dim + j * inner_num_ + k]); } } // subtraction - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, channels, spatial_dim, - 1, -1., sum_multiplier_.cpu_data(), scale_data, 1., top_data + i * dim); + caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, channels, inner_num_, + 1, -1., sum_multiplier_.cpu_data(), scale_data, 1., top_data); // exponentiation - caffe_exp(dim, top_data + i * dim, top_data + i * dim); + caffe_exp(dim, top_data, top_data); // sum after exp - caffe_cpu_gemv(CblasTrans, channels, spatial_dim, 1., - top_data + i * dim, sum_multiplier_.cpu_data(), 0., scale_data); + caffe_cpu_gemv(CblasTrans, channels, inner_num_, 1., + top_data, sum_multiplier_.cpu_data(), 0., scale_data); // division for (int j = 0; j < channels; j++) { - caffe_div(spatial_dim, top_data + top[0]->offset(i, j), scale_data, - top_data + top[0]->offset(i, j)); + caffe_div(inner_num_, top_data, scale_data, top_data); + top_data += inner_num_; } } } @@ -66,20 +68,18 @@ void SoftmaxLayer::Backward_cpu(const vector*>& top, const Dtype* top_data = top[0]->cpu_data(); Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); Dtype* scale_data = scale_.mutable_cpu_data(); - int num = top[0]->num(); - int channels = top[0]->channels(); - int dim = top[0]->count() / top[0]->num(); - int spatial_dim = top[0]->height() * top[0]->width(); + int channels = top[0]->shape(softmax_axis_); + int dim = top[0]->count() / outer_num_; caffe_copy(top[0]->count(), top_diff, bottom_diff); - for (int i = 0; i < num; ++i) { + for (int i = 0; i < outer_num_; ++i) { // compute dot(top_diff, top_data) and subtract them from the bottom diff - for (int k = 0; k < spatial_dim; ++k) { + for (int k = 0; k < inner_num_; ++k) { scale_data[k] = caffe_cpu_strided_dot(channels, - bottom_diff + i * dim + k, spatial_dim, - top_data + i * dim + k, spatial_dim); + bottom_diff + i * dim + k, inner_num_, + top_data + i * dim + k, inner_num_); } // subtraction - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, channels, spatial_dim, 1, + caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, channels, inner_num_, 1, -1., sum_multiplier_.cpu_data(), scale_data, 1., bottom_diff + i * dim); } // elementwise multiplication diff --git a/src/caffe/layers/softmax_layer.cu b/src/caffe/layers/softmax_layer.cu index 6b8871a0b20..1f9c3a41203 100644 --- a/src/caffe/layers/softmax_layer.cu +++ b/src/caffe/layers/softmax_layer.cu @@ -90,36 +90,33 @@ void SoftmaxLayer::Forward_gpu(const vector*>& bottom, Dtype* top_data = top[0]->mutable_gpu_data(); Dtype* scale_data = scale_.mutable_gpu_data(); int count = bottom[0]->count(); - int num = bottom[0]->num(); - int channels = bottom[0]->channels(); - int spatial_dim = bottom[0]->height() * bottom[0]->width(); + int channels = top[0]->shape(softmax_axis_); caffe_copy(count, bottom_data, top_data); // We need to subtract the max to avoid numerical issues, compute the exp, // and then normalize. 
// compute max // NOLINT_NEXT_LINE(whitespace/operators) - kernel_channel_max<<>>(num, channels, spatial_dim, top_data, + kernel_channel_max<<>>(outer_num_, channels, inner_num_, top_data, scale_data); // subtract // NOLINT_NEXT_LINE(whitespace/operators) kernel_channel_subtract<<>>(count, num, channels, spatial_dim, + CAFFE_CUDA_NUM_THREADS>>>(count, outer_num_, channels, inner_num_, scale_data, top_data); // exponentiate // NOLINT_NEXT_LINE(whitespace/operators) - kernel_exp<<>>(num * channels * spatial_dim, top_data, - top_data); + kernel_exp<<>>( + count, top_data, top_data); // sum after exp // NOLINT_NEXT_LINE(whitespace/operators) - kernel_channel_sum<<>>(num, channels, spatial_dim, top_data, + kernel_channel_sum<<>>(outer_num_, channels, inner_num_, top_data, scale_data); // divide // NOLINT_NEXT_LINE(whitespace/operators) kernel_channel_div<<>>(count, num, channels, spatial_dim, + CAFFE_CUDA_NUM_THREADS>>>(count, outer_num_, channels, inner_num_, scale_data, top_data); } @@ -131,18 +128,16 @@ void SoftmaxLayer::Backward_gpu(const vector*>& top, Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); Dtype* scale_data = scale_.mutable_gpu_data(); int count = top[0]->count(); - int num = top[0]->num(); - int channels = top[0]->channels(); - int spatial_dim = top[0]->height() * top[0]->width(); - caffe_copy(top[0]->count(), top_diff, bottom_diff); + int channels = top[0]->shape(softmax_axis_); + caffe_copy(count, top_diff, bottom_diff); // Compute inner1d(top_diff, top_data) and subtract them from the bottom diff. // NOLINT_NEXT_LINE(whitespace/operators) - kernel_channel_dot<<>>(num, channels, spatial_dim, top_diff, top_data, - scale_data); + kernel_channel_dot<<>>(outer_num_, channels, inner_num_, + top_diff, top_data, scale_data); // NOLINT_NEXT_LINE(whitespace/operators) kernel_channel_subtract<<>>(count, num, channels, spatial_dim, + CAFFE_CUDA_NUM_THREADS>>>(count, outer_num_, channels, inner_num_, scale_data, bottom_diff); // elementwise multiplication caffe_gpu_mul(top[0]->count(), bottom_diff, top_data, bottom_diff); diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index 7783a783dd7..8fcb8def173 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -692,6 +692,11 @@ message SoftmaxParameter { CUDNN = 2; } optional Engine engine = 1 [default = DEFAULT]; + + // The axis along which to perform the softmax -- may be negative to index + // from the end (e.g., -1 for the last axis). + // Any other axes will be evaluated as independent softmaxes. 
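  // For example, with a bottom of shape (N, C, H, W), axis: -1 computes W-way
  // softmaxes over the last axis (one per (n, c, h) location) instead of the
  // default C-way softmaxes over the channel axis.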
+ optional int32 axis = 2 [default = 1]; } // Message that stores parameters used by TanHLayer From 60c288b9de8ecd2b17ee2d2eff7a31fecfe3e98b Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Mon, 9 Feb 2015 18:12:54 -0800 Subject: [PATCH 44/86] CuDNNSoftmaxLayer: generalized Blob axes --- src/caffe/layers/cudnn_softmax_layer.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/caffe/layers/cudnn_softmax_layer.cpp b/src/caffe/layers/cudnn_softmax_layer.cpp index 83a5b69a626..211701cad49 100644 --- a/src/caffe/layers/cudnn_softmax_layer.cpp +++ b/src/caffe/layers/cudnn_softmax_layer.cpp @@ -26,10 +26,10 @@ template void CuDNNSoftmaxLayer::Reshape(const vector*>& bottom, const vector*>& top) { SoftmaxLayer::Reshape(bottom, top); - int N = bottom[0]->num(); - int K = bottom[0]->channels(); - int H = bottom[0]->height(); - int W = bottom[0]->width(); + int N = this->outer_num_; + int K = bottom[0]->shape(this->softmax_axis_); + int H = this->inner_num_; + int W = 1; cudnn::setTensor4dDesc(&bottom_desc_, N, K, H, W); cudnn::setTensor4dDesc(&top_desc_, N, K, H, W); } From 94d93da095486c3137aacd101f495acde673ba10 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Fri, 30 Jan 2015 23:22:26 -0800 Subject: [PATCH 45/86] SoftmaxLossLayer generalized like SoftmaxLayer --- include/caffe/loss_layers.hpp | 2 ++ src/caffe/layers/softmax_loss_layer.cpp | 42 ++++++++++++++----------- src/caffe/layers/softmax_loss_layer.cu | 20 +++++------- 3 files changed, 33 insertions(+), 31 deletions(-) diff --git a/include/caffe/loss_layers.hpp b/include/caffe/loss_layers.hpp index 36413ccd176..62d6df71a4a 100644 --- a/include/caffe/loss_layers.hpp +++ b/include/caffe/loss_layers.hpp @@ -754,6 +754,8 @@ class SoftmaxWithLossLayer : public LossLayer { /// Whether to normalize the loss by the total number of values present /// (otherwise just by the batch size). 
bool normalize_; + + int softmax_axis_, outer_num_, inner_num_; }; } // namespace caffe diff --git a/src/caffe/layers/softmax_loss_layer.cpp b/src/caffe/layers/softmax_loss_layer.cpp index 0c9ba2c6626..132c30796a4 100644 --- a/src/caffe/layers/softmax_loss_layer.cpp +++ b/src/caffe/layers/softmax_loss_layer.cpp @@ -35,6 +35,14 @@ void SoftmaxWithLossLayer::Reshape( const vector*>& bottom, const vector*>& top) { LossLayer::Reshape(bottom, top); softmax_layer_->Reshape(softmax_bottom_vec_, softmax_top_vec_); + softmax_axis_ = this->layer_param_.softmax_param().axis(); + outer_num_ = bottom[0]->count(0, softmax_axis_); + inner_num_ = bottom[0]->count(softmax_axis_ + 1); + CHECK_EQ(outer_num_ * inner_num_, bottom[1]->count()) + << "Number of labels must match number of predictions; " + << "e.g., if softmax axis == 1 and prediction shape is (N, C, H, W), " + << "label count (number of labels) must be N*H*W, " + << "with integer values in {0, 1, ..., C-1}."; if (top.size() >= 2) { // softmax output top[1]->ReshapeLike(*bottom[0]); @@ -48,20 +56,18 @@ void SoftmaxWithLossLayer::Forward_cpu( softmax_layer_->Forward(softmax_bottom_vec_, softmax_top_vec_); const Dtype* prob_data = prob_.cpu_data(); const Dtype* label = bottom[1]->cpu_data(); - int num = prob_.num(); - int dim = prob_.count() / num; - int spatial_dim = prob_.height() * prob_.width(); + int dim = prob_.count() / outer_num_; int count = 0; Dtype loss = 0; - for (int i = 0; i < num; ++i) { - for (int j = 0; j < spatial_dim; j++) { - const int label_value = static_cast(label[i * spatial_dim + j]); + for (int i = 0; i < outer_num_; ++i) { + for (int j = 0; j < inner_num_; j++) { + const int label_value = static_cast(label[i * inner_num_ + j]); if (has_ignore_label_ && label_value == ignore_label_) { continue; } DCHECK_GE(label_value, 0); - DCHECK_LT(label_value, prob_.channels()); - loss -= log(std::max(prob_data[i * dim + label_value * spatial_dim + j], + DCHECK_LT(label_value, prob_.shape(softmax_axis_)); + loss -= log(std::max(prob_data[i * dim + label_value * inner_num_ + j], Dtype(FLT_MIN))); ++count; } @@ -69,7 +75,7 @@ void SoftmaxWithLossLayer::Forward_cpu( if (normalize_) { top[0]->mutable_cpu_data()[0] = loss / count; } else { - top[0]->mutable_cpu_data()[0] = loss / num; + top[0]->mutable_cpu_data()[0] = loss / outer_num_; } if (top.size() == 2) { top[1]->ShareData(prob_); @@ -88,19 +94,17 @@ void SoftmaxWithLossLayer::Backward_cpu(const vector*>& top, const Dtype* prob_data = prob_.cpu_data(); caffe_copy(prob_.count(), prob_data, bottom_diff); const Dtype* label = bottom[1]->cpu_data(); - int num = prob_.num(); - int dim = prob_.count() / num; - int spatial_dim = prob_.height() * prob_.width(); + int dim = prob_.count() / outer_num_; int count = 0; - for (int i = 0; i < num; ++i) { - for (int j = 0; j < spatial_dim; ++j) { - const int label_value = static_cast(label[i * spatial_dim + j]); + for (int i = 0; i < outer_num_; ++i) { + for (int j = 0; j < inner_num_; ++j) { + const int label_value = static_cast(label[i * inner_num_ + j]); if (has_ignore_label_ && label_value == ignore_label_) { - for (int c = 0; c < bottom[0]->channels(); ++c) { - bottom_diff[i * dim + c * spatial_dim + j] = 0; + for (int c = 0; c < bottom[0]->shape(softmax_axis_); ++c) { + bottom_diff[i * dim + c * inner_num_ + j] = 0; } } else { - bottom_diff[i * dim + label_value * spatial_dim + j] -= 1; + bottom_diff[i * dim + label_value * inner_num_ + j] -= 1; ++count; } } @@ -110,7 +114,7 @@ void SoftmaxWithLossLayer::Backward_cpu(const vector*>& top, if 
(normalize_) { caffe_scal(prob_.count(), loss_weight / count, bottom_diff); } else { - caffe_scal(prob_.count(), loss_weight / num, bottom_diff); + caffe_scal(prob_.count(), loss_weight / outer_num_, bottom_diff); } } } diff --git a/src/caffe/layers/softmax_loss_layer.cu b/src/caffe/layers/softmax_loss_layer.cu index 215d589ffee..7e0f3da4552 100644 --- a/src/caffe/layers/softmax_loss_layer.cu +++ b/src/caffe/layers/softmax_loss_layer.cu @@ -35,10 +35,8 @@ void SoftmaxWithLossLayer::Forward_gpu( softmax_layer_->Forward(softmax_bottom_vec_, softmax_top_vec_); const Dtype* prob_data = prob_.gpu_data(); const Dtype* label = bottom[1]->gpu_data(); - const int num = prob_.num(); - const int dim = prob_.count() / num; - const int spatial_dim = prob_.height() * prob_.width(); - const int nthreads = num * spatial_dim; + const int dim = prob_.count() / outer_num_; + const int nthreads = outer_num_ * inner_num_; // Since this memory is not used for anything until it is overwritten // on the backward pass, we use it here to avoid having to allocate new GPU // memory to accumulate intermediate results in the kernel. @@ -49,7 +47,7 @@ void SoftmaxWithLossLayer::Forward_gpu( // NOLINT_NEXT_LINE(whitespace/operators) SoftmaxLossForwardGPU<<>>(nthreads, prob_data, label, loss_data, - num, dim, spatial_dim, has_ignore_label_, ignore_label_, counts); + outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts); Dtype loss; caffe_gpu_asum(nthreads, loss_data, &loss); if (normalize_) { @@ -57,7 +55,7 @@ void SoftmaxWithLossLayer::Forward_gpu( caffe_gpu_asum(nthreads, counts, &count); loss /= count; } else { - loss /= num; + loss /= outer_num_; } top[0]->mutable_cpu_data()[0] = loss; if (top.size() == 2) { @@ -102,24 +100,22 @@ void SoftmaxWithLossLayer::Backward_gpu(const vector*>& top, const Dtype* top_data = top[0]->gpu_data(); caffe_gpu_memcpy(prob_.count() * sizeof(Dtype), prob_data, bottom_diff); const Dtype* label = bottom[1]->gpu_data(); - const int num = prob_.num(); - const int dim = prob_.count() / num; - const int spatial_dim = prob_.height() * prob_.width(); - const int nthreads = num * spatial_dim; + const int dim = prob_.count() / outer_num_; + const int nthreads = outer_num_ * inner_num_; // Since this memory is never used for anything else, // we use to to avoid allocating new GPU memory. Dtype* counts = prob_.mutable_gpu_diff(); // NOLINT_NEXT_LINE(whitespace/operators) SoftmaxLossBackwardGPU<<>>(nthreads, top_data, label, bottom_diff, - num, dim, spatial_dim, has_ignore_label_, ignore_label_, counts); + outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts); const Dtype loss_weight = top[0]->cpu_diff()[0]; if (normalize_) { Dtype count; caffe_gpu_asum(nthreads, counts, &count); caffe_gpu_scal(prob_.count(), loss_weight / count, bottom_diff); } else { - caffe_gpu_scal(prob_.count(), loss_weight / num, bottom_diff); + caffe_gpu_scal(prob_.count(), loss_weight / outer_num_, bottom_diff); } } } From e2bc9f997d5ebfbefdc4a7e2fa0da9b029c1573a Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Wed, 26 Nov 2014 05:02:15 -0800 Subject: [PATCH 46/86] SplitLayer: change Reshape(n,h,c,w) to ReshapeLike(...) 
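ReshapeLike copies the source blob's full N-dimensional shape, so the layer
keeps working now that blobs are not restricted to 4 axes. A minimal sketch of
the difference (illustrative only, using the names from the diff below):

    // legacy call: assumes exactly (num, channels, height, width)
    top[i]->Reshape(bottom[0]->num(), bottom[0]->channels(),
                    bottom[0]->height(), bottom[0]->width());
    // generalized call: copies whatever shape bottom[0] has (2-D, 3-D, 4-D, ...)
    top[i]->ReshapeLike(*bottom[0]);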
--- src/caffe/layers/split_layer.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/caffe/layers/split_layer.cpp b/src/caffe/layers/split_layer.cpp index d6929b99683..272cb59cd37 100644 --- a/src/caffe/layers/split_layer.cpp +++ b/src/caffe/layers/split_layer.cpp @@ -18,8 +18,7 @@ void SplitLayer::Reshape(const vector*>& bottom, // some strange effects in practice...) CHECK_NE(top[i], bottom[0]) << this->type() << " Layer does not " "allow in-place computation."; - top[i]->Reshape(bottom[0]->num(), bottom[0]->channels(), - bottom[0]->height(), bottom[0]->width()); + top[i]->ReshapeLike(*bottom[0]); CHECK_EQ(count_, top[i]->count()); } } From e6468e900f03b2c4f2d9ccf160b46bff0f52dfb9 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Wed, 26 Nov 2014 05:42:11 -0800 Subject: [PATCH 47/86] HDF5DataLayer shapes output according to HDF5 shape --- src/caffe/layers/hdf5_data_layer.cpp | 11 ++++++++--- src/caffe/test/test_hdf5data_layer.cpp | 16 +++++++--------- src/caffe/util/io.cpp | 10 +++++----- 3 files changed, 20 insertions(+), 17 deletions(-) diff --git a/src/caffe/layers/hdf5_data_layer.cpp b/src/caffe/layers/hdf5_data_layer.cpp index 3d856ec3001..1ceb6c24431 100644 --- a/src/caffe/layers/hdf5_data_layer.cpp +++ b/src/caffe/layers/hdf5_data_layer.cpp @@ -36,7 +36,7 @@ void HDF5DataLayer::LoadHDF5FileData(const char* filename) { hdf_blobs_.resize(top_size); const int MIN_DATA_DIM = 1; - const int MAX_DATA_DIM = 4; + const int MAX_DATA_DIM = INT_MAX; for (int i = 0; i < top_size; ++i) { hdf_blobs_[i] = shared_ptr >(new Blob()); @@ -88,9 +88,14 @@ void HDF5DataLayer::LayerSetUp(const vector*>& bottom, // Reshape blobs. const int batch_size = this->layer_param_.hdf5_data_param().batch_size(); const int top_size = this->layer_param_.top_size(); + vector top_shape; for (int i = 0; i < top_size; ++i) { - top[i]->Reshape(batch_size, hdf_blobs_[i]->channels(), - hdf_blobs_[i]->height(), hdf_blobs_[i]->width()); + top_shape.resize(hdf_blobs_[i]->num_axes()); + top_shape[0] = batch_size; + for (int j = 1; j < top_shape.size(); ++j) { + top_shape[j] = hdf_blobs_[i]->shape(j); + } + top[i]->Reshape(top_shape); } } diff --git a/src/caffe/test/test_hdf5data_layer.cpp b/src/caffe/test/test_hdf5data_layer.cpp index 8d3b3d1e987..c9b027f88cf 100644 --- a/src/caffe/test/test_hdf5data_layer.cpp +++ b/src/caffe/test/test_hdf5data_layer.cpp @@ -77,15 +77,13 @@ TYPED_TEST(HDF5DataLayerTest, TestRead) { EXPECT_EQ(this->blob_top_data_->height(), height); EXPECT_EQ(this->blob_top_data_->width(), width); - EXPECT_EQ(this->blob_top_label_->num(), batch_size); - EXPECT_EQ(this->blob_top_label_->channels(), 1); - EXPECT_EQ(this->blob_top_label_->height(), 1); - EXPECT_EQ(this->blob_top_label_->width(), 1); - - EXPECT_EQ(this->blob_top_label2_->num(), batch_size); - EXPECT_EQ(this->blob_top_label2_->channels(), 1); - EXPECT_EQ(this->blob_top_label2_->height(), 1); - EXPECT_EQ(this->blob_top_label2_->width(), 1); + EXPECT_EQ(this->blob_top_label_->num_axes(), 2); + EXPECT_EQ(this->blob_top_label_->shape(0), batch_size); + EXPECT_EQ(this->blob_top_label_->shape(1), 1); + + EXPECT_EQ(this->blob_top_label2_->num_axes(), 2); + EXPECT_EQ(this->blob_top_label2_->shape(0), batch_size); + EXPECT_EQ(this->blob_top_label2_->shape(1), 1); layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); diff --git a/src/caffe/util/io.cpp b/src/caffe/util/io.cpp index b243a9804ec..77ef7f257f4 100644 --- a/src/caffe/util/io.cpp +++ b/src/caffe/util/io.cpp @@ -252,11 +252,11 @@ void hdf5_load_nd_dataset_helper( 
CHECK_GE(status, 0) << "Failed to get dataset info for " << dataset_name_; CHECK_EQ(class_, H5T_FLOAT) << "Expected float or double data"; - blob->Reshape( - dims[0], - (dims.size() > 1) ? dims[1] : 1, - (dims.size() > 2) ? dims[2] : 1, - (dims.size() > 3) ? dims[3] : 1); + vector blob_dims(dims.size()); + for (int i = 0; i < dims.size(); ++i) { + blob_dims[i] = dims[i]; + } + blob->Reshape(blob_dims); } template <> From e56377d96e52fbfd2226bc35db6cd3e6e404afd0 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Wed, 26 Nov 2014 05:34:47 -0800 Subject: [PATCH 48/86] DataLayer outputs 1D labels --- src/caffe/layers/data_layer.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/caffe/layers/data_layer.cpp b/src/caffe/layers/data_layer.cpp index 1861090f953..0f2d66776a9 100644 --- a/src/caffe/layers/data_layer.cpp +++ b/src/caffe/layers/data_layer.cpp @@ -69,9 +69,9 @@ void DataLayer::DataLayerSetUp(const vector*>& bottom, << top[0]->width(); // label if (this->output_labels_) { - top[1]->Reshape(this->layer_param_.data_param().batch_size(), 1, 1, 1); - this->prefetch_label_.Reshape(this->layer_param_.data_param().batch_size(), - 1, 1, 1); + vector label_shape(1, this->layer_param_.data_param().batch_size()); + top[1]->Reshape(label_shape); + this->prefetch_label_.Reshape(label_shape); } } From 7c8725bc6f2a017dc8c6979f75f9b3f0163c8a0a Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Wed, 26 Nov 2014 05:42:50 -0800 Subject: [PATCH 49/86] MemoryDataLayer outputs 1D labels --- src/caffe/layers/memory_data_layer.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/caffe/layers/memory_data_layer.cpp b/src/caffe/layers/memory_data_layer.cpp index effdad90aff..42de4198bc4 100644 --- a/src/caffe/layers/memory_data_layer.cpp +++ b/src/caffe/layers/memory_data_layer.cpp @@ -19,10 +19,11 @@ void MemoryDataLayer::DataLayerSetUp(const vector*>& bottom, CHECK_GT(batch_size_ * size_, 0) << "batch_size, channels, height, and width must be specified and" " positive in memory_data_param"; + vector label_shape(1, batch_size_); top[0]->Reshape(batch_size_, channels_, height_, width_); - top[1]->Reshape(batch_size_, 1, 1, 1); + top[1]->Reshape(label_shape); added_data_.Reshape(batch_size_, channels_, height_, width_); - added_label_.Reshape(batch_size_, 1, 1, 1); + added_label_.Reshape(label_shape); data_ = NULL; labels_ = NULL; added_data_.cpu_data(); From c87a136add3a43eb6ccf9f5b69a5a8a73ae4e753 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Wed, 26 Nov 2014 12:56:14 -0800 Subject: [PATCH 50/86] ImageDataLayer outputs 1D labels --- src/caffe/layers/image_data_layer.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/caffe/layers/image_data_layer.cpp b/src/caffe/layers/image_data_layer.cpp index f9046e1b3a1..38ebbd5ec14 100644 --- a/src/caffe/layers/image_data_layer.cpp +++ b/src/caffe/layers/image_data_layer.cpp @@ -81,8 +81,9 @@ void ImageDataLayer::DataLayerSetUp(const vector*>& bottom, << top[0]->channels() << "," << top[0]->height() << "," << top[0]->width(); // label - top[1]->Reshape(batch_size, 1, 1, 1); - this->prefetch_label_.Reshape(batch_size, 1, 1, 1); + vector label_shape(1, batch_size); + top[1]->Reshape(label_shape); + this->prefetch_label_.Reshape(label_shape); } template From 9505001d82698cb8028c479a238eb49ef9201068 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Wed, 26 Nov 2014 12:57:15 -0800 Subject: [PATCH 51/86] WindowDataLayer outputs 1D labels --- src/caffe/layers/window_data_layer.cpp | 5 +++-- 1 file 
changed, 3 insertions(+), 2 deletions(-) diff --git a/src/caffe/layers/window_data_layer.cpp b/src/caffe/layers/window_data_layer.cpp index 36e41560327..c127d56bc46 100644 --- a/src/caffe/layers/window_data_layer.cpp +++ b/src/caffe/layers/window_data_layer.cpp @@ -177,8 +177,9 @@ void WindowDataLayer::DataLayerSetUp(const vector*>& bottom, << top[0]->channels() << "," << top[0]->height() << "," << top[0]->width(); // label - top[1]->Reshape(batch_size, 1, 1, 1); - this->prefetch_label_.Reshape(batch_size, 1, 1, 1); + vector label_shape(1, batch_size); + top[1]->Reshape(label_shape); + this->prefetch_label_.Reshape(label_shape); // data mean has_mean_file_ = this->transform_param_.has_mean_file(); From fcbb933ab5e31418fc4f8705d72d01f9c80363ce Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Sat, 29 Nov 2014 18:00:44 -0800 Subject: [PATCH 52/86] EuclideanLossLayer: generalized Blob axes --- src/caffe/layers/euclidean_loss_layer.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/caffe/layers/euclidean_loss_layer.cpp b/src/caffe/layers/euclidean_loss_layer.cpp index b539d3487f5..80efa31b22c 100644 --- a/src/caffe/layers/euclidean_loss_layer.cpp +++ b/src/caffe/layers/euclidean_loss_layer.cpp @@ -11,11 +11,9 @@ template void EuclideanLossLayer::Reshape( const vector*>& bottom, const vector*>& top) { LossLayer::Reshape(bottom, top); - CHECK_EQ(bottom[0]->channels(), bottom[1]->channels()); - CHECK_EQ(bottom[0]->height(), bottom[1]->height()); - CHECK_EQ(bottom[0]->width(), bottom[1]->width()); - diff_.Reshape(bottom[0]->num(), bottom[0]->channels(), - bottom[0]->height(), bottom[0]->width()); + CHECK_EQ(bottom[0]->count(1), bottom[1]->count(1)) + << "Inputs must have the same dimension."; + diff_.ReshapeLike(*bottom[0]); } template From 7462c84ca0b44b0c1e270e56e381c618f6134857 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Wed, 31 Dec 2014 16:06:46 -0800 Subject: [PATCH 53/86] DummyDataLayer outputs blobs of arbitrary shape --- src/caffe/layers/dummy_data_layer.cpp | 57 +++++++++++++++++---------- src/caffe/proto/caffe.proto | 6 ++- 2 files changed, 41 insertions(+), 22 deletions(-) diff --git a/src/caffe/layers/dummy_data_layer.cpp b/src/caffe/layers/dummy_data_layer.cpp index d254eb1f961..6b0d617464c 100644 --- a/src/caffe/layers/dummy_data_layer.cpp +++ b/src/caffe/layers/dummy_data_layer.cpp @@ -16,18 +16,30 @@ void DummyDataLayer::LayerSetUp(const vector*>& bottom, num_data_filler == num_top) << "Number of data fillers must be 0, 1 or equal to the number of tops: " << num_top << "; you specified " << num_data_filler << " data fillers."; - CHECK(param.num_size() == 1 || param.num_size() == num_top) - << "Must specify either a single (1) 'num' or one for each top blob " - << "(" << num_top << "); you specified " << param.num_size() << "."; - CHECK(param.channels_size() == 1 || param.channels_size() == num_top) - << "Must specify either a single (1) 'channels' or one for each top blob " - << "(" << num_top << "); you specified " << param.channels_size() << "."; - CHECK(param.height_size() == 1 || param.height_size() == num_top) - << "Must specify either a single (1) 'height' or one for each top blob " - << "(" << num_top << "); you specified " << param.height_size() << "."; - CHECK(param.width_size() == 1 || param.width_size() == num_top) - << "Must specify either a single (1) 'width' or one for each top blob " - << "(" << num_top << "); you specified " << param.width_size() << "."; + + const bool legacy_dims = param.num_size() || param.channels_size() || + 
param.height_size() || param.width_size(); + if (legacy_dims) { + CHECK_EQ(0, param.shape_size()) + << "Both shape and legacy fields were specified"; + // Using deprecated 4D output dim specifiers. + CHECK(param.num_size() == 1 || param.num_size() == num_top) + << "Must specify 'num' once, or once per top blob " + << "(" << num_top << "); specified " << param.num_size() << "."; + CHECK(param.channels_size() == 1 || param.channels_size() == num_top) + << "Must specify 'channels' once, or once per top blob " + << "(" << num_top << "); specified " << param.channels_size() << "."; + CHECK(param.height_size() == 1 || param.height_size() == num_top) + << "Must specify 'height' once, or once per top blob " + << "(" << num_top << "); specified " << param.height_size() << "."; + CHECK(param.width_size() == 1 || param.width_size() == num_top) + << "Must specify 'width' once, or once per top blob " + << "(" << num_top << "); specified " << param.width_size() << "."; + } else { + CHECK(param.shape_size() == 1 || param.shape_size() == num_top) + << "Must specify 'shape' once, or once per top blob " + << "(" << num_top << "); specified " << param.shape_size() << "."; + } // refill_[i] tells Forward i whether or not to actually refill top Blob i. // If refill_[i] is false, Forward does nothing for Blob i. We use this to // avoid wastefully refilling "constant" Blobs in every forward pass. @@ -63,14 +75,19 @@ void DummyDataLayer::LayerSetUp(const vector*>& bottom, } } for (int i = 0; i < num_top; ++i) { - const int num = (param.num_size() == 1) ? param.num(0) : param.num(i); - const int channels = - (param.channels_size() == 1) ? param.channels(0) : param.channels(i); - const int height = - (param.height_size() == 1) ? param.height(0) : param.height(i); - const int width = - (param.width_size() == 1) ? param.width(0) : param.width(i); - top[i]->Reshape(num, channels, height, width); + if (legacy_dims) { + const int num = (param.num_size() == 1) ? param.num(0) : param.num(i); + const int channels = + (param.channels_size() == 1) ? param.channels(0) : param.channels(i); + const int height = + (param.height_size() == 1) ? param.height(0) : param.height(i); + const int width = + (param.width_size() == 1) ? param.width(0) : param.width(i); + top[i]->Reshape(num, channels, height, width); + } else { + const int shape_index = (param.shape_size() == 1) ? 0 : i; + top[i]->Reshape(param.shape(shape_index)); + } } // Run Forward once, with refill_ inverted, to fill the constant Blobs. this->Forward(bottom, top); diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index 8fcb8def173..3b4794664b5 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -461,13 +461,15 @@ message DropoutParameter { // (or constant) data generated by "Fillers" (see "message FillerParameter"). message DummyDataParameter { // This layer produces N >= 1 top blobs. DummyDataParameter must specify 1 or N - // num, N channels, N height, and N width fields, and must specify 0, 1 or N - // data_fillers. + // shape fields, and 0, 1 or N data_fillers. // // If 0 data_fillers are specified, ConstantFiller with a value of 0 is used. // If 1 data_filler is specified, it is applied to all top blobs. If N are // specified, the ith is applied to the ith top blob. repeated FillerParameter data_filler = 1; + repeated BlobShape shape = 6; + + // 4D dimensions -- deprecated. Use "shape" instead. 
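  // For example, a single top blob of shape 10 x 1 x 28 x 28 would be requested
  // in a net prototxt with:
  //   dummy_data_param { shape { dim: 10 dim: 1 dim: 28 dim: 28 } }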
repeated uint32 num = 2; repeated uint32 channels = 3; repeated uint32 height = 4; From 69fc1f642e8f87df44131aeaa0b55fb221897d40 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Thu, 15 Jan 2015 19:50:42 -0800 Subject: [PATCH 54/86] Add CHECK_EQ(4, ...)s to "vision layers" to enforce that the num/channnels/height/width indexing is valid. --- src/caffe/layers/base_conv_layer.cpp | 4 ++++ src/caffe/layers/im2col_layer.cpp | 2 ++ src/caffe/layers/lrn_layer.cpp | 2 ++ src/caffe/layers/pooling_layer.cpp | 2 ++ 4 files changed, 10 insertions(+) diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp index d4ab5e429b5..ccb3adc7e89 100644 --- a/src/caffe/layers/base_conv_layer.cpp +++ b/src/caffe/layers/base_conv_layer.cpp @@ -11,6 +11,8 @@ namespace caffe { template void BaseConvolutionLayer::LayerSetUp(const vector*>& bottom, const vector*>& top) { + CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " + << "corresponding to (num, channels, height, width)"; // Configure the kernel size, padding, stride, and inputs. ConvolutionParameter conv_param = this->layer_param_.convolution_param(); CHECK(!conv_param.has_kernel_size() != @@ -101,6 +103,8 @@ void BaseConvolutionLayer::LayerSetUp(const vector*>& bottom, template void BaseConvolutionLayer::Reshape(const vector*>& bottom, const vector*>& top) { + CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " + << "corresponding to (num, channels, height, width)"; num_ = bottom[0]->num(); height_ = bottom[0]->height(); width_ = bottom[0]->width(); diff --git a/src/caffe/layers/im2col_layer.cpp b/src/caffe/layers/im2col_layer.cpp index 112226116c8..1c802714e33 100644 --- a/src/caffe/layers/im2col_layer.cpp +++ b/src/caffe/layers/im2col_layer.cpp @@ -50,6 +50,8 @@ void Im2colLayer::LayerSetUp(const vector*>& bottom, template void Im2colLayer::Reshape(const vector*>& bottom, const vector*>& top) { + CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " + << "corresponding to (num, channels, height, width)"; channels_ = bottom[0]->channels(); height_ = bottom[0]->height(); width_ = bottom[0]->width(); diff --git a/src/caffe/layers/lrn_layer.cpp b/src/caffe/layers/lrn_layer.cpp index 5e3e7c429ef..36c1ace4c99 100644 --- a/src/caffe/layers/lrn_layer.cpp +++ b/src/caffe/layers/lrn_layer.cpp @@ -69,6 +69,8 @@ void LRNLayer::LayerSetUp(const vector*>& bottom, template void LRNLayer::Reshape(const vector*>& bottom, const vector*>& top) { + CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " + << "corresponding to (num, channels, height, width)"; num_ = bottom[0]->num(); channels_ = bottom[0]->channels(); height_ = bottom[0]->height(); diff --git a/src/caffe/layers/pooling_layer.cpp b/src/caffe/layers/pooling_layer.cpp index 6f4c69c861e..c8d41499455 100644 --- a/src/caffe/layers/pooling_layer.cpp +++ b/src/caffe/layers/pooling_layer.cpp @@ -81,6 +81,8 @@ void PoolingLayer::LayerSetUp(const vector*>& bottom, template void PoolingLayer::Reshape(const vector*>& bottom, const vector*>& top) { + CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " + << "corresponding to (num, channels, height, width)"; channels_ = bottom[0]->channels(); height_ = bottom[0]->height(); width_ = bottom[0]->width(); From 269dafa37a96250012a10537ad773840b4621ff7 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Thu, 1 Jan 2015 17:32:38 -0800 Subject: [PATCH 55/86] PyBlobs support generalized axes --- python/caffe/_caffe.cpp | 35 ++++++++++++++++++++++++++++++----- python/caffe/pycaffe.py | 2 -- 2 
files changed, 30 insertions(+), 7 deletions(-) diff --git a/python/caffe/_caffe.cpp b/python/caffe/_caffe.cpp index 03967a21029..d4eda798192 100644 --- a/python/caffe/_caffe.cpp +++ b/python/caffe/_caffe.cpp @@ -163,9 +163,10 @@ struct NdarrayCallPolicies : public bp::default_call_policies { // the shape information from the blob. void* data = PyArray_DATA(reinterpret_cast(result)); Py_DECREF(result); - npy_intp dims[] = {blob->num(), blob->channels(), - blob->height(), blob->width()}; - PyObject* arr_obj = PyArray_SimpleNewFromData(4, dims, NPY_FLOAT32, data); + const int num_axes = blob->num_axes(); + vector dims(blob->shape().begin(), blob->shape().end()); + PyObject *arr_obj = PyArray_SimpleNewFromData(num_axes, dims.data(), + NPY_FLOAT32, data); // SetBaseObject steals a ref, so we need to INCREF. Py_INCREF(pyblob.ptr()); PyArray_SetBaseObject(reinterpret_cast(arr_obj), @@ -174,6 +175,27 @@ struct NdarrayCallPolicies : public bp::default_call_policies { } }; +void Blob_Reshape(Blob* blob, bp::object shape_obj) { + PyArrayObject* shape_arr = + reinterpret_cast(shape_obj.ptr()); + if (!(PyArray_FLAGS(shape_arr) & NPY_ARRAY_C_CONTIGUOUS)) { + throw std::runtime_error("new shape must be C contiguous"); + } + if (PyArray_NDIM(shape_arr) != 1) { + throw std::runtime_error("new shape must be 1-d"); + } + if (PyArray_TYPE(shape_arr) != NPY_INT32) { + throw std::runtime_error("new shape must be specified as int32 array"); + } + npy_int32* shape_data = static_cast(PyArray_DATA(shape_arr)); + const int num_axes = PyArray_SIZE(shape_arr); + vector shape(num_axes); + for (int i = 0; i < num_axes; ++i) { + shape[i] = shape_data[i]; + } + blob->Reshape(shape); +} + BOOST_PYTHON_MEMBER_FUNCTION_OVERLOADS(SolveOverloads, Solve, 0, 1); BOOST_PYTHON_MODULE(_caffe) { @@ -218,8 +240,11 @@ BOOST_PYTHON_MODULE(_caffe) { .add_property("channels", &Blob::channels) .add_property("height", &Blob::height) .add_property("width", &Blob::width) - .add_property("count", &Blob::count) - .def("reshape", &Blob::Reshape) + .add_property("count", static_cast::*)() const>( + &Blob::count)) + .def("reshape", static_cast::*)(int, int, int, int)>( + &Blob::Reshape)) + .def("reshape", &Blob_Reshape) .add_property("data", bp::make_function(&Blob::mutable_cpu_data, NdarrayCallPolicies())) .add_property("diff", bp::make_function(&Blob::mutable_cpu_diff, diff --git a/python/caffe/pycaffe.py b/python/caffe/pycaffe.py index d662d6cc282..ac387d51d96 100644 --- a/python/caffe/pycaffe.py +++ b/python/caffe/pycaffe.py @@ -85,8 +85,6 @@ def _Net_forward(self, blobs=None, start=None, end=None, **kwargs): # Set input according to defined shapes and make arrays single and # C-contiguous as Caffe expects. for in_, blob in kwargs.iteritems(): - if blob.ndim != 4: - raise Exception('{} blob is not 4-d'.format(in_)) if blob.shape[0] != self.blobs[in_].num: raise Exception('Input is not batch sized') self.blobs[in_].data[...] = blob From 71df6f90c8cc72c197db6bfe4b24480ab42b93bc Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Fri, 30 Jan 2015 23:16:44 -0800 Subject: [PATCH 56/86] Add option not to reshape to Blob::FromProto; use when loading Blobs from saved NetParameter Want to keep the param Blob shape the layer has set, and not necessarily adopt the one from the saved net (e.g. want to keep new 1D bias shape, rather than take the (1 x 1 x 1 x D) shape from a legacy net). 
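A minimal sketch of the intended behavior (illustrative only; assumes the
vector<int> Blob constructor from the earlier shape generalization and that
ShapeEquals accepts a legacy 4-D proto whose dimensions match):

    const int D = 10;
    Blob<float> bias(vector<int>(1, D));  // layer-defined 1-D bias of shape (D)
    BlobProto legacy;                     // saved by an old net as 1 x 1 x 1 x D
    legacy.set_num(1);
    legacy.set_channels(1);
    legacy.set_height(1);
    legacy.set_width(D);
    for (int i = 0; i < D; ++i) { legacy.add_data(0.5f); }
    bias.FromProto(legacy, false);  // copies the data but keeps the (D) shape
    bias.FromProto(legacy);         // reshape defaults to true: adopts (1, 1, 1, D)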
--- include/caffe/blob.hpp | 2 +- src/caffe/blob.cpp | 36 ++++++++++++++++++++---------------- src/caffe/net.cpp | 4 ++-- 3 files changed, 23 insertions(+), 19 deletions(-) diff --git a/include/caffe/blob.hpp b/include/caffe/blob.hpp index 82ca95a5f0f..36579a5a545 100644 --- a/include/caffe/blob.hpp +++ b/include/caffe/blob.hpp @@ -227,7 +227,7 @@ class Blob { Dtype* mutable_cpu_diff(); Dtype* mutable_gpu_diff(); void Update(); - void FromProto(const BlobProto& proto); + void FromProto(const BlobProto& proto, bool reshape = true); void ToProto(BlobProto* proto, bool write_diff = false) const; /// @brief Compute the sum of absolute values (L1 norm) of the data. diff --git a/src/caffe/blob.cpp b/src/caffe/blob.cpp index c65dc2dd1ac..6d2b3f502d9 100644 --- a/src/caffe/blob.cpp +++ b/src/caffe/blob.cpp @@ -431,24 +431,28 @@ void Blob::CopyFrom(const Blob& source, bool copy_diff, bool reshape) { } template -void Blob::FromProto(const BlobProto& proto) { - vector shape; - if (proto.has_num() || proto.has_channels() || - proto.has_height() || proto.has_width()) { - // Using deprecated 4D Blob dimensions -- - // shape is (num, channels, height, width). - shape.resize(4); - shape[0] = proto.num(); - shape[1] = proto.channels(); - shape[2] = proto.height(); - shape[3] = proto.width(); - } else { - shape.resize(proto.shape().dim_size()); - for (int i = 0; i < proto.shape().dim_size(); ++i) { - shape[i] = proto.shape().dim(i); +void Blob::FromProto(const BlobProto& proto, bool reshape) { + if (reshape) { + vector shape; + if (proto.has_num() || proto.has_channels() || + proto.has_height() || proto.has_width()) { + // Using deprecated 4D Blob dimensions -- + // shape is (num, channels, height, width). + shape.resize(4); + shape[0] = proto.num(); + shape[1] = proto.channels(); + shape[2] = proto.height(); + shape[3] = proto.width(); + } else { + shape.resize(proto.shape().dim_size()); + for (int i = 0; i < proto.shape().dim_size(); ++i) { + shape[i] = proto.shape().dim(i); + } } + Reshape(shape); + } else { + CHECK(ShapeEquals(proto)) << "shape mismatch (reshape not set)"; } - Reshape(shape); // copy data Dtype* data_vec = mutable_cpu_data(); for (int i = 0; i < count_; ++i) { diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp index 60f387b92e9..e8f7c05e09d 100644 --- a/src/caffe/net.cpp +++ b/src/caffe/net.cpp @@ -705,8 +705,8 @@ void Net::CopyTrainedLayersFrom(const NetParameter& param) { CHECK_EQ(target_blobs.size(), source_layer.blobs_size()) << "Incompatible number of blobs for layer " << source_layer_name; for (int j = 0; j < target_blobs.size(); ++j) { - CHECK(target_blobs[j]->ShapeEquals(source_layer.blobs(j))); - target_blobs[j]->FromProto(source_layer.blobs(j)); + const bool kReshape = false; + target_blobs[j]->FromProto(source_layer.blobs(j), kReshape); } } } From aa242aa5a121cf3afc0bacf4449da5a4ad76b236 Mon Sep 17 00:00:00 2001 From: Jonathan L Long Date: Mon, 2 Mar 2015 15:27:45 -0800 Subject: [PATCH 57/86] [pycaffe] expose Blob.reshape as *args function --- python/caffe/_caffe.cpp | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/python/caffe/_caffe.cpp b/python/caffe/_caffe.cpp index d4eda798192..bfea0de661b 100644 --- a/python/caffe/_caffe.cpp +++ b/python/caffe/_caffe.cpp @@ -5,6 +5,7 @@ #include #include +#include #include #include @@ -175,25 +176,18 @@ struct NdarrayCallPolicies : public bp::default_call_policies { } }; -void Blob_Reshape(Blob* blob, bp::object shape_obj) { - PyArrayObject* shape_arr = - 
reinterpret_cast(shape_obj.ptr()); - if (!(PyArray_FLAGS(shape_arr) & NPY_ARRAY_C_CONTIGUOUS)) { - throw std::runtime_error("new shape must be C contiguous"); +bp::object Blob_Reshape(bp::tuple args, bp::dict kwargs) { + if (bp::len(kwargs) > 0) { + throw std::runtime_error("Blob.reshape takes no kwargs"); } - if (PyArray_NDIM(shape_arr) != 1) { - throw std::runtime_error("new shape must be 1-d"); + Blob* self = bp::extract*>(args[0]); + vector shape(bp::len(args) - 1); + for (int i = 1; i < bp::len(args); ++i) { + shape[i - 1] = bp::extract(args[i]); } - if (PyArray_TYPE(shape_arr) != NPY_INT32) { - throw std::runtime_error("new shape must be specified as int32 array"); - } - npy_int32* shape_data = static_cast(PyArray_DATA(shape_arr)); - const int num_axes = PyArray_SIZE(shape_arr); - vector shape(num_axes); - for (int i = 0; i < num_axes; ++i) { - shape[i] = shape_data[i]; - } - blob->Reshape(shape); + self->Reshape(shape); + // We need to explicitly return None to use bp::raw_function. + return bp::object(); } BOOST_PYTHON_MEMBER_FUNCTION_OVERLOADS(SolveOverloads, Solve, 0, 1); @@ -242,9 +236,7 @@ BOOST_PYTHON_MODULE(_caffe) { .add_property("width", &Blob::width) .add_property("count", static_cast::*)() const>( &Blob::count)) - .def("reshape", static_cast::*)(int, int, int, int)>( - &Blob::Reshape)) - .def("reshape", &Blob_Reshape) + .def("reshape", bp::raw_function(&Blob_Reshape)) .add_property("data", bp::make_function(&Blob::mutable_cpu_data, NdarrayCallPolicies())) .add_property("diff", bp::make_function(&Blob::mutable_cpu_diff, From 8c79d65e333388e2bde9346899b7cdf812aab2c6 Mon Sep 17 00:00:00 2001 From: Jonathan L Long Date: Mon, 2 Mar 2015 15:54:11 -0800 Subject: [PATCH 58/86] [pytest] use non-4d blobs in test_python_layer --- python/caffe/test/test_python_layer.py | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/python/caffe/test/test_python_layer.py b/python/caffe/test/test_python_layer.py index 383c283959d..dd99f6f15b9 100644 --- a/python/caffe/test/test_python_layer.py +++ b/python/caffe/test/test_python_layer.py @@ -11,8 +11,7 @@ def setup(self, bottom, top): pass def reshape(self, bottom, top): - top[0].reshape(bottom[0].num, bottom[0].channels, bottom[0].height, - bottom[0].width) + top[0].reshape(*bottom[0].data.shape) def forward(self, bottom, top): top[0].data[...] = 10 * bottom[0].data @@ -21,17 +20,16 @@ def backward(self, top, propagate_down, bottom): bottom[0].diff[...] 
= 10 * top[0].diff def python_net_file(): - f = tempfile.NamedTemporaryFile(delete=False) - f.write("""name: 'pythonnet' force_backward: true - input: 'data' input_dim: 10 input_dim: 9 input_dim: 8 input_dim: 7 - layer { type: 'Python' name: 'one' bottom: 'data' top: 'one' - python_param { module: 'test_python_layer' layer: 'SimpleLayer' } } - layer { type: 'Python' name: 'two' bottom: 'one' top: 'two' - python_param { module: 'test_python_layer' layer: 'SimpleLayer' } } - layer { type: 'Python' name: 'three' bottom: 'two' top: 'three' - python_param { module: 'test_python_layer' layer: 'SimpleLayer' } }""") - f.close() - return f.name + with tempfile.NamedTemporaryFile(delete=False) as f: + f.write("""name: 'pythonnet' force_backward: true + input: 'data' input_shape { dim: 10 dim: 9 dim: 8 } + layer { type: 'Python' name: 'one' bottom: 'data' top: 'one' + python_param { module: 'test_python_layer' layer: 'SimpleLayer' } } + layer { type: 'Python' name: 'two' bottom: 'one' top: 'two' + python_param { module: 'test_python_layer' layer: 'SimpleLayer' } } + layer { type: 'Python' name: 'three' bottom: 'two' top: 'three' + python_param { module: 'test_python_layer' layer: 'SimpleLayer' } }""") + return f.name class TestPythonLayer(unittest.TestCase): def setUp(self): From 642619bcd45ab71be2483379ff4e7192628c11ab Mon Sep 17 00:00:00 2001 From: Luke Yeager Date: Mon, 23 Feb 2015 09:18:31 -0800 Subject: [PATCH 59/86] Add error checking for image mean When setting the mean, assert that it is either one pixel or an array with shape equal to the input data size. --- python/caffe/io.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/python/caffe/io.py b/python/caffe/io.py index f51e3a64d36..f7f75b73cd8 100644 --- a/python/caffe/io.py +++ b/python/caffe/io.py @@ -238,11 +238,16 @@ def set_mean(self, in_, mean): """ self.__check_input(in_) if mean.ndim == 1: + # broadcast pixel mean = mean[:, np.newaxis, np.newaxis] - mk, mh, mw = mean.shape - in_k, in_h, in_w = self.inputs[in_][1:] - #if mk != in_k or (mh, mw) != (in_h, in_w) and (mh, mw) != (1, 1): - # raise Exception('Mean shape incompatible with input shape.') + else: + ms = mean.shape + if len(ms) == 2: + ms = (1,) + ms + if len(ms) != 3: + raise ValueError('Mean shape invalid') + if ms != self.inputs[in_][1:]: + raise ValueError('Mean shape incompatible with input shape.') self.mean[in_] = mean From dec148e89d30020b0d81ea59c6d88539db83b019 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Wed, 4 Mar 2015 11:17:51 -0800 Subject: [PATCH 60/86] fix comment I forgot about from @shelhamer's review of #1970 --- src/caffe/layers/inner_product_layer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/caffe/layers/inner_product_layer.cpp b/src/caffe/layers/inner_product_layer.cpp index 6b88724491b..89e0c8fbad7 100644 --- a/src/caffe/layers/inner_product_layer.cpp +++ b/src/caffe/layers/inner_product_layer.cpp @@ -19,7 +19,7 @@ void InnerProductLayer::LayerSetUp(const vector*>& bottom, this->layer_param_.inner_product_param().axis()); // Dimensions starting from "axis" are "flattened" into a single // length K_ vector. For example, if bottom[0]'s shape is (N, C, H, W), - // N inner products with dimension CHW are performed. + // and axis == 1, N inner products with dimension CHW are performed. 
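  // (Concretely, with N = 2, C = 3, H = 4, W = 5 and axis == 1 this gives
  // K_ = count(1) = 3 * 4 * 5 = 60, i.e. 2 inner products of dimension 60.)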
K_ = bottom[0]->count(axis); // Check if we need to set up the weights if (this->blobs_.size() > 0) { From fdf98469b42392b0bedde015d5e42d93b3ed713f Mon Sep 17 00:00:00 2001 From: Jonathan L Long Date: Mon, 11 Aug 2014 21:38:59 -0700 Subject: [PATCH 61/86] zero-init param diffs and accumulate gradients (With layers whose backwards accumlate gradients), this effectively decouples the computational batch from the SGD minibatch. Each iteration accumulates gradients over iter_size batches, then parameters are updated. --- src/caffe/proto/caffe.proto | 3 ++- src/caffe/solver.cpp | 33 ++++++++++++++++++++++++++++++--- 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index 3b4794664b5..9b68f5f737b 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -88,7 +88,7 @@ message NetParameter { // NOTE // Update the next available ID when you add a new SolverParameter field. // -// SolverParameter next available ID: 36 (last added: clip_gradients) +// SolverParameter next available ID: 37 (last added: iter_size) message SolverParameter { ////////////////////////////////////////////////////////////////////////////// // Specifying the train and test networks @@ -141,6 +141,7 @@ message SolverParameter { // Display the loss averaged over the last average_loss iterations optional int32 average_loss = 33 [default = 1]; optional int32 max_iter = 7; // the maximum number of iterations + optional int32 iter_size = 36 [default = 1]; optional string lr_policy = 8; // The learning rate decay policy. optional float gamma = 9; // The parameter to compute the learning rate. optional float power = 10; // The parameter to compute the learning rate. diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp index 034390e6824..43c4c498679 100644 --- a/src/caffe/solver.cpp +++ b/src/caffe/solver.cpp @@ -168,6 +168,25 @@ void Solver::Step(int iters) { Dtype smoothed_loss = 0; for (; iter_ < stop_iter; ++iter_) { + // zero-init the params + for (int i = 0; i < net_->params().size(); ++i) { + shared_ptr > blob = net_->params()[i]; + switch(Caffe::mode()) { + case Caffe::CPU: + caffe_set(blob->count(), static_cast(0), + blob->mutable_cpu_diff()); + break; + case Caffe::GPU: +#ifndef CPU_ONLY + caffe_gpu_set(blob->count(), static_cast(0), + blob->mutable_gpu_diff()); +#else + NO_GPU; +#endif + break; + } + } + if (param_.test_interval() && iter_ % param_.test_interval() == 0 && (iter_ > 0 || param_.test_initialization())) { TestAll(); @@ -175,7 +194,13 @@ void Solver::Step(int iters) { const bool display = param_.display() && iter_ % param_.display() == 0; net_->set_debug_info(display && param_.debug_info()); - Dtype loss = net_->ForwardBackward(bottom_vec); + // accumulate the loss and gradient + Dtype loss = 0; + for (int i = 0; i < param_.iter_size(); ++i) { + loss += net_->ForwardBackward(bottom_vec); + } + loss /= param_.iter_size(); + // average the loss across iterations for smoothed reporting if (losses.size() < average_loss) { losses.push_back(loss); int size = losses.size(); @@ -471,7 +496,8 @@ void SGDSolver::ComputeUpdateValue() { case Caffe::CPU: for (int param_id = 0; param_id < net_params.size(); ++param_id) { // Compute the value to history, and then copy them to the blob's diff. 
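      // The diff now holds the sum of gradients over iter_size forward/backward
      // passes (Step() zero-inits it and the layers accumulate into it), so the
      // per-parameter rate is divided by iter_size below to apply their average;
      // e.g. batch_size 32 with iter_size 4 behaves like a minibatch of 128.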
- Dtype local_rate = rate * net_params_lr[param_id]; + Dtype local_rate = rate * net_params_lr[param_id] + / this->param_.iter_size(); Dtype local_decay = weight_decay * net_params_weight_decay[param_id]; if (local_decay) { @@ -507,7 +533,8 @@ void SGDSolver::ComputeUpdateValue() { #ifndef CPU_ONLY for (int param_id = 0; param_id < net_params.size(); ++param_id) { // Compute the value to history, and then copy them to the blob's diff. - Dtype local_rate = rate * net_params_lr[param_id]; + Dtype local_rate = rate * net_params_lr[param_id] + / this->param_.iter_size(); Dtype local_decay = weight_decay * net_params_weight_decay[param_id]; if (local_decay) { From 3285efcda3d24e36e9c59e02e9f4d34a129d312e Mon Sep 17 00:00:00 2001 From: Jonathan L Long Date: Tue, 30 Dec 2014 22:52:07 -0800 Subject: [PATCH 62/86] zero-init param diffs in gradient checker --- include/caffe/test/test_gradient_check_util.hpp | 7 +++++-- src/caffe/solver.cpp | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/include/caffe/test/test_gradient_check_util.hpp b/include/caffe/test/test_gradient_check_util.hpp index 22937711b58..cc5dcbad0ee 100644 --- a/include/caffe/test/test_gradient_check_util.hpp +++ b/include/caffe/test/test_gradient_check_util.hpp @@ -80,11 +80,14 @@ void GradientChecker::CheckGradientSingle(Layer* layer, CHECK_EQ(top_count, bottom[blob_id]->count()); } } - // First, figure out what blobs we need to check against. + // First, figure out what blobs we need to check against, and zero init + // parameter blobs. vector*> blobs_to_check; vector propagate_down(bottom.size(), check_bottom < 0); for (int i = 0; i < layer->blobs().size(); ++i) { - blobs_to_check.push_back(layer->blobs()[i].get()); + Blob* blob = layer->blobs()[i].get(); + caffe_set(blob->count(), static_cast(0), blob->mutable_cpu_diff()); + blobs_to_check.push_back(blob); } if (check_bottom < 0) { for (int i = 0; i < bottom.size(); ++i) { diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp index 43c4c498679..affbf149ac4 100644 --- a/src/caffe/solver.cpp +++ b/src/caffe/solver.cpp @@ -171,7 +171,7 @@ void Solver::Step(int iters) { // zero-init the params for (int i = 0; i < net_->params().size(); ++i) { shared_ptr > blob = net_->params()[i]; - switch(Caffe::mode()) { + switch (Caffe::mode()) { case Caffe::CPU: caffe_set(blob->count(), static_cast(0), blob->mutable_cpu_diff()); From 0410dab2faaf881f9daddf6367a2788e1aa670ae Mon Sep 17 00:00:00 2001 From: Sergio Date: Fri, 26 Sep 2014 23:03:26 -0700 Subject: [PATCH 63/86] accumulate gradients in inner product layer --- src/caffe/layers/inner_product_layer.cpp | 4 ++-- src/caffe/layers/inner_product_layer.cu | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/caffe/layers/inner_product_layer.cpp b/src/caffe/layers/inner_product_layer.cpp index 89e0c8fbad7..83c3235eb71 100644 --- a/src/caffe/layers/inner_product_layer.cpp +++ b/src/caffe/layers/inner_product_layer.cpp @@ -101,13 +101,13 @@ void InnerProductLayer::Backward_cpu(const vector*>& top, const Dtype* bottom_data = bottom[0]->cpu_data(); // Gradient with respect to weight caffe_cpu_gemm(CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1., - top_diff, bottom_data, (Dtype)0., this->blobs_[0]->mutable_cpu_diff()); + top_diff, bottom_data, (Dtype)1., this->blobs_[0]->mutable_cpu_diff()); } if (bias_term_ && this->param_propagate_down_[1]) { const Dtype* top_diff = top[0]->cpu_diff(); // Gradient with respect to bias caffe_cpu_gemv(CblasTrans, M_, N_, (Dtype)1., top_diff, - bias_multiplier_.cpu_data(), 
(Dtype)0., + bias_multiplier_.cpu_data(), (Dtype)1., this->blobs_[1]->mutable_cpu_diff()); } if (propagate_down[0]) { diff --git a/src/caffe/layers/inner_product_layer.cu b/src/caffe/layers/inner_product_layer.cu index a9e1784a205..dd90cac12a8 100644 --- a/src/caffe/layers/inner_product_layer.cu +++ b/src/caffe/layers/inner_product_layer.cu @@ -33,13 +33,13 @@ void InnerProductLayer::Backward_gpu(const vector*>& top, const Dtype* bottom_data = bottom[0]->gpu_data(); // Gradient with respect to weight caffe_gpu_gemm(CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1., - top_diff, bottom_data, (Dtype)0., this->blobs_[0]->mutable_gpu_diff()); + top_diff, bottom_data, (Dtype)1., this->blobs_[0]->mutable_gpu_diff()); } if (bias_term_ && this->param_propagate_down_[1]) { const Dtype* top_diff = top[0]->gpu_diff(); // Gradient with respect to bias caffe_gpu_gemv(CblasTrans, M_, N_, (Dtype)1., top_diff, - bias_multiplier_.gpu_data(), (Dtype)0., + bias_multiplier_.gpu_data(), (Dtype)1., this->blobs_[1]->mutable_gpu_diff()); } if (propagate_down[0]) { From 32a7cada3cf8a5ff9e4434b668ec11448752e800 Mon Sep 17 00:00:00 2001 From: Jonathan L Long Date: Tue, 30 Dec 2014 22:29:35 -0800 Subject: [PATCH 64/86] accumulate gradients in (de)conv layers --- src/caffe/layers/conv_layer.cpp | 7 ------- src/caffe/layers/conv_layer.cu | 7 ------- src/caffe/layers/deconv_layer.cpp | 7 ------- src/caffe/layers/deconv_layer.cu | 7 ------- 4 files changed, 28 deletions(-) diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp index c0c9f6f3371..928ef5ee468 100644 --- a/src/caffe/layers/conv_layer.cpp +++ b/src/caffe/layers/conv_layer.cpp @@ -39,13 +39,6 @@ void ConvolutionLayer::Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { const Dtype* weight = this->blobs_[0]->cpu_data(); Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff(); - if (this->param_propagate_down_[0]) { - caffe_set(this->blobs_[0]->count(), Dtype(0), weight_diff); - } - if (this->bias_term_ && this->param_propagate_down_[1]) { - caffe_set(this->blobs_[1]->count(), Dtype(0), - this->blobs_[1]->mutable_cpu_diff()); - } for (int i = 0; i < top.size(); ++i) { const Dtype* top_diff = top[i]->cpu_diff(); const Dtype* bottom_data = bottom[i]->cpu_data(); diff --git a/src/caffe/layers/conv_layer.cu b/src/caffe/layers/conv_layer.cu index 3902fdf3930..b8a98ff7cc9 100644 --- a/src/caffe/layers/conv_layer.cu +++ b/src/caffe/layers/conv_layer.cu @@ -31,13 +31,6 @@ void ConvolutionLayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { const Dtype* weight = this->blobs_[0]->gpu_data(); Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); - if (this->param_propagate_down_[0]) { - caffe_gpu_set(this->blobs_[0]->count(), Dtype(0), weight_diff); - } - if (this->bias_term_ && this->param_propagate_down_[1]) { - caffe_gpu_set(this->blobs_[1]->count(), Dtype(0), - this->blobs_[1]->mutable_gpu_diff()); - } for (int i = 0; i < top.size(); ++i) { const Dtype* top_diff = top[i]->gpu_diff(); // Bias gradient, if necessary. 
diff --git a/src/caffe/layers/deconv_layer.cpp b/src/caffe/layers/deconv_layer.cpp index e6d65ab526b..a4612963b6b 100644 --- a/src/caffe/layers/deconv_layer.cpp +++ b/src/caffe/layers/deconv_layer.cpp @@ -39,13 +39,6 @@ void DeconvolutionLayer::Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { const Dtype* weight = this->blobs_[0]->cpu_data(); Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff(); - if (this->param_propagate_down_[0]) { - caffe_set(this->blobs_[0]->count(), Dtype(0), weight_diff); - } - if (this->bias_term_ && this->param_propagate_down_[1]) { - caffe_set(this->blobs_[1]->count(), Dtype(0), - this->blobs_[1]->mutable_cpu_diff()); - } for (int i = 0; i < top.size(); ++i) { const Dtype* top_diff = top[i]->cpu_diff(); const Dtype* bottom_data = bottom[i]->cpu_data(); diff --git a/src/caffe/layers/deconv_layer.cu b/src/caffe/layers/deconv_layer.cu index 9198dd64c72..39bc4de8c66 100644 --- a/src/caffe/layers/deconv_layer.cu +++ b/src/caffe/layers/deconv_layer.cu @@ -31,13 +31,6 @@ void DeconvolutionLayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { const Dtype* weight = this->blobs_[0]->gpu_data(); Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); - if (this->param_propagate_down_[0]) { - caffe_gpu_set(this->blobs_[0]->count(), Dtype(0), weight_diff); - } - if (this->bias_term_ && this->param_propagate_down_[1]) { - caffe_gpu_set(this->blobs_[1]->count(), Dtype(0), - this->blobs_[1]->mutable_gpu_diff()); - } for (int i = 0; i < top.size(); ++i) { const Dtype* top_diff = top[i]->gpu_diff(); const Dtype* bottom_data = bottom[i]->gpu_data(); From db1660657e440f2b410ba23ee4c1d7b7e60fb65f Mon Sep 17 00:00:00 2001 From: Jonathan L Long Date: Sat, 13 Sep 2014 17:41:59 -0700 Subject: [PATCH 65/86] accumulate gradients in cudnn conv layer --- src/caffe/layers/cudnn_conv_layer.cu | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/caffe/layers/cudnn_conv_layer.cu b/src/caffe/layers/cudnn_conv_layer.cu index 071014e1b48..b5bfdb098e0 100644 --- a/src/caffe/layers/cudnn_conv_layer.cu +++ b/src/caffe/layers/cudnn_conv_layer.cu @@ -54,12 +54,10 @@ void CuDNNConvolutionLayer::Backward_gpu(const vector*>& top, if (this->param_propagate_down_[0]) { weight = this->blobs_[0]->gpu_data(); weight_diff = this->blobs_[0]->mutable_gpu_diff(); - caffe_gpu_set(this->blobs_[0]->count(), Dtype(0), weight_diff); } Dtype* bias_diff = NULL; if (this->bias_term_ && this->param_propagate_down_[1]) { bias_diff = this->blobs_[1]->mutable_gpu_diff(); - caffe_gpu_set(this->blobs_[1]->count(), Dtype(0), bias_diff); } for (int i = 0; i < top.size(); ++i) { const Dtype* top_diff = top[i]->gpu_diff(); From 894d0639df33768d87f649aaf7c43d8b33b5105e Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Wed, 21 Jan 2015 22:21:13 -0800 Subject: [PATCH 66/86] Add gpu_util.cuh, with caffe_gpu_atomic_add (double impl from NVIDIA dev docs; float impl included in CUDA as "atomicAdd") --- include/caffe/util/gpu_util.cuh | 35 +++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 include/caffe/util/gpu_util.cuh diff --git a/include/caffe/util/gpu_util.cuh b/include/caffe/util/gpu_util.cuh new file mode 100644 index 00000000000..994202f2a1a --- /dev/null +++ b/include/caffe/util/gpu_util.cuh @@ -0,0 +1,35 @@ +#ifndef CAFFE_UTIL_GPU_UTIL_H_ +#define CAFFE_UTIL_GPU_UTIL_H_ + +namespace caffe { + +template +inline __device__ Dtype caffe_gpu_atomic_add(const Dtype val, Dtype* address); + +template <> +inline 
__device__ +float caffe_gpu_atomic_add(const float val, float* address) { + return atomicAdd(address, val); +} + +// double atomicAdd implementation taken from: +// http://docs.nvidia.com/cuda/cuda-c-programming-guide/#axzz3PVCpVsEG +template <> +inline __device__ +double caffe_gpu_atomic_add(const double val, double* address) { + unsigned long long int* address_as_ull = // NOLINT(runtime/int) + // NOLINT_NEXT_LINE(runtime/int) + reinterpret_cast(address); + unsigned long long int old = *address_as_ull; // NOLINT(runtime/int) + unsigned long long int assumed; // NOLINT(runtime/int) + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + __longlong_as_double(assumed))); + } while (assumed != old); + return __longlong_as_double(old); +} + +} // namespace caffe + +#endif // CAFFE_UTIL_GPU_UTIL_H_ From ab68f93c6b25e930be245a166cda697ad2dedbf5 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Wed, 21 Jan 2015 14:23:34 -0800 Subject: [PATCH 67/86] test_gradient_check_util: check_bottom < -1 only checks params --- include/caffe/test/test_gradient_check_util.hpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/include/caffe/test/test_gradient_check_util.hpp b/include/caffe/test/test_gradient_check_util.hpp index cc5dcbad0ee..25f35d1589e 100644 --- a/include/caffe/test/test_gradient_check_util.hpp +++ b/include/caffe/test/test_gradient_check_util.hpp @@ -45,6 +45,10 @@ class GradientChecker { void CheckGradientEltwise(Layer* layer, const vector*>& bottom, const vector*>& top); + // Checks the gradient of a single output with respect to particular input + // blob(s). If check_bottom = i >= 0, check only the ith bottom Blob. + // If check_bottom == -1, check everything -- all bottom Blobs and all + // param Blobs. Otherwise (if check_bottom < -1), check only param Blobs. void CheckGradientSingle(Layer* layer, const vector*>& bottom, const vector*>& top, int check_bottom, int top_id, int top_data_id, bool element_wise = false); @@ -83,21 +87,22 @@ void GradientChecker::CheckGradientSingle(Layer* layer, // First, figure out what blobs we need to check against, and zero init // parameter blobs. 
vector*> blobs_to_check; - vector propagate_down(bottom.size(), check_bottom < 0); + vector propagate_down(bottom.size(), check_bottom == -1); for (int i = 0; i < layer->blobs().size(); ++i) { Blob* blob = layer->blobs()[i].get(); caffe_set(blob->count(), static_cast(0), blob->mutable_cpu_diff()); blobs_to_check.push_back(blob); } - if (check_bottom < 0) { + if (check_bottom == -1) { for (int i = 0; i < bottom.size(); ++i) { blobs_to_check.push_back(bottom[i]); } - } else { + } else if (check_bottom >= 0) { CHECK_LT(check_bottom, bottom.size()); blobs_to_check.push_back(bottom[check_bottom]); propagate_down[check_bottom] = true; } + CHECK_GT(blobs_to_check.size(), 0) << "No blobs to check."; // Compute the gradient analytically using Backward Caffe::set_random_seed(seed_); // Ignore the loss from the layer (it's just the weighted sum of the losses From 6e38bb75fb34bcec4173cab1febc3a7e38a3d0d8 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Sun, 15 Feb 2015 16:00:04 -0800 Subject: [PATCH 68/86] Add EmbedLayer for inner products with sparse input (one-hot vectors), with unit tests --- include/caffe/common_layers.hpp | 38 ++++++ src/caffe/layers/embed_layer.cpp | 122 +++++++++++++++++++ src/caffe/layers/embed_layer.cu | 80 ++++++++++++ src/caffe/proto/caffe.proto | 17 ++- src/caffe/test/test_embed_layer.cpp | 183 ++++++++++++++++++++++++++++ 5 files changed, 439 insertions(+), 1 deletion(-) create mode 100644 src/caffe/layers/embed_layer.cpp create mode 100644 src/caffe/layers/embed_layer.cu create mode 100644 src/caffe/test/test_embed_layer.cpp diff --git a/include/caffe/common_layers.hpp b/include/caffe/common_layers.hpp index b1ac3a93eff..7e0046107c3 100644 --- a/include/caffe/common_layers.hpp +++ b/include/caffe/common_layers.hpp @@ -180,6 +180,44 @@ class EltwiseLayer : public Layer { bool stable_prod_grad_; }; +/** + * @brief A layer for learning "embeddings" of one-hot vector input. + * Equivalent to an InnerProductLayer with one-hot vectors as input, but + * for efficiency the input is the "hot" index of each column itself. + * + * TODO(dox): thorough documentation for Forward, Backward, and proto params. + */ +template +class EmbedLayer : public Layer { + public: + explicit EmbedLayer(const LayerParameter& param) + : Layer(param) {} + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { return "Embed"; } + virtual inline int ExactNumBottomBlobs() const { return 1; } + virtual inline int ExactNumTopBlobs() const { return 1; } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + int M_; + int K_; + int N_; + bool bias_term_; + Blob bias_multiplier_; +}; + /** * @brief Reshapes the input Blob into flat vectors. 
* diff --git a/src/caffe/layers/embed_layer.cpp b/src/caffe/layers/embed_layer.cpp new file mode 100644 index 00000000000..be6b2cd2727 --- /dev/null +++ b/src/caffe/layers/embed_layer.cpp @@ -0,0 +1,122 @@ +#include + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/common_layers.hpp" +#include "caffe/filler.hpp" +#include "caffe/layer.hpp" +#include "caffe/util/math_functions.hpp" + +namespace caffe { + +template +void EmbedLayer::LayerSetUp(const vector*>& bottom, + const vector*>& top) { + N_ = this->layer_param_.embed_param().num_output(); + CHECK_GT(N_, 0) << "EmbedLayer num_output must be positive."; + K_ = this->layer_param_.embed_param().input_dim(); + CHECK_GT(K_, 0) << "EmbedLayer input_dim must be positive."; + bias_term_ = this->layer_param_.embed_param().bias_term(); + // Check if we need to set up the weights + if (this->blobs_.size() > 0) { + LOG(INFO) << "Skipping parameter initialization"; + } else { + if (bias_term_) { + this->blobs_.resize(2); + } else { + this->blobs_.resize(1); + } + // Initialize the weights -- + // transposed from InnerProductLayer for spatial locality. + vector weight_shape(2); + weight_shape[0] = K_; + weight_shape[1] = N_; + this->blobs_[0].reset(new Blob(weight_shape)); + // fill the weights + shared_ptr > weight_filler(GetFiller( + this->layer_param_.embed_param().weight_filler())); + weight_filler->Fill(this->blobs_[0].get()); + // If necessary, initialize and fill the bias term + if (bias_term_) { + vector bias_shape(1, N_); + this->blobs_[1].reset(new Blob(bias_shape)); + shared_ptr > bias_filler(GetFiller( + this->layer_param_.embed_param().bias_filler())); + bias_filler->Fill(this->blobs_[1].get()); + } + } // parameter initialization + this->param_propagate_down_.resize(this->blobs_.size(), true); +} + +template +void EmbedLayer::Reshape(const vector*>& bottom, + const vector*>& top) { + // Figure out the dimensions + M_ = bottom[0]->count(); + vector top_shape = bottom[0]->shape(); + top_shape.push_back(N_); + top[0]->Reshape(top_shape); + // Set up the bias multiplier + if (bias_term_) { + vector bias_shape(1, M_); + bias_multiplier_.Reshape(bias_shape); + caffe_set(M_, Dtype(1), bias_multiplier_.mutable_cpu_data()); + } +} + +template +void EmbedLayer::Forward_cpu(const vector*>& bottom, + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + const Dtype* weight = this->blobs_[0]->cpu_data(); + Dtype* top_data = top[0]->mutable_cpu_data(); + int index; + for (int n = 0; n < M_; ++n) { + index = static_cast(bottom_data[n]); + DCHECK_GE(index, 0); + DCHECK_LT(index, K_); + DCHECK_EQ(static_cast(index), bottom_data[n]) << "non-integer input"; + caffe_copy(N_, weight + index * N_, top_data + n * N_); + } + if (bias_term_) { + const Dtype* bias = this->blobs_[1]->cpu_data(); + caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, M_, N_, 1, Dtype(1), + bias_multiplier_.cpu_data(), bias, Dtype(1), top_data); + } +} + +template +void EmbedLayer::Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + CHECK(!propagate_down[0]) << "Can't backpropagate to EmbedLayer input."; + if (this->param_propagate_down_[0]) { + const Dtype* top_diff = top[0]->cpu_diff(); + const Dtype* bottom_data = bottom[0]->cpu_data(); + // Gradient with respect to weight + Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff(); + int index; + for (int n = 0; n < M_; ++n) { + index = static_cast(bottom_data[n]); + DCHECK_GE(index, 0); + DCHECK_LT(index, K_); + DCHECK_EQ(static_cast(index), 
bottom_data[n]) + << "non-integer input"; + caffe_axpy(N_, Dtype(1), top_diff + n * N_, weight_diff + index * N_); + } + } + if (bias_term_ && this->param_propagate_down_[1]) { + const Dtype* top_diff = top[0]->cpu_diff(); + Dtype* bias_diff = this->blobs_[1]->mutable_cpu_diff(); + caffe_cpu_gemv(CblasTrans, M_, N_, Dtype(1), top_diff, + bias_multiplier_.cpu_data(), Dtype(1), bias_diff); + } +} + +#ifdef CPU_ONLY +STUB_GPU(EmbedLayer); +#endif + +INSTANTIATE_CLASS(EmbedLayer); +REGISTER_LAYER_CLASS(Embed); + +} // namespace caffe diff --git a/src/caffe/layers/embed_layer.cu b/src/caffe/layers/embed_layer.cu new file mode 100644 index 00000000000..37a4f7e35cd --- /dev/null +++ b/src/caffe/layers/embed_layer.cu @@ -0,0 +1,80 @@ +#include + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/common_layers.hpp" +#include "caffe/filler.hpp" +#include "caffe/layer.hpp" +#include "caffe/util/math_functions.hpp" + +namespace caffe { + +template +__global__ void EmbedForward(const int nthreads, const Dtype* bottom_data, + const Dtype* weight, const int M, const int N, const int K, + Dtype* top_data) { + CUDA_KERNEL_LOOP(top_index, nthreads) { + const int n = top_index / N; + const int d = top_index % N; + const int index = static_cast(bottom_data[n]); + const int weight_index = index * N + d; + top_data[top_index] = weight[weight_index]; + } +} + +template +__global__ void EmbedBackward(const int nthreads, const Dtype* bottom_data, + const Dtype* top_diff, const int M, const int N, const int K, + Dtype* weight_diff) { + CUDA_KERNEL_LOOP(weight_index, nthreads) { + const int index = weight_index / N; + const int output_index = weight_index % N; + for (int n = 0; n < M; ++n) { + if (static_cast(bottom_data[n]) == index) { + weight_diff[weight_index] += top_diff[n * N + output_index]; + } + } + } +} + +template +void EmbedLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + const Dtype* weight = this->blobs_[0]->gpu_data(); + const int count = top[0]->count(); + EmbedForward // NOLINT_NEXT_LINE(whitespace/operators) + <<>>( + count, bottom_data, weight, M_, N_, K_, top_data); + if (bias_term_) { + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, M_, N_, 1, Dtype(1), + bias_multiplier_.gpu_data(), + this->blobs_[1]->gpu_data(), Dtype(1), top_data); + } +} + +template +void EmbedLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + CHECK(!propagate_down[0]) << "Can't backpropagate to EmbedLayer input."; + if (this->param_propagate_down_[0]) { + const int count = this->blobs_[0]->count(); + const Dtype* top_diff = top[0]->gpu_diff(); + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); + EmbedBackward // NOLINT_NEXT_LINE(whitespace/operators) + <<>>( + count, bottom_data, top_diff, M_, N_, K_, weight_diff); + } + if (bias_term_ && this->param_propagate_down_[1]) { + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff(); + caffe_gpu_gemv(CblasTrans, M_, N_, Dtype(1), top_diff, + bias_multiplier_.gpu_data(), Dtype(1), bias_diff); + } +} + +INSTANTIATE_LAYER_GPU_FUNCS(EmbedLayer); + +} // namespace caffe diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index 9b68f5f737b..9dbd3155359 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -260,7 +260,7 @@ message ParamSpec { // NOTE // 
Update the next available ID when you add a new LayerParameter field. // -// LayerParameter next available layer-specific ID: 131 (last added: python_param) +// LayerParameter next available layer-specific ID: 132 (last added: embed_param) message LayerParameter { optional string name = 1; // the layer name optional string type = 2; // the layer type @@ -312,6 +312,7 @@ message LayerParameter { optional DropoutParameter dropout_param = 108; optional DummyDataParameter dummy_data_param = 109; optional EltwiseParameter eltwise_param = 110; + optional EmbedParameter embed_param = 131; optional ExpParameter exp_param = 111; optional HDF5DataParameter hdf5_data_param = 112; optional HDF5OutputParameter hdf5_output_param = 113; @@ -492,6 +493,20 @@ message EltwiseParameter { optional bool stable_prod_grad = 3 [default = true]; } +// Message that stores parameters used by EmbedLayer +message EmbedParameter { + optional uint32 num_output = 1; // The number of outputs for the layer + // The input is given as integers to be interpreted as one-hot + // vector indices with dimension num_input. Hence num_input should be + // 1 greater than the maximum possible input value. + optional uint32 input_dim = 2; + + optional bool bias_term = 3 [default = true]; // Whether to use a bias term + optional FillerParameter weight_filler = 4; // The filler for the weight + optional FillerParameter bias_filler = 5; // The filler for the bias + +} + // Message that stores parameters used by ExpLayer message ExpParameter { // ExpLayer computes outputs y = base ^ (shift + scale * x), for base > 0. diff --git a/src/caffe/test/test_embed_layer.cpp b/src/caffe/test/test_embed_layer.cpp new file mode 100644 index 00000000000..7a4fb9800f2 --- /dev/null +++ b/src/caffe/test/test_embed_layer.cpp @@ -0,0 +1,183 @@ +#include +#include + +#include "gtest/gtest.h" + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/vision_layers.hpp" + +#include "caffe/test/test_caffe_main.hpp" +#include "caffe/test/test_gradient_check_util.hpp" + +namespace caffe { + +#ifndef CPU_ONLY +extern cudaDeviceProp CAFFE_TEST_CUDA_PROP; +#endif + +template +class EmbedLayerTest : public MultiDeviceTest { + typedef typename TypeParam::Dtype Dtype; + protected: + EmbedLayerTest() + : blob_bottom_(new Blob(4, 1, 1, 1)), + blob_top_(new Blob()) { + // fill the values + FillerParameter filler_param; + UniformFiller filler(filler_param); + filler.Fill(this->blob_bottom_); + blob_bottom_vec_.push_back(blob_bottom_); + blob_top_vec_.push_back(blob_top_); + } + virtual ~EmbedLayerTest() { delete blob_bottom_; delete blob_top_; } + Blob* const blob_bottom_; + Blob* const blob_top_; + vector*> blob_bottom_vec_; + vector*> blob_top_vec_; +}; + +TYPED_TEST_CASE(EmbedLayerTest, TestDtypesAndDevices); + +TYPED_TEST(EmbedLayerTest, TestSetUp) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + EmbedParameter* embed_param = layer_param.mutable_embed_param(); + embed_param->set_num_output(10); + embed_param->set_input_dim(5); + shared_ptr > layer(new EmbedLayer(layer_param)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + ASSERT_EQ(this->blob_top_->num_axes(), 5); + EXPECT_EQ(this->blob_top_->shape(0), 4); + EXPECT_EQ(this->blob_top_->shape(1), 1); + EXPECT_EQ(this->blob_top_->shape(2), 1); + EXPECT_EQ(this->blob_top_->shape(3), 1); + EXPECT_EQ(this->blob_top_->shape(4), 10); +} + +TYPED_TEST(EmbedLayerTest, TestForward) { + typedef typename TypeParam::Dtype Dtype; + 
LayerParameter layer_param; + EmbedParameter* embed_param = layer_param.mutable_embed_param(); + const int kNumOutput = 10; + const int kInputDim = 5; + embed_param->set_num_output(kNumOutput); + embed_param->set_input_dim(kInputDim); + embed_param->mutable_weight_filler()->set_type("uniform"); + embed_param->mutable_weight_filler()->set_min(-10); + embed_param->mutable_weight_filler()->set_max(10); + embed_param->set_bias_term(false); + shared_ptr > layer(new EmbedLayer(layer_param)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + ASSERT_EQ(1, layer->blobs().size()); + vector weight_shape(2); + weight_shape[0] = kInputDim; + weight_shape[1] = kNumOutput; + ASSERT_TRUE(weight_shape == layer->blobs()[0]->shape()); + for (int i = 0; i < this->blob_bottom_->count(); ++i) { + this->blob_bottom_->mutable_cpu_data()[i] = caffe_rng_rand() % kInputDim; + } + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + vector weight_offset(2, 0); + vector top_offset(5, 0); + for (int i = 0; i < this->blob_bottom_->count(); ++i) { + weight_offset[0] = static_cast(this->blob_bottom_->cpu_data()[i]); + weight_offset[1] = 0; + top_offset[0] = i; + top_offset[4] = 0; + for (int j = 0; j < kNumOutput; ++j) { + EXPECT_EQ(layer->blobs()[0]->data_at(weight_offset), + this->blob_top_->data_at(top_offset)); + ++top_offset[4]; + ++weight_offset[1]; + } + } +} + +TYPED_TEST(EmbedLayerTest, TestForwardWithBias) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + EmbedParameter* embed_param = layer_param.mutable_embed_param(); + const int kNumOutput = 10; + const int kInputDim = 5; + embed_param->set_num_output(kNumOutput); + embed_param->set_input_dim(kInputDim); + embed_param->mutable_weight_filler()->set_type("uniform"); + embed_param->mutable_weight_filler()->set_min(-10); + embed_param->mutable_weight_filler()->set_max(10); + embed_param->mutable_bias_filler()->CopyFrom(embed_param->weight_filler()); + embed_param->set_bias_term(true); + shared_ptr > layer(new EmbedLayer(layer_param)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + ASSERT_EQ(2, layer->blobs().size()); + vector weight_shape(2); + weight_shape[0] = kInputDim; + weight_shape[1] = kNumOutput; + ASSERT_TRUE(weight_shape == layer->blobs()[0]->shape()); + for (int i = 0; i < this->blob_bottom_->count(); ++i) { + this->blob_bottom_->mutable_cpu_data()[i] = caffe_rng_rand() % kInputDim; + } + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + vector bias_offset(1, 0); + vector weight_offset(2, 0); + vector top_offset(5, 0); + for (int i = 0; i < this->blob_bottom_->count(); ++i) { + weight_offset[0] = static_cast(this->blob_bottom_->cpu_data()[i]); + weight_offset[1] = 0; + top_offset[0] = i; + top_offset[4] = 0; + bias_offset[0] = 0; + for (int j = 0; j < kNumOutput; ++j) { + EXPECT_EQ(layer->blobs()[0]->data_at(weight_offset) + + layer->blobs()[1]->data_at(bias_offset), + this->blob_top_->data_at(top_offset)); + ++top_offset[4]; + ++weight_offset[1]; + ++bias_offset[0]; + } + } +} + +TYPED_TEST(EmbedLayerTest, TestGradient) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + EmbedParameter* embed_param = layer_param.mutable_embed_param(); + embed_param->set_num_output(10); + embed_param->set_input_dim(5); + embed_param->set_bias_term(false); + embed_param->mutable_weight_filler()->set_type("uniform"); + embed_param->mutable_weight_filler()->set_min(-10); + embed_param->mutable_weight_filler()->set_max(10); + EmbedLayer layer(layer_param); + 
GradientChecker checker(1e-2, 1e-3); + this->blob_bottom_->mutable_cpu_data()[0] = 4; + this->blob_bottom_->mutable_cpu_data()[1] = 2; + this->blob_bottom_->mutable_cpu_data()[2] = 2; + this->blob_bottom_->mutable_cpu_data()[3] = 3; + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, -2); +} + +TYPED_TEST(EmbedLayerTest, TestGradientWithBias) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + EmbedParameter* embed_param = layer_param.mutable_embed_param(); + embed_param->set_num_output(10); + embed_param->set_input_dim(5); + embed_param->set_bias_term(true); + embed_param->mutable_weight_filler()->set_type("uniform"); + embed_param->mutable_weight_filler()->set_min(-10); + embed_param->mutable_weight_filler()->set_max(10); + embed_param->mutable_bias_filler()->CopyFrom(embed_param->weight_filler()); + EmbedLayer layer(layer_param); + GradientChecker checker(1e-2, 1e-3); + this->blob_bottom_->mutable_cpu_data()[0] = 4; + this->blob_bottom_->mutable_cpu_data()[1] = 2; + this->blob_bottom_->mutable_cpu_data()[2] = 2; + this->blob_bottom_->mutable_cpu_data()[3] = 3; + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, -2); +} + +} // namespace caffe From 2db4482bb2ee43ef395375fe7340e29db9b0f9ba Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Wed, 21 Jan 2015 16:12:12 -0800 Subject: [PATCH 69/86] EmbedBackward with no loops -- use caffe_gpu_atomic_add instead --- src/caffe/layers/embed_layer.cu | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/src/caffe/layers/embed_layer.cu b/src/caffe/layers/embed_layer.cu index 37a4f7e35cd..672fb9c608c 100644 --- a/src/caffe/layers/embed_layer.cu +++ b/src/caffe/layers/embed_layer.cu @@ -5,6 +5,7 @@ #include "caffe/common_layers.hpp" #include "caffe/filler.hpp" #include "caffe/layer.hpp" +#include "caffe/util/gpu_util.cuh" #include "caffe/util/math_functions.hpp" namespace caffe { @@ -22,18 +23,21 @@ __global__ void EmbedForward(const int nthreads, const Dtype* bottom_data, } } +template +__global__ void EmbedBackward(const int nthreads, const Dtype* bottom_data, + const Dtype* top_diff, const int M, const int N, const int K, + Dtype* weight_diff); + template __global__ void EmbedBackward(const int nthreads, const Dtype* bottom_data, const Dtype* top_diff, const int M, const int N, const int K, Dtype* weight_diff) { - CUDA_KERNEL_LOOP(weight_index, nthreads) { - const int index = weight_index / N; - const int output_index = weight_index % N; - for (int n = 0; n < M; ++n) { - if (static_cast(bottom_data[n]) == index) { - weight_diff[weight_index] += top_diff[n * N + output_index]; - } - } + CUDA_KERNEL_LOOP(top_index, nthreads) { + const int n = top_index / N; + const int d = top_index % N; + const int index = static_cast(bottom_data[n]); + const int weight_index = index * N + d; + caffe_gpu_atomic_add(top_diff[top_index], weight_diff + weight_index); } } @@ -59,13 +63,14 @@ void EmbedLayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { CHECK(!propagate_down[0]) << "Can't backpropagate to EmbedLayer input."; if (this->param_propagate_down_[0]) { + const int top_count = top[0]->count(); const int count = this->blobs_[0]->count(); const Dtype* top_diff = top[0]->gpu_diff(); const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); EmbedBackward // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - count, bottom_data, top_diff, 
M_, N_, K_, weight_diff); + <<>>( + top_count, bottom_data, top_diff, M_, N_, K_, weight_diff); } if (bias_term_ && this->param_propagate_down_[1]) { const Dtype* top_diff = top[0]->gpu_diff(); From 9aa811c80f45ce482ef4af08fc8967d81d1b3280 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Fri, 30 Jan 2015 23:19:34 -0800 Subject: [PATCH 70/86] Add (very simple version of) ReshapeLayer --- include/caffe/common_layers.hpp | 47 ++++++++++++++++++++++++++++++ src/caffe/layers/reshape_layer.cpp | 20 +++++++++++++ src/caffe/proto/caffe.proto | 10 ++++++- 3 files changed, 76 insertions(+), 1 deletion(-) create mode 100644 src/caffe/layers/reshape_layer.cpp diff --git a/include/caffe/common_layers.hpp b/include/caffe/common_layers.hpp index 7e0046107c3..cadd6b20e5e 100644 --- a/include/caffe/common_layers.hpp +++ b/include/caffe/common_layers.hpp @@ -335,6 +335,53 @@ class MVNLayer : public Layer { Blob sum_multiplier_; }; +/** + * @brief Reshapes an input Blob. + */ +template +class ReshapeLayer : public Layer { + public: + explicit ReshapeLayer(const LayerParameter& param) + : Layer(param) {} + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { return "Reshape"; } + virtual inline int ExactNumBottomBlobs() const { return 1; } + virtual inline int ExactNumTopBlobs() const { return 1; } + + protected: + /** + * @param bottom input Blob vector (length 1) + * -# @f$ (D_1 \times D_2 \times ... \times D_m) @f$ + * the inputs + * @param top output Blob vector (length 1) + * -# @f$ (d_1 \times d_2 \times ... \times d_n) @f$, + * the outputs -- i.e., the (virtually) copied inputs. + * The shape is specified by reshape_param.shape(), and the + * product of the dimensions in the new shape must match that of the + * input shape; i.e., @f$ d_1 d_2 ... d_n = D_1 D_2 ... D_m @f$. + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top) {} + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top) {} + + /** + * @brief Computes the error gradient w.r.t. the concatenate inputs. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length K), into which the top error + * gradient is (virtually) copied + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) {} + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) {} +}; + /** * @brief Ignores bottom blobs while producing no top blobs. (This is useful * to suppress outputs during testing.) 
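An illustrative sketch (not part of the patch): given the "Reshape" type registered by this layer and the ReshapeParameter added below, a net definition could use the new layer roughly as follows, assuming the string-typed `layer` format; the blob names and dimensions are made up, and since this simple version does no dimension inference, the product of the dims in reshape_param must equal the input blob's count.

    layer {
      name: "flatten_spatial"        # hypothetical name
      type: "Reshape"
      bottom: "data"                 # assumed input shape: 64 x 3 x 4 x 2
      top: "data_reshaped"
      reshape_param {
        shape { dim: 64 dim: 24 }    # 64*24 == 64*3*4*2, so the counts match
      }
    }

Because the top blob shares both data and diff with the bottom, the reshape itself is free at run time.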
diff --git a/src/caffe/layers/reshape_layer.cpp b/src/caffe/layers/reshape_layer.cpp new file mode 100644 index 00000000000..f8e5122b297 --- /dev/null +++ b/src/caffe/layers/reshape_layer.cpp @@ -0,0 +1,20 @@ +#include + +#include "caffe/common_layers.hpp" +#include "caffe/layer.hpp" + +namespace caffe { + +template +void ReshapeLayer::Reshape(const vector*>& bottom, + const vector*>& top) { + top[0]->Reshape(this->layer_param_.reshape_param().shape()); + CHECK_EQ(top[0]->count(), bottom[0]->count()); + top[0]->ShareData(*bottom[0]); + top[0]->ShareDiff(*bottom[0]); +} + +INSTANTIATE_CLASS(ReshapeLayer); +REGISTER_LAYER_CLASS(Reshape); + +} // namespace caffe diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index 9dbd3155359..d0072f4a0fc 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -260,7 +260,7 @@ message ParamSpec { // NOTE // Update the next available ID when you add a new LayerParameter field. // -// LayerParameter next available layer-specific ID: 132 (last added: embed_param) +// LayerParameter next available layer-specific ID: 133 (last added: reshape_param) message LayerParameter { optional string name = 1; // the layer name optional string type = 2; // the layer type @@ -327,6 +327,7 @@ message LayerParameter { optional PowerParameter power_param = 122; optional PythonParameter python_param = 130; optional ReLUParameter relu_param = 123; + optional ReshapeParameter reshape_param = 132; optional SigmoidParameter sigmoid_param = 124; optional SoftmaxParameter softmax_param = 125; optional SliceParameter slice_param = 126; @@ -664,6 +665,13 @@ message PythonParameter { optional string layer = 2; } +// Message that stores parameters used by ReshapeLayer +message ReshapeParameter { + // The new shape of the Blob. Must have the same "count" (product of + // dimensions) as the input Blob. + optional BlobShape shape = 1; +} + // Message that stores parameters used by ReLULayer message ReLUParameter { // Allow non-zero slope for negative inputs to speed up optimization From 4a8b1ed523ddaf3d13c0118bf3114b8cee43e30e Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Thu, 30 Oct 2014 17:54:35 -0700 Subject: [PATCH 71/86] FlattenLayer fix -- top should always Share* from bottom (and do everything in Reshape) --- include/caffe/common_layers.hpp | 8 ++++++-- src/caffe/layers/flatten_layer.cpp | 12 +----------- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/include/caffe/common_layers.hpp b/include/caffe/common_layers.hpp index cadd6b20e5e..1991d58206d 100644 --- a/include/caffe/common_layers.hpp +++ b/include/caffe/common_layers.hpp @@ -250,7 +250,9 @@ class FlattenLayer : public Layer { * the outputs -- i.e., the (virtually) copied, flattened inputs */ virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top) {} + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top) {} /** * @brief Computes the error gradient w.r.t. the concatenate inputs. 
@@ -262,7 +264,9 @@ class FlattenLayer : public Layer { * gradient is (virtually) copied */ virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom) {} + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) {} }; /** diff --git a/src/caffe/layers/flatten_layer.cpp b/src/caffe/layers/flatten_layer.cpp index 745f271ea45..3a078fabb81 100644 --- a/src/caffe/layers/flatten_layer.cpp +++ b/src/caffe/layers/flatten_layer.cpp @@ -14,18 +14,8 @@ void FlattenLayer::Reshape(const vector*>& bottom, top_shape[1] = bottom[0]->count() / bottom[0]->num(); top[0]->Reshape(top_shape); CHECK_EQ(top[0]->count(), bottom[0]->count()); -} - -template -void FlattenLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { top[0]->ShareData(*bottom[0]); -} - -template -void FlattenLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - bottom[0]->ShareDiff(*top[0]); + top[0]->ShareDiff(*bottom[0]); } INSTANTIATE_CLASS(FlattenLayer); From 87eb8ddb9afcd8152cdd4172a6da6711be97fb01 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Wed, 5 Nov 2014 13:12:53 -0800 Subject: [PATCH 72/86] AccuracyLayer: add 'denominator' param --- include/caffe/loss_layers.hpp | 1 + src/caffe/layers/accuracy_layer.cpp | 6 +++++- src/caffe/proto/caffe.proto | 6 ++++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/include/caffe/loss_layers.hpp b/include/caffe/loss_layers.hpp index 62d6df71a4a..668219a7a7d 100644 --- a/include/caffe/loss_layers.hpp +++ b/include/caffe/loss_layers.hpp @@ -79,6 +79,7 @@ class AccuracyLayer : public Layer { } int top_k_; + Dtype denominator_; }; /** diff --git a/src/caffe/layers/accuracy_layer.cpp b/src/caffe/layers/accuracy_layer.cpp index 186f9f8632c..b5a7827af62 100644 --- a/src/caffe/layers/accuracy_layer.cpp +++ b/src/caffe/layers/accuracy_layer.cpp @@ -14,6 +14,9 @@ template void AccuracyLayer::LayerSetUp( const vector*>& bottom, const vector*>& top) { top_k_ = this->layer_param_.accuracy_param().top_k(); + denominator_ = this->layer_param_.accuracy_param().denominator(); + CHECK_GE(denominator_, 0) + << "Denominator must be positive; or 0, for the batch size."; } template @@ -60,7 +63,8 @@ void AccuracyLayer::Forward_cpu(const vector*>& bottom, } // LOG(INFO) << "Accuracy: " << accuracy; - top[0]->mutable_cpu_data()[0] = accuracy / num; + const Dtype denominator = (denominator_ == 0) ? num : denominator_; + top[0]->mutable_cpu_data()[0] = accuracy / denominator; // Accuracy layer should not be used as a loss function. } diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index d0072f4a0fc..5429609091d 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -370,6 +370,12 @@ message AccuracyParameter { // the top k scoring classes. By default, only compare to the top scoring // class (i.e. argmax). optional uint32 top_k = 1 [default = 1]; + + // Controls the denominator in the computed accuracy = #correct / denominator. + // Must be a positive number, or the default of 0, for the total input weight. + // If no input weights are used, the denominator is the batch size, as the + // weights each default to 1. 
+ optional float denominator = 2 [default = 0]; } // Message that stores parameters used by ArgMaxLayer From 05009e59f1bb22906688498ff9938121e3e3b695 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Thu, 5 Feb 2015 14:02:01 -0800 Subject: [PATCH 73/86] AccuracyLayer: add support for ignore_label --- include/caffe/loss_layers.hpp | 5 +++++ src/caffe/layers/accuracy_layer.cpp | 18 +++++++++++++++--- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/include/caffe/loss_layers.hpp b/include/caffe/loss_layers.hpp index 668219a7a7d..4277269c938 100644 --- a/include/caffe/loss_layers.hpp +++ b/include/caffe/loss_layers.hpp @@ -78,6 +78,11 @@ class AccuracyLayer : public Layer { } } + /// Whether to ignore instances with a certain label. + bool has_ignore_label_; + /// The label indicating that an instance should be ignored. + int ignore_label_; + int top_k_; Dtype denominator_; }; diff --git a/src/caffe/layers/accuracy_layer.cpp b/src/caffe/layers/accuracy_layer.cpp index b5a7827af62..539a7f175b6 100644 --- a/src/caffe/layers/accuracy_layer.cpp +++ b/src/caffe/layers/accuracy_layer.cpp @@ -17,6 +17,12 @@ void AccuracyLayer::LayerSetUp( denominator_ = this->layer_param_.accuracy_param().denominator(); CHECK_GE(denominator_, 0) << "Denominator must be positive; or 0, for the batch size."; + + has_ignore_label_ = + this->layer_param_.loss_param().has_ignore_label(); + if (has_ignore_label_) { + ignore_label_ = this->layer_param_.loss_param().ignore_label(); + } } template @@ -39,10 +45,11 @@ void AccuracyLayer::Forward_cpu(const vector*>& bottom, Dtype accuracy = 0; const Dtype* bottom_data = bottom[0]->cpu_data(); const Dtype* bottom_label = bottom[1]->cpu_data(); - int num = bottom[0]->count(0, bottom[1]->num_axes()); + int num = bottom[1]->count(); int dim = bottom[0]->count() / num; vector maxval(top_k_+1); vector max_id(top_k_+1); + int count = 0; for (int i = 0; i < num; ++i) { // Top-k accuracy std::vector > bottom_data_vector; @@ -54,8 +61,13 @@ void AccuracyLayer::Forward_cpu(const vector*>& bottom, bottom_data_vector.begin(), bottom_data_vector.begin() + top_k_, bottom_data_vector.end(), std::greater >()); // check if true label is in top k predictions + const int label_value = static_cast(bottom_label[i]); + if (has_ignore_label_ && label_value == ignore_label_) { + continue; + } + ++count; for (int k = 0; k < top_k_; k++) { - if (bottom_data_vector[k].second == static_cast(bottom_label[i])) { + if (bottom_data_vector[k].second == label_value) { ++accuracy; break; } @@ -63,7 +75,7 @@ void AccuracyLayer::Forward_cpu(const vector*>& bottom, } // LOG(INFO) << "Accuracy: " << accuracy; - const Dtype denominator = (denominator_ == 0) ? num : denominator_; + const Dtype denominator = (denominator_ == 0) ? count : denominator_; top[0]->mutable_cpu_data()[0] = accuracy / denominator; // Accuracy layer should not be used as a loss function. 
} From 0b29d6ff0059ab3fdde4be9dfc5f1de157cf35a2 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Tue, 30 Sep 2014 17:08:18 -0700 Subject: [PATCH 74/86] EltwiseLayer can take a blob of per-num coefficients --- include/caffe/common_layers.hpp | 1 + src/caffe/layers/eltwise_layer.cpp | 65 +++++++++++++++++++++------ src/caffe/layers/eltwise_layer.cu | 30 ++++++++++--- src/caffe/proto/caffe.proto | 4 ++ src/caffe/test/test_eltwise_layer.cpp | 59 +++++++++++++++++++++++- 5 files changed, 140 insertions(+), 19 deletions(-) diff --git a/include/caffe/common_layers.hpp b/include/caffe/common_layers.hpp index 1991d58206d..f8a68f0228e 100644 --- a/include/caffe/common_layers.hpp +++ b/include/caffe/common_layers.hpp @@ -176,6 +176,7 @@ class EltwiseLayer : public Layer { EltwiseParameter_EltwiseOp op_; vector coeffs_; Blob max_idx_; + bool coeff_blob_; bool stable_prod_grad_; }; diff --git a/src/caffe/layers/eltwise_layer.cpp b/src/caffe/layers/eltwise_layer.cpp index a80700736bd..5c159ac3d64 100644 --- a/src/caffe/layers/eltwise_layer.cpp +++ b/src/caffe/layers/eltwise_layer.cpp @@ -10,18 +10,23 @@ namespace caffe { template void EltwiseLayer::LayerSetUp(const vector*>& bottom, const vector*>& top) { - CHECK(this->layer_param().eltwise_param().coeff_size() == 0 - || this->layer_param().eltwise_param().coeff_size() == bottom.size()) << + op_ = this->layer_param_.eltwise_param().operation(); + coeff_blob_ = this->layer_param().eltwise_param().coeff_blob(); + if (coeff_blob_) { + CHECK_EQ(op_, EltwiseParameter_EltwiseOp_SUM) + << "coeff_blob option only implemented for the SUM operation"; + } + const int coeff_size = this->layer_param().eltwise_param().coeff_size(); + CHECK(coeff_size == 0 || (!coeff_blob_ && coeff_size == bottom.size()) + || (coeff_blob_ && coeff_size == bottom.size() - 1)) << "Eltwise Layer takes one coefficient per bottom blob."; - CHECK(!(this->layer_param().eltwise_param().operation() - == EltwiseParameter_EltwiseOp_PROD - && this->layer_param().eltwise_param().coeff_size())) << + CHECK(op_ == EltwiseParameter_EltwiseOp_SUM + || this->layer_param().eltwise_param().coeff_size() == 0) << "Eltwise layer only takes coefficients for summation."; - op_ = this->layer_param_.eltwise_param().operation(); // Blob-wise coefficients for the elementwise operation. coeffs_ = vector(bottom.size(), 1); - if (this->layer_param().eltwise_param().coeff_size()) { - for (int i = 0; i < bottom.size(); ++i) { + if (coeff_size) { + for (int i = 0; i < bottom.size() - coeff_blob_; ++i) { coeffs_[i] = this->layer_param().eltwise_param().coeff(i); } } @@ -32,7 +37,19 @@ template void EltwiseLayer::Reshape(const vector*>& bottom, const vector*>& top) { for (int i = 1; i < bottom.size(); ++i) { - CHECK(bottom[i]->shape() == bottom[0]->shape()); + if (coeff_blob_ && i == bottom.size() - 1) { + CHECK_EQ(i, bottom[i]->shape(0)) + << "Dimension of coeff blob axis 0 must equal the number of bottom " + << "blobs (not including the coeff blob itself)."; + for (int input_axis = 0, coeff_axis = 1; + coeff_axis < bottom[i]->num_axes(); ++input_axis, ++coeff_axis) { + CHECK_EQ(bottom[0]->shape(input_axis), bottom[i]->shape(coeff_axis)) + << "Each axis i >= 1 of the coeff blob must match the (i-1)th " + << "axis of the input."; + } + } else { + CHECK(bottom[i]->shape() == bottom[0]->shape()); + } } top[0]->ReshapeLike(*bottom[0]); // If max operation, we will initialize the vector index part. 
@@ -60,8 +77,21 @@ void EltwiseLayer::Forward_cpu( case EltwiseParameter_EltwiseOp_SUM: caffe_set(count, Dtype(0), top_data); // TODO(shelhamer) does BLAS optimize to sum for coeff = 1? - for (int i = 0; i < bottom.size(); ++i) { - caffe_axpy(count, coeffs_[i], bottom[i]->cpu_data(), top_data); + for (int i = 0; i < bottom.size() - coeff_blob_; ++i) { + if (coeff_blob_) { + const int num = bottom[bottom.size() - 1]->count() / + (bottom.size() - 1); + const int dim = bottom[i]->count() / num; + const Dtype* bottom_data = bottom[i]->cpu_data(); + const Dtype* coeff_data = bottom[bottom.size() - 1]->cpu_data(); + for (int j = 0; j < num; ++j, bottom_data += dim, top_data += dim) { + const Dtype coeff = coeffs_[i] * coeff_data[i * num + j]; + caffe_axpy(dim, coeff, bottom_data, top_data); + } + top_data = top[0]->mutable_cpu_data(); + } else { + caffe_axpy(count, coeffs_[i], bottom[i]->cpu_data(), top_data); + } } break; case EltwiseParameter_EltwiseOp_MAX: @@ -104,7 +134,7 @@ void EltwiseLayer::Backward_cpu(const vector*>& top, const int count = top[0]->count(); const Dtype* top_data = top[0]->cpu_data(); const Dtype* top_diff = top[0]->cpu_diff(); - for (int i = 0; i < bottom.size(); ++i) { + for (int i = 0; i < bottom.size() - coeff_blob_; ++i) { if (propagate_down[i]) { const Dtype* bottom_data = bottom[i]->cpu_data(); Dtype* bottom_diff = bottom[i]->mutable_cpu_diff(); @@ -128,7 +158,16 @@ void EltwiseLayer::Backward_cpu(const vector*>& top, caffe_mul(count, bottom_diff, top_diff, bottom_diff); break; case EltwiseParameter_EltwiseOp_SUM: - if (coeffs_[i] == Dtype(1)) { + if (coeff_blob_) { + const int num = bottom[bottom.size() - 1]->count() / + (bottom.size() - 1); + const int dim = bottom[i]->count() / num; + const Dtype* coeff_data = bottom[bottom.size() - 1]->cpu_data(); + for (int j = 0; j < num; ++j, bottom_diff += dim, top_diff += dim) { + const Dtype coeff = coeffs_[i] * coeff_data[i * num + j]; + caffe_cpu_scale(dim, coeff, top_diff, bottom_diff); + } + } else if (coeffs_[i] == Dtype(1.)) { caffe_copy(count, top_diff, bottom_diff); } else { caffe_cpu_scale(count, coeffs_[i], top_diff, bottom_diff); diff --git a/src/caffe/layers/eltwise_layer.cu b/src/caffe/layers/eltwise_layer.cu index 2247870d97f..3abc582919e 100644 --- a/src/caffe/layers/eltwise_layer.cu +++ b/src/caffe/layers/eltwise_layer.cu @@ -48,8 +48,20 @@ void EltwiseLayer::Forward_gpu(const vector*>& bottom, case EltwiseParameter_EltwiseOp_SUM: caffe_gpu_set(count, Dtype(0.), top_data); // TODO(shelhamer) does cuBLAS optimize to sum for coeff = 1? 
- for (int i = 0; i < bottom.size(); ++i) { - caffe_gpu_axpy(count, coeffs_[i], bottom[i]->gpu_data(), top_data); + for (int i = 0; i < bottom.size() - coeff_blob_; ++i) { + if (coeff_blob_) { + const int num = bottom[i]->num(); + const int dim = bottom[i]->count() / num; + const Dtype* bottom_data = bottom[i]->gpu_data(); + const Dtype* coeff_data = bottom[bottom.size() - 1]->cpu_data(); + for (int j = 0; j < num; ++j, bottom_data += dim, top_data += dim) { + const Dtype coeff = coeffs_[i] * coeff_data[i * num + j]; + caffe_gpu_axpy(dim, coeff, bottom_data, top_data); + } + top_data = top[0]->mutable_gpu_data(); + } else { + caffe_gpu_axpy(count, coeffs_[i], bottom[i]->gpu_data(), top_data); + } } break; case EltwiseParameter_EltwiseOp_MAX: @@ -86,10 +98,10 @@ void EltwiseLayer::Backward_gpu(const vector*>& top, const int* mask = NULL; const int count = top[0]->count(); const Dtype* top_data = top[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - for (int i = 0; i < bottom.size(); ++i) { + for (int i = 0; i < bottom.size() - coeff_blob_; ++i) { if (propagate_down[i]) { const Dtype* bottom_data = bottom[i]->gpu_data(); + const Dtype* top_diff = top[0]->gpu_diff(); Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); switch (op_) { case EltwiseParameter_EltwiseOp_PROD: @@ -111,7 +123,15 @@ void EltwiseLayer::Backward_gpu(const vector*>& top, caffe_gpu_mul(count, bottom_diff, top_diff, bottom_diff); break; case EltwiseParameter_EltwiseOp_SUM: - if (coeffs_[i] == Dtype(1.)) { + if (coeff_blob_) { + const int num = bottom[i]->num(); + const int dim = bottom[i]->count() / num; + const Dtype* coeff_data = bottom[bottom.size() - 1]->cpu_data(); + for (int j = 0; j < num; ++j, bottom_diff += dim, top_diff += dim) { + const Dtype coeff = coeffs_[i] * coeff_data[i * num + j]; + caffe_gpu_scale(dim, coeff, top_diff, bottom_diff); + } + } else if (coeffs_[i] == Dtype(1.)) { caffe_copy(count, top_diff, bottom_diff); } else { caffe_gpu_scale(count, coeffs_[i], top_diff, bottom_diff); diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index 5429609091d..d8a6a5d5f28 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -498,6 +498,10 @@ message EltwiseParameter { // Whether to use an asymptotically slower (for >2 inputs) but stabler method // of computing the gradient for the PROD operation. (No effect for SUM op.) optional bool stable_prod_grad = 3 [default = true]; + + // If true and the EltwiseOp is SUM, the last bottom blob is a singleton + // coefficient for the first N-1 bottom blobs, with shape (N-1, 1, 1, 1). 
+ optional bool coeff_blob = 4 [default = false]; } // Message that stores parameters used by EmbedLayer diff --git a/src/caffe/test/test_eltwise_layer.cpp b/src/caffe/test/test_eltwise_layer.cpp index be0c1347709..85c11341abd 100644 --- a/src/caffe/test/test_eltwise_layer.cpp +++ b/src/caffe/test/test_eltwise_layer.cpp @@ -22,14 +22,18 @@ class EltwiseLayerTest : public MultiDeviceTest { : blob_bottom_a_(new Blob(2, 3, 4, 5)), blob_bottom_b_(new Blob(2, 3, 4, 5)), blob_bottom_c_(new Blob(2, 3, 4, 5)), + blob_bottom_coeff_(new Blob()), blob_top_(new Blob()) { - // fill the values + vector coeff_shape(2); + coeff_shape[0] = 3; coeff_shape[1] = 2; + blob_bottom_coeff_->Reshape(coeff_shape); Caffe::set_random_seed(1701); FillerParameter filler_param; UniformFiller filler(filler_param); filler.Fill(this->blob_bottom_a_); filler.Fill(this->blob_bottom_b_); filler.Fill(this->blob_bottom_c_); + filler.Fill(this->blob_bottom_coeff_); blob_bottom_vec_.push_back(blob_bottom_a_); blob_bottom_vec_.push_back(blob_bottom_b_); blob_bottom_vec_.push_back(blob_bottom_c_); @@ -39,11 +43,13 @@ class EltwiseLayerTest : public MultiDeviceTest { delete blob_bottom_a_; delete blob_bottom_b_; delete blob_bottom_c_; + delete blob_bottom_coeff_; delete blob_top_; } Blob* const blob_bottom_a_; Blob* const blob_bottom_b_; Blob* const blob_bottom_c_; + Blob* const blob_bottom_coeff_; Blob* const blob_top_; vector*> blob_bottom_vec_; vector*> blob_top_vec_; @@ -126,6 +132,37 @@ TYPED_TEST(EltwiseLayerTest, TestSumCoeff) { } } +TYPED_TEST(EltwiseLayerTest, TestSumBlobCoeff) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + EltwiseParameter* eltwise_param = layer_param.mutable_eltwise_param(); + eltwise_param->set_operation(EltwiseParameter_EltwiseOp_SUM); + eltwise_param->set_coeff_blob(true); + eltwise_param->add_coeff(1); + eltwise_param->add_coeff(-0.5); + eltwise_param->add_coeff(2); + shared_ptr > layer( + new EltwiseLayer(layer_param)); + this->blob_bottom_vec_.push_back(this->blob_bottom_coeff_); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + const Dtype* data = this->blob_top_->cpu_data(); + const int count = this->blob_top_->count(); + const int num = this->blob_top_->num(); + const int dim = count / num; + const Dtype* coeff_data = this->blob_bottom_coeff_->cpu_data(); + for (int n = 0; n < num; ++n) { + for (int d = 0; d < dim; ++d) { + Dtype sum = 0; + for (int i = 0; i < this->blob_bottom_vec_.size() - 1; ++i) { + const Dtype coeff = coeff_data[i * num + n] * eltwise_param->coeff(i); + sum += coeff * this->blob_bottom_vec_[i]->cpu_data()[n * dim + d]; + } + EXPECT_NEAR(data[n * dim + d], sum, 1e-4); + } + } +} + TYPED_TEST(EltwiseLayerTest, TestStableProdGradient) { typedef typename TypeParam::Dtype Dtype; LayerParameter layer_param; @@ -175,6 +212,26 @@ TYPED_TEST(EltwiseLayerTest, TestSumCoeffGradient) { this->blob_top_vec_); } +TYPED_TEST(EltwiseLayerTest, TestSumBlobCoeffGradient) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + EltwiseParameter* eltwise_param = layer_param.mutable_eltwise_param(); + eltwise_param->set_operation(EltwiseParameter_EltwiseOp_SUM); + eltwise_param->set_coeff_blob(true); + eltwise_param->add_coeff(1); + eltwise_param->add_coeff(-0.5); + eltwise_param->add_coeff(2); + EltwiseLayer layer(layer_param); + this->blob_bottom_vec_.push_back(this->blob_bottom_coeff_); + GradientChecker checker(1e-2, 1e-3); + checker.CheckGradientExhaustive(&layer, 
this->blob_bottom_vec_, + this->blob_top_vec_, 0); + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 1); + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 2); +} + TYPED_TEST(EltwiseLayerTest, TestMax) { typedef typename TypeParam::Dtype Dtype; LayerParameter layer_param; From fd299b4a509e707fb2db1ff113bd3ac7ac458fb4 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Sun, 2 Nov 2014 00:41:50 -0700 Subject: [PATCH 75/86] EltwiseLayer with coeff blob GPU kernel --- src/caffe/layers/eltwise_layer.cpp | 2 +- src/caffe/layers/eltwise_layer.cu | 69 ++++++++++++++++++------------ 2 files changed, 43 insertions(+), 28 deletions(-) diff --git a/src/caffe/layers/eltwise_layer.cpp b/src/caffe/layers/eltwise_layer.cpp index 5c159ac3d64..55d2d955c7c 100644 --- a/src/caffe/layers/eltwise_layer.cpp +++ b/src/caffe/layers/eltwise_layer.cpp @@ -24,7 +24,7 @@ void EltwiseLayer::LayerSetUp(const vector*>& bottom, || this->layer_param().eltwise_param().coeff_size() == 0) << "Eltwise layer only takes coefficients for summation."; // Blob-wise coefficients for the elementwise operation. - coeffs_ = vector(bottom.size(), 1); + coeffs_.resize(bottom.size(), 1); if (coeff_size) { for (int i = 0; i < bottom.size() - coeff_blob_; ++i) { coeffs_[i] = this->layer_param().eltwise_param().coeff(i); diff --git a/src/caffe/layers/eltwise_layer.cu b/src/caffe/layers/eltwise_layer.cu index 3abc582919e..97f52079108 100644 --- a/src/caffe/layers/eltwise_layer.cu +++ b/src/caffe/layers/eltwise_layer.cu @@ -31,12 +31,33 @@ __global__ void MaxForward(const int nthreads, const Dtype* bottom_data_a, } } +template +__global__ void CoeffSum(const int count, const int dim, + const int num_offset, const Dtype coeff, const Dtype* coeff_data, + const bool backward, const Dtype* in, Dtype* out) { + CUDA_KERNEL_LOOP(index, count) { + const int n = num_offset + index / dim; + const Dtype other_coeff = coeff_data ? coeff_data[n] : Dtype(1); + const Dtype final_coeff = coeff * other_coeff; + const Dtype result = in[index] * final_coeff; + if (num_offset == 0 || backward) { + out[index] = result; + } else { + out[index] += result; + } + } +} + template void EltwiseLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { int* mask = NULL; const int count = top[0]->count(); + const int num = top[0]->num(); + const int dim = count / num; Dtype* top_data = top[0]->mutable_gpu_data(); + const Dtype* coeff_data = NULL; + const bool kBackward = false; switch (op_) { case EltwiseParameter_EltwiseOp_PROD: caffe_gpu_mul(count, bottom[0]->gpu_data(), bottom[1]->gpu_data(), @@ -46,22 +67,17 @@ void EltwiseLayer::Forward_gpu(const vector*>& bottom, } break; case EltwiseParameter_EltwiseOp_SUM: - caffe_gpu_set(count, Dtype(0.), top_data); // TODO(shelhamer) does cuBLAS optimize to sum for coeff = 1? 
+ if (coeff_blob_) { + coeff_data = bottom[bottom.size() - 1]->gpu_data(); + } for (int i = 0; i < bottom.size() - coeff_blob_; ++i) { - if (coeff_blob_) { - const int num = bottom[i]->num(); - const int dim = bottom[i]->count() / num; - const Dtype* bottom_data = bottom[i]->gpu_data(); - const Dtype* coeff_data = bottom[bottom.size() - 1]->cpu_data(); - for (int j = 0; j < num; ++j, bottom_data += dim, top_data += dim) { - const Dtype coeff = coeffs_[i] * coeff_data[i * num + j]; - caffe_gpu_axpy(dim, coeff, bottom_data, top_data); - } - top_data = top[0]->mutable_gpu_data(); - } else { - caffe_gpu_axpy(count, coeffs_[i], bottom[i]->gpu_data(), top_data); - } + const Dtype* bottom_data = bottom[i]->gpu_data(); + CoeffSum // NOLINT_NEXT_LINE(whitespace/operators) + <<>>( + count, dim, i * num, coeffs_[i], coeff_data, + kBackward, bottom_data, top_data); + CUDA_POST_KERNEL_CHECK; } break; case EltwiseParameter_EltwiseOp_MAX: @@ -97,7 +113,14 @@ void EltwiseLayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { const int* mask = NULL; const int count = top[0]->count(); + const int num = top[0]->num(); + const int dim = count / num; const Dtype* top_data = top[0]->gpu_data(); + const Dtype* coeff_data = NULL; + if (coeff_blob_) { + coeff_data = bottom[bottom.size() - 1]->gpu_data(); + } + const bool kBackward = true; for (int i = 0; i < bottom.size() - coeff_blob_; ++i) { if (propagate_down[i]) { const Dtype* bottom_data = bottom[i]->gpu_data(); @@ -123,19 +146,11 @@ void EltwiseLayer::Backward_gpu(const vector*>& top, caffe_gpu_mul(count, bottom_diff, top_diff, bottom_diff); break; case EltwiseParameter_EltwiseOp_SUM: - if (coeff_blob_) { - const int num = bottom[i]->num(); - const int dim = bottom[i]->count() / num; - const Dtype* coeff_data = bottom[bottom.size() - 1]->cpu_data(); - for (int j = 0; j < num; ++j, bottom_diff += dim, top_diff += dim) { - const Dtype coeff = coeffs_[i] * coeff_data[i * num + j]; - caffe_gpu_scale(dim, coeff, top_diff, bottom_diff); - } - } else if (coeffs_[i] == Dtype(1.)) { - caffe_copy(count, top_diff, bottom_diff); - } else { - caffe_gpu_scale(count, coeffs_[i], top_diff, bottom_diff); - } + CoeffSum // NOLINT_NEXT_LINE(whitespace/operators) + <<>>( + count, dim, i * num, coeffs_[i], coeff_data, + kBackward, top_diff, bottom_diff); + CUDA_POST_KERNEL_CHECK; break; case EltwiseParameter_EltwiseOp_MAX: mask = max_idx_.gpu_data(); From 82120d143a8310ee075f017672de03bc4286a416 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Tue, 7 Oct 2014 11:55:54 -0700 Subject: [PATCH 76/86] Allow SliceLayer to have a single top Blob (for testing) --- include/caffe/common_layers.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/caffe/common_layers.hpp b/include/caffe/common_layers.hpp index f8a68f0228e..d694f4381c3 100644 --- a/include/caffe/common_layers.hpp +++ b/include/caffe/common_layers.hpp @@ -530,7 +530,7 @@ class SliceLayer : public Layer { virtual inline const char* type() const { return "Slice"; } virtual inline int ExactNumBottomBlobs() const { return 1; } - virtual inline int MinTopBlobs() const { return 2; } + virtual inline int MinTopBlobs() const { return 1; } protected: virtual void Forward_cpu(const vector*>& bottom, From dc113f27bc7b0e4c76dd88a90108d1f830209014 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Fri, 23 Jan 2015 12:52:44 -0800 Subject: [PATCH 77/86] Allow ConcatLayer to take a single bottom Blob (for testing) --- include/caffe/common_layers.hpp | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/include/caffe/common_layers.hpp b/include/caffe/common_layers.hpp index d694f4381c3..a23c671b7e9 100644 --- a/include/caffe/common_layers.hpp +++ b/include/caffe/common_layers.hpp @@ -85,7 +85,7 @@ class ConcatLayer : public Layer { const vector*>& top); virtual inline const char* type() const { return "Concat"; } - virtual inline int MinBottomBlobs() const { return 2; } + virtual inline int MinBottomBlobs() const { return 1; } virtual inline int ExactNumTopBlobs() const { return 1; } protected: From 789b2ec20a9a2c8841dbf16bae2abff0647fed5f Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Sun, 15 Feb 2015 15:18:56 -0800 Subject: [PATCH 78/86] Modifications to Net to facilitate unrolled recurrent networks --- include/caffe/net.hpp | 10 ++++++++++ src/caffe/net.cpp | 42 +++++++++++------------------------------- 2 files changed, 21 insertions(+), 31 deletions(-) diff --git a/include/caffe/net.hpp b/include/caffe/net.hpp index 075afebc9b0..10fc1705192 100644 --- a/include/caffe/net.hpp +++ b/include/caffe/net.hpp @@ -84,6 +84,13 @@ class Net { /// @brief Updates the network weights based on the diff values computed. void Update(); + /** + * @brief Shares weight data of owner blobs with shared blobs. + * + * Note: this is called by Net::Init, and thus should normally not be + * called manually. + */ + void ShareWeightData(); /** * @brief For an already initialized net, implicitly copies (i.e., using no @@ -150,6 +157,9 @@ class Net { return param_names_index_; } inline const vector& param_owners() const { return param_owners_; } + inline const vector& param_display_names() const { + return param_display_names_; + } /// @brief Input and output blob numbers inline int num_inputs() const { return net_input_blobs_.size(); } inline int num_outputs() const { return net_output_blobs_.size(); } diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp index e8f7c05e09d..bd6c22e5e5c 100644 --- a/src/caffe/net.cpp +++ b/src/caffe/net.cpp @@ -213,6 +213,7 @@ void Net::Init(const NetParameter& in_param) { layer_names_index_[layer_names_[layer_id]] = layer_id; } GetLearningRateAndWeightDecay(); + ShareWeightData(); debug_info_ = param.debug_info(); LOG(INFO) << "Network initialization done."; LOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype); @@ -437,8 +438,6 @@ void Net::AppendParam(const NetParameter& param, const int layer_id, // Strict dimension checking -- all dims must be the same. CHECK(this_blob->shape() == owner_blob->shape()); } - layers_[layer_id]->blobs()[param_id]->ShareData( - *layers_[owner_layer_id]->blobs()[owner_param_id]); } } @@ -741,35 +740,7 @@ void Net::ToProto(NetParameter* param, bool write_diff) const { template void Net::Update() { - // First, accumulate the diffs of any shared parameters into their owner's - // diff. (Assumes that the learning rate, weight decay, etc. have already been - // accounted for in the current diff.) 
- for (int i = 0; i < params_.size(); ++i) { - if (param_owners_[i] < 0) { continue; } - if (debug_info_) { UpdateDebugInfo(i); } - const int count = params_[i]->count(); - const Dtype* this_diff; - Dtype* owner_diff; - switch (Caffe::mode()) { - case Caffe::CPU: - this_diff = params_[i]->cpu_diff(); - owner_diff = params_[param_owners_[i]]->mutable_cpu_diff(); - caffe_add(count, this_diff, owner_diff, owner_diff); - break; -#ifndef CPU_ONLY - case Caffe::GPU: - this_diff = params_[i]->gpu_diff(); - owner_diff = params_[param_owners_[i]]->mutable_gpu_diff(); - caffe_gpu_add(count, this_diff, owner_diff, owner_diff); - break; -#else - NO_GPU; -#endif - default: - LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); - } - } - // Now, update the owned parameters. + // Update only the owned parameters. for (int i = 0; i < params_.size(); ++i) { if (param_owners_[i] >= 0) { continue; } if (debug_info_) { UpdateDebugInfo(i); } @@ -777,6 +748,15 @@ void Net::Update() { } } +template +void Net::ShareWeightData() { + for (int i = 0; i < params_.size(); ++i) { + if (param_owners_[i] < 0) { continue; } + params_[i]->ShareData(*params_[param_owners_[i]]); + params_[i]->ShareDiff(*params_[param_owners_[i]]); + } +} + template bool Net::has_blob(const string& blob_name) const { return blob_names_index_.find(blob_name) != blob_names_index_.end(); From 50adf9b03d87daec33e9f63cf989f0e656b6172b Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Sun, 15 Feb 2015 17:35:52 -0800 Subject: [PATCH 79/86] TestNet fixes for Net weight sharing modifications --- src/caffe/test/test_net.cpp | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/src/caffe/test/test_net.cpp b/src/caffe/test/test_net.cpp index 08106e79274..c65116d7eb1 100644 --- a/src/caffe/test/test_net.cpp +++ b/src/caffe/test/test_net.cpp @@ -1007,11 +1007,10 @@ TYPED_TEST(NetTest, TestSharedWeightsUpdate) { EXPECT_EQ(this->net_->layer_names()[2], "innerproduct2"); Blob* ip1_weights = this->net_->layers()[1]->blobs()[0].get(); Blob* ip2_weights = this->net_->layers()[2]->blobs()[0].get(); - // Check that data blobs of shared weights share the same location in memory. + // Check that data and diff blobs of shared weights share the same memory + // locations. EXPECT_EQ(ip1_weights->cpu_data(), ip2_weights->cpu_data()); - // Check that diff blobs of shared weights are at different locations in - // memory. (The diffs should be accumulated at update time.) - EXPECT_NE(ip1_weights->cpu_diff(), ip2_weights->cpu_diff()); + EXPECT_EQ(ip1_weights->cpu_diff(), ip2_weights->cpu_diff()); this->net_->Forward(bottom); this->net_->Backward(); // Compute the expected update as the data minus the two diffs. @@ -1024,11 +1023,7 @@ TYPED_TEST(NetTest, TestSharedWeightsUpdate) { // Make sure the diffs are non-trivial. 
for (int i = 0; i < count; ++i) { EXPECT_NE(0, ip1_weights->cpu_diff()[i]); - EXPECT_NE(0, ip2_weights->cpu_diff()[i]); - EXPECT_NE(ip1_weights->cpu_diff()[i], ip2_weights->cpu_diff()[i]); } - caffe_axpy(count, Dtype(1), ip2_weights->cpu_diff(), - shared_params.mutable_cpu_diff()); caffe_axpy(count, Dtype(-1), shared_params.cpu_diff(), shared_params.mutable_cpu_data()); const Dtype* expected_updated_params = shared_params.cpu_data(); @@ -1065,8 +1060,8 @@ TYPED_TEST(NetTest, TestSharedWeightsUpdate) { EXPECT_NE(0, ip1_weights->cpu_diff()[i]); EXPECT_NE(0, ip2_weights->cpu_diff()[i]); EXPECT_NE(ip1_weights->cpu_diff()[i], ip2_weights->cpu_diff()[i]); - EXPECT_EQ(ip1_weights->cpu_diff()[i] + ip2_weights->cpu_diff()[i], - shared_params.cpu_diff()[i]); + EXPECT_FLOAT_EQ(ip1_weights->cpu_diff()[i] + ip2_weights->cpu_diff()[i], + shared_params.cpu_diff()[i]); } caffe_axpy(count, Dtype(-1), ip1_weights->cpu_diff(), unshared_params1.mutable_cpu_data()); @@ -1096,11 +1091,10 @@ TYPED_TEST(NetTest, TestSharedWeightsResume) { EXPECT_EQ(this->net_->layer_names()[2], "innerproduct2"); Blob* ip1_weights = this->net_->layers()[1]->blobs()[0].get(); Blob* ip2_weights = this->net_->layers()[2]->blobs()[0].get(); - // Check that data blobs of shared weights share the same location in memory. + // Check that data and diff blobs of shared weights share the same memory + // locations. EXPECT_EQ(ip1_weights->cpu_data(), ip2_weights->cpu_data()); - // Check that diff blobs of shared weights are at different locations in - // memory. (The diffs should be accumulated at update time.) - EXPECT_NE(ip1_weights->cpu_diff(), ip2_weights->cpu_diff()); + EXPECT_EQ(ip1_weights->cpu_diff(), ip2_weights->cpu_diff()); this->net_->ForwardBackward(bottom); this->net_->Update(); Blob shared_params; @@ -1123,14 +1117,13 @@ TYPED_TEST(NetTest, TestSharedWeightsResume) { ASSERT_FALSE(NULL == ip1_weights); ASSERT_FALSE(NULL == ip2_weights); EXPECT_NE(ip1_weights, ip2_weights); - // Check that data blobs of shared weights share the same location in memory. + // Check that data and diff blobs of shared weights share the same memory + // locations. EXPECT_EQ(ip1_weights->cpu_data(), ip2_weights->cpu_data()); + EXPECT_EQ(ip1_weights->cpu_diff(), ip2_weights->cpu_diff()); for (int i = 0; i < count; ++i) { EXPECT_FLOAT_EQ(shared_params.cpu_data()[i], ip1_weights->cpu_data()[i]); } - // Check that diff blobs of shared weights are at different locations in - // memory. (The diffs should be accumulated at update time.) 
- EXPECT_NE(ip1_weights->cpu_diff(), ip2_weights->cpu_diff()); } TYPED_TEST(NetTest, TestParamPropagateDown) { From 97792f5cde0fa0f5a63dfca6db1f00ae3330c942 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Sun, 15 Feb 2015 14:28:01 -0800 Subject: [PATCH 80/86] Add RecurrentLayer: an abstract superclass for other recurrent layer types --- include/caffe/sequence_layers.hpp | 154 +++++++++++++++++++ src/caffe/layers/recurrent_layer.cpp | 222 +++++++++++++++++++++++++++ src/caffe/layers/recurrent_layer.cu | 35 +++++ src/caffe/proto/caffe.proto | 16 +- 4 files changed, 426 insertions(+), 1 deletion(-) create mode 100644 include/caffe/sequence_layers.hpp create mode 100644 src/caffe/layers/recurrent_layer.cpp create mode 100644 src/caffe/layers/recurrent_layer.cu diff --git a/include/caffe/sequence_layers.hpp b/include/caffe/sequence_layers.hpp new file mode 100644 index 00000000000..d064136bf15 --- /dev/null +++ b/include/caffe/sequence_layers.hpp @@ -0,0 +1,154 @@ +#ifndef CAFFE_SEQUENCE_LAYERS_HPP_ +#define CAFFE_SEQUENCE_LAYERS_HPP_ + +#include +#include +#include + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/layer.hpp" +#include "caffe/net.hpp" +#include "caffe/proto/caffe.pb.h" + +namespace caffe { + +template class RecurrentLayer; + +/** + * @brief An abstract class for implementing recurrent behavior inside of an + * unrolled network. This Layer type cannot be instantiated -- instaed, + * you should use one of its implementations which defines the recurrent + * architecture, such as RNNLayer or LSTMLayer. + */ +template +class RecurrentLayer : public Layer { + public: + explicit RecurrentLayer(const LayerParameter& param) + : Layer(param) {} + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + virtual void Reset(); + + virtual inline const char* type() const { return "Recurrent"; } + virtual inline int MinBottomBlobs() const { return 2; } + virtual inline int MaxBottomBlobs() const { return 3; } + virtual inline int ExactNumTopBlobs() const { return 1; } + + virtual inline bool AllowForceBackward(const int bottom_index) const { + // Can't propagate to sequence continuation indicators. + return bottom_index != 1; + } + + protected: + /** + * @brief Fills net_param with the recurrent network arcthiecture. Subclasses + * should define this -- see RNNLayer and LSTMLayer for examples. + */ + virtual void FillUnrolledNet(NetParameter* net_param) const = 0; + + /** + * @brief Fills names with the names of the 0th timestep recurrent input + * Blob&s. Subclasses should define this -- see RNNLayer and LSTMLayer + * for examples. + */ + virtual void RecurrentInputBlobNames(vector* names) const = 0; + + /** + * @brief Fills names with the names of the Tth timestep recurrent output + * Blob&s. Subclasses should define this -- see RNNLayer and LSTMLayer + * for examples. + */ + virtual void RecurrentOutputBlobNames(vector* names) const = 0; + + /** + * @brief Fills names with the names of the output blobs, concatenated across + * all timesteps. Should return a name for each top Blob. + * Subclasses should define this -- see RNNLayer and LSTMLayer for + * examples. + */ + virtual void OutputBlobNames(vector* names) const = 0; + + /** + * @param bottom input Blob vector (length 2-3) + * + * -# @f$ (T \times N \times ...) @f$ + * the time-varying input @f$ x @f$. 
After the first two axes, whose + * dimensions must correspond to the number of timesteps @f$ T @f$ and + * the number of independent streams @f$ N @f$, respectively, its + * dimensions may be arbitrary. Note that the ordering of dimensions -- + * @f$ (T \times N \times ...) @f$, rather than + * @f$ (N \times T \times ...) @f$ -- means that the @f$ N @f$ + * independent input streams must be "interleaved". + * + * -# @f$ (T \times N) @f$ + * the sequence continuation indicators @f$ \delta @f$. + * These inputs should be binary (0 or 1) indicators, where + * @f$ \delta_{t,n} = 0 @f$ means that timestep @f$ t @f$ of stream + * @f$ n @f$ is the beginning of a new sequence, and hence the previous + * hidden state @f$ h_{t-1} @f$ is multiplied by @f$ \delta_t = 0 @f$ + * and has no effect on the cell's output at timestep @f$ t @f$, and + * a value of @f$ \delta_{t,n} = 1 @f$ means that timestep @f$ t @f$ of + * stream @f$ n @f$ is a continuation from the previous timestep + * @f$ t-1 @f$, and the previous hidden state @f$ h_{t-1} @f$ affects the + * updated hidden state and output. + * + * -# @f$ (N \times ...) @f$ (optional) + * the static (non-time-varying) input @f$ x_{static} @f$. + * After the first axis, whose dimension must be the number of + * independent streams, its dimensions may be arbitrary. + * This is mathematically equivalent to using a time-varying input of + * @f$ x'_t = [x_t; x_{static}] @f$ -- i.e., tiling the static input + * across the @f$ T @f$ timesteps and concatenating with the time-varying + * input. Note that if this input is used, all timesteps in a single + * batch within a particular one of the @f$ N @f$ streams must share the + * same static input, even if the sequence continuation indicators + * suggest that difference sequences are ending and beginning within a + * single batch. This may require padding and/or truncation for uniform + * length. + * + * @param top output Blob vector (length 1) + * -# @f$ (T \times N \times D) @f$ + * the time-varying output @f$ y @f$, where @f$ D @f$ is + * recurrent_param.num_output(). + * Refer to documentation for particular RecurrentLayer implementations + * (such as RNNLayer and LSTMLayer) for the definition of @f$ y @f$. + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + /// @brief A helper function, useful for stringifying timestep indices. + virtual string int_to_str(const int t) const; + + /// @brief A Net to implement the Recurrent functionality. + shared_ptr > unrolled_net_; + + /// @brief The number of independent streams to process simultaneously. + int N_; + + /** + * @brief The number of timesteps in the layer's input, and the number of + * timesteps over which to backpropagate through time. + */ + int T_; + + /// @brief Whether the layer has a "static" input copied across all timesteps. 
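As a concrete illustration of the input layout documented above: for T = 3 timesteps of N = 2 independent streams, the bottoms would be shaped and the continuation indicators filled roughly as below, with a 0 marking the start of a new sequence in a stream (shapes and values are illustrative only):

    #include <vector>
    #include "caffe/blob.hpp"

    // Illustrative setup for the (T x N x ...) input and (T x N) indicators.
    void RecurrentBottomSketch() {
      const int T = 3, N = 2;
      caffe::Blob<float> x, cont;
      x.Reshape(T, N, 4, 4);                   // time-varying input x
      std::vector<int> cont_shape(2);
      cont_shape[0] = T;
      cont_shape[1] = N;
      cont.Reshape(cont_shape);                // sequence continuation indicators
      for (int t = 0; t < T; ++t) {
        for (int n = 0; n < N; ++n) {
          // 0 at t == 0 starts a sequence; 1 continues it from timestep t-1.
          cont.mutable_cpu_data()[t * N + n] = (t > 0);
        }
      }
      // The layer's single top blob then comes out as (T x N x num_output).
    }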
+ bool static_input_; + + vector* > recur_input_blobs_; + vector* > recur_output_blobs_; + vector* > output_blobs_; + Blob* x_input_blob_; + Blob* x_static_input_blob_; + Blob* cont_input_blob_; +}; + +} // namespace caffe + +#endif // CAFFE_SEQUENCE_LAYERS_HPP_ diff --git a/src/caffe/layers/recurrent_layer.cpp b/src/caffe/layers/recurrent_layer.cpp new file mode 100644 index 00000000000..7dc38fec901 --- /dev/null +++ b/src/caffe/layers/recurrent_layer.cpp @@ -0,0 +1,222 @@ +#include +#include + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/layer.hpp" +#include "caffe/sequence_layers.hpp" +#include "caffe/util/math_functions.hpp" + +namespace caffe { + +template +string RecurrentLayer::int_to_str(const int t) const { + ostringstream num; + num << t; + return num.str(); +} + +template +void RecurrentLayer::LayerSetUp(const vector*>& bottom, + const vector*>& top) { + CHECK_GE(bottom[0]->num_axes(), 2) + << "bottom[0] must have at least 2 axes -- (#timesteps, #streams, ...)"; + T_ = bottom[0]->shape(0); + N_ = bottom[0]->shape(1); + LOG(INFO) << "Initializing recurrent layer: assuming input batch contains " + << T_ << " timesteps of " << N_ << " independent streams."; + + CHECK_EQ(bottom[1]->num_axes(), 2) + << "bottom[1] must have exactly 2 axes -- (#timesteps, #streams)"; + CHECK_EQ(T_, bottom[1]->shape(0)); + CHECK_EQ(N_, bottom[1]->shape(1)); + + // If provided, bottom[2] is a static input to the recurrent net. + static_input_ = (bottom.size() > 2); + if (static_input_) { + CHECK_GE(bottom[2]->num_axes(), 1); + CHECK_EQ(N_, bottom[2]->shape(0)); + } + + // Create a NetParameter; setup the inputs that aren't unique to particular + // recurrent architectures. + NetParameter net_param; + net_param.set_force_backward(true); + + net_param.add_input("x"); + BlobShape input_shape; + for (int i = 0; i < bottom[0]->num_axes(); ++i) { + input_shape.add_dim(bottom[0]->shape(i)); + } + net_param.add_input_shape()->CopyFrom(input_shape); + + input_shape.Clear(); + input_shape.add_dim(1); + for (int i = 0; i < bottom[1]->num_axes(); ++i) { + input_shape.add_dim(bottom[1]->shape(i)); + } + net_param.add_input("cont"); + net_param.add_input_shape()->CopyFrom(input_shape); + + if (static_input_) { + input_shape.Clear(); + for (int i = 0; i < bottom[2]->num_axes(); ++i) { + input_shape.add_dim(bottom[2]->shape(i)); + } + net_param.add_input("x_static"); + net_param.add_input_shape()->CopyFrom(input_shape); + } + + // Call the child's FillUnrolledNet implementation to specify the unrolled + // recurrent architecture. + this->FillUnrolledNet(&net_param); + + // Prepend this layer's name to the names of each layer in the unrolled net. + const string& layer_name = this->layer_param_.name(); + if (layer_name.size() > 0) { + for (int i = 0; i < net_param.layer_size(); ++i) { + LayerParameter* layer = net_param.mutable_layer(i); + layer->set_name(layer_name + "_" + layer->name()); + } + } + + // Create the unrolled net. + unrolled_net_.reset(new Net(net_param)); + unrolled_net_->set_debug_info( + this->layer_param_.recurrent_param().debug_info()); + + // Setup pointers to the inputs. + x_input_blob_ = CHECK_NOTNULL(unrolled_net_->blob_by_name("x").get()); + cont_input_blob_ = CHECK_NOTNULL(unrolled_net_->blob_by_name("cont").get()); + if (static_input_) { + x_static_input_blob_ = + CHECK_NOTNULL(unrolled_net_->blob_by_name("x_static").get()); + } + + // Setup pointers to paired recurrent inputs/outputs. 
+ vector recur_input_names; + RecurrentInputBlobNames(&recur_input_names); + vector recur_output_names; + RecurrentOutputBlobNames(&recur_output_names); + const int num_recur_blobs = recur_input_names.size(); + CHECK_EQ(num_recur_blobs, recur_output_names.size()); + recur_input_blobs_.resize(num_recur_blobs); + recur_output_blobs_.resize(num_recur_blobs); + for (int i = 0; i < recur_input_names.size(); ++i) { + recur_input_blobs_[i] = + CHECK_NOTNULL(unrolled_net_->blob_by_name(recur_input_names[i]).get()); + recur_output_blobs_[i] = + CHECK_NOTNULL(unrolled_net_->blob_by_name(recur_output_names[i]).get()); + } + + // Setup pointers to outputs. + vector output_names; + OutputBlobNames(&output_names); + CHECK_EQ(top.size(), output_names.size()) + << "OutputBlobNames must provide an output blob name for each top."; + output_blobs_.resize(output_names.size()); + for (int i = 0; i < output_names.size(); ++i) { + output_blobs_[i] = + CHECK_NOTNULL(unrolled_net_->blob_by_name(output_names[i]).get()); + } + + // We should have 2 inputs (x and cont), plus a number of recurrent inputs, + // plus maybe a static input. + CHECK_EQ(2 + num_recur_blobs + static_input_, + unrolled_net_->input_blobs().size()); + + // This layer's parameters are any parameters in the layers of the unrolled + // net. We only want one copy of each parameter, so check that the parameter + // is "owned" by the layer, rather than shared with another. + this->blobs_.clear(); + for (int i = 0; i < unrolled_net_->params().size(); ++i) { + if (unrolled_net_->param_owners()[i] == -1) { + LOG(INFO) << "Adding parameter " << i << ": " + << unrolled_net_->param_display_names()[i]; + this->blobs_.push_back(unrolled_net_->params()[i]); + } + } + // Check that param_propagate_down is set for all of the parameters in the + // unrolled net; set param_propagate_down to true in this layer. + for (int i = 0; i < unrolled_net_->layers().size(); ++i) { + for (int j = 0; j < unrolled_net_->layers()[i]->blobs().size(); ++j) { + CHECK(unrolled_net_->layers()[i]->param_propagate_down(j)) + << "param_propagate_down not set for layer " << i << ", param " << j; + } + } + this->param_propagate_down_.clear(); + this->param_propagate_down_.resize(this->blobs_.size(), true); + + // Set the diffs of recurrent outputs to 0 -- we can't backpropagate across + // batches. + for (int i = 0; i < recur_output_blobs_.size(); ++i) { + caffe_set(recur_output_blobs_[i]->count(), Dtype(0), + recur_output_blobs_[i]->mutable_cpu_diff()); + } +} + +template +void RecurrentLayer::Reshape(const vector*>& bottom, + const vector*>& top) { + CHECK_EQ(top.size(), output_blobs_.size()); + for (int i = 0; i < top.size(); ++i) { + top[i]->ReshapeLike(*output_blobs_[i]); + output_blobs_[i]->ShareData(*top[i]); + output_blobs_[i]->ShareDiff(*top[i]); + } + x_input_blob_->ShareData(*bottom[0]); + x_input_blob_->ShareDiff(*bottom[0]); + cont_input_blob_->ShareData(*bottom[1]); + if (static_input_) { + x_static_input_blob_->ShareData(*bottom[2]); + x_static_input_blob_->ShareDiff(*bottom[2]); + } +} + +template +void RecurrentLayer::Reset() { + // "Reset" the hidden state of the net by zeroing out all recurrent outputs. + for (int i = 0; i < recur_output_blobs_.size(); ++i) { + caffe_set(recur_output_blobs_[i]->count(), Dtype(0), + recur_output_blobs_[i]->mutable_cpu_data()); + } +} + +template +void RecurrentLayer::Forward_cpu(const vector*>& bottom, + const vector*>& top) { + // Hacky fix for test time... reshare all the shared blobs. + // TODO: somehow make this work non-hackily. 
+ if (this->phase_ == TEST) { + unrolled_net_->ShareWeightData(); + } + + DCHECK_EQ(recur_input_blobs_.size(), recur_output_blobs_.size()); + for (int i = 0; i < recur_input_blobs_.size(); ++i) { + const int count = recur_input_blobs_[i]->count(); + DCHECK_EQ(count, recur_output_blobs_[i]->count()); + const Dtype* timestep_T_data = recur_output_blobs_[i]->cpu_data(); + Dtype* timestep_0_data = recur_input_blobs_[i]->mutable_cpu_data(); + caffe_copy(count, timestep_T_data, timestep_0_data); + } + + unrolled_net_->ForwardPrefilled(); +} + +template +void RecurrentLayer::Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + CHECK(!propagate_down[1]) << "Cannot backpropagate to sequence indicators."; + if (!propagate_down[0] && !propagate_down[2]) { return; } + + unrolled_net_->Backward(); +} + +#ifdef CPU_ONLY +STUB_GPU_FORWARD(RecurrentLayer, Forward); +#endif + +INSTANTIATE_CLASS(RecurrentLayer); + +} // namespace caffe diff --git a/src/caffe/layers/recurrent_layer.cu b/src/caffe/layers/recurrent_layer.cu new file mode 100644 index 00000000000..ce4b2f9b77c --- /dev/null +++ b/src/caffe/layers/recurrent_layer.cu @@ -0,0 +1,35 @@ +#include + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/layer.hpp" +#include "caffe/sequence_layers.hpp" +#include "caffe/util/math_functions.hpp" + +namespace caffe { + +template +void RecurrentLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + // Hacky fix for test time... reshare all the shared blobs. + // TODO: somehow make this work non-hackily. + if (this->phase_ == TEST) { + unrolled_net_->ShareWeightData(); + } + + DCHECK_EQ(recur_input_blobs_.size(), recur_output_blobs_.size()); + for (int i = 0; i < recur_input_blobs_.size(); ++i) { + const int count = recur_input_blobs_[i]->count(); + DCHECK_EQ(count, recur_output_blobs_[i]->count()); + const Dtype* timestep_T_data = recur_output_blobs_[i]->gpu_data(); + Dtype* timestep_0_data = recur_input_blobs_[i]->mutable_gpu_data(); + caffe_copy(count, timestep_T_data, timestep_0_data); + } + + unrolled_net_->ForwardPrefilled(); +} + +INSTANTIATE_LAYER_GPU_FORWARD(RecurrentLayer); + +} // namespace caffe diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index d8a6a5d5f28..227271d6918 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -260,7 +260,7 @@ message ParamSpec { // NOTE // Update the next available ID when you add a new LayerParameter field. // -// LayerParameter next available layer-specific ID: 133 (last added: reshape_param) +// LayerParameter next available layer-specific ID: 134 (last added: recurrent_param) message LayerParameter { optional string name = 1; // the layer name optional string type = 2; // the layer type @@ -326,6 +326,7 @@ message LayerParameter { optional PoolingParameter pooling_param = 121; optional PowerParameter power_param = 122; optional PythonParameter python_param = 130; + optional RecurrentParameter recurrent_param = 133; optional ReLUParameter relu_param = 123; optional ReshapeParameter reshape_param = 132; optional SigmoidParameter sigmoid_param = 124; @@ -682,6 +683,19 @@ message ReshapeParameter { optional BlobShape shape = 1; } +// Message that stores parameters used by RecurrentLayer +message RecurrentParameter { + // The dimension of the output (and usually hidden state) representation -- + // must be explicitly set to non-zero. 
+ optional uint32 num_output = 1 [default = 0]; + + optional FillerParameter weight_filler = 2; // The filler for the weight + optional FillerParameter bias_filler = 3; // The filler for the bias + + // Whether to enable displaying debug_info in the unrolled recurrent net. + optional bool debug_info = 4 [default = false]; +} + // Message that stores parameters used by ReLULayer message ReLUParameter { // Allow non-zero slope for negative inputs to speed up optimization From 26a2847e6580425224aa9afd5af5fd66a10413ac Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Sun, 15 Feb 2015 14:56:50 -0800 Subject: [PATCH 81/86] Add RNNLayer, with tests --- include/caffe/sequence_layers.hpp | 25 ++++ src/caffe/layers/rnn_layer.cpp | 217 ++++++++++++++++++++++++++++++ src/caffe/test/test_rnn_layer.cpp | 196 +++++++++++++++++++++++++++ 3 files changed, 438 insertions(+) create mode 100644 src/caffe/layers/rnn_layer.cpp create mode 100644 src/caffe/test/test_rnn_layer.cpp diff --git a/include/caffe/sequence_layers.hpp b/include/caffe/sequence_layers.hpp index d064136bf15..708f6380d93 100644 --- a/include/caffe/sequence_layers.hpp +++ b/include/caffe/sequence_layers.hpp @@ -149,6 +149,31 @@ class RecurrentLayer : public Layer { Blob* cont_input_blob_; }; +/** + * @brief Processes time-varying inputs using a simple recurrent neural network + * (RNN). Implemented as a network unrolling the RNN computation in time. + * + * Given time-varying inputs @f$ x_t @f$, computes hidden state @f$ + * h_t := \tanh[ W_{hh} h_{t_1} + W_{xh} x_t + b_h ] + * @f$, and outputs @f$ + * o_t := \tanh[ W_{ho} h_t + b_o ] + * @f$. + */ +template +class RNNLayer : public RecurrentLayer { + public: + explicit RNNLayer(const LayerParameter& param) + : RecurrentLayer(param) {} + + virtual inline const char* type() const { return "RNN"; } + + protected: + virtual void FillUnrolledNet(NetParameter* net_param) const; + virtual void RecurrentInputBlobNames(vector* names) const; + virtual void RecurrentOutputBlobNames(vector* names) const; + virtual void OutputBlobNames(vector* names) const; +}; + } // namespace caffe #endif // CAFFE_SEQUENCE_LAYERS_HPP_ diff --git a/src/caffe/layers/rnn_layer.cpp b/src/caffe/layers/rnn_layer.cpp new file mode 100644 index 00000000000..a2a22f62819 --- /dev/null +++ b/src/caffe/layers/rnn_layer.cpp @@ -0,0 +1,217 @@ +#include +#include + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/layer.hpp" +#include "caffe/sequence_layers.hpp" +#include "caffe/util/math_functions.hpp" + +namespace caffe { + +template +void RNNLayer::RecurrentInputBlobNames(vector* names) const { + names->resize(1); + (*names)[0] = "h_0"; +} + +template +void RNNLayer::RecurrentOutputBlobNames(vector* names) const { + names->resize(1); + (*names)[0] = "h_" + this->int_to_str(this->T_); +} + +template +void RNNLayer::OutputBlobNames(vector* names) const { + names->resize(1); + (*names)[0] = "o"; +} + +template +void RNNLayer::FillUnrolledNet(NetParameter* net_param) const { + const int num_output = this->layer_param_.recurrent_param().num_output(); + CHECK_GT(num_output, 0) << "num_output must be positive"; + const FillerParameter& weight_filler = + this->layer_param_.recurrent_param().weight_filler(); + const FillerParameter& bias_filler = + this->layer_param_.recurrent_param().bias_filler(); + + // Add generic LayerParameter's (without bottoms/tops) of layer types we'll + // use to save redundant code. 
+ LayerParameter hidden_param; + hidden_param.set_type("InnerProduct"); + hidden_param.mutable_inner_product_param()->set_num_output(num_output); + hidden_param.mutable_inner_product_param()->set_bias_term(false); + hidden_param.mutable_inner_product_param()->set_axis(2); + hidden_param.mutable_inner_product_param()-> + mutable_weight_filler()->CopyFrom(weight_filler); + + LayerParameter biased_hidden_param(hidden_param); + biased_hidden_param.mutable_inner_product_param()->set_bias_term(true); + biased_hidden_param.mutable_inner_product_param()-> + mutable_bias_filler()->CopyFrom(bias_filler); + + LayerParameter sum_param; + sum_param.set_type("Eltwise"); + sum_param.mutable_eltwise_param()->set_operation( + EltwiseParameter_EltwiseOp_SUM); + + LayerParameter tanh_param; + tanh_param.set_type("TanH"); + + LayerParameter slice_param; + slice_param.set_type("Slice"); + slice_param.mutable_slice_param()->set_axis(0); + + BlobShape input_shape; + input_shape.add_dim(1); // h_0 is a single timestep + input_shape.add_dim(this->N_); + input_shape.add_dim(num_output); + net_param->add_input("h_0"); + net_param->add_input_shape()->CopyFrom(input_shape); + + LayerParameter* cont_slice_param = net_param->add_layer(); + cont_slice_param->CopyFrom(slice_param); + cont_slice_param->set_name("cont_slice"); + cont_slice_param->add_bottom("cont"); + cont_slice_param->mutable_slice_param()->set_axis(1); + + // Add layer to transform all timesteps of x to the hidden state dimension. + // W_xh_x = W_xh * x + b_h + { + LayerParameter* x_transform_param = net_param->add_layer(); + x_transform_param->CopyFrom(biased_hidden_param); + x_transform_param->set_name("x_transform"); + x_transform_param->add_param()->set_name("W_xh"); + x_transform_param->add_param()->set_name("b_h"); + x_transform_param->add_bottom("x"); + x_transform_param->add_top("W_xh_x"); + } + + if (this->static_input_) { + // Add layer to transform x_static to the hidden state dimension. + // W_xh_x_static = W_xh_static * x_static + LayerParameter* x_static_transform_param = net_param->add_layer(); + x_static_transform_param->CopyFrom(hidden_param); + x_static_transform_param->mutable_inner_product_param()->set_axis(1); + x_static_transform_param->set_name("W_xh_x_static"); + x_static_transform_param->add_param()->set_name("W_xh_static"); + x_static_transform_param->add_bottom("x_static"); + x_static_transform_param->add_top("W_xh_x_static"); + + LayerParameter* reshape_param = net_param->add_layer(); + reshape_param->set_type("Reshape"); + BlobShape* new_shape = + reshape_param->mutable_reshape_param()->mutable_shape(); + new_shape->add_dim(1); // One timestep. 
+ new_shape->add_dim(this->N_); + new_shape->add_dim( + x_static_transform_param->inner_product_param().num_output()); + reshape_param->set_name("W_xh_x_static_reshape"); + reshape_param->add_bottom("W_xh_x_static"); + reshape_param->add_top("W_xh_x_static"); + } + + LayerParameter* x_slice_param = net_param->add_layer(); + x_slice_param->CopyFrom(slice_param); + x_slice_param->set_name("W_xh_x_slice"); + x_slice_param->add_bottom("W_xh_x"); + + LayerParameter output_concat_layer; + output_concat_layer.set_name("o_concat"); + output_concat_layer.set_type("Concat"); + output_concat_layer.add_top("o"); + output_concat_layer.mutable_concat_param()->set_axis(0); + + for (int t = 1; t <= this->T_; ++t) { + string tm1s = this->int_to_str(t - 1); + string ts = this->int_to_str(t); + + cont_slice_param->add_top("cont_" + ts); + x_slice_param->add_top("W_xh_x_" + ts); + + // Add layer to flush the hidden state when beginning a new sequence, + // as indicated by cont_t. + // h_conted_{t-1} := cont_t * h_{t-1} + // + // Normally, cont_t is binary (i.e., 0 or 1), so: + // h_conted_{t-1} := h_{t-1} if cont_t == 1 + // 0 otherwise + { + LayerParameter* cont_h_param = net_param->add_layer(); + cont_h_param->CopyFrom(sum_param); + cont_h_param->mutable_eltwise_param()->set_coeff_blob(true); + cont_h_param->set_name("h_conted_" + tm1s); + cont_h_param->add_bottom("h_" + tm1s); + cont_h_param->add_bottom("cont_" + ts); + cont_h_param->add_top("h_conted_" + tm1s); + } + + // Add layer to compute + // W_hh_h_{t-1} := W_hh * h_conted_{t-1} + { + LayerParameter* w_param = net_param->add_layer(); + w_param->CopyFrom(hidden_param); + w_param->set_name("W_hh_h_" + tm1s); + w_param->add_param()->set_name("W_hh"); + w_param->add_bottom("h_conted_" + tm1s); + w_param->add_top("W_hh_h_" + tm1s); + w_param->mutable_inner_product_param()->set_axis(2); + } + + // Add layers to compute + // h_t := \tanh( W_hh * h_conted_{t-1} + W_xh * x_t + b_h ) + // = \tanh( W_hh_h_{t-1} + W_xh_t ) + { + LayerParameter* h_input_sum_param = net_param->add_layer(); + h_input_sum_param->CopyFrom(sum_param); + h_input_sum_param->set_name("h_input_sum_" + ts); + h_input_sum_param->add_bottom("W_hh_h_" + tm1s); + h_input_sum_param->add_bottom("W_xh_x_" + ts); + if (this->static_input_) { + h_input_sum_param->add_bottom("W_xh_x_static"); + } + h_input_sum_param->add_top("h_neuron_input_" + ts); + } + { + LayerParameter* h_neuron_param = net_param->add_layer(); + h_neuron_param->CopyFrom(tanh_param); + h_neuron_param->set_name("h_neuron_" + ts); + h_neuron_param->add_bottom("h_neuron_input_" + ts); + h_neuron_param->add_top("h_" + ts); + } + + // Add layer to compute + // W_ho_h_t := W_ho * h_t + b_o + { + LayerParameter* w_param = net_param->add_layer(); + w_param->CopyFrom(biased_hidden_param); + w_param->set_name("W_ho_h_" + ts); + w_param->add_param()->set_name("W_ho"); + w_param->add_param()->set_name("b_o"); + w_param->add_bottom("h_" + ts); + w_param->add_top("W_ho_h_" + ts); + w_param->mutable_inner_product_param()->set_axis(2); + } + + // Add layers to compute + // o_t := \tanh( W_ho h_t + b_o) + // = \tanh( W_ho_h_t ) + { + LayerParameter* o_neuron_param = net_param->add_layer(); + o_neuron_param->CopyFrom(tanh_param); + o_neuron_param->set_name("o_neuron_" + ts); + o_neuron_param->add_bottom("W_ho_h_" + ts); + o_neuron_param->add_top("o_" + ts); + } + output_concat_layer.add_bottom("o_" + ts); + } // for (int t = 1; t <= this->T_; ++t) + + net_param->add_layer()->CopyFrom(output_concat_layer); +} + +INSTANTIATE_CLASS(RNNLayer); 
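The net that FillUnrolledNet assembles above computes, per timestep and per element, h_t = tanh(W_hh * h_conted_{t-1} + W_xh * x_t + b_h) followed by o_t = tanh(W_ho * h_t + b_o). A scalar-sized sketch of one step, with every weight treated as 1x1 (plain C++, purely illustrative):

    #include <cmath>

    // Scalar sketch of one unrolled RNN timestep; h_conted, h and o mirror the
    // blob names wired up in RNNLayer::FillUnrolledNet.
    float RnnStepSketch(float h_prev, float x_t, float cont_t,
                        float W_hh, float W_xh, float b_h,
                        float W_ho, float b_o, float* h_t) {
      const float h_conted = cont_t * h_prev;          // flushed at sequence starts
      const float h_input = W_hh * h_conted + W_xh * x_t + b_h;
      *h_t = std::tanh(h_input);                       // hidden state h_t
      return std::tanh(W_ho * (*h_t) + b_o);           // output o_t
    }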
+REGISTER_LAYER_CLASS(RNN); + +} // namespace caffe diff --git a/src/caffe/test/test_rnn_layer.cpp b/src/caffe/test/test_rnn_layer.cpp new file mode 100644 index 00000000000..eab9269ce77 --- /dev/null +++ b/src/caffe/test/test_rnn_layer.cpp @@ -0,0 +1,196 @@ +#include +#include + +#include "gtest/gtest.h" + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/sequence_layers.hpp" + +#include "caffe/test/test_caffe_main.hpp" +#include "caffe/test/test_gradient_check_util.hpp" + +namespace caffe { + +template +class RNNLayerTest : public MultiDeviceTest { + typedef typename TypeParam::Dtype Dtype; + + protected: + RNNLayerTest() : num_output_(7) { + blob_bottom_vec_.push_back(&blob_bottom_); + blob_bottom_vec_.push_back(&blob_bottom_flush_); + blob_top_vec_.push_back(&blob_top_); + + ReshapeBlobs(1, 3); + + layer_param_.mutable_recurrent_param()->set_num_output(num_output_); + FillerParameter* weight_filler = + layer_param_.mutable_recurrent_param()->mutable_weight_filler(); + weight_filler->set_type("gaussian"); + weight_filler->set_std(0.2); + FillerParameter* bias_filler = + layer_param_.mutable_recurrent_param()->mutable_bias_filler(); + bias_filler->set_type("gaussian"); + bias_filler->set_std(0.1); + + layer_param_.set_phase(TEST); + } + + void ReshapeBlobs(int num_timesteps, int num_instances) { + blob_bottom_.Reshape(num_timesteps, num_instances, 3, 2); + vector shape(2); + shape[0] = num_timesteps; + shape[1] = num_instances; + blob_bottom_flush_.Reshape(shape); + + FillerParameter filler_param; + filler_param.set_min(-1); + filler_param.set_max(1); + UniformFiller filler(filler_param); + filler.Fill(&blob_bottom_); + } + + int num_output_; + LayerParameter layer_param_; + Blob blob_bottom_; + Blob blob_bottom_flush_; + Blob blob_top_; + vector*> blob_bottom_vec_; + vector*> blob_top_vec_; +}; + +TYPED_TEST_CASE(RNNLayerTest, TestDtypesAndDevices); + +TYPED_TEST(RNNLayerTest, TestSetUp) { + typedef typename TypeParam::Dtype Dtype; + RNNLayer layer(this->layer_param_); + layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + vector expected_top_shape = this->blob_bottom_.shape(); + expected_top_shape.resize(3); + expected_top_shape[2] = this->num_output_; + EXPECT_TRUE(this->blob_top_.shape() == expected_top_shape); +} + +TYPED_TEST(RNNLayerTest, TestForward) { + typedef typename TypeParam::Dtype Dtype; + const int kNumTimesteps = 3; + const int num = this->blob_bottom_.shape(1); + this->ReshapeBlobs(kNumTimesteps, num); + + // Fill the flush blob with <0, 1, 1, ..., 1>, + // indicating a sequence that begins at the first timestep + // then continues for the rest of the sequence. + for (int t = 0; t < kNumTimesteps; ++t) { + for (int n = 0; n < num; ++n) { + this->blob_bottom_flush_.mutable_cpu_data()[t * num + n] = t > 0; + } + } + + // Process the full sequence in a single batch. + FillerParameter filler_param; + filler_param.set_mean(0); + filler_param.set_std(1); + GaussianFiller sequence_filler(filler_param); + sequence_filler.Fill(&this->blob_bottom_); + shared_ptr > layer(new RNNLayer(this->layer_param_)); + Caffe::set_random_seed(1701); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + LOG(INFO) << "Calling forward for full sequence RNN"; + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + + // Copy the inputs and outputs to reuse/check them later. 
+ Blob bottom_copy(this->blob_bottom_.shape()); + bottom_copy.CopyFrom(this->blob_bottom_); + Blob top_copy(this->blob_top_.shape()); + top_copy.CopyFrom(this->blob_top_); + + // Process the batch one timestep at a time; + // check that we get the same result. + this->ReshapeBlobs(1, num); + layer.reset(new RNNLayer(this->layer_param_)); + Caffe::set_random_seed(1701); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + const int bottom_count = this->blob_bottom_.count(); + const int top_count = this->blob_top_.count(); + const Dtype kEpsilon = 1e-5; + for (int t = 0; t < kNumTimesteps; ++t) { + caffe_copy(bottom_count, bottom_copy.cpu_data() + t * bottom_count, + this->blob_bottom_.mutable_cpu_data()); + for (int n = 0; n < num; ++n) { + this->blob_bottom_flush_.mutable_cpu_data()[n] = t > 0; + } + LOG(INFO) << "Calling forward for RNN timestep " << t; + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + for (int i = 0; i < top_count; ++i) { + ASSERT_LT(t * top_count + i, top_copy.count()); + EXPECT_NEAR(this->blob_top_.cpu_data()[i], + top_copy.cpu_data()[t * top_count + i], kEpsilon) + << "t = " << t << "; i = " << i; + } + } + + // Process the batch one timestep at a time with all flush blobs set to 0. + // Check that we get a different result, except in the first timestep. + Caffe::set_random_seed(1701); + layer.reset(new RNNLayer(this->layer_param_)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + for (int t = 0; t < kNumTimesteps; ++t) { + caffe_copy(bottom_count, bottom_copy.cpu_data() + t * bottom_count, + this->blob_bottom_.mutable_cpu_data()); + for (int n = 0; n < num; ++n) { + this->blob_bottom_flush_.mutable_cpu_data()[n] = 0; + } + LOG(INFO) << "Calling forward for RNN timestep " << t; + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + for (int i = 0; i < top_count; ++i) { + if (t == 0) { + EXPECT_NEAR(this->blob_top_.cpu_data()[i], + top_copy.cpu_data()[t * top_count + i], kEpsilon) + << "t = " << t << "; i = " << i; + } else { + EXPECT_NE(this->blob_top_.cpu_data()[i], + top_copy.cpu_data()[t * top_count + i]) + << "t = " << t << "; i = " << i; + } + } + } +} + +TYPED_TEST(RNNLayerTest, TestGradient) { + typedef typename TypeParam::Dtype Dtype; + RNNLayer layer(this->layer_param_); + GradientChecker checker(1e-2, 1e-3); + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 0); +} + +TYPED_TEST(RNNLayerTest, TestGradientNonZeroFlush) { + typedef typename TypeParam::Dtype Dtype; + RNNLayer layer(this->layer_param_); + GradientChecker checker(1e-2, 1e-3); + for (int i = 0; i < this->blob_bottom_flush_.count(); ++i) { + this->blob_bottom_flush_.mutable_cpu_data()[i] = i > 2; + } + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 0); +} + +TYPED_TEST(RNNLayerTest, TestGradientNonZeroFlushBufferSize2) { + typedef typename TypeParam::Dtype Dtype; + this->ReshapeBlobs(2, 2); + // fill the values + FillerParameter filler_param; + UniformFiller filler(filler_param); + filler.Fill(&this->blob_bottom_); + RNNLayer layer(this->layer_param_); + GradientChecker checker(1e-2, 1e-3); + for (int i = 0; i < this->blob_bottom_flush_.count(); ++i) { + this->blob_bottom_flush_.mutable_cpu_data()[i] = i > 2; + } + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 0); +} + +} // namespace caffe From 9f830fa1ec6dfa79e66f1846ad872235ee7e9cda Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Sun, 15 Feb 2015 14:57:22 -0800 Subject: [PATCH 
82/86] Add LSTMLayer and LSTMUnitLayer, with tests --- include/caffe/sequence_layers.hpp | 133 ++++++++++++++ src/caffe/layers/lstm_layer.cpp | 221 ++++++++++++++++++++++ src/caffe/layers/lstm_unit_layer.cpp | 128 +++++++++++++ src/caffe/layers/lstm_unit_layer.cu | 154 ++++++++++++++++ src/caffe/test/test_lstm_layer.cpp | 265 +++++++++++++++++++++++++++ 5 files changed, 901 insertions(+) create mode 100644 src/caffe/layers/lstm_layer.cpp create mode 100644 src/caffe/layers/lstm_unit_layer.cpp create mode 100644 src/caffe/layers/lstm_unit_layer.cu create mode 100644 src/caffe/test/test_lstm_layer.cpp diff --git a/include/caffe/sequence_layers.hpp b/include/caffe/sequence_layers.hpp index 708f6380d93..8ac735435a4 100644 --- a/include/caffe/sequence_layers.hpp +++ b/include/caffe/sequence_layers.hpp @@ -149,6 +149,139 @@ class RecurrentLayer : public Layer { Blob* cont_input_blob_; }; +/** + * @brief Processes sequential inputs using a "Long Short-Term Memory" (LSTM) + * [1] style recurrent neural network (RNN). Implemented as a network + * unrolled the LSTM computation in time. + * + * + * The specific architecture used in this implementation is as described in + * "Learning to Execute" [2], reproduced below: + * i_t := \sigmoid[ W_{hi} * h_{t-1} + W_{xi} * x_t + b_i ] + * f_t := \sigmoid[ W_{hf} * h_{t-1} + W_{xf} * x_t + b_f ] + * o_t := \sigmoid[ W_{ho} * h_{t-1} + W_{xo} * x_t + b_o ] + * g_t := \tanh[ W_{hg} * h_{t-1} + W_{xg} * x_t + b_g ] + * c_t := (f_t .* c_{t-1}) + (i_t .* g_t) + * h_t := o_t .* \tanh[c_t] + * In the implementation, the i, f, o, and g computations are performed as a + * single inner product. + * + * Notably, this implementation lacks the "diagonal" gates, as used in the + * LSTM architectures described by Alex Graves [3] and others. + * + * [1] Hochreiter, Sepp, and Schmidhuber, Jürgen. "Long short-term memory." + * Neural Computation 9, no. 8 (1997): 1735-1780. + * + * [2] Zaremba, Wojciech, and Sutskever, Ilya. "Learning to execute." + * arXiv preprint arXiv:1410.4615 (2014). + * + * [3] Graves, Alex. "Generating sequences with recurrent neural networks." + * arXiv preprint arXiv:1308.0850 (2013). + */ +template +class LSTMLayer : public RecurrentLayer { + public: + explicit LSTMLayer(const LayerParameter& param) + : RecurrentLayer(param) {} + + virtual inline const char* type() const { return "LSTM"; } + + protected: + virtual void FillUnrolledNet(NetParameter* net_param) const; + virtual void RecurrentInputBlobNames(vector* names) const; + virtual void RecurrentOutputBlobNames(vector* names) const; + virtual void OutputBlobNames(vector* names) const; +}; + +/** + * @brief A helper for LSTMLayer: computes a single timestep of the + * non-linearity of the LSTM, producing the updated cell and hidden + * states. + */ +template +class LSTMUnitLayer : public Layer { + public: + explicit LSTMUnitLayer(const LayerParameter& param) + : Layer(param) {} + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { return "LSTMUnit"; } + virtual inline int ExactNumBottomBlobs() const { return 3; } + virtual inline int ExactNumTopBlobs() const { return 2; } + + virtual inline bool AllowForceBackward(const int bottom_index) const { + // Can't propagate to sequence continuation indicators. 
+ return bottom_index != 2; + } + + protected: + /** + * @param bottom input Blob vector (length 3) + * -# @f$ (1 \times N \times D) @f$ + * the previous timestep cell state @f$ c_{t-1} @f$ + * -# @f$ (1 \times N \times 4D) @f$ + * the "gate inputs" @f$ [i_t', f_t', o_t', g_t'] @f$ + * -# @f$ (1 \times 1 \times N) @f$ + * the sequence continuation indicators @f$ \delta_t @f$ + * @param top output Blob vector (length 2) + * -# @f$ (1 \times N \times D) @f$ + * the updated cell state @f$ c_t @f$, computed as: + * i_t := \sigmoid[i_t'] + * f_t := \sigmoid[f_t'] + * o_t := \sigmoid[o_t'] + * g_t := \tanh[g_t'] + * c_t := cont_t * (f_t .* c_{t-1}) + (i_t .* g_t) + * -# @f$ (1 \times N \times D) @f$ + * the updated hidden state @f$ h_t @f$, computed as: + * h_t := o_t .* \tanh[c_t] + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. the LSTMUnit inputs. + * + * @param top output Blob vector (length 2), providing the error gradient with + * respect to the outputs + * -# @f$ (1 \times N \times D) @f$: + * containing error gradients @f$ \frac{\partial E}{\partial c_t} @f$ + * with respect to the updated cell state @f$ c_t @f$ + * -# @f$ (1 \times N \times D) @f$: + * containing error gradients @f$ \frac{\partial E}{\partial h_t} @f$ + * with respect to the updated cell state @f$ h_t @f$ + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length 3), into which the error gradients + * with respect to the LSTMUnit inputs @f$ c_{t-1} @f$ and the gate + * inputs are computed. Computatation of the error gradients w.r.t. + * the sequence indicators is not implemented. + * -# @f$ (1 \times N \times D) @f$ + * the error gradient w.r.t. the previous timestep cell state + * @f$ c_{t-1} @f$ + * -# @f$ (1 \times N \times 4D) @f$ + * the error gradient w.r.t. the "gate inputs" + * @f$ [ + * \frac{\partial E}{\partial i_t} + * \frac{\partial E}{\partial f_t} + * \frac{\partial E}{\partial o_t} + * \frac{\partial E}{\partial g_t} + * ] @f$ + * -# @f$ (1 \times 1 \times N) @f$ + * the gradient w.r.t. the sequence continuation indicators + * @f$ \delta_t @f$ is currently not computed. + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + /// @brief The hidden and output dimension. + int hidden_dim_; + Blob X_acts_; +}; + /** * @brief Processes time-varying inputs using a simple recurrent neural network * (RNN). Implemented as a network unrolling the RNN computation in time. 
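Before the layer implementations, a scalar sketch of the LSTMUnit non-linearity documented above may help: given the pre-activation gates (i', f', o', g'), the previous cell state, and the continuation indicator, one element is updated as follows (plain C++, illustrative only; the real layer applies this across N x D elements per timestep):

    #include <cmath>

    // Illustrative single-element LSTM unit step, mirroring the equations in the
    // LSTMLayer/LSTMUnitLayer documentation: the cell state is flushed when the
    // continuation indicator cont is 0 (a sequence start).
    inline float SigmoidSketch(float x) { return 1.0f / (1.0f + std::exp(-x)); }

    void LstmUnitStepSketch(float i_pre, float f_pre, float o_pre, float g_pre,
                            float c_prev, float cont, float* c_t, float* h_t) {
      const float i = SigmoidSketch(i_pre);         // input gate
      const float f = SigmoidSketch(f_pre);         // forget gate
      const float o = SigmoidSketch(o_pre);         // output gate
      const float g = std::tanh(g_pre);             // candidate values
      *c_t = cont * (f * c_prev) + i * g;           // updated cell state
      *h_t = o * std::tanh(*c_t);                   // updated hidden state
    }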
diff --git a/src/caffe/layers/lstm_layer.cpp b/src/caffe/layers/lstm_layer.cpp new file mode 100644 index 00000000000..91543f73f71 --- /dev/null +++ b/src/caffe/layers/lstm_layer.cpp @@ -0,0 +1,221 @@ +#include +#include + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/layer.hpp" +#include "caffe/sequence_layers.hpp" +#include "caffe/util/math_functions.hpp" + +namespace caffe { + +template +void LSTMLayer::RecurrentInputBlobNames(vector* names) const { + names->resize(2); + (*names)[0] = "h_0"; + (*names)[1] = "c_0"; +} + +template +void LSTMLayer::RecurrentOutputBlobNames(vector* names) const { + names->resize(2); + (*names)[0] = "h_" + this->int_to_str(this->T_); + (*names)[1] = "c_T"; +} + +template +void LSTMLayer::OutputBlobNames(vector* names) const { + names->resize(1); + (*names)[0] = "h"; +} + +template +void LSTMLayer::FillUnrolledNet(NetParameter* net_param) const { + const int num_output = this->layer_param_.recurrent_param().num_output(); + CHECK_GT(num_output, 0) << "num_output must be positive"; + const FillerParameter& weight_filler = + this->layer_param_.recurrent_param().weight_filler(); + const FillerParameter& bias_filler = + this->layer_param_.recurrent_param().bias_filler(); + + // Add generic LayerParameter's (without bottoms/tops) of layer types we'll + // use to save redundant code. + LayerParameter hidden_param; + hidden_param.set_type("InnerProduct"); + hidden_param.mutable_inner_product_param()->set_num_output(num_output * 4); + hidden_param.mutable_inner_product_param()->set_bias_term(false); + hidden_param.mutable_inner_product_param()->set_axis(2); + hidden_param.mutable_inner_product_param()-> + mutable_weight_filler()->CopyFrom(weight_filler); + + LayerParameter biased_hidden_param(hidden_param); + biased_hidden_param.mutable_inner_product_param()->set_bias_term(true); + biased_hidden_param.mutable_inner_product_param()-> + mutable_bias_filler()->CopyFrom(bias_filler); + + LayerParameter sum_param; + sum_param.set_type("Eltwise"); + sum_param.mutable_eltwise_param()->set_operation( + EltwiseParameter_EltwiseOp_SUM); + + LayerParameter slice_param; + slice_param.set_type("Slice"); + slice_param.mutable_slice_param()->set_axis(0); + + LayerParameter split_param; + split_param.set_type("Split"); + + BlobShape input_shape; + input_shape.add_dim(1); // c_0 and h_0 are a single timestep + input_shape.add_dim(this->N_); + input_shape.add_dim(num_output); + + net_param->add_input("c_0"); + net_param->add_input_shape()->CopyFrom(input_shape); + + net_param->add_input("h_0"); + net_param->add_input_shape()->CopyFrom(input_shape); + + LayerParameter* cont_slice_param = net_param->add_layer(); + cont_slice_param->CopyFrom(slice_param); + cont_slice_param->set_name("cont_slice"); + cont_slice_param->add_bottom("cont"); + cont_slice_param->mutable_slice_param()->set_axis(1); + + // Add layer to transform all timesteps of x to the hidden state dimension. + // W_xc_x = W_xc * x + b_c + { + LayerParameter* x_transform_param = net_param->add_layer(); + x_transform_param->CopyFrom(biased_hidden_param); + x_transform_param->set_name("x_transform"); + x_transform_param->add_param()->set_name("W_xc"); + x_transform_param->add_param()->set_name("b_c"); + x_transform_param->add_bottom("x"); + x_transform_param->add_top("W_xc_x"); + } + + if (this->static_input_) { + // Add layer to transform x_static to the gate dimension. 
+ // W_xc_x_static = W_xc_static * x_static + LayerParameter* x_static_transform_param = net_param->add_layer(); + x_static_transform_param->CopyFrom(hidden_param); + x_static_transform_param->mutable_inner_product_param()->set_axis(1); + x_static_transform_param->set_name("W_xc_x_static"); + x_static_transform_param->add_param()->set_name("W_xc_static"); + x_static_transform_param->add_bottom("x_static"); + x_static_transform_param->add_top("W_xc_x_static"); + + LayerParameter* reshape_param = net_param->add_layer(); + reshape_param->set_type("Reshape"); + BlobShape* new_shape = + reshape_param->mutable_reshape_param()->mutable_shape(); + new_shape->add_dim(1); // One timestep. + new_shape->add_dim(this->N_); + new_shape->add_dim( + x_static_transform_param->inner_product_param().num_output()); + reshape_param->add_bottom("W_xc_x_static"); + reshape_param->add_top("W_xc_x_static"); + } + + LayerParameter* x_slice_param = net_param->add_layer(); + x_slice_param->CopyFrom(slice_param); + x_slice_param->add_bottom("W_xc_x"); + x_slice_param->set_name("W_xc_x_slice"); + + LayerParameter output_concat_layer; + output_concat_layer.set_name("h_concat"); + output_concat_layer.set_type("Concat"); + output_concat_layer.add_top("h"); + output_concat_layer.mutable_concat_param()->set_axis(0); + + for (int t = 1; t <= this->T_; ++t) { + string tm1s = this->int_to_str(t - 1); + string ts = this->int_to_str(t); + + cont_slice_param->add_top("cont_" + ts); + x_slice_param->add_top("W_xc_x_" + ts); + + // Add layers to flush the hidden state when beginning a new + // sequence, as indicated by cont_t. + // h_conted_{t-1} := cont_t * h_{t-1} + // + // Normally, cont_t is binary (i.e., 0 or 1), so: + // h_conted_{t-1} := h_{t-1} if cont_t == 1 + // 0 otherwise + { + LayerParameter* cont_h_param = net_param->add_layer(); + cont_h_param->CopyFrom(sum_param); + cont_h_param->mutable_eltwise_param()->set_coeff_blob(true); + cont_h_param->set_name("h_conted_" + tm1s); + cont_h_param->add_bottom("h_" + tm1s); + cont_h_param->add_bottom("cont_" + ts); + cont_h_param->add_top("h_conted_" + tm1s); + } + + // Add layer to compute + // W_hc_h_{t-1} := W_hc * h_conted_{t-1} + { + LayerParameter* w_param = net_param->add_layer(); + w_param->CopyFrom(hidden_param); + w_param->set_name("transform_" + ts); + w_param->add_param()->set_name("W_hc"); + w_param->add_bottom("h_conted_" + tm1s); + w_param->add_top("W_hc_h_" + tm1s); + w_param->mutable_inner_product_param()->set_axis(2); + } + + // Add the outputs of the linear transformations to compute the gate input. + // gate_input_t := W_hc * h_conted_{t-1} + W_xc * x_t + b_c + // = W_hc_h_{t-1} + W_xc_x_t + b_c + { + LayerParameter* input_sum_layer = net_param->add_layer(); + input_sum_layer->CopyFrom(sum_param); + input_sum_layer->set_name("gate_input_" + ts); + input_sum_layer->add_bottom("W_hc_h_" + tm1s); + input_sum_layer->add_bottom("W_xc_x_" + ts); + if (this->static_input_) { + input_sum_layer->add_bottom("W_xc_x_static"); + } + input_sum_layer->add_top("gate_input_" + ts); + } + + // Add LSTMUnit layer to compute the cell & hidden vectors c_t and h_t. 
+ // Inputs: c_{t-1}, gate_input_t = (i_t, f_t, o_t, g_t), cont_t + // Outputs: c_t, h_t + // [ i_t' ] + // [ f_t' ] := gate_input_t + // [ o_t' ] + // [ g_t' ] + // i_t := \sigmoid[i_t'] + // f_t := \sigmoid[f_t'] + // o_t := \sigmoid[o_t'] + // g_t := \tanh[g_t'] + // c_t := cont_t * (f_t .* c_{t-1}) + (i_t .* g_t) + // h_t := o_t .* \tanh[c_t] + { + LayerParameter* lstm_unit_param = net_param->add_layer(); + lstm_unit_param->set_type("LSTMUnit"); + lstm_unit_param->add_bottom("c_" + tm1s); + lstm_unit_param->add_bottom("gate_input_" + ts); + lstm_unit_param->add_bottom("cont_" + ts); + lstm_unit_param->add_top("c_" + ts); + lstm_unit_param->add_top("h_" + ts); + lstm_unit_param->set_name("unit_" + ts); + } + output_concat_layer.add_bottom("h_" + ts); + } // for (int t = 1; t <= this->T_; ++t) + + { + LayerParameter* c_T_copy_param = net_param->add_layer(); + c_T_copy_param->CopyFrom(split_param); + c_T_copy_param->add_bottom("c_" + this->int_to_str(this->T_)); + c_T_copy_param->add_top("c_T"); + } + net_param->add_layer()->CopyFrom(output_concat_layer); +} + +INSTANTIATE_CLASS(LSTMLayer); +REGISTER_LAYER_CLASS(LSTM); + +} // namespace caffe diff --git a/src/caffe/layers/lstm_unit_layer.cpp b/src/caffe/layers/lstm_unit_layer.cpp new file mode 100644 index 00000000000..74078d264f5 --- /dev/null +++ b/src/caffe/layers/lstm_unit_layer.cpp @@ -0,0 +1,128 @@ +#include +#include +#include + +#include "caffe/layer.hpp" +#include "caffe/sequence_layers.hpp" + +namespace caffe { + +template +inline Dtype sigmoid(Dtype x) { + return 1. / (1. + exp(-x)); +} + +template +inline Dtype tanh(Dtype x) { + return 2. * sigmoid(2. * x) - 1.; +} + +template +void LSTMUnitLayer::Reshape(const vector*>& bottom, + const vector*>& top) { + for (int i = 0; i < bottom.size(); ++i) { + CHECK_EQ(3, bottom[i]->num_axes()); + CHECK_EQ(1, bottom[i]->shape(0)); + } + const int num_instances = bottom[0]->shape(1); + hidden_dim_ = bottom[0]->shape(2); + CHECK_EQ(num_instances, bottom[1]->shape(1)); + CHECK_EQ(4 * hidden_dim_, bottom[1]->shape(2)); + CHECK_EQ(1, bottom[2]->shape(1)); + CHECK_EQ(num_instances, bottom[2]->shape(2)); + top[0]->ReshapeLike(*bottom[0]); + top[1]->ReshapeLike(*bottom[0]); + X_acts_.ReshapeLike(*bottom[1]); +} + +template +void LSTMUnitLayer::Forward_cpu(const vector*>& bottom, + const vector*>& top) { + const int num = bottom[0]->shape(1); + const int x_dim = hidden_dim_ * 4; + const Dtype* C_prev = bottom[0]->cpu_data(); + const Dtype* X = bottom[1]->cpu_data(); + const Dtype* flush = bottom[2]->cpu_data(); + Dtype* C = top[0]->mutable_cpu_data(); + Dtype* H = top[1]->mutable_cpu_data(); + for (int n = 0; n < num; ++n) { + for (int d = 0; d < hidden_dim_; ++d) { + const Dtype i = sigmoid(X[d]); + const Dtype f = (*flush == 0) ? 
0 : + (*flush * sigmoid(X[1 * hidden_dim_ + d])); + const Dtype o = sigmoid(X[2 * hidden_dim_ + d]); + const Dtype g = tanh(X[3 * hidden_dim_ + d]); + const Dtype c_prev = C_prev[d]; + const Dtype c = f * c_prev + i * g; + C[d] = c; + const Dtype tanh_c = tanh(c); + H[d] = o * tanh_c; + } + C_prev += hidden_dim_; + X += x_dim; + C += hidden_dim_; + H += hidden_dim_; + ++flush; + } +} + +template +void LSTMUnitLayer::Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + CHECK(!propagate_down[2]) << "Cannot backpropagate to sequence indicators."; + if (!propagate_down[0] && !propagate_down[1]) { return; } + + const int num = bottom[0]->shape(1); + const int x_dim = hidden_dim_ * 4; + const Dtype* C_prev = bottom[0]->cpu_data(); + const Dtype* X = bottom[1]->cpu_data(); + const Dtype* flush = bottom[2]->cpu_data(); + const Dtype* C = top[0]->cpu_data(); + const Dtype* H = top[1]->cpu_data(); + const Dtype* C_diff = top[0]->cpu_diff(); + const Dtype* H_diff = top[1]->cpu_diff(); + Dtype* C_prev_diff = bottom[0]->mutable_cpu_diff(); + Dtype* X_diff = bottom[1]->mutable_cpu_diff(); + for (int n = 0; n < num; ++n) { + for (int d = 0; d < hidden_dim_; ++d) { + const Dtype i = sigmoid(X[d]); + const Dtype f = (*flush == 0) ? 0 : + (*flush * sigmoid(X[1 * hidden_dim_ + d])); + const Dtype o = sigmoid(X[2 * hidden_dim_ + d]); + const Dtype g = tanh(X[3 * hidden_dim_ + d]); + const Dtype c_prev = C_prev[d]; + const Dtype c = C[d]; + const Dtype tanh_c = tanh(c); + Dtype* c_prev_diff = C_prev_diff + d; + Dtype* i_diff = X_diff + d; + Dtype* f_diff = X_diff + 1 * hidden_dim_ + d; + Dtype* o_diff = X_diff + 2 * hidden_dim_ + d; + Dtype* g_diff = X_diff + 3 * hidden_dim_ + d; + const Dtype c_term_diff = + C_diff[d] + H_diff[d] * o * (1 - tanh_c * tanh_c); + *c_prev_diff = c_term_diff * f; + *i_diff = c_term_diff * g * i * (1 - i); + *f_diff = c_term_diff * c_prev * f * (1 - f); + *o_diff = H_diff[d] * tanh_c * o * (1 - o); + *g_diff = c_term_diff * i * (1 - g * g); + } + C_prev += hidden_dim_; + X += x_dim; + C += hidden_dim_; + H += hidden_dim_; + C_diff += hidden_dim_; + H_diff += hidden_dim_; + X_diff += x_dim; + C_prev_diff += hidden_dim_; + ++flush; + } +} + +#ifdef CPU_ONLY +STUB_GPU(LSTMUnitLayer); +#endif + +INSTANTIATE_CLASS(LSTMUnitLayer); +REGISTER_LAYER_CLASS(LSTMUnit); + +} // namespace caffe diff --git a/src/caffe/layers/lstm_unit_layer.cu b/src/caffe/layers/lstm_unit_layer.cu new file mode 100644 index 00000000000..d6bf85071f5 --- /dev/null +++ b/src/caffe/layers/lstm_unit_layer.cu @@ -0,0 +1,154 @@ +#include +#include +#include + +#include "caffe/layer.hpp" +#include "caffe/sequence_layers.hpp" + +namespace caffe { + +template +__device__ Dtype sigmoid(const Dtype x) { + return Dtype(1) / (Dtype(1) + exp(-x)); +} + +template +__device__ Dtype tanh(const Dtype x) { + return Dtype(2) * sigmoid(Dtype(2) * x) - Dtype(1); +} + +template +__global__ void LSTMActsForward(const int nthreads, const int dim, + const Dtype* X, Dtype* X_acts) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int x_dim = 4 * dim; + const int d = index % x_dim; + if (d < 3 * dim) { + X_acts[index] = sigmoid(X[index]); + } else { + X_acts[index] = tanh(X[index]); + } + } +} + +template +__global__ void LSTMUnitForward(const int nthreads, const int dim, + const Dtype* C_prev, const Dtype* X, const Dtype* flush, + Dtype* C, Dtype* H) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int n = index / dim; + const int d = index % dim; + const Dtype* X_offset = X + 4 * dim * n; + const Dtype 
i = X_offset[d]; + const Dtype f = X_offset[1 * dim + d]; + const Dtype o = X_offset[2 * dim + d]; + const Dtype g = X_offset[3 * dim + d]; + const Dtype c_prev = C_prev[index]; + const Dtype c = flush[n] * f * c_prev + i * g; + C[index] = c; + const Dtype tanh_c = tanh(c); + H[index] = o * tanh_c; + } +} + +template +void LSTMUnitLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + const int count = top[1]->count(); + const Dtype* C_prev = bottom[0]->gpu_data(); + const Dtype* X = bottom[1]->gpu_data(); + const Dtype* flush = bottom[2]->gpu_data(); + Dtype* X_acts = X_acts_.mutable_gpu_data(); + Dtype* C = top[0]->mutable_gpu_data(); + Dtype* H = top[1]->mutable_gpu_data(); + const int X_count = bottom[1]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + LSTMActsForward<<>>( + X_count, hidden_dim_, X, X_acts); + CUDA_POST_KERNEL_CHECK; + // NOLINT_NEXT_LINE(whitespace/operators) + LSTMUnitForward<<>>( + count, hidden_dim_, C_prev, X_acts, flush, C, H); + CUDA_POST_KERNEL_CHECK; +} + +template +__global__ void LSTMUnitBackward(const int nthreads, const int dim, + const Dtype* C_prev, const Dtype* X, const Dtype* C, const Dtype* H, + const Dtype* flush, const Dtype* C_diff, const Dtype* H_diff, + Dtype* C_prev_diff, Dtype* X_diff) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int n = index / dim; + const int d = index % dim; + const Dtype* X_offset = X + 4 * dim * n; + const Dtype i = X_offset[d]; + const Dtype f = X_offset[1 * dim + d]; + const Dtype o = X_offset[2 * dim + d]; + const Dtype g = X_offset[3 * dim + d]; + const Dtype c_prev = C_prev[index]; + const Dtype c = C[index]; + const Dtype tanh_c = tanh(c); + Dtype* c_prev_diff = C_prev_diff + index; + Dtype* X_diff_offset = X_diff + 4 * dim * n; + Dtype* i_diff = X_diff_offset + d; + Dtype* f_diff = X_diff_offset + 1 * dim + d; + Dtype* o_diff = X_diff_offset + 2 * dim + d; + Dtype* g_diff = X_diff_offset + 3 * dim + d; + const Dtype c_term_diff = + C_diff[index] + H_diff[index] * o * (1 - tanh_c * tanh_c); + const Dtype flush_n = flush[n]; + *c_prev_diff = flush_n * c_term_diff * f; + *i_diff = c_term_diff * g; + *f_diff = flush_n * c_term_diff * c_prev; + *o_diff = H_diff[index] * tanh_c; + *g_diff = c_term_diff * i; + } +} + +template +__global__ void LSTMActsBackward(const int nthreads, const int dim, + const Dtype* X_acts, const Dtype* X_acts_diff, Dtype* X_diff) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int x_dim = 4 * dim; + const int d = index % x_dim; + const Dtype X_act = X_acts[index]; + if (d < 3 * dim) { + X_diff[index] = X_acts_diff[index] * X_act * (Dtype(1) - X_act); + } else { + X_diff[index] = X_acts_diff[index] * (Dtype(1) - X_act * X_act); + } + } +} + +template +void LSTMUnitLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + CHECK(!propagate_down[2]) << "Cannot backpropagate to sequence indicators."; + if (!propagate_down[0] && !propagate_down[1]) { return; } + + const int count = top[1]->count(); + const Dtype* C_prev = bottom[0]->gpu_data(); + const Dtype* X_acts = X_acts_.gpu_data(); + const Dtype* flush = bottom[2]->gpu_data(); + const Dtype* C = top[0]->gpu_data(); + const Dtype* H = top[1]->gpu_data(); + const Dtype* C_diff = top[0]->gpu_diff(); + const Dtype* H_diff = top[1]->gpu_diff(); + Dtype* C_prev_diff = bottom[0]->mutable_gpu_diff(); + Dtype* X_acts_diff = X_acts_.mutable_gpu_diff(); + LSTMUnitBackward // NOLINT_NEXT_LINE(whitespace/operators) + <<>>(count, hidden_dim_, + C_prev, X_acts, C, H, flush, C_diff, H_diff, 
C_prev_diff, X_acts_diff); + CUDA_POST_KERNEL_CHECK; + const int X_count = bottom[1]->count(); + Dtype* X_diff = bottom[1]->mutable_gpu_diff(); + LSTMActsBackward // NOLINT_NEXT_LINE(whitespace/operators) + <<>>( + X_count, hidden_dim_, X_acts, X_acts_diff, X_diff); + CUDA_POST_KERNEL_CHECK; +} + +INSTANTIATE_LAYER_GPU_FUNCS(LSTMUnitLayer); + +} // namespace caffe diff --git a/src/caffe/test/test_lstm_layer.cpp b/src/caffe/test/test_lstm_layer.cpp new file mode 100644 index 00000000000..a0ce45f6383 --- /dev/null +++ b/src/caffe/test/test_lstm_layer.cpp @@ -0,0 +1,265 @@ +#include +#include + +#include "gtest/gtest.h" + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/sequence_layers.hpp" + +#include "caffe/test/test_caffe_main.hpp" +#include "caffe/test/test_gradient_check_util.hpp" + +namespace caffe { + +template +class LSTMLayerTest : public MultiDeviceTest { + typedef typename TypeParam::Dtype Dtype; + + protected: + LSTMLayerTest() : num_output_(7) { + blob_bottom_vec_.push_back(&blob_bottom_); + blob_bottom_vec_.push_back(&blob_bottom_flush_); + blob_top_vec_.push_back(&blob_top_); + unit_blob_bottom_vec_.push_back(&unit_blob_bottom_c_prev_); + unit_blob_bottom_vec_.push_back(&unit_blob_bottom_x_); + unit_blob_bottom_vec_.push_back(&unit_blob_bottom_flush_); + unit_blob_top_vec_.push_back(&unit_blob_top_c_); + unit_blob_top_vec_.push_back(&unit_blob_top_h_); + + ReshapeBlobs(1, 3); + + layer_param_.mutable_recurrent_param()->set_num_output(num_output_); + FillerParameter* weight_filler = + layer_param_.mutable_recurrent_param()->mutable_weight_filler(); + weight_filler->set_type("gaussian"); + weight_filler->set_std(0.2); + FillerParameter* bias_filler = + layer_param_.mutable_recurrent_param()->mutable_bias_filler(); + bias_filler->set_type("gaussian"); + bias_filler->set_std(0.1); + + layer_param_.set_phase(TEST); + } + + void ReshapeBlobs(int num_timesteps, int num_instances) { + blob_bottom_.Reshape(num_timesteps, num_instances, 3, 2); + vector shape(2); + shape[0] = num_timesteps; + shape[1] = num_instances; + blob_bottom_flush_.Reshape(shape); + shape.push_back(num_output_); + + shape[0] = 1; shape[1] = num_instances; shape[2] = 4 * num_output_; + unit_blob_bottom_x_.Reshape(shape); + shape[0] = 1; shape[1] = num_instances; shape[2] = num_output_; + unit_blob_bottom_c_prev_.Reshape(shape); + shape[0] = 1; shape[1] = 1; shape[2] = num_instances; + unit_blob_bottom_flush_.Reshape(shape); + + FillerParameter filler_param; + filler_param.set_min(-1); + filler_param.set_max(1); + UniformFiller filler(filler_param); + filler.Fill(&blob_bottom_); + filler.Fill(&unit_blob_bottom_c_prev_); + filler.Fill(&unit_blob_bottom_x_); + } + + int num_output_; + LayerParameter layer_param_; + Blob blob_bottom_; + Blob blob_bottom_flush_; + Blob blob_top_; + vector*> blob_bottom_vec_; + vector*> blob_top_vec_; + + Blob unit_blob_bottom_flush_; + Blob unit_blob_bottom_c_prev_; + Blob unit_blob_bottom_x_; + Blob unit_blob_top_c_; + Blob unit_blob_top_h_; + vector*> unit_blob_bottom_vec_; + vector*> unit_blob_top_vec_; +}; + +TYPED_TEST_CASE(LSTMLayerTest, TestDtypesAndDevices); + +TYPED_TEST(LSTMLayerTest, TestSetUp) { + typedef typename TypeParam::Dtype Dtype; + LSTMLayer layer(this->layer_param_); + layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + vector expected_top_shape = this->blob_bottom_.shape(); + expected_top_shape.resize(3); + expected_top_shape[2] = this->num_output_; + EXPECT_TRUE(this->blob_top_.shape() == 
expected_top_shape); +} + +TYPED_TEST(LSTMLayerTest, TestForward) { + typedef typename TypeParam::Dtype Dtype; + const int kNumTimesteps = 3; + const int num = this->blob_bottom_.shape(1); + this->ReshapeBlobs(kNumTimesteps, num); + + // Fill the flush blob with <0, 1, 1, ..., 1>, + // indicating a sequence that begins at the first timestep + // then continues for the rest of the sequence. + for (int t = 0; t < kNumTimesteps; ++t) { + for (int n = 0; n < num; ++n) { + this->blob_bottom_flush_.mutable_cpu_data()[t * num + n] = t > 0; + } + } + + // Process the full sequence in a single batch. + FillerParameter filler_param; + filler_param.set_mean(0); + filler_param.set_std(1); + GaussianFiller sequence_filler(filler_param); + sequence_filler.Fill(&this->blob_bottom_); + shared_ptr > layer(new LSTMLayer(this->layer_param_)); + Caffe::set_random_seed(1701); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + LOG(INFO) << "Calling forward for full sequence LSTM"; + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + + // Copy the inputs and outputs to reuse/check them later. + Blob bottom_copy(this->blob_bottom_.shape()); + bottom_copy.CopyFrom(this->blob_bottom_); + Blob top_copy(this->blob_top_.shape()); + top_copy.CopyFrom(this->blob_top_); + + // Process the batch one timestep at a time; + // check that we get the same result. + this->ReshapeBlobs(1, num); + layer.reset(new LSTMLayer(this->layer_param_)); + Caffe::set_random_seed(1701); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + const int bottom_count = this->blob_bottom_.count(); + const int top_count = this->blob_top_.count(); + const Dtype kEpsilon = 1e-5; + for (int t = 0; t < kNumTimesteps; ++t) { + caffe_copy(bottom_count, bottom_copy.cpu_data() + t * bottom_count, + this->blob_bottom_.mutable_cpu_data()); + for (int n = 0; n < num; ++n) { + this->blob_bottom_flush_.mutable_cpu_data()[n] = t > 0; + } + LOG(INFO) << "Calling forward for LSTM timestep " << t; + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + for (int i = 0; i < top_count; ++i) { + ASSERT_LT(t * top_count + i, top_copy.count()); + EXPECT_NEAR(this->blob_top_.cpu_data()[i], + top_copy.cpu_data()[t * top_count + i], kEpsilon) + << "t = " << t << "; i = " << i; + } + } + + // Process the batch one timestep at a time with all flush blobs set to 0. + // Check that we get a different result, except in the first timestep. 
+ Caffe::set_random_seed(1701); + layer.reset(new LSTMLayer(this->layer_param_)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + for (int t = 0; t < kNumTimesteps; ++t) { + caffe_copy(bottom_count, bottom_copy.cpu_data() + t * bottom_count, + this->blob_bottom_.mutable_cpu_data()); + for (int n = 0; n < num; ++n) { + this->blob_bottom_flush_.mutable_cpu_data()[n] = 0; + } + LOG(INFO) << "Calling forward for LSTM timestep " << t; + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + for (int i = 0; i < top_count; ++i) { + if (t == 0) { + EXPECT_NEAR(this->blob_top_.cpu_data()[i], + top_copy.cpu_data()[t * top_count + i], kEpsilon) + << "t = " << t << "; i = " << i; + } else { + EXPECT_NE(this->blob_top_.cpu_data()[i], + top_copy.cpu_data()[t * top_count + i]) + << "t = " << t << "; i = " << i; + } + } + } +} + +TYPED_TEST(LSTMLayerTest, TestLSTMUnitSetUp) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + LSTMUnitLayer layer(layer_param); + layer.SetUp(this->unit_blob_bottom_vec_, this->unit_blob_top_vec_); + const int num_axes = this->unit_blob_bottom_c_prev_.num_axes(); + ASSERT_EQ(num_axes, this->unit_blob_top_c_.num_axes()); + ASSERT_EQ(num_axes, this->unit_blob_top_h_.num_axes()); + for (int i = 0; i < num_axes; ++i) { + EXPECT_EQ(this->unit_blob_bottom_c_prev_.shape(i), + this->unit_blob_top_c_.shape(i)); + EXPECT_EQ(this->unit_blob_bottom_c_prev_.shape(i), + this->unit_blob_top_h_.shape(i)); + } +} + +TYPED_TEST(LSTMLayerTest, TestLSTMUnitGradient) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + LSTMUnitLayer layer(layer_param); + GradientChecker checker(1e-2, 1e-3); + Dtype* flush_data = this->blob_bottom_flush_.mutable_cpu_data(); + flush_data[0] = 0; + flush_data[1] = 0; + flush_data[2] = 0; + checker.CheckGradientExhaustive(&layer, this->unit_blob_bottom_vec_, + this->unit_blob_top_vec_, 0); + checker.CheckGradientExhaustive(&layer, this->unit_blob_bottom_vec_, + this->unit_blob_top_vec_, 1); +} + +TYPED_TEST(LSTMLayerTest, TestLSTMUnitGradientNonZeroFlush) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + LSTMUnitLayer layer(layer_param); + GradientChecker checker(1e-2, 1e-3); + Dtype* flush_data = this->blob_bottom_flush_.mutable_cpu_data(); + flush_data[0] = 1; + flush_data[1] = 0; + flush_data[2] = 1; + checker.CheckGradientExhaustive(&layer, this->unit_blob_bottom_vec_, + this->unit_blob_top_vec_, 0); + checker.CheckGradientExhaustive(&layer, this->unit_blob_bottom_vec_, + this->unit_blob_top_vec_, 1); +} + +TYPED_TEST(LSTMLayerTest, TestGradient) { + typedef typename TypeParam::Dtype Dtype; + LSTMLayer layer(this->layer_param_); + GradientChecker checker(1e-2, 1e-3); + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 0); +} + +TYPED_TEST(LSTMLayerTest, TestGradientNonZeroFlush) { + typedef typename TypeParam::Dtype Dtype; + LSTMLayer layer(this->layer_param_); + GradientChecker checker(1e-2, 1e-3); + for (int i = 0; i < this->blob_bottom_flush_.count(); ++i) { + this->blob_bottom_flush_.mutable_cpu_data()[i] = i > 2; + } + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 0); +} + +TYPED_TEST(LSTMLayerTest, TestGradientNonZeroFlushBufferSize2) { + typedef typename TypeParam::Dtype Dtype; + this->ReshapeBlobs(2, 2); + // fill the values + FillerParameter filler_param; + UniformFiller filler(filler_param); + filler.Fill(&this->blob_bottom_); + LSTMLayer layer(this->layer_param_); + 
GradientChecker checker(1e-2, 1e-3); + for (int i = 0; i < this->blob_bottom_flush_.count(); ++i) { + this->blob_bottom_flush_.mutable_cpu_data()[i] = i > 2; + } + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 0); +} + +} // namespace caffe From 8e84334ea2704a040d8b790abee4ae5449186250 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Mon, 16 Feb 2015 14:29:46 -0800 Subject: [PATCH 83/86] Add scripts for downloading COCO2014 tools & data --- data/coco/README.md | 24 ++++++++++++++++++++++++ data/coco/download_tools.sh | 5 +++++ data/coco/get_coco2014_aux.sh | 13 +++++++++++++ 3 files changed, 42 insertions(+) create mode 100644 data/coco/README.md create mode 100755 data/coco/download_tools.sh create mode 100755 data/coco/get_coco2014_aux.sh diff --git a/data/coco/README.md b/data/coco/README.md new file mode 100644 index 00000000000..53d36707f2e --- /dev/null +++ b/data/coco/README.md @@ -0,0 +1,24 @@ +For details about the Microsoft COCO ("Common Objects in Context") dataset [1], +visit mscoco.org. This README provides instructions for downloading and +installing the tools and dataset. + +1) Download and extract the COCO Python tools by running: + + ./download_tools.sh + +2) Install the tools, and optionally download the data by running: + + cd tools + python setup.py install # follow prompts to download or skip data + +3) Download train/val/test splits using: + + ./get_coco2014_aux.sh + +(or see the COCO README (tools/README) for more information). + + +[1] Lin, Tsung-Yi, Michael Maire, Serge Belongie, James Hays, Pietro Perona, + Deva Ramanan, Piotr Dollár, and C. Lawrence Zitnick. + "Microsoft COCO: Common Objects in Context." + arXiv preprint arXiv:1405.0312 (2014). diff --git a/data/coco/download_tools.sh b/data/coco/download_tools.sh new file mode 100755 index 00000000000..c90bc1a2624 --- /dev/null +++ b/data/coco/download_tools.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash + +wget http://msvocds.blob.core.windows.net/annotations-0-9/tools.zip +unzip tools.zip +rm tools.zip diff --git a/data/coco/get_coco2014_aux.sh b/data/coco/get_coco2014_aux.sh new file mode 100755 index 00000000000..91cf3e1ef0d --- /dev/null +++ b/data/coco/get_coco2014_aux.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env sh +# +# Downloads Andrej Karpathy's train/val/test splits of COCO2014 as text files. + +echo "Downloading..." + +wget http://dl.caffe.berkeleyvision.org/coco2014_aux.tar.gz + +echo "Unzipping..." + +tar -xf coco2014_aux.tar.gz && rm -f coco2014_aux.tar.gz + +echo "Done." 
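The splits fetched by get_coco2014_aux.sh are plain text files with one COCO image id per line, and the tools installed above provide the COCO caption loader that the preprocessing script in the next patch builds on. Below is a minimal sketch of how the downloaded pieces fit together; the paths and the coco module attributes mirror their usage in examples/coco_caption/coco_to_hdf5_data.py (added in the following patch), so treat the exact file and attribute names as assumptions rather than a documented API.

    import sys
    sys.path.append('./data/coco/tools/pycocotools')
    from coco import COCO

    # Split files hold one COCO image id per line.
    with open('./data/coco/coco2014_cocoid.val.txt') as split_file:
        val_ids = [int(line) for line in split_file]

    # Load the caption annotations with the downloaded COCO tools.
    coco = COCO('./data/coco/tools/annotations/sentences_val2014.json')
    image_info = coco.images[val_ids[0]]  # dict with 'file_path' and 'file_name'
    captions = [anno['sentence'] for anno in coco.image_to_annotations[val_ids[0]]]
    print '%d val images; %s has %d captions' % \
        (len(val_ids), image_info['file_name'], len(captions))

coco_to_hdf5_data.py in the next patch runs this kind of lookup over every split and writes the tokenized captions to HDF5 batches for the recurrent layers.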
From b34164ca1c9dd637f93fac49ddc463350439d2e7 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Mon, 16 Feb 2015 15:36:09 -0800 Subject: [PATCH 84/86] Add scripts to create HDF5 datasets from COCO captions --- examples/coco_caption/.gitignore | 1 + examples/coco_caption/coco_to_hdf5_data.py | 267 ++++++++++++++++++ .../coco_caption/hdf5_sequence_generator.py | 132 +++++++++ 3 files changed, 400 insertions(+) create mode 100644 examples/coco_caption/.gitignore create mode 100755 examples/coco_caption/coco_to_hdf5_data.py create mode 100644 examples/coco_caption/hdf5_sequence_generator.py diff --git a/examples/coco_caption/.gitignore b/examples/coco_caption/.gitignore new file mode 100644 index 00000000000..e040331b7f2 --- /dev/null +++ b/examples/coco_caption/.gitignore @@ -0,0 +1 @@ +h5_data/ diff --git a/examples/coco_caption/coco_to_hdf5_data.py b/examples/coco_caption/coco_to_hdf5_data.py new file mode 100755 index 00000000000..233ee829078 --- /dev/null +++ b/examples/coco_caption/coco_to_hdf5_data.py @@ -0,0 +1,267 @@ +#!/usr/bin/env python + +from hashlib import sha1 +import os +import random +random.seed(3) +import re +import sys + +sys.path.append('./examples/coco_caption/') + +COCO_PATH = './data/coco/tools' +COCO_TOOL_PATH = '%s/pycocotools' % COCO_PATH + +MAX_HASH = 100000 + +sys.path.append(COCO_TOOL_PATH) +from coco import COCO + +from hdf5_sequence_generator import SequenceGenerator, HDF5SequenceWriter + +# UNK_IDENTIFIER is the word used to identify unknown words +UNK_IDENTIFIER = '' + +SENTENCE_SPLIT_REGEX = re.compile(r'(\W+)') +def split_sentence(sentence): + # break sentence into a list of words and punctuation + sentence = [s.lower() for s in SENTENCE_SPLIT_REGEX.split(sentence.strip()) if len(s.strip()) > 0] + # remove the '.' from the end of the sentence + if sentence[-1] != '.': + # print "Warning: sentence doesn't end with '.'; ends with: %s" % sentence[-1] + return sentence + return sentence[:-1] + +MAX_WORDS = 20 + +class CocoSequenceGenerator(SequenceGenerator): + def __init__(self, coco, batch_num_streams, vocab=None, + max_words=MAX_WORDS, align=True, shuffle=True, gt_captions=True, + pad=True, truncate=True, split_ids=None): + self.max_words = max_words + num_empty_lines = 0 + self.images = [] + num_total = 0 + num_missing = 0 + num_captions = 0 + known_images = {} + image_root = '%s/%s' % (COCO_PATH, coco.image_folder) + if split_ids is None: + split_ids = coco.images.keys() + for image_id in split_ids: + image_info = coco.images[image_id] + image_path = '%s/%s/%s' % \ + (image_root, image_info['file_path'], image_info['file_name']) + if os.path.isfile(image_path): + assert image_id not in known_images # no duplicates allowed + known_images[image_id] = {} + known_images[image_id]['path'] = image_path + if gt_captions: + known_images[image_id]['sentences'] = [split_sentence(anno['sentence']) + for anno in coco.image_to_annotations[image_id]] + num_captions += len(known_images[image_id]['sentences']) + else: + known_images[image_id]['sentences'] = [] + else: + num_missing += 1 + print 'Warning (#%d): image not found: %s' % (num_missing, image_path) + num_total += 1 + print '%d/%d images missing' % (num_missing, num_total) + if vocab is None: + self.init_vocabulary(known_images) + else: + self.vocabulary_inverted = vocab + self.vocabulary = {} + for index, word in enumerate(self.vocabulary_inverted): + self.vocabulary[word] = index + self.image_sentence_pairs = [] + num_no_sentences = 0 + for image_filename, metadata in known_images.iteritems(): + if not 
metadata['sentences']: + num_no_sentences += 1 + print 'Warning (#%d): image with no sentences: %s' % (num_no_sentences, image_filename) + for sentence in metadata['sentences']: + self.image_sentence_pairs.append((metadata['path'], sentence)) + self.index = 0 + self.num_resets = 0 + self.num_truncates = 0 + self.num_pads = 0 + self.num_outs = 0 + self.image_list = [] + SequenceGenerator.__init__(self) + self.batch_num_streams = batch_num_streams + # make the number of image/sentence pairs a multiple of the buffer size + # so each timestep of each batch is useful and we can align the images + if align: + num_pairs = len(self.image_sentence_pairs) + remainder = num_pairs % batch_num_streams + if remainder > 0: + num_needed = batch_num_streams - remainder + for i in range(num_needed): + choice = random.randint(0, num_pairs - 1) + self.image_sentence_pairs.append(self.image_sentence_pairs[choice]) + assert len(self.image_sentence_pairs) % batch_num_streams == 0 + if shuffle: + random.shuffle(self.image_sentence_pairs) + self.pad = pad + self.truncate = truncate + self.negative_one_padded_streams = frozenset(('input_sentence', 'target_sentence')) + + def streams_exhausted(self): + return self.num_resets > 0 + + def init_vocabulary(self, image_annotations, min_count=5): + words_to_count = {} + for image_id, annotations in image_annotations.iteritems(): + for annotation in annotations['sentences']: + for word in annotation: + word = word.strip() + if word not in words_to_count: + words_to_count[word] = 0 + words_to_count[word] += 1 + # Sort words by count, then alphabetically + words_by_count = sorted(words_to_count.keys(), key=lambda w: (-words_to_count[w], w)) + print 'Initialized vocabulary with %d words; top 10 words:' % len(words_by_count) + for word in words_by_count[:10]: + print '\t%s (%d)' % (word, words_to_count[word]) + # Add words to vocabulary + self.vocabulary = {UNK_IDENTIFIER: 0} + self.vocabulary_inverted = [UNK_IDENTIFIER] + for index, word in enumerate(words_by_count): + word = word.strip() + if words_to_count[word] < min_count: + break + self.vocabulary_inverted.append(word) + self.vocabulary[word] = index + 1 + print 'Final vocabulary (restricted to words with counts of %d+) has %d words' % \ + (min_count, len(self.vocabulary)) + + def dump_vocabulary(self, vocab_filename): + print 'Dumping vocabulary to file: %s' % vocab_filename + with open(vocab_filename, 'wb') as vocab_file: + for word in self.vocabulary_inverted: + vocab_file.write('%s\n' % word) + print 'Done.' + + def dump_image_file(self, image_filename, dummy_image_filename=None): + print 'Dumping image list to file: %s' % image_filename + with open(image_filename, 'wb') as image_file: + for image_path, _ in self.image_list: + image_file.write('%s\n' % image_path) + if dummy_image_filename is not None: + print 'Dumping image list with dummy labels to file: %s' % dummy_image_filename + with open(dummy_image_filename, 'wb') as image_file: + for path_and_hash in self.image_list: + image_file.write('%s %d\n' % path_and_hash) + print 'Done.' 
+ + def next_line(self): + num_lines = float(len(self.image_sentence_pairs)) + self.index += 1 + if self.index == 1 or self.index == num_lines or self.index % 10000 == 0: + print 'Processed %d/%d (%f%%) lines' % (self.index, num_lines, + 100 * self.index / num_lines) + if self.index == num_lines: + self.index = 0 + self.num_resets += 1 + + def line_to_stream(self, sentence): + stream = [] + for word in sentence: + word = word.strip() + if word in self.vocabulary: + stream.append(self.vocabulary[word]) + else: # unknown word; append UNK + stream.append(self.vocabulary[UNK_IDENTIFIER]) + # increment the stream -- 0 will be the EOS character + stream = [s + 1 for s in stream] + return stream + + def get_pad_value(self, stream_name): + return -1 if stream_name in self.negative_one_padded_streams else 0 + + def get_streams(self): + image_filename, line = self.image_sentence_pairs[self.index] + stream = self.line_to_stream(line) + pad = self.max_words - (len(stream) + 1) if self.pad else 0 + if pad > 0: self.num_pads += 1 + self.num_outs += 1 + out = {} + out['stage_indicators'] = [1] * (len(stream) + 1) + [0] * pad + out['cont_sentence'] = [0] + [1] * len(stream) + [0] * pad + out['input_sentence'] = [0] + stream + [-1] * pad + out['target_sentence'] = stream + [0] + [-1] * pad + truncated = False + if self.truncate: + for key, val in out.iteritems(): + if len(val) > self.max_words: + out[key] = val[:self.max_words] + truncated = True + self.num_truncates += truncated + image_hash = self.image_hash(image_filename) + out['hashed_image_path'] = [image_hash] * len(out['input_sentence']) + self.image_list.append((image_filename, image_hash)) + self.next_line() + return out + + def image_hash(self, filename): + image_hash = int(sha1(filename).hexdigest(), 16) % MAX_HASH + assert image_hash == float(image_hash) + return image_hash + +COCO_ANNO_PATH = '%s/annotations/sentences_%%s2014.json' % COCO_PATH +COCO_IMAGE_PATTERN = '%s/images/%%s2014' % COCO_PATH +COCO_IMAGE_ID_PATTERN = 'COCO_%s2014_%%012d.jpg' + +BUFFER_SIZE = 100 +OUTPUT_DIR = './examples/coco_caption/h5_data/buffer_%d' % BUFFER_SIZE +SPLITS_PATTERN = './data/coco/coco2014_cocoid.%s.txt' +OUTPUT_DIR_PATTERN = '%s/%%s_batches' % OUTPUT_DIR + +def preprocess_dataset(split_name, coco_split_name, batch_stream_length, + vocab=None, aligned=True): + with open(SPLITS_PATTERN % split_name, 'r') as split_file: + split_image_ids = [int(line) for line in split_file.readlines()] + output_dataset_name = split_name + if aligned: + output_dataset_name += '_aligned_%d' % MAX_WORDS + else: + output_dataset_name += '_unaligned' + output_path = OUTPUT_DIR_PATTERN % output_dataset_name + coco = COCO(COCO_ANNO_PATH % coco_split_name) + sg = CocoSequenceGenerator(coco, BUFFER_SIZE, split_ids=split_image_ids, + vocab=vocab, align=aligned, pad=aligned, truncate=aligned) + sg.batch_stream_length = batch_stream_length + writer = HDF5SequenceWriter(sg, output_dir=output_path) + writer.write_to_exhaustion() + writer.write_filelists() + if vocab is None: + vocab_out_path = '%s/vocabulary.txt' % OUTPUT_DIR + sg.dump_vocabulary(vocab_out_path) + image_out_path = '%s/image_list.txt' % output_path + image_dummy_labels_out_path = '%s/image_list.with_dummy_labels.txt' % output_path + sg.dump_image_file(image_out_path, image_dummy_labels_out_path) + num_outs = sg.num_outs + num_pads = sg.num_pads + num_truncates = sg.num_truncates + print 'Padded %d/%d sequences; truncated %d/%d sequences' % \ + (num_pads, num_outs, num_truncates, num_outs) + return sg.vocabulary_inverted + 
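+# Worked example of the streams produced by CocoSequenceGenerator.get_streams()
+# above, assuming a hypothetical caption of three in-vocabulary words whose
+# (already +1-shifted) indices are w1, w2, w3, with MAX_WORDS = 20 (pad = 16):
+#   cont_sentence    = [0, 1, 1, 1]    + [0]  * 16  # 0 only at the sequence start
+#   input_sentence   = [0, w1, w2, w3] + [-1] * 16  # leading 0 is the EOS/start symbol
+#   target_sentence  = [w1, w2, w3, 0] + [-1] * 16  # trailing 0 is EOS; -1 is ignore_label
+#   stage_indicators = [1, 1, 1, 1]    + [0]  * 16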
+def preprocess_coco(): + vocab = None + DATASETS = [ + ('train', 'train', 100000, True), + ('val', 'val', 100000, True), + ('test', 'val', 100000, True), + # Write unaligned datasets as well: + ('train', 'train', 100000, False), + ('val', 'val', 100000, False), + ('test', 'val', 100000, False), + ] + for split_name, coco_split_name, batch_stream_length, aligned in DATASETS: + vocab = preprocess_dataset(split_name, coco_split_name, batch_stream_length, + vocab=vocab, aligned=aligned) + +if __name__ == "__main__": + preprocess_coco() diff --git a/examples/coco_caption/hdf5_sequence_generator.py b/examples/coco_caption/hdf5_sequence_generator.py new file mode 100644 index 00000000000..98d4657b6bf --- /dev/null +++ b/examples/coco_caption/hdf5_sequence_generator.py @@ -0,0 +1,132 @@ +#!/usr/bin/env python + +import h5py +import numpy as np +import os +import random +import sys + +class SequenceGenerator(): + def __init__(self): + self.dimension = 10 + self.batch_stream_length = 2000 + self.batch_num_streams = 8 + self.min_stream_length = 13 + self.max_stream_length = 17 + self.substream_names = None + self.streams_initialized = False + + def streams_exhausted(self): + return False + + def init_streams(self): + self.streams = [None] * self.batch_num_streams + self.stream_indices = [0] * self.batch_num_streams + self.reset_stream(0) + self.streams_initialized = True + + def reset_stream(self, stream_index): + streams = self.get_streams() + stream_names = sorted(streams.keys()) + if self.substream_names is None: + assert len(stream_names) > 0 + self.substream_names = stream_names + assert self.substream_names == stream_names + if self.streams[stream_index] is None: + self.streams[stream_index] = {} + stream_length = len(streams[stream_names[0]]) + for k, v in streams.iteritems(): + assert stream_length == len(v) + self.streams[stream_index][k] = v + self.stream_indices[stream_index] = 0 + + # Pad with zeroes by default -- override this to pad with soemthing else + # for a particular stream + def get_pad_value(self, stream_name): + return 0 + + def get_next_batch(self, truncate_at_exhaustion=True): + if not self.streams_initialized: + self.init_streams() + batch_size = self.batch_num_streams * self.batch_stream_length + batch = {} + batch_indicators = np.zeros((self.batch_stream_length, self.batch_num_streams)) + for name in self.substream_names: + batch[name] = self.get_pad_value(name) * np.ones_like(batch_indicators) + exhausted = [False] * self.batch_num_streams + all_exhausted = False + reached_exhaustion = False + num_completed_streams = 0 + for t in range(self.batch_stream_length): + all_exhausted = True + for i in range(self.batch_num_streams): + if not exhausted[i]: + if self.streams[i] is None or \ + self.stream_indices[i] == len(self.streams[i][self.substream_names[0]]): + self.stream_indices[i] = 0 + reached_exhaustion = reached_exhaustion or self.streams_exhausted() + if reached_exhaustion: exhausted[i] = True + if not reached_exhaustion or not truncate_at_exhaustion: + self.reset_stream(i) + else: + continue + for name in self.substream_names: + batch[name][t, i] = self.streams[i][name][self.stream_indices[i]] + batch_indicators[t, i] = 0 if self.stream_indices[i] == 0 else 1 + self.stream_indices[i] += 1 + if self.stream_indices[i] == len(self.streams[i][self.substream_names[0]]): + num_completed_streams += 1 + if not exhausted[i]: all_exhausted = False + if all_exhausted and truncate_at_exhaustion: + print ('Exhausted all data; cutting off batch at timestep %d ' + + 'with %d 
streams completed') % (t, num_completed_streams) + for name in self.substream_names: + batch[name] = batch[name][:t, :] + batch_indicators = batch_indicators[:t, :] + break + return batch, batch_indicators + + def get_streams(self): + raise Exception('get_streams should be overridden to return a dict ' + + 'of equal-length iterables.') + +class HDF5SequenceWriter(): + def __init__(self, sequence_generator, output_dir=None, verbose=False): + self.generator = sequence_generator + assert output_dir is not None # required + self.output_dir = output_dir + if os.path.exists(output_dir): + raise Exception('Output directory already exists: ' + output_dir) + os.makedirs(output_dir) + self.verbose = verbose + self.filenames = [] + + def write_batch(self, stop_at_exhaustion=False): + batch_comps, cont_indicators = self.generator.get_next_batch() + batch_index = len(self.filenames) + filename = '%s/batch_%d.h5' % (self.output_dir, batch_index) + self.filenames.append(filename) + h5file = h5py.File(filename, 'w') + dataset = h5file.create_dataset('cont', shape=cont_indicators.shape, dtype=cont_indicators.dtype) + dataset[:] = cont_indicators + dataset = h5file.create_dataset('buffer_size', shape=(1,), dtype=np.int) + dataset[:] = self.generator.batch_num_streams + for key, batch in batch_comps.iteritems(): + if self.verbose: + for s in range(self.generator.batch_num_streams): + stream = np.array(self.generator.streams[s][key]) + print 'batch %d, stream %s, index %d: ' % (batch_index, key, s), stream + h5dataset = h5file.create_dataset(key, shape=batch.shape, dtype=batch.dtype) + h5dataset[:] = batch + h5file.close() + + def write_to_exhaustion(self): + while not self.generator.streams_exhausted(): + self.write_batch(stop_at_exhaustion=True) + + def write_filelists(self): + assert self.filenames is not None + filelist_filename = '%s/hdf5_chunk_list.txt' % self.output_dir + with open(filelist_filename, 'w') as listfile: + for filename in self.filenames: + listfile.write('%s\n' % filename) From 6c52bd50a7400abf35db9fb60a14d62d93d30fe1 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Mon, 16 Feb 2015 15:55:31 -0800 Subject: [PATCH 85/86] Prototxts + script for training COCO caption language model --- .../coco_caption/lstm_language_model.prototxt | 149 ++++++++++++++++++ examples/coco_caption/lstm_lm_solver.prototxt | 21 +++ examples/coco_caption/train_language_model.sh | 14 ++ 3 files changed, 184 insertions(+) create mode 100644 examples/coco_caption/lstm_language_model.prototxt create mode 100644 examples/coco_caption/lstm_lm_solver.prototxt create mode 100755 examples/coco_caption/train_language_model.sh diff --git a/examples/coco_caption/lstm_language_model.prototxt b/examples/coco_caption/lstm_language_model.prototxt new file mode 100644 index 00000000000..3cf4f6a686f --- /dev/null +++ b/examples/coco_caption/lstm_language_model.prototxt @@ -0,0 +1,149 @@ +name: "lstm_language_model" +layer { + name: "data" + type: "HDF5Data" + top: "cont_sentence" + top: "input_sentence" + top: "target_sentence" + include { phase: TRAIN } + hdf5_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/train_unaligned_batches/hdf5_chunk_list.txt" + batch_size: 20 + } +} +layer { + name: "data" + type: "HDF5Data" + top: "cont_sentence" + top: "input_sentence" + top: "target_sentence" + include { + phase: TEST + stage: "test-on-train" + } + hdf5_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/train_unaligned_batches/hdf5_chunk_list.txt" + batch_size: 20 + } +} +layer { + name: "data" + 
type: "HDF5Data" + top: "cont_sentence" + top: "input_sentence" + top: "target_sentence" + include { + phase: TEST + stage: "test-on-val" + } + hdf5_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/val_unaligned_batches/hdf5_chunk_list.txt" + batch_size: 20 + } +} +layer { + name: "embedding" + type: "Embed" + bottom: "input_sentence" + top: "embedded_input_sentence" + param { + lr_mult: 1 + } + embed_param { + bias_term: false + input_dim: 8801 # = vocab_size + 1 (for EOS) + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + } +} +layer { + name: "embed-drop" + type: "Dropout" + bottom: "embedded_input_sentence" + top: "embedded_input_sentence" + dropout_param { dropout_ratio: 0.5 } + include { stage: "embed-drop" } +} +layer { + name: "lstm1" + type: "LSTM" + bottom: "embedded_input_sentence" + bottom: "cont_sentence" + top: "lstm1" + recurrent_param { + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "lstm-drop" + type: "Dropout" + bottom: "lstm1" + top: "lstm1" + dropout_param { dropout_ratio: 0.5 } + include { stage: "lstm-drop" } +} +layer { + name: "predict" + type: "InnerProduct" + bottom: "lstm1" + top: "predict" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + inner_product_param { + num_output: 8801 # = vocab_size + 1 (+1 for EOS) + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + axis: 2 + } +} +layer { + name: "cross_entropy_loss" + type: "SoftmaxWithLoss" + bottom: "predict" + bottom: "target_sentence" + top: "cross_entropy_loss" + loss_weight: 20 + loss_param { + ignore_label: -1 + } + softmax_param { + axis: 2 + } +} +layer { + name: "accuracy" + type: "Accuracy" + bottom: "predict" + bottom: "target_sentence" + top: "accuracy" + include { phase: TEST } + loss_param { + ignore_label: -1 + } +} diff --git a/examples/coco_caption/lstm_lm_solver.prototxt b/examples/coco_caption/lstm_lm_solver.prototxt new file mode 100644 index 00000000000..fb36ad15a5b --- /dev/null +++ b/examples/coco_caption/lstm_lm_solver.prototxt @@ -0,0 +1,21 @@ +net: "./examples/coco_caption/lstm_language_model.prototxt" +train_state: { stage: 'embed-drop' stage: 'lstm-drop' } +test_iter: 25 +test_state: { stage: 'test-on-train' } +test_iter: 25 +test_state: { stage: 'test-on-val' } +test_interval: 1000 +base_lr: 0.1 +lr_policy: "step" +gamma: 0.5 +stepsize: 20000 +display: 1 +max_iter: 110000 +momentum: 0.9 +weight_decay: 0.0000 +snapshot: 5000 +snapshot_prefix: "./examples/coco_caption/lstm_lm" +solver_mode: GPU +random_seed: 1701 +average_loss: 100 +clip_gradients: 10 diff --git a/examples/coco_caption/train_language_model.sh b/examples/coco_caption/train_language_model.sh new file mode 100755 index 00000000000..6e8a8c47b37 --- /dev/null +++ b/examples/coco_caption/train_language_model.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +GPU_ID=0 +DATA_DIR=./examples/coco_caption/h5_data/ +if [ ! 
-d $DATA_DIR ]; then + echo "Data directory not found: $DATA_DIR" + echo "First, download the COCO dataset (follow instructions in data/coco)" + echo "Then, run ./examples/coco_caption/coco_to_hdf5_data.py to create the Caffe input data" + exit 1 +fi + +./build/tools/caffe train \ + -solver ./examples/coco_caption/lstm_lm_solver.prototxt \ + -gpu $GPU_ID From 54fa90fa1b38af14a6fca32ed8aa5ead38752a09 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Mon, 16 Feb 2015 16:08:28 -0800 Subject: [PATCH 86/86] Prototxts + script for training LRCN COCO image captioning model --- examples/coco_caption/lrcn.prototxt | 767 +++++++++++++++++++++ examples/coco_caption/lrcn_solver.prototxt | 30 + examples/coco_caption/train_lrcn.sh | 17 + 3 files changed, 814 insertions(+) create mode 100644 examples/coco_caption/lrcn.prototxt create mode 100644 examples/coco_caption/lrcn_solver.prototxt create mode 100755 examples/coco_caption/train_lrcn.sh diff --git a/examples/coco_caption/lrcn.prototxt b/examples/coco_caption/lrcn.prototxt new file mode 100644 index 00000000000..62d08a2738d --- /dev/null +++ b/examples/coco_caption/lrcn.prototxt @@ -0,0 +1,767 @@ +# The network is used for the image captioning experiments of LRCN [1]. +# Please consider citing LRCN [1] if you use this example in your work. +# +# [1] J. Donahue, L. A. Hendricks, S. Guadarrama, M. Rohrbach, S. Venugopalan, +# K. Saenko, T. Darrell. "Long-term Recurrent Convolutional Networks for +# Visual Recognition and Description." arXiv preprint arXiv:1411.4389 (2014). + +name: "lrcn_caffenet_to_lstm" +layer { + name: "data" + type: "ImageData" + top: "data" + top: "label" + include { phase: TRAIN } + transform_param { + mirror: true + crop_size: 227 + mean_value: 104 + mean_value: 117 + mean_value: 123 + } + image_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/train_aligned_20_batches/image_list.with_dummy_labels.txt" + batch_size: 100 + new_height: 256 + new_width: 256 + } +} +layer { + name: "data" + type: "HDF5Data" + top: "cont_sentence" + top: "input_sentence" + top: "target_sentence" + include { phase: TRAIN } + hdf5_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/train_aligned_20_batches/hdf5_chunk_list.txt" + batch_size: 20 + } +} +layer { + name: "data" + type: "ImageData" + top: "data" + top: "label" + include { + phase: TEST + stage: "test-on-train" + } + transform_param { + mirror: true + crop_size: 227 + mean_value: 104 + mean_value: 117 + mean_value: 123 + } + image_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/train_aligned_20_batches/image_list.with_dummy_labels.txt" + batch_size: 100 + new_height: 256 + new_width: 256 + } +} +layer { + name: "data" + type: "HDF5Data" + top: "cont_sentence" + top: "input_sentence" + top: "target_sentence" + include { + phase: TEST + stage: "test-on-train" + } + hdf5_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/train_aligned_20_batches/hdf5_chunk_list.txt" + batch_size: 20 + } +} +layer { + name: "data" + type: "ImageData" + top: "data" + top: "label" + include { + phase: TEST + stage: "test-on-val" + } + transform_param { + mirror: true + crop_size: 227 + mean_value: 104 + mean_value: 117 + mean_value: 123 + } + image_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/val_aligned_20_batches/image_list.with_dummy_labels.txt" + batch_size: 100 + new_height: 256 + new_width: 256 + } +} +layer { + name: "data" + type: "HDF5Data" + top: "cont_sentence" + top: "input_sentence" + top: "target_sentence" + 
include { + phase: TEST + stage: "test-on-val" + } + hdf5_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/val_aligned_20_batches/hdf5_chunk_list.txt" + batch_size: 20 + } +} +layer { + name: "silence" + type: "Silence" + bottom: "label" +} +layer { + name: "conv1" + type: "Convolution" + bottom: "data" + top: "conv1" + param { + lr_mult: 0 + } + param { + lr_mult: 0 + } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 96 + kernel_size: 11 + stride: 4 + } +} +layer { + name: "conv1" + type: "Convolution" + bottom: "data" + top: "conv1" + param { + lr_mult: 0.1 + decay_mult: 1 + } + param { + lr_mult: 0.2 + decay_mult: 0 + } + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 96 + kernel_size: 11 + stride: 4 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu1" + type: "ReLU" + bottom: "conv1" + top: "conv1" +} +layer { + name: "pool1" + type: "Pooling" + bottom: "conv1" + top: "pool1" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "norm1" + type: "LRN" + bottom: "pool1" + top: "norm1" + lrn_param { + local_size: 5 + alpha: 0.0001 + beta: 0.75 + } +} +layer { + name: "conv2" + type: "Convolution" + bottom: "norm1" + top: "conv2" + param { + lr_mult: 0 + } + param { + lr_mult: 0 + } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 256 + pad: 2 + kernel_size: 5 + group: 2 + } +} +layer { + name: "conv2" + type: "Convolution" + bottom: "norm1" + top: "conv2" + param { + lr_mult: 0.1 + decay_mult: 1 + } + param { + lr_mult: 0.2 + decay_mult: 0 + } + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 256 + pad: 2 + kernel_size: 5 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 1 + } + } +} +layer { + name: "relu2" + type: "ReLU" + bottom: "conv2" + top: "conv2" +} +layer { + name: "pool2" + type: "Pooling" + bottom: "conv2" + top: "pool2" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "norm2" + type: "LRN" + bottom: "pool2" + top: "norm2" + lrn_param { + local_size: 5 + alpha: 0.0001 + beta: 0.75 + } +} +layer { + name: "conv3" + type: "Convolution" + bottom: "norm2" + top: "conv3" + param { + lr_mult: 0 + } + param { + lr_mult: 0 + } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv3" + type: "Convolution" + bottom: "norm2" + top: "conv3" + param { + lr_mult: 0.1 + decay_mult: 1 + } + param { + lr_mult: 0.2 + decay_mult: 0 + } + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu3" + type: "ReLU" + bottom: "conv3" + top: "conv3" +} +layer { + name: "conv4" + type: "Convolution" + bottom: "conv3" + top: "conv4" + param { + lr_mult: 0 + } + param { + lr_mult: 0 + } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + group: 2 + } +} +layer { + name: "conv4" + type: "Convolution" + bottom: "conv3" + top: "conv4" + param { + lr_mult: 0.1 + decay_mult: 1 + } + param { + lr_mult: 0.2 + decay_mult: 0 + } + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } 
+ bias_filler { + type: "constant" + value: 1 + } + } +} +layer { + name: "relu4" + type: "ReLU" + bottom: "conv4" + top: "conv4" +} +layer { + name: "conv5" + type: "Convolution" + bottom: "conv4" + top: "conv5" + param { + lr_mult: 0 + } + param { + lr_mult: 0 + } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + group: 2 + } +} +layer { + name: "conv5" + type: "Convolution" + bottom: "conv4" + top: "conv5" + param { + lr_mult: 0.1 + decay_mult: 1 + } + param { + lr_mult: 0.2 + decay_mult: 0 + } + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 1 + } + } +} +layer { + name: "relu5" + type: "ReLU" + bottom: "conv5" + top: "conv5" +} +layer { + name: "pool5" + type: "Pooling" + bottom: "conv5" + top: "pool5" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "fc6" + type: "InnerProduct" + bottom: "pool5" + top: "fc6" + param { + lr_mult: 0 + } + param { + lr_mult: 0 + } + include { stage: "freeze-convnet" } + inner_product_param { + num_output: 4096 + } +} +layer { + name: "fc6" + type: "InnerProduct" + bottom: "pool5" + top: "fc6" + param { + lr_mult: 0.1 + decay_mult: 1 + } + param { + lr_mult: 0.2 + decay_mult: 0 + } + exclude { stage: "freeze-convnet" } + inner_product_param { + num_output: 4096 + weight_filler { + type: "gaussian" + std: 0.005 + } + bias_filler { + type: "constant" + value: 1 + } + } +} +layer { + name: "relu6" + type: "ReLU" + bottom: "fc6" + top: "fc6" +} +layer { + name: "drop6" + type: "Dropout" + bottom: "fc6" + top: "fc6" + dropout_param { + dropout_ratio: 0.5 + } +} +layer { + name: "fc7" + type: "InnerProduct" + bottom: "fc6" + top: "fc7" + param { + lr_mult: 0 + } + param { + lr_mult: 0 + } + include { stage: "freeze-convnet" } + inner_product_param { + num_output: 4096 + } +} +layer { + name: "fc7" + type: "InnerProduct" + bottom: "fc6" + top: "fc7" + param { + lr_mult: 0.1 + decay_mult: 1 + } + param { + lr_mult: 0.2 + decay_mult: 0 + } + exclude { stage: "freeze-convnet" } + inner_product_param { + num_output: 4096 + weight_filler { + type: "gaussian" + std: 0.005 + } + bias_filler { + type: "constant" + value: 1 + } + } +} +layer { + name: "relu7" + type: "ReLU" + bottom: "fc7" + top: "fc7" +} +layer { + name: "drop7" + type: "Dropout" + bottom: "fc7" + top: "fc7" + dropout_param { + dropout_ratio: 0.5 + } +} +layer { + name: "fc8" + type: "InnerProduct" + bottom: "fc7" + top: "fc8" + param { + lr_mult: 0.1 + decay_mult: 1 + } + param { + lr_mult: 0.2 + decay_mult: 0 + } + # exclude { stage: "freeze-convnet" } + inner_product_param { + num_output: 1000 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "embedding" + type: "Embed" + bottom: "input_sentence" + top: "embedded_input_sentence" + param { + lr_mult: 1 + } + embed_param { + bias_term: false + input_dim: 8801 + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + } +} +layer { + name: "lstm1" + type: "LSTM" + bottom: "embedded_input_sentence" + bottom: "cont_sentence" + bottom: "fc8" + top: "lstm1" + include { stage: "unfactored" } + recurrent_param { + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "lstm2" + type: "LSTM" + bottom: "lstm1" 
+ bottom: "cont_sentence" + top: "lstm2" + include { + stage: "unfactored" + stage: "2-layer" + } + recurrent_param { + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "lstm1" + type: "LSTM" + bottom: "embedded_input_sentence" + bottom: "cont_sentence" + top: "lstm1" + include { stage: "factored" } + recurrent_param { + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "lstm2" + type: "LSTM" + bottom: "lstm1" + bottom: "cont_sentence" + bottom: "fc8" + top: "lstm2" + include { stage: "factored" } + recurrent_param { + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "predict" + type: "InnerProduct" + bottom: "lstm1" + top: "predict" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + exclude { stage: "2-layer" } + inner_product_param { + num_output: 8801 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + axis: 2 + } +} +layer { + name: "predict" + type: "InnerProduct" + bottom: "lstm2" + top: "predict" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + include { stage: "2-layer" } + inner_product_param { + num_output: 8801 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + axis: 2 + } +} +layer { + name: "cross_entropy_loss" + type: "SoftmaxWithLoss" + bottom: "predict" + bottom: "target_sentence" + top: "cross_entropy_loss" + loss_weight: 20 + loss_param { + ignore_label: -1 + } + softmax_param { + axis: 2 + } +} +layer { + name: "accuracy" + type: "Accuracy" + bottom: "predict" + bottom: "target_sentence" + top: "accuracy" + include { phase: TEST } + loss_param { + ignore_label: -1 + } +} diff --git a/examples/coco_caption/lrcn_solver.prototxt b/examples/coco_caption/lrcn_solver.prototxt new file mode 100644 index 00000000000..65ca272b30c --- /dev/null +++ b/examples/coco_caption/lrcn_solver.prototxt @@ -0,0 +1,30 @@ +net: "./examples/coco_caption/lrcn.prototxt" + +# lrcn.prototxt supports three variants of the LRCN architecture: +# (1) stage: 'factored' stage: '2-layer' +# (2) stage: 'unfactored' stage: '1-layer' +# (3) stage: 'unfactored' stage: '2-layer' +# This solver uses variant (1). 
+# To use a different variant, modify the states (train_state, test_state) +# below as appropriate: + +train_state: { stage: 'freeze-convnet' stage: 'factored' stage: '2-layer' } +test_iter: 25 +test_state: { stage: 'freeze-convnet' stage: 'factored' stage: '2-layer' stage: 'test-on-train' } +test_iter: 25 +test_state: { stage: 'freeze-convnet' stage: 'factored' stage: '2-layer' stage: 'test-on-val' } +test_interval: 1000 +base_lr: 0.01 +lr_policy: "step" +gamma: 0.5 +stepsize: 20000 +display: 1 +max_iter: 110000 +momentum: 0.9 +weight_decay: 0.0000 +snapshot: 5000 +snapshot_prefix: "./examples/coco_caption/lrcn" +solver_mode: GPU +random_seed: 1701 +average_loss: 100 +clip_gradients: 10 diff --git a/examples/coco_caption/train_lrcn.sh b/examples/coco_caption/train_lrcn.sh new file mode 100755 index 00000000000..5099e762ccd --- /dev/null +++ b/examples/coco_caption/train_lrcn.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +GPU_ID=0 +WEIGHTS=\ +./models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel +DATA_DIR=./examples/coco_caption/h5_data/ +if [ ! -d $DATA_DIR ]; then + echo "Data directory not found: $DATA_DIR" + echo "First, download the COCO dataset (follow instructions in data/coco)" + echo "Then, run ./examples/coco_caption/coco_to_hdf5_data.py to create the Caffe input data" + exit 1 +fi + +./build/tools/caffe train \ + -solver ./examples/coco_caption/lrcn_solver.prototxt \ + -weights $WEIGHTS \ + -gpu $GPU_ID
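As the comments in lrcn_solver.prototxt note, the settings above select variant (1) of lrcn.prototxt (the factored, 2-layer model). Switching to one of the unfactored variants only changes the train_state/test_state lines; a minimal sketch for variant (3), reusing the stage names already defined in lrcn.prototxt, would presumably read:

    train_state: { stage: 'freeze-convnet' stage: 'unfactored' stage: '2-layer' }
    test_iter: 25
    test_state: { stage: 'freeze-convnet' stage: 'unfactored' stage: '2-layer' stage: 'test-on-train' }
    test_iter: 25
    test_state: { stage: 'freeze-convnet' stage: 'unfactored' stage: '2-layer' stage: 'test-on-val' }

Variant (2) would drop the '2-layer' stage instead: lrcn.prototxt selects the single-LSTM predict layer by excluding '2-layer' rather than through an explicit '1-layer' stage, so listing '1-layer' appears to be optional.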