fix conflict of merging BVLC#3948
weiliu89 committed Apr 7, 2016
2 parents cf455be + 10f9273 commit b1678f3
Showing 34 changed files with 3,270 additions and 53 deletions.
6 changes: 3 additions & 3 deletions Makefile
@@ -364,9 +364,9 @@ ifeq ($(BLAS), mkl)
# MKL
LIBRARIES += mkl_rt
COMMON_FLAGS += -DUSE_MKL
-MKL_DIR ?= /opt/intel/mkl
-BLAS_INCLUDE ?= $(MKL_DIR)/include
-BLAS_LIB ?= $(MKL_DIR)/lib $(MKL_DIR)/lib/intel64
+MKLROOT ?= /opt/intel/mkl
+BLAS_INCLUDE ?= $(MKLROOT)/include
+BLAS_LIB ?= $(MKLROOT)/lib $(MKLROOT)/lib/intel64
else ifeq ($(BLAS), open)
# OpenBLAS
LIBRARIES += openblas
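An aside on the `-DUSE_MKL` define that this hunk leaves unchanged: Caffe's math wrappers select the BLAS backend at compile time, so the same `cblas_*` calls resolve to MKL or to a reference CBLAS depending on the build. A minimal sketch of that pattern, assuming the standard `mkl.h`/`cblas.h` headers (the `axpy` helper below is illustrative, not code from this commit):

// Sketch: -DUSE_MKL (from COMMON_FLAGS above) picks the BLAS header at build
// time; the cblas_* symbols then come from MKL or from e.g. OpenBLAS/ATLAS.
#ifdef USE_MKL
#include <mkl.h>     // Intel MKL exposes the CBLAS interface
#else
#include <cblas.h>   // reference CBLAS implementation
#endif

// y := alpha * x + y for contiguous vectors, dispatched at compile time.
void axpy(int n, float alpha, const float* x, float* y) {
  cblas_saxpy(n, alpha, x, 1, y, 1);
}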
4 changes: 2 additions & 2 deletions examples/finetune_flickr_style/readme.md
@@ -14,9 +14,9 @@ Let's fine-tune the BVLC-distributed CaffeNet model on a different dataset, [Fli
## Explanation

The Flickr-sourced images of the Style dataset are visually very similar to the ImageNet dataset, on which the `bvlc_reference_caffenet` was trained.
-Since that model works well for object category classification, we'd like to use it architecture for our style classifier.
+Since that model works well for object category classification, we'd like to use this architecture for our style classifier.
We also only have 80,000 images to train on, so we'd like to start with the parameters learned on the 1,000,000 ImageNet images, and fine-tune as needed.
-If we give provide the `weights` argument to the `caffe train` command, the pretrained weights will be loaded into our model, matching layers by name.
+If we provide the `weights` argument to the `caffe train` command, the pretrained weights will be loaded into our model, matching layers by name.

Because we are predicting 20 classes instead of 1,000, we do need to change the last layer in the model.
Therefore, we change the name of the last layer from `fc8` to `fc8_flickr` in our prototxt.
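To make "matching layers by name" concrete, here is a toy C++ sketch of the copy rule. It mirrors what Caffe's `Net::CopyTrainedLayersFrom` does conceptually; the function and the flat-vector weight representation below are illustrative, not Caffe's actual code:

#include <map>
#include <string>
#include <vector>

// Toy model of name-based weight transfer: each layer name maps to a flat
// weight vector. Only names present in both nets (with matching sizes) are
// copied; a renamed layer such as "fc8_flickr" is absent from the pretrained
// map, so it keeps its fresh initialization and is trained from scratch.
void CopyMatchingWeights(
    const std::map<std::string, std::vector<float>>& pretrained,
    std::map<std::string, std::vector<float>>* target) {
  for (auto& layer : *target) {
    auto src = pretrained.find(layer.first);
    if (src != pretrained.end() &&
        src->second.size() == layer.second.size()) {
      layer.second = src->second;  // reuse the ImageNet-learned parameters
    }
  }
}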
7 changes: 4 additions & 3 deletions examples/pascal-multilabel-with-datalayer.ipynb
@@ -452,8 +452,8 @@
}
],
"metadata": {
"description": "Multilabel classification on PASCAL using python data-layers.",
"example_name": "PASCAL Multilabel with python datalayer",
"description": "Multilabel classification on PASCAL VOC using a Python data layer.",
"example_name": "Multilabel Classification with Python Data Layer",
"include_in_docs": true,
"kernelspec": {
"display_name": "Python 2",
@@ -471,7 +471,8 @@
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
-}
+},
+"priority": 5
},
"nbformat": 4,
"nbformat_minor": 0
5 changes: 5 additions & 0 deletions include/caffe/common.hpp
@@ -153,6 +153,11 @@ class Caffe {
static void SetDevice(const int device_id);
// Prints the current GPU status.
static void DeviceQuery();
+// Check if specified device is available
+static bool CheckDevice(const int device_id);
+// Search from start_id to the highest possible device ordinal,
+// return the ordinal of the first available device.
+static int FindDevice(const int start_id = 0);
// Parallel training info
inline static int solver_count() { return Get().solver_count_; }
inline static void set_solver_count(int val) { Get().solver_count_ = val; }
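The two new declarations pair naturally with the existing `SetDevice` just above them. A hedged usage sketch (`SelectGpu` is an illustrative helper, not part of this commit; it assumes `FindDevice` signals failure with a negative return value):

#include "caffe/common.hpp"

// Fall back to the first available GPU when the requested ordinal is unusable.
void SelectGpu(int requested_id) {
  if (!caffe::Caffe::CheckDevice(requested_id)) {
    requested_id = caffe::Caffe::FindDevice();  // scan from ordinal 0
  }
  if (requested_id >= 0) {  // assumption: negative means no device was found
    caffe::Caffe::SetDevice(requested_id);
    caffe::Caffe::set_mode(caffe::Caffe::GPU);
  }
}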
67 changes: 67 additions & 0 deletions include/caffe/layers/crop_layer.hpp
@@ -0,0 +1,67 @@
#ifndef CAFFE_CROP_LAYER_HPP_
#define CAFFE_CROP_LAYER_HPP_

#include <utility>
#include <vector>

#include "caffe/blob.hpp"
#include "caffe/layer.hpp"
#include "caffe/proto/caffe.pb.h"

namespace caffe {

/**
* @brief Takes a Blob and crops it to the shape specified by the second input
* Blob, across all dimensions after the specified axis.
*
* TODO(dox): thorough documentation for Forward, Backward, and proto params.
*/

template <typename Dtype>
class CropLayer : public Layer<Dtype> {
public:
explicit CropLayer(const LayerParameter& param)
: Layer<Dtype>(param) {}
virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);

virtual inline const char* type() const { return "Crop"; }
virtual inline int ExactNumBottomBlobs() const { return 2; }
virtual inline int ExactNumTopBlobs() const { return 1; }

protected:
virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);

vector<int> offsets;

private:
void crop_copy(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top,
const vector<int>& offsets,
vector<int> indices,
int cur_dim,
const Dtype* src_data,
Dtype* dest_data,
bool is_forward);

void crop_copy_gpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top,
const vector<int>& offsets,
vector<int> indices,
int cur_dim,
const Dtype* src_data,
Dtype* dest_data,
bool is_forward);
};
} // namespace caffe

#endif // CAFFE_CROP_LAYER_HPP_
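As a reading aid for the shape rule in the class comment: dimensions before the crop axis are kept from `bottom[0]`, while dimensions from the axis onward are taken from the reference blob `bottom[1]`. A small sketch under those assumptions, using plain vectors instead of Blobs (not the layer's actual code):

#include <vector>

// Given the data shape, the reference shape, and the crop axis, return the
// cropped (top) shape: leading dims are kept, trailing dims come from the
// reference blob.
std::vector<int> CroppedShape(const std::vector<int>& bottom_shape,
                              const std::vector<int>& reference_shape,
                              size_t axis) {
  std::vector<int> top_shape(bottom_shape);
  for (size_t i = axis; i < top_shape.size(); ++i) {
    top_shape[i] = reference_shape[i];  // e.g., axis 2 crops H and W
  }
  return top_shape;
}

For example, with a bottom shape of {1, 3, 227, 227}, a reference shape of {1, 3, 224, 224}, and axis 2, this yields {1, 3, 224, 224}.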
155 changes: 155 additions & 0 deletions include/caffe/layers/lstm_layer.hpp
@@ -0,0 +1,155 @@
#ifndef CAFFE_LSTM_LAYER_HPP_
#define CAFFE_LSTM_LAYER_HPP_

#include <string>
#include <utility>
#include <vector>

#include "caffe/blob.hpp"
#include "caffe/common.hpp"
#include "caffe/layer.hpp"
#include "caffe/layers/recurrent_layer.hpp"
#include "caffe/net.hpp"
#include "caffe/proto/caffe.pb.h"

namespace caffe {

template <typename Dtype> class RecurrentLayer;

/**
* @brief Processes sequential inputs using a "Long Short-Term Memory" (LSTM)
* [1] style recurrent neural network (RNN). Implemented by unrolling
* the LSTM computation through time.
*
* The specific architecture used in this implementation is as described in
* "Learning to Execute" [2], reproduced below:
* i_t := \sigmoid[ W_{hi} * h_{t-1} + W_{xi} * x_t + b_i ]
* f_t := \sigmoid[ W_{hf} * h_{t-1} + W_{xf} * x_t + b_f ]
* o_t := \sigmoid[ W_{ho} * h_{t-1} + W_{xo} * x_t + b_o ]
* g_t := \tanh[ W_{hg} * h_{t-1} + W_{xg} * x_t + b_g ]
* c_t := (f_t .* c_{t-1}) + (i_t .* g_t)
* h_t := o_t .* \tanh[c_t]
* In the implementation, the i, f, o, and g computations are performed as a
* single inner product.
*
* Notably, this implementation lacks the "diagonal" gates, as used in the
* LSTM architectures described by Alex Graves [3] and others.
*
* [1] Hochreiter, Sepp, and Schmidhuber, Jürgen. "Long short-term memory."
* Neural Computation 9, no. 8 (1997): 1735-1780.
*
* [2] Zaremba, Wojciech, and Sutskever, Ilya. "Learning to execute."
* arXiv preprint arXiv:1410.4615 (2014).
*
* [3] Graves, Alex. "Generating sequences with recurrent neural networks."
* arXiv preprint arXiv:1308.0850 (2013).
*/
template <typename Dtype>
class LSTMLayer : public RecurrentLayer<Dtype> {
public:
explicit LSTMLayer(const LayerParameter& param)
: RecurrentLayer<Dtype>(param) {}

virtual inline const char* type() const { return "LSTM"; }

protected:
virtual void FillUnrolledNet(NetParameter* net_param) const;
virtual void RecurrentInputBlobNames(vector<string>* names) const;
virtual void RecurrentOutputBlobNames(vector<string>* names) const;
virtual void RecurrentInputShapes(vector<BlobShape>* shapes) const;
virtual void OutputBlobNames(vector<string>* names) const;
};

/**
* @brief A helper for LSTMLayer: computes a single timestep of the
* non-linearity of the LSTM, producing the updated cell and hidden
* states.
*/
template <typename Dtype>
class LSTMUnitLayer : public Layer<Dtype> {
public:
explicit LSTMUnitLayer(const LayerParameter& param)
: Layer<Dtype>(param) {}
virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);

virtual inline const char* type() const { return "LSTMUnit"; }
virtual inline int ExactNumBottomBlobs() const { return 3; }
virtual inline int ExactNumTopBlobs() const { return 2; }

virtual inline bool AllowForceBackward(const int bottom_index) const {
// Can't propagate to sequence continuation indicators.
return bottom_index != 2;
}

protected:
/**
* @param bottom input Blob vector (length 3)
* -# @f$ (1 \times N \times D) @f$
* the previous timestep cell state @f$ c_{t-1} @f$
* -# @f$ (1 \times N \times 4D) @f$
* the "gate inputs" @f$ [i_t', f_t', o_t', g_t'] @f$
* -# @f$ (1 \times N) @f$
* the sequence continuation indicators @f$ \delta_t @f$
* @param top output Blob vector (length 2)
* -# @f$ (1 \times N \times D) @f$
* the updated cell state @f$ c_t @f$, computed as:
* i_t := \sigmoid[i_t']
* f_t := \sigmoid[f_t']
* o_t := \sigmoid[o_t']
* g_t := \tanh[g_t']
* c_t := cont_t * (f_t .* c_{t-1}) + (i_t .* g_t)
* -# @f$ (1 \times N \times D) @f$
* the updated hidden state @f$ h_t @f$, computed as:
* h_t := o_t .* \tanh[c_t]
*/
virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);

/**
* @brief Computes the error gradient w.r.t. the LSTMUnit inputs.
*
* @param top output Blob vector (length 2), providing the error gradient with
* respect to the outputs
* -# @f$ (1 \times N \times D) @f$:
* containing error gradients @f$ \frac{\partial E}{\partial c_t} @f$
* with respect to the updated cell state @f$ c_t @f$
* -# @f$ (1 \times N \times D) @f$:
* containing error gradients @f$ \frac{\partial E}{\partial h_t} @f$
with respect to the updated hidden state @f$ h_t @f$
* @param propagate_down see Layer::Backward.
* @param bottom input Blob vector (length 3), into which the error gradients
* with respect to the LSTMUnit inputs @f$ c_{t-1} @f$ and the gate
inputs are computed. Computation of the error gradients w.r.t.
* the sequence indicators is not implemented.
* -# @f$ (1 \times N \times D) @f$
* the error gradient w.r.t. the previous timestep cell state
* @f$ c_{t-1} @f$
* -# @f$ (1 \times N \times 4D) @f$
* the error gradient w.r.t. the "gate inputs"
* @f$ [
* \frac{\partial E}{\partial i_t}
* \frac{\partial E}{\partial f_t}
* \frac{\partial E}{\partial o_t}
* \frac{\partial E}{\partial g_t}
* ] @f$
* -# @f$ (1 \times 1 \times N) @f$
* the gradient w.r.t. the sequence continuation indicators
* @f$ \delta_t @f$ is currently not computed.
*/
virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);

/// @brief The hidden and output dimension.
int hidden_dim_;
Blob<Dtype> X_acts_;
};

} // namespace caffe

#endif // CAFFE_LSTM_LAYER_HPP_
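To connect the LSTMUnit equations above to running code, here is a scalar sketch of one timestep for a single sequence element, with the gate inputs packed as [i', f', o', g'] exactly as in the (1 x N x 4D) bottom blob. This is a sketch of the math only, not Caffe's blob-based implementation:

#include <cmath>
#include <vector>

// One LSTMUnit timestep for a single element (N = 1), following the header
// doc: c_t := cont * (f .* c_{t-1}) + (i .* g), h_t := o .* tanh(c_t).
void LstmUnitStep(const std::vector<float>& c_prev,  // c_{t-1}, size D
                  const std::vector<float>& gates,   // [i', f', o', g'], 4D
                  float cont,                        // sequence indicator
                  std::vector<float>* c, std::vector<float>* h) {
  const size_t D = c_prev.size();
  c->resize(D);
  h->resize(D);
  for (size_t d = 0; d < D; ++d) {
    const float i = 1.0f / (1.0f + std::exp(-gates[d]));          // i_t
    const float f = 1.0f / (1.0f + std::exp(-gates[D + d]));      // f_t
    const float o = 1.0f / (1.0f + std::exp(-gates[2 * D + d]));  // o_t
    const float g = std::tanh(gates[3 * D + d]);                  // g_t
    (*c)[d] = cont * f * c_prev[d] + i * g;  // cell update, gated by cont
    (*h)[d] = o * std::tanh((*c)[d]);        // hidden state output
  }
}

Setting cont to 0 at the start of a sequence zeroes the carried-over cell contribution, which is why gradients cannot propagate to the continuation indicators.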