Commit 5681161: modify conv.cu
1 parent: ce792ce

File tree: 4 files changed, +332 -92 lines changed


include/caffe/layers/base_conv_layer.hpp (+19 -10)
@@ -7,7 +7,7 @@
 #include "caffe/layer.hpp"
 #include "caffe/proto/caffe.pb.h"
 #include "caffe/util/im2col.hpp"
-
+#include "time.h"
 namespace caffe {

 /**
@@ -28,7 +28,7 @@ class BaseConvolutionLayer : public Layer<Dtype> {
   virtual inline int MinTopBlobs() const { return 1; }
   virtual inline bool EqualNumBottomTopBlobs() const { return true; }

- protected:
+ //protected:
   // Helper functions that abstract away the column buffer and gemm arguments.
   // The last argument in forward_cpu_gemm is so that we can skip the im2col if
   // we just called weight_cpu_gemm with the same input.
@@ -40,12 +40,16 @@ class BaseConvolutionLayer : public Layer<Dtype> {
   void weight_cpu_gemm(const Dtype* input, const Dtype* output, Dtype*
       weights);
   void backward_cpu_bias(Dtype* bias, const Dtype* input);
-
+  /*virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void forward_gpu_gemm_mask(const Dtype* col_input, const Dtype* weights,
+      Dtype* output, const Dtype* mask_input, bool skip_im2col = false);*/
 #ifndef CPU_ONLY
   void forward_gpu_gemm(const Dtype* col_input, const Dtype* weights,
       Dtype* output, bool skip_im2col = false);
-  void forward_gpu_gemm_mask(const Dtype* col_input, const Dtype* weights,
-      Dtype* output, const Dtype* mask_input, bool skip_im2col = false);
+
   void forward_gpu_bias(Dtype* output, const Dtype* bias);
   void backward_gpu_gemm(const Dtype* input, const Dtype* weights,
       Dtype* col_output);
@@ -79,6 +83,10 @@ class BaseConvolutionLayer : public Layer<Dtype> {
   Blob<int> pass_idx_;
   Blob<Dtype> buffer_col_;
   Blob<Dtype> output_buffer_;
+  Blob<int> src_index_;
+  Blob<int> dst_index_;
+  Blob<int> src_fin_index_;
+  Blob<int> dst_fin_index_;
   vector<int> col_buffer_shape_;
   vector<int> col_buffer_shape_mask_;
   /// @brief The spatial dimensions of the output.
@@ -87,12 +95,13 @@ class BaseConvolutionLayer : public Layer<Dtype> {
   vector<int> output_shape_mask_;
   const vector<int>* bottom_shape_;
   const vector<int>* bottom_mask_shape_;
+
   int num_spatial_axes_;
   int bottom_dim_;
   int bottom_dim_mask_;
   int top_dim_;
   int top_dim_mask_;
-
+  int output_offset_;
   int channel_axis_;
   int num_;
   int channels_;
@@ -104,7 +113,7 @@ class BaseConvolutionLayer : public Layer<Dtype> {
   bool is_1x1_;
   bool force_nd_im2col_;

- private:
+ //private:
   // wrap im2col/col2im so we don't have to remember the (long) argument lists
   inline void conv_im2col_cpu(const Dtype* data, Dtype* col_buff) {
     if (!force_nd_im2col_ && num_spatial_axes_ == 2) {
@@ -169,13 +178,13 @@ class BaseConvolutionLayer : public Layer<Dtype> {

   int num_kernels_im2col_;
   int num_kernels_col2im_;
-  int conv_out_channels_;
+
   int conv_in_channels_;
   int conv_out_spatial_dim_;
   int conv_out_spatial_dim_mask_;
-  int kernel_dim_;
+  int conv_out_channels_;
   int col_offset_;
-  int output_offset_;
+  int kernel_dim_;
   int col_offset_mask_;
   int output_offset_mask_;
   Blob<Dtype> col_buffer_;
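In summary for this header: the protected: and private: access specifiers are commented out (which leaves the helper functions and buffers publicly reachable, presumably so the masked path can use them once its implementation moves out of this class), the forward_gpu_gemm_mask declaration is removed from the base class (a commented-out block keeps the old signatures for reference, and the method reappears on ConvolutionLayer below), and four Blob<int> members (src_index_, dst_index_, src_fin_index_, dst_fin_index_) are added, presumably to hold gather/scatter offsets for the mask path on the GPU. The mask and these index buffers are sized from the convolution's output geometry; a minimal sketch of that computation (the same formula used in forward_gpu_gemm_mask further down; the helper name is illustrative, not from the repo):

// Hypothetical helper: output spatial extent along one axis, matching the
// height_col / width_col computation in forward_gpu_gemm_mask.
inline int conv_out_extent(int in_size, int kernel, int pad, int stride,
                           int dilation) {
  return (in_size + 2 * pad - (dilation * (kernel - 1) + 1)) / stride + 1;
}
// height_col = conv_out_extent(height, kernel_h, pad_h, stride_h, dilation_h);
// width_col  = conv_out_extent(width,  kernel_w, pad_w, stride_w, dilation_w);
// The mask then has height_col * width_col entries, and the index blobs hold
// up to validnum * kernel_dim_ offsets, where validnum counts the nonzero
// mask entries.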

include/caffe/layers/conv_layer.hpp (+4 -1)
@@ -8,7 +8,7 @@
 #include "caffe/proto/caffe.pb.h"

 #include "caffe/layers/base_conv_layer.hpp"
-
+#include "windows.h"
 namespace caffe {

 /**
@@ -67,6 +67,7 @@ class ConvolutionLayer : public BaseConvolutionLayer<Dtype> {
   virtual inline const char* type() const { return "Convolution"; }

 protected:
+
   virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);
   virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
@@ -76,6 +77,8 @@ class ConvolutionLayer : public BaseConvolutionLayer<Dtype> {
   virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
   virtual inline bool reverse_dimensions() { return false; }
+  void forward_gpu_gemm_mask(const Dtype* col_input, const Dtype* weights,
+      Dtype* output, const Dtype* mask_input, bool skip_im2col = false);
   virtual void compute_output_shape();
 };

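Two things stand out in this header: forward_gpu_gemm_mask is now declared (non-virtual) on ConvolutionLayer itself, which fits the commit message, since its implementation can then live in the layer's own conv.cu rather than in base_conv_layer.cpp; and the new #include "windows.h" ties the header to Windows, presumably only for timing helpers. If the include is really needed, a guarded form keeps other platforms building (a sketch, not part of the commit):

// Sketch: guard the Windows-only include so non-Windows builds still compile.
#ifdef _WIN32
#include <windows.h>
#endif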

src/caffe/layers/base_conv_layer.cpp (+136 -78)
@@ -8,6 +8,16 @@

 namespace caffe {

+
+//template <typename Dtype>
+//__global__ void MaskCopy(const int n, const Dtype* in, int *src, Dtype* out,int *dst) {
+//  CUDA_KERNEL_LOOP(index, n) {
+//    out[dst[index]] = in[src[index]];
+//  }
+//}
+
+
+
 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
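The commented-out MaskCopy kernel is a plain gather/scatter copy driven by two index arrays: element i is read from in[src[i]] and written to out[dst[i]]. A self-contained sketch of the same pattern, with Caffe's CUDA_KERNEL_LOOP / CAFFE_GET_BLOCKS / CAFFE_CUDA_NUM_THREADS macros expanded by hand (the 512-thread block size is an assumption, not taken from this repo):

#include <cuda_runtime.h>

// Grid-stride gather/scatter copy: out[dst[i]] = in[src[i]] for i in [0, n).
template <typename Dtype>
__global__ void mask_copy(const int n, const Dtype* in, const int* src,
                          Dtype* out, const int* dst) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += blockDim.x * gridDim.x) {
    out[dst[i]] = in[src[i]];
  }
}

// Example launch for n index pairs; all five pointers must be device memory:
//   mask_copy<float><<<(n + 511) / 512, 512>>>(n, d_in, d_src, d_out, d_dst);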
@@ -377,85 +387,133 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_gemm(const Dtype* input,
         (Dtype)0., output + output_offset_ * g);
   }
 }
-
-template <typename Dtype>
-void BaseConvolutionLayer<Dtype>::forward_gpu_gemm_mask(const Dtype* input,
-    const Dtype* weights, Dtype* output, const Dtype* mask_input,bool skip_im2col) {
-  //const Dtype* col_buff = input;
-  //const Dtype* col_buff_mask = mask_input;
-  const int height = conv_input_shape_.cpu_data()[1];
-  const int width = conv_input_shape_.cpu_data()[2];
-  const int kernel_h = kernel_shape_.cpu_data()[0];
-  const int kernel_w = kernel_shape_.cpu_data()[1];
-  const int pad_h = pad_.cpu_data()[0];
-  const int pad_w = pad_.cpu_data()[1];
-  const int stride_h = stride_.cpu_data()[0];
-  const int stride_w = stride_.cpu_data()[1];
-  const int dilation_h = dilation_.cpu_data()[0];
-  const int dilation_w = dilation_.cpu_data()[1];
-  int height_col = (height + 2 * pad_h -
-      (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
-  int width_col = (width + 2 * pad_w -
-      (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
-  int skip_num = height_col*width_col;
-  int validnum = caffe_cpu_asum(height_col*width_col, mask_input);
-  pass_idx_.Reshape(validnum, 1, 1, 1);
-  buffer_col_.Reshape(kernel_dim_, validnum, 1, 1);
-  Dtype* buffer_col_data = buffer_col_.mutable_cpu_data();
-  //Dtype* buffer_col_data = buffer_col_.mutable_cpu_data();
-  output_buffer_.Reshape(conv_out_channels_, validnum, 1, 1);
-  int idx = 0;
-  //if (!is_1x1_) {;
-  if (1){
-    //if (!skip_im2col) {
-    if (1){
-      conv_im2col_gpu(input, col_buffer_.mutable_gpu_data()); //here 11111
-    }
-    // LOG(INFO) << "Debuf here:finish im2col\n";
-    const Dtype* col_buff = col_buffer_.cpu_data();
-    // col_buff = col_buffer_.cpu_data();
-    //generate new trans respond to mask 1
-    for (int h = 0; h < height_col; h++){
-      for (int w = 0; w < width_col; w++){
-        if (mask_input[h*width_col + w] >= 1)
-        {
-          for (int temp = 0; temp < kernel_dim_; temp++){
-            buffer_col_data[temp*validnum + idx] = col_buff[temp*height_col*width_col + h*width_col + w];
-          }
-          idx += 1;
-        }
-      }
-    }
-
-  }
-  //Dtype* output_buffer_data = output_buffer_.mutable_gpu_data();
-  //const Dtype* buffer_col_data_com = buffer_col_.gpu_data();
-  Dtype* output_buffer_data = output_buffer_.mutable_gpu_data();
-  const Dtype* buffer_col_data_com = buffer_col_.gpu_data();
-  for (int g = 0; g < group_; ++g) {
-    caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, conv_out_channels_ /
-        group_, validnum, kernel_dim_,
-        (Dtype)1., weights + weight_offset_ * g, buffer_col_data_com + col_offset_ * g,
-        (Dtype)0., output_buffer_data + conv_out_channels_* validnum / group_* g); //here11111
-  }
-  // LOG(INFO) << "Debuf here:finish gpu_gemm\n";
-  //generate new output for mask 0
-  caffe_set(output_offset_, Dtype(0), output);
-  idx = 0;
+//
+//template <typename Dtype>
+//void BaseConvolutionLayer<Dtype>::forward_gpu_gemm_mask(const Dtype* input,
+//    const Dtype* weights, Dtype* output, const Dtype* mask_input,bool skip_im2col) {
+//  //const Dtype* col_buff = input;
+//  //const Dtype* col_buff_mask = mask_input;
+//  clock_t start, end,all_start,all_end;
+//  double dur;
+//  start = clock();
+//  all_start = clock();
+//  const int height = conv_input_shape_.cpu_data()[1];
+//  const int width = conv_input_shape_.cpu_data()[2];
+//  const int kernel_h = kernel_shape_.cpu_data()[0];
+//  const int kernel_w = kernel_shape_.cpu_data()[1];
+//  const int pad_h = pad_.cpu_data()[0];
+//  const int pad_w = pad_.cpu_data()[1];
+//  const int stride_h = stride_.cpu_data()[0];
+//  const int stride_w = stride_.cpu_data()[1];
+//  const int dilation_h = dilation_.cpu_data()[0];
+//  const int dilation_w = dilation_.cpu_data()[1];
+//  int height_col = (height + 2 * pad_h -
+//      (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
+//  int width_col = (width + 2 * pad_w -
+//      (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
+//  int validnum = caffe_cpu_asum(height_col*width_col, mask_input);
+//  int* src_index = new int[validnum*kernel_dim_];
+//  int* dst_index = new int[validnum*kernel_dim_];
+//  int cnt = 0;
+//  pass_idx_.Reshape(validnum, 1, 1, 1);
+//  buffer_col_.Reshape(kernel_dim_, validnum, 1, 1);
+//  Dtype* buffer_col_data = buffer_col_.mutable_gpu_data();
+//  //Dtype* buffer_col_data = buffer_col_.mutable_cpu_data();
+//  output_buffer_.Reshape(conv_out_channels_, validnum, 1, 1);
+//  end = clock();
+//  dur = (double)(end - start);
+//  LOG(INFO) << "the base_conv_layer before im2col using time:" << dur / CLOCKS_PER_SEC;
+//  start = clock();
+//  int idx = 0;
+//  //if (!is_1x1_) {;
+//  if (1){
+//    //if (!skip_im2col) {
+//    if (1){
+//      conv_im2col_gpu(input, col_buffer_.mutable_gpu_data()); //here 11111
+//    }
+//    // LOG(INFO) << "Debuf here:finish im2col\n";
+//    end = clock();
+//
+//    const Dtype* col_buff = col_buffer_.gpu_data();
+//
+//    dur = (double)(end - start);
+//    LOG(INFO) << "the base_conv_layer im2col using time:" << dur / CLOCKS_PER_SEC;
+//    google::FlushLogFiles(google::INFO);
+//    // col_buff = col_buffer_.cpu_data();
+//    //generate new trans respond to mask 1
+//    LOG(INFO) << "shape_0:" << col_buffer_.shape(0);
+//    LOG(INFO) << "shape_1:" << col_buffer_.shape(1);
+//    LOG(INFO) << "shape_2:" << col_buffer_.shape(2);
+//    google::FlushLogFiles(google::INFO);
+//    start = clock();
+//    for (int h = 0; h < height_col; h++){
+//      for (int w = 0; w < width_col; w++){
+//        if (mask_input[h*width_col + w] >= 1)
+//        {
+//          for (int temp = 0; temp < kernel_dim_; temp++){
+//            src_index[cnt] = temp*height_col*width_col + h*width_col + w;
+//            dst_index[cnt] = temp*validnum + idx;
+//            cnt++;
+//            // buffer_col_data[temp*validnum + idx] = col_buff[temp*height_col*width_col + h*width_col + w];
+//            //LOG(INFO) << "index:" << temp*height_col*width_col + h*width_col + w;
+//            //google::FlushLogFiles(google::INFO);
+//          }
+//          idx += 1;
+//        }
+//      }
+//    }
+//    MaskCopy<Dtype> << <CAFFE_GET_BLOCKS(validnum*kernel_dim_), CAFFE_CUDA_NUM_THREADS >> >(
+//      validnum*kernel_dim_, col_buff, src_index, buffer_col_data, dst_index);
+//    CUDA_POST_KERNEL_CHECK;
+//
+//    end = clock();
+//    dur = (double)(end - start);
+//    LOG(INFO) << "the base_conv_layer 418-429 using time:" << dur / CLOCKS_PER_SEC;
+//    google::FlushLogFiles(google::INFO);
+//  }
+//  //Dtype* output_buffer_data = output_buffer_.mutable_gpu_data();
+//  //const Dtype* buffer_col_data_com = buffer_col_.gpu_data();
+//  start = clock();
+//  Dtype* output_buffer_data = output_buffer_.mutable_gpu_data();
+//  const Dtype* buffer_col_data_com = buffer_col_.gpu_data();
+//
+//  for (int g = 0; g < group_; ++g) {
+//    caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, conv_out_channels_ /
+//        group_, validnum, kernel_dim_,
+//        (Dtype)1., weights + weight_offset_ * g, buffer_col_data_com + col_offset_ * g,
+//        (Dtype)0., output_buffer_data + conv_out_channels_* validnum / group_* g); //here11111
+//  }
+//  end = clock();
+//  dur = (double)(end - start);
+//  LOG(INFO) << "the base_conv_layer 435-440 using time:" << dur / CLOCKS_PER_SEC;
+//  google::FlushLogFiles(google::INFO);
+//// LOG(INFO) << "Debuf here:finish gpu_gemm\n";
+//  //generate new output for mask 0
+//  start = clock();
+//  caffe_set(output_offset_, Dtype(0), output);
+//  idx = 0;
+//// const Dtype* output_buffer_data_fin = output_buffer_.cpu_data();
 // const Dtype* output_buffer_data_fin = output_buffer_.cpu_data();
-  const Dtype* output_buffer_data_fin = output_buffer_.cpu_data();
-  for (int h = 0; h < height_col; h++){
-    for (int w = 0; w < width_col; w++){
-      if (mask_input[h*width_col + w] >= 1)
-      {
-        for (int temp = 0; temp < conv_out_channels_; temp++){
-          output[temp*height_col*width_col + h*width_col + w] = output_buffer_data_fin[temp*validnum + idx];
-        }
-        idx += 1;
-      }
-    }
-  }
-}
+//
+//  for (int h = 0; h < height_col; h++){
+//    for (int w = 0; w < width_col; w++){
+//      if (mask_input[h*width_col + w] >= 1)
+//      {
+//        for (int temp = 0; temp < conv_out_channels_; temp++){
+//          output[temp*height_col*width_col + h*width_col + w] = output_buffer_data_fin[temp*validnum + idx];
+//        }
+//        idx += 1;
+//      }
+//    }
+//  }
+//  end = clock();
+//  dur = (double)(end - start);
+//  LOG(INFO) << "the base_conv_layer 446-457 using time:" << dur / CLOCKS_PER_SEC;
+//  google::FlushLogFiles(google::INFO);
+//  all_end = clock();
+//  dur = (double)(all_end - all_start);
+//  LOG(INFO) << "the gemm_mask inner using time:" << dur / CLOCKS_PER_SEC;
+//}

 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::forward_gpu_bias(Dtype* output,
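Read end to end, the masked forward pass that this hunk deletes (and keeps only as an instrumented, commented-out draft; the live implementation presumably moves to conv.cu per the commit title) does the following: run im2col over the full input, count the valid output positions with caffe_cpu_asum on the mask, gather the matching im2col columns into a dense kernel_dim_ x validnum buffer, run the GEMM only over those columns, and scatter the results back into the full-size output, with masked positions zero-filled by caffe_set. One caveat visible in the commented draft: src_index and dst_index are allocated with host-side new int[...] but passed straight to the MaskCopy kernel, so they would have to be staged in device memory first (presumably the purpose of the new src_index_ / dst_index_ blobs in the header). A minimal single-group CPU sketch of the overall flow, with illustrative names rather than actual Caffe calls:

#include <vector>

// Sketch only: assumes group_ == 1, a row-major binary mask with
// height_col * width_col entries, and float data.
void forward_gemm_masked_sketch(
    const std::vector<float>& col_buff,  // kernel_dim x spatial (im2col output)
    const std::vector<float>& weights,   // out_channels x kernel_dim
    const std::vector<float>& mask,      // spatial entries, 0 or 1
    std::vector<float>& output,          // out_channels x spatial (result)
    int kernel_dim, int out_channels, int spatial) {
  // 1. Collect the valid output positions (the role of caffe_cpu_asum plus the
  //    src/dst index construction in the original code).
  std::vector<int> keep;
  for (int s = 0; s < spatial; ++s) {
    if (mask[s] >= 1.f) keep.push_back(s);
  }
  const int valid = static_cast<int>(keep.size());

  // 2. Gather the kept im2col columns into a dense kernel_dim x valid buffer
  //    (buffer_col_ / MaskCopy in the original).
  std::vector<float> buffer_col(static_cast<size_t>(kernel_dim) * valid);
  for (int k = 0; k < kernel_dim; ++k) {
    for (int v = 0; v < valid; ++v) {
      buffer_col[k * valid + v] = col_buff[k * spatial + keep[v]];
    }
  }

  // 3. GEMM over the reduced buffer: (out_channels x kernel_dim) times
  //    (kernel_dim x valid), done by caffe_gpu_gemm in the original.
  std::vector<float> out_buf(static_cast<size_t>(out_channels) * valid, 0.f);
  for (int c = 0; c < out_channels; ++c) {
    for (int k = 0; k < kernel_dim; ++k) {
      for (int v = 0; v < valid; ++v) {
        out_buf[c * valid + v] +=
            weights[c * kernel_dim + k] * buffer_col[k * valid + v];
      }
    }
  }

  // 4. Scatter back; masked-out positions stay zero (caffe_set in the original).
  output.assign(static_cast<size_t>(out_channels) * spatial, 0.f);
  for (int c = 0; c < out_channels; ++c) {
    for (int v = 0; v < valid; ++v) {
      output[c * spatial + keep[v]] = out_buf[c * valid + v];
    }
  }
}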
