Commit 5681161: modify conv.cu
1 parent: ce792ce

File tree: 4 files changed, +332 -92 lines changed


include/caffe/layers/base_conv_layer.hpp (+19 -10)
@@ -7,7 +7,7 @@
 #include "caffe/layer.hpp"
 #include "caffe/proto/caffe.pb.h"
 #include "caffe/util/im2col.hpp"
-
+#include "time.h"
 namespace caffe {

 /**
@@ -28,7 +28,7 @@ class BaseConvolutionLayer : public Layer<Dtype> {
   virtual inline int MinTopBlobs() const { return 1; }
   virtual inline bool EqualNumBottomTopBlobs() const { return true; }

- protected:
+ //protected:
   // Helper functions that abstract away the column buffer and gemm arguments.
   // The last argument in forward_cpu_gemm is so that we can skip the im2col if
   // we just called weight_cpu_gemm with the same input.
@@ -40,12 +40,16 @@ class BaseConvolutionLayer : public Layer<Dtype> {
   void weight_cpu_gemm(const Dtype* input, const Dtype* output, Dtype*
       weights);
   void backward_cpu_bias(Dtype* bias, const Dtype* input);
-
+  /*virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void forward_gpu_gemm_mask(const Dtype* col_input, const Dtype* weights,
+      Dtype* output, const Dtype* mask_input, bool skip_im2col = false);*/
 #ifndef CPU_ONLY
   void forward_gpu_gemm(const Dtype* col_input, const Dtype* weights,
       Dtype* output, bool skip_im2col = false);
-  void forward_gpu_gemm_mask(const Dtype* col_input, const Dtype* weights,
-      Dtype* output, const Dtype* mask_input, bool skip_im2col = false);
+
   void forward_gpu_bias(Dtype* output, const Dtype* bias);
   void backward_gpu_gemm(const Dtype* input, const Dtype* weights,
       Dtype* col_output);
@@ -79,6 +83,10 @@ class BaseConvolutionLayer : public Layer<Dtype> {
   Blob<int> pass_idx_;
   Blob<Dtype> buffer_col_;
   Blob<Dtype> output_buffer_;
+  Blob<int> src_index_;
+  Blob<int> dst_index_;
+  Blob<int> src_fin_index_;
+  Blob<int> dst_fin_index_;
   vector<int> col_buffer_shape_;
   vector<int> col_buffer_shape_mask_;
   /// @brief The spatial dimensions of the output.
@@ -87,12 +95,13 @@ class BaseConvolutionLayer : public Layer<Dtype> {
   vector<int> output_shape_mask_;
   const vector<int>* bottom_shape_;
   const vector<int>* bottom_mask_shape_;
+
   int num_spatial_axes_;
   int bottom_dim_;
   int bottom_dim_mask_;
   int top_dim_;
   int top_dim_mask_;
-
+  int output_offset_;
   int channel_axis_;
   int num_;
   int channels_;
@@ -104,7 +113,7 @@ class BaseConvolutionLayer : public Layer<Dtype> {
   bool is_1x1_;
   bool force_nd_im2col_;

- private:
+ //private:
   // wrap im2col/col2im so we don't have to remember the (long) argument lists
   inline void conv_im2col_cpu(const Dtype* data, Dtype* col_buff) {
     if (!force_nd_im2col_ && num_spatial_axes_ == 2) {
@@ -169,13 +178,13 @@ class BaseConvolutionLayer : public Layer<Dtype> {

   int num_kernels_im2col_;
   int num_kernels_col2im_;
-  int conv_out_channels_;
+
   int conv_in_channels_;
   int conv_out_spatial_dim_;
   int conv_out_spatial_dim_mask_;
-  int kernel_dim_;
+  int conv_out_channels_;
   int col_offset_;
-  int output_offset_;
+  int kernel_dim_;
   int col_offset_mask_;
   int output_offset_mask_;
   Blob<Dtype> col_buffer_;
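In summary for this header: the protected: and private: access specifiers are commented out (which leaves the helper functions and buffers publicly reachable, presumably so the masked path can use them once its implementation moves out of this class), the forward_gpu_gemm_mask declaration is removed from the base class (a commented-out block keeps the old signatures for reference, and the method reappears on ConvolutionLayer below), and four Blob<int> members (src_index_, dst_index_, src_fin_index_, dst_fin_index_) are added, presumably to hold gather/scatter offsets for the mask path on the GPU. The mask and these index buffers are sized from the convolution's output geometry; a minimal sketch of that computation (the same formula used in forward_gpu_gemm_mask further down; the helper name is illustrative, not from the repo):

// Hypothetical helper: output spatial extent along one axis, matching the
// height_col / width_col computation in forward_gpu_gemm_mask.
inline int conv_out_extent(int in_size, int kernel, int pad, int stride,
                           int dilation) {
  return (in_size + 2 * pad - (dilation * (kernel - 1) + 1)) / stride + 1;
}
// height_col = conv_out_extent(height, kernel_h, pad_h, stride_h, dilation_h);
// width_col  = conv_out_extent(width,  kernel_w, pad_w, stride_w, dilation_w);
// The mask then has height_col * width_col entries, and the index blobs hold
// up to validnum * kernel_dim_ offsets, where validnum counts the nonzero
// mask entries.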

include/caffe/layers/conv_layer.hpp (+4 -1)
@@ -8,7 +8,7 @@
 #include "caffe/proto/caffe.pb.h"

 #include "caffe/layers/base_conv_layer.hpp"
-
+#include "windows.h"
 namespace caffe {

 /**
@@ -67,6 +67,7 @@ class ConvolutionLayer : public BaseConvolutionLayer<Dtype> {
   virtual inline const char* type() const { return "Convolution"; }

 protected:
+
   virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);
   virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
@@ -76,6 +77,8 @@ class ConvolutionLayer : public BaseConvolutionLayer<Dtype> {
   virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
   virtual inline bool reverse_dimensions() { return false; }
+  void forward_gpu_gemm_mask(const Dtype* col_input, const Dtype* weights,
+      Dtype* output, const Dtype* mask_input, bool skip_im2col = false);
   virtual void compute_output_shape();
 };

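Two things stand out in this header: forward_gpu_gemm_mask is now declared (non-virtual) on ConvolutionLayer itself, which fits the commit message, since its implementation can then live in the layer's own conv.cu rather than in base_conv_layer.cpp; and the new #include "windows.h" ties the header to Windows, presumably only for timing helpers. If the include is really needed, a guarded form keeps other platforms building (a sketch, not part of the commit):

// Sketch: guard the Windows-only include so non-Windows builds still compile.
#ifdef _WIN32
#include <windows.h>
#endif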

src/caffe/layers/base_conv_layer.cpp (+136 -78)
@@ -8,6 +8,16 @@

 namespace caffe {

+
+//template <typename Dtype>
+//__global__ void MaskCopy(const int n, const Dtype* in, int *src, Dtype* out,int *dst) {
+//  CUDA_KERNEL_LOOP(index, n) {
+//    out[dst[index]] = in[src[index]];
+//  }
+//}
+
+
+
 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
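The commented-out MaskCopy kernel is a plain gather/scatter copy driven by two index arrays: element i is read from in[src[i]] and written to out[dst[i]]. A self-contained sketch of the same pattern, with Caffe's CUDA_KERNEL_LOOP / CAFFE_GET_BLOCKS / CAFFE_CUDA_NUM_THREADS macros expanded by hand (the 512-thread block size is an assumption, not taken from this repo):

#include <cuda_runtime.h>

// Grid-stride gather/scatter copy: out[dst[i]] = in[src[i]] for i in [0, n).
template <typename Dtype>
__global__ void mask_copy(const int n, const Dtype* in, const int* src,
                          Dtype* out, const int* dst) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += blockDim.x * gridDim.x) {
    out[dst[i]] = in[src[i]];
  }
}

// Example launch for n index pairs; all five pointers must be device memory:
//   mask_copy<float><<<(n + 511) / 512, 512>>>(n, d_in, d_src, d_out, d_dst);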
@@ -377,85 +387,133 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_gemm(const Dtype* input,
         (Dtype)0., output + output_offset_ * g);
   }
 }
-
-template <typename Dtype>
-void BaseConvolutionLayer<Dtype>::forward_gpu_gemm_mask(const Dtype* input,
-    const Dtype* weights, Dtype* output, const Dtype* mask_input,bool skip_im2col) {
-  //const Dtype* col_buff = input;
-  //const Dtype* col_buff_mask = mask_input;
-  const int height = conv_input_shape_.cpu_data()[1];
-  const int width = conv_input_shape_.cpu_data()[2];
-  const int kernel_h = kernel_shape_.cpu_data()[0];
-  const int kernel_w = kernel_shape_.cpu_data()[1];
-  const int pad_h = pad_.cpu_data()[0];
-  const int pad_w = pad_.cpu_data()[1];
-  const int stride_h = stride_.cpu_data()[0];
-  const int stride_w = stride_.cpu_data()[1];
-  const int dilation_h = dilation_.cpu_data()[0];
-  const int dilation_w = dilation_.cpu_data()[1];
-  int height_col = (height + 2 * pad_h -
-      (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
-  int width_col = (width + 2 * pad_w -
-      (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
-  int skip_num = height_col*width_col;
-  int validnum = caffe_cpu_asum(height_col*width_col, mask_input);
-  pass_idx_.Reshape(validnum, 1, 1, 1);
-  buffer_col_.Reshape(kernel_dim_, validnum, 1, 1);
-  Dtype* buffer_col_data = buffer_col_.mutable_cpu_data();
-  //Dtype* buffer_col_data = buffer_col_.mutable_cpu_data();
-  output_buffer_.Reshape(conv_out_channels_, validnum, 1, 1);
-  int idx = 0;
-  //if (!is_1x1_) {;
-  if (1){
-    //if (!skip_im2col) {
-    if (1){
-      conv_im2col_gpu(input, col_buffer_.mutable_gpu_data()); //here 11111
-    }
-    // LOG(INFO) << "Debuf here:finish im2col\n";
-    const Dtype* col_buff = col_buffer_.cpu_data();
-    // col_buff = col_buffer_.cpu_data();
-    //generate new trans respond to mask 1
-    for (int h = 0; h < height_col; h++){
-      for (int w = 0; w < width_col; w++){
-        if (mask_input[h*width_col + w] >= 1)
-        {
-          for (int temp = 0; temp < kernel_dim_; temp++){
-            buffer_col_data[temp*validnum + idx] = col_buff[temp*height_col*width_col + h*width_col + w];
-          }
-          idx += 1;
-        }
-      }
-    }
-
-  }
-  //Dtype* output_buffer_data = output_buffer_.mutable_gpu_data();
-  //const Dtype* buffer_col_data_com = buffer_col_.gpu_data();
-  Dtype* output_buffer_data = output_buffer_.mutable_gpu_data();
-  const Dtype* buffer_col_data_com = buffer_col_.gpu_data();
-  for (int g = 0; g < group_; ++g) {
-    caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, conv_out_channels_ /
-        group_, validnum, kernel_dim_,
-        (Dtype)1., weights + weight_offset_ * g, buffer_col_data_com + col_offset_ * g,
-        (Dtype)0., output_buffer_data + conv_out_channels_* validnum / group_* g); //here11111
-  }
-  // LOG(INFO) << "Debuf here:finish gpu_gemm\n";
-  //generate new output for mask 0
-  caffe_set(output_offset_, Dtype(0), output);
-  idx = 0;
+//
+//template <typename Dtype>
+//void BaseConvolutionLayer<Dtype>::forward_gpu_gemm_mask(const Dtype* input,
+//    const Dtype* weights, Dtype* output, const Dtype* mask_input,bool skip_im2col) {
+//  //const Dtype* col_buff = input;
+//  //const Dtype* col_buff_mask = mask_input;
+//  clock_t start, end,all_start,all_end;
+//  double dur;
+//  start = clock();
+//  all_start = clock();
+//  const int height = conv_input_shape_.cpu_data()[1];
+//  const int width = conv_input_shape_.cpu_data()[2];
+//  const int kernel_h = kernel_shape_.cpu_data()[0];
+//  const int kernel_w = kernel_shape_.cpu_data()[1];
+//  const int pad_h = pad_.cpu_data()[0];
+//  const int pad_w = pad_.cpu_data()[1];
+//  const int stride_h = stride_.cpu_data()[0];
+//  const int stride_w = stride_.cpu_data()[1];
+//  const int dilation_h = dilation_.cpu_data()[0];
+//  const int dilation_w = dilation_.cpu_data()[1];
+//  int height_col = (height + 2 * pad_h -
+//      (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
+//  int width_col = (width + 2 * pad_w -
+//      (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
+//  int validnum = caffe_cpu_asum(height_col*width_col, mask_input);
+//  int* src_index = new int[validnum*kernel_dim_];
+//  int* dst_index = new int[validnum*kernel_dim_];
+//  int cnt = 0;
+//  pass_idx_.Reshape(validnum, 1, 1, 1);
+//  buffer_col_.Reshape(kernel_dim_, validnum, 1, 1);
+//  Dtype* buffer_col_data = buffer_col_.mutable_gpu_data();
+//  //Dtype* buffer_col_data = buffer_col_.mutable_cpu_data();
+//  output_buffer_.Reshape(conv_out_channels_, validnum, 1, 1);
+//  end = clock();
+//  dur = (double)(end - start);
+//  LOG(INFO) << "the base_conv_layer before im2col using time:" << dur / CLOCKS_PER_SEC;
+//  start = clock();
+//  int idx = 0;
+//  //if (!is_1x1_) {;
+//  if (1){
+//    //if (!skip_im2col) {
+//    if (1){
+//      conv_im2col_gpu(input, col_buffer_.mutable_gpu_data()); //here 11111
+//    }
+//    // LOG(INFO) << "Debuf here:finish im2col\n";
+//    end = clock();
+//
+//    const Dtype* col_buff = col_buffer_.gpu_data();
+//
+//    dur = (double)(end - start);
+//    LOG(INFO) << "the base_conv_layer im2col using time:" << dur / CLOCKS_PER_SEC;
+//    google::FlushLogFiles(google::INFO);
+//    // col_buff = col_buffer_.cpu_data();
+//    //generate new trans respond to mask 1
+//    LOG(INFO) << "shape_0:" << col_buffer_.shape(0);
+//    LOG(INFO) << "shape_1:" << col_buffer_.shape(1);
+//    LOG(INFO) << "shape_2:" << col_buffer_.shape(2);
+//    google::FlushLogFiles(google::INFO);
+//    start = clock();
+//    for (int h = 0; h < height_col; h++){
+//      for (int w = 0; w < width_col; w++){
+//        if (mask_input[h*width_col + w] >= 1)
+//        {
+//          for (int temp = 0; temp < kernel_dim_; temp++){
+//            src_index[cnt] = temp*height_col*width_col + h*width_col + w;
+//            dst_index[cnt] = temp*validnum + idx;
+//            cnt++;
+//            // buffer_col_data[temp*validnum + idx] = col_buff[temp*height_col*width_col + h*width_col + w];
+//            //LOG(INFO) << "index:" << temp*height_col*width_col + h*width_col + w;
+//            //google::FlushLogFiles(google::INFO);
+//          }
+//          idx += 1;
+//        }
+//      }
+//    }
+//    MaskCopy<Dtype> << <CAFFE_GET_BLOCKS(validnum*kernel_dim_), CAFFE_CUDA_NUM_THREADS >> >(
+//      validnum*kernel_dim_, col_buff, src_index, buffer_col_data, dst_index);
+//    CUDA_POST_KERNEL_CHECK;
+//
+//    end = clock();
+//    dur = (double)(end - start);
+//    LOG(INFO) << "the base_conv_layer 418-429 using time:" << dur / CLOCKS_PER_SEC;
+//    google::FlushLogFiles(google::INFO);
+//  }
+//  //Dtype* output_buffer_data = output_buffer_.mutable_gpu_data();
+//  //const Dtype* buffer_col_data_com = buffer_col_.gpu_data();
+//  start = clock();
+//  Dtype* output_buffer_data = output_buffer_.mutable_gpu_data();
+//  const Dtype* buffer_col_data_com = buffer_col_.gpu_data();
+//
+//  for (int g = 0; g < group_; ++g) {
+//    caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, conv_out_channels_ /
+//        group_, validnum, kernel_dim_,
+//        (Dtype)1., weights + weight_offset_ * g, buffer_col_data_com + col_offset_ * g,
+//        (Dtype)0., output_buffer_data + conv_out_channels_* validnum / group_* g); //here11111
+//  }
+//  end = clock();
+//  dur = (double)(end - start);
+//  LOG(INFO) << "the base_conv_layer 435-440 using time:" << dur / CLOCKS_PER_SEC;
+//  google::FlushLogFiles(google::INFO);
+//// LOG(INFO) << "Debuf here:finish gpu_gemm\n";
+//  //generate new output for mask 0
+//  start = clock();
+//  caffe_set(output_offset_, Dtype(0), output);
+//  idx = 0;
+//// const Dtype* output_buffer_data_fin = output_buffer_.cpu_data();
 // const Dtype* output_buffer_data_fin = output_buffer_.cpu_data();
-  const Dtype* output_buffer_data_fin = output_buffer_.cpu_data();
-  for (int h = 0; h < height_col; h++){
-    for (int w = 0; w < width_col; w++){
-      if (mask_input[h*width_col + w] >= 1)
-      {
-        for (int temp = 0; temp < conv_out_channels_; temp++){
-          output[temp*height_col*width_col + h*width_col + w] = output_buffer_data_fin[temp*validnum + idx];
-        }
-        idx += 1;
-      }
-    }
-  }
-}
+//
+//  for (int h = 0; h < height_col; h++){
+//    for (int w = 0; w < width_col; w++){
+//      if (mask_input[h*width_col + w] >= 1)
+//      {
+//        for (int temp = 0; temp < conv_out_channels_; temp++){
+//          output[temp*height_col*width_col + h*width_col + w] = output_buffer_data_fin[temp*validnum + idx];
+//        }
+//        idx += 1;
+//      }
+//    }
+//  }
+//  end = clock();
+//  dur = (double)(end - start);
+//  LOG(INFO) << "the base_conv_layer 446-457 using time:" << dur / CLOCKS_PER_SEC;
+//  google::FlushLogFiles(google::INFO);
+//  all_end = clock();
+//  dur = (double)(all_end - all_start);
+//  LOG(INFO) << "the gemm_mask inner using time:" << dur / CLOCKS_PER_SEC;
+//}

 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::forward_gpu_bias(Dtype* output,
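Read end to end, the masked forward pass that this hunk deletes (and keeps only as an instrumented, commented-out draft; the live implementation presumably moves to conv.cu per the commit title) does the following: run im2col over the full input, count the valid output positions with caffe_cpu_asum on the mask, gather the matching im2col columns into a dense kernel_dim_ x validnum buffer, run the GEMM only over those columns, and scatter the results back into the full-size output, with masked positions zero-filled by caffe_set. One caveat visible in the commented draft: src_index and dst_index are allocated with host-side new int[...] but passed straight to the MaskCopy kernel, so they would have to be staged in device memory first (presumably the purpose of the new src_index_ / dst_index_ blobs in the header). A minimal single-group CPU sketch of the overall flow, with illustrative names rather than actual Caffe calls:

#include <vector>

// Sketch only: assumes group_ == 1, a row-major binary mask with
// height_col * width_col entries, and float data.
void forward_gemm_masked_sketch(
    const std::vector<float>& col_buff,  // kernel_dim x spatial (im2col output)
    const std::vector<float>& weights,   // out_channels x kernel_dim
    const std::vector<float>& mask,      // spatial entries, 0 or 1
    std::vector<float>& output,          // out_channels x spatial (result)
    int kernel_dim, int out_channels, int spatial) {
  // 1. Collect the valid output positions (the role of caffe_cpu_asum plus the
  //    src/dst index construction in the original code).
  std::vector<int> keep;
  for (int s = 0; s < spatial; ++s) {
    if (mask[s] >= 1.f) keep.push_back(s);
  }
  const int valid = static_cast<int>(keep.size());

  // 2. Gather the kept im2col columns into a dense kernel_dim x valid buffer
  //    (buffer_col_ / MaskCopy in the original).
  std::vector<float> buffer_col(static_cast<size_t>(kernel_dim) * valid);
  for (int k = 0; k < kernel_dim; ++k) {
    for (int v = 0; v < valid; ++v) {
      buffer_col[k * valid + v] = col_buff[k * spatial + keep[v]];
    }
  }

  // 3. GEMM over the reduced buffer: (out_channels x kernel_dim) times
  //    (kernel_dim x valid), done by caffe_gpu_gemm in the original.
  std::vector<float> out_buf(static_cast<size_t>(out_channels) * valid, 0.f);
  for (int c = 0; c < out_channels; ++c) {
    for (int k = 0; k < kernel_dim; ++k) {
      for (int v = 0; v < valid; ++v) {
        out_buf[c * valid + v] +=
            weights[c * kernel_dim + k] * buffer_col[k * valid + v];
      }
    }
  }

  // 4. Scatter back; masked-out positions stay zero (caffe_set in the original).
  output.assign(static_cast<size_t>(out_channels) * spatial, 0.f);
  for (int c = 0; c < out_channels; ++c) {
    for (int v = 0; v < valid; ++v) {
      output[c * spatial + keep[v]] = out_buf[c * valid + v];
    }
  }
}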
