|
8 | 8 |
|
9 | 9 | namespace caffe {
|
10 | 10 |
|
| 11 | + |
| 12 | + //template <typename Dtype> |
| 13 | + //__global__ void MaskCopy(const int n, const Dtype* in, int *src, Dtype* out,int *dst) { |
| 14 | + // CUDA_KERNEL_LOOP(index, n) { |
| 15 | + // out[dst[index]] = in[src[index]]; |
| 16 | + // } |
| 17 | + //} |
| 18 | + |
| 19 | + |
| 20 | + |
11 | 21 | template <typename Dtype>
|
12 | 22 | void BaseConvolutionLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
|
13 | 23 | const vector<Blob<Dtype>*>& top) {
|
@@ -377,85 +387,133 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_gemm(const Dtype* input,
|
377 | 387 | (Dtype)0., output + output_offset_ * g);
|
378 | 388 | }
|
379 | 389 | }
|
380 |
| - |
381 |
| -template <typename Dtype> |
382 |
| -void BaseConvolutionLayer<Dtype>::forward_gpu_gemm_mask(const Dtype* input, |
383 |
| - const Dtype* weights, Dtype* output, const Dtype* mask_input,bool skip_im2col) { |
384 |
| - //const Dtype* col_buff = input; |
385 |
| - //const Dtype* col_buff_mask = mask_input; |
386 |
| - const int height = conv_input_shape_.cpu_data()[1]; |
387 |
| - const int width = conv_input_shape_.cpu_data()[2]; |
388 |
| - const int kernel_h = kernel_shape_.cpu_data()[0]; |
389 |
| - const int kernel_w = kernel_shape_.cpu_data()[1]; |
390 |
| - const int pad_h = pad_.cpu_data()[0]; |
391 |
| - const int pad_w = pad_.cpu_data()[1]; |
392 |
| - const int stride_h = stride_.cpu_data()[0]; |
393 |
| - const int stride_w = stride_.cpu_data()[1]; |
394 |
| - const int dilation_h = dilation_.cpu_data()[0]; |
395 |
| - const int dilation_w = dilation_.cpu_data()[1]; |
396 |
| - int height_col = (height + 2 * pad_h - |
397 |
| - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; |
398 |
| - int width_col = (width + 2 * pad_w - |
399 |
| - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; |
400 |
| - int skip_num = height_col*width_col; |
401 |
| - int validnum = caffe_cpu_asum(height_col*width_col, mask_input); |
402 |
| - pass_idx_.Reshape(validnum, 1, 1, 1); |
403 |
| - buffer_col_.Reshape(kernel_dim_, validnum, 1, 1); |
404 |
| - Dtype* buffer_col_data = buffer_col_.mutable_cpu_data(); |
405 |
| - //Dtype* buffer_col_data = buffer_col_.mutable_cpu_data(); |
406 |
| - output_buffer_.Reshape(conv_out_channels_, validnum, 1, 1); |
407 |
| - int idx = 0; |
408 |
| - //if (!is_1x1_) {; |
409 |
| - if (1){ |
410 |
| - //if (!skip_im2col) { |
411 |
| - if (1){ |
412 |
| - conv_im2col_gpu(input, col_buffer_.mutable_gpu_data()); //here 11111 |
413 |
| - } |
414 |
| - // LOG(INFO) << "Debuf here:finish im2col\n"; |
415 |
| - const Dtype* col_buff = col_buffer_.cpu_data(); |
416 |
| - // col_buff = col_buffer_.cpu_data(); |
417 |
| - //generate new trans respond to mask 1 |
418 |
| - for (int h = 0; h < height_col; h++){ |
419 |
| - for (int w = 0; w < width_col; w++){ |
420 |
| - if (mask_input[h*width_col + w] >= 1) |
421 |
| - { |
422 |
| - for (int temp = 0; temp < kernel_dim_; temp++){ |
423 |
| - buffer_col_data[temp*validnum + idx] = col_buff[temp*height_col*width_col + h*width_col + w]; |
424 |
| - } |
425 |
| - idx += 1; |
426 |
| - } |
427 |
| - } |
428 |
| - } |
429 |
| - |
430 |
| - } |
431 |
| - //Dtype* output_buffer_data = output_buffer_.mutable_gpu_data(); |
432 |
| - //const Dtype* buffer_col_data_com = buffer_col_.gpu_data(); |
433 |
| - Dtype* output_buffer_data = output_buffer_.mutable_gpu_data(); |
434 |
| - const Dtype* buffer_col_data_com = buffer_col_.gpu_data(); |
435 |
| - for (int g = 0; g < group_; ++g) { |
436 |
| - caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, conv_out_channels_ / |
437 |
| - group_, validnum, kernel_dim_, |
438 |
| - (Dtype)1., weights + weight_offset_ * g, buffer_col_data_com + col_offset_ * g, |
439 |
| - (Dtype)0., output_buffer_data + conv_out_channels_* validnum / group_* g); //here11111 |
440 |
| - } |
441 |
| -// LOG(INFO) << "Debuf here:finish gpu_gemm\n"; |
442 |
| - //generate new output for mask 0 |
443 |
| - caffe_set(output_offset_, Dtype(0), output); |
444 |
| - idx = 0; |
| 390 | +// |
| 391 | +//template <typename Dtype> |
| 392 | +//void BaseConvolutionLayer<Dtype>::forward_gpu_gemm_mask(const Dtype* input, |
| 393 | +// const Dtype* weights, Dtype* output, const Dtype* mask_input,bool skip_im2col) { |
| 394 | +// //const Dtype* col_buff = input; |
| 395 | +// //const Dtype* col_buff_mask = mask_input; |
| 396 | +// clock_t start, end,all_start,all_end; |
| 397 | +// double dur; |
| 398 | +// start = clock(); |
| 399 | +// all_start = clock(); |
| 400 | +// const int height = conv_input_shape_.cpu_data()[1]; |
| 401 | +// const int width = conv_input_shape_.cpu_data()[2]; |
| 402 | +// const int kernel_h = kernel_shape_.cpu_data()[0]; |
| 403 | +// const int kernel_w = kernel_shape_.cpu_data()[1]; |
| 404 | +// const int pad_h = pad_.cpu_data()[0]; |
| 405 | +// const int pad_w = pad_.cpu_data()[1]; |
| 406 | +// const int stride_h = stride_.cpu_data()[0]; |
| 407 | +// const int stride_w = stride_.cpu_data()[1]; |
| 408 | +// const int dilation_h = dilation_.cpu_data()[0]; |
| 409 | +// const int dilation_w = dilation_.cpu_data()[1]; |
| 410 | +// int height_col = (height + 2 * pad_h - |
| 411 | +// (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; |
| 412 | +// int width_col = (width + 2 * pad_w - |
| 413 | +// (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; |
| 414 | +// int validnum = caffe_cpu_asum(height_col*width_col, mask_input); |
| 415 | +// int* src_index = new int[validnum*kernel_dim_]; |
| 416 | +// int* dst_index = new int[validnum*kernel_dim_]; |
| 417 | +// int cnt = 0; |
| 418 | +// pass_idx_.Reshape(validnum, 1, 1, 1); |
| 419 | +// buffer_col_.Reshape(kernel_dim_, validnum, 1, 1); |
| 420 | +// Dtype* buffer_col_data = buffer_col_.mutable_gpu_data(); |
| 421 | +// //Dtype* buffer_col_data = buffer_col_.mutable_cpu_data(); |
| 422 | +// output_buffer_.Reshape(conv_out_channels_, validnum, 1, 1); |
| 423 | +// end = clock(); |
| 424 | +// dur = (double)(end - start); |
| 425 | +// LOG(INFO) << "the base_conv_layer before im2col using time:" << dur / CLOCKS_PER_SEC; |
| 426 | +// start = clock(); |
| 427 | +// int idx = 0; |
| 428 | +// //if (!is_1x1_) {; |
| 429 | +// if (1){ |
| 430 | +// //if (!skip_im2col) { |
| 431 | +// if (1){ |
| 432 | +// conv_im2col_gpu(input, col_buffer_.mutable_gpu_data()); //here 11111 |
| 433 | +// } |
| 434 | +// // LOG(INFO) << "Debuf here:finish im2col\n"; |
| 435 | +// end = clock(); |
| 436 | +// |
| 437 | +// const Dtype* col_buff = col_buffer_.gpu_data(); |
| 438 | +// |
| 439 | +// dur = (double)(end - start); |
| 440 | +// LOG(INFO) << "the base_conv_layer im2col using time:" << dur / CLOCKS_PER_SEC; |
| 441 | +// google::FlushLogFiles(google::INFO); |
| 442 | +// // col_buff = col_buffer_.cpu_data(); |
| 443 | +// //generate new trans respond to mask 1 |
| 444 | +// LOG(INFO) << "shape_0:" << col_buffer_.shape(0); |
| 445 | +// LOG(INFO) << "shape_1:" << col_buffer_.shape(1); |
| 446 | +// LOG(INFO) << "shape_2:" << col_buffer_.shape(2); |
| 447 | +// google::FlushLogFiles(google::INFO); |
| 448 | +// start = clock(); |
| 449 | +// for (int h = 0; h < height_col; h++){ |
| 450 | +// for (int w = 0; w < width_col; w++){ |
| 451 | +// if (mask_input[h*width_col + w] >= 1) |
| 452 | +// { |
| 453 | +// for (int temp = 0; temp < kernel_dim_; temp++){ |
| 454 | +// src_index[cnt] = temp*height_col*width_col + h*width_col + w; |
| 455 | +// dst_index[cnt] = temp*validnum + idx; |
| 456 | +// cnt++; |
| 457 | +// // buffer_col_data[temp*validnum + idx] = col_buff[temp*height_col*width_col + h*width_col + w]; |
| 458 | +// //LOG(INFO) << "index:" << temp*height_col*width_col + h*width_col + w; |
| 459 | +// //google::FlushLogFiles(google::INFO); |
| 460 | +// } |
| 461 | +// idx += 1; |
| 462 | +// } |
| 463 | +// } |
| 464 | +// } |
| 465 | +// MaskCopy<Dtype> << <CAFFE_GET_BLOCKS(validnum*kernel_dim_), CAFFE_CUDA_NUM_THREADS >> >( |
| 466 | +// validnum*kernel_dim_, col_buff, src_index, buffer_col_data, dst_index); |
| 467 | +// CUDA_POST_KERNEL_CHECK; |
| 468 | +// |
| 469 | +// end = clock(); |
| 470 | +// dur = (double)(end - start); |
| 471 | +// LOG(INFO) << "the base_conv_layer 418-429 using time:" << dur / CLOCKS_PER_SEC; |
| 472 | +// google::FlushLogFiles(google::INFO); |
| 473 | +// } |
| 474 | +// //Dtype* output_buffer_data = output_buffer_.mutable_gpu_data(); |
| 475 | +// //const Dtype* buffer_col_data_com = buffer_col_.gpu_data(); |
| 476 | +// start = clock(); |
| 477 | +// Dtype* output_buffer_data = output_buffer_.mutable_gpu_data(); |
| 478 | +// const Dtype* buffer_col_data_com = buffer_col_.gpu_data(); |
| 479 | +// |
| 480 | +// for (int g = 0; g < group_; ++g) { |
| 481 | +// caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, conv_out_channels_ / |
| 482 | +// group_, validnum, kernel_dim_, |
| 483 | +// (Dtype)1., weights + weight_offset_ * g, buffer_col_data_com + col_offset_ * g, |
| 484 | +// (Dtype)0., output_buffer_data + conv_out_channels_* validnum / group_* g); //here11111 |
| 485 | +// } |
| 486 | +// end = clock(); |
| 487 | +// dur = (double)(end - start); |
| 488 | +// LOG(INFO) << "the base_conv_layer 435-440 using time:" << dur / CLOCKS_PER_SEC; |
| 489 | +// google::FlushLogFiles(google::INFO); |
| 490 | +//// LOG(INFO) << "Debuf here:finish gpu_gemm\n"; |
| 491 | +// //generate new output for mask 0 |
| 492 | +// start = clock(); |
| 493 | +// caffe_set(output_offset_, Dtype(0), output); |
| 494 | +// idx = 0; |
| 495 | +//// const Dtype* output_buffer_data_fin = output_buffer_.cpu_data(); |
445 | 496 | // const Dtype* output_buffer_data_fin = output_buffer_.cpu_data();
|
446 |
| - const Dtype* output_buffer_data_fin = output_buffer_.cpu_data(); |
447 |
| - for (int h = 0; h < height_col; h++){ |
448 |
| - for (int w = 0; w < width_col; w++){ |
449 |
| - if (mask_input[h*width_col + w] >= 1) |
450 |
| - { |
451 |
| - for (int temp = 0; temp < conv_out_channels_; temp++){ |
452 |
| - output[temp*height_col*width_col + h*width_col + w] = output_buffer_data_fin[temp*validnum + idx]; |
453 |
| - } |
454 |
| - idx += 1; |
455 |
| - } |
456 |
| - } |
457 |
| - } |
458 |
| -} |
| 497 | +// |
| 498 | +// for (int h = 0; h < height_col; h++){ |
| 499 | +// for (int w = 0; w < width_col; w++){ |
| 500 | +// if (mask_input[h*width_col + w] >= 1) |
| 501 | +// { |
| 502 | +// for (int temp = 0; temp < conv_out_channels_; temp++){ |
| 503 | +// output[temp*height_col*width_col + h*width_col + w] = output_buffer_data_fin[temp*validnum + idx]; |
| 504 | +// } |
| 505 | +// idx += 1; |
| 506 | +// } |
| 507 | +// } |
| 508 | +// } |
| 509 | +// end = clock(); |
| 510 | +// dur = (double)(end - start); |
| 511 | +// LOG(INFO) << "the base_conv_layer 446-457 using time:" << dur / CLOCKS_PER_SEC; |
| 512 | +// google::FlushLogFiles(google::INFO); |
| 513 | +// all_end = clock(); |
| 514 | +// dur = (double)(all_end - all_start); |
| 515 | +// LOG(INFO) << "the gemm_mask inner using time:" << dur / CLOCKS_PER_SEC; |
| 516 | +//} |
459 | 517 |
|
460 | 518 | template <typename Dtype>
|
461 | 519 | void BaseConvolutionLayer<Dtype>::forward_gpu_bias(Dtype* output,
|
|
0 commit comments