PaddlePaddle · yuanlehome · Nov 9, 2023 · Nov 6, 2023 · zyfncg · Nov 7, 2023
diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt
@@ -8,7 +8,6 @@ register_operators(
   fused_bn_activation_op
   conv_fusion_op
   fusion_conv_inception_op
-  self_dp_attention_op
   skip_layernorm_op
   yolo_box_head_op
   yolo_box_post_op
@@ -27,15 +26,6 @@ register_operators(
   resnet_basic_block_op)
 
 op_library(fusion_lstm_op)
-if(WITH_AVX
-   AND AVX512F_FOUND
-   AND AVX512F_FLAG
-   AND WITH_MKL)
-  op_library(self_dp_attention_op)
-  set_target_properties(
-    self_dp_attention_op
-    PROPERTIES COMPILE_FLAGS "-Wno-maybe-uninitialized -mfma ${AVX512F_FLAG}")
-endif()
 
 if(WITH_XPU)
   op_library(resnet_basic_block_op)

diff --git a/paddle/fluid/operators/fused/self_dp_attention_op.cc b/paddle/fluid/operators/fused/self_dp_attention_op.cc
diff --git a/paddle/fluid/operators/fused/self_dp_attention_op.h b/paddle/fluid/operators/fused/self_dp_attention_op.h
diff --git a/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py b/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py
@@ -86,6 +86,7 @@
     'fusion_seqexpand_concat_fc',
     'fused_attention',
     'fused_feedforward',
+    'self_dp_attention',
 ]
 
 NO_NEED_GEN_STATIC_ONLY_APIS = [

diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt
@@ -114,6 +114,15 @@ else()
       CACHE INTERNAL "" FORCE)
 endif()
 
+# Per-file flags for the self_dp_attention CPU kernel: enable FMA plus the
+# detected AVX512F flag and silence -Wmaybe-uninitialized for this file only.
+if(WITH_AVX AND AVX512F_FOUND AND AVX512F_FLAG AND WITH_MKL)
+  set_property(
+    SOURCE kernels/fusion/cpu/self_dp_attention_kernel.cc
+    PROPERTY COMPILE_FLAGS
+             "-Wno-maybe-uninitialized  -mfma ${AVX512F_FLAG}")
+endif()
+
 if(WITH_GPU)
   set_source_files_properties(
     backends/gpu/gpu_resources.cc

diff --git a/paddle/phi/api/yaml/fused_ops.yaml b/paddle/phi/api/yaml/fused_ops.yaml
@@ -304,6 +304,15 @@
     func : quantize_xpu
     data_type : x
 
+- op : self_dp_attention  # one tensor input, two scalar attributes, one tensor output
+  args : (Tensor x, float alpha = 1.0f, int head_number = 1)
+  output : Tensor(out)
+  infer_meta :  # function that sets out's dims, lod and dtype from x
+    func : SelfDPAttenInferMeta  # declared in paddle/phi/infermeta/fusion.h
+  kernel :
+    func : self_dp_attention
+    data_type : x  # NOTE(review): presumably the kernel dtype follows x -- confirm
+
 - op : squeeze_excitation_block
   args : (Tensor x, Tensor filter, Tensor filter_max, Tensor bias, Tensor branch, int[] act_type, float[] act_param, int[] filter_dims)
   output : Tensor(out)

diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml
@@ -2624,6 +2624,12 @@
   outputs :
     {out : Out, summed_ids : SummedIds}
 
+- op : self_dp_attention  # NOTE(review): presumably maps to the legacy operator's names -- confirm
+  inputs :
+    x : X  # argument x is paired with the name X
+  outputs :
+    out : Out  # output out is paired with the name Out
+
 - op : selu
   backward : selu_grad
   inputs :

diff --git a/paddle/phi/infermeta/fusion.cc b/paddle/phi/infermeta/fusion.cc
@@ -2463,4 +2463,23 @@ void FusionSeqExpandConcatFCInferMeta(const std::vector<const MetaTensor*>& x,
   // explicit share the ref lod
   out->share_lod(*x[0]);
 }
+
+void SelfDPAttenInferMeta(const MetaTensor& x,
+                          const float alpha,
+                          const int head_number,
+                          MetaTensor* out) {
+  // x must be rank 5; out keeps axes 0, 1, 3 and 4 of x (axis 2 is dropped).
+  const auto x_dims = x.dims();
+  PADDLE_ENFORCE_EQ(
+      x_dims.size(),
+      5,
+      phi::errors::InvalidArgument("The size of input X dims should be 5, "
+                                   "[batchsize, tokensize, 3, nhead, headsize] "
+                                   ", but now Input X dim is:[%s] ",
+                                   x_dims));
+  out->set_dims(DDim({x_dims[0], x_dims[1], x_dims[3], x_dims[4]}));
+  out->set_dtype(x.dtype());
+  out->share_lod(x);
+}
+
 }  // namespace phi
diff --git a/paddle/phi/infermeta/fusion.h b/paddle/phi/infermeta/fusion.h
@@ -556,4 +556,9 @@ void FusionSeqExpandConcatFCInferMeta(const std::vector<const MetaTensor*>& x,
                                       const std::string& fc_activation,
                                       MetaTensor* out,
                                       MetaTensor* fc_out);
+
+void SelfDPAttenInferMeta(const MetaTensor& x,  // input; must be rank 5
+                          const float alpha,  // not read by the definition
+                          const int head_number,  // not read by the definition
+                          MetaTensor* out);  // receives dims, lod and dtype
 }  // namespace phi
diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt
@@ -142,6 +142,14 @@ if(DEFINED REDUCE_INFERENCE_LIB_SIZE)
   list(FILTER kernel_cc EXCLUDE REGEX ".*_grad_kernel\\.cc$")
 endif()
 
+# The self_dp_attention CPU kernel stays in kernel_cc only when WITH_AVX,
+# AVX512F_FOUND, AVX512F_FLAG and WITH_MKL are all true; if any one of them
+# is false, the source is removed from the list.
+if(NOT WITH_AVX OR NOT AVX512F_FOUND OR NOT AVX512F_FLAG OR NOT WITH_MKL)
+  # NOTE(review): assumes kernel_cc holds paths relative to this directory.
+  list(REMOVE_ITEM kernel_cc "fusion/cpu/self_dp_attention_kernel.cc")
+endif()
+
 file(
   GLOB kernel_xpu
   RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"