From 0c6b490ff17cef9724522520f412510a27766f2f Mon Sep 17 00:00:00 2001
From: Xinyi_LI
Date: Mon, 11 Dec 2023 09:53:57 +0800
Subject: [PATCH 01/28] [oneDNN] Reshape attr_axes when going to oneDNN kernel
 (#59641)

---
 paddle/phi/kernels/onednn/squeeze_kernel.cc  |  22 ++++
 test/cpp/fluid/mkldnn/CMakeLists.txt         |  15 +++
 test/cpp/fluid/mkldnn/test_mkldnn_squeeze.cc | 106 +++++++++++++++++++
 3 files changed, 143 insertions(+)
 create mode 100644 test/cpp/fluid/mkldnn/test_mkldnn_squeeze.cc

diff --git a/paddle/phi/kernels/onednn/squeeze_kernel.cc b/paddle/phi/kernels/onednn/squeeze_kernel.cc
index 2de2cbb2ecbab8..2d9522277d8572 100644
--- a/paddle/phi/kernels/onednn/squeeze_kernel.cc
+++ b/paddle/phi/kernels/onednn/squeeze_kernel.cc
@@ -59,7 +59,29 @@ void SqueezeInferKernel(const Context& dev_ctx,
                         const IntArray& axes,
                         DenseTensor* out) {
   auto x_dims = x.dims();
+  auto x_dims_tz = x_dims.size();
   std::vector<int32_t> tmp(axes.GetData().begin(), axes.GetData().end());
+
+  // Currently there is only a transformation for tensors, while attr axes
+  // still follow the default layout instead of the oneDNN layout, so manually
+  // remap them here (channel axis -> 1, spatial axes shift up by one).
+  if ((x_dims_tz >= 3) &&
+      (phi::OneDNNContext::tls().get_cur_paddle_data_layout() ==
+           phi::DataLayout::NDHWC ||
+       phi::OneDNNContext::tls().get_cur_paddle_data_layout() ==
+           phi::DataLayout::NHWC)) {
+    int axes_size = tmp.size();
+    for (int i = 0; i < axes_size; i++) {
+      if (tmp[i] < 0) {
+        tmp[i] += x_dims_tz;
+      }
+      if (tmp[i] >= 1 && tmp[i] < (x_dims_tz - 1)) {
+        tmp[i] += 1;
+      } else if (tmp[i] == (x_dims_tz - 1)) {
+        tmp[i] = 1;
+      }
+    }
+  }
+
   auto out_dims = funcs::GetOutputSqueezeShape(tmp, x_dims, true);
   ExecuteSqueeze(dev_ctx, x, x_dims, out_dims, out);
 }
diff --git a/test/cpp/fluid/mkldnn/CMakeLists.txt b/test/cpp/fluid/mkldnn/CMakeLists.txt
index 22ea64bdbdb0c1..bae9ec2dc5f73b 100644
--- a/test/cpp/fluid/mkldnn/CMakeLists.txt
+++ b/test/cpp/fluid/mkldnn/CMakeLists.txt
@@ -104,3 +104,18 @@ cc_test(
   scope
   device_context
   enforce)
+
+cc_test(
+  test_mkldnn_squeeze
+  SRCS test_mkldnn_squeeze.cc
+  DEPS fleet_executor
+       conditional_block_op
+       standalone_executor
+       executor
+       op_registry
+       generated_static_op
+       generated_op
+       phi
+       scope
+       device_context
+       enforce)
diff --git a/test/cpp/fluid/mkldnn/test_mkldnn_squeeze.cc b/test/cpp/fluid/mkldnn/test_mkldnn_squeeze.cc
new file mode 100644
index 00000000000000..a08ca3075ca00e
--- /dev/null
+++ b/test/cpp/fluid/mkldnn/test_mkldnn_squeeze.cc
@@ -0,0 +1,106 @@
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+#include <gtest/gtest.h>
+
+#include <random>
+
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/naive_executor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/enforce.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+template <typename T>
+void AddVarToScope(const std::string var_name,
+                   paddle::framework::Scope* scope,
+                   const paddle::framework::DDim& dims) {
+  std::random_device seed;
+  std::default_random_engine engine(seed());
+  std::uniform_real_distribution<float> dist(0, 100);
+
+  phi::DenseTensor tmp_tensor;
+  auto* tmp_data =
+      tmp_tensor.mutable_data<T>(dims, paddle::platform::CPUPlace());
+  auto* tensor = scope->Var(var_name)->GetMutable<phi::DenseTensor>();
+  tensor->mutable_data<T>(dims, paddle::platform::CPUPlace());
+  for (auto i = 0; i < tensor->numel(); ++i) {
+    tmp_data[i] = static_cast<T>(dist(engine));
+  }
+  paddle::framework::TensorCopySync(
+      tmp_tensor, paddle::platform::CPUPlace(), tensor);
+}
+void test_squeeze() {
+  framework::Scope scope;
+  paddle::platform::CPUPlace cpu_place;
+  // Prepare Op description
+  framework::OpDesc desc;
+  // We assume the input is kNHWC, so that we can use this transformation
+  phi::OneDNNContext::tls().set_cur_paddle_data_layout(DataLayout::kNHWC);
+  desc.SetType("squeeze2");
+  desc.SetInput("X", {"squeeze-X"});
+  desc.SetOutput("Out", {"squeeze-Out"});
+  // DataLayout::kNHWC will make it become {2, 3, 2, 1}
+  AddVarToScope<float>("squeeze-X", &scope, {2, 2, 1, 3});
+  AddVarToScope<float>("squeeze-Out", &scope, {2, 3, 2});
+  // the transform will make it become -1 (i.e. axis 3)
+  std::vector<int> axes({-2});
+
+  desc.SetAttr("axes", axes);
+  desc.SetAttr("use_mkldnn", true);
+
+  auto op = paddle::framework::OpRegistry::CreateOp(desc);
+
+  op->Run(scope, cpu_place);
+}
+
+void test_squeeze2() {
+  framework::Scope scope;
+  paddle::platform::CPUPlace cpu_place;
+  // Prepare Op description
+  framework::OpDesc desc;
+  // We assume the input is kNHWC, so that we can use this transformation
+  phi::OneDNNContext::tls().set_cur_paddle_data_layout(DataLayout::kNHWC);
+  desc.SetType("squeeze2");
+  desc.SetInput("X", {"squeeze-X"});
+  desc.SetOutput("Out", {"squeeze-Out"});
+  // DataLayout::kNHWC will make it become {2, 1, 3, 2}
+  AddVarToScope<float>("squeeze-X", &scope, {2, 3, 2, 1});
+  AddVarToScope<float>("squeeze-Out", &scope, {2, 3, 2});
+  // the transform will make it become -3 (i.e. axis 1)
+  std::vector<int> axes({-1});
+
+  desc.SetAttr("axes", axes);
+  desc.SetAttr("use_mkldnn", true);
+
+  auto op = paddle::framework::OpRegistry::CreateOp(desc);
+
+  op->Run(scope, cpu_place);
+}
+
+TEST(SqueezeOpConverter, normal) { test_squeeze(); }
+TEST(SqueezeOpConverter_2, normal) { test_squeeze2(); }
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+USE_OP_ITSELF(squeeze2);
+PD_DECLARE_KERNEL(squeeze_infer, OneDNN, ONEDNN);

From a339e8b383c16823bdff5dea3ee8eeded9620d92 Mon Sep 17 00:00:00 2001
From: "Zhang,Lirong" <56445728+zhanglirong1999@users.noreply.github.com>
Date: Mon, 11 Dec 2023 09:55:11 +0800
Subject: [PATCH 02/28] [oneDNN] Restrictions on matmul broadcast optimization
 (#59744)

---
 paddle/phi/kernels/fusion/onednn/fused_matmul_kernel.cc | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/paddle/phi/kernels/fusion/onednn/fused_matmul_kernel.cc b/paddle/phi/kernels/fusion/onednn/fused_matmul_kernel.cc
index 1f2c0766f95e40..8995934c2fa035 100644
---
a/paddle/phi/kernels/fusion/onednn/fused_matmul_kernel.cc +++ b/paddle/phi/kernels/fusion/onednn/fused_matmul_kernel.cc @@ -180,7 +180,8 @@ class FusedMatmulOneDNNHandler auto residual_data_tz = vectorize(residual_data->dims()); auto chosen_memory_format = funcs::OneDNNMemoryFormat::any; dnnl::memory::desc residual_data_md; - if (residual_data_tz.size() == 4 && residual_data_tz[0] == 1 && + if (out_ddims.size() > 0 && out_ddims[0] > 1 && + residual_data_tz.size() == 4 && residual_data_tz[0] == 1 && residual_data_tz[1] > 1 && residual_data_tz[2] > 1 && residual_data_tz[3] > 1) { chosen_memory_format = funcs::OneDNNMemoryFormat::nchw; @@ -320,7 +321,8 @@ void ExecuteFusedMatmul(const OneDNNContext &dev_ctx, if (residual_data) { auto residual_data_vec = vectorize(residual_data->dims()); std::shared_ptr residual_data_memory_p; - if (residual_data_vec.size() == 4 && residual_data_vec[0] == 1 && + if (std::max((x_dims)[0], (y_dims)[0]) > 1 && + residual_data_vec.size() == 4 && residual_data_vec[0] == 1 && residual_data_vec[1] > 1 && residual_data_vec[2] > 1 && residual_data_vec[3] > 1) { residual_data_memory_p = handler.AcquireSrcMemoryStride(residual_data); From 99a281b8f8da49cdfcaee44a1b9926193d0cf803 Mon Sep 17 00:00:00 2001 From: Yichen Zhang <32740647+pkuzyc@users.noreply.github.com> Date: Mon, 11 Dec 2023 10:38:22 +0800 Subject: [PATCH 03/28] [Auto Parallel] Add Strategy api for configuring the distributed training with static graph (#59862) * move strategy to api.py * add Strategy api * fix sample code * add detailed comments for the configs in dist.Strategy * add an error case in unit test * add the unit test to CMakeLists --- python/paddle/distributed/__init__.py | 2 + .../paddle/distributed/auto_parallel/api.py | 269 ++++++++++++++++-- .../distributed/auto_parallel/strategy.py | 2 +- test/auto_parallel/CMakeLists.txt | 1 + .../hybrid_strategy/semi_auto_llama.py | 9 +- ...test_semi_auto_parallel_hybrid_strategy.py | 2 +- .../semi_auto_parallel_dist_to_static_api.py | 3 +- test/auto_parallel/test_strategy_api.py | 108 +++++++ 8 files changed, 372 insertions(+), 24 deletions(-) create mode 100644 test/auto_parallel/test_strategy_api.py diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py index c6d8e626421736..29b6e18ccb53dd 100644 --- a/python/paddle/distributed/__init__.py +++ b/python/paddle/distributed/__init__.py @@ -85,6 +85,7 @@ shard_layer, shard_optimizer, to_static, + Strategy, ) from .fleet import BoxPSDataset # noqa: F401 @@ -165,4 +166,5 @@ "load_state_dict", "shard_optimizer", "to_static", + "Strategy", ] diff --git a/python/paddle/distributed/auto_parallel/api.py b/python/paddle/distributed/auto_parallel/api.py index 83e931f1ce7de8..f217f07b9a0534 100644 --- a/python/paddle/distributed/auto_parallel/api.py +++ b/python/paddle/distributed/auto_parallel/api.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
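The api.py changes below add the public `dist.Strategy` wrapper around the internal `auto_parallel` strategy classes. A minimal usage sketch, assuming only the config keys exercised by the new test_strategy_api.py at the end of this patch (keys left unset keep their defaults):

    >>> import paddle.distributed as dist
    >>> strategy = dist.Strategy({"sharding": {"enable": True, "stage": 2}})
    >>> strategy.sharding.stage
    2
    >>> strategy.pipeline.micro_batch_size  # untouched config, default kept
    1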
+import copy from collections import defaultdict from typing import Callable @@ -23,7 +24,7 @@ Variable, default_main_program, ) -from paddle.distributed.auto_parallel import Engine +from paddle.distributed.auto_parallel import Engine, strategy as auto_strategy from paddle.distributed.auto_parallel.interface import ( shard_tensor as shard_tensor_static, ) @@ -108,9 +109,11 @@ def sharding_specs(self): class DistModel: """ - DistModel is a wrapper of the network model for the use of static mode - auto parallel. DistModel contains the distributed Graph of the model and - offers the APIs for training, evaluation and prediction. + DistModel is generated by ``paddle.distributed.to_static``. It contains the + static graph converted from a ``paddle.nn.layer`` whose parameters are + distributed tensors (constructed from ``paddle.distributed.shard_tensor``), + and provides the APIs for training, evaluation and prediction with the + static graph. Please first set the DistModel to "train", "eval" or "predict" mode and then use the __call__ method for training, evaluation and prediction @@ -127,8 +130,8 @@ class DistModel: to "eval" mode in default. If loss and optimizer are both None, DistModel will be set to "predict" mode in default. - DistModel is generated by ``paddle.distributed.to_static``, for more details - of the usage, please refer to the sample code in ``paddle.distributed.to_static``. + For more details of the usage, please refer to the sample code in + ``paddle.distributed.to_static``. """ def __init__( @@ -141,8 +144,9 @@ def __init__( metrics=None, ): self._feed_name_list = [] + self._inner_strategy = self.__convert_strategy(strategy) self._engine = Engine( - layer, loss, optimizer, metrics, strategy=strategy + layer, loss, optimizer, metrics, strategy=self._inner_strategy ) self._mode = None self._feed_name_list = {} @@ -271,6 +275,27 @@ def _make_feeds(self, data_list): ) return dict(zip(feed_name_list, data_list)) + def __convert_strategy(self, strategy): + import copy + + if strategy is None: + return None + inner_strategy = auto_strategy.Strategy() + inner_strategy.fused_passes.enable = strategy.fused_passes.enable + if strategy.fused_passes.gemm_epilogue is True: + inner_strategy.fused_passes.fused_passes_list.append( + "fused_gemm_epilogue_pass" + ) + if strategy.fused_passes.dropout_add is True: + inner_strategy.fused_passes.fused_passes_list.append( + "fused_dropout_add_pass" + ) + + inner_strategy.sharding = copy.deepcopy(strategy.sharding) + inner_strategy.gradient_merge = copy.deepcopy(strategy.gradient_merge) + inner_strategy.pipeline = copy.deepcopy(strategy.pipeline) + return inner_strategy + def __call__(self, *args): if self._mode is None: raise ValueError("Please call train()/eval()/predict() first.") @@ -298,6 +323,209 @@ def __call__(self, *args): # Part2: DistTensor construction related APIs + + +class FusePasses: + """ + A helper class for users to configure the fuse passes. + """ + + def __init__(self, config_dict=None): + self.enable = False + self.gemm_epilogue = False + self.dropout_add = False + if config_dict is not None: + for key, value in config_dict.items(): + if hasattr(self, key): + setattr(self, key, value) + else: + raise ValueError(f"Unknown fuse pass {key}") + + +class Strategy(auto_strategy.BaseConfig): + """ + The `Strategy` object is used to configure the parallelization + and optimization strategies for static graph. Currently contains + configuring ``sharding``, ``fused_passes``, ``gradient_merge`` + and ``pipline``. 
+    More strategies will be supported in the future.
+
+    ``sharding`` is used to configure the sharding states of the optimizer,
+    for saving the GPU memory.
+
+    ``fused_passes`` is used to configure the fusion of the computation in
+    the model.
+
+    ``gradient_merge`` is used to configure the gradient merge strategy in
+    training.
+
+    ``pipeline`` is used to configure the pipeline parallelism strategy.
+
+    Args:
+        config (dict|None, optional): If ``config`` is None, the default
+            configurations will be set. If it is a dict, the items inside
+            the dict will be used to set the configurations, the others remain
+            the default values.
+
+    Examples:
+        .. code-block:: python

+            >>> import paddle
+            >>> import paddle.distributed as dist

+            >>> strategy = dist.Strategy()

+            >>> strategy.sharding.enable = True
+            >>> strategy.sharding.stage = 2
+            >>> strategy.sharding.degree = 2

+            >>> strategy.gradient_merge.enable = True
+            >>> strategy.gradient_merge.k_steps = 2
+            >>> strategy.gradient_merge.avg = False

+            >>> strategy.pipeline.enable = True
+            >>> strategy.pipeline.schedule_mode = "1F1B"  # default is "1F1B"
+            >>> strategy.pipeline.micro_batch_size = 2
+    """
+
+    def __init__(self, config=None):
+        if config is not None:
+            if isinstance(config, dict):
+                self._config_dict = copy.deepcopy(config)
+            else:
+                raise ValueError(
+                    f"Expected a dictionary. But received: {config}"
+                )
+        else:
+            self._config_dict = {}
+
+        category = auto_strategy.constants.BASE
+        super().__init__(category, self._config_dict)
+
+        config_dict = self._config_dict.get(
+            auto_strategy.constants.SHARDING, None
+        )
+        self._sharding = auto_strategy.ShardingConfig(config_dict)
+
+        config_dict = self._config_dict.get(
+            auto_strategy.constants.GRADIENT_MERGE, None
+        )
+        self._gradient_merge = auto_strategy.GradientMergeConfig(config_dict)
+
+        config_dict = self._config_dict.get(
+            auto_strategy.constants.PIPELINE, None
+        )
+        self._pipeline = auto_strategy.PipelineConfig(config_dict)
+
+        config_dict = self._config_dict.get(
+            auto_strategy.constants.FUSED_PASSES, None
+        )
+        self._fused_passes = FusePasses(config_dict)
+
+    @property
+    def sharding(self):
+        """
+        ``sharding`` is used to configure the sharding states of the optimizer,
+        containing following configs:
+
+            ``enable`` (bool): whether to enable sharding. Default: False.
+
+            ``stage`` (int): can be set to 1, 2 or 3. 1 indicates the optimizer states segmentation,
+            2 indicates optimizer states and gradient segmentation, 3 indicates the segmentation
+            of optimizer states, gradient and parameters. Default: 1.
+
+            ``degree`` (int): the number of segmentation pieces. Default: 8.
+
+        Examples:
+            .. code-block:: python
+                >>> import paddle
+                >>> import paddle.distributed as dist
+
+                >>> strategy = dist.Strategy()
+
+                >>> strategy.sharding.enable = True
+                >>> strategy.sharding.stage = 2
+                >>> strategy.sharding.degree = 2
+        """
+        return self._sharding
+
+    @property
+    def gradient_merge(self):
+        """
+        ``gradient_merge`` is used to configure the gradient merge strategy in
+        training, containing following configs:
+
+            ``enable`` (bool): whether to enable gradient merge. Default: False.
+
+            ``k_steps`` (int): the number of steps for merging gradients. Default: 1.
+
+            ``avg`` (bool): whether to average the gradients of each step. Default: True.
+
+        Examples:
+            .. code-block:: python
+                >>> import paddle
+                >>> import paddle.distributed as dist
+
+                >>> strategy = dist.Strategy()
+
+                >>> strategy.gradient_merge.enable = True
+                >>> strategy.gradient_merge.k_steps = 2
+                >>> strategy.gradient_merge.avg = True
+        """
+        return self._gradient_merge
+
+    @property
+    def fused_passes(self):
+        """
+        ``fused_passes`` is used to configure the fusion of the computation in
+        the model, containing following configs:
+
+            ``enable`` (bool): whether to enable fused passes. Default: False.
+
+            ``gemm_epilogue`` (bool): whether to fuse ``matmul`` and ``add`` computation
+            in the ``Linear`` layer. Default: False
+
+            ``dropout_add`` (bool): whether to fuse ``dropout`` and ``add`` computation. Default: False.
+
+        Examples:
+            .. code-block:: python
+                >>> import paddle
+                >>> import paddle.distributed as dist
+
+                >>> strategy = dist.Strategy()
+
+                >>> strategy.fused_passes.enable = True
+                >>> strategy.fused_passes.gemm_epilogue = True
+                >>> strategy.fused_passes.dropout_add = True
+        """
+        return self._fused_passes
+
+    @property
+    def pipeline(self):
+        """
+        ``pipeline`` is used to configure the pipeline parallelism in training,
+        containing following configs:
+
+            ``enable`` (bool): whether to enable pipeline parallelism. Default: False.
+
+            ``schedule_mode`` (str): the scheduling mode of pipeline parallelism. Default: "1F1B".
+
+            ``micro_batch_size`` (int): the size of each micro-batch inside a mini-batch. Default: 1.
+
+            ``accumulate_steps`` (int): number of steps for accumulating. Default: 1.
+
+        Examples:
+            .. code-block:: python
+                >>> import paddle
+                >>> import paddle.distributed as dist
+
+                >>> strategy = dist.Strategy()
+
+                >>> strategy.pipeline.enable = True
+                >>> strategy.pipeline.micro_batch_size = 2
+        """
+        return self._pipeline
+
+
 def to_static(
     layer: paddle.nn.Layer,
     loader=None,
@@ -306,29 +534,30 @@ def to_static(
     strategy=None,
 ):
     """
-    Converts the model and data loader used in dygraph auto-parallelism to
-    that in static mode auto-parallelism. to_static returns a DistModel
-    instance that provides APIs and a DistributedDataLoader to generate data
-    for static mode auto-parallel training, evaluation and prediction.
+    Converts the ``layer`` with distributed tensor (constructed from
+    ``paddle.distributed.shard_tensor``) to a static graph. to_static
+    returns a DistModel instance containing the static graph for
+    distributed training, evaluation and prediction, and an object of
+    DistributedDataLoader to generate data.
 
     Args:
-        layer(paddle.nn.Layer): The layer in dygraph model, the parameters
-            or its inputs can be sharded.
-        loader(paddle.io.DataLoader): The data loader used in dygraph model,
-            used to generate Distributed Dataloader for static auto parallel.
+        layer(paddle.nn.Layer): The layer in dygraph mode, the parameters
+            or its inputs can be distributed tensors.
+        loader(paddle.io.DataLoader): The data loader used in dygraph mode,
+            used to generate DistributedDataloader.
         loss(Loss|Callable|None, optional): The loss function for training
            or evaluating the model. Can be a `paddle.nn.Layer` instance or
           any callable function. Default: None.
        optimizer(paddle.optimizer.Optimizer|None, optional): The optimizer
            for training. Default: None.
-        strategy(Strategy|None, optional): Configs for parallel strategies
-            (e.g. data parallel, hybrid parallel etc.) and optimization
-            settings (e.g. mixed-precision). Default: None.
+        strategy(paddle.distributed.Strategy|None, optional): Configs for
+            parallel strategies and optimization settings (e.g. sharding,
+            pipeline parallelism). Default: None.
 
     Returns:
         DistModel: A DistModel that contains the corresponding computational graph
-        for the input layer and provides APIs for training, evaluation and
-        prediction.
+        for the input ``layer`` and provides APIs for training, evaluation
+        and prediction.
         DistributedDataLoader: An optimized data loader that can be used
         to generate data.
diff --git a/python/paddle/distributed/auto_parallel/strategy.py b/python/paddle/distributed/auto_parallel/strategy.py
index 0ee9c29d23348d..7f026827d34466 100644
--- a/python/paddle/distributed/auto_parallel/strategy.py
+++ b/python/paddle/distributed/auto_parallel/strategy.py
@@ -156,7 +156,7 @@ def __init__(self, config_dict=None):
 
 class Strategy(BaseConfig):
     """
-    The `Strategy` object is used to configure the parallelization and optimization behaviors.
+    The `Strategy` object is used to configure the parallelization and optimization for static graph.
 
     Args:
         config (dict|string, optional): If this is None, the default configurations will be used.
diff --git a/test/auto_parallel/CMakeLists.txt b/test/auto_parallel/CMakeLists.txt
index 84292d72660710..2bce1e6cf2c860 100644
--- a/test/auto_parallel/CMakeLists.txt
+++ b/test/auto_parallel/CMakeLists.txt
@@ -269,6 +269,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU)
   py_test_modules(test_cost_interface MODULES test_cost_interface)
   py_test_modules(test_auto_conditional_block MODULES
                   test_auto_conditional_block)
+  py_test_modules(test_strategy_api MODULES test_strategy_api)
   # End of unittests WITH single card WITHOUT timeout
 endif()
diff --git a/test/auto_parallel/hybrid_strategy/semi_auto_llama.py b/test/auto_parallel/hybrid_strategy/semi_auto_llama.py
index bbac6d8094593f..0dc578523b2c34 100644
--- a/test/auto_parallel/hybrid_strategy/semi_auto_llama.py
+++ b/test/auto_parallel/hybrid_strategy/semi_auto_llama.py
@@ -188,8 +188,15 @@ def run_dy2static(self):
         else:
             opt = optimizer
 
+        strategy = None
+        if self.gradient_accumulation_steps > 1:
+            strategy = dist.Strategy()
+            strategy.pipeline.accumulate_steps = (
+                self.gradient_accumulation_steps
+            )
+
         dist_model, dist_loader = dist.to_static(
-            model, train_dataloader, criterion, opt
+            model, train_dataloader, criterion, opt, strategy=strategy
         )
 
         dist_model.train()
diff --git a/test/auto_parallel/hybrid_strategy/test_semi_auto_parallel_hybrid_strategy.py b/test/auto_parallel/hybrid_strategy/test_semi_auto_parallel_hybrid_strategy.py
index 2fb8c2a7e7a812..137a48143664dd 100644
--- a/test/auto_parallel/hybrid_strategy/test_semi_auto_parallel_hybrid_strategy.py
+++ b/test/auto_parallel/hybrid_strategy/test_semi_auto_parallel_hybrid_strategy.py
@@ -212,7 +212,7 @@ def test_simple_net_hybrid_strategy(self):
 class TestSemiAutoParallelLlama3D(test_base.CommunicationTestDistBase):
     def setUp(self):
         super().setUp(num_of_devices=8, timeout=200, nnode=1)
-        self._default_envs = {"dp": "2", "mp": "2", "pp": "2", "acc_step": "1"}
+        self._default_envs = {"dp": "2", "mp": "2", "pp": "2", "acc_step": "2"}
         self._changeable_envs = {
             "backend": ["gpu"],
             "use_sp": ["true", "false"],
diff --git a/test/auto_parallel/semi_auto_parallel_dist_to_static_api.py b/test/auto_parallel/semi_auto_parallel_dist_to_static_api.py
index bd353e894c2b0d..3b86c7d5ed6fd5 100644
--- a/test/auto_parallel/semi_auto_parallel_dist_to_static_api.py
+++ b/test/auto_parallel/semi_auto_parallel_dist_to_static_api.py
@@ -150,8 +150,9 @@ def run_test(self):
         loss_fn = nn.MSELoss()
 
         # static training
+        strategy = dist.Strategy()
         dist_model, dist_loader = dist.to_static(
-            layer,
self.data_loader, loss_fn, opt + layer, self.data_loader, loss_fn, opt, strategy=strategy ) dist_model._mode = None diff --git a/test/auto_parallel/test_strategy_api.py b/test/auto_parallel/test_strategy_api.py new file mode 100644 index 00000000000000..0c2a954b69865c --- /dev/null +++ b/test/auto_parallel/test_strategy_api.py @@ -0,0 +1,108 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# import yaml +import unittest + +import paddle.distributed as dist + + +class TestStrategy(unittest.TestCase): + def test_default_config(self): + strategy = dist.Strategy() + self.assertEqual(strategy.sharding.enable, False) + self.assertEqual(strategy.sharding.stage, 1) + self.assertEqual(strategy.sharding.degree, 8) + + self.assertEqual(strategy.gradient_merge.enable, False) + self.assertEqual(strategy.gradient_merge.k_steps, 1) + self.assertEqual(strategy.gradient_merge.avg, True) + + self.assertEqual(strategy.pipeline.enable, False) + self.assertEqual(strategy.pipeline.schedule_mode, "1F1B") + self.assertEqual(strategy.pipeline.micro_batch_size, 1) + self.assertEqual(strategy.pipeline.accumulate_steps, 1) + + self.assertEqual(strategy.fused_passes.enable, False) + self.assertEqual(strategy.fused_passes.gemm_epilogue, False) + self.assertEqual(strategy.fused_passes.dropout_add, False) + + def test_modify_config(self): + strategy = dist.Strategy() + + strategy.sharding.enable = True + strategy.sharding.stage = 2 + strategy.sharding.degree = 16 + self.assertEqual(strategy.sharding.enable, True) + self.assertEqual(strategy.sharding.stage, 2) + self.assertEqual(strategy.sharding.degree, 16) + + strategy.gradient_merge.enable = True + strategy.gradient_merge.k_steps = 2 + strategy.gradient_merge.avg = False + self.assertEqual(strategy.gradient_merge.enable, True) + self.assertEqual(strategy.gradient_merge.k_steps, 2) + self.assertEqual(strategy.gradient_merge.avg, False) + + strategy.pipeline.enable = True + strategy.pipeline.schedule_mode = "FThenB" + strategy.pipeline.micro_batch_size = 2 + self.assertEqual(strategy.pipeline.enable, True) + self.assertEqual(strategy.pipeline.schedule_mode, "FThenB") + self.assertEqual(strategy.pipeline.micro_batch_size, 2) + + strategy.fused_passes.enable = True + strategy.fused_passes.gemm_epilogue = True + self.assertEqual(strategy.fused_passes.enable, True) + self.assertEqual(strategy.fused_passes.gemm_epilogue, True) + + def test_init_from_dict(self): + config = { + "sharding": {"enable": True, "stage": 2}, + "gradient_merge": {"enable": True, "k_steps": 2}, + "fused_passes": {"enable": True, "gemm_epilogue": True}, + "pipeline": {"enable": True, "schedule_mode": "FThenB"}, + } + strategy = dist.Strategy(config) + self.assertEqual(strategy.sharding.enable, True) + self.assertEqual(strategy.sharding.stage, 2) + self.assertEqual(strategy.sharding.degree, 8) # default + self.assertEqual(strategy.gradient_merge.enable, True) + self.assertEqual(strategy.gradient_merge.k_steps, 2) + 
self.assertEqual(strategy.gradient_merge.avg, True) # default + self.assertEqual(strategy.fused_passes.enable, True) + self.assertEqual(strategy.fused_passes.gemm_epilogue, True) + self.assertEqual(strategy.fused_passes.dropout_add, False) # default + + def test_error_init(self): + with self.assertRaises(ValueError): + config = [{"enable": True, "stage": 2}] + err_strategy1 = dist.Strategy(config) + + with self.assertRaises(ValueError): + config = { + "sharding": {"enable": True, "stage": 2}, + "gradient_merge": {"enable": True, "k_steps": 2}, + "fused_passes": { + "enable": True, + "gemm_epilogue": True, + "dropout": True, + }, + "pipeline": {"enable": True, "schedule_mode": "FThenB"}, + } + err_strategy2 = dist.Strategy(config) + + +if __name__ == '__main__': + unittest.main() From 866819acc86bf7a863470f70ecebac056d90e38a Mon Sep 17 00:00:00 2001 From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com> Date: Mon, 11 Dec 2023 10:40:10 +0800 Subject: [PATCH 04/28] [compiler opt]change_cc_test_old (#59477) * chang_cc_test_old * chang_cc_test_old * chang_cc_test_old * chang_cc_test_old * chang_cc_test_old * chang_cc_test_old * update * update * update * update * chang_cc_test_old * chang_cc_test_old * chang_cc_test_old * chang_cc_test_old * chang_cc_test_old * chang_cc_test_old * update * update * update * update * update * update * update * update * update * update * update * update * update * fix * update * update * fix bug in windows * fix windows bug * Update CMakeLists.txt * fix * Update CMakeLists.txt * fix * fix * Update CMakeLists.txt * Update CMakeLists.txt * fix win-infer * Update CMakeLists.txt * fix win infer --- cmake/generic.cmake | 4 +-- paddle/fluid/inference/utils/CMakeLists.txt | 21 ++++---------- paddle/fluid/inference/utils/io_utils.h | 28 ++++++++++--------- paddle/phi/common/place.h | 2 +- test/cpp/fluid/CMakeLists.txt | 3 ++ test/cpp/fluid/inference/CMakeLists.txt | 1 + test/cpp/fluid/inference/utils/CMakeLists.txt | 16 +++++++++++ .../fluid/inference/utils/io_utils_tester.cc | 0 test/cpp/pir/core/CMakeLists.txt | 25 ++++++----------- test/cpp/pir/core/ir_value_test.cc | 20 +++++++++++-- 10 files changed, 70 insertions(+), 50 deletions(-) create mode 100644 test/cpp/fluid/inference/CMakeLists.txt create mode 100644 test/cpp/fluid/inference/utils/CMakeLists.txt rename {paddle => test/cpp}/fluid/inference/utils/io_utils_tester.cc (100%) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index ab09d597499772..7c881edca0e4ed 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -595,8 +595,8 @@ function(paddle_test_build TARGET_NAME) add_executable(${TARGET_NAME} ${paddle_test_SRCS}) get_property(paddle_lib GLOBAL PROPERTY PADDLE_LIB_NAME) target_link_libraries(${TARGET_NAME} $ - ${paddle_test_DEPS} paddle_gtest_main_new) - add_dependencies(${TARGET_NAME} ${paddle_lib} ${paddle_test_DEPS} + ${paddle_test_DEPS} common paddle_gtest_main_new) + add_dependencies(${TARGET_NAME} ${paddle_lib} ${paddle_test_DEPS} common paddle_gtest_main_new) if(WITH_SHARED_PHI) target_link_libraries(${TARGET_NAME} $) diff --git a/paddle/fluid/inference/utils/CMakeLists.txt b/paddle/fluid/inference/utils/CMakeLists.txt index 46b74a60ad4449..3dbc06bfc11b7e 100644 --- a/paddle/fluid/inference/utils/CMakeLists.txt +++ b/paddle/fluid/inference/utils/CMakeLists.txt @@ -12,23 +12,14 @@ cc_library( SRCS model_utils.cc DEPS proto_desc enforce common) -cc_test_old( - infer_io_utils_tester - SRCS - io_utils_tester.cc - DEPS - infer_io_utils - fleet_executor - parallel_executor - python) 
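The recurring pattern in this commit: cc_test_old targets that enumerated many static deps are rewritten onto paddle_test (or inference_base_test), which links the prebuilt shared library instead. A schematic before/after with a made-up target name:

    # before: cc_test_old(foo_test SRCS foo_test.cc DEPS dep_a dep_b python)
    # after:  paddle_test(foo_test SRCS foo_test.cc DEPS pir)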
+cc_library(table_printer SRCS table_printer.cc)
+paddle_test(test_table_printer SRCS table_printer_tester.cc)
+
+proto_library(shape_range_info_proto SRCS shape_range_info.proto)
 
 if(WITH_ONNXRUNTIME AND WIN32)
   # Copy onnxruntime for some c++ test in Windows, since the test will
   # be build only in CI, so suppose the generator in Windows is Ninja.
-  copy_onnx(infer_io_utils_tester)
+  copy_onnx(test_benchmark)
+  copy_onnx(test_table_printer)
 endif()
-
-cc_library(table_printer SRCS table_printer.cc)
-paddle_test(test_table_printer SRCS table_printer_tester.cc)
-
-proto_library(shape_range_info_proto SRCS shape_range_info.proto)
diff --git a/paddle/fluid/inference/utils/io_utils.h b/paddle/fluid/inference/utils/io_utils.h
index 7ca274fc1d8469..1743bd85eafe77 100644
--- a/paddle/fluid/inference/utils/io_utils.h
+++ b/paddle/fluid/inference/utils/io_utils.h
@@ -30,19 +30,21 @@ namespace inference {
 
 constexpr uint32_t kCurPDTensorVersion = 0;
 
-void SerializePDTensorToStream(std::ostream* os, const PaddleTensor& tensor);
-void DeserializePDTensorToStream(std::istream& is, PaddleTensor* tensor);
+TEST_API void SerializePDTensorToStream(std::ostream* os,
+                                        const PaddleTensor& tensor);
+TEST_API void DeserializePDTensorToStream(std::istream& is,
+                                          PaddleTensor* tensor);
 
-void SerializePDTensorsToStream(std::ostream* os,
-                                const std::vector<PaddleTensor>& tensors);
-void DeserializePDTensorsToStream(std::istream& is,
-                                  std::vector<PaddleTensor>* tensors);
+TEST_API void SerializePDTensorsToStream(
+    std::ostream* os, const std::vector<PaddleTensor>& tensors);
+TEST_API void DeserializePDTensorsToStream(std::istream& is,
+                                           std::vector<PaddleTensor>* tensors);
 
-void SerializePDTensorsToFile(const std::string& path,
-                              const std::vector<PaddleTensor>& tensors);
-void DeserializePDTensorsToFile(const std::string& path,
-                                std::vector<PaddleTensor>* tensors);
-void SerializeShapeRangeInfo(
+TEST_API void SerializePDTensorsToFile(
+    const std::string& path, const std::vector<PaddleTensor>& tensors);
+TEST_API void DeserializePDTensorsToFile(const std::string& path,
+                                         std::vector<PaddleTensor>* tensors);
+TEST_API void SerializeShapeRangeInfo(
     const std::string& path,
    const std::map<std::string, std::vector<int32_t>>& min_shape,
    const std::map<std::string, std::vector<int32_t>>& max_shape,
    const std::map<std::string, std::vector<int32_t>>& opt_shape,
    const std::map<std::string, std::vector<int32_t>>& min_value,
    const std::map<std::string, std::vector<int32_t>>& max_value,
    const std::map<std::string, std::vector<int32_t>>& opt_value);
-void DeserializeShapeRangeInfo(
+TEST_API void DeserializeShapeRangeInfo(
     const std::string& path,
    std::map<std::string, std::vector<int32_t>>* min_shape,
    std::map<std::string, std::vector<int32_t>>* max_shape,
    std::map<std::string, std::vector<int32_t>>* opt_shape,
    std::map<std::string, std::vector<int32_t>>* min_value,
    std::map<std::string, std::vector<int32_t>>* max_value,
    std::map<std::string, std::vector<int32_t>>* opt_value);
-void UpdateShapeRangeInfo(
+TEST_API void UpdateShapeRangeInfo(
     const std::string& path,
    const std::map<std::string, std::vector<int32_t>>& min_shape,
    const std::map<std::string, std::vector<int32_t>>& max_shape,
diff --git a/paddle/phi/common/place.h b/paddle/phi/common/place.h
index 8865d9c2690cdb..6f8bfe01bf6ae2 100644
--- a/paddle/phi/common/place.h
+++ b/paddle/phi/common/place.h
@@ -55,7 +55,7 @@ class TEST_API CustomRegisteredDeviceMap {
 
 const char* AllocationTypeStr(AllocationType type);
 
 /// \brief The place is used to specify where the data is stored.
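The PADDLE_API -> TEST_API switch below follows the same pattern as io_utils.h above; the working assumption (not the macro's actual definition) is that TEST_API keeps a symbol exported from the shared library so the standalone paddle_test binaries can still link it:

    // sketch of the assumed intent only
    class TEST_API Place { /* ... */ };  // stays visible to test executables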
-class PADDLE_API Place { +class TEST_API Place { public: Place() : device(0), alloc_type_(AllocationType::UNDEFINED), device_type_id_(0) {} diff --git a/test/cpp/fluid/CMakeLists.txt b/test/cpp/fluid/CMakeLists.txt index 32e73e0cda6009..59ed51f7681685 100644 --- a/test/cpp/fluid/CMakeLists.txt +++ b/test/cpp/fluid/CMakeLists.txt @@ -1,5 +1,8 @@ add_subdirectory(benchmark) add_subdirectory(framework) + +add_subdirectory(inference) + if(WITH_CINN) add_subdirectory(cinn) endif() diff --git a/test/cpp/fluid/inference/CMakeLists.txt b/test/cpp/fluid/inference/CMakeLists.txt new file mode 100644 index 00000000000000..512d2b1553c8c9 --- /dev/null +++ b/test/cpp/fluid/inference/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(utils) diff --git a/test/cpp/fluid/inference/utils/CMakeLists.txt b/test/cpp/fluid/inference/utils/CMakeLists.txt new file mode 100644 index 00000000000000..3ea72839b19243 --- /dev/null +++ b/test/cpp/fluid/inference/utils/CMakeLists.txt @@ -0,0 +1,16 @@ +if(WITH_TESTING) + if(NOT APPLE) + inference_base_test( + infer_io_utils_tester SRCS io_utils_tester.cc + DEPS + paddle_inference_shared + common + ) + endif() +endif() + +if(WITH_ONNXRUNTIME AND WIN32) + # Copy onnxruntime for some c++ test in Windows, since the test will + # be build only in CI, so suppose the generator in Windows is Ninja. + copy_onnx(infer_io_utils_tester) +endif() diff --git a/paddle/fluid/inference/utils/io_utils_tester.cc b/test/cpp/fluid/inference/utils/io_utils_tester.cc similarity index 100% rename from paddle/fluid/inference/utils/io_utils_tester.cc rename to test/cpp/fluid/inference/utils/io_utils_tester.cc diff --git a/test/cpp/pir/core/CMakeLists.txt b/test/cpp/pir/core/CMakeLists.txt index 5a5981fccee931..e8ab757d38104c 100644 --- a/test/cpp/pir/core/CMakeLists.txt +++ b/test/cpp/pir/core/CMakeLists.txt @@ -1,10 +1,7 @@ -cc_test( - type_test - SRCS type_test.cc - DEPS pir op_dialect_vjp) -cc_test_old(ir_attribute_test SRCS ir_attribute_test.cc DEPS pir gtest) -cc_test_old(ir_value_test SRCS ir_value_test.cc DEPS pir gtest) -cc_test_old( +paddle_test(type_test SRCS type_test.cc DEPS pir op_dialect_vjp) +paddle_test(ir_attribute_test SRCS ir_attribute_test.cc DEPS pir) +paddle_test(ir_value_test SRCS ir_value_test.cc DEPS pir) +paddle_test( ir_op_test SRCS ir_op_test.cc @@ -12,8 +9,8 @@ cc_test_old( pir test_dialect op_dialect_vjp) -cc_test_old(ir_region_test SRCS ir_region_test.cc DEPS pir gtest) -cc_test_old(ir_builder_test SRCS ir_builder_test.cc DEPS pir gtest) +paddle_test(ir_region_test SRCS ir_region_test.cc DEPS pir) +paddle_test(ir_builder_test SRCS ir_builder_test.cc DEPS pir) cc_test_old( ir_program_test SRCS @@ -134,14 +131,8 @@ cc_test_old( gtest pir) -cc_test_old( - block_argument_test - SRCS - block_argument_test.cc - DEPS - test_dialect - gtest - pir) +paddle_test(block_argument_test SRCS block_argument_test.cc DEPS test_dialect + pir) if(WITH_ONNXRUNTIME AND WIN32) # Copy onnxruntime for some c++ test in Windows, since the test will diff --git a/test/cpp/pir/core/ir_value_test.cc b/test/cpp/pir/core/ir_value_test.cc index dba46b72c08a08..c6863fe7b8a2ae 100644 --- a/test/cpp/pir/core/ir_value_test.cc +++ b/test/cpp/pir/core/ir_value_test.cc @@ -14,17 +14,33 @@ #include +#include "paddle/fluid/pir/dialect/operator/ir/op_type.h" #include "paddle/pir/core/attribute.h" #include "paddle/pir/core/builtin_attribute.h" +#include "paddle/pir/core/builtin_type.h" #include "paddle/pir/core/ir_context.h" #include "paddle/pir/core/operation.h" - -#include "test/cpp/pir/tools/test_pir_utils.h" 
+#include "paddle/pir/dialect/shape/utils/shape_utils.h" // This unittest is used to test the construction interfaces of value class and // operation. The constructed test scenario is: a = OP1(); b = OP2(); c = OP3(a, // b); d, e, f, g, h, i, j = OP4(a, c); +namespace test { + +pir::AttributeMap CreateAttributeMap( + const std::vector &attribute_names, + const std::vector &attributes) { + pir::IrContext *ctx = pir::IrContext::Instance(); + pir::AttributeMap attr_map; + for (size_t i = 0; i < attribute_names.size(); i++) { + pir::Attribute attr_value = pir::StrAttribute::get(ctx, attributes[i]); + attr_map.insert( + std::pair(attribute_names[i], attr_value)); + } + return attr_map; +} +} // namespace test TEST(value_test, value_test) { pir::IrContext *ctx = pir::IrContext::Instance(); // 1. Construct OP1: a = OP1() From 217cc544bea49d1a3babc360089a07f42e88a2fc Mon Sep 17 00:00:00 2001 From: Ryan <44900829+DrRyanHuang@users.noreply.github.com> Date: Mon, 11 Dec 2023 10:49:36 +0800 Subject: [PATCH 05/28] =?UTF-8?q?=E3=80=90PIR=20API=20adaptor=20No.258?= =?UTF-8?q?=E3=80=81295=E3=80=81299=E3=80=81307=E3=80=91=20Migrate=20glu/r?= =?UTF-8?q?ank/sgn/take=20into=20pir=20(#59535)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- python/paddle/tensor/attribute.py | 8 +- python/paddle/tensor/math.py | 16 ++- test/dygraph_to_static/test_function_spec.py | 4 +- test/legacy_test/test_glu.py | 3 + test/legacy_test/test_sgn.py | 110 ++++++++++++++----- test/legacy_test/test_take.py | 23 ++-- 6 files changed, 121 insertions(+), 43 deletions(-) diff --git a/python/paddle/tensor/attribute.py b/python/paddle/tensor/attribute.py index 8bc7cff200b344..538c789fdf5960 100644 --- a/python/paddle/tensor/attribute.py +++ b/python/paddle/tensor/attribute.py @@ -49,7 +49,7 @@ def rank(input): >>> print(rank.numpy()) 3 """ - check_type(input, 'input', (Variable), 'input') + check_type(input, 'input', (Variable, paddle.pir.Value), 'input') ndims = len(input.shape) out = assign(np.array(ndims, 'int32')) @@ -163,12 +163,16 @@ def is_complex(x): >>> print(paddle.is_complex(x)) False """ - if not isinstance(x, (paddle.Tensor, paddle.static.Variable)): + if not isinstance( + x, (paddle.Tensor, paddle.static.Variable, paddle.pir.Value) + ): raise TypeError(f"Expected Tensor, but received type of x: {type(x)}") dtype = x.dtype is_complex_dtype = ( dtype == core.VarDesc.VarType.COMPLEX64 or dtype == core.VarDesc.VarType.COMPLEX128 + or dtype == core.DataType.COMPLEX64 + or dtype == core.DataType.COMPLEX128 ) return is_complex_dtype diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index a6c787863351c7..63ed9f1248d54c 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -6230,6 +6230,11 @@ def sgn(x, name=None): paddle.float64, paddle.complex64, paddle.complex128, + DataType.FLOAT16, + DataType.FLOAT32, + DataType.FLOAT64, + DataType.COMPLEX64, + DataType.COMPLEX128, ]: raise TypeError( f"The data type of input must be one of ['float16', 'float32', 'float64', 'complex64', 'complex128'], but got {x.dtype}" @@ -6317,12 +6322,17 @@ def take(x, index, mode='raise', name=None): f"'mode' in 'take' should be 'raise', 'wrap', 'clip', but received {mode}." 
) - if in_dynamic_mode(): - if not isinstance(index, (paddle.Tensor, Variable)): + if in_dynamic_or_pir_mode(): + if not isinstance(index, (paddle.Tensor, Variable, paddle.pir.Value)): raise TypeError( f"The type of 'index' must be Tensor, but got {type(index)}" ) - if index.dtype not in [paddle.int32, paddle.int64]: + if index.dtype not in [ + paddle.int32, + paddle.int64, + DataType.INT32, + DataType.INT64, + ]: raise TypeError( "The data type of 'index' must be one of ['int32', 'int64'], but got {}".format( index.dtype diff --git a/test/dygraph_to_static/test_function_spec.py b/test/dygraph_to_static/test_function_spec.py index 780a4ccafc40b7..1109c48082e7d4 100644 --- a/test/dygraph_to_static/test_function_spec.py +++ b/test/dygraph_to_static/test_function_spec.py @@ -106,9 +106,7 @@ def test_args_to_input_spec(self): if in_pir_mode(): self.assertEqual(input_with_spec[1].shape, [4, 10]) # b.shape else: - self.assertTupleEqual( - tuple(input_with_spec[1].shape), (4, 10) - ) # b.shape + self.assertTupleEqual(input_with_spec[1].shape, (4, 10)) # b.shape self.assertEqual(input_with_spec[1].name, 'b_var') # b.name diff --git a/test/legacy_test/test_glu.py b/test/legacy_test/test_glu.py index 1baa3295b9a4a3..22253f820888ef 100644 --- a/test/legacy_test/test_glu.py +++ b/test/legacy_test/test_glu.py @@ -20,6 +20,7 @@ import paddle.base.dygraph as dg from paddle import base, nn from paddle.nn import functional as F +from paddle.pir_utils import test_with_pir_api def sigmoid(x): @@ -58,6 +59,7 @@ def glu_axis_size(self): x = paddle.static.data(name='x', shape=[1, 2, 3], dtype='float32') paddle.nn.functional.glu(x, axis=256) + @test_with_pir_api def test_errors(self): self.assertRaises(ValueError, self.glu_axis_size) @@ -92,6 +94,7 @@ def glu_axis_size(self): act = nn.GLU(256) act(x) + @test_with_pir_api def test_errors(self): self.assertRaises(ValueError, self.glu_axis_size) act = nn.GLU(256) diff --git a/test/legacy_test/test_sgn.py b/test/legacy_test/test_sgn.py index bf9517f42767c3..abcba6b950eda3 100644 --- a/test/legacy_test/test_sgn.py +++ b/test/legacy_test/test_sgn.py @@ -15,8 +15,10 @@ import unittest import numpy as np +from utils import static_guard import paddle +from paddle.pir_utils import test_with_pir_api def np_sgn(x: np.ndarray): @@ -31,7 +33,7 @@ def np_sgn(x: np.ndarray): class TestSgnError(unittest.TestCase): - def test_errors(self): + def test_errors_dynamic(self): # The input dtype of sgn must be float16, float32, float64,complex64,complex128. input2 = paddle.to_tensor( np.random.randint(-10, 10, size=[12, 20]).astype('int32') @@ -43,33 +45,28 @@ def test_errors(self): self.assertRaises(TypeError, paddle.sgn, input2) self.assertRaises(TypeError, paddle.sgn, input3) + @test_with_pir_api + def test_errors_static_and_pir(self): + paddle.enable_static() + main_program = paddle.static.Program() + startup_program = paddle.static.Program() -class TestSignAPI(unittest.TestCase): - def setUp(self) -> None: - self.support_dtypes = [ - 'float16', - 'float32', - 'float64', - 'complex64', - 'complex128', - ] - if paddle.device.get_device() == 'cpu': - self.support_dtypes = [ - 'float32', - 'float64', - 'complex64', - 'complex128', - ] - - def test_dtype(self): - for dtype in self.support_dtypes: - x = paddle.to_tensor( - np.random.randint(-10, 10, size=[12, 20, 2]).astype(dtype) + with paddle.static.program_guard(main_program, startup_program): + # The input dtype of sgn must be float16, float32, float64,complex64,complex128. 
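For reference, the np_sgn oracle defined at the top of this file computes sgn(x) = x / |x| for complex inputs (0 where x == 0) and falls back to np.sign for real dtypes, so for example:

    >>> np_sgn(np.array([3 + 4j]))  # |3 + 4j| == 5
    array([0.6+0.8j])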
+ input2 = paddle.to_tensor( + np.random.randint(-10, 10, size=[12, 20]).astype('int32') + ) + input3 = paddle.to_tensor( + np.random.randint(-10, 10, size=[12, 20]).astype('int64') ) - paddle.sgn(x) + self.assertRaises(TypeError, paddle.sgn, input2) + self.assertRaises(TypeError, paddle.sgn, input3) + paddle.disable_static() + - def test_complex(self): +class TestSignAPI(unittest.TestCase): + def test_complex_dynamic(self): for dtype in ['complex64', 'complex128']: np_x = np.array( [[3 + 4j, 7 - 24j, 0, 1 + 2j], [6 + 8j, 3, 0, -2]], dtype=dtype @@ -80,8 +77,38 @@ def test_complex(self): z_expected = np_sgn(np_x) np.testing.assert_allclose(np_z, z_expected, rtol=1e-05) - def test_float(self): - for dtype in self.support_dtypes: + @test_with_pir_api + def test_complex_static_and_pir(self): + with static_guard(): + for dtype in ['complex64', 'complex128']: + exe = paddle.static.Executor() + + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + with paddle.static.program_guard( + train_program, startup_program + ): + x = paddle.static.data(name='X', shape=[2, 4], dtype=dtype) + z = paddle.sgn(x) + + # Run the startup program once and only once. + # Not need to optimize/compile the startup program. + exe.run(startup_program) + + # Run the main program directly without compile. + x = np.array( + [[3 + 4j, 7 - 24j, 0, 1 + 2j], [6 + 8j, 3, 0, -2]], + dtype=dtype, + ) + (z,) = exe.run(train_program, feed={"X": x}, fetch_list=[z]) + z_expected = np_sgn(x) + np.testing.assert_allclose(z, z_expected, rtol=1e-05) + + def test_float_dynamic(self): + dtype_list = ['float32', 'float64'] + if paddle.is_compiled_with_cuda(): + dtype_list.append('float16') + for dtype in dtype_list: np_x = np.random.randint(-10, 10, size=[12, 20, 2]).astype(dtype) x = paddle.to_tensor(np_x) z = paddle.sgn(x) @@ -89,6 +116,37 @@ def test_float(self): z_expected = np_sgn(np_x) np.testing.assert_allclose(np_z, z_expected, rtol=1e-05) + @test_with_pir_api + def test_float_static_and_pir(self): + dtype_list = ['float32', 'float64'] + if paddle.is_compiled_with_cuda(): + dtype_list.append('float16') + with static_guard(): + for dtype in dtype_list: + exe = paddle.static.Executor() + + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + with paddle.static.program_guard( + train_program, startup_program + ): + np_x = np.random.randint(-10, 10, size=[12, 20, 2]).astype( + dtype + ) + x = paddle.static.data( + name='X', shape=[12, 20, 2], dtype=dtype + ) + z = paddle.sgn(x) + + # Run the startup program once and only once. + # Not need to optimize/compile the startup program. + exe.run(startup_program) + + # Run the main program directly without compile. 
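A note on the decorator driving the new static-graph cases in this file: test_with_pir_api (from paddle.pir_utils) is assumed here to run the decorated body twice, once under the legacy static-graph IR and once with PIR enabled, so each exe.run below is exercised against both IRs:

    @test_with_pir_api
    def test_float_static_and_pir(self):
        ...  # body executes under both IRs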
+ (z,) = exe.run(train_program, feed={"X": np_x}, fetch_list=[z]) + z_expected = np_sgn(np_x) + np.testing.assert_allclose(z, z_expected, rtol=1e-05) + if __name__ == "__main__": unittest.main() diff --git a/test/legacy_test/test_take.py b/test/legacy_test/test_take.py index 2ea39ae96dc90a..72683f114b65c6 100644 --- a/test/legacy_test/test_take.py +++ b/test/legacy_test/test_take.py @@ -18,7 +18,8 @@ import paddle from paddle import base -from paddle.base import Program, core, program_guard +from paddle.base import core +from paddle.pir_utils import test_with_pir_api class TestTakeAPI(unittest.TestCase): @@ -49,11 +50,12 @@ def setUp(self): else base.CPUPlace() ) + @test_with_pir_api def test_static_graph(self): paddle.enable_static() - startup_program = Program() - train_program = Program() - with program_guard(startup_program, train_program): + startup_program = paddle.static.Program() + train_program = paddle.static.Program() + with paddle.static.program_guard(startup_program, train_program): x = paddle.static.data( name='input', dtype=self.input_dtype, shape=self.input_shape ) @@ -62,9 +64,9 @@ def test_static_graph(self): ) out = paddle.take(x, index, mode=self.mode) - exe = base.Executor(self.place) + exe = paddle.static.Executor(self.place) st_result = exe.run( - base.default_main_program(), + paddle.static.default_main_program(), feed={'input': self.input_np, 'index': self.index_np}, fetch_list=out, ) @@ -111,10 +113,11 @@ def set_dtype(self): class TestTakeTypeError(TestTakeAPI): """Test take Type Error""" + @test_with_pir_api def test_static_type_error(self): """Argument 'index' must be Tensor""" paddle.enable_static() - with program_guard(Program()): + with paddle.static.program_guard(paddle.static.Program()): x = paddle.static.data( name='input', dtype=self.input_dtype, shape=self.input_shape ) @@ -127,10 +130,11 @@ def test_dygraph_type_error(self): x = paddle.to_tensor(self.input_np) self.assertRaises(TypeError, paddle.take, x, self.index_np, self.mode) + @test_with_pir_api def test_static_dtype_error(self): """Data type of argument 'index' must be in [paddle.int32, paddle.int64]""" paddle.enable_static() - with program_guard(Program()): + with paddle.static.program_guard(paddle.static.Program()): x = paddle.static.data( name='input', dtype='float64', shape=self.input_shape ) @@ -178,11 +182,12 @@ def setUp(self): else base.CPUPlace() ) + @test_with_pir_api def test_static_index_error(self): """When the index is out of range, an error is reported directly through `paddle.index_select`""" paddle.enable_static() - with program_guard(Program()): + with paddle.static.program_guard(paddle.static.Program()): x = paddle.static.data( name='input', dtype=self.input_dtype, shape=self.input_shape ) From e5fbff457077b8545351b98830c203fbccb94ae7 Mon Sep 17 00:00:00 2001 From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> Date: Mon, 11 Dec 2023 11:12:57 +0800 Subject: [PATCH 06/28] [PIR] Refine conditional_block op translator (#59723) * refine * fix * add select_input * fix * fix * refine if_op without falseblock * fix bug * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix --- .../new_executor/instruction/CMakeLists.txt | 3 +- ...{cond_instruction.cc => if_instruction.cc} | 35 +-- .../{cond_instruction.h => if_instruction.h} | 20 +- .../instruction/select_input_instruction.cc | 140 ++++++++++++ .../instruction/select_input_instruction.h | 52 +++++ .../instruction/while_instruction.cc | 22 +- .../pir_adaptor/pir_adaptor_util.h | 4 +- 
 .../framework/new_executor/pir_interpreter.cc |  11 +-
 .../ir_adaptor/translator/op_translator.cc    | 105 +++++++++
 .../translator/program_translator.cc          | 207 ++++++++----------
 .../translator/program_translator.h           |  26 +--
 .../dialect/operator/ir/control_flow_op.cc    |  76 ++++---
 .../pir/dialect/operator/ir/manual_op.cc      |  91 ++++++++
 .../fluid/pir/dialect/operator/ir/manual_op.h |  12 +
 .../pir/dialect/operator/ir/op_dialect.cc     |  19 +-
 .../pir/transforms/pd_op_to_kernel_pass.cc    |  27 +++
 test/cpp/pir/core/program_translator_test.cc  |  73 ++----
 test/dygraph_to_static/test_ifelse.py         |  57 ++++-
 .../test_program_translator.py                |   3 +
 test/dygraph_to_static/test_return.py         | 144 +++++++++++-
 test/dygraph_to_static/test_warning.py        |   8 +-
 test/legacy_test/test_cond.py                 |   4 +-
 22 files changed, 845 insertions(+), 294 deletions(-)
 rename paddle/fluid/framework/new_executor/instruction/{cond_instruction.cc => if_instruction.cc} (90%)
 rename paddle/fluid/framework/new_executor/instruction/{cond_instruction.h => if_instruction.h} (80%)
 create mode 100644 paddle/fluid/framework/new_executor/instruction/select_input_instruction.cc
 create mode 100644 paddle/fluid/framework/new_executor/instruction/select_input_instruction.h

diff --git a/paddle/fluid/framework/new_executor/instruction/CMakeLists.txt b/paddle/fluid/framework/new_executor/instruction/CMakeLists.txt
index bfdd4f2b50db48..46d907d60841b8 100644
--- a/paddle/fluid/framework/new_executor/instruction/CMakeLists.txt
+++ b/paddle/fluid/framework/new_executor/instruction/CMakeLists.txt
@@ -3,8 +3,9 @@ cc_library(
   SRCS instruction_base.cc
        phi_kernel_instruction.cc
        legacy_kernel_instruction.cc
-       cond_instruction.cc
+       if_instruction.cc
        while_instruction.cc
+       select_input_instruction.cc
       has_elements_instruction.cc
       tuple_push_instruction.cc
       tuple_pop_instruction.cc
diff --git a/paddle/fluid/framework/new_executor/instruction/cond_instruction.cc b/paddle/fluid/framework/new_executor/instruction/if_instruction.cc
similarity index 90%
rename from paddle/fluid/framework/new_executor/instruction/cond_instruction.cc
rename to paddle/fluid/framework/new_executor/instruction/if_instruction.cc
index a25d7d2a5a6df4..3ac3a9e4780be3 100644
--- a/paddle/fluid/framework/new_executor/instruction/cond_instruction.cc
+++ b/paddle/fluid/framework/new_executor/instruction/if_instruction.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/new_executor/instruction/cond_instruction.h"
+#include "paddle/fluid/framework/new_executor/instruction/if_instruction.h"
 
 #include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h"
 #include "paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h"
@@ -39,11 +39,11 @@
 namespace paddle {
 namespace framework {
 
-CondInstruction::CondInstruction(size_t id,
-                                 const platform::Place& place,
-                                 pir::Operation* op,
-                                 ValueExecutionInfo* value_exec_info,
-                                 const std::set<std::string>& skip_gc_vars)
+IfInstruction::IfInstruction(size_t id,
+                             const platform::Place& place,
+                             pir::Operation* op,
+                             ValueExecutionInfo* value_exec_info,
+                             const std::set<std::string>& skip_gc_vars)
     : InstructionBase(id, place) {
   PADDLE_ENFORCE(
       op->isa<paddle::dialect::IfOp>(),
@@ -66,12 +66,14 @@ CondInstruction::CondInstruction(size_t id,
   // OpOperand of IfOp, and the other is external Values used in true_block or
   // false_block.
auto& true_branch_block = if_op.true_block(); - auto& false_branch_block = if_op.false_block(); + std::unordered_map> inputs; GetInputIds(op, *value_exec_info, &inputs); auto true_outside_inputs = GetExternalInputs(&true_branch_block, *value_exec_info, &inputs); - auto false_outside_inputs = + std::vector false_outside_inputs; + auto& false_branch_block = if_op.false_block(); + false_outside_inputs = GetExternalInputs(&false_branch_block, *value_exec_info, &inputs); SetInputs(inputs); @@ -90,8 +92,10 @@ CondInstruction::CondInstruction(size_t id, } } InsertTuplePushContinerToOuts(&true_branch_block, *value_exec_info, &outputs); + InsertTuplePushContinerToOuts( - &false_branch_block, *value_exec_info, &outputs); + &if_op.false_block(), *value_exec_info, &outputs); + SetOutputs(outputs); VLOG(6) << "finish process inputs outputs index"; @@ -126,11 +130,10 @@ CondInstruction::CondInstruction(size_t id, false_branch_inter_ = new PirInterpreter(place, {}, - &false_branch_block, + &if_op.false_block(), false_scope, value_exec_info->NewChild(false_scope), {}); - std::set false_skip_gc_names_set; for (auto value : GetYiedOpInputs(&false_branch_block)) { false_branch_outputs_.push_back(false_branch_inter_->GetNameByValue(value)); @@ -146,10 +149,11 @@ CondInstruction::CondInstruction(size_t id, false_skip_gc_names_set.insert(var_name); } false_branch_inter_->SetSkipGcVars(false_skip_gc_names_set); + VLOG(6) << "finish process false branch interpreter"; } -CondInstruction::~CondInstruction() { +IfInstruction::~IfInstruction() { if (true_branch_inter_ != nullptr) { delete true_branch_inter_; } @@ -158,8 +162,8 @@ CondInstruction::~CondInstruction() { } } -void CondInstruction::CopyBranchOutput( - const std::vector& var_names, const PirInterpreter* inter) { +void IfInstruction::CopyBranchOutput(const std::vector& var_names, + const PirInterpreter* inter) { for (size_t i = 0; i < var_names.size(); ++i) { auto* inner_var = inter->InnerScope()->GetVar(var_names[i]); @@ -179,7 +183,7 @@ void CondInstruction::CopyBranchOutput( } } -void CondInstruction::Run() { +void IfInstruction::Run() { DeviceContext().Wait(); if (cond_var_->Get().data()[0]) { true_branch_inter_->Run({}, false); @@ -188,7 +192,6 @@ void CondInstruction::Run() { false_branch_inter_->Run({}, false); CopyBranchOutput(false_branch_outputs_, false_branch_inter_); } - // copy ouptut } diff --git a/paddle/fluid/framework/new_executor/instruction/cond_instruction.h b/paddle/fluid/framework/new_executor/instruction/if_instruction.h similarity index 80% rename from paddle/fluid/framework/new_executor/instruction/cond_instruction.h rename to paddle/fluid/framework/new_executor/instruction/if_instruction.h index 45f39ba338814f..e6d1fc4723c5d6 100644 --- a/paddle/fluid/framework/new_executor/instruction/cond_instruction.h +++ b/paddle/fluid/framework/new_executor/instruction/if_instruction.h @@ -27,15 +27,15 @@ class Value; class PirInterpreter; class ValueExecutionInfo; -class CondInstruction : public InstructionBase { +class IfInstruction : public InstructionBase { public: - CondInstruction(size_t id, - const platform::Place& place, - ::pir::Operation* op, - ValueExecutionInfo* value_exe_info, - const std::set& skip_gc_vars); + IfInstruction(size_t id, + const platform::Place& place, + ::pir::Operation* op, + ValueExecutionInfo* value_exe_info, + const std::set& skip_gc_vars); - ~CondInstruction(); + ~IfInstruction(); void Run() override; @@ -53,15 +53,15 @@ class CondInstruction : public InstructionBase { ::pir::Operation* op_; - std::string 
cond_name_{"cond_instruction"}; + std::string cond_name_{"if_instruction"}; Variable* cond_var_; std::vector output_vars_; - PirInterpreter* true_branch_inter_; + PirInterpreter* true_branch_inter_ = nullptr; - PirInterpreter* false_branch_inter_; + PirInterpreter* false_branch_inter_ = nullptr; std::vector true_branch_outputs_; diff --git a/paddle/fluid/framework/new_executor/instruction/select_input_instruction.cc b/paddle/fluid/framework/new_executor/instruction/select_input_instruction.cc new file mode 100644 index 00000000000000..893915f841d7fc --- /dev/null +++ b/paddle/fluid/framework/new_executor/instruction/select_input_instruction.cc @@ -0,0 +1,140 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/new_executor/instruction/select_input_instruction.h" +#include "paddle/fluid/framework/new_executor/instruction/instruction_util.h" +#include "paddle/fluid/framework/new_executor/new_executor_defs.h" +#include "paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.h" + +namespace paddle { +namespace framework { + +SelectInputInstruction::SelectInputInstruction( + size_t id, + const platform::Place &place, + ::pir::Operation *op, + ValueExecutionInfo *value_exe_info) + : InstructionBase(id, place), op_(op) { + VLOG(6) << "construct select_input instruction"; + + std::unordered_map> inputs; + mask_ = value_exe_info->GetVarByValue(op->operand_source(0)); + inputs.emplace(op->operand_source(0), + GetValueIds(op->operand_source(0), *value_exe_info)); + + for (size_t i = 1; i < op->num_operands(); ++i) { + inputs_.push_back(value_exe_info->GetVarByValue(op->operand_source(i))); + inputs.emplace(op->operand_source(i), + GetValueIds(op->operand_source(i), *value_exe_info)); + } + SetInputs(inputs); + + std::unordered_map> outputs; + out_ = value_exe_info->GetVarByValue(op->result(0)); + outputs.emplace(op->result(0), GetValueIds(op->result(0), *value_exe_info)); + SetOutputs(outputs); +} + +inline int GetBranchNumber(const phi::DenseTensor &mask) { + PADDLE_ENFORCE_EQ( + mask.numel(), + 1, + phi::errors::Fatal("The numel of Input(Mask) in SelectInputOp or " + "SelectOutputOp must be 1. " + "But received %d, and it's shape is [%s].", + mask.numel(), + mask.dims())); + if (platform::is_cpu_place(mask.place())) { + return mask.data()[0]; + } + // when platform::is_gpu_place(mask.place()) is true + std::unique_ptr cpu_mask{new phi::DenseTensor()}; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_CUSTOM_DEVICE) || defined(PADDLE_WITH_XPU) + framework::TensorCopySync(mask, platform::CPUPlace(), cpu_mask.get()); +#else + PADDLE_THROW(phi::errors::Fatal( + "This version of PaddlePaddle does NOT support GPU, " + "but got GPU tensor 'Mask' in SelectInputOp or SelectOutputOp. 
" + "Please compile PaddlePaddle WITH_GPU first.")); +#endif + return cpu_mask->data()[0]; +} + +class AssignFunctor { + public: + explicit AssignFunctor(Variable *out) : out_(out) {} + + void operator()(const phi::DenseTensor &lod_tensor) const { + auto &out_tensor = *out_->GetMutable(); + copy_tensor(lod_tensor, &out_tensor); + } + + void operator()(const phi::TensorArray &array) const { + auto &out_array = *out_->GetMutable(); + out_array.resize(array.size()); + for (size_t i = 0; i < array.size(); ++i) { + copy_tensor(array[i], &out_array[i]); + } + } + + void operator()(const phi::SelectedRows &rows) const { + phi::SelectedRows &out_rows = *out_->GetMutable(); + out_rows.set_rows(rows.rows()); + out_rows.set_height(rows.height()); + auto &t = rows.value(); + auto *m = out_rows.mutable_value(); + TensorCopy(t, t.place(), m); + } + + template + void operator()(const T &v UNUSED) const { + PADDLE_ENFORCE_EQ( + true, + false, + platform::errors::PermissionDenied( + "Not support type for assign op with type %s", typeid(T).name())); + } + + private: + void copy_tensor(const phi::DenseTensor &lod_tensor, + phi::DenseTensor *out) const { + if (!lod_tensor.IsInitialized()) return; + auto &out_tensor = *out; + TensorCopy(lod_tensor, lod_tensor.place(), &out_tensor); + out_tensor.set_lod(lod_tensor.lod()); + } + + Variable *out_; +}; + +void SelectInputInstruction::Run() { + VLOG(6) << "run select_input instruction"; + auto &mask = mask_->Get(); + size_t output_branch = static_cast(GetBranchNumber(mask)); + PADDLE_ENFORCE_LT( + output_branch, + inputs_.size(), + phi::errors::Fatal( + "Input 'Mask' in SelectInputOp is invalid. " + "'Mask' must be less than the size of input vector 'X'. " + "But received Mask = %d, X's size = %d.", + output_branch, + inputs_.size())); + Variable *selected = inputs_[output_branch]; + VisitVarType(*selected, AssignFunctor(out_)); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/new_executor/instruction/select_input_instruction.h b/paddle/fluid/framework/new_executor/instruction/select_input_instruction.h new file mode 100644 index 00000000000000..16038e66152f69 --- /dev/null +++ b/paddle/fluid/framework/new_executor/instruction/select_input_instruction.h @@ -0,0 +1,52 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include "paddle/fluid/framework/new_executor/instruction/instruction_base.h" + +namespace paddle { +namespace framework { +class ValueExecutionInfo; + +class SelectInputInstruction : public InstructionBase { + public: + SelectInputInstruction(size_t id, + const platform::Place& place, + ::pir::Operation* op, + ValueExecutionInfo* value_exe_info); + + void Run() override; + + const std::string& Name() const override { return name_; } + + ::pir::Operation* Operation() const override { return op_; } + + private: + ::pir::Operation* op_; + + OpFuncType type_; + + std::string name_{"pd_op.select_input"}; + + Variable* mask_; // not owned + + std::vector inputs_; // not owned + + Variable* out_; // not owned +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/new_executor/instruction/while_instruction.cc b/paddle/fluid/framework/new_executor/instruction/while_instruction.cc index aee25e8d816843..2f3787118d2e48 100644 --- a/paddle/fluid/framework/new_executor/instruction/while_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/while_instruction.cc @@ -175,20 +175,14 @@ void WhileInstruction::CopyOutputsToBlockArgs() { auto* dst_tensor_array = inner_var->GetMutable(); dst_tensor_array->set_type(src_tensor_array.dtype()); dst_tensor_array->set_layout(src_tensor_array.layout()); - if (dst_tensor_array->empty()) { - for (auto src_tensor : src_tensor_array) { - phi::DenseTensor* tmp_dst_tensor = new phi::DenseTensor(); - tmp_dst_tensor->set_meta(src_tensor.meta()); - framework::TensorCopy(src_tensor, src_tensor.place(), tmp_dst_tensor); - dst_tensor_array->push_back(*tmp_dst_tensor); - } - } else { - for (size_t id = 0; id < dst_tensor_array->size(); id++) { - auto& src_tensor = src_tensor_array[id]; - phi::DenseTensor* tmp_dst_tensor = &dst_tensor_array->at(id); - tmp_dst_tensor->set_meta(src_tensor.meta()); - framework::TensorCopy(src_tensor, src_tensor.place(), tmp_dst_tensor); - } + while (dst_tensor_array->size() < src_tensor_array.size()) { + dst_tensor_array->emplace_back(); + } + for (size_t id = 0; id < dst_tensor_array->size(); id++) { + auto& src_tensor = src_tensor_array[id]; + phi::DenseTensor* tmp_dst_tensor = &dst_tensor_array->at(id); + tmp_dst_tensor->set_meta(src_tensor.meta()); + framework::TensorCopy(src_tensor, src_tensor.place(), tmp_dst_tensor); } } else { PADDLE_THROW( diff --git a/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.h b/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.h index 26fc26a2dd3713..2dfe34b298bbd1 100644 --- a/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.h +++ b/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.h @@ -43,11 +43,11 @@ namespace paddle { namespace framework { -class CondInstruction; +class IfInstruction; class WhileInstruction; class ValueExecutionInfo { public: - friend class CondInstruction; + friend class IfInstruction; friend class WhileInstruction; explicit ValueExecutionInfo(Scope* scope) : scope_(scope) {} diff --git a/paddle/fluid/framework/new_executor/pir_interpreter.cc b/paddle/fluid/framework/new_executor/pir_interpreter.cc index 66de40585130b5..45674498b179fb 100644 --- a/paddle/fluid/framework/new_executor/pir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/pir_interpreter.cc @@ -46,10 +46,11 @@ #endif #include "paddle/fluid/framework/new_executor/instruction/builtin_combine_instruction.h" -#include "paddle/fluid/framework/new_executor/instruction/cond_instruction.h" 
#include "paddle/fluid/framework/new_executor/instruction/has_elements_instruction.h" +#include "paddle/fluid/framework/new_executor/instruction/if_instruction.h" #include "paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h" #include "paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h" +#include "paddle/fluid/framework/new_executor/instruction/select_input_instruction.h" #include "paddle/fluid/framework/new_executor/instruction/tuple_pop_instruction.h" #include "paddle/fluid/framework/new_executor/instruction/tuple_push_instruction.h" #include "paddle/fluid/framework/new_executor/instruction/while_instruction.h" @@ -671,15 +672,15 @@ void PirInterpreter::BuildInstruction() { } else if (op.dialect()->name() == "pd_op") { if (op.isa()) { auto skip_gc_vars = execution_config_.skip_gc_vars; - vec_instruction_base_.emplace_back(std::make_unique( + vec_instruction_base_.emplace_back(std::make_unique( op_idx++, place_, &op, value_exe_info_.get(), skip_gc_vars)); sub_blocks_.insert( {&op.dyn_cast().true_block(), - dynamic_cast(vec_instruction_base_.back().get()) + dynamic_cast(vec_instruction_base_.back().get()) ->TrueBranchInterpreter()}); sub_blocks_.insert( {&op.dyn_cast().false_block(), - dynamic_cast(vec_instruction_base_.back().get()) + dynamic_cast(vec_instruction_base_.back().get()) ->FalseBranchInterpreter()}); } else if (op.isa()) { auto skip_gc_vars = execution_config_.skip_gc_vars; @@ -691,6 +692,8 @@ void PirInterpreter::BuildInstruction() { ->BodyInterpreter()}); } else if (op.isa()) { CREATE_INSTR(HasElementsInstruction); + } else if (op.isa()) { + CREATE_INSTR(SelectInputInstruction); } else { PADDLE_THROW(platform::errors::Unimplemented( "Now only support pd_kernel and cinn dialect.")); diff --git a/paddle/fluid/ir_adaptor/translator/op_translator.cc b/paddle/fluid/ir_adaptor/translator/op_translator.cc index d29781af492de1..4736a3af20a3d4 100644 --- a/paddle/fluid/ir_adaptor/translator/op_translator.cc +++ b/paddle/fluid/ir_adaptor/translator/op_translator.cc @@ -1863,6 +1863,110 @@ struct FillConstantTranscriber : public OpTranscriber { } }; +static std::vector ParseCompatibleShapes( + const std::vector& dim1, const std::vector& dim2) { + IR_ENFORCE(dim1.size() == dim2.size(), + "Does not support rank inconsistency: dim1=%d, dim2=%d", + dim1.size(), + dim2.size()); + std::vector result; + for (size_t i = 0; i < dim1.size(); ++i) { + if (dim1[i] != dim2[i]) { + result.push_back(-1); + } else { + result.push_back(dim1[i]); + } + } + return result; +} + +struct SelectInputOpTranscriber : public OpTranscriber { + pir::Operation* operator()(pir::IrContext* ctx, + TranslationContext* param_map, + const OpDesc& op_desc, + pir::Block* block) override { + VLOG(10) << "[op select_input] start transcribing"; + auto op_info = this->LoopkUpOpInfo(ctx, op_desc); + + std::vector op_inputs = {}; + auto Mask_name = op_desc.Input("Mask")[0]; + auto& Input_name = op_desc.Input("X"); + IR_ENFORCE(param_map->count(Mask_name) > 0, + "Expected op[%s]'s input %s has been parsed", + op_desc.Type(), + Mask_name); + op_inputs.push_back(param_map->at(Mask_name).value); + for (auto in_name : Input_name) { + IR_ENFORCE(param_map->count(in_name) > 0, + "Expected op[%s]'s input %s has been parsed", + op_desc.Type(), + in_name); + op_inputs.push_back(param_map->at(in_name).value); + } + + pir::AttributeMap attribute_map; + + OpOutputMapping arg_to_idx; + OpOutputTypeList op_output_types; + auto Out_name = op_desc.Output("Out")[0]; + VarDesc* var = 
op_desc.Block()->FindVarRecursive(Out_name);
+    arg_to_idx[var->Name()] = {0, 0};
+
+    // NOTE(zhangbo): Only two cases are supported: both inputs have exactly
+    // the same type, or both are DenseTensorType and differ only in dims.
+    auto input1 = op_inputs[1].type();
+    auto input2 = op_inputs[2].type();
+    if (input1 == input2) {
+      op_output_types.push_back(op_inputs[1].type());
+    } else if (input1.isa() &&
+               input2.isa()) {
+      auto tensor1 = input1.dyn_cast();
+      auto tensor2 = input2.dyn_cast();
+      if (tensor1.dtype() != tensor2.dtype() ||
+          tensor1.data_layout() != tensor2.data_layout() ||
+          tensor1.lod() != tensor2.lod() ||
+          tensor1.offset() != tensor2.offset()) {
+        IR_THROW(
+            "select_input only supports inputs of the same type, or "
+            "DenseTensorType inputs that differ only in dims, but got "
+            "dtype:[%s, %s], layout:[%s, %s], "
+            "lod:[%s, %s], offset:[%s, %s].",
+            tensor1.dtype(),
+            tensor2.dtype(),
+            tensor1.data_layout(),
+            tensor2.data_layout(),
+            tensor1.lod(),
+            tensor2.lod(),
+            tensor1.offset(),
+            tensor2.offset());
+      }
+      auto dim1 = input1.dyn_cast().dims();
+      auto dim2 = input2.dyn_cast().dims();
+      std::vector compat_shape = ParseCompatibleShapes(
+          common::vectorize(dim1), common::vectorize(dim2));
+      op_output_types.push_back(
+          paddle::dialect::DenseTensorType::get(ctx,
+                                                tensor1.dtype(),
+                                                common::make_ddim(compat_shape),
+                                                tensor1.data_layout(),
+                                                tensor1.lod(),
+                                                tensor1.offset()));
+    } else {
+      IR_THROW(
+          "select_input only supports inputs of the same type, or "
+          "DenseTensorType inputs that differ only in dims, but got %s != %s.",
+          input1,
+          input2);
+    }
+
+    pir::Operation* operation = pir::Operation::Create(
+        op_inputs, attribute_map, op_output_types, op_info);
+    block->push_back(operation);
+    RecordOpResultMapping(ctx, param_map, op_desc, operation, arg_to_idx);
+
+    VLOG(10) << "[op select_input] translation finished";
+    return operation;
+  }
+};
+
 pir::OpResult TranslateNumClassesForOneHot(
     pir::IrContext* ctx,
     TranslationContext* param_map,
@@ -2736,6 +2840,7 @@ OpTranslator::OpTranslator() {
   special_handlers["tril_triu"] = TrilAndTriuOpTranscriber();
   special_handlers["mul"] = MulOpTranscriber();
   special_handlers["mul_grad"] = MulGradOpTranscriber();
+  special_handlers["select_input"] = SelectInputOpTranscriber();
 
   // To adapt LodTensorArray
   special_handlers["lod_array_length"] = LodArrayLengthOpTranscriber();
diff --git a/paddle/fluid/ir_adaptor/translator/program_translator.cc b/paddle/fluid/ir_adaptor/translator/program_translator.cc
index 47b8ac58c8a351..ba296feca0344d 100644
--- a/paddle/fluid/ir_adaptor/translator/program_translator.cc
+++ b/paddle/fluid/ir_adaptor/translator/program_translator.cc
@@ -389,15 +389,11 @@ void ProgramTranslator::Translate() {
   }
 }
 
-void ProgramTranslator::TranslateBlock(
-    const BlockDesc& src_block,
-    uint64_t start_id,
-    uint64_t end_id,
-    TranslationContext* translation_ctx,
-    pir::Block* dst_block,
-    bool for_cond_block,
-    const std::vector& cond_sub_block_outputs,
-    const std::vector<::paddle::framework::OpDesc*>& cond_init_ops) {
+void ProgramTranslator::TranslateBlock(const BlockDesc& src_block,
+                                       uint64_t start_id,
+                                       uint64_t end_id,
+                                       TranslationContext* translation_ctx,
+                                       pir::Block* dst_block) {
   VLOG(8) << "=============>start to translate a block";
   PADDLE_ENFORCE(
       (src_block.OpSize() >= end_id) && (start_id <= end_id),
@@ -408,13 +404,8 @@
          end_id,
          src_block.OpSize()));
 
-  std::unordered_map translate_completed;
   std::map assign_output_2_input;
   for (uint64_t op_id = start_id; op_id < end_id; op_id++) {
-    if (translate_completed.count(op_id) && translate_completed.at(op_id)) {
-      continue;
-    }
-
    auto op = src_block.Op(static_cast(op_id));
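    // Dispatch: conditional_block and while are handled by dedicated
    // control-flow translators below; all other ops go through
    // TranslateGeneralOperation.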
VLOG(8) << "=============>start to translate a op: " << op->Type(); @@ -424,144 +415,137 @@ void ProgramTranslator::TranslateBlock( "Not support translated %s op", op->Type())); if (op->Type() == "conditional_block") { - std::vector cond_op_ids = GetCondOpIds(src_block, op_id); - ConditionBlockCombination cond_op_combination(src_block, cond_op_ids); - pir::Operation* if_op = TranslateCondIfOperation( - cond_op_combination, translation_ctx, dst_block); - for (auto cond_id : cond_op_ids) { - translate_completed[cond_id] = true; - } - VLOG(10) << "[op translated][conditional_block]" << if_op; + TranslateIfOperation(op, translation_ctx, dst_block); } else if (op->Type() == "while") { TranslateWhileOperation(op, translation_ctx, dst_block); } else { - if (for_cond_block && op->Type() == "assign" && - std::count(cond_sub_block_outputs.begin(), - cond_sub_block_outputs.end(), - op->Output("Out")[0])) { - assign_output_2_input[op->Output("Out")[0]] = op->Input("X")[0]; - translate_completed[op_id] = true; - } else { - TranslateGeneralOperation(op, translation_ctx, dst_block); - translate_completed[op_id] = true; - } - } - } - - // NOTE(zhangbo): If conditional_block operator has output, the cf.yeild - // operator needs to be inserted - if (for_cond_block) { - // insert init ops - for (::paddle::framework::OpDesc* init_op : cond_init_ops) { - TranslateGeneralOperation(init_op, translation_ctx, dst_block); + TranslateGeneralOperation(op, translation_ctx, dst_block); } - // insert yeild op - std::vector yeild_inputs; - for (auto output_name : cond_sub_block_outputs) { - if (assign_output_2_input.count(output_name) != 0) { - if (translation_ctx->count(assign_output_2_input[output_name]) == 0) { - CreateUndefinedVariable(assign_output_2_input[output_name], - src_block); - } - yeild_inputs.emplace_back( - (*translation_ctx)[assign_output_2_input[output_name]].value); - } else { - if (translation_ctx->count(output_name) == 0) { - CreateUndefinedVariable(output_name, src_block); - } - yeild_inputs.emplace_back((*translation_ctx)[output_name].value); - } - } - pir::AttributeMap attribute_map; - auto yeild_info = ctx_->GetRegisteredOpInfo(pir::YieldOp::name()); - pir::Operation* yeild_op = - pir::Operation::Create(yeild_inputs, attribute_map, {}, yeild_info); - dst_block->push_back(yeild_op); } } -pir::Operation* ProgramTranslator::TranslateCondIfOperation( - const ConditionBlockCombination& cond_ops, +pir::Operation* ProgramTranslator::InsertFullOrDataOpToBlock( + pir::Block* insert_block, pir::Type type) { + pir::Builder builder(ctx_, insert_block, insert_block->begin()); + if (type.isa()) { + auto tensor_type = type.dyn_cast(); + std::vector shape = common::vectorize(tensor_type.dims()); + paddle::dialect::FullOp full_op = builder.Build( + shape, + 0, + paddle::dialect::TransToPhiDataType(tensor_type.dtype()), + phi::CPUPlace()); + full_op.out().set_type(type); + return full_op.operation(); + } else if (type.isa()) { + auto array_type = type.dyn_cast(); + paddle::dialect::CreateArrayOp array_op = + builder.Build( + paddle::dialect::TransToPhiDataType(array_type.dtype())); + array_op.out().set_type(type); + return array_op.operation(); + } + return nullptr; +} + +// NOTE(zhangbo): All condition_block_op will be translated as an if_op with +// only a true branch. 
+void ProgramTranslator::TranslateIfOperation( + const OpDesc* op, TranslationContext* translation_ctx, pir::Block* dst_block) { + VLOG(8) << "=============>Start to translate if op:" << op; auto& type_translator = TypeTranslator::instance(); - auto op_info = ctx_->GetRegisteredOpInfo(paddle::dialect::IfOp::name()); - std::vector op_inputs = { - (*translation_ctx)[cond_ops.CondVarName()].value}; - auto input_names = cond_ops.GetInputNamesForIfOp(); - for (auto input_name : input_names) { + auto cond_op_cond = op->Input("Cond")[0]; + auto& cond_op_inputs = op->Input("Input"); + for (auto input_name : cond_op_inputs) { VLOG(6) << "[general op][conditional_block][inputs: " << input_name << "]"; GetValueOrCreateInTop(input_name, translation_ctx); } + auto& cond_op_outputs = op->Output("Out"); + std::vector<::paddle::framework::VarDesc*> cond_op_output_vars; + for (auto out_name : cond_op_outputs) { + cond_op_output_vars.emplace_back(op->Block()->FindVarRecursive(out_name)); + } - // NOTE(zhangbo): Now paddle::dialect::IfOp has 0 attribute + std::vector if_op_inputs = { + (*translation_ctx)[cond_op_cond].value}; pir::AttributeMap attribute_map; - - std::vector op_output_types; - std::vector<::paddle::framework::VarDesc*> output_vardescs = - std::get<0>(cond_ops.CondOutputVars()); - for (auto var_desc : output_vardescs) { + std::vector if_op_output_types; + for (auto var_desc : cond_op_output_vars) { IR_ENFORCE(var_desc != nullptr, "[control flow] Output should not be null"); pir::Type translated_var_type = type_translator[var_desc->GetType()](ctx_, *var_desc); - op_output_types.emplace_back(translated_var_type); + if_op_output_types.emplace_back(translated_var_type); } - VLOG(4) << "[general op][conditional_block] IfOp preparation end."; - + auto if_op_info = ctx_->GetRegisteredOpInfo(paddle::dialect::IfOp::name()); pir::Operation* operation = pir::Operation::Create( - op_inputs, attribute_map, op_output_types, op_info, 2); + if_op_inputs, attribute_map, if_op_output_types, if_op_info, 2); dst_block->push_back(operation); VLOG(4) << "[general op][conditional_block] IfOp creation end."; - if (cond_ops.TrueBlockId() != -1) { - const BlockDesc& true_sub_block = - legacy_program_->Block(cond_ops.TrueBlockId()); + if (op->GetBlockAttrId("sub_block") != -1) { + // Translate true branch by sub_block. 
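+    // (the false branch has no legacy counterpart and is synthesized below
+    // with placeholder yields).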
+    auto& sub_block = legacy_program_->Block(op->GetBlockAttrId("sub_block"));
    pir::Region& true_region = operation->region(0);
    if (true_region.empty()) true_region.emplace_back();
    auto* true_block_context = translation_ctx->CreateInnerContext();
-    TranslateBlock(true_sub_block,
+    TranslateBlock(sub_block,
                   0,
-                   true_sub_block.OpSize(),
+                   sub_block.OpSize(),
                   true_block_context,
-                   &true_region.front(),
-                   true,
-                   std::get<1>(cond_ops.CondOutputVars()),
-                   cond_ops.TrueBlockInitOps());
-  }
-  VLOG(4) << "[general op][conditional_block] IfOp true block translate end.";
-
-  if (cond_ops.FalseBlockId() != -1) {
-    const BlockDesc& false_sub_block =
-        legacy_program_->Block(cond_ops.FalseBlockId());
+                   &true_region.front());
+    // insert yield op into the true block
+    auto yield_info = ctx_->GetRegisteredOpInfo(pir::YieldOp::name());
+    std::vector true_yield_inputs;
+    for (auto& out_name : cond_op_outputs) {
+      true_yield_inputs.push_back(true_block_context->at(out_name).value);
+    }
+    true_region.front().push_back(
+        pir::Operation::Create(true_yield_inputs, {}, {}, yield_info));
+
+    // NOTE(zhangbo): The if_op of PIR requires that both the true and false
+    // branches exist and that their outputs match in number and dtype; only
+    // the shapes may differ. To be compatible with the old IR design, only
+    // the true branch is translated from the legacy program, so the false
+    // branch may need to yield some fake (placeholder) variables.
    pir::Region& false_region = operation->region(1);
    if (false_region.empty()) false_region.emplace_back();
    auto* false_block_context = translation_ctx->CreateInnerContext();
-    TranslateBlock(false_sub_block,
-                   0,
-                   false_sub_block.OpSize(),
-                   false_block_context,
-                   &false_region.front(),
-                   true,
-                   std::get<2>(cond_ops.CondOutputVars()),
-                   cond_ops.FalseBlockInitOps());
+    std::vector false_yield_inputs;
+    for (size_t id = 0; id < cond_op_outputs.size(); id++) {
+      if (false_block_context->count(cond_op_outputs[id]) == 0) {
+        auto true_type = true_yield_inputs[id].type();
+        pir::Operation* init_op =
+            InsertFullOrDataOpToBlock(&false_region.front(), true_type);
+        PADDLE_ENFORCE_NOT_NULL(
+            init_op,
+            phi::errors::PreconditionNotMet(
+                "Failed to insert an init op into the false block: only "
+                "DenseTensor and DenseTensorArray outputs are supported."));
+        false_block_context->PushValue(
+            cond_op_outputs[id], VariableDefiningInfo(init_op->result(0)));
+      }
+      false_yield_inputs.push_back(
+          false_block_context->at(cond_op_outputs[id]).value);
+    }
+    false_region.front().push_back(
+        pir::Operation::Create(false_yield_inputs, {}, {}, yield_info));
  }
-  VLOG(4) << "[general op][conditional_block] IfOp false block translate end.";
+  VLOG(4) << "[general op][conditional_block] IfOp true/false block translate end.";
 
-  for (size_t i = 0; i < output_vardescs.size(); i++) {
-    translation_ctx->PushValue(output_vardescs[i]->Name(),
+  for (size_t i = 0; i < cond_op_output_vars.size(); i++) {
+    translation_ctx->PushValue(cond_op_output_vars[i]->Name(),
                               VariableDefiningInfo(operation->result(i)));
    VLOG(4) << "[general op][conditional_block] var "
-            << output_vardescs[i]->Name() << " was mapped to If's " << i
+            << cond_op_output_vars[i]->Name() << " was mapped to If's " << i
            << "-th output.";
  }
 
  operation->Verify();
  VLOG(4) << "[general op][conditional_block] IfOp translate end.";
-  return operation;
}
 
void ProgramTranslator::TranslateWhileOperation(
@@ -813,11 +797,13 @@ void ProgramTranslator::SetStopGradientAttributeForAllValue(
    }
  }
}
+
const VariableDefiningInfo& ProgramTranslator::GetValueOrCreateInTop(
    const std::string& var_name,
TranslationContext* translation_ctx) { if (translation_ctx->Has(var_name)) return translation_ctx->at(var_name); return CreateUndefinedVariable(var_name, legacy_program_->Block(0)); } + const VariableDefiningInfo& ProgramTranslator::CreateUndefinedVariable( const std::string& var_name, const BlockDesc& block) { VLOG(10) << "[undefined variable]" << var_name; @@ -840,6 +826,7 @@ const VariableDefiningInfo& ProgramTranslator::CreateUndefinedVariable( param_map_.PushValue(var_name, val); return param_map_.at(var_name); } + void ProgramTranslator::SetIsPersisableAttributeForAllValue( const BlockDesc& block) { // Currently we set is persisable for operation that generated a value diff --git a/paddle/fluid/ir_adaptor/translator/program_translator.h b/paddle/fluid/ir_adaptor/translator/program_translator.h index 0dda3dc9b89219..052a8fa13cea41 100644 --- a/paddle/fluid/ir_adaptor/translator/program_translator.h +++ b/paddle/fluid/ir_adaptor/translator/program_translator.h @@ -145,15 +145,11 @@ class ProgramTranslator { static const std::unordered_set unsupported_ops; - void TranslateBlock( - const BlockDesc& src_block, - uint64_t start_id, - uint64_t end_id, - TranslationContext* translation_ctx, - pir::Block* dst_block, - bool for_cond_block = false, - const std::vector& cond_sub_block_outputs = {}, - const std::vector<::paddle::framework::OpDesc*>& cond_init_ops = {}); + void TranslateBlock(const BlockDesc& src_block, + uint64_t start_id, + uint64_t end_id, + TranslationContext* translation_ctx, + pir::Block* dst_block); void TranslateGeneralOperation(const OpDesc* src_op, TranslationContext* translation_ctx, @@ -169,11 +165,13 @@ class ProgramTranslator { const VariableDefiningInfo& CreateUndefinedVariable( const std::string& var_name, const BlockDesc& block); - /// Translate methods for control flow ops. 
- pir::Operation* TranslateCondIfOperation( - const ConditionBlockCombination& cond_ops, - TranslationContext* translation_ctx, - pir::Block* dst_block); + pir::Operation* InsertFullOrDataOpToBlock(pir::Block* insert_block, + pir::Type type); + + void TranslateIfOperation(const OpDesc* op, + TranslationContext* translation_ctx, + pir::Block* dst_block); + void TranslateWhileOperation(const OpDesc* op, TranslationContext* translation_ctx, pir::Block* dst_block); diff --git a/paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc b/paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc index 204a4c176d3ffc..dbb7c7c248dd48 100644 --- a/paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc +++ b/paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc @@ -164,45 +164,43 @@ void IfOp::VerifySig() { } void IfOp::VerifyRegion() { - // VLOG(4) << "Start Verifying sub regions for: IfOp."; - // PADDLE_ENFORCE_EQ( - // (*this)->region(0).size(), - // 1u, - // phi::errors::PreconditionNotMet("The size %d of true_region must - // be 1.", - // (*this)->region(0).size())); - - // if ((*this)->num_results() != 0) { - // PADDLE_ENFORCE_EQ( - // (*this)->region(0).size(), - // (*this)->region(1).size(), - // phi::errors::PreconditionNotMet("The size %d of true_region must be " - // "equal to the size %d of - // false_region.", - // (*this)->region(0).size(), - // (*this)->region(1).size())); - - // auto &true_last_op = (*this)->region(0).front().back(); - // auto &false_last_op = (*this)->region(1).front().back(); - // PADDLE_ENFORCE_EQ(true, - // true_last_op.isa(), - // phi::errors::PreconditionNotMet( - // "The last of true block must be YieldOp")); - // PADDLE_ENFORCE_EQ(true_last_op.num_operands(), - // (*this)->num_results(), - // phi::errors::PreconditionNotMet( - // "The size of last of true block op's input must be - // " "equal to IfOp's outputs num.")); - // PADDLE_ENFORCE_EQ(true, - // false_last_op.isa(), - // phi::errors::PreconditionNotMet( - // "The last of false block must be YieldOp")); - // PADDLE_ENFORCE_EQ(false_last_op.num_operands(), - // (*this)->num_results(), - // phi::errors::PreconditionNotMet( - // "The size of last of false block op's input must be - // " "equal to IfOp's outputs num.")); - // } + VLOG(4) << "Start Verifying sub regions for: IfOp."; + VLOG(4) << "Start Verifying true branch."; + PADDLE_ENFORCE_EQ( + (*this)->region(0).size(), + 1u, + phi::errors::PreconditionNotMet("The size %d of true_region must be 1.", + (*this)->region(0).size())); + if ((*this)->region(0).front().size() > 0) { + auto &true_last_op = (*this)->region(0).front().back(); + PADDLE_ENFORCE_EQ(true, + true_last_op.isa(), + phi::errors::PreconditionNotMet( + "The last of true block must be YieldOp")); + PADDLE_ENFORCE_EQ(true_last_op.num_operands(), + (*this)->num_results(), + phi::errors::PreconditionNotMet( + "The size of last of true block op's input must be " + "equal to IfOp's outputs num.")); + } + VLOG(4) << "Start Verifying false branch."; + PADDLE_ENFORCE_EQ( + (*this)->region(1).size(), + 1u, + phi::errors::PreconditionNotMet("The size %d of false_region must be 1.", + (*this)->region(0).size())); + if ((*this)->region(1).front().size() > 0) { + auto &false_last_op = (*this)->region(1).front().back(); + PADDLE_ENFORCE_EQ(true, + false_last_op.isa(), + phi::errors::PreconditionNotMet( + "The last of false block must be YieldOp")); + PADDLE_ENFORCE_EQ(false_last_op.num_operands(), + (*this)->num_results(), + phi::errors::PreconditionNotMet( + "The size of last of false block op's input 
must be " + "equal to IfOp's outputs num.")); + } } std::vector> IfOp::Vjp( diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_op.cc b/paddle/fluid/pir/dialect/operator/ir/manual_op.cc index cda564bedbb1df..2160e56442d465 100644 --- a/paddle/fluid/pir/dialect/operator/ir/manual_op.cc +++ b/paddle/fluid/pir/dialect/operator/ir/manual_op.cc @@ -11,8 +11,20 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. +#ifdef GET_OP_LIST +#undef GET_OP_LIST +paddle::dialect::AddNOp, paddle::dialect::AddN_Op, + paddle::dialect::AddNWithKernelOp, paddle::dialect::FusedGemmEpilogueOp, + paddle::dialect::FusedGemmEpilogueGradOp, paddle::dialect::SplitGradOp, + paddle::dialect::ExpandOp, paddle::dialect::CreateArrayOp, + paddle::dialect::ArrayLengthOp, paddle::dialect::ArrayReadOp, + paddle::dialect::ArrayWrite_Op, paddle::dialect::SliceArrayOp, + paddle::dialect::SliceArrayDenseOp, paddle::dialect::AssignArray_Op, + paddle::dialect::ArrayToTensorOp, paddle::dialect::SelectInputOp +#else #include "paddle/fluid/pir/dialect/operator/ir/manual_op.h" +#include "paddle/fluid/pir/dialect/kernel/ir/kernel_type.h" #include "paddle/fluid/pir/dialect/operator/ir/ir_meta_tensor.h" #include "paddle/fluid/pir/dialect/operator/ir/ir_selected_rows.h" #include "paddle/fluid/pir/dialect/operator/ir/ir_tensor.h" @@ -2181,6 +2193,83 @@ phi::DataType ExpandOp::GetKernelTypeForVar( return expected_kernel_dtype; } +void SelectInputOp::VerifySig() { + VLOG(4) << "Verifying inputs, outputs and attributes for: SelectInputOp."; + VLOG(4) << "Verifying inputs:"; + { + auto in_size = num_operands(); + IR_ENFORCE(in_size == 3u, "Size %d of inputs must be >= 3.", in_size); + auto input1 = (*this)->operand_source(1).type(); + auto input2 = (*this)->operand_source(2).type(); + if (input1.isa() && + input2.isa()) { + auto tensor1 = input1.dyn_cast(); + auto tensor2 = input1.dyn_cast(); + IR_ENFORCE( + tensor1.dtype() == tensor2.dtype(), + "The 1st input dtype %s should be equal to 2ed input dtype %s.", + tensor1.dtype(), + tensor2.dtype()); + IR_ENFORCE(tensor1.data_layout() == tensor2.data_layout(), + "The 1st input data_layout %s should be equal to 2ed input " + "data_layout %s.", + tensor1.data_layout(), + tensor2.data_layout()); + IR_ENFORCE(tensor1.lod() == tensor2.lod(), + "The 1st input lod %s should be equal to 2ed input lod %s.", + tensor1.lod(), + tensor2.lod()); + IR_ENFORCE( + tensor1.offset() == tensor2.offset(), + "The 1st input offset %s should be equal to 2ed input offset %s.", + tensor1.offset(), + tensor2.offset()); + } else if (input1.isa() && + input2.isa()) { + auto tensor1 = + input1.dyn_cast(); + auto tensor2 = + input1.dyn_cast(); + IR_ENFORCE( + tensor1.dtype() == tensor2.dtype(), + "The 1st input dtype %s should be equal to 2ed input dtype %s.", + tensor1.dtype(), + tensor2.dtype()); + IR_ENFORCE(tensor1.data_layout() == tensor2.data_layout(), + "The 1st input data_layout %s should be equal to 2ed input " + "data_layout %s.", + tensor1.data_layout(), + tensor2.data_layout()); + IR_ENFORCE(tensor1.lod() == tensor2.lod(), + "The 1st input lod %s should be equal to 2ed input lod %s.", + tensor1.lod(), + tensor2.lod()); + IR_ENFORCE( + tensor1.offset() == tensor2.offset(), + "The 1st input offset %s should be equal to 2ed input offset %s.", + tensor1.offset(), + tensor2.offset()); + IR_ENFORCE( + tensor1.place() == tensor2.place(), + "The 1st input place %s should be equal to 2ed 
input place %s.", + tensor1.place(), + tensor2.place()); + } else { + IR_ENFORCE(input1 == input2, + "The 1st input type %s should be equal to 2ed input type %s.", + input1, + input2); + } + } + VLOG(4) << "Verifying outputs:"; + { + auto out_size = num_results(); + IR_ENFORCE( + out_size == 1u, "Size %d of outputs must be equal to 1.", out_size); + } + VLOG(4) << "End Verifying for: AssignArray_Op."; +} + } // namespace dialect } // namespace paddle @@ -2199,3 +2288,5 @@ IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::SliceArrayDenseOp) IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::AssignArray_Op) IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::ArrayToTensorOp) IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::ExpandOp) +IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::SelectInputOp) +#endif diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_op.h b/paddle/fluid/pir/dialect/operator/ir/manual_op.h index 6bdeac5bc04c9a..460356039d84ab 100644 --- a/paddle/fluid/pir/dialect/operator/ir/manual_op.h +++ b/paddle/fluid/pir/dialect/operator/ir/manual_op.h @@ -401,6 +401,17 @@ class ExpandOp : public pir::Op> &stop_gradients); }; +class SelectInputOp : public pir::Op { + public: + using Op::Op; + static const char *name() { return "pd_op.select_input"; } + static constexpr const char **attributes_name = nullptr; + static constexpr uint32_t attributes_num = 0; + void VerifySig(); + pir::Value mask() { return operand_source(0); } + pir::OpResult out() { return result(0); } +}; + } // namespace dialect } // namespace paddle @@ -419,3 +430,4 @@ IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::SliceArrayDenseOp) IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::AssignArray_Op) IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::ArrayToTensorOp) IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::ExpandOp) +IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::SelectInputOp) diff --git a/paddle/fluid/pir/dialect/operator/ir/op_dialect.cc b/paddle/fluid/pir/dialect/operator/ir/op_dialect.cc index 4c44b91af35b72..002b8cb731ed7e 100644 --- a/paddle/fluid/pir/dialect/operator/ir/op_dialect.cc +++ b/paddle/fluid/pir/dialect/operator/ir/op_dialect.cc @@ -64,21 +64,10 @@ void OperatorDialect::initialize() { #include "paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc" // NOLINT >(); - RegisterOps(); + RegisterOps< +#define GET_OP_LIST +#include "paddle/fluid/pir/dialect/operator/ir/manual_op.cc" // NOLINT + >(); RegisterInterfaces(); } diff --git a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc index 40ca238d397b44..04c4d68933140d 100644 --- a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc +++ b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc @@ -105,6 +105,7 @@ const std::unordered_set SpecialLowerOps = { pir::TuplePushOp::name(), pir::TuplePopOp::name(), HasElementsOp::name(), + SelectInputOp::name(), "cinn_runtime.jit_kernel"}; static bool NeedFallBackCpu(const pir::Operation* op, @@ -997,6 +998,20 @@ pir::Value GetNewInput( return new_in; } +phi::Place ParsePhiPlace(pir::Type type) { + if (type.isa()) { + return type.dyn_cast().place(); + } else if (type.isa()) { + return type.dyn_cast().place(); + } else if (type.isa()) { + return type.dyn_cast().place(); + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "ParsePhiPlace only support AllocatedDenseTensorType or " + "AllocatedSelectedRowsType or AllocatedDenseTensorArrayType")); + } +} + void HandleForSpecialOp( const phi::Place& place, pir::Operation* op_item, @@ -1222,6 +1237,18 @@ void HandleForSpecialOp( } } + if 
(op_item->isa()) { + for (size_t i = 0; i < op_item->num_operands(); ++i) { + auto cur_in = op_item->operand_source(i); + auto new_in = GetNewInput( + cur_in, *map_value_pair, static_cast(i), op_item->name()); + vec_inputs.push_back(new_in); + } + for (size_t i = 0; i < op_item->num_results(); ++i) { + op_output_types.push_back(vec_inputs[1].type()); + } + } + if (op_item->name() == "cinn_runtime.jit_kernel") { if (op_item->num_operands() > 0) { for (size_t i = 0; i < op_item->num_operands(); ++i) { diff --git a/test/cpp/pir/core/program_translator_test.cc b/test/cpp/pir/core/program_translator_test.cc index 0c199f0481e710..010ed757d2ab8b 100644 --- a/test/cpp/pir/core/program_translator_test.cc +++ b/test/cpp/pir/core/program_translator_test.cc @@ -85,7 +85,7 @@ TEST(OperatorDialectTest, ConditionBlock) { ctx->GetOrRegisterDialect(); auto program = paddle::TranslateLegacyProgramToProgram(p); - EXPECT_EQ(program->block()->size(), 4u); + EXPECT_EQ(program->block()->size(), 9u); size_t id = 0; for (auto &op : *program->block()) { if (id == 0 || id == 1) { @@ -117,77 +117,36 @@ TEST(OperatorDialectTest, ConditionBlock) { EXPECT_EQ(op2.isa(), true); } if (true_true_id == 1) { - EXPECT_EQ(op2.isa(), true); - } - true_true_id++; - } - auto &false_false_block = - op1.dyn_cast().false_block(); - size_t false_false_id = 0; - for (auto &op2 : false_false_block) { - if (false_false_id == 0) { - EXPECT_EQ(op2.isa(), true); + EXPECT_EQ(op2.isa(), true); } - if (false_false_id == 1) { + if (true_true_id == 2) { EXPECT_EQ(op2.isa(), true); } - false_false_id++; + true_true_id++; } } if (true_id == 4) { - EXPECT_EQ(op1.isa(), true); + EXPECT_EQ(op1.isa(), true); } if (true_id == 5) { - EXPECT_EQ(op1.isa(), true); + EXPECT_EQ(op1.isa(), true); } - true_id++; - } - // false block - auto &false_block = op.dyn_cast().false_block(); - size_t false_id = 0; - for (auto &op1 : false_block) { - if (false_id == 0 || false_id == 1) { - EXPECT_EQ(op1.isa(), true); + if (true_id == 6) { + EXPECT_EQ(op1.isa(), true); } - if (false_id == 2) { - EXPECT_EQ(op1.isa(), true); + if (true_id == 7) { + EXPECT_EQ(op1.isa(), true); } - if (false_id == 3) { - EXPECT_EQ(op1.isa(), true); - // true block - auto &false_true_block = - op1.dyn_cast().true_block(); - size_t false_true_id = 0; - for (auto &op2 : false_true_block) { - if (false_true_id == 0) { - EXPECT_EQ(op2.isa(), true); - } - if (false_true_id == 1) { - EXPECT_EQ(op2.isa(), true); - } - false_true_id++; - } - // false block - auto &false_false_block = - op1.dyn_cast().true_block(); - size_t false_false_id = 0; - for (auto &op2 : false_false_block) { - if (false_false_id == 0) { - EXPECT_EQ(op2.isa(), true); - } - if (false_false_id == 1) { - EXPECT_EQ(op2.isa(), true); - } - false_false_id++; - } - } - if (false_id == 4) { + if (true_id == 8) { EXPECT_EQ(op1.isa(), true); } - if (false_id == 5) { + if (true_id == 9 || true_id == 10) { + EXPECT_EQ(op1.isa(), true); + } + if (true_id == 11) { EXPECT_EQ(op1.isa(), true); } - false_id++; + true_id++; } } id++; diff --git a/test/dygraph_to_static/test_ifelse.py b/test/dygraph_to_static/test_ifelse.py index bc28e1fb3368a9..5f50780597e814 100644 --- a/test/dygraph_to_static/test_ifelse.py +++ b/test/dygraph_to_static/test_ifelse.py @@ -22,6 +22,7 @@ disable_test_case, enable_to_static_guard, test_ast_only, + test_legacy_only, ) from ifelse_simple_func import ( NetWithControlFlowIf, @@ -108,11 +109,28 @@ def test_ast_to_func(self): self.assertTrue((self._run_dygraph() == self._run_static()).all()) -class 
TestDygraphIfElse3(TestDygraphIfElse): +class TestDygraphIfElse3(Dy2StTestBase): def setUp(self): self.x = np.random.random([10, 16]).astype('float32') self.dyfunc = dyfunc_with_if_else3 + def _run_static(self): + return self._run_dygraph(to_static=True) + + def _run_dygraph(self, to_static=False): + with base.dygraph.guard(place): + x_v = base.dygraph.to_variable(self.x) + if to_static: + ret = paddle.jit.to_static(self.dyfunc)(x_v) + else: + ret = self.dyfunc(x_v) + return ret.numpy() + + # Why add test_legacy_only? : PIR not support if true and false branch output with different rank + @test_legacy_only + def test_ast_to_func(self): + self.assertTrue((self._run_dygraph() == self._run_static()).all()) + class TestDygraphIfElse4(TestDygraphIfElse): def setUp(self): @@ -143,6 +161,8 @@ def _run_dygraph(self, to_static=False): ret = self.dyfunc(x_v) return ret.numpy() + # Why add test_legacy_only? : PIR not support if true and false branch output with different rank + @test_legacy_only def test_ast_to_func(self): self.assertTrue((self._run_dygraph() == self._run_static()).all()) @@ -153,11 +173,28 @@ def setUp(self): self.dyfunc = nested_if_else_2 -class TestDygraphNestedIfElse3(TestDygraphIfElse): +class TestDygraphNestedIfElse3(Dy2StTestBase): def setUp(self): self.x = np.random.random([10, 16]).astype('float32') self.dyfunc = nested_if_else_3 + def _run_static(self): + return self._run_dygraph(to_static=True) + + def _run_dygraph(self, to_static=False): + with base.dygraph.guard(place): + x_v = paddle.to_tensor(self.x) + if to_static: + ret = paddle.jit.to_static(self.dyfunc)(x_v) + else: + ret = self.dyfunc(x_v) + return ret.numpy() + + # Why add test_legacy_only? : PIR not support if true and false branch output with different rank + @test_legacy_only + def test_ast_to_func(self): + self.assertTrue((self._run_dygraph() == self._run_static()).all()) + def dyfunc_ifExp_with_while(x): y = [x] @@ -185,10 +222,10 @@ def body(i, ten, y): return y[0] -class TestDygraphIfElse6(TestDygraphIfElse): - def setUp(self): - self.x = np.random.random([10, 16]).astype('float32') - self.dyfunc = dyfunc_ifExp_with_while +# class TestDygraphIfElse6(TestDygraphIfElse): +# def setUp(self): +# self.x = np.random.random([10, 16]).astype('float32') +# self.dyfunc = dyfunc_ifExp_with_while def dyfunc_ifExp(x): @@ -268,6 +305,8 @@ def _run_dygraph(self, to_static=False): ret = self.dyfunc(x_v) return ret.numpy() + # Why add test_legacy_only? : PIR not support if true and false branch output with different dtype + @test_legacy_only def test_ast_to_func(self): self.assertTrue((self._run_dygraph() == self._run_static()).all()) @@ -296,6 +335,8 @@ def _run(self, to_static=False): ret = net(x_v) return ret.numpy() + # Why add test_legacy_only? : PIR not support if true and false branch output with different rank + @test_legacy_only def test_ast_to_func(self): self.assertTrue((self._run_dygraph() == self._run_static()).all()) @@ -460,6 +501,8 @@ def get_dy2stat_out(self): out = static_func(self.x) return out + # Why add test_legacy_only? : PIR not support if true and false branch output with different rank + @test_legacy_only @test_ast_only def test_ast_to_func(self): self.setUp() @@ -480,6 +523,8 @@ def setUp(self): self.dyfunc = paddle.jit.to_static(dyfunc_ifelse_ret_int3) self.out = self.get_dy2stat_out() + # Why add test_legacy_only? 
: PIR not support if true and false branch output with different rank + @test_legacy_only @test_ast_only def test_ast_to_func(self): self.setUp() diff --git a/test/dygraph_to_static/test_program_translator.py b/test/dygraph_to_static/test_program_translator.py index 2e373e5a57b6bd..812dd9d040e747 100644 --- a/test/dygraph_to_static/test_program_translator.py +++ b/test/dygraph_to_static/test_program_translator.py @@ -24,6 +24,7 @@ ToStaticMode, disable_test_case, test_ast_only, + test_legacy_only, ) from ifelse_simple_func import ( dyfunc_with_if_else_early_return1, @@ -304,6 +305,8 @@ def test_raise_error(self): class TestIfElseEarlyReturn(Dy2StTestBase): + # Why add test_legacy_only? : PIR not support if true and false branch output with different rank + @test_legacy_only def test_ifelse_early_return1(self): answer = np.zeros([2, 2]) + 1 static_func = paddle.jit.to_static(dyfunc_with_if_else_early_return1) diff --git a/test/dygraph_to_static/test_return.py b/test/dygraph_to_static/test_return.py index ceab96855c3d43..5d7e02ac8181e9 100644 --- a/test/dygraph_to_static/test_return.py +++ b/test/dygraph_to_static/test_return.py @@ -15,7 +15,11 @@ import unittest import numpy as np -from dygraph_to_static_utils import Dy2StTestBase, test_ast_only +from dygraph_to_static_utils import ( + Dy2StTestBase, + test_ast_only, + test_legacy_only, +) from ifelse_simple_func import dyfunc_with_if_else import paddle @@ -316,10 +320,54 @@ def init_dygraph_func(self): self.dygraph_func = test_inside_func_base -class TestReturnIf(TestReturnBase): +class TestReturnIf(Dy2StTestBase): + def setUp(self): + self.input = np.ones(1).astype('int32') + self.place = ( + base.CUDAPlace(0) + if base.is_compiled_with_cuda() + else base.CPUPlace() + ) + self.init_dygraph_func() + def init_dygraph_func(self): self.dygraph_func = test_return_if + def _run(self, to_static=False): + paddle.jit.enable_to_static(to_static) + with base.dygraph.guard(): + res = self.dygraph_func(self.input) + if isinstance(res, (tuple, list)): + return tuple(r.numpy() for r in res) + elif isinstance(res, core.eager.Tensor): + return res.numpy() + return res + + def _test_value_impl(self): + dygraph_res = self._run(to_static=False) + static_res = self._run(to_static=True) + if isinstance(dygraph_res, tuple): + self.assertTrue(isinstance(static_res, tuple)) + self.assertEqual(len(dygraph_res), len(static_res)) + for i in range(len(dygraph_res)): + np.testing.assert_allclose( + dygraph_res[i], static_res[i], rtol=1e-05 + ) + elif isinstance(dygraph_res, np.ndarray): + np.testing.assert_allclose(dygraph_res, static_res, rtol=1e-05) + else: + self.assertEqual(dygraph_res, static_res) + + # Why add test_legacy_only? 
: PIR not support if true and false branch output with different dtype + @test_legacy_only + @test_ast_only + def test_transformed_static_result(self): + if hasattr(self, "error"): + with self.assertRaisesRegex(Dygraph2StaticException, self.error): + self._test_value_impl() + else: + self._test_value_impl() + class TestReturnOnlyIf(TestReturnBase): def init_dygraph_func(self): @@ -331,20 +379,108 @@ def init_dygraph_func(self): self.dygraph_func = test_return_in_for -class TestReturnInWhile(TestReturnBase): +class TestReturnInWhile(Dy2StTestBase): + def setUp(self): + self.input = np.ones(1).astype('int32') + self.place = ( + base.CUDAPlace(0) + if base.is_compiled_with_cuda() + else base.CPUPlace() + ) + self.init_dygraph_func() + def init_dygraph_func(self): self.dygraph_func = test_return_in_while + def _run(self, to_static=False): + paddle.jit.enable_to_static(to_static) + with base.dygraph.guard(): + res = self.dygraph_func(self.input) + if isinstance(res, (tuple, list)): + return tuple(r.numpy() for r in res) + elif isinstance(res, core.eager.Tensor): + return res.numpy() + return res + + def _test_value_impl(self): + dygraph_res = self._run(to_static=False) + static_res = self._run(to_static=True) + if isinstance(dygraph_res, tuple): + self.assertTrue(isinstance(static_res, tuple)) + self.assertEqual(len(dygraph_res), len(static_res)) + for i in range(len(dygraph_res)): + np.testing.assert_allclose( + dygraph_res[i], static_res[i], rtol=1e-05 + ) + elif isinstance(dygraph_res, np.ndarray): + np.testing.assert_allclose(dygraph_res, static_res, rtol=1e-05) + else: + self.assertEqual(dygraph_res, static_res) + + # Why add test_legacy_only? : PIR not support if true and false branch output with different dtype + @test_legacy_only + @test_ast_only + def test_transformed_static_result(self): + if hasattr(self, "error"): + with self.assertRaisesRegex(Dygraph2StaticException, self.error): + self._test_value_impl() + else: + self._test_value_impl() + class TestReturnIfDiff(TestReturnBase): def init_dygraph_func(self): self.dygraph_func = test_diff_return -class TestReturnIfElse(TestReturnBase): +class TestReturnIfElse(Dy2StTestBase): + def setUp(self): + self.input = np.ones(1).astype('int32') + self.place = ( + base.CUDAPlace(0) + if base.is_compiled_with_cuda() + else base.CPUPlace() + ) + self.init_dygraph_func() + def init_dygraph_func(self): self.dygraph_func = test_return_if_else + def _run(self, to_static=False): + paddle.jit.enable_to_static(to_static) + with base.dygraph.guard(): + res = self.dygraph_func(self.input) + if isinstance(res, (tuple, list)): + return tuple(r.numpy() for r in res) + elif isinstance(res, core.eager.Tensor): + return res.numpy() + return res + + def _test_value_impl(self): + dygraph_res = self._run(to_static=False) + static_res = self._run(to_static=True) + if isinstance(dygraph_res, tuple): + self.assertTrue(isinstance(static_res, tuple)) + self.assertEqual(len(dygraph_res), len(static_res)) + for i in range(len(dygraph_res)): + np.testing.assert_allclose( + dygraph_res[i], static_res[i], rtol=1e-05 + ) + elif isinstance(dygraph_res, np.ndarray): + np.testing.assert_allclose(dygraph_res, static_res, rtol=1e-05) + else: + self.assertEqual(dygraph_res, static_res) + + # Why add test_legacy_only? 
: PIR not support if true and false branch output with different dtype + @test_legacy_only + @test_ast_only + def test_transformed_static_result(self): + if hasattr(self, "error"): + with self.assertRaisesRegex(Dygraph2StaticException, self.error): + self._test_value_impl() + else: + self._test_value_impl() + class TestReturnInWhile2(TestReturnBase): def init_dygraph_func(self): diff --git a/test/dygraph_to_static/test_warning.py b/test/dygraph_to_static/test_warning.py index 9eac0f6a8902bb..e1b9a02b2851dd 100644 --- a/test/dygraph_to_static/test_warning.py +++ b/test/dygraph_to_static/test_warning.py @@ -15,7 +15,11 @@ import unittest import warnings -from dygraph_to_static_utils import Dy2StTestBase, test_ast_only +from dygraph_to_static_utils import ( + Dy2StTestBase, + test_ast_only, + test_legacy_only, +) import paddle from paddle.static.nn import cond @@ -39,6 +43,8 @@ def false_fn(): class TestReturnNoneInIfelse(Dy2StTestBase): + # Why add test_legacy_only? : PIR not support if true and false branch output with different dtype + @test_legacy_only @test_ast_only def test_dy2static_warning(self): paddle.disable_static() diff --git a/test/legacy_test/test_cond.py b/test/legacy_test/test_cond.py index 163c1b921f7450..a2a616db1b2c72 100644 --- a/test/legacy_test/test_cond.py +++ b/test/legacy_test/test_cond.py @@ -253,7 +253,9 @@ def true_func(): def false_func(): return paddle.tensor.fill_constant( shape=[3, 4], dtype='int32', value=3 - ), paddle.tensor.fill_constant(shape=[4, 5], dtype='bool', value=2) + ), paddle.tensor.fill_constant( + shape=[4, 5], dtype='bool', value=False + ) main_program = paddle.static.Program() startup_program = paddle.static.Program() From a7637166f4824a305ee697650beb6b79915c633f Mon Sep 17 00:00:00 2001 From: wentao yu Date: Mon, 11 Dec 2023 11:16:43 +0800 Subject: [PATCH 07/28] Fix comments for PR #59644 (#59750) * tinyfix for PR #59644 * tinyfix * tinyfix * update --- paddle/fluid/memory/allocation/allocator.h | 11 ++++++----- .../fleet/meta_parallel/pipeline_parallel.py | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h index 963dd03182f96c..dd86ba9855fbab 100644 --- a/paddle/fluid/memory/allocation/allocator.h +++ b/paddle/fluid/memory/allocation/allocator.h @@ -23,6 +23,7 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" #include "paddle/phi/core/allocator.h" +#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/flags.h" #ifdef PADDLE_WITH_NCCL @@ -143,22 +144,22 @@ using DecoratedAllocationPtr = template static T&& FillValue(T&& allocation) { -#if defined(PADDLE_WITH_NCCL) +#if defined(PADDLE_WITH_CUDA) if (allocation != nullptr) { if (FLAGS_sync_after_alloc || FLAGS_alloc_fill_value >= 0) { - cudaDeviceSynchronize(); + PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); if (FLAGS_alloc_fill_value >= 0) { VLOG(10) << "Set " << FLAGS_alloc_fill_value << " on " << allocation->ptr() << " " << allocation->place() << " " << allocation->size(); if (platform::is_gpu_place(allocation->place())) { - cudaMemset( - allocation->ptr(), FLAGS_alloc_fill_value, allocation->size()); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemset( + allocation->ptr(), FLAGS_alloc_fill_value, allocation->size())); } else { std::memset( allocation->ptr(), FLAGS_alloc_fill_value, allocation->size()); } - cudaDeviceSynchronize(); + PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); } } } diff --git 
a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py index fe98ae72775d56..a53f8734e82f24 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py +++ b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py @@ -921,7 +921,7 @@ def __init__(self, layers, hcg, strategy): self._virtual_pp_rank = 0 self._reset_counter() - self._assign_vpp_info(self.model_chunks) + self._check_sanity() def _check_sanity(self): assert ( From f7cb3e3b488ee406e08efe060d250b6f995763e9 Mon Sep 17 00:00:00 2001 From: Nyakku Shigure Date: Mon, 11 Dec 2023 11:47:29 +0800 Subject: [PATCH 08/28] [Dy2St] Disable `test_resnet` and `test_build_strategy` on CPU tests (#59742) * [Dy2St] Decrease `test_resnet` time * temp rename to resnetx * increase timeout of test_resnet * remove pir test case * dec timeout * inc timeout * disable on CPU * rename to merge * rename to resnetx * rename to resnet * fix import * fix v2 * fix test_build_strategy * rename to test_build_strategyx * rename back --- test/dygraph_to_static/CMakeLists.txt | 8 +- .../dygraph_to_static_utils.py | 10 +- test/dygraph_to_static/test_build_strategy.py | 19 ++-- test/dygraph_to_static/test_resnet.py | 102 +++++++++--------- test/dygraph_to_static/test_seq2seq.py | 7 +- 5 files changed, 71 insertions(+), 75 deletions(-) diff --git a/test/dygraph_to_static/CMakeLists.txt b/test/dygraph_to_static/CMakeLists.txt index 28618223007090..e2ce58b7cf58c2 100644 --- a/test/dygraph_to_static/CMakeLists.txt +++ b/test/dygraph_to_static/CMakeLists.txt @@ -25,6 +25,9 @@ if(NOT WITH_GPU) # We should remove this after fix the performance issue. list(REMOVE_ITEM TEST_OPS test_train_step_resnet18_adam) list(REMOVE_ITEM TEST_OPS test_train_step_resnet18_sgd) + # disable some model test on CPU to avoid timeout + list(REMOVE_ITEM TEST_OPS test_resnet) + list(REMOVE_ITEM TEST_OPS test_build_strategy) endif() foreach(TEST_OP ${TEST_OPS}) @@ -43,19 +46,14 @@ set_tests_properties(test_reinforcement_learning PROPERTIES TIMEOUT 120) set_tests_properties(test_transformer PROPERTIES TIMEOUT 200) set_tests_properties(test_bmn PROPERTIES TIMEOUT 300) set_tests_properties(test_bert PROPERTIES TIMEOUT 240) -#set_tests_properties(test_mnist PROPERTIES TIMEOUT 120) -set_tests_properties(test_build_strategy PROPERTIES TIMEOUT 120) if(NOT WIN32) set_tests_properties(test_tsm PROPERTIES TIMEOUT 900) - set_tests_properties(test_resnet PROPERTIES TIMEOUT 300) endif() if(APPLE) set_tests_properties(test_bmn PROPERTIES TIMEOUT 300) - set_tests_properties(test_build_strategy PROPERTIES TIMEOUT 300) set_tests_properties(test_mobile_net PROPERTIES TIMEOUT 300) - set_tests_properties(test_resnet PROPERTIES TIMEOUT 300) endif() if(WITH_GPU) diff --git a/test/dygraph_to_static/dygraph_to_static_utils.py b/test/dygraph_to_static/dygraph_to_static_utils.py index cca352f4815b70..f226c978c120ad 100644 --- a/test/dygraph_to_static/dygraph_to_static_utils.py +++ b/test/dygraph_to_static/dygraph_to_static_utils.py @@ -381,6 +381,7 @@ def test_legacy_and_pt_and_pir(fn): return fn +# Some decorators for save CI time def test_default_mode_only(fn): # Some unittests has high time complexity, we only test them with default mode fn = set_to_static_mode(ToStaticMode.SOT)(fn) @@ -388,16 +389,15 @@ def test_default_mode_only(fn): return fn -def test_sot_with_pir_only(fn): +def test_default_and_pir(fn): + # Some unittests has high time complexity, we only test them with default mode fn = 
set_to_static_mode(ToStaticMode.SOT)(fn) - fn = set_ir_mode(IrMode.PIR)(fn) + fn = set_ir_mode(IrMode.PT | IrMode.PIR)(fn) return fn -def test_default_and_pir(fn): - # Some unittests has high time complexity, we only test them with default mode +def test_sot_mgs0_only(fn): fn = set_to_static_mode(ToStaticMode.SOT)(fn) - fn = set_ir_mode(IrMode.PT | IrMode.PIR)(fn) return fn diff --git a/test/dygraph_to_static/test_build_strategy.py b/test/dygraph_to_static/test_build_strategy.py index 67c0219d65b885..01216eea60d171 100644 --- a/test/dygraph_to_static/test_build_strategy.py +++ b/test/dygraph_to_static/test_build_strategy.py @@ -18,9 +18,8 @@ from dygraph_to_static_utils import ( Dy2StTestBase, enable_to_static_guard, - test_ast_only, - test_default_and_pir, - test_pt_only, + test_default_mode_only, + test_legacy_and_pt_and_pir, ) from test_resnet import ResNetHelper @@ -36,7 +35,7 @@ def setUp(self): self.build_strategy.enable_addto = True self.resnet_helper = ResNetHelper() # NOTE: for enable_addto - paddle.base.set_flags({"FLAGS_max_inplace_grad_add": 8}) + paddle.set_flags({"FLAGS_max_inplace_grad_add": 8}) def train(self, to_static): with enable_to_static_guard(to_static): @@ -67,8 +66,7 @@ def verify_predict(self): err_msg=f'predictor_pre:\n {predictor_pre}\n, st_pre: \n{st_pre}.', ) - @test_ast_only - @test_pt_only + @test_default_mode_only def test_resnet(self): static_loss = self.train(to_static=True) dygraph_loss = self.train(to_static=False) @@ -80,19 +78,18 @@ def test_resnet(self): ) self.verify_predict() - @test_ast_only - @test_pt_only + @test_default_mode_only def test_in_static_mode_mkldnn(self): - paddle.base.set_flags({'FLAGS_use_mkldnn': True}) + paddle.set_flags({'FLAGS_use_mkldnn': True}) try: if paddle.base.core.is_compiled_with_mkldnn(): self.resnet_helper.train(True, self.build_strategy) finally: - paddle.base.set_flags({'FLAGS_use_mkldnn': False}) + paddle.set_flags({'FLAGS_use_mkldnn': False}) class TestError(Dy2StTestBase): - @test_default_and_pir + @test_legacy_and_pt_and_pir def test_type_error(self): def foo(x): out = x + 1 diff --git a/test/dygraph_to_static/test_resnet.py b/test/dygraph_to_static/test_resnet.py index 7f72b900133a9b..665620d1da390b 100644 --- a/test/dygraph_to_static/test_resnet.py +++ b/test/dygraph_to_static/test_resnet.py @@ -21,6 +21,8 @@ import numpy as np from dygraph_to_static_utils import ( Dy2StTestBase, + enable_to_static_guard, + static_guard, test_default_and_pir, ) from predictor_utils import PredictorTools @@ -247,32 +249,31 @@ def __len__(self): return len(self.img) -class TestResnet(Dy2StTestBase): - def setUp(self): +class ResNetHelper: + def __init__(self): self.temp_dir = tempfile.TemporaryDirectory() self.model_save_dir = os.path.join(self.temp_dir.name, "./inference") self.model_save_prefix = os.path.join( - self.temp_dir.name, "./inference/resnet_v2" + self.temp_dir.name, "./inference/resnet" ) self.model_filename = ( - "resnet_v2" + paddle.jit.translated_layer.INFER_MODEL_SUFFIX + "resnet" + paddle.jit.translated_layer.INFER_MODEL_SUFFIX ) self.params_filename = ( - "resnet_v2" + paddle.jit.translated_layer.INFER_PARAMS_SUFFIX + "resnet" + paddle.jit.translated_layer.INFER_PARAMS_SUFFIX ) self.dy_state_dict_save_path = os.path.join( - self.temp_dir.name, "./resnet_v2.dygraph" + self.temp_dir.name, "./resnet.dygraph" ) - def tearDown(self): + def __del__(self): self.temp_dir.cleanup() - def do_train(self, to_static): + def train(self, to_static, build_strategy=None): """ Tests model decorated by 
`dygraph_to_static_output` in static graph mode. For users, the model is defined in dygraph mode and trained in static graph mode. """ - paddle.disable_static(place) np.random.seed(SEED) paddle.seed(SEED) paddle.framework.random._manual_program_seed(SEED) @@ -284,7 +285,7 @@ def do_train(self, to_static): dataset, batch_size=batch_size, drop_last=True ) - resnet = paddle.jit.to_static(ResNet()) + resnet = paddle.jit.to_static(ResNet(), build_strategy=build_strategy) optimizer = optimizer_setting(parameter_list=resnet.parameters()) for epoch in range(epoch_num): @@ -350,59 +351,55 @@ def do_train(self, to_static): self.dy_state_dict_save_path + '.pdparams', ) break - paddle.enable_static() return total_loss.numpy() def predict_dygraph(self, data): - paddle.jit.enable_to_static(False) - paddle.disable_static(place) - resnet = paddle.jit.to_static(ResNet()) + with enable_to_static_guard(False): + resnet = paddle.jit.to_static(ResNet()) - model_dict = paddle.load(self.dy_state_dict_save_path + '.pdparams') - resnet.set_dict(model_dict) - resnet.eval() + model_dict = paddle.load(self.dy_state_dict_save_path + '.pdparams') + resnet.set_dict(model_dict) + resnet.eval() - pred_res = resnet( - paddle.to_tensor( - data=data, dtype=None, place=None, stop_gradient=True + pred_res = resnet( + paddle.to_tensor( + data=data, dtype=None, place=None, stop_gradient=True + ) ) - ) ret = pred_res.numpy() - paddle.enable_static() return ret def predict_static(self, data): - exe = paddle.static.Executor(place) - [ - inference_program, - feed_target_names, - fetch_targets, - ] = paddle.static.load_inference_model( - self.model_save_dir, - executor=exe, - model_filename=self.model_filename, - params_filename=self.params_filename, - ) + with static_guard(): + exe = paddle.static.Executor(place) + [ + inference_program, + feed_target_names, + fetch_targets, + ] = paddle.static.load_inference_model( + self.model_save_dir, + executor=exe, + model_filename=self.model_filename, + params_filename=self.params_filename, + ) - pred_res = exe.run( - inference_program, - feed={feed_target_names[0]: data}, - fetch_list=fetch_targets, - ) + pred_res = exe.run( + inference_program, + feed={feed_target_names[0]: data}, + fetch_list=fetch_targets, + ) - return pred_res[0] + return pred_res[0] def predict_dygraph_jit(self, data): - paddle.disable_static(place) resnet = paddle.jit.load(self.model_save_prefix) resnet.eval() pred_res = resnet(data) ret = pred_res.numpy() - paddle.enable_static() return ret def predict_analysis_inference(self, data): @@ -415,16 +412,21 @@ def predict_analysis_inference(self, data): (out,) = output() return out + +class TestResnet(Dy2StTestBase): + def setUp(self): + self.resnet_helper = ResNetHelper() + def train(self, to_static): - paddle.jit.enable_to_static(to_static) - return self.do_train(to_static) + with enable_to_static_guard(to_static): + return self.resnet_helper.train(to_static) def verify_predict(self): image = np.random.random([1, 3, 224, 224]).astype('float32') - dy_pre = self.predict_dygraph(image) - st_pre = self.predict_static(image) - dy_jit_pre = self.predict_dygraph_jit(image) - predictor_pre = self.predict_analysis_inference(image) + dy_pre = self.resnet_helper.predict_dygraph(image) + st_pre = self.resnet_helper.predict_static(image) + dy_jit_pre = self.resnet_helper.predict_dygraph_jit(image) + predictor_pre = self.resnet_helper.predict_analysis_inference(image) np.testing.assert_allclose( dy_pre, st_pre, @@ -455,7 +457,7 @@ def test_resnet(self): err_msg=f'static_loss: 
{static_loss} \n dygraph_loss: {dygraph_loss}', ) # TODO(@xiongkun): open after save / load supported in pir. - if not paddle.base.framework.use_pir_api(): + if not paddle.framework.use_pir_api(): self.verify_predict() @test_default_and_pir @@ -474,12 +476,12 @@ def test_resnet_composite(self): @test_default_and_pir def test_in_static_mode_mkldnn(self): - paddle.base.set_flags({'FLAGS_use_mkldnn': True}) + paddle.set_flags({'FLAGS_use_mkldnn': True}) try: if paddle.base.core.is_compiled_with_mkldnn(): self.train(to_static=True) finally: - paddle.base.set_flags({'FLAGS_use_mkldnn': False}) + paddle.set_flags({'FLAGS_use_mkldnn': False}) if __name__ == '__main__': diff --git a/test/dygraph_to_static/test_seq2seq.py b/test/dygraph_to_static/test_seq2seq.py index c5dd312148de6e..a21e4614e71991 100644 --- a/test/dygraph_to_static/test_seq2seq.py +++ b/test/dygraph_to_static/test_seq2seq.py @@ -20,9 +20,8 @@ import numpy as np from dygraph_to_static_utils import ( Dy2StTestBase, - ToStaticMode, - set_to_static_mode, test_legacy_only, + test_sot_mgs0_only, ) from seq2seq_dygraph_model import AttentionModel, BaseModel from seq2seq_utils import Seq2SeqModelHyperParams, get_data_iter @@ -239,13 +238,13 @@ def _test_predict(self, attn_model=False): msg=f"\npred_dygraph = {pred_dygraph} \npred_static = {pred_static}", ) - @set_to_static_mode(ToStaticMode.SOT) + @test_sot_mgs0_only @test_legacy_only def test_base_model(self): self._test_train(attn_model=False) self._test_predict(attn_model=False) - @set_to_static_mode(ToStaticMode.SOT) + @test_sot_mgs0_only @test_legacy_only def test_attn_model(self): self._test_train(attn_model=True) From b12eb1e5b40ddc46d77a25a9f8fab02939bf4030 Mon Sep 17 00:00:00 2001 From: Nyakku Shigure Date: Mon, 11 Dec 2023 11:48:33 +0800 Subject: [PATCH 09/28] [Dy2St] Decrease `test_train_step` time (#59867) * [Dy2St] Decrease `test_train_step` time * rename back --- test/dygraph_to_static/test_train_step.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/dygraph_to_static/test_train_step.py b/test/dygraph_to_static/test_train_step.py index 5b180b54eda657..bdfd4e732d3504 100644 --- a/test/dygraph_to_static/test_train_step.py +++ b/test/dygraph_to_static/test_train_step.py @@ -17,7 +17,7 @@ from functools import partial import numpy as np -from dygraph_to_static_utils import Dy2StTestBase +from dygraph_to_static_utils import Dy2StTestBase, test_ast_only, test_pt_only import paddle @@ -77,6 +77,8 @@ def get_train_step_losses(self, func, steps): losses.append(loss) return losses + @test_ast_only + @test_pt_only def test_train_step(self): reset_seed() dygraph_losses = self.get_train_step_losses( From 47953acd02c6664fb2ff885766ca148cb0959a0a Mon Sep 17 00:00:00 2001 From: coco <69197635+cocoshe@users.noreply.github.com> Date: Mon, 11 Dec 2023 12:07:28 +0800 Subject: [PATCH 10/28] =?UTF-8?q?=E3=80=90Hackathon=205th=20No.35=E3=80=91?= =?UTF-8?q?=E4=B8=BA=20Paddle=20=E6=96=B0=E5=A2=9E=20histogramdd=20API=20(?= =?UTF-8?q?#57880)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add histogramdd api * add some tests for different bins type * fix * clean api func * fix atol * add some type check && add error test * fix codestyle * detail test func name * codestyle * modify range to ranges to avoid conflict, modify sample to x * modify static test to random test * coverage * Update python/paddle/tensor/linalg.py Co-authored-by: zachary sun <70642955+sunzhongkai588@users.noreply.github.com> * Update 
python/paddle/tensor/linalg.py

Co-authored-by: zachary sun <70642955+sunzhongkai588@users.noreply.github.com>

* Update python/paddle/tensor/linalg.py

Co-authored-by: zachary sun <70642955+sunzhongkai588@users.noreply.github.com>

* fix doc

* fix doc

---------

Co-authored-by: zachary sun <70642955+sunzhongkai588@users.noreply.github.com>
---
 python/paddle/__init__.py               |   2 +
 python/paddle/tensor/__init__.py        |   2 +
 python/paddle/tensor/linalg.py          | 225 +++++++++++
 test/legacy_test/test_histogramdd_op.py | 488 ++++++++++++++++++++++++
 4 files changed, 717 insertions(+)
 create mode 100644 test/legacy_test/test_histogramdd_op.py

diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py
index f6b3aca02c2a61..3ff2cdad1b23e4 100644
--- a/python/paddle/__init__.py
+++ b/python/paddle/__init__.py
@@ -156,6 +156,7 @@
     cholesky,
     bmm,
     histogram,
+    histogramdd,
     bincount,
     mv,
     eigvalsh,
@@ -695,6 +696,7 @@
     'rot90',
     'bincount',
     'histogram',
+    'histogramdd',
     'multiplex',
     'CUDAPlace',
     'empty',
diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py
index 1682378f59f903..4c453492f193d8 100644
--- a/python/paddle/tensor/__init__.py
+++ b/python/paddle/tensor/__init__.py
@@ -79,6 +79,7 @@
     eigvals,
     eigvalsh,
     histogram,
+    histogramdd,
     householder_product,
     lstsq,
     lu,
@@ -435,6 +436,7 @@
     'cholesky',
     'bmm',
     'histogram',
+    'histogramdd',
     'bincount',
     'mv',
     'matrix_power',
diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py
index 212125825d9b13..8625c5ae1ecdef 100644
--- a/python/paddle/tensor/linalg.py
+++ b/python/paddle/tensor/linalg.py
@@ -29,6 +29,7 @@
 
 __all__ = []
 
+
 # Consistent with kDefaultDim from C++ Backend
 K_DEFAULT_DIM = 9
 
@@ -3869,3 +3870,227 @@ def _householder_product(x, tau):
         )
         out = out.reshape(org_x_shape)
     return out
+
+
+def histogramdd(
+    x, bins=10, ranges=None, density=False, weights=None, name=None
+):
+    r"""
+    Computes a multi-dimensional histogram of the values in a tensor.
+
+    Interprets the elements of an input tensor whose innermost dimension has size `N` as a collection of N-dimensional points. Maps each of the points into a set of N-dimensional bins and returns the number of points (or total weight) in each bin.
+
+    Input `x` must be a tensor with at least 2 dimensions. If input has shape `(M, N)`, each of its `M` rows defines a point in N-dimensional space. If input has three or more dimensions, all but the last dimension are flattened.
+
+    Each dimension is independently associated with its own strictly increasing sequence of bin edges. Bin edges may be specified explicitly by passing a sequence of 1D tensors. Alternatively, bin edges may be constructed automatically by passing a sequence of integers specifying the number of equal-width bins in each dimension.
+
+    Args:
+        x (Tensor): The input tensor.
+        bins (Tensor[], int[], or int): If Tensor[], defines the sequences of bin edges. If int[], defines the number of equal-width bins in each dimension. If int, defines the number of equal-width bins for all dimensions.
+        ranges (sequence of float, optional): Defines the leftmost and rightmost bin edges in each dimension. If None, the minimum and maximum of each dimension are used as the leftmost and rightmost edges.
+        density (bool, optional): If False (default), the result will contain the count (or total weight) in each bin. If True, each count (weight) is divided by the total count (total weight), then divided by the volume of its associated bin.
+        weights (Tensor, optional): By default, each value in the input has weight 1. If a weight tensor is passed, each N-dimensional coordinate in input contributes its associated weight towards its bin's result. The weight tensor should have the same shape as the input tensor excluding its innermost dimension N.
+        name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
+
+    Returns:
+        A tuple of (``hist``, ``bin_edges``): ``hist`` is the N-dimensional Tensor containing the values of the histogram, and ``bin_edges`` is a sequence of N 1D Tensors containing the bin edges.
+
+    Examples:
+        .. code-block:: python
+            :name: example1
+
+            >>> import paddle
+            >>> x = paddle.to_tensor([[0., 1.], [1., 0.], [2.,0.], [2., 2.]])
+            >>> bins = [3,3]
+            >>> weights = paddle.to_tensor([1., 2., 4., 8.])
+            >>> paddle.histogramdd(x, bins=bins, weights=weights)
+            (Tensor(shape=[3, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True,
+                   [[0., 1., 0.],
+                    [2., 0., 0.],
+                    [4., 0., 8.]]), [Tensor(shape=[4], dtype=float32, place=Place(gpu:0), stop_gradient=True,
+                   [0.        , 0.66666669, 1.33333337, 2.        ]), Tensor(shape=[4], dtype=float32, place=Place(gpu:0), stop_gradient=True,
+                   [0.        , 0.66666669, 1.33333337, 2.        ])])
+
+        .. code-block:: python
+            :name: example2
+
+            >>> import paddle
+            >>> y = paddle.to_tensor([[0., 0.], [1., 1.], [2., 2.]])
+            >>> bins = [2,2]
+            >>> ranges = [0., 1., 0., 1.]
+            >>> density = True
+            >>> paddle.histogramdd(y, bins=bins, ranges=ranges, density=density)
+            (Tensor(shape=[2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=True,
+                   [[2., 0.],
+                    [0., 2.]]), [Tensor(shape=[3], dtype=float32, place=Place(gpu:0), stop_gradient=True,
+                   [0.        , 0.50000000, 1.        ]), Tensor(shape=[3], dtype=float32, place=Place(gpu:0), stop_gradient=True,
+                   [0.        , 0.50000000, 1.        ])])
+
+
+    """
+
+    def __check_x(x):
+        assert (
+            len(x.shape) >= 2
+        ), "input x must be a tensor with at least 2 dimensions."
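+        # (Editor's note, not part of the original patch: the checks in this
+        # helper mirror numpy.histogramdd's input contract; the unit tests
+        # added later in this patch validate results directly against
+        # np.histogramdd through a small reference helper.)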
+ check_variable_and_dtype( + x, + 'x', + [ + 'float32', + 'float64', + ], + 'histogramdd', + ) + + def __check_bins(bins, x): # when Tensor[], check dtype + for bins_tensor in bins: + bins_tensor = paddle.to_tensor(bins_tensor) + check_variable_and_dtype( + bins_tensor, + 'bins', + [ + 'float32', + 'float64', + ], + 'histogramdd', + ) + assert ( + bins_tensor.dtype == x.dtype + ), "When bins is Tensor[], the dtype of bins must be the same as x.\n" + + def __check_weights(x, weights): + if weights is None: + return + x_shape, weights_shape = x.shape, weights.shape + assert len(x_shape) == len(weights_shape) + 1, ( + "if weight tensor is provided," + "it should have the same shape as the input tensor excluding its innermost dimension.\n" + ) + for i, _ in enumerate(weights_shape): + assert weights_shape[i] == x_shape[i], ( + "if weight tensor is provided," + "it should have the same shape as the input tensor excluding its innermost dimension.\n" + ) + check_variable_and_dtype( + weights, + 'weights', + [ + 'float32', + 'float64', + ], + 'histogramdd', + ) + assert ( + weights.dtype == x.dtype + ), "The dtype of weights must be the same as x.\n" + + def __check_ranges(D, ranges): + if ranges is None: + return + check_type(ranges, 'ranges', (list, tuple), 'histogramdd') + assert D * 2 == len( + ranges + ), "The length of ranges list must be %d\n" % (D * 2) + + check_type(density, 'density', bool, 'histogramdd') + + __check_x(x) + # weights + __check_weights(x, weights) + D = x.shape[-1] + reshaped_input = x.reshape([-1, D]) + N = reshaped_input.shape[0] + reshaped_weights = None + if weights is not None: + weights = weights.astype(x.dtype) + reshaped_weights = weights.reshape([N]) + assert reshaped_weights.shape[0] == N, ( + "The size of weight must be %d" % N + ) + # ranges + __check_ranges(D, ranges) + if ranges is None: + ranges = paddle.zeros([D, 2], dtype=x.dtype) + maxv = paddle.max(reshaped_input, axis=0).reshape([-1]) + minv = paddle.min(reshaped_input, axis=0).reshape([-1]) + + if paddle.in_dynamic_mode(): + ranges[:, 0] = minv + ranges[:, 1] = maxv + else: + ranges = paddle.static.setitem(ranges, (slice(None), 0), minv) + ranges = paddle.static.setitem(ranges, (slice(None), 1), maxv) + else: + ranges = paddle.to_tensor(ranges, dtype=x.dtype).reshape([D, 2]) + # bins to edges + edges = [] + hist_shape = [] + dedges = [] + if isinstance(bins, (int, list)): # int or int[] + if isinstance(bins, int): + bins = [bins] * D + assert len(bins) == D, ( + "The length of bins must be %d when bins is a list.\n" % D + ) + for idx, r in enumerate(ranges): + if not isinstance(bins[idx], int): + raise ValueError( + "The type of %d-th element in bins list must be int." 
% idx + ) + e = paddle.linspace(r[0], r[1], bins[idx] + 1, x.dtype) + edges.append(e) + dedges.append(e.diff()) + elif isinstance( + bins, tuple + ): # tuple with D tensors for each innermost dimension + __check_bins(bins, x) + for bin in bins: + bin = paddle.to_tensor(bin) + edges.append(bin) + dedges.append(bin.diff()) + else: + raise ValueError("Input bins must be Tensor[], int[], or int.") + hist_shape = [edge.shape[0] + 1 for edge in edges] + index_list = [] + # edges shape: [D, linspaced] + # index_list shape: [D, N] + for idx, edge in enumerate(edges): + edge = paddle.to_tensor(edge) + index_list.append( + paddle.searchsorted(edge, reshaped_input[:, idx], right=True) + ) + index_list = paddle.to_tensor(index_list) + for i in range(D): + on_edge = reshaped_input[:, i] == edges[i][-1] + if paddle.in_dynamic_mode(): + index_list[i][on_edge] -= 1 + else: + index_list = paddle.static.setitem( + index_list, (i, on_edge), index_list[i][on_edge] - 1 + ) + index_list = tuple(index_list) + lut = paddle.arange( + paddle.to_tensor(hist_shape).prod(), + ).reshape(hist_shape) + flattened_index = lut[index_list] + hist = paddle.bincount( + flattened_index, + reshaped_weights, + minlength=paddle.to_tensor(hist_shape).prod(), + ) + hist = hist.reshape(hist_shape) + hist = hist.astype(x.dtype) + + core = D * (slice(1, -1),) + hist = hist[core] + + if density: + s = hist.sum() + for i in range(D): + shape = D * [1] + shape[i] = hist_shape[i] - 2 + hist = hist / dedges[i].reshape(shape) + hist /= s + + return (hist, edges) diff --git a/test/legacy_test/test_histogramdd_op.py b/test/legacy_test/test_histogramdd_op.py new file mode 100644 index 00000000000000..482788f61e8ca1 --- /dev/null +++ b/test/legacy_test/test_histogramdd_op.py @@ -0,0 +1,488 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
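[Editor's note] The linalg.py implementation above reduces the N-D histogram to a
single 1-D ``bincount``: each dimension is bucketed independently with
``searchsorted``, and the per-dimension bucket indices are collapsed into one
flat index through the ``lut`` lookup table. A minimal NumPy sketch of that
reduction (simplified: it omits the fixup that folds points lying exactly on the
last edge back into the final bin, and skips density normalization; all names
are illustrative):

    import numpy as np

    def histdd_sketch(points, edges):
        # points: (N, D) samples; edges: list of D sorted 1-D edge arrays
        shape = [len(e) + 1 for e in edges]  # extra buckets catch out-of-range points
        idx = [np.searchsorted(e, points[:, d], side='right')
               for d, e in enumerate(edges)]
        flat = np.ravel_multi_index(idx, shape)  # plays the role of `lut` above
        hist = np.bincount(flat, minlength=int(np.prod(shape))).reshape(shape)
        return hist[(slice(1, -1),) * len(edges)]  # drop under-/overflow buckets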
+ +import unittest + +import numpy as np + +import paddle + + +def ref_histogramdd(x, bins, ranges, weights, density): + D = x.shape[-1] + x = x.reshape(-1, D) + if ranges is not None: + ranges = np.array(ranges, dtype=x.dtype).reshape(D, 2).tolist() + if weights is not None: + weights = weights.reshape(-1) + ref_hist, ref_edges = np.histogramdd(x, bins, ranges, density, weights) + return ref_hist, ref_edges + + +# inputs, bins, ranges, weights, density +class TestHistogramddAPI(unittest.TestCase): + def setUp(self): + self.ranges = None + self.weights = None + self.density = False + + self.init_input() + self.set_expect_output() + self.place = ( + paddle.CUDAPlace(0) + if paddle.is_compiled_with_cuda() + else paddle.CPUPlace() + ) + + def init_input(self): + # self.sample = np.array([[0.0, 1.0], [1.0, 0.0], [2.0, 0.0], [2.0, 2.0]]) + self.sample = np.random.randn( + 4, + 2, + ).astype(np.float64) + self.bins = [3, 3] + self.weights = np.array([1.0, 2.0, 4.0, 8.0], dtype=self.sample.dtype) + + def set_expect_output(self): + self.expect_hist, self.expect_edges = ref_histogramdd( + self.sample, self.bins, self.ranges, self.weights, self.density + ) + + def test_static_api(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.static.data( + 'x', self.sample.shape, dtype=self.sample.dtype + ) + if self.weights is not None: + weights = paddle.static.data( + 'weights', self.weights.shape, dtype=self.weights.dtype + ) + out_0, out_1 = paddle.histogramdd( + x, + bins=self.bins, + weights=weights, + ranges=self.ranges, + density=self.density, + ) + else: + out_0, out_1 = paddle.histogramdd( + x, bins=self.bins, ranges=self.ranges, density=self.density + ) + exe = paddle.static.Executor(self.place) + if self.weights is not None: + res = exe.run( + feed={'x': self.sample, 'weights': self.weights}, + fetch_list=[out_0, out_1], + ) + else: + res = exe.run( + feed={'x': self.sample}, fetch_list=[out_0, out_1] + ) + + hist_out, edges_out = res[0], res[1:] + np.testing.assert_allclose( + hist_out, + self.expect_hist, + ) + for idx, edge_out in enumerate(edges_out): + expect_edge = np.array(self.expect_edges[idx]) + np.testing.assert_allclose( + edge_out, + expect_edge, + ) + + def test_dygraph_api(self): + paddle.disable_static(self.place) + self.sample_dy = paddle.to_tensor(self.sample, dtype=self.sample.dtype) + self.weights_dy = None + if self.weights is not None: + self.weights_dy = paddle.to_tensor(self.weights) + if isinstance(self.bins, tuple): + self.bins = tuple([paddle.to_tensor(bin) for bin in self.bins]) + hist, edges = paddle.histogramdd( + self.sample_dy, + bins=self.bins, + weights=self.weights_dy, + ranges=self.ranges, + density=self.density, + ) + + np.testing.assert_allclose( + hist.numpy(), + self.expect_hist, + ) + for idx, edge in enumerate(edges): + edge = edge.numpy() + expect_edge = np.array(self.expect_edges[idx]) + np.testing.assert_allclose( + edge, + expect_edge, + ) + + paddle.enable_static() + + def test_error(self): + pass + + +class TestHistogramddAPICase1ForDensity(TestHistogramddAPI): + def init_input(self): + # self.sample = np.array([[0.0, 0.0], [1.0, 1.0], [2.0, 2.0]]) + self.sample = np.random.randn(4, 2).astype(np.float64) + self.bins = [2, 2] + self.ranges = [0.0, 1.0, 0.0, 1.0] + self.density = True + + def set_expect_output(self): + self.expect_hist, self.expect_edges = ref_histogramdd( + self.sample, self.bins, self.ranges, self.weights, self.density + ) + + +class 
TestHistogramddAPICase2ForMultiDimsAndDensity(TestHistogramddAPI): + def init_input(self): + # shape: [4,2,2] + self.sample = np.random.randn(4, 2, 2).astype(np.float64) + self.bins = [3, 4] + self.density = True + + def set_expect_output(self): + self.expect_hist, self.expect_edges = ref_histogramdd( + self.sample, self.bins, self.ranges, self.weights, self.density + ) + + +class TestHistogramddAPICase3ForMultiDimsNotDensity(TestHistogramddAPI): + def init_input(self): + # shape: [4,2,2] + self.sample = np.random.randn(4, 2, 2).astype(np.float64) + self.bins = [3, 4] + # self.density = True + + def set_expect_output(self): + self.expect_hist, self.expect_edges = ref_histogramdd( + self.sample, self.bins, self.ranges, self.weights, self.density + ) + + +class TestHistogramddAPICase4ForRangesAndDensity(TestHistogramddAPI): + def init_input(self): + # shape: [4,2,2] + self.sample = np.random.randn(4, 2, 2).astype(np.float64) + self.bins = [3, 4] + # [leftmost_1, rightmost_1, leftmost_2, rightmost_2,..., leftmost_D, rightmost_D] + self.ranges = [1.0, 10.0, 1.0, 100.0] + self.density = True + + def set_expect_output(self): + self.expect_hist, self.expect_edges = ref_histogramdd( + self.sample, self.bins, self.ranges, self.weights, self.density + ) + + +class TestHistogramddAPICase5ForRangesNotDensity(TestHistogramddAPI): + def init_input(self): + # shape: [4,2,2] + self.sample = np.random.randn(4, 2, 2).astype(np.float64) + self.bins = [3, 4] + # [leftmost_1, rightmost_1, leftmost_2, rightmost_2,..., leftmost_D, rightmost_D] + self.ranges = [1.0, 10.0, 1.0, 100.0] + # self.density = True + + def set_expect_output(self): + self.expect_hist, self.expect_edges = ref_histogramdd( + self.sample, self.bins, self.ranges, self.weights, self.density + ) + + +class TestHistogramddAPICase6NotRangesAndDensityAndWeights(TestHistogramddAPI): + def init_input(self): + # shape: [4,2,2] + self.sample = np.random.randn(4, 2, 2).astype(np.float64) + self.bins = [3, 4] + # [leftmost_1, rightmost_1, leftmost_2, rightmost_2,..., leftmost_D, rightmost_D] + # self.ranges = [1., 10., 1., 100.] 
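+        # (Editor's note, not part of the original patch: ranges stays None in
+        # this case, so the API derives each dimension's [min, max] from the
+        # sample itself; with density=True the count in bin (i, j) is then
+        # normalized as hist[i, j] / (total_weight * width_0[i] * width_1[j]).)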
+ self.density = True + self.weights = np.array( + [ + [1.0, 2.0], + [3.0, 4.0], + [5.0, 6.0], + [7.0, 8.0], + ], + dtype=self.sample.dtype, + ) + + def set_expect_output(self): + self.expect_hist, self.expect_edges = ref_histogramdd( + self.sample, self.bins, self.ranges, self.weights, self.density + ) + + +class TestHistogramddAPICase7ForRangesAndDensityAndWeights(TestHistogramddAPI): + def init_input(self): + # shape: [4,2,2] + self.sample = np.random.randn(4, 2, 2).astype(np.float64) + self.bins = [3, 4] + # [leftmost_1, rightmost_1, leftmost_2, rightmost_2,..., leftmost_D, rightmost_D] + self.ranges = [1.0, 10.0, 1.0, 100.0] + self.density = True + self.weights = np.array( + [ + [1.0, 2.0], + [3.0, 4.0], + [5.0, 6.0], + [7.0, 8.0], + ], + dtype=self.sample.dtype, + ) + + def set_expect_output(self): + self.expect_hist, self.expect_edges = ref_histogramdd( + self.sample, self.bins, self.ranges, self.weights, self.density + ) + + +class TestHistogramddAPICase8MoreInnermostDim(TestHistogramddAPI): + def init_input(self): + # shape: [4,2,4] + self.sample = np.random.randn(4, 2, 4).astype(np.float64) + self.bins = [1, 2, 3, 4] + # [leftmost_1, rightmost_1, leftmost_2, rightmost_2,..., leftmost_D, rightmost_D] + self.density = False + self.weights = np.array( + [ + [1.0, 2.0], + [3.0, 4.0], + [5.0, 6.0], + [7.0, 8.0], + ], + dtype=self.sample.dtype, + ) + + def set_expect_output(self): + self.expect_hist, self.expect_edges = ref_histogramdd( + self.sample, self.bins, self.ranges, self.weights, self.density + ) + + +class TestHistogramddAPICase8MoreInnermostDimAndDensity(TestHistogramddAPI): + def init_input(self): + # shape: [4,2,4] + self.sample = np.random.randn(4, 2, 4).astype(np.float64) + self.bins = [1, 2, 3, 4] + # [leftmost_1, rightmost_1, leftmost_2, rightmost_2,..., leftmost_D, rightmost_D] + self.density = True + self.weights = np.array( + [ + [1.0, 2.0], + [3.0, 4.0], + [5.0, 6.0], + [7.0, 8.0], + ], + dtype=self.sample.dtype, + ) + + def set_expect_output(self): + self.expect_hist, self.expect_edges = ref_histogramdd( + self.sample, self.bins, self.ranges, self.weights, self.density + ) + + +class TestHistogramddAPICase9ForIntBin(TestHistogramddAPI): + def init_input(self): + # shape: [4,2,2] + self.sample = np.random.randn(4, 2, 2).astype(np.float64) + self.weights = np.array( + [ + [1.0, 2.0], + [3.0, 4.0], + [5.0, 6.0], + [7.0, 8.0], + ], + dtype=self.sample.dtype, + ) + self.bins = 5 + self.density = True + self.ranges = [1.0, 10.0, 1.0, 100.0] + + def set_expect_output(self): + self.expect_hist, self.expect_edges = ref_histogramdd( + self.sample, self.bins, self.ranges, self.weights, self.density + ) + + +class TestHistogramddAPICase10ForTensorBin(TestHistogramddAPI): + def init_input(self): + # shape: [4,2,2] + self.sample = np.random.randn(4, 2, 2).astype(np.float64) + self.weights = np.array( + [ + [1.0, 2.0], + [3.0, 4.0], + [5.0, 6.0], + [7.0, 8.0], + ], + dtype=self.sample.dtype, + ) + self.bins = ( + np.array([1.0, 2.0, 10.0, 15.0, 20.0]), + np.array([0.0, 20.0, 100.0]), + ) + self.density = True + + def set_expect_output(self): + self.expect_hist, self.expect_edges = ref_histogramdd( + self.sample, self.bins, self.ranges, self.weights, self.density + ) + + +class TestHistogramddAPICase10ForFloat32(TestHistogramddAPI): + def init_input(self): + # self.sample = np.array([[0.0, 0.0], [1.0, 1.0], [2.0, 2.0]]) + self.sample = np.random.randn(4, 2).astype(np.float32) + self.bins = [2, 2] + self.ranges = [0.0, 1.0, 0.0, 1.0] + self.density = True + + def 
set_expect_output(self):
+        self.expect_hist, self.expect_edges = ref_histogramdd(
+            self.sample, self.bins, self.ranges, self.weights, self.density
+        )
+
+
+# histogramdd(sample, bins=10, ranges=None, density=False, weights=None, name=None):
+class TestHistogramddAPI_check_sample_type_error(TestHistogramddAPI):
+    def test_error(self):
+        sample = paddle.to_tensor([[False, True], [True, False]])
+        with self.assertRaises(TypeError):
+            paddle.histogramdd(sample)
+
+
+class TestHistogramddAPI_check_bins_element_error(TestHistogramddAPI):
+    def test_error(self):
+        sample = paddle.to_tensor(
+            [
+                [[1.0, 2.0], [3.0, 4.0]],
+                [[5.0, 6.0], [7.0, 8.0]],
+                [[9.0, 10.0], [11.0, 12.0]],
+                [[13.0, 14.0], [15.0, 16.0]],
+            ]
+        )
+        bins = [3.4, 4.5]
+        with self.assertRaises(ValueError):
+            paddle.histogramdd(sample, bins=bins)
+
+
+class TestHistogramddAPI_check_ranges_type_error(TestHistogramddAPI):
+    def test_error(self):
+        sample = paddle.to_tensor(
+            [
+                [[1.0, 2.0], [3.0, 4.0]],
+                [[5.0, 6.0], [7.0, 8.0]],
+                [[9.0, 10.0], [11.0, 12.0]],
+                [[13.0, 14.0], [15.0, 16.0]],
+            ]
+        )
+        ranges = 10
+        with self.assertRaises(TypeError):
+            paddle.histogramdd(sample, ranges=ranges)
+
+
+class TestHistogramddAPI_check_density_type_error(TestHistogramddAPI):
+    def test_error(self):
+        sample = paddle.to_tensor(
+            [
+                [[1.0, 2.0], [3.0, 4.0]],
+                [[5.0, 6.0], [7.0, 8.0]],
+                [[9.0, 10.0], [11.0, 12.0]],
+                [[13.0, 14.0], [15.0, 16.0]],
+            ]
+        )
+        density = 10
+        with self.assertRaises(TypeError):
+            paddle.histogramdd(sample, density=density)
+
+
+class TestHistogramddAPI_check_weights_type_error(TestHistogramddAPI):
+    def test_error(self):
+        sample = paddle.to_tensor(
+            [
+                [[1.0, 2.0], [3.0, 4.0]],
+                [[5.0, 6.0], [7.0, 8.0]],
+                [[9.0, 10.0], [11.0, 12.0]],
+                [[13.0, 14.0], [15.0, 16.0]],
+            ]
+        )
+        weights = 10
+        with self.assertRaises(AttributeError):
+            paddle.histogramdd(sample, weights=weights)
+
+
+class TestHistogramddAPI_sample_weights_shape_mismatch_error(
+    TestHistogramddAPI
+):
+    def test_error(self):
+        sample = paddle.to_tensor(
+            [  # shape: [4, 2, 2]
+                [[1.0, 2.0], [3.0, 4.0]],
+                [[5.0, 6.0], [7.0, 8.0]],
+                [[9.0, 10.0], [11.0, 12.0]],
+                [[13.0, 14.0], [15.0, 16.0]],
+            ]
+        )
+        weights = paddle.to_tensor(
+            [2.0, 3.0, 4.0], dtype=self.sample.dtype
+        )  # shape: [3,]
+        with self.assertRaises(AssertionError):
+            paddle.histogramdd(sample, weights=weights)
+
+
+class TestHistogramddAPI_sample_weights_type_mismatch_error(TestHistogramddAPI):
+    def test_error(self):
+        sample = paddle.to_tensor(
+            [  # float32
+                [[1.0, 2.0], [3.0, 4.0]],
+                [[5.0, 6.0], [7.0, 8.0]],
+                [[9.0, 10.0], [11.0, 12.0]],
+                [[13.0, 14.0], [15.0, 16.0]],
+            ],
+            dtype=paddle.float32,
+        )
+        weights = paddle.to_tensor(
+            [2.0, 3.0, 4.0], dtype=paddle.float64
+        )  # float64
+        with self.assertRaises(AssertionError):
+            paddle.histogramdd(sample, weights=weights)
+
+
+class TestHistogramddAPI_check_bins_type_error(TestHistogramddAPI):
+    def test_error(self):
+        sample = paddle.to_tensor(
+            [
+                [[1.0, 2.0], [3.0, 4.0]],
+                [[5.0, 6.0], [7.0, 8.0]],
+                [[9.0, 10.0], [11.0, 12.0]],
+                [[13.0, 14.0], [15.0, 16.0]],
+            ]
+        )
+        bins = 2.0
+        with self.assertRaises(ValueError):
+            paddle.histogramdd(sample, bins=bins)
+
+
+if __name__ == '__main__':
+    paddle.enable_static()
+    unittest.main()

From 7a61a711aafd422d219e6d3f39ddfbf7cd09de7b Mon Sep 17 00:00:00 2001
From: feifei-111 <2364819892@qq.com>
Date: Mon, 11 Dec 2023 12:50:11 +0800
Subject: [PATCH 11/28] [Dy2s] rm using global name generator in static mode
 (#59294)

---
 python/paddle/base/framework.py               | 17 +++++++
python/paddle/base/layer_helper.py | 7 ++- python/paddle/base/layer_helper_base.py | 27 ++++++----- python/paddle/base/layers/math_op_patch.py | 6 +-- python/paddle/base/unique_name.py | 17 +++++++ .../auto_parallel/static/parallelizer_v2.py | 3 +- .../static/tuner/optimization_tuner.py | 3 +- .../static/tuner/rule_based_tuner.py | 4 +- .../paddle/jit/dy2static/partial_program.py | 47 +------------------ .../jit/dy2static/program_translator.py | 15 +----- .../static/quantization/quantization_pass.py | 2 +- .../test_auto_parallel_dist_tensor.py | 4 +- .../test_auto_parallel_partitioner_gpt.py | 16 +++---- .../test_get_inputs_outputs_in_block.py | 2 +- test/prim/pir_prim/test_decompose_op.py | 14 +++--- 15 files changed, 85 insertions(+), 99 deletions(-) diff --git a/python/paddle/base/framework.py b/python/paddle/base/framework.py index 697eb77d8ae40a..eab4bdf38cca56 100644 --- a/python/paddle/base/framework.py +++ b/python/paddle/base/framework.py @@ -6390,8 +6390,25 @@ def clone(self, for_test=False): p._copy_data_info_from(self, pruned_origin_block_id_map) p._copy_dist_param_info_from(self) p._copy_operator_info_from(self) + p._name_generator = self._name_generator.clone() return p + @signature_safe_contextmanager + def switch_name_generator_guard(self, new_generator): + if isinstance(new_generator, str): + new_generator = unique_name.UniqueNameGenerator(new_generator) + elif isinstance(new_generator, bytes): + new_generator = unique_name.UniqueNameGenerator( + new_generator.decode() + ) + + old_generator = self._name_generator + self._name_generator = new_generator + try: + yield + finally: + self._name_generator = old_generator + def _prune(self, targets): """ Prune operators and variables which are not needed to generate diff --git a/python/paddle/base/layer_helper.py b/python/paddle/base/layer_helper.py index 333b176337a953..5511ae9cc49082 100644 --- a/python/paddle/base/layer_helper.py +++ b/python/paddle/base/layer_helper.py @@ -36,7 +36,12 @@ def __init__(self, layer_type, **kwargs): # can not use both `layer_type` and `name`. Deprecate LayerHelper # and write a Helper for dygraph mode. 
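        # (Editor's note, not part of the original patch: the branch below is
        # where this commit reroutes static-mode name generation; names now
        # come from self.main_program._name_generator instead of the global
        # unique_name state, so each Program numbers its layers independently.)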
if name is None: - self.kwargs['name'] = unique_name.generate(layer_type) + if in_dygraph_mode(): + self.kwargs['name'] = unique_name.generate(layer_type) + else: + self.kwargs[ + 'name' + ] = self.main_program._name_generator.generate(layer_type) super().__init__(self.kwargs['name'], layer_type=layer_type) diff --git a/python/paddle/base/layer_helper_base.py b/python/paddle/base/layer_helper_base.py index 74ba6408ef8d81..96d71bc3e98443 100644 --- a/python/paddle/base/layer_helper_base.py +++ b/python/paddle/base/layer_helper_base.py @@ -120,14 +120,14 @@ def __norm_op( ): if out is None: out = block.create_var( - name=unique_name.generate_with_ignorable_key( + name=self.main_program._name_generator.generate_with_ignorable_key( ".".join([self.name, 'weight_norm_norm']) ), dtype=dtype, persistable=False, ) abs_out = block.create_var( - name=unique_name.generate_with_ignorable_key( + name=self.main_program._name_generator.generate_with_ignorable_key( ".".join([self.name, 'weight_norm_abs']) ), dtype=dtype, @@ -137,7 +137,7 @@ def __norm_op( type='abs', inputs={'X': x}, outputs={'Out': abs_out} ) pow_out = block.create_var( - name=unique_name.generate_with_ignorable_key( + name=self.main_program._name_generator.generate_with_ignorable_key( ".".join([self.name, 'weight_norm_pow']) ), dtype=dtype, @@ -150,7 +150,7 @@ def __norm_op( attrs={'factor': float(p)}, ) sum_out = block.create_var( - name=unique_name.generate_with_ignorable_key( + name=self.main_program._name_generator.generate_with_ignorable_key( ".".join([self.name, 'weight_norm_sum']) ), dtype=dtype, @@ -179,7 +179,7 @@ def __reshape_op( ): if out is None: out = block.create_var( - name=unique_name.generate_with_ignorable_key( + name=self.main_program._name_generator.generate_with_ignorable_key( ".".join([self.name, 'weight_norm_reshape']) ), dtype=dtype, @@ -199,7 +199,7 @@ def __transpose_op( ): if out is None: out = block.create_var( - name=unique_name.generate_with_ignorable_key( + name=self.main_program._name_generator.generate_with_ignorable_key( ".".join([self.name, 'weight_norm_transpose']) ), dtype=dtype, @@ -219,7 +219,7 @@ def __norm_except_dim( """Computes the norm over all dimensions except dim""" if out is None: out = block.create_var( - name=unique_name.generate_with_ignorable_key( + name=self.main_program._name_generator.generate_with_ignorable_key( ".".join([self.name, 'weight_norm_norm']) ), dtype=dtype, @@ -376,7 +376,12 @@ def create_parameter( else default_initializer ) if attr.name is None: - attr.name = unique_name.generate(".".join([self.name, suffix])) + if in_dygraph_mode(): + attr.name = unique_name.generate(".".join([self.name, suffix])) + else: + attr.name = self.main_program._name_generator.generate( + ".".join([self.name, suffix]) + ) if default_initializer is None and attr.initializer is None: if isinstance(dtype, core.VarDesc.VarType): @@ -466,7 +471,7 @@ def create_variable_for_type_inference( if not dtype: dtype = self.__dtype return self.main_program.current_block().create_var( - name=unique_name.generate_with_ignorable_key( + name=self.main_program._name_generator.generate_with_ignorable_key( ".".join([self.name, 'tmp']) ), dtype=dtype, @@ -491,7 +496,7 @@ def _create_global_variable_for_type_inference( if not dtype: dtype = self.__dtype output = self.main_program.global_block().create_var( - name=unique_name.generate_with_ignorable_key( + name=self.main_program._name_generator.generate_with_ignorable_key( ".".join([self.name, 'tmp']) ), dtype=dtype, @@ -524,7 +529,7 @@ def 
create_sparse_variable_for_type_inference( if not dtype: dtype = self.__dtype return self.main_program.current_block().create_var( - name=unique_name.generate_with_ignorable_key( + name=self.main_program._name_generator.generate_with_ignorable_key( ".".join([self.name, 'tmp']) ), dtype=dtype, diff --git a/python/paddle/base/layers/math_op_patch.py b/python/paddle/base/layers/math_op_patch.py index bf1d737970327d..bbf17a58a1a77b 100644 --- a/python/paddle/base/layers/math_op_patch.py +++ b/python/paddle/base/layers/math_op_patch.py @@ -17,8 +17,8 @@ from paddle.base.dygraph.base import in_to_static_mode -from .. import core -from ..framework import Variable, static_only, unique_name +from .. import core, default_main_program +from ..framework import Variable, static_only from .layer_function_generator import OpProtoHolder _supported_int_dtype_ = [ @@ -70,7 +70,7 @@ def monkey_patch_variable(): def unique_tmp_name(): - return unique_name.generate("tmp") + return default_main_program()._name_generator.generate("tmp") def safe_get_dtype(var): try: diff --git a/python/paddle/base/unique_name.py b/python/paddle/base/unique_name.py index d3f6d41d3624da..95acd00cc60eab 100644 --- a/python/paddle/base/unique_name.py +++ b/python/paddle/base/unique_name.py @@ -13,6 +13,7 @@ # limitations under the License. import collections +from copy import deepcopy from .wrapped_decorator import signature_safe_contextmanager @@ -35,6 +36,9 @@ def __init__(self, prefix=None): self.prefix = prefix def __call__(self, key): + return self.generate(key) + + def generate(self, key): """ Generate unique names with prefix @@ -47,6 +51,19 @@ def __call__(self, key): self.ids[key] += 1 return self.prefix + "_".join([key, str(tmp)]) + def generate_with_ignorable_key(self, key): + from .framework import _dygraph_tracer, in_dygraph_mode + + if in_dygraph_mode(): + return _dygraph_tracer()._generate_unique_name() + + return self.generate(key) + + def clone(self): + ret = UniqueNameGenerator(self.prefix) + ret.ids = deepcopy(self.ids) + return ret + class DygraphParameterNameChecker: """ diff --git a/python/paddle/distributed/auto_parallel/static/parallelizer_v2.py b/python/paddle/distributed/auto_parallel/static/parallelizer_v2.py index d7d79c3025547c..4ce9ffef799609 100644 --- a/python/paddle/distributed/auto_parallel/static/parallelizer_v2.py +++ b/python/paddle/distributed/auto_parallel/static/parallelizer_v2.py @@ -20,7 +20,6 @@ from paddle.distributed.passes import PassManager, new_pass from paddle.framework import get_flags from paddle.static import append_backward, program_guard -from paddle.utils import unique_name from ...utils.log_utils import get_logger from ..random import init_auto_parallel_rng @@ -257,7 +256,7 @@ def _generate_optimizer( optimizer._sorted = False with program_guard(main_program, startup_program): - with unique_name.guard("opt_"): + with main_program.switch_name_generator_guard("opt_"): optimizer_ops = optimizer.apply_gradients(params_grads) self._completer.complete_update_annotation(main_program) return optimizer_ops diff --git a/python/paddle/distributed/auto_parallel/static/tuner/optimization_tuner.py b/python/paddle/distributed/auto_parallel/static/tuner/optimization_tuner.py index 6f1c26e5f235c4..c67beec9093a16 100644 --- a/python/paddle/distributed/auto_parallel/static/tuner/optimization_tuner.py +++ b/python/paddle/distributed/auto_parallel/static/tuner/optimization_tuner.py @@ -41,7 +41,6 @@ from paddle.distributed.auto_parallel.static.utils import debug_program from 
paddle.distributed.passes import PassContext, new_pass from paddle.static import append_backward, program_guard -from paddle.utils import unique_name from ..utils import get_logger from .algorithms import new_algorithm @@ -344,7 +343,7 @@ def _apply_optimization(self, trial): # Generate optimizer # FIXME should be remove from apply pass after pass support optimizers with program_guard(dist_main_prog, dist_startup_prog): - with unique_name.guard("opt_"): + with dist_main_prog.switch_name_generator_guard("opt_"): optimizer_ops = dist_context.serial_optimizer.apply_gradients( dist_params_grads ) diff --git a/python/paddle/distributed/auto_parallel/static/tuner/rule_based_tuner.py b/python/paddle/distributed/auto_parallel/static/tuner/rule_based_tuner.py index 95c258a41c6d55..4a88948e90f770 100644 --- a/python/paddle/distributed/auto_parallel/static/tuner/rule_based_tuner.py +++ b/python/paddle/distributed/auto_parallel/static/tuner/rule_based_tuner.py @@ -28,7 +28,7 @@ import paddle from paddle.base import program_guard from paddle.base.backward import append_backward -from paddle.base.framework import Parameter, unique_name +from paddle.base.framework import Parameter from paddle.distributed.auto_parallel.process_mesh import ProcessMesh from paddle.distributed.auto_parallel.static.cluster_v2 import DeviceMesh from paddle.distributed.auto_parallel.static.completion import Completer @@ -1162,7 +1162,7 @@ def gen_full_program(self): with program_guard( self.full_main_program, self.full_startup_program ): - with unique_name.guard("opt_"): + with self.full_main_program.switch_name_generator_guard("opt_"): optimizer_ops = optimizer.apply_gradients(params_grads) # op original id to grad op id diff --git a/python/paddle/jit/dy2static/partial_program.py b/python/paddle/jit/dy2static/partial_program.py index cbdb46df12c648..1633b3919a8db7 100644 --- a/python/paddle/jit/dy2static/partial_program.py +++ b/python/paddle/jit/dy2static/partial_program.py @@ -25,7 +25,6 @@ from paddle.base.data_feeder import check_type, convert_dtype from paddle.base.dygraph.base import switch_to_static_graph from paddle.base.framework import _apply_pass, get_flags -from paddle.base.unique_name import switch from paddle.optimizer.lr import LRScheduler from . import logging_utils @@ -172,19 +171,12 @@ class PartialProgramLayer: """ def __init__( - self, - main_program, - inputs, - outputs, - name_generator, - parameters=None, - **kwargs + self, main_program, inputs, outputs, parameters=None, **kwargs ): super().__init__() self._inputs = NestSequence(inputs) self._outputs = NestSequence(outputs, need_check=True) self._params = parameters if parameters is not None else [] - self._name_generator = name_generator self._build_strategy = kwargs.get('build_strategy', BuildStrategy()) assert isinstance(self._build_strategy, BuildStrategy) @@ -234,8 +226,6 @@ def __call__(self, inputs): """ Execute static graph by Interpreter and Return dynamic Tensors. """ - old_generator, old_para_name_checker = switch(self._name_generator) - in_vars, in_var_names = self._prepare_inputs(inputs) out_vars = self._prepare_outputs() self._cast_fp16_if_pure_fp16(in_vars) @@ -261,16 +251,12 @@ def __call__(self, inputs): restored_nest_out = self._restore_out(out_vars) restored_nest_out = self._remove_no_value(restored_nest_out) - switch(old_generator, old_para_name_checker) return restored_nest_out def sot_call(self, inputs): """ - Same as __call__, but set force_not_use_pt to False. 
- Currently _sot_call will cause CUDA 700 error, so we disable it temporarily. + In sot, inputs and outputs of partial program only contain tensors, so we can skip some step to speed up """ - old_generator, old_para_name_checker = switch(self._name_generator) - in_vars, in_var_names = self._prepare_inputs(inputs) out_vars = self._prepare_outputs() self._cast_fp16_if_pure_fp16(in_vars) @@ -292,36 +278,8 @@ def sot_call(self, inputs): restored_nest_out = self._restore_out(out_vars) restored_nest_out = self._remove_no_value(restored_nest_out) - - switch(old_generator, old_para_name_checker) return restored_nest_out - def _sot_call(self, inputs): - """ - In sot, inputs and outputs of partial program only contain tensors, so we can skip some step to speed up - """ - old_generator, old_para_name_checker = switch(self._name_generator) - - out_vars = self._prepare_outputs() - self._cast_fp16_if_pure_fp16(inputs) - attrs = self._prepare_attributes() - attrs.extend(["x_names", self._in_var_names]) - self._sync_lr_value_with_scheduler() - - _legacy_C_ops.run_program( - self._valid_vars(inputs), - self._valid_vars(self._params), - self._valid_vars(out_vars), - self._create_scope_vec( - program_id=self.program_id, use_scope_cache=True - ), - self._cuda_graph_vec, - *attrs - ) - - switch(old_generator, old_para_name_checker) - return out_vars - def _sync_lr_value_with_scheduler(self): """Update lr_var value with calculated by lr_scheduler.""" main_program = self._origin_main_program @@ -1207,7 +1165,6 @@ def partial_program_from(concrete_program, from_method=False): concrete_program.main_program, inputs, concrete_program.outputs, - concrete_program.name_generator, concrete_program.parameters, **concrete_program.kwargs ) diff --git a/python/paddle/jit/dy2static/program_translator.py b/python/paddle/jit/dy2static/program_translator.py index b3872a415abe44..1e68377d3c3d92 100644 --- a/python/paddle/jit/dy2static/program_translator.py +++ b/python/paddle/jit/dy2static/program_translator.py @@ -30,10 +30,6 @@ param_guard, switch_to_static_graph, ) -from paddle.base.unique_name import ( - UniqueNameGenerator, - guard as UniqueNameGuard, -) from paddle.framework import in_dynamic_mode, use_pir_api from paddle.nn.layer import layers from paddle.pir import OpResult @@ -1149,7 +1145,6 @@ class ConcreteProgram: "startup_program", "parameters", "function", - "name_generator", 'kwargs', ] @@ -1159,7 +1154,6 @@ def __init__( outputs, parameters, function, - name_generator, main_program, startup_program=None, **kwargs, @@ -1170,7 +1164,6 @@ def __init__( self.startup_program = startup_program self.parameters = parameters self.function = function - self.name_generator = name_generator self.kwargs = kwargs @staticmethod @@ -1261,12 +1254,10 @@ def pir_from_func_spec( # TODO(@xiongkun): support op call stack in new ir? 
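[Editor's note] The theme of this commit, visible across these hunks, is that
each Program now owns its name counter, so dy2static no longer swaps a global
UniqueNameGenerator in and out around every trace. A toy sketch of the property
this buys (assumptions: Program() initializes a fresh _name_generator, as the
clone() hunk implies, and switch_name_generator_guard is the context manager
added in framework.py above):

    import paddle

    paddle.enable_static()
    prog_a, prog_b = paddle.static.Program(), paddle.static.Program()
    # Independent counters: both programs may issue the same first name.
    assert prog_a._name_generator.generate("tmp") == "tmp_0"
    assert prog_b._name_generator.generate("tmp") == "tmp_0"
    # A scoped prefix replaces the old unique_name.guard("opt_") pattern.
    with prog_a.switch_name_generator_guard("opt_"):
        assert prog_a._name_generator.generate("tmp") == "opt_tmp_0"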
# main_program = update_op_callstack_with_origin_info(main_program) - new_name_generator = UniqueNameGenerator() return ConcreteProgram( inputs=static_inputs, outputs=outputs, parameters=all_parameters_and_buffers, - name_generator=new_name_generator, function=dygraph_function, main_program=main_program, startup_program=startup_program, @@ -1307,13 +1298,10 @@ def from_func_spec( framework.default_startup_program().random_seed ) - new_name_generator = UniqueNameGenerator() ProgramTranslator.get_instance()._amp_records.clear() with framework.program_guard(main_program, startup_program): - with _to_static_mode_guard_(is_to_static=True), UniqueNameGuard( - new_name_generator - ): + with _to_static_mode_guard_(is_to_static=True): # 1. Adds `paddle.static.data` layers for input if needed static_inputs = func_spec.to_static_inputs_with_spec( input_spec, main_program @@ -1368,7 +1356,6 @@ def from_func_spec( outputs=outputs, parameters=all_parameters_and_buffers, function=dygraph_function, - name_generator=new_name_generator, main_program=main_program, startup_program=startup_program, **kwargs, diff --git a/python/paddle/static/quantization/quantization_pass.py b/python/paddle/static/quantization/quantization_pass.py index 99304210afa101..453637b36aeb23 100644 --- a/python/paddle/static/quantization/quantization_pass.py +++ b/python/paddle/static/quantization/quantization_pass.py @@ -925,7 +925,7 @@ def _insert_func(self, graph, func, var_node, op): tmp_program = Program() startup_program = Program() with program_guard(tmp_program, startup_program): - with unique_name.guard(var_node.name() + "_"): + with tmp_program.switch_name_generator_guard(var_node.name() + "_"): in_node = data( var_node.name() + '_tmp_input', shape=var_node.shape(), diff --git a/test/legacy_test/test_auto_parallel_dist_tensor.py b/test/legacy_test/test_auto_parallel_dist_tensor.py index 259739aaa0a1ec..805d83a3e15254 100644 --- a/test/legacy_test/test_auto_parallel_dist_tensor.py +++ b/test/legacy_test/test_auto_parallel_dist_tensor.py @@ -109,7 +109,7 @@ def test_new_local_tensor(self): ) = get_dist_prog(train_program, startup_program, dist_context, rank_id) dist_context.dist_main_programs[rank_id] = dist_main_prog dist_context.dist_startup_programs[rank_id] = dist_startup_prog - name = "layer_norm_1.tmp_2" + name = "layer_norm_0.tmp_2" dist_tensor = dist_context.get_dist_tensor_for_program( complete_train_program.global_block().vars[name] ) @@ -133,7 +133,7 @@ def test_new_local_tensor(self): ) dist_context.dist_main_programs[rank_id] = dist_main_prog dist_context.dist_startup_programs[rank_id] = dist_startup_prog - name = "layer_norm_1.tmp_2" + name = "layer_norm_0.tmp_2" dist_tensor = dist_context.get_dist_tensor_for_program( complete_train_program.global_block().vars[name] ) diff --git a/test/legacy_test/test_auto_parallel_partitioner_gpt.py b/test/legacy_test/test_auto_parallel_partitioner_gpt.py index 0828cafa60b817..0f8ad91da4b3b1 100644 --- a/test/legacy_test/test_auto_parallel_partitioner_gpt.py +++ b/test/legacy_test/test_auto_parallel_partitioner_gpt.py @@ -934,14 +934,14 @@ def test_gpt_dp_mp(self): [param.name for param in startup_program.all_parameters()] ) allreduce_grads = [ - 'layer_norm_5.tmp_2', - 'layer_norm_5.tmp_2', - 'layer_norm_5.tmp_2', - 'layer_norm_6.tmp_2', - 'layer_norm_7.tmp_2', - 'layer_norm_7.tmp_2', - 'layer_norm_7.tmp_2', - 'layer_norm_8.tmp_2', + 'layer_norm_0.tmp_2', + 'layer_norm_0.tmp_2', + 'layer_norm_0.tmp_2', + 'layer_norm_1.tmp_2', + 'layer_norm_2.tmp_2', + 'layer_norm_2.tmp_2', + 
'layer_norm_2.tmp_2', + 'layer_norm_3.tmp_2', ] process_mesh = _global_process_mesh mp_parallel_axis = 1 diff --git a/test/legacy_test/test_get_inputs_outputs_in_block.py b/test/legacy_test/test_get_inputs_outputs_in_block.py index cef73b5268b05f..0d4b743c48ca7f 100644 --- a/test/legacy_test/test_get_inputs_outputs_in_block.py +++ b/test/legacy_test/test_get_inputs_outputs_in_block.py @@ -72,7 +72,7 @@ def _test_cond(self): inner_outputs, ) = paddle.utils.get_inputs_outputs_in_block(sub_block) # 'fill_constant_1.tmp_0', 'tmp_3' are names of a, c - self.assertTrue(inner_inputs == {'fill_constant_1.tmp_0', 'tmp_3'}) + self.assertTrue(inner_inputs == {'fill_constant_1.tmp_0', 'tmp_0'}) # '_generated_var_1', is name of a + c self.assertTrue(inner_outputs == {'_generated_var_0'}) diff --git a/test/prim/pir_prim/test_decompose_op.py b/test/prim/pir_prim/test_decompose_op.py index 791f3fdeed945b..f58a0a8819f2c3 100644 --- a/test/prim/pir_prim/test_decompose_op.py +++ b/test/prim/pir_prim/test_decompose_op.py @@ -121,13 +121,13 @@ def net(self, flag=None): # get the old_ir_grad_var_to_var map old_ir_grad_var_to_var_map = { - 'dropout_1.tmp_0@GRAD': 'dropout_1.tmp_0', - 'elementwise_add_2@GRAD': 'elementwise_add_2', - 'elementwise_add_3@GRAD': 'elementwise_add_3', - 'elementwise_mul_1@GRAD': 'elementwise_mul_1', - 'layer_norm_1.tmp_2@GRAD': 'layer_norm_1.tmp_2', - 'rsqrt_1.tmp_0@GRAD': 'rsqrt_1.tmp_0', - 'mean_1.tmp_0@GRAD': 'mean_1.tmp_0', + 'dropout_0.tmp_0@GRAD': 'dropout_0.tmp_0', + 'elementwise_add_0@GRAD': 'elementwise_add_0', + 'elementwise_add_1@GRAD': 'elementwise_add_1', + 'elementwise_mul_0@GRAD': 'elementwise_mul_0', + 'layer_norm_0.tmp_2@GRAD': 'layer_norm_0.tmp_2', + 'rsqrt_0.tmp_0@GRAD': 'rsqrt_0.tmp_0', + 'mean_0.tmp_0@GRAD': 'mean_0.tmp_0', 'x@GRAD': 'x', 'x@GRAD@RENAME@block0@0': 'x', 'x@GRAD@RENAME@block0@1': 'x', From c140f91f9aaee629110ff9839f0a42cceae68740 Mon Sep 17 00:00:00 2001 From: winter-wang <78149749+winter-wang@users.noreply.github.com> Date: Mon, 11 Dec 2023 14:10:01 +0800 Subject: [PATCH 12/28] [PIR] move opresult interface to value in python api. 
(#59783) --- .../pir_adaptor/pir_adaptor_util.cc | 9 +- .../framework/new_executor/pir_interpreter.cc | 18 +-- paddle/fluid/pir/transforms/inplace_pass.cc | 12 +- paddle/fluid/pybind/eager_utils.cc | 2 +- paddle/fluid/pybind/pir.cc | 150 +++++++----------- paddle/fluid/pybind/pir.h | 6 +- paddle/pir/core/value.h | 2 +- python/paddle/base/executor.py | 11 +- python/paddle/base/variable_index.py | 14 +- python/paddle/decomposition/decomp.py | 14 +- .../paddle/geometric/message_passing/utils.py | 2 +- .../jit/dy2static/pir_partial_program.py | 20 +-- .../jit/dy2static/program_translator.py | 6 +- python/paddle/jit/sot/infer_meta.py | 6 +- python/paddle/nn/functional/common.py | 18 +-- python/paddle/nn/functional/conv.py | 2 +- python/paddle/nn/initializer/constant.py | 2 +- python/paddle/nn/layer/layers.py | 4 +- python/paddle/optimizer/adam.py | 16 +- 19 files changed, 123 insertions(+), 191 deletions(-) diff --git a/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.cc b/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.cc index e8f84d19854ee6..139da2dd57d482 100644 --- a/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.cc +++ b/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.cc @@ -243,12 +243,9 @@ Variable* CreateVar(pir::Value value, bool is_persisable = false; if (def_op->isa<::pir::ParameterOp>()) { is_persisable = true; - } else if (def_op->HasAttribute(kAttrIsPersisable)) { - is_persisable = def_op->attribute(kAttrIsPersisable) - .dyn_cast() - .AsVector()[value.dyn_cast().index()] - .dyn_cast() - .data(); + } else if (auto attr = + value.attribute(kAttrIsPersisable)) { + is_persisable = attr.data(); } Variable* var = nullptr; diff --git a/paddle/fluid/framework/new_executor/pir_interpreter.cc b/paddle/fluid/framework/new_executor/pir_interpreter.cc index 45674498b179fb..4f2842987e2cc8 100644 --- a/paddle/fluid/framework/new_executor/pir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/pir_interpreter.cc @@ -1717,21 +1717,9 @@ void PirInterpreter::SolvePersisableVarNames() { for (auto kv : value_exe_info_->GetValue2VarName()) { ::pir::Value value = kv.first; const std::string& var_name = kv.second; - ::pir::OpResult result = value.dyn_cast<::pir::OpResult>(); - if (!result) { - continue; - } - auto* defining_op = result.owner(); - if (defining_op->HasAttribute(kAttrIsPersisable)) { - auto is_persisables = - defining_op->attribute<::pir::ArrayAttribute>(kAttrIsPersisable) - .AsVector(); - if (is_persisables[result.index()] - .dyn_cast<::pir::BoolAttribute>() - .data()) { - VLOG(6) << "parameter_var_names_ include: " << var_name; - parameter_var_names_.insert(var_name); - } + auto bool_attr = value.attribute<::pir::BoolAttribute>(kAttrIsPersisable); + if (bool_attr && bool_attr.data()) { + parameter_var_names_.insert(var_name); } } } diff --git a/paddle/fluid/pir/transforms/inplace_pass.cc b/paddle/fluid/pir/transforms/inplace_pass.cc index baab42b2b37a3e..57813d54a53f7a 100644 --- a/paddle/fluid/pir/transforms/inplace_pass.cc +++ b/paddle/fluid/pir/transforms/inplace_pass.cc @@ -57,16 +57,8 @@ static bool CanBeDeleted(pir::Value value) { !value.type().isa()) { return false; } - if (auto op_result = value.dyn_cast()) { - auto def_op = op_result.owner(); - if (def_op->HasAttribute(kAttrIsPersisable)) { - return !(def_op->attribute(kAttrIsPersisable) - .AsVector()[op_result.index()] - .dyn_cast() - .data()); - } - } - return true; + auto persist_attr = value.attribute(kAttrIsPersisable); + return !(persist_attr && 
persist_attr.data()); } static bool CanDoInplace(const std::unordered_set& eager_dels, diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index 2b8f36f8988cfa..5b9bf6bc35c3c2 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -1967,7 +1967,7 @@ paddle::Tensor CreateTensorFromOpResult(const pir::OpResult& op_result) { auto autograd_meta = egr::EagerUtils::autograd_meta(&tensor); autograd_meta->SetPersistable(false); autograd_meta->SetStopGradient( - GetOpResultBoolAttr(op_result, kAttrStopGradients)); + GetValueBoolAttr(op_result, kAttrStopGradients)); if (op_result.type().isa()) { // TODO(jiabin): Maybe support LOD later diff --git a/paddle/fluid/pybind/pir.cc b/paddle/fluid/pybind/pir.cc index e9ff0e4d4d5430..ed22ded0a090ca 100644 --- a/paddle/fluid/pybind/pir.cc +++ b/paddle/fluid/pybind/pir.cc @@ -82,6 +82,7 @@ using paddle::dialect::SelectedRowsType; using pir::Attribute; using pir::Block; using pir::BlockArgument; +using pir::BoolAttribute; using pir::IrParser; using pir::Operation; using pir::OpOperand; @@ -600,7 +601,7 @@ void BindValue(py::module *m) { )DOC"); g_ir_value_pytype = reinterpret_cast(value.ptr()); - value + value.def(py::init<>()) .def_property_readonly( "block", [](Value self) { @@ -651,6 +652,37 @@ void BindValue(py::module *m) { PADDLE_THROW(phi::errors::InvalidArgument( "can't set dtype when building static graph")); }) + .def("initialized", + [](Value self) { + if (self.impl() == nullptr || self.type().storage() == nullptr) { + return false; + } else { + return true; + } + }) + .def_property( + "stop_gradient", + [](Value self) { + auto stop_gradient = + self.attribute(kAttrStopGradients); + return !stop_gradient || stop_gradient.data(); + }, + [](Value self, bool stop_gradient) { + self.set_attribute( + kAttrStopGradients, + BoolAttribute::get(pir::IrContext::Instance(), stop_gradient)); + }) + .def_property( + "persistable", + [](Value self) { + auto persistable = self.attribute(kAttrIsPersisable); + return !persistable || persistable.data(); + }, + [](Value self, bool persistable) { + self.set_attribute( + kAttrIsPersisable, + BoolAttribute::get(pir::IrContext::Instance(), persistable)); + }) .def( "get_defining_op", [](Value self) -> pir::Operation * { @@ -692,13 +724,27 @@ void BindValue(py::module *m) { .def("first_use", &Value::first_use, return_value_policy::reference) .def("has_one_use", &Value::HasOneUse) .def("use_empty", &Value::use_empty) + .def("__str__", + [](Value self) -> py::str { + std::ostringstream print_stream; + print_stream << "Value("; + print_stream << GetValueInfo(self); + auto stop_gradient = + self.attribute(kAttrStopGradients); + if (stop_gradient && !stop_gradient.data()) { + print_stream << ", stop_gradient=False"; + } else { + print_stream << ", stop_gradient=True"; + } + print_stream << ")"; + return print_stream.str(); + }) .def("__neg__", [](Value self) { return paddle::dialect::scale(self, -1.0, 0.0, true); }) .def("__eq__", &Value::operator==) .def("__hash__", [](Value self) { return std::hash{}(self); }) - .def("__str__", &Value2String) .def("__repr__", &Value2String); // For basaic operators OVERRIDE_OPERATOR_FOR_EACH(__add__, add, 1.0, other, true); @@ -734,47 +780,9 @@ void BindOpOperand(py::module *m) { .def("index", &OpOperand::index); } -bool GetOpResultBoolAttr(const OpResult &self, const std::string &attr_name) { - auto *defining_op = self.owner(); - if (defining_op->HasAttribute(attr_name)) { - PADDLE_ENFORCE( - 
defining_op->attribute(attr_name).isa(), - paddle::platform::errors::InvalidArgument( - "%s: Callstack attributes of %s is not ArrayAttribute type", - attr_name)); - auto attrs = defining_op->attribute(attr_name) - .dyn_cast() - .AsVector(); - PADDLE_ENFORCE(attrs[self.index()].isa(), - paddle::platform::errors::InvalidArgument( - "The index %d in %s is not BoolAttribute type", - self.index(), - attr_name)); - return attrs[self.index()].dyn_cast().data(); - } else { - return true; - } -} - -void SetOpResultBoolAttr(const OpResult &self, - const std::string &attr_name, - bool value, - bool default_value) { - auto *defining_op = self.owner(); - std::vector attrs; - if (defining_op->HasAttribute(attr_name)) { - attrs = defining_op->attribute(attr_name) - .dyn_cast() - .AsVector(); - } else { - attrs = std::vector( - defining_op->num_results(), - pir::BoolAttribute::get(pir::IrContext::Instance(), default_value)); - } - attrs[self.index()] = - pir::BoolAttribute::get(pir::IrContext::Instance(), value); - defining_op->set_attribute( - attr_name, pir::ArrayAttribute::get(pir::IrContext::Instance(), attrs)); +bool GetValueBoolAttr(Value value, const std::string &attr_name) { + auto bool_attr = value.attribute(attr_name); + return !bool_attr || bool_attr.data(); } void BindOpResult(py::module *m) { @@ -786,58 +794,10 @@ void BindOpResult(py::module *m) { when build network. )DOC"); g_ir_opresult_pytype = reinterpret_cast(op_result.ptr()); - op_result - .def( - "__init__", - [](OpResult &self) { new (&self) OpResult(); }, - pybind11::return_value_policy::reference) - .def("__str__", - [](OpResult &self) -> py::str { - std::ostringstream print_stream; - print_stream << "OpResult("; - print_stream << GetValueInfo(self); - if (GetOpResultBoolAttr(self, kAttrStopGradients)) { - print_stream << ", stop_gradient=True"; - } else { - print_stream << ", stop_gradient=False"; - } - print_stream << ")"; - return print_stream.str(); - }) - .def("initialized", - [](OpResult &self) { - if (self.impl() == nullptr || self.type().storage() == nullptr) { - return false; - } else { - return true; - } - }) - .def_property( - "stop_gradient", - [](OpResult &self) { - return GetOpResultBoolAttr(self, kAttrStopGradients); - }, - [](OpResult &self, bool stop_gradient) { - // NOTE(Aurelius84): For other OpResult, set theirs - // stop_gradient default value as true. - SetOpResultBoolAttr(self, - kAttrStopGradients, - stop_gradient, - /*default_value=*/true); - }) - .def_property( - "persistable", - [](OpResult &self) { - return GetOpResultBoolAttr(self, kAttrIsPersisable); - }, - [](OpResult &self, bool persistable) { - // NOTE(Aurelius84): For other OpResult, set theirs - // persistable default value as false. 
- SetOpResultBoolAttr(self, - kAttrIsPersisable, - persistable, - /*default_value=*/false); - }); + op_result.def( + "__init__", + [](OpResult &self) { new (&self) OpResult(); }, + pybind11::return_value_policy::reference); } void BindType(py::module *m) { diff --git a/paddle/fluid/pybind/pir.h b/paddle/fluid/pybind/pir.h index 81ae155bbd28ef..e687b9cb699f00 100644 --- a/paddle/fluid/pybind/pir.h +++ b/paddle/fluid/pybind/pir.h @@ -21,9 +21,9 @@ namespace paddle { namespace pybind { -using pir::OpResult; +using pir::Value; void BindPir(pybind11::module *m); -const phi::DDim &GetValueDims(pir::Value value); -bool GetOpResultBoolAttr(const OpResult &self, const std::string &attr_name); +const phi::DDim &GetValueDims(Value value); +bool GetValueBoolAttr(Value value, const std::string &attr_name); } // namespace pybind } // namespace paddle diff --git a/paddle/pir/core/value.h b/paddle/pir/core/value.h index 3a62cab47ee892..4a94ca87ecf696 100644 --- a/paddle/pir/core/value.h +++ b/paddle/pir/core/value.h @@ -108,7 +108,7 @@ class IR_API Value { Attribute attribute(const std::string &key) const; template - T attribute(const std::string &name) { + T attribute(const std::string &name) const { return attribute(name).dyn_cast(); } diff --git a/python/paddle/base/executor.py b/python/paddle/base/executor.py index fff2771da14c2f..01605a2741d10d 100755 --- a/python/paddle/base/executor.py +++ b/python/paddle/base/executor.py @@ -24,7 +24,6 @@ from paddle import pir from ..pir import ( - OpResult, Program as PirProgram, Value, translate_to_pir, @@ -526,7 +525,7 @@ def _add_pir_fetch_ops(program, fetch_list, fetch_var_name): with paddle.static.program_guard(program): for i, fetch_input in enumerate(need_fetch_info): assert isinstance( - fetch_input, (OpResult, Value) + fetch_input, Value ), f"Wrong type for fetch_list[{i}]: {type(fetch_input)}" out = paddle._pir_ops.fetch( fetch_input, fetch_var_name + str(i), i @@ -610,7 +609,7 @@ def _to_str(var): return str(var) elif isinstance(var, Operator): return str(id(var)) - elif isinstance(var, OpResult): + elif isinstance(var, Value): return str(var) elif isinstance(var, Value): return str(var) @@ -2082,9 +2081,7 @@ def _run_inference(self, exe, feed): return exe.run(feed) def _check_fetch_list(self, fetch_list): - is_fetch_var = lambda var: isinstance( - var, (Variable, str, OpResult, Value) - ) + is_fetch_var = lambda var: isinstance(var, (Variable, str, Value)) is_tuple_list = lambda var: isinstance(var, (tuple, list)) if fetch_list is None: @@ -2110,7 +2107,7 @@ def _check_fetch_list(self, fetch_list): res.append(var) else: raise TypeError( - "Require fetch_list[{}] 's type shall be one of (OpResult, str), but received {}.".format( + "Require fetch_list[{}] 's type shall be one of (Value, str), but received {}.".format( i, type(var).__name__ ) ) diff --git a/python/paddle/base/variable_index.py b/python/paddle/base/variable_index.py index f3a04076ef3fbd..b5ed230fd11dc4 100644 --- a/python/paddle/base/variable_index.py +++ b/python/paddle/base/variable_index.py @@ -39,7 +39,7 @@ def replace_ellipsis(var, item): item_remove_var = [ ele for ele in item - if not isinstance(ele, (Variable, paddle.pir.OpResult, np.ndarray)) + if not isinstance(ele, (Variable, paddle.pir.Value, np.ndarray)) and ele is not None ] ell_count = item_remove_var.count(Ellipsis) @@ -100,7 +100,7 @@ def is_integer_or_scalar_tensor(ele): return True if len(ele.shape) == 0 and ele.dtype != paddle.bool: return True - elif isinstance(ele, paddle.pir.OpResult): + elif isinstance(ele, 
paddle.pir.Value): if len(ele.shape) == 0 and ele.dtype != paddle.base.libpaddle.BOOL: return True return False @@ -112,7 +112,7 @@ def is_bool_tensor(ele): if isinstance(ele, Variable) and ele.dtype == paddle.bool: return True elif ( - isinstance(ele, paddle.pir.OpResult) + isinstance(ele, paddle.pir.Value) and ele.dtype == paddle.base.libpaddle.BOOL ): return True @@ -127,7 +127,7 @@ def deal_attrs(attrs, attr, attr_name, tensor_attr_name, inputs, infer_flags): attr, dtype="int64" ) for i, dim in enumerate(attr): - if isinstance(dim, (Variable, paddle.pir.OpResult)): + if isinstance(dim, (Variable, paddle.pir.Value)): attrs[attr_name].append(-1) infer_flags[i] = -1 else: @@ -354,7 +354,7 @@ def parse_index(x, indices): has_advanced_index = True estimated_dim += 1 dim += 1 - elif isinstance(slice_item, paddle.pir.OpResult): + elif isinstance(slice_item, paddle.pir.Value): # In this case, the Variable is not 0-dim Tensor and will be treated as advanced-indexing. if slice_item.dtype == paddle.pir.core.DataType.BOOL: if slice_item.ndim == 0: @@ -385,9 +385,7 @@ def parse_index(x, indices): use_strided_slice = ( True if ( - isinstance( - step, (paddle.base.Variable, paddle.pir.OpResult) - ) + isinstance(step, (paddle.base.Variable, paddle.pir.Value)) or step != 1 ) else use_strided_slice diff --git a/python/paddle/decomposition/decomp.py b/python/paddle/decomposition/decomp.py index 938f3d710516f2..62880cfcdb2c3c 100644 --- a/python/paddle/decomposition/decomp.py +++ b/python/paddle/decomposition/decomp.py @@ -30,7 +30,7 @@ def _build_tensor_tuple(xs): - if isinstance(xs, pir.OpResult): + if isinstance(xs, pir.Value): return (xs,) elif isinstance(xs, typing.Sequence): return tuple(xs) @@ -41,14 +41,14 @@ def _analyse_decomp_results(orig_outs, decomp_outs, op): assert len(orig_outs) == len(decomp_outs) res = [] for idx, value in enumerate(decomp_outs): - if isinstance(orig_outs[idx], pir.OpResult): + if isinstance(orig_outs[idx], pir.Value): if ( op.name() in decomp_ops_contain_unused_output.keys() and idx in decomp_ops_contain_unused_output[op.name()] ): assert value[0] is None else: - assert len(value) == 1 and isinstance(value[0], pir.OpResult) + assert len(value) == 1 and isinstance(value[0], pir.Value) res.append(value[0]) else: res.append(value) @@ -76,7 +76,7 @@ def _prepare_python_api_arguments(op): inputs.append(input) else: # for optional input, such as scale for layer_norm op, - # if it is not set, there will be an empty OpResult which is not initialized in ops.operands + # if it is not set, there will be an empty Value which is not initialized in ops.operands # therefore append None for it. inputs.append(None) @@ -191,7 +191,7 @@ def decompose( Args: program (Program): The program to be processed. - src_vars (list[OpResult]): In program, once some operator is decomposed, its vars will be replaced by new ones. This argument means some vars will be used later and corresponding vars will be returned for later usage. + src_vars (list[Value]): In program, once some operator is decomposed, its vars will be replaced by new ones. This argument means some vars will be used later and corresponding vars will be returned for later usage. blacklist (frozenset): The Operators that will be exclude when decomposed into primitives. whitelist (frozenset): Only the operators in whitelist will be decomposed into primitives. 
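To make the user-visible effect of this patch concrete, here is a minimal sketch (not part of the patch) of the interface that now lives directly on pir.Value rather than on OpResult. It assumes a PIR-enabled build (for example FLAGS_enable_pir_api=1); the tiny network and names are only illustrative.

    # Minimal sketch: exercises the stop_gradient / persistable /
    # initialized bindings added to pir.Value in paddle/fluid/pybind/pir.cc.
    import paddle

    paddle.enable_static()
    main = paddle.static.Program()
    with paddle.static.program_guard(main):
        x = paddle.static.data('x', [2, 3], 'float32')  # a pir.Value under PIR
        y = paddle.nn.functional.relu(x)

        y.stop_gradient = False  # stored as a per-value BoolAttribute
        print(y.stop_gradient)   # False
        y.persistable = False    # likewise stored on the value itself
        print(y.persistable)     # False
        print(y.initialized())   # True: impl and type are both set
        print(y)                 # Value(..., stop_gradient=False)
        # Per the getters above, a Value with neither attribute set reads
        # as True for both properties.

Because the flags are attached to the value rather than indexed out of an ArrayAttribute on the defining op, reading or setting them no longer requires reaching the owner operation at all.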
@@ -230,7 +230,7 @@ def decompose( dst_vars = [None] * len(src_vars) dst_vars_dct = {} for idx, item in enumerate(src_vars): - if not isinstance(item, pir.OpResult): + if not isinstance(item, pir.Value): raise TypeError( f"Each var in dst_vars should map corresponding var in src_vars, but got type {type(item)} in {src_vars}." ) @@ -243,7 +243,7 @@ def decompose( op_filter, ) for idx, item in enumerate(dst_vars): - if not isinstance(item, pir.OpResult): + if not isinstance(item, pir.Value): if item is None: dst_vars[idx] = src_vars[idx] else: diff --git a/python/paddle/geometric/message_passing/utils.py b/python/paddle/geometric/message_passing/utils.py index 2566bc0f7074f8..a07c9a99714d27 100644 --- a/python/paddle/geometric/message_passing/utils.py +++ b/python/paddle/geometric/message_passing/utils.py @@ -28,7 +28,7 @@ def convert_out_size_to_list(out_size, op_type): out_size = [0] elif isinstance(out_size, (int, np.int32, np.int64)): out_size = [out_size] - elif isinstance(out_size, (Variable, paddle.pir.OpResult)): + elif isinstance(out_size, (Variable, paddle.pir.Value)): out_size.stop_gradient = True check_dtype( out_size.dtype, diff --git a/python/paddle/jit/dy2static/pir_partial_program.py b/python/paddle/jit/dy2static/pir_partial_program.py index 99e72770dda55a..d6e2181d8c3594 100644 --- a/python/paddle/jit/dy2static/pir_partial_program.py +++ b/python/paddle/jit/dy2static/pir_partial_program.py @@ -26,7 +26,7 @@ from paddle.base.data_feeder import check_type, convert_dtype from paddle.base.dygraph.base import switch_to_static_graph from paddle.optimizer.lr import LRScheduler -from paddle.pir import OpResult, fake_op_result, is_fake_op_result +from paddle.pir import Value, fake_op_result, is_fake_op_result from .utils import RETURN_NO_VALUE_MAGIC_NUM, backend_guard @@ -71,10 +71,10 @@ def _tolist(self): """ Flattens the nested sequences into single list and remove duplicate variables + non-variable elements. """ - variable_map = {} # opresult -> list idx + variable_map = {} # Value -> list idx variable_list = [] for value in paddle.utils.flatten(self._raw_input): - if not isinstance(value, OpResult): + if not isinstance(value, Value): continue if value in variable_map: # remove duplicate opresults. @@ -90,7 +90,7 @@ def restore(self, value_list): assert len(self._var_list) == len(value_list) def to_value(x): - if isinstance(x, OpResult): + if isinstance(x, Value): return value_list[self._var_map[x]] return x @@ -107,9 +107,9 @@ class RunableProgram: """a pir program ready for run_program_op to run. 
constructed by 3 parts: - pir program (pir::Program) - in_out_values - - input_x values ([string | pir::OpResult]) - - input_param values ([string | pir::OpResult]) - - output values ([string | pir::OpResult]) + - input_x values ([string | pir::Value]) + - input_param values ([string | pir::Value]) + - output values ([string | pir::Value]) - forward_backward_ranges - forward_range (tuple(Int, Int)) | None - backward_range (tuple(Int, Int)) | None @@ -725,7 +725,7 @@ def _append_backward_desc(self, train_runnable_program: RunableProgram): check_type( targets, 'targets', - (OpResult, list, tuple), + (Value, list, tuple), 'paddle.static.gradients', ) with ir_static.program_guard(program, None): @@ -787,7 +787,7 @@ def _append_backward_desc(self, train_runnable_program: RunableProgram): # ) mapping_op_result = ( - lambda x: x if isinstance(x, OpResult) else fake_op_result() + lambda x: x if isinstance(x, Value) else fake_op_result() ) inputs_size = len(inputs) x_grad_value = list( @@ -932,7 +932,7 @@ def _create_cuda_graph_vec(self): def _update_stop_gradient(self, out_vars): # Update stop_gradient for all outputs def set_stop_gradient(var, eager_tensor): - assert isinstance(var, OpResult) + assert isinstance(var, Value) eager_tensor.stop_gradient = var.stop_gradient for idx, var in zip(self._outputs.var_list, out_vars): diff --git a/python/paddle/jit/dy2static/program_translator.py b/python/paddle/jit/dy2static/program_translator.py index 1e68377d3c3d92..4c498d719b6f7e 100644 --- a/python/paddle/jit/dy2static/program_translator.py +++ b/python/paddle/jit/dy2static/program_translator.py @@ -32,7 +32,7 @@ ) from paddle.framework import in_dynamic_mode, use_pir_api from paddle.nn.layer import layers -from paddle.pir import OpResult +from paddle.pir import Value from paddle.utils import flatten, gast from . import error, logging_utils @@ -1030,7 +1030,7 @@ def inputs(self): inputs = [ var for var in flatten(concrete_program.inputs) - if isinstance(var, (framework.Variable, OpResult)) + if isinstance(var, (framework.Variable, Value)) ] return inputs @@ -1044,7 +1044,7 @@ def outputs(self): outputs = [ var for var in flatten(concrete_program.outputs) - if isinstance(var, (framework.Variable, OpResult)) + if isinstance(var, (framework.Variable, Value)) ] return outputs diff --git a/python/paddle/jit/sot/infer_meta.py b/python/paddle/jit/sot/infer_meta.py index 81713bb259595b..ea2cbff8b1cc1b 100644 --- a/python/paddle/jit/sot/infer_meta.py +++ b/python/paddle/jit/sot/infer_meta.py @@ -42,8 +42,8 @@ def __init__( @staticmethod def from_tensor(tensor): # We always use float32 in simulation if AMP is enabled. - if isinstance(tensor, paddle.pir.OpResult): - name = "OpResult@NoName" + if isinstance(tensor, paddle.pir.Value): + name = "Value@NoName" persistable = tensor.persistable dtype = framework.paddle_type_to_proto_type[tensor.dtype] else: @@ -230,7 +230,7 @@ def convert_variable_to_meta_info(args): static_variable_type = ( paddle.static.Variable if not paddle.base.framework.use_pir_api() - else paddle.pir.OpResult + else paddle.pir.Value ) return map_if_extend( args, diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index ab62bcc689e6ff..b678c80344d301 100644 --- a/python/paddle/nn/functional/common.py +++ b/python/paddle/nn/functional/common.py @@ -412,7 +412,7 @@ def interpolate( 'The x and size should satisfy rank(x) - 2 == len(size).' 
) - if isinstance(size, (Variable, paddle.pir.OpResult)): + if isinstance(size, (Variable, paddle.pir.Value)): size = size.cast("int32") # static mode only support int32 if size.ndim != 1: raise ValueError( @@ -434,7 +434,7 @@ def interpolate( ) if resample == 'AREA': - if isinstance(size, (list, tuple, Variable, paddle.pir.OpResult)): + if isinstance(size, (list, tuple, Variable, paddle.pir.Value)): if len(size) == 0: raise ValueError("output size can not be empty") if size is None: @@ -494,7 +494,7 @@ def _is_list_or_turple_(data): raise ValueError("Only one of size or scale_factor should be defined.") if out_shape is not None: if ( - isinstance(out_shape, (Variable, paddle.pir.OpResult)) + isinstance(out_shape, (Variable, paddle.pir.Value)) and not in_dynamic_mode() ): out_shape.stop_gradient = True @@ -514,7 +514,7 @@ def _is_list_or_turple_(data): # Validate the shape contain_var = False for dim_idx, dim_size in enumerate(out_shape): - if isinstance(dim_size, (Variable, paddle.pir.OpResult)): + if isinstance(dim_size, (Variable, paddle.pir.Value)): contain_var = True continue assert ( @@ -525,7 +525,7 @@ def _is_list_or_turple_(data): new_size_tensor = [] size_list = [] for dim in out_shape: - if isinstance(dim, (Variable, paddle.pir.OpResult)): + if isinstance(dim, (Variable, paddle.pir.Value)): dim.stop_gradient = True new_size_tensor.append(dim) size_list.append(-1) @@ -591,7 +591,7 @@ def _is_list_or_turple_(data): scale = float(scale) else: scale = list(scale.numpy()) - if isinstance(scale, (Variable, paddle.pir.OpResult)): + if isinstance(scale, (Variable, paddle.pir.Value)): scale.stop_gradient = True inputs["Scale"] = scale elif isinstance(scale, (float, int, numpy.ndarray)): @@ -1111,7 +1111,7 @@ def dropout( [[0., 0., 6.], [0., 0., 0.]]) """ - if not isinstance(p, (float, int, Variable, pir.OpResult)): + if not isinstance(p, (float, int, Variable, pir.Value)): raise TypeError("p argument should be a number or Variable") if isinstance(p, (int, float)): @@ -1673,7 +1673,7 @@ def pad(x, pad, mode='constant', value=0.0, data_format="NCHW", name=None): return out if in_pir_mode(): - if isinstance(pad_value, paddle.pir.OpResult): + if isinstance(pad_value, paddle.pir.Value): return _C_ops.pad(x, paddings, pad_value) else: return _C_ops.pad(x, paddings, float(pad_value)) @@ -1728,7 +1728,7 @@ def pad(x, pad, mode='constant', value=0.0, data_format="NCHW", name=None): unsqueezed_dim = [] - if isinstance(pad, (Variable, pir.OpResult)): + if isinstance(pad, (Variable, pir.Value)): if data_format in ["NCL", "NCHW", "NCDHW"]: data_format = "NCDHW" if x_dim == 3: diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index df837215993541..762f684ee9ec30 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -1233,7 +1233,7 @@ def conv2d_transpose( output_size = convert_to_list(output_size, 2, 'output_size') elif isinstance(output_size, int): output_size = convert_to_list(output_size, 2, 'output_size') - elif isinstance(output_size, (Variable, pir.OpResult)): + elif isinstance(output_size, (Variable, pir.Value)): check_dtype( output_size.dtype, 'output_size', diff --git a/python/paddle/nn/initializer/constant.py b/python/paddle/nn/initializer/constant.py index 5391142d503c3b..d45c784c20b8cd 100644 --- a/python/paddle/nn/initializer/constant.py +++ b/python/paddle/nn/initializer/constant.py @@ -61,7 +61,7 @@ def forward(self, var, block=None): ( framework.Variable, framework.EagerParamBase, - paddle.pir.OpResult, + 
paddle.pir.Value, paddle.pir.core.ParameterMeta, ), ) diff --git a/python/paddle/nn/layer/layers.py b/python/paddle/nn/layer/layers.py index 4f9a1159c4eb4d..bf744180c02842 100644 --- a/python/paddle/nn/layer/layers.py +++ b/python/paddle/nn/layer/layers.py @@ -1678,7 +1678,7 @@ def _remove_if_exist(*dicts): _remove_if_exist(self.__dict__, self._buffers, self._sub_layers) params[name] = value elif ( - isinstance(value, paddle.pir.OpResult) + isinstance(value, paddle.pir.Value) and value.get_defining_op().name() == 'builtin.parameter' ): if params is None: @@ -1733,7 +1733,7 @@ def _remove_if_exist(*dicts): # decorated function, such as `self.buffer = new_tensor`. So we update its # value via `assign`. if type(value) == framework.Variable or isinstance( - value, paddle.pir.OpResult + value, paddle.pir.Value ): from paddle import assign diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py index a36492fe5e1721..485c59441853bc 100644 --- a/python/paddle/optimizer/adam.py +++ b/python/paddle/optimizer/adam.py @@ -18,7 +18,7 @@ import paddle from paddle import _C_ops, pir from paddle.base.libpaddle import DataType -from paddle.pir import OpResult +from paddle.pir import Value from ..base import core, framework from ..base.dygraph import base as imperative_base @@ -185,17 +185,17 @@ def __init__( assert beta1 is not None assert beta2 is not None assert epsilon is not None - if not isinstance(beta1, (Variable, OpResult)): + if not isinstance(beta1, (Variable, Value)): if not 0 <= beta1 < 1: raise ValueError( "Invaild value of beta1, expect beta1 in [0,1)." ) - if not isinstance(beta2, (Variable, OpResult)): + if not isinstance(beta2, (Variable, Value)): if not 0 <= beta2 < 1: raise ValueError( "Invaild value of beta2, expect beta2 in [0,1)." ) - if not isinstance(epsilon, (Variable, OpResult)): + if not isinstance(epsilon, (Variable, Value)): if not 0 <= epsilon: raise ValueError( "Invaild value of epsilon, expect epsilon >= 0." 
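All of these Python-side renames rest on one C++ change: the boolean flags moved off the defining op, where they were kept as an ArrayAttribute indexed by result position, onto the value itself. A condensed sketch of the new access pattern (simplified from pir_interpreter.cc and inplace_pass.cc above, not the verbatim implementation; kAttrIsPersisable is the same constant those files use):

    // Sketch only; kAttrIsPersisable comes from the same header the
    // call sites above use.
    #include "paddle/pir/core/builtin_attribute.h"
    #include "paddle/pir/core/value.h"

    bool IsPersistable(pir::Value value) {
      // Value::attribute<T>() returns a null attribute when the key is
      // absent, so absence reads as "not persistable" at these C++ call
      // sites. Note the asymmetry: the Python property getters above
      // default to true when the attribute is unset.
      auto attr = value.attribute<pir::BoolAttribute>(kAttrIsPersisable);
      return attr && attr.data();
    }

CreateVar in pir_adaptor_util.cc follows the same pattern, so the former GetOpResultBoolAttr-style lookups collapse into a single attribute read on the value.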
@@ -245,7 +245,7 @@ def _add_moments_pows(self, p): param=p, dtype=acc_dtype, fill_value=0.9 - if isinstance(self._beta1, (Variable, OpResult)) + if isinstance(self._beta1, (Variable, Value)) else self._beta1, shape=[1], type=core.VarDesc.VarType.LOD_TENSOR, @@ -256,7 +256,7 @@ def _add_moments_pows(self, p): param=p, dtype=acc_dtype, fill_value=0.999 - if isinstance(self._beta2, (Variable, OpResult)) + if isinstance(self._beta2, (Variable, Value)) else self._beta2, shape=[1], type=core.VarDesc.VarType.LOD_TENSOR, @@ -709,12 +709,12 @@ def _append_optimize_multi_tensor_op( found_inf = self._get_auxiliary_var('found_inf') if found_inf: if isinstance( - found_inf, (core.eager.Tensor, pir.OpResult) + found_inf, (core.eager.Tensor, pir.Value) ): self._set_auxiliary_var('found_inf', True) else: if isinstance( - found_inf, (core.eager.Tensor, pir.OpResult) + found_inf, (core.eager.Tensor, pir.Value) ): self._set_auxiliary_var('found_inf', False) _, _, _, _, _, _ = _C_ops.merged_adam_( From b3d4ce374f13340b30f6428f8f67a1b3e9789066 Mon Sep 17 00:00:00 2001 From: risemeup1 <62429225+risemeup1@users.noreply.github.com> Date: Mon, 11 Dec 2023 16:38:12 +0800 Subject: [PATCH 13/28] Incremental compilation optimization of phi (#59811) * mv phi/ops/.cc file to fluid * fix phi/kernels/compile bug * optimization cmake code * optimization compilation of phi * tmp for gen fpaintb.py * refine gen mxigemm py * refine gen mxigemm py * optimization compilation of phi * optimization compilation of phi * optimization compilation of phi * optimization compilation of phi --------- Co-authored-by: wwbitejotunn --- .gitignore | 4 +- cmake/operators.cmake | 50 +++++++++++++++++ cmake/phi.cmake | 49 ----------------- paddle/fluid/framework/operator.cc | 2 +- paddle/fluid/framework/phi_utils.cc | 2 +- .../fluid/operators/generator/CMakeLists.txt | 17 +++--- .../ops_signature}/activation_sig.cc | 0 .../operators/ops_signature}/adadelta_sig.cc | 0 .../operators/ops_signature}/adam_sig.cc | 0 .../ops_signature}/assign_pos_sig.cc | 0 .../operators/ops_signature}/assign_sig.cc | 0 .../ops_signature}/assign_value_sig.cc | 0 .../ops_signature}/batch_norm_sig.cc | 0 .../ops_signature}/c_embedding_sig.cc | 0 .../operators/ops_signature}/c_split_sig.cc | 0 .../operators/ops_signature}/cast_sig.cc | 0 .../ops_signature}/channel_shuffle_sig.cc | 0 .../ops_signature}/cudnn_lstm_sig.cc | 0 .../ops_signature}/decayed_adagrad_sig.cc | 0 .../ops_signature}/deformable_conv_sig.cc | 0 .../ops_signature}/dequantize_sig.cc | 0 .../ops_signature}/dgc_momentum_sig.cc | 0 .../distribute_fpn_proposals_sig.cc | 0 .../distributed_fused_lamb_init_sig.cc | 0 .../distributed_fused_lamb_sig.cc | 0 .../operators/ops_signature}/dropout_sig.cc | 0 .../ops_signature}/elementwise_sig.cc | 0 .../operators/ops_signature}/embedding_sig.cc | 0 .../operators/ops_signature}/expand_sig.cc | 0 .../operators/ops_signature}/feed_sig.cc | 0 .../fill_constant_batch_size_like_sig.cc | 0 .../ops_signature}/fill_constant_sig.cc | 0 .../operators/ops_signature}/flatten_sig.cc | 0 .../ops_signature}/fused_adam_sig.cc | 0 .../ops_signature}/fused_attention_sig.cc | 0 .../ops_signature}/fused_bn_activation_sig.cc | 0 .../fused_bn_add_activation_sig.cc | 0 .../ops_signature}/fused_conv_sig.cc | 0 .../ops_signature}/fused_elementwise_sig.cc | 0 .../ops_signature}/fused_feedforward_sig.cc | 0 .../ops_signature}/fused_matmul_sig.cc | 0 .../ops_signature}/fused_softmax_mask_sig.cc | 0 .../fused_softmax_mask_upper_triangle_sig.cc | 0 .../ops_signature}/fused_softplus_sig.cc | 0
.../ops_signature}/fused_transpose_sig.cc | 0 .../ops_signature}/fusion_group_sig.cc | 0 .../ops_signature}/gaussian_random_sig.cc | 0 .../graph_sample_neighbors_sig.cc | 0 .../hierarchical_sigmoid_sig.cc | 0 .../ops_signature}/identity_loss_sig.cc | 0 .../ops_signature}/lars_momentum_sig.cc | 0 .../ops_signature}/limit_by_capacity_sig.cc | 0 .../operators/ops_signature}/load_sig.cc | 0 .../operators/ops_signature}/logsumexp_sig.cc | 0 .../ops_signature}/matrix_rank_sig.cc | 0 .../operators/ops_signature}/memcpy_sig.cc | 0 .../operators/ops_signature}/mul_sig.cc | 0 .../ops_signature}/number_count_sig.cc | 0 .../operators/ops_signature}/p_send_sig.cc | 0 .../operators/ops_signature}/pad_sig.cc | 0 .../pow2_decay_with_linear_warmup_sig.cc | 0 .../prune_gate_by_capacity_sig.cc | 0 .../ops_signature}/quantize_linear_sig.cc | 0 .../operators/ops_signature}/randint_sig.cc | 0 .../ops_signature}/random_routing_sig.cc | 0 .../operators/ops_signature}/read_file_sig.cc | 0 .../operators/ops_signature}/reduce_sig.cc | 0 .../ops_signature}/repeat_interleave_sig.cc | 0 .../operators/ops_signature}/reshape_sig.cc | 0 .../operators/ops_signature}/reverse_sig.cc | 0 .../operators/ops_signature}/rrelu_sig.cc | 0 .../ops_signature}/save_combine_sig.cc | 0 .../operators/ops_signature}/save_sig.cc | 0 .../ops_signature}/sequence_mask_sig.cc | 0 .../ops_signature}/sequence_pool_sig.cc | 0 .../operators/ops_signature}/set_value_sig.cc | 0 .../ops_signature}/shuffle_batch_sig.cc | 0 .../operators/ops_signature}/slice_sig.cc | 0 .../ops_signature}/sparse_manual_op_sig.cc | 0 .../operators/ops_signature}/split_sig.cc | 0 .../operators/ops_signature}/stft_sig.cc | 0 .../ops_signature}/strided_slice_sig.cc | 0 .../operators/ops_signature}/sum_sig.cc | 0 .../ops_signature}/sync_batch_norm_sig.cc | 0 .../operators/ops_signature}/tile_sig.cc | 0 .../operators/ops_signature}/transpose_sig.cc | 0 .../ops_signature}/uniform_random_sig.cc | 0 .../operators/ops_signature}/unique_sig.cc | 0 paddle/phi/kernels/CMakeLists.txt | 55 +++++++++++++++++-- .../fusion/cutlass/conv2d/conv2d_bias_act.py | 2 +- .../cutlass/conv2d/conv2d_bias_residual.py | 2 +- .../conv2d/conv2d_depthwise_bias_act.py | 2 +- .../generic_mixed_gemm_kernelLauncher.py | 4 +- test/cpp/phi/core/test_op_utils.cc | 2 +- test/cpp/phi/ops/test_op_signature.cc | 2 +- 95 files changed, 122 insertions(+), 71 deletions(-) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/activation_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/adadelta_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/adam_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/assign_pos_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/assign_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/assign_value_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/batch_norm_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/c_embedding_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/c_split_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/cast_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/channel_shuffle_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/cudnn_lstm_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/decayed_adagrad_sig.cc (100%) rename 
paddle/{phi/ops/compat => fluid/operators/ops_signature}/deformable_conv_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/dequantize_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/dgc_momentum_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/distribute_fpn_proposals_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/distributed_fused_lamb_init_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/distributed_fused_lamb_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/dropout_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/elementwise_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/embedding_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/expand_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/feed_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/fill_constant_batch_size_like_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/fill_constant_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/flatten_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/fused_adam_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/fused_attention_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/fused_bn_activation_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/fused_bn_add_activation_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/fused_conv_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/fused_elementwise_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/fused_feedforward_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/fused_matmul_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/fused_softmax_mask_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/fused_softmax_mask_upper_triangle_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/fused_softplus_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/fused_transpose_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/fusion_group_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/gaussian_random_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/graph_sample_neighbors_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/hierarchical_sigmoid_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/identity_loss_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/lars_momentum_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/limit_by_capacity_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/load_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/logsumexp_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/matrix_rank_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/memcpy_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/mul_sig.cc (100%) rename 
paddle/{phi/ops/compat => fluid/operators/ops_signature}/number_count_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/p_send_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/pad_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/pow2_decay_with_linear_warmup_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/prune_gate_by_capacity_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/quantize_linear_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/randint_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/random_routing_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/read_file_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/reduce_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/repeat_interleave_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/reshape_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/reverse_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/rrelu_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/save_combine_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/save_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/sequence_mask_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/sequence_pool_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/set_value_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/shuffle_batch_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/slice_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/sparse_manual_op_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/split_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/stft_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/strided_slice_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/sum_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/sync_batch_norm_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/tile_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/transpose_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/uniform_random_sig.cc (100%) rename paddle/{phi/ops/compat => fluid/operators/ops_signature}/unique_sig.cc (100%) diff --git a/.gitignore b/.gitignore index 96274a125aad78..9118b08be9bad2 100644 --- a/.gitignore +++ b/.gitignore @@ -82,7 +82,7 @@ paddle/fluid/operators/generated_op*.cc paddle/fluid/operators/generated_sparse_op.cc paddle/fluid/operators/generated_static_op.cc paddle/fluid/operators/generated_fused_op.cc -paddle/phi/ops/compat/generated_*.cc +paddle/fluid/operators/ops_signature/generated_*.cc paddle/phi/api/yaml/parsed_apis/ paddle/fluid/operators/generator/parsed_ops/ paddle/fluid/pybind/tmp_eager_op_function_impl.h @@ -95,9 +95,11 @@ paddle/fluid/framework/__init__.py paddle/phi/api/profiler/__init__.py python/paddle/incubate/fleet/parameter_server/pslib/ps_pb2.py paddle/phi/kernels/fusion/cutlass/conv2d/generated/* +paddle/phi/kernels/fusion/cutlass/conv2d/generated_tmp/* 
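For readers tracking where the generated declarations end up after this move: register_op_utils (relocated into cmake/operators.cmake, shown below) globs every *_sig.cc under the new paddle/fluid/operators/ops_signature directory, and the two append helpers mirror the first registration found in each file into the generated signatures.h. A sketch of one input/output pair, with made-up op and kernel names:

    // Hypothetical paddle/fluid/operators/ops_signature/my_op_sig.cc:
    PD_REGISTER_ARG_MAPPING_FN(my_op, phi::MyOpArgumentMapping);
    PD_REGISTER_BASE_KERNEL_NAME(my_op, my_kernel);

    // What append_op_util_declare / append_op_kernel_map_declare would
    // emit into the generated signatures.h for that file:
    PD_DECLARE_ARG_MAPPING_FN(my_op);
    PD_DECLARE_BASE_KERNEL_NAME(my_op, my_kernel);

The header also moves with the sources: operator.cc now includes paddle/fluid/operators/ops_signature/signatures.h instead of paddle/phi/ops/compat/signatures.h, which keeps these fluid-side registrations out of the phi build and is what makes phi's incremental compilation cheaper.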
python/paddle/base/incubate/fleet/parameter_server/pslib/ps_pb2.py paddle/fluid/ir_adaptor/translator/op_compat_info.cc paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen/* +paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/* paddle/fluid/pybind/static_op_function.* paddle/fluid/pybind/ops_api.cc paddle/fluid/pir/dialect/operator/ir/pd_api.* diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 95273118c25057..d529e16bddc03a 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -734,3 +734,53 @@ function(prune_pybind_h) endif() endforeach() endfunction() + +function(append_op_util_declare TARGET) + file(READ ${TARGET} target_content) + string(REGEX MATCH "(PD_REGISTER_ARG_MAPPING_FN)\\([ \t\r\n]*[a-z0-9_]*" + util_registrar "${target_content}") + if(NOT ${util_registrar} EQUAL "") + string(REPLACE "PD_REGISTER_ARG_MAPPING_FN" "PD_DECLARE_ARG_MAPPING_FN" + util_declare "${util_registrar}") + string(APPEND util_declare ");\n") + file(APPEND ${op_utils_header} "${util_declare}") + endif() +endfunction() + +function(append_op_kernel_map_declare TARGET) + file(READ ${TARGET} target_content) + string( + REGEX + MATCH + "(PD_REGISTER_BASE_KERNEL_NAME)\\([ \t\r\n]*[a-z0-9_]*,[ \\\t\r\n]*[a-z0-9_]*" + kernel_mapping_registrar + "${target_content}") + if(NOT ${kernel_mapping_registrar} EQUAL "") + string(REPLACE "PD_REGISTER_BASE_KERNEL_NAME" "PD_DECLARE_BASE_KERNEL_NAME" + kernel_mapping_declare "${kernel_mapping_registrar}") + string(APPEND kernel_mapping_declare ");\n") + file(APPEND ${op_utils_header} "${kernel_mapping_declare}") + endif() +endfunction() + +function(register_op_utils TARGET_NAME) + set(utils_srcs) + set(options "") + set(oneValueArgs "") + set(multiValueArgs EXCLUDES DEPS) + cmake_parse_arguments(register_op_utils "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + + file(GLOB SIGNATURES + "${PADDLE_SOURCE_DIR}/paddle/fluid/operators/ops_signature/*_sig.cc") + foreach(target ${SIGNATURES}) + append_op_util_declare(${target}) + append_op_kernel_map_declare(${target}) + list(APPEND utils_srcs ${target}) + endforeach() + + cc_library( + ${TARGET_NAME} + SRCS ${utils_srcs} + DEPS ${register_op_utils_DEPS}) +endfunction() diff --git a/cmake/phi.cmake b/cmake/phi.cmake index ead66697ef68cb..bfb6a88eb62a73 100644 --- a/cmake/phi.cmake +++ b/cmake/phi.cmake @@ -172,55 +172,6 @@ function(kernel_declare TARGET_LIST) endforeach() endfunction() -function(append_op_util_declare TARGET) - file(READ ${TARGET} target_content) - string(REGEX MATCH "(PD_REGISTER_ARG_MAPPING_FN)\\([ \t\r\n]*[a-z0-9_]*" - util_registrar "${target_content}") - if(NOT ${util_registrar} EQUAL "") - string(REPLACE "PD_REGISTER_ARG_MAPPING_FN" "PD_DECLARE_ARG_MAPPING_FN" - util_declare "${util_registrar}") - string(APPEND util_declare ");\n") - file(APPEND ${op_utils_header} "${util_declare}") - endif() -endfunction() - -function(append_op_kernel_map_declare TARGET) - file(READ ${TARGET} target_content) - string( - REGEX - MATCH - "(PD_REGISTER_BASE_KERNEL_NAME)\\([ \t\r\n]*[a-z0-9_]*,[ \\\t\r\n]*[a-z0-9_]*" - kernel_mapping_registrar - "${target_content}") - if(NOT ${kernel_mapping_registrar} EQUAL "") - string(REPLACE "PD_REGISTER_BASE_KERNEL_NAME" "PD_DECLARE_BASE_KERNEL_NAME" - kernel_mapping_declare "${kernel_mapping_registrar}") - string(APPEND kernel_mapping_declare ");\n") - file(APPEND ${op_utils_header} "${kernel_mapping_declare}") - endif() -endfunction() - -function(register_op_utils TARGET_NAME) - set(utils_srcs) - set(options 
"") - set(oneValueArgs "") - set(multiValueArgs EXCLUDES DEPS) - cmake_parse_arguments(register_op_utils "${options}" "${oneValueArgs}" - "${multiValueArgs}" ${ARGN}) - - file(GLOB SIGNATURES "${PADDLE_SOURCE_DIR}/paddle/phi/ops/compat/*_sig.cc") - foreach(target ${SIGNATURES}) - append_op_util_declare(${target}) - append_op_kernel_map_declare(${target}) - list(APPEND utils_srcs ${target}) - endforeach() - - cc_library( - ${TARGET_NAME} - SRCS ${utils_srcs} - DEPS ${register_op_utils_DEPS}) -endfunction() - function(prune_declaration_h) set(kernel_list ${KERNEL_LIST}) file(STRINGS ${kernel_declare_file} kernel_registry_list) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 4ae5e0ebdf8720..d2440b9f64d4e5 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -30,6 +30,7 @@ limitations under the License. */ #include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/operators/isfinite_op.h" #include "paddle/fluid/operators/ops_extra_info.h" +#include "paddle/fluid/operators/ops_signature/signatures.h" #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler.h" @@ -41,7 +42,6 @@ limitations under the License. */ #include "paddle/phi/core/flags.h" #include "paddle/phi/core/kernel_context.h" #include "paddle/phi/core/kernel_factory.h" -#include "paddle/phi/ops/compat/signatures.h" #include "paddle/utils/flags.h" namespace phi { diff --git a/paddle/fluid/framework/phi_utils.cc b/paddle/fluid/framework/phi_utils.cc index cc5cf54724dabe..15727db9d0f5d2 100644 --- a/paddle/fluid/framework/phi_utils.cc +++ b/paddle/fluid/framework/phi_utils.cc @@ -157,7 +157,7 @@ KernelArgsNameMakerByOpProto::GetInputArgsNames() { continue; } // If contains dispensable input, we should override the - // OpArgumentMapping method self in phi/ops/compat dir + // OpArgumentMapping method self in fluid/operators/ops_signature dir if (in.has_dispensable() && in.dispensable()) { continue; } diff --git a/paddle/fluid/operators/generator/CMakeLists.txt b/paddle/fluid/operators/generator/CMakeLists.txt index a47a0a295be8f4..f13cac6eec80c6 100644 --- a/paddle/fluid/operators/generator/CMakeLists.txt +++ b/paddle/fluid/operators/generator/CMakeLists.txt @@ -1,5 +1,5 @@ # phi auto cmake utils -include(phi) +include(operators) # set yaml file path set(op_yaml_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/ops.yaml) @@ -115,13 +115,16 @@ set(generated_fused_op_path set(generated_sparse_ops_path ${CMAKE_SOURCE_DIR}/paddle/fluid/operators/generated_sparse_op.cc) set(generated_argument_mapping_path - ${CMAKE_SOURCE_DIR}/paddle/phi/ops/compat/generated_sig.cc) + ${CMAKE_SOURCE_DIR}/paddle/fluid/operators/ops_signature/generated_sig.cc) set(generated_fused_argument_mapping_path - ${CMAKE_SOURCE_DIR}/paddle/phi/ops/compat/generated_fused_sig.cc) + ${CMAKE_SOURCE_DIR}/paddle/fluid/operators/ops_signature/generated_fused_sig.cc +) set(generated_static_argument_mapping_path - ${CMAKE_SOURCE_DIR}/paddle/phi/ops/compat/generated_static_sig.cc) + ${CMAKE_SOURCE_DIR}/paddle/fluid/operators/ops_signature/generated_static_sig.cc +) set(generated_sparse_argument_mapping_path - ${CMAKE_SOURCE_DIR}/paddle/phi/ops/compat/generated_sparse_sig.cc) + ${CMAKE_SOURCE_DIR}/paddle/fluid/operators/ops_signature/generated_sparse_sig.cc +) execute_process( WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/paddle/fluid/operators/generator COMMAND ${CMAKE_COMMAND} -E make_directory ${parsed_op_dir} 
RESULTS_VARIABLE @@ -342,10 +345,10 @@ execute_process( message("generate ${ops_extra_info_file}") set(op_utils_header - ${PADDLE_BINARY_DIR}/paddle/phi/ops/compat/signatures.h.tmp + ${PADDLE_BINARY_DIR}/paddle/fluid/operators/ops_signature/signatures.h.tmp CACHE INTERNAL "op_args_fns.cc file") set(op_utils_header_final - ${PADDLE_BINARY_DIR}/paddle/phi/ops/compat/signatures.h) + ${PADDLE_BINARY_DIR}/paddle/fluid/operators/ops_signature/signatures.h) file( WRITE ${op_utils_header} "// Generated by the paddle/fluid/operators/generator/CMakeLists.txt. DO NOT EDIT!\n\n" diff --git a/paddle/phi/ops/compat/activation_sig.cc b/paddle/fluid/operators/ops_signature/activation_sig.cc similarity index 100% rename from paddle/phi/ops/compat/activation_sig.cc rename to paddle/fluid/operators/ops_signature/activation_sig.cc diff --git a/paddle/phi/ops/compat/adadelta_sig.cc b/paddle/fluid/operators/ops_signature/adadelta_sig.cc similarity index 100% rename from paddle/phi/ops/compat/adadelta_sig.cc rename to paddle/fluid/operators/ops_signature/adadelta_sig.cc diff --git a/paddle/phi/ops/compat/adam_sig.cc b/paddle/fluid/operators/ops_signature/adam_sig.cc similarity index 100% rename from paddle/phi/ops/compat/adam_sig.cc rename to paddle/fluid/operators/ops_signature/adam_sig.cc diff --git a/paddle/phi/ops/compat/assign_pos_sig.cc b/paddle/fluid/operators/ops_signature/assign_pos_sig.cc similarity index 100% rename from paddle/phi/ops/compat/assign_pos_sig.cc rename to paddle/fluid/operators/ops_signature/assign_pos_sig.cc diff --git a/paddle/phi/ops/compat/assign_sig.cc b/paddle/fluid/operators/ops_signature/assign_sig.cc similarity index 100% rename from paddle/phi/ops/compat/assign_sig.cc rename to paddle/fluid/operators/ops_signature/assign_sig.cc diff --git a/paddle/phi/ops/compat/assign_value_sig.cc b/paddle/fluid/operators/ops_signature/assign_value_sig.cc similarity index 100% rename from paddle/phi/ops/compat/assign_value_sig.cc rename to paddle/fluid/operators/ops_signature/assign_value_sig.cc diff --git a/paddle/phi/ops/compat/batch_norm_sig.cc b/paddle/fluid/operators/ops_signature/batch_norm_sig.cc similarity index 100% rename from paddle/phi/ops/compat/batch_norm_sig.cc rename to paddle/fluid/operators/ops_signature/batch_norm_sig.cc diff --git a/paddle/phi/ops/compat/c_embedding_sig.cc b/paddle/fluid/operators/ops_signature/c_embedding_sig.cc similarity index 100% rename from paddle/phi/ops/compat/c_embedding_sig.cc rename to paddle/fluid/operators/ops_signature/c_embedding_sig.cc diff --git a/paddle/phi/ops/compat/c_split_sig.cc b/paddle/fluid/operators/ops_signature/c_split_sig.cc similarity index 100% rename from paddle/phi/ops/compat/c_split_sig.cc rename to paddle/fluid/operators/ops_signature/c_split_sig.cc diff --git a/paddle/phi/ops/compat/cast_sig.cc b/paddle/fluid/operators/ops_signature/cast_sig.cc similarity index 100% rename from paddle/phi/ops/compat/cast_sig.cc rename to paddle/fluid/operators/ops_signature/cast_sig.cc diff --git a/paddle/phi/ops/compat/channel_shuffle_sig.cc b/paddle/fluid/operators/ops_signature/channel_shuffle_sig.cc similarity index 100% rename from paddle/phi/ops/compat/channel_shuffle_sig.cc rename to paddle/fluid/operators/ops_signature/channel_shuffle_sig.cc diff --git a/paddle/phi/ops/compat/cudnn_lstm_sig.cc b/paddle/fluid/operators/ops_signature/cudnn_lstm_sig.cc similarity index 100% rename from paddle/phi/ops/compat/cudnn_lstm_sig.cc rename to paddle/fluid/operators/ops_signature/cudnn_lstm_sig.cc diff --git 
a/paddle/phi/ops/compat/decayed_adagrad_sig.cc b/paddle/fluid/operators/ops_signature/decayed_adagrad_sig.cc similarity index 100% rename from paddle/phi/ops/compat/decayed_adagrad_sig.cc rename to paddle/fluid/operators/ops_signature/decayed_adagrad_sig.cc diff --git a/paddle/phi/ops/compat/deformable_conv_sig.cc b/paddle/fluid/operators/ops_signature/deformable_conv_sig.cc similarity index 100% rename from paddle/phi/ops/compat/deformable_conv_sig.cc rename to paddle/fluid/operators/ops_signature/deformable_conv_sig.cc diff --git a/paddle/phi/ops/compat/dequantize_sig.cc b/paddle/fluid/operators/ops_signature/dequantize_sig.cc similarity index 100% rename from paddle/phi/ops/compat/dequantize_sig.cc rename to paddle/fluid/operators/ops_signature/dequantize_sig.cc diff --git a/paddle/phi/ops/compat/dgc_momentum_sig.cc b/paddle/fluid/operators/ops_signature/dgc_momentum_sig.cc similarity index 100% rename from paddle/phi/ops/compat/dgc_momentum_sig.cc rename to paddle/fluid/operators/ops_signature/dgc_momentum_sig.cc diff --git a/paddle/phi/ops/compat/distribute_fpn_proposals_sig.cc b/paddle/fluid/operators/ops_signature/distribute_fpn_proposals_sig.cc similarity index 100% rename from paddle/phi/ops/compat/distribute_fpn_proposals_sig.cc rename to paddle/fluid/operators/ops_signature/distribute_fpn_proposals_sig.cc diff --git a/paddle/phi/ops/compat/distributed_fused_lamb_init_sig.cc b/paddle/fluid/operators/ops_signature/distributed_fused_lamb_init_sig.cc similarity index 100% rename from paddle/phi/ops/compat/distributed_fused_lamb_init_sig.cc rename to paddle/fluid/operators/ops_signature/distributed_fused_lamb_init_sig.cc diff --git a/paddle/phi/ops/compat/distributed_fused_lamb_sig.cc b/paddle/fluid/operators/ops_signature/distributed_fused_lamb_sig.cc similarity index 100% rename from paddle/phi/ops/compat/distributed_fused_lamb_sig.cc rename to paddle/fluid/operators/ops_signature/distributed_fused_lamb_sig.cc diff --git a/paddle/phi/ops/compat/dropout_sig.cc b/paddle/fluid/operators/ops_signature/dropout_sig.cc similarity index 100% rename from paddle/phi/ops/compat/dropout_sig.cc rename to paddle/fluid/operators/ops_signature/dropout_sig.cc diff --git a/paddle/phi/ops/compat/elementwise_sig.cc b/paddle/fluid/operators/ops_signature/elementwise_sig.cc similarity index 100% rename from paddle/phi/ops/compat/elementwise_sig.cc rename to paddle/fluid/operators/ops_signature/elementwise_sig.cc diff --git a/paddle/phi/ops/compat/embedding_sig.cc b/paddle/fluid/operators/ops_signature/embedding_sig.cc similarity index 100% rename from paddle/phi/ops/compat/embedding_sig.cc rename to paddle/fluid/operators/ops_signature/embedding_sig.cc diff --git a/paddle/phi/ops/compat/expand_sig.cc b/paddle/fluid/operators/ops_signature/expand_sig.cc similarity index 100% rename from paddle/phi/ops/compat/expand_sig.cc rename to paddle/fluid/operators/ops_signature/expand_sig.cc diff --git a/paddle/phi/ops/compat/feed_sig.cc b/paddle/fluid/operators/ops_signature/feed_sig.cc similarity index 100% rename from paddle/phi/ops/compat/feed_sig.cc rename to paddle/fluid/operators/ops_signature/feed_sig.cc diff --git a/paddle/phi/ops/compat/fill_constant_batch_size_like_sig.cc b/paddle/fluid/operators/ops_signature/fill_constant_batch_size_like_sig.cc similarity index 100% rename from paddle/phi/ops/compat/fill_constant_batch_size_like_sig.cc rename to paddle/fluid/operators/ops_signature/fill_constant_batch_size_like_sig.cc diff --git a/paddle/phi/ops/compat/fill_constant_sig.cc 
b/paddle/fluid/operators/ops_signature/fill_constant_sig.cc similarity index 100% rename from paddle/phi/ops/compat/fill_constant_sig.cc rename to paddle/fluid/operators/ops_signature/fill_constant_sig.cc diff --git a/paddle/phi/ops/compat/flatten_sig.cc b/paddle/fluid/operators/ops_signature/flatten_sig.cc similarity index 100% rename from paddle/phi/ops/compat/flatten_sig.cc rename to paddle/fluid/operators/ops_signature/flatten_sig.cc diff --git a/paddle/phi/ops/compat/fused_adam_sig.cc b/paddle/fluid/operators/ops_signature/fused_adam_sig.cc similarity index 100% rename from paddle/phi/ops/compat/fused_adam_sig.cc rename to paddle/fluid/operators/ops_signature/fused_adam_sig.cc diff --git a/paddle/phi/ops/compat/fused_attention_sig.cc b/paddle/fluid/operators/ops_signature/fused_attention_sig.cc similarity index 100% rename from paddle/phi/ops/compat/fused_attention_sig.cc rename to paddle/fluid/operators/ops_signature/fused_attention_sig.cc diff --git a/paddle/phi/ops/compat/fused_bn_activation_sig.cc b/paddle/fluid/operators/ops_signature/fused_bn_activation_sig.cc similarity index 100% rename from paddle/phi/ops/compat/fused_bn_activation_sig.cc rename to paddle/fluid/operators/ops_signature/fused_bn_activation_sig.cc diff --git a/paddle/phi/ops/compat/fused_bn_add_activation_sig.cc b/paddle/fluid/operators/ops_signature/fused_bn_add_activation_sig.cc similarity index 100% rename from paddle/phi/ops/compat/fused_bn_add_activation_sig.cc rename to paddle/fluid/operators/ops_signature/fused_bn_add_activation_sig.cc diff --git a/paddle/phi/ops/compat/fused_conv_sig.cc b/paddle/fluid/operators/ops_signature/fused_conv_sig.cc similarity index 100% rename from paddle/phi/ops/compat/fused_conv_sig.cc rename to paddle/fluid/operators/ops_signature/fused_conv_sig.cc diff --git a/paddle/phi/ops/compat/fused_elementwise_sig.cc b/paddle/fluid/operators/ops_signature/fused_elementwise_sig.cc similarity index 100% rename from paddle/phi/ops/compat/fused_elementwise_sig.cc rename to paddle/fluid/operators/ops_signature/fused_elementwise_sig.cc diff --git a/paddle/phi/ops/compat/fused_feedforward_sig.cc b/paddle/fluid/operators/ops_signature/fused_feedforward_sig.cc similarity index 100% rename from paddle/phi/ops/compat/fused_feedforward_sig.cc rename to paddle/fluid/operators/ops_signature/fused_feedforward_sig.cc diff --git a/paddle/phi/ops/compat/fused_matmul_sig.cc b/paddle/fluid/operators/ops_signature/fused_matmul_sig.cc similarity index 100% rename from paddle/phi/ops/compat/fused_matmul_sig.cc rename to paddle/fluid/operators/ops_signature/fused_matmul_sig.cc diff --git a/paddle/phi/ops/compat/fused_softmax_mask_sig.cc b/paddle/fluid/operators/ops_signature/fused_softmax_mask_sig.cc similarity index 100% rename from paddle/phi/ops/compat/fused_softmax_mask_sig.cc rename to paddle/fluid/operators/ops_signature/fused_softmax_mask_sig.cc diff --git a/paddle/phi/ops/compat/fused_softmax_mask_upper_triangle_sig.cc b/paddle/fluid/operators/ops_signature/fused_softmax_mask_upper_triangle_sig.cc similarity index 100% rename from paddle/phi/ops/compat/fused_softmax_mask_upper_triangle_sig.cc rename to paddle/fluid/operators/ops_signature/fused_softmax_mask_upper_triangle_sig.cc diff --git a/paddle/phi/ops/compat/fused_softplus_sig.cc b/paddle/fluid/operators/ops_signature/fused_softplus_sig.cc similarity index 100% rename from paddle/phi/ops/compat/fused_softplus_sig.cc rename to paddle/fluid/operators/ops_signature/fused_softplus_sig.cc diff --git a/paddle/phi/ops/compat/fused_transpose_sig.cc 
b/paddle/fluid/operators/ops_signature/fused_transpose_sig.cc similarity index 100% rename from paddle/phi/ops/compat/fused_transpose_sig.cc rename to paddle/fluid/operators/ops_signature/fused_transpose_sig.cc diff --git a/paddle/phi/ops/compat/fusion_group_sig.cc b/paddle/fluid/operators/ops_signature/fusion_group_sig.cc similarity index 100% rename from paddle/phi/ops/compat/fusion_group_sig.cc rename to paddle/fluid/operators/ops_signature/fusion_group_sig.cc diff --git a/paddle/phi/ops/compat/gaussian_random_sig.cc b/paddle/fluid/operators/ops_signature/gaussian_random_sig.cc similarity index 100% rename from paddle/phi/ops/compat/gaussian_random_sig.cc rename to paddle/fluid/operators/ops_signature/gaussian_random_sig.cc diff --git a/paddle/phi/ops/compat/graph_sample_neighbors_sig.cc b/paddle/fluid/operators/ops_signature/graph_sample_neighbors_sig.cc similarity index 100% rename from paddle/phi/ops/compat/graph_sample_neighbors_sig.cc rename to paddle/fluid/operators/ops_signature/graph_sample_neighbors_sig.cc diff --git a/paddle/phi/ops/compat/hierarchical_sigmoid_sig.cc b/paddle/fluid/operators/ops_signature/hierarchical_sigmoid_sig.cc similarity index 100% rename from paddle/phi/ops/compat/hierarchical_sigmoid_sig.cc rename to paddle/fluid/operators/ops_signature/hierarchical_sigmoid_sig.cc diff --git a/paddle/phi/ops/compat/identity_loss_sig.cc b/paddle/fluid/operators/ops_signature/identity_loss_sig.cc similarity index 100% rename from paddle/phi/ops/compat/identity_loss_sig.cc rename to paddle/fluid/operators/ops_signature/identity_loss_sig.cc diff --git a/paddle/phi/ops/compat/lars_momentum_sig.cc b/paddle/fluid/operators/ops_signature/lars_momentum_sig.cc similarity index 100% rename from paddle/phi/ops/compat/lars_momentum_sig.cc rename to paddle/fluid/operators/ops_signature/lars_momentum_sig.cc diff --git a/paddle/phi/ops/compat/limit_by_capacity_sig.cc b/paddle/fluid/operators/ops_signature/limit_by_capacity_sig.cc similarity index 100% rename from paddle/phi/ops/compat/limit_by_capacity_sig.cc rename to paddle/fluid/operators/ops_signature/limit_by_capacity_sig.cc diff --git a/paddle/phi/ops/compat/load_sig.cc b/paddle/fluid/operators/ops_signature/load_sig.cc similarity index 100% rename from paddle/phi/ops/compat/load_sig.cc rename to paddle/fluid/operators/ops_signature/load_sig.cc diff --git a/paddle/phi/ops/compat/logsumexp_sig.cc b/paddle/fluid/operators/ops_signature/logsumexp_sig.cc similarity index 100% rename from paddle/phi/ops/compat/logsumexp_sig.cc rename to paddle/fluid/operators/ops_signature/logsumexp_sig.cc diff --git a/paddle/phi/ops/compat/matrix_rank_sig.cc b/paddle/fluid/operators/ops_signature/matrix_rank_sig.cc similarity index 100% rename from paddle/phi/ops/compat/matrix_rank_sig.cc rename to paddle/fluid/operators/ops_signature/matrix_rank_sig.cc diff --git a/paddle/phi/ops/compat/memcpy_sig.cc b/paddle/fluid/operators/ops_signature/memcpy_sig.cc similarity index 100% rename from paddle/phi/ops/compat/memcpy_sig.cc rename to paddle/fluid/operators/ops_signature/memcpy_sig.cc diff --git a/paddle/phi/ops/compat/mul_sig.cc b/paddle/fluid/operators/ops_signature/mul_sig.cc similarity index 100% rename from paddle/phi/ops/compat/mul_sig.cc rename to paddle/fluid/operators/ops_signature/mul_sig.cc diff --git a/paddle/phi/ops/compat/number_count_sig.cc b/paddle/fluid/operators/ops_signature/number_count_sig.cc similarity index 100% rename from paddle/phi/ops/compat/number_count_sig.cc rename to 
paddle/fluid/operators/ops_signature/number_count_sig.cc diff --git a/paddle/phi/ops/compat/p_send_sig.cc b/paddle/fluid/operators/ops_signature/p_send_sig.cc similarity index 100% rename from paddle/phi/ops/compat/p_send_sig.cc rename to paddle/fluid/operators/ops_signature/p_send_sig.cc diff --git a/paddle/phi/ops/compat/pad_sig.cc b/paddle/fluid/operators/ops_signature/pad_sig.cc similarity index 100% rename from paddle/phi/ops/compat/pad_sig.cc rename to paddle/fluid/operators/ops_signature/pad_sig.cc diff --git a/paddle/phi/ops/compat/pow2_decay_with_linear_warmup_sig.cc b/paddle/fluid/operators/ops_signature/pow2_decay_with_linear_warmup_sig.cc similarity index 100% rename from paddle/phi/ops/compat/pow2_decay_with_linear_warmup_sig.cc rename to paddle/fluid/operators/ops_signature/pow2_decay_with_linear_warmup_sig.cc diff --git a/paddle/phi/ops/compat/prune_gate_by_capacity_sig.cc b/paddle/fluid/operators/ops_signature/prune_gate_by_capacity_sig.cc similarity index 100% rename from paddle/phi/ops/compat/prune_gate_by_capacity_sig.cc rename to paddle/fluid/operators/ops_signature/prune_gate_by_capacity_sig.cc diff --git a/paddle/phi/ops/compat/quantize_linear_sig.cc b/paddle/fluid/operators/ops_signature/quantize_linear_sig.cc similarity index 100% rename from paddle/phi/ops/compat/quantize_linear_sig.cc rename to paddle/fluid/operators/ops_signature/quantize_linear_sig.cc diff --git a/paddle/phi/ops/compat/randint_sig.cc b/paddle/fluid/operators/ops_signature/randint_sig.cc similarity index 100% rename from paddle/phi/ops/compat/randint_sig.cc rename to paddle/fluid/operators/ops_signature/randint_sig.cc diff --git a/paddle/phi/ops/compat/random_routing_sig.cc b/paddle/fluid/operators/ops_signature/random_routing_sig.cc similarity index 100% rename from paddle/phi/ops/compat/random_routing_sig.cc rename to paddle/fluid/operators/ops_signature/random_routing_sig.cc diff --git a/paddle/phi/ops/compat/read_file_sig.cc b/paddle/fluid/operators/ops_signature/read_file_sig.cc similarity index 100% rename from paddle/phi/ops/compat/read_file_sig.cc rename to paddle/fluid/operators/ops_signature/read_file_sig.cc diff --git a/paddle/phi/ops/compat/reduce_sig.cc b/paddle/fluid/operators/ops_signature/reduce_sig.cc similarity index 100% rename from paddle/phi/ops/compat/reduce_sig.cc rename to paddle/fluid/operators/ops_signature/reduce_sig.cc diff --git a/paddle/phi/ops/compat/repeat_interleave_sig.cc b/paddle/fluid/operators/ops_signature/repeat_interleave_sig.cc similarity index 100% rename from paddle/phi/ops/compat/repeat_interleave_sig.cc rename to paddle/fluid/operators/ops_signature/repeat_interleave_sig.cc diff --git a/paddle/phi/ops/compat/reshape_sig.cc b/paddle/fluid/operators/ops_signature/reshape_sig.cc similarity index 100% rename from paddle/phi/ops/compat/reshape_sig.cc rename to paddle/fluid/operators/ops_signature/reshape_sig.cc diff --git a/paddle/phi/ops/compat/reverse_sig.cc b/paddle/fluid/operators/ops_signature/reverse_sig.cc similarity index 100% rename from paddle/phi/ops/compat/reverse_sig.cc rename to paddle/fluid/operators/ops_signature/reverse_sig.cc diff --git a/paddle/phi/ops/compat/rrelu_sig.cc b/paddle/fluid/operators/ops_signature/rrelu_sig.cc similarity index 100% rename from paddle/phi/ops/compat/rrelu_sig.cc rename to paddle/fluid/operators/ops_signature/rrelu_sig.cc diff --git a/paddle/phi/ops/compat/save_combine_sig.cc b/paddle/fluid/operators/ops_signature/save_combine_sig.cc similarity index 100% rename from paddle/phi/ops/compat/save_combine_sig.cc 
rename to paddle/fluid/operators/ops_signature/save_combine_sig.cc diff --git a/paddle/phi/ops/compat/save_sig.cc b/paddle/fluid/operators/ops_signature/save_sig.cc similarity index 100% rename from paddle/phi/ops/compat/save_sig.cc rename to paddle/fluid/operators/ops_signature/save_sig.cc diff --git a/paddle/phi/ops/compat/sequence_mask_sig.cc b/paddle/fluid/operators/ops_signature/sequence_mask_sig.cc similarity index 100% rename from paddle/phi/ops/compat/sequence_mask_sig.cc rename to paddle/fluid/operators/ops_signature/sequence_mask_sig.cc diff --git a/paddle/phi/ops/compat/sequence_pool_sig.cc b/paddle/fluid/operators/ops_signature/sequence_pool_sig.cc similarity index 100% rename from paddle/phi/ops/compat/sequence_pool_sig.cc rename to paddle/fluid/operators/ops_signature/sequence_pool_sig.cc diff --git a/paddle/phi/ops/compat/set_value_sig.cc b/paddle/fluid/operators/ops_signature/set_value_sig.cc similarity index 100% rename from paddle/phi/ops/compat/set_value_sig.cc rename to paddle/fluid/operators/ops_signature/set_value_sig.cc diff --git a/paddle/phi/ops/compat/shuffle_batch_sig.cc b/paddle/fluid/operators/ops_signature/shuffle_batch_sig.cc similarity index 100% rename from paddle/phi/ops/compat/shuffle_batch_sig.cc rename to paddle/fluid/operators/ops_signature/shuffle_batch_sig.cc diff --git a/paddle/phi/ops/compat/slice_sig.cc b/paddle/fluid/operators/ops_signature/slice_sig.cc similarity index 100% rename from paddle/phi/ops/compat/slice_sig.cc rename to paddle/fluid/operators/ops_signature/slice_sig.cc diff --git a/paddle/phi/ops/compat/sparse_manual_op_sig.cc b/paddle/fluid/operators/ops_signature/sparse_manual_op_sig.cc similarity index 100% rename from paddle/phi/ops/compat/sparse_manual_op_sig.cc rename to paddle/fluid/operators/ops_signature/sparse_manual_op_sig.cc diff --git a/paddle/phi/ops/compat/split_sig.cc b/paddle/fluid/operators/ops_signature/split_sig.cc similarity index 100% rename from paddle/phi/ops/compat/split_sig.cc rename to paddle/fluid/operators/ops_signature/split_sig.cc diff --git a/paddle/phi/ops/compat/stft_sig.cc b/paddle/fluid/operators/ops_signature/stft_sig.cc similarity index 100% rename from paddle/phi/ops/compat/stft_sig.cc rename to paddle/fluid/operators/ops_signature/stft_sig.cc diff --git a/paddle/phi/ops/compat/strided_slice_sig.cc b/paddle/fluid/operators/ops_signature/strided_slice_sig.cc similarity index 100% rename from paddle/phi/ops/compat/strided_slice_sig.cc rename to paddle/fluid/operators/ops_signature/strided_slice_sig.cc diff --git a/paddle/phi/ops/compat/sum_sig.cc b/paddle/fluid/operators/ops_signature/sum_sig.cc similarity index 100% rename from paddle/phi/ops/compat/sum_sig.cc rename to paddle/fluid/operators/ops_signature/sum_sig.cc diff --git a/paddle/phi/ops/compat/sync_batch_norm_sig.cc b/paddle/fluid/operators/ops_signature/sync_batch_norm_sig.cc similarity index 100% rename from paddle/phi/ops/compat/sync_batch_norm_sig.cc rename to paddle/fluid/operators/ops_signature/sync_batch_norm_sig.cc diff --git a/paddle/phi/ops/compat/tile_sig.cc b/paddle/fluid/operators/ops_signature/tile_sig.cc similarity index 100% rename from paddle/phi/ops/compat/tile_sig.cc rename to paddle/fluid/operators/ops_signature/tile_sig.cc diff --git a/paddle/phi/ops/compat/transpose_sig.cc b/paddle/fluid/operators/ops_signature/transpose_sig.cc similarity index 100% rename from paddle/phi/ops/compat/transpose_sig.cc rename to paddle/fluid/operators/ops_signature/transpose_sig.cc diff --git a/paddle/phi/ops/compat/uniform_random_sig.cc 
b/paddle/fluid/operators/ops_signature/uniform_random_sig.cc similarity index 100% rename from paddle/phi/ops/compat/uniform_random_sig.cc rename to paddle/fluid/operators/ops_signature/uniform_random_sig.cc diff --git a/paddle/phi/ops/compat/unique_sig.cc b/paddle/fluid/operators/ops_signature/unique_sig.cc similarity index 100% rename from paddle/phi/ops/compat/unique_sig.cc rename to paddle/fluid/operators/ops_signature/unique_sig.cc diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 82bc55a19fdf95..78c14352dc53a5 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -56,11 +56,33 @@ endif() if(WITH_CUTLASS) execute_process( COMMAND ${CMAKE_COMMAND} -E make_directory - "${CMAKE_CURRENT_SOURCE_DIR}/fusion/cutlass/conv2d/generated" + "${CMAKE_CURRENT_SOURCE_DIR}/fusion/cutlass/conv2d/generated_tmp" COMMAND ${PYTHON_EXECUTABLE} "conv2d_bias_act.py" COMMAND ${PYTHON_EXECUTABLE} "conv2d_bias_residual.py" COMMAND ${PYTHON_EXECUTABLE} "conv2d_depthwise_bias_act.py" WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/fusion/cutlass/conv2d") + set(generated_tmp_dir + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/conv2d/generated_tmp + ) + set(generated_dir + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/conv2d/generated) + file(GLOB conv2d_generated_files ${generated_tmp_dir}/*.cu) + + if(EXISTS ${generated_dir}) + foreach(gen_file ${conv2d_generated_files}) + string(REPLACE "generated_tmp" "generated" now_file ${gen_file}) + execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different + "${gen_file}" "${now_file}") + endforeach() + message("copy if different ${generated_dir}") + else() + foreach(gen_file ${conv2d_generated_files}) + string(REPLACE "generated_tmp" "generated" now_file ${gen_file}) + execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${gen_file}" + "${now_file}") + endforeach() + message("copy ${generated_dir}") + endif() execute_process( COMMAND @@ -145,17 +167,39 @@ if(WITH_CUTLASS) ) execute_process( - COMMAND - ${CMAKE_COMMAND} -E remove_directory - "${CMAKE_CURRENT_SOURCE_DIR}/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen" COMMAND ${CMAKE_COMMAND} -E make_directory - "${CMAKE_CURRENT_SOURCE_DIR}/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen" + "${CMAKE_CURRENT_SOURCE_DIR}/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp" COMMAND ${PYTHON_EXECUTABLE} generic_mixed_gemm_kernelLauncher.py --cuda_arch "${NVCC_ARCH_BIN}" WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/fusion/cutlass/cutlass_kernels/fpA_intB_gemm" ) + set(fpA_intB_gemm_autogen_tmp_dir + ${CMAKE_CURRENT_SOURCE_DIR}/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp + ) + set(fpA_intB_gemm_autogen_dir + ${CMAKE_CURRENT_SOURCE_DIR}/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen + ) + + file(GLOB fpA_intB_gemm_autogen_files ${fpA_intB_gemm_autogen_tmp_dir}/*.h + ${fpA_intB_gemm_autogen_tmp_dir}/*.cu) + + if(EXISTS ${fpA_intB_gemm_autogen_dir}) + foreach(gen_file ${fpA_intB_gemm_autogen_files}) + string(REPLACE "autogen_tmp" "autogen" now_file ${gen_file}) + execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different + "${gen_file}" "${now_file}") + endforeach() + message("copy if different ${fpA_intB_gemm_autogen_dir}") + else() + foreach(gen_file ${fpA_intB_gemm_autogen_files}) + string(REPLACE "autogen_tmp" "autogen" now_file ${gen_file}) + execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${gen_file}" + "${now_file}") + endforeach() + message("copy ${fpA_intB_gemm_autogen_dir}") + endif() file(
GLOB cutlass_cu @@ -167,6 +211,7 @@ if(WITH_CUTLASS) "fusion/cutlass/memory_efficient_attention/autogen_variable/impl/*.cu" "fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen/*.cu" "fusion/cutlass/cutlass_kernels/fpA_intB_gemm/*.cu") + list(APPEND kernel_cu ${cutlass_cu}) endif() diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_act.py b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_act.py index ea6550fd430b1f..6870d191a8026a 100644 --- a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_act.py +++ b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_act.py @@ -225,6 +225,6 @@ def generate_sm75_1688(): sm_versions, SupportedAct, UnderScoreName, CamelName ) all_code += CommonTail - with open("generated/conv2d_bias_act.cu", "w") as f: + with open("generated_tmp/conv2d_bias_act.cu", "w") as f: f.write(all_code) f.close() diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_residual.py b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_residual.py index 018a456bf64b28..109dac2ad65e82 100644 --- a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_residual.py +++ b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_residual.py @@ -210,6 +210,6 @@ def generate_sm75_1688(): sm_versions, SupportedEpilogue, UnderScoreName, CamelName ) all_code += CommonTail - with open("generated/conv2d_bias_residual.cu", "w") as f: + with open("generated_tmp/conv2d_bias_residual.cu", "w") as f: f.write(all_code) f.close() diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_depthwise_bias_act.py b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_depthwise_bias_act.py index 0dabcd366bb452..cfeb60dbc154d2 100644 --- a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_depthwise_bias_act.py +++ b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_depthwise_bias_act.py @@ -225,6 +225,6 @@ def generate_conv2d_depthwise(): all_code = cdba_header all_code += generate_conv2d_depthwise() all_code += CommonTail - with open("generated/conv2d_depthwise_bias_act.cu", "w") as f: + with open("generated_tmp/conv2d_depthwise_bias_act.cu", "w") as f: f.write(all_code) f.close() diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/generic_mixed_gemm_kernelLauncher.py b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/generic_mixed_gemm_kernelLauncher.py index 3b523fd60e1291..ad7f1e65591ce9 100644 --- a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/generic_mixed_gemm_kernelLauncher.py +++ b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/generic_mixed_gemm_kernelLauncher.py @@ -204,7 +204,7 @@ def generate_source_cu( args = parse_args() archs = args.cuda_arch header_all = DefineHeader - header_name = "autogen/arch_define.h" + header_name = "autogen_tmp/arch_define.h" if archs: for arch in archs: define_line = "#define USE_FPAINTB_GEMM_WITH_SM%s\n" % str(arch) @@ -217,7 +217,7 @@ def generate_source_cu( for arch in archs: for epilogue_tag in EpilogueTags.keys(): for stages in StagesList[arch]: - file_name = "autogen/generic_mixed_gemm_kernelLauncher_{}_sm{}_stages{}_{}.cu".format( + file_name = "autogen_tmp/generic_mixed_gemm_kernelLauncher_{}_sm{}_stages{}_{}.cu".format( element_type, arch, stages, epilogue_tag ) all_code = generate_source_cu( diff --git a/test/cpp/phi/core/test_op_utils.cc b/test/cpp/phi/core/test_op_utils.cc index 4b69800971fb35..85e9ad48691ebe 100644 --- a/test/cpp/phi/core/test_op_utils.cc +++ b/test/cpp/phi/core/test_op_utils.cc @@ -15,8 +15,8 @@ limitations under the License. 
*/ #include #include "gtest/gtest.h" +#include "paddle/fluid/operators/ops_signature/signatures.h" #include "paddle/phi/core/compat/op_utils.h" -#include "paddle/phi/ops/compat/signatures.h" namespace phi { namespace tests { diff --git a/test/cpp/phi/ops/test_op_signature.cc b/test/cpp/phi/ops/test_op_signature.cc index 191705d6ae7c60..0bd01967cb4cc4 100644 --- a/test/cpp/phi/ops/test_op_signature.cc +++ b/test/cpp/phi/ops/test_op_signature.cc @@ -19,8 +19,8 @@ limitations under the License. */ #include #include +#include "paddle/fluid/operators/ops_signature/signatures.h" #include "paddle/phi/core/compat/op_utils.h" -#include "paddle/phi/ops/compat/signatures.h" namespace phi { namespace tests { From b2d107a39ee8e2bbe9c37a04c6745e45bdff2ab1 Mon Sep 17 00:00:00 2001 From: xuxinyi389 <104957571+xuxinyi389@users.noreply.github.com> Date: Mon, 11 Dec 2023 16:54:19 +0800 Subject: [PATCH 14/28] disable_unittest_cuda12_win (#59866) * disable_unittest_cuda12_win * disable_unittest_cuda12_win --- tools/windows/run_unittests.sh | 51 ++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/tools/windows/run_unittests.sh b/tools/windows/run_unittests.sh index 527a46ad017134..03e802750b8df5 100644 --- a/tools/windows/run_unittests.sh +++ b/tools/windows/run_unittests.sh @@ -242,6 +242,57 @@ disable_wingpu_cuda12_test="^test_cholesky_op$|\ ^test_add_reader_dependency$|\ ^test_conv2d_fusion_op$|\ ^test_fused_conv2d_add_act_op$|\ +^test_analyzer_detect_functional_mkldnn$|\ +^test_audio_datasets$|\ +^test_signal$|\ +^test_stft_op$|\ +^test_trt_convert_flatten_contiguous_range$|\ +^test_trt_convert_gather$|\ +^test_trt_convert_index_select$|\ +^test_trt_convert_lookup_table$|\ +^test_trt_convert_prelu$|\ +^test_trt_convert_bilinear_interp_v2$|\ +^test_trt_convert_leaky_relu$|\ +^test_reverse_roll_fuse_pass$|\ +^test_trt_convert_einsum$|\ +^test_trt_convert_roi_align$|\ +^test_trt_convert_temporal_shift$|\ +^test_trt_convert_mish$|\ +^test_trt_convert_pad3d$|\ +^test_trt_convert_yolo_box$|\ +^test_merge_layernorm_fuse_pass$|\ +^test_trt_convert_instance_norm$|\ +^test_skip_merge_layernorm_fuse_pass$|\ +^test_trt_float64$|\ +^test_trt_convert_arg_max$|\ +^test_trt_convert_arg_min$|\ +^test_trt_convert_assign$|\ +^test_trt_convert_cast$|\ +^test_trt_convert_compare_and_logical$|\ +^test_trt_convert_concat$|\ +^test_preln_layernorm_x_fuse_pass$|\ +^test_trt_convert_argsort$|\ +^test_trt_remove_amp_strategy_op_pass$|\ +^test_trt_convert_bitwise_and$|\ +^test_trt_convert_bitwise_or$|\ +^test_trt_convert_scatter$|\ +^test_trt_convert_solve$|\ +^test_quant_linear_fuse_pass$|\ +^test_trt_explicit_quantization$|\ +^test_trt_nearest_interp_v2_op$|\ +^test_trt_pool3d_op$|\ +^test_trt_convert_anchor_generator$|\ +^test_trt_convert_softmax$|\ +^test_trt_convert_strided_slice$|\ +^test_layernorm_shift_partition_pass$|\ +^test_trt_convert_multihead_matmul$|\ +^test_trt_convert_reshape$|\ +^test_trt_convert_split$|\ +^test_trt_convert_squeeze2$|\ +^test_trt_convert_sum$|\ +^test_trt_convert_transpose$|\ +^test_trt_convert_unsqueeze2$|\ +^test_simplify_with_basic_ops_pass_autoscan$|\ ^disable_wingpu_cuda12_test$" # /*=================Fixed Disabled Windows TRT MKL unittests=======================*/ From d57954a0041f50055157705022d9e780ab95a181 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=91=A8=E5=91=A8=E5=91=A8?= <39978853+zhoutianzi666@users.noreply.github.com> Date: Mon, 11 Dec 2023 18:56:25 +0800 Subject: [PATCH 15/28] =?UTF-8?q?[Paddle=20Inference]=20support=20MMHA?= 
=?UTF-8?q?'s=20sequence=5Flengths=20is=200=20(#59882)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [Paddle Inference] support MMHA's sequence_lengths is 0 (#59882) --- .../fusion/gpu/masked_multihead_attention_kernel.cu | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu b/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu index e9b2b8eb0cbe6f..449ce9a9e69318 100644 --- a/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu @@ -92,7 +92,9 @@ __global__ void masked_multihead_attention_kernel( StoreFunc store_func) { #if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) const int bi = blockIdx.y; - if (params.sequence_lengths && params.sequence_lengths[bi] == 0) { + // params.sequence_lengths[bi] is the number of k and v entries already + // cached in cache_kv. + if (params.sequence_lengths && params.sequence_lengths[bi] < 0) { return; } @@ -711,6 +713,10 @@ void fmha_impl(const phi::GPUContext &dev_ctx, fmha_launch_kernel( params, dev_ctx.stream(), load_func, store_func); break; + case 16: + fmha_launch_kernel( + params, dev_ctx.stream(), load_func, store_func); + break; case 26: fmha_launch_kernel( params, dev_ctx.stream(), load_func, store_func); From 257f5d37b7a4798f880b4559163ed837b8ca1fe2 Mon Sep 17 00:00:00 2001 From: cyberslack_lee Date: Mon, 11 Dec 2023 19:20:45 +0800 Subject: [PATCH 16/28] =?UTF-8?q?=E3=80=90PIR=20API=20adaptor=20No.257?= =?UTF-8?q?=E3=80=81265=E3=80=91Migrate=20gcd=EF=BC=8Clcm=20into=20pir=20?= =?UTF-8?q?=20(#59600)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/legacy_test/test_gcd.py | 34 ++++++++++++++++++---------------- test/legacy_test/test_lcm.py | 35 ++++++++++++++++++----------------- 2 files changed, 36 insertions(+), 33 deletions(-) diff --git a/test/legacy_test/test_gcd.py b/test/legacy_test/test_gcd.py index a7ec34eca42c7c..175714649952ee 100644 --- a/test/legacy_test/test_gcd.py +++ b/test/legacy_test/test_gcd.py @@ -17,10 +17,8 @@ import numpy as np import paddle -from paddle import base from paddle.base import core - -paddle.enable_static() +from paddle.pir_utils import test_with_pir_api class TestGcdAPI(unittest.TestCase): @@ -30,10 +28,15 @@ def setUp(self): self.x_shape = [1] self.y_shape = [1] + @test_with_pir_api def test_static_graph(self): - startup_program = base.Program() - train_program = base.Program() - with base.program_guard(startup_program, train_program): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + else: + place = core.CPUPlace() + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): x = paddle.static.data( name='input1', dtype='int32', shape=self.x_shape ) @@ -41,21 +44,15 @@ def test_static_graph(self): name='input2', dtype='int32', shape=self.y_shape ) out = paddle.gcd(x, y) + out_ref = np.gcd(self.x_np, self.y_np) - place = ( - base.CUDAPlace(0) - if core.is_compiled_with_cuda() - else base.CPUPlace() - ) - exe = base.Executor(place) + exe = paddle.static.Executor(place) res = exe.run( - base.default_main_program(), + paddle.static.default_main_program(), feed={'input1': self.x_np, 'input2': self.y_np}, fetch_list=[out], ) - self.assertTrue( - (np.array(res[0]) == np.gcd(self.x_np, self.y_np)).all() - ) + self.assertTrue((res[0] == out_ref).all()) def test_dygraph(self): paddle.disable_static() @@
-99,3 +96,8 @@ def setUp(self): self.y_np = -20 self.x_shape = [] self.y_shape = [] + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() diff --git a/test/legacy_test/test_lcm.py b/test/legacy_test/test_lcm.py index 706c2dc23c32e9..647ae2b749e384 100644 --- a/test/legacy_test/test_lcm.py +++ b/test/legacy_test/test_lcm.py @@ -17,10 +17,8 @@ import numpy as np import paddle -from paddle import base from paddle.base import core - -paddle.enable_static() +from paddle.pir_utils import test_with_pir_api class TestLcmAPI(unittest.TestCase): @@ -30,10 +28,15 @@ def setUp(self): self.x_shape = [] self.y_shape = [] + @test_with_pir_api def test_static_graph(self): - startup_program = base.Program() - train_program = base.Program() - with base.program_guard(startup_program, train_program): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + else: + place = core.CPUPlace() + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): x1 = paddle.static.data( name='input1', dtype='int32', shape=self.x_shape ) @@ -41,21 +44,14 @@ def test_static_graph(self): name='input2', dtype='int32', shape=self.y_shape ) out = paddle.lcm(x1, x2) - - place = ( - base.CUDAPlace(0) - if core.is_compiled_with_cuda() - else base.CPUPlace() - ) - exe = base.Executor(place) + out_ref = np.lcm(self.x_np, self.y_np) + exe = paddle.static.Executor(place) res = exe.run( - base.default_main_program(), + paddle.static.default_main_program(), feed={'input1': self.x_np, 'input2': self.y_np}, fetch_list=[out], ) - self.assertTrue( - (np.array(res[0]) == np.lcm(self.x_np, self.y_np)).all() - ) + self.assertTrue((res[0] == out_ref).all()) def test_dygraph(self): paddle.disable_static() @@ -99,3 +95,8 @@ def setUp(self): self.y_np = -20 self.x_shape = [] self.y_shape = [] + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() From 93fda0acb16e8c5c570831aba7c58e8a4f460112 Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Mon, 11 Dec 2023 19:38:07 +0800 Subject: [PATCH 17/28] [PIR] delete python use_mkldnn part2 (#59554) * delete python use_mkldnn part2 --- python/paddle/base/backward.py | 4 ++-- python/paddle/base/dygraph_utils.py | 7 +------ python/paddle/base/layer_helper.py | 10 +--------- .../base/layers/layer_function_generator.py | 2 -- .../fleet/meta_optimizers/sharding_optimizer.py | 2 -- .../passes/auto_parallel_gradient_merge.py | 2 -- .../transpiler/distribute_transpiler.py | 4 ++-- python/paddle/incubate/asp/asp.py | 1 - .../fleet/parameter_server/ir/pserver_pass.py | 2 +- .../paddle/incubate/optimizer/gradient_merge.py | 4 ++-- python/paddle/nn/functional/conv.py | 17 ++++------------- python/paddle/nn/functional/norm.py | 1 - python/paddle/nn/functional/pooling.py | 7 ------- python/paddle/nn/initializer/normal.py | 1 - python/paddle/nn/layer/layers.py | 8 +------- python/paddle/nn/layer/norm.py | 6 +----- python/paddle/nn/utils/dygraph_utils.py | 7 ++----- python/paddle/static/nn/common.py | 11 ++--------- .../paddle/tensor/layer_function_generator.py | 2 -- python/paddle/tensor/linalg.py | 8 ++++---- python/paddle/tensor/math.py | 5 ++--- python/paddle/tensor/random.py | 1 - 22 files changed, 25 insertions(+), 87 deletions(-) diff --git a/python/paddle/base/backward.py b/python/paddle/base/backward.py index 890c22c4d52341..143e6713737d98 100755 --- a/python/paddle/base/backward.py +++ b/python/paddle/base/backward.py @@ -478,7 +478,7 @@ def _accumulate_gradients_by_sum_op_( "sum", {"X": renamed_vars[var_name]}, {"Out": [var_name]}, - 
{"use_mkldnn": False, "op_device": op_device}, + {"op_device": op_device}, ) ) renamed_vars[var_name] = [var_name] @@ -505,7 +505,7 @@ def _accumulate_gradients_by_add_ops_( "grad_add", {"X": [x_name], "Y": [y_name]}, {"Out": [out_name]}, - {"use_mkldnn": False, "op_device": op_device}, + {"op_device": op_device}, ) ) renamed_vars[var_name] = [var_name] diff --git a/python/paddle/base/dygraph_utils.py b/python/paddle/base/dygraph_utils.py index 926c4680017ce9..e3d83336889b71 100644 --- a/python/paddle/base/dygraph_utils.py +++ b/python/paddle/base/dygraph_utils.py @@ -18,15 +18,12 @@ @dygraph_only -def _append_activation_in_dygraph( - input, act=None, use_cudnn=None, use_mkldnn=None -): +def _append_activation_in_dygraph(input, act=None, use_cudnn=None): """Append activation in dygraph mode. Args: input: the input variable. act: activation type - use_mkldnn: if use mkldnn use_cudnn: if use cudnn Return the Variable after append activation @@ -37,8 +34,6 @@ def _append_activation_in_dygraph( attrs = () if use_cudnn: attrs = ('use_cudnn', use_cudnn) - if use_mkldnn: - attrs += ('use_mkldnn', use_mkldnn) act_op = getattr(_legacy_C_ops, act) return act_op(input, *attrs) diff --git a/python/paddle/base/layer_helper.py b/python/paddle/base/layer_helper.py index 5511ae9cc49082..8f4b068d4e8978 100644 --- a/python/paddle/base/layer_helper.py +++ b/python/paddle/base/layer_helper.py @@ -20,7 +20,6 @@ from .dygraph_utils import _append_activation_in_dygraph from .framework import ( Parameter, - _global_flags, dtype_is_floating, in_dygraph_mode, ) @@ -156,16 +155,9 @@ def append_activation(self, input_var): if 'use_cudnn' in self.kwargs and self.kwargs.get('use_cudnn'): use_cudnn = self.kwargs.get('use_cudnn') act['use_cudnn'] = use_cudnn - use_mkldnn = self.kwargs.get( - 'use_mkldnn', _global_flags().get("FLAGS_use_mkldnn", False) - ) - if use_mkldnn: - act['use_mkldnn'] = use_mkldnn act_type = act.pop('type') if in_dygraph_mode(): - res = _append_activation_in_dygraph( - input_var, act_type, use_cudnn, use_mkldnn - ) + res = _append_activation_in_dygraph(input_var, act_type, use_cudnn) return res else: tmp = self.create_variable_for_type_inference(dtype=input_var.dtype) diff --git a/python/paddle/base/layers/layer_function_generator.py b/python/paddle/base/layers/layer_function_generator.py index 2404ef286b1f2e..009cb2ae49a6b7 100644 --- a/python/paddle/base/layers/layer_function_generator.py +++ b/python/paddle/base/layers/layer_function_generator.py @@ -92,8 +92,6 @@ def _generate_doc_string_( buf.write('\n') skip_attrs = OpProtoHolder.generated_op_attr_names() - # attr use_mkldnn and is_test also should not be visible to users. 
- skip_attrs.add("use_mkldnn") skip_attrs.add("is_test") skip_attrs.add("use_cudnn") diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py index 6934fb6957eadb..61b0a214eab154 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py @@ -1815,7 +1815,6 @@ def create_persistable_gradients_and_insert_merge_ops( outputs={'Out': gradient_merge_var}, attrs={ 'axis': -1, - 'use_mkldnn': False, OP_ROLE_KEY: OpRole.Backward, }, ) @@ -1884,7 +1883,6 @@ def _create_gm_cond(self, main_block): attrs={ 'axis': -1, OP_ROLE_KEY: OpRole.Optimize, - 'use_mkldnn': False, }, ) diff --git a/python/paddle/distributed/passes/auto_parallel_gradient_merge.py b/python/paddle/distributed/passes/auto_parallel_gradient_merge.py index 33f87ea1587837..90a381bcbc4a0f 100644 --- a/python/paddle/distributed/passes/auto_parallel_gradient_merge.py +++ b/python/paddle/distributed/passes/auto_parallel_gradient_merge.py @@ -128,7 +128,6 @@ def _get_gm_cond_var(main_program, k_steps, dist_context): outputs={'Out': step_var}, attrs={ 'axis': -1, - 'use_mkldnn': False, OP_ROLE_KEY: OpRole.Backward, }, ) @@ -235,7 +234,6 @@ def _append_gradient_merge_backward_op( outputs={'Out': gradient_merge_var}, attrs={ 'axis': -1, - 'use_mkldnn': False, OP_ROLE_KEY: OpRole.Backward, }, ) diff --git a/python/paddle/distributed/transpiler/distribute_transpiler.py b/python/paddle/distributed/transpiler/distribute_transpiler.py index 3d86d6dd9afcef..0ad1df56d2b51c 100644 --- a/python/paddle/distributed/transpiler/distribute_transpiler.py +++ b/python/paddle/distributed/transpiler/distribute_transpiler.py @@ -2198,7 +2198,7 @@ def _create_table_optimize_block( type="sum", inputs={"X": pserver_side_table_grad_list}, outputs={"Out": [grad_var]}, - attrs={"use_mkldnn": False}, + attrs={}, ) else: # in async_mode, for table gradient, it also need to be split to each parameter server @@ -2490,7 +2490,7 @@ def _append_pserver_grad_merge_ops( type="sum", inputs={"X": vars2merge}, outputs={"Out": merged_var}, - attrs={"use_mkldnn": False}, + attrs={}, ) optimize_block.append_op( type="scale", diff --git a/python/paddle/incubate/asp/asp.py b/python/paddle/incubate/asp/asp.py index 97f9376b303827..a1de1937c70cd5 100644 --- a/python/paddle/incubate/asp/asp.py +++ b/python/paddle/incubate/asp/asp.py @@ -910,7 +910,6 @@ def _insert_sparse_mask_ops(cls, main_program, params): outputs={'Out': param}, attrs={ 'axis': -1, - 'use_mkldnn': False, OP_ROLE_KEY: int(OpRole.Optimize), }, ) diff --git a/python/paddle/incubate/distributed/fleet/parameter_server/ir/pserver_pass.py b/python/paddle/incubate/distributed/fleet/parameter_server/ir/pserver_pass.py index 7b94a848e671f0..32c4b3398b4b25 100644 --- a/python/paddle/incubate/distributed/fleet/parameter_server/ir/pserver_pass.py +++ b/python/paddle/incubate/distributed/fleet/parameter_server/ir/pserver_pass.py @@ -445,7 +445,7 @@ def _append_pserver_grad_merge_ops( type="sum", inputs={"X": vars2merge}, outputs={"Out": merged_var}, - attrs={"use_mkldnn": False}, + attrs={}, ) optimize_block.append_op( type="scale", diff --git a/python/paddle/incubate/optimizer/gradient_merge.py b/python/paddle/incubate/optimizer/gradient_merge.py index 3cd17992ef5e8d..6d617a9d08007e 100644 --- a/python/paddle/incubate/optimizer/gradient_merge.py +++ b/python/paddle/incubate/optimizer/gradient_merge.py @@ -233,7 +233,7 @@ def 
_get_gm_cond_var(self, main_block): type='elementwise_mod', inputs={'X': step_var, 'Y': k_step_var}, outputs={'Out': step_var}, - attrs={'axis': -1, 'use_mkldnn': False}, + attrs={'axis': -1}, ) # cond_var = (step_var == 0) @@ -302,7 +302,7 @@ def apply_gradients(self, params_grads): type="elementwise_add", inputs={'X': grad, 'Y': gradient_merge_var}, outputs={'Out': gradient_merge_var}, - attrs={'axis': -1, 'use_mkldnn': False}, + attrs={'axis': -1}, ) self._add_gm_op_role_var( new_grad_op, param, gradient_merge_var, cond diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index 762f684ee9ec30..220caf6e31063c 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -120,7 +120,6 @@ def _conv_nd( channel_dim=1, op_type="conv2d", use_cudnn=True, - use_mkldnn=False, name=None, ): # Due to the poor performance of NHWC, we transpose the input to NCHW. @@ -203,8 +202,6 @@ def _conv_nd( groups, 'use_cudnn', use_cudnn, - 'use_mkldnn', - use_mkldnn, 'fuse_relu_before_depthwise_conv', False, "padding_algorithm", @@ -225,7 +222,6 @@ def _conv_nd( 'dilations': dilation, 'groups': groups, 'use_cudnn': use_cudnn, - 'use_mkldnn': use_mkldnn, 'fuse_relu_before_depthwise_conv': False, "padding_algorithm": padding_algorithm, "data_format": data_format, @@ -249,7 +245,7 @@ def _conv_nd( type='elementwise_add', inputs={'X': [pre_bias], 'Y': [bias]}, outputs={'Out': [out]}, - attrs={'axis': -1, 'use_mkldnn': use_mkldnn}, + attrs={'axis': -1}, ) else: assert len(x_shape) > len( @@ -264,7 +260,7 @@ def _conv_nd( type='elementwise_add', inputs={'X': [pre_bias], 'Y': [bias]}, outputs={'Out': [out]}, - attrs={'axis': -1, 'use_mkldnn': use_mkldnn}, + attrs={'axis': -1}, ) else: out = pre_bias @@ -496,7 +492,6 @@ def conv1d( 'dilations': dilation, 'groups': groups, 'use_cudnn': use_cudnn, - 'use_mkldnn': False, 'fuse_relu_before_depthwise_conv': False, "padding_algorithm": padding_algorithm, "data_format": conv2d_data_format, @@ -733,8 +728,6 @@ def conv2d( else: return pre_bias - use_mkldnn = _global_flags()["FLAGS_use_mkldnn"] - if ( is_compiled_with_cuda() and get_flags("FLAGS_conv2d_disable_cudnn")[ @@ -756,7 +749,6 @@ def conv2d( channel_dim, l_type, use_cudnn, - use_mkldnn, name, ) @@ -1322,7 +1314,7 @@ def conv2d_transpose( type='elementwise_add', inputs={'X': [pre_bias], 'Y': [bias]}, outputs={'Out': [out]}, - attrs={'axis': -1, 'use_mkldnn': False}, + attrs={'axis': -1}, ) else: assert len(x_shape) > len( @@ -1336,7 +1328,7 @@ def conv2d_transpose( type='elementwise_add', inputs={'X': [pre_bias], 'Y': [bias]}, outputs={'Out': [out]}, - attrs={'axis': -1, 'use_mkldnn': False}, + attrs={'axis': -1}, ) else: out = pre_bias @@ -1517,7 +1509,6 @@ def conv3d( channel_dim, op_type, use_cudnn, - False, name, ) diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py index 8c9754d11dfa57..a8f2ff96c2841a 100644 --- a/python/paddle/nn/functional/norm.py +++ b/python/paddle/nn/functional/norm.py @@ -239,7 +239,6 @@ def batch_norm( "epsilon": epsilon, "is_test": not training, "data_layout": data_format, - "use_mkldnn": False, "fuse_with_relu": False, "use_global_stats": use_global_stats, "trainable_statistics": trainable_statistics, diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py index e43daa332aea3a..398c99949623f0 100755 --- a/python/paddle/nn/functional/pooling.py +++ b/python/paddle/nn/functional/pooling.py @@ -288,7 +288,6 @@ def avg_pool1d( "padding_algorithm": 
padding_algorithm, "use_cudnn": True, "ceil_mode": ceil_mode, - "use_mkldnn": False, "exclusive": exclusive, "data_format": data_format, }, @@ -415,7 +414,6 @@ def avg_pool2d( "padding_algorithm": padding_algorithm, "use_cudnn": True, "ceil_mode": ceil_mode, - "use_mkldnn": False, "exclusive": exclusive, "data_format": data_format, }, @@ -542,7 +540,6 @@ def avg_pool3d( "padding_algorithm": padding_algorithm, "use_cudnn": True, "ceil_mode": ceil_mode, - "use_mkldnn": False, "exclusive": exclusive, "data_format": data_format, }, @@ -677,7 +674,6 @@ def max_pool1d( "padding_algorithm": padding_algorithm, "use_cudnn": True, "ceil_mode": ceil_mode, - "use_mkldnn": False, "exclusive": True, "data_format": data_format, }, @@ -1303,7 +1299,6 @@ def max_pool2d( "padding_algorithm": padding_algorithm, "use_cudnn": True, "ceil_mode": ceil_mode, - "use_mkldnn": False, "exclusive": True, "data_format": data_format, }, @@ -1326,7 +1321,6 @@ def max_pool2d( "padding_algorithm": padding_algorithm, "use_cudnn": True, "ceil_mode": ceil_mode, - "use_mkldnn": False, "exclusive": True, "data_format": data_format, }, @@ -1468,7 +1462,6 @@ def max_pool3d( "padding_algorithm": padding_algorithm, "use_cudnn": True, "ceil_mode": ceil_mode, - "use_mkldnn": False, "exclusive": False, "data_format": data_format, }, diff --git a/python/paddle/nn/initializer/normal.py b/python/paddle/nn/initializer/normal.py index 3a05bbed121f36..3983f270e60a69 100644 --- a/python/paddle/nn/initializer/normal.py +++ b/python/paddle/nn/initializer/normal.py @@ -103,7 +103,6 @@ def forward(self, var, block=None): "mean": self._mean, "std": self._std_dev, "seed": self._seed, - "use_mkldnn": False, }, stop_gradient=True, ) diff --git a/python/paddle/nn/layer/layers.py b/python/paddle/nn/layer/layers.py index bf744180c02842..a1d932dcc2055b 100644 --- a/python/paddle/nn/layer/layers.py +++ b/python/paddle/nn/layer/layers.py @@ -38,7 +38,6 @@ Parameter, Program, _current_expected_place as _get_device, - _global_flags, convert_np_dtype_to_dtype_, default_main_program, in_dygraph_mode, @@ -266,14 +265,9 @@ def append_activation(self, input_var, act=None, use_cudnn=None): if (use_cudnn is not None) and use_cudnn: act['use_cudnn'] = use_cudnn - use_mkldnn = _global_flags()["FLAGS_use_mkldnn"] - if (use_mkldnn is not None) and use_mkldnn: - act['use_mkldnn'] = use_mkldnn act_type = act.pop('type') if in_dygraph_mode(): - res = _append_activation_in_dygraph( - input_var, act_type, use_cudnn, use_mkldnn - ) + res = _append_activation_in_dygraph(input_var, act_type, use_cudnn) return res else: tmp = self.create_variable_for_type_inference(dtype=input_var.dtype) diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index f7d32bb61908b4..f1cdedb7a8ddf8 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -970,7 +970,6 @@ def __init__( self._param_attr = param_attr self._bias_attr = bias_attr self._act = act - self._use_mkldnn = _global_flags()["FLAGS_use_mkldnn"] if dtype == "float16": self._dtype = "float32" @@ -1073,9 +1072,8 @@ def forward(self, input): ) if self._act is None: return batch_norm_out - return dygraph_utils._append_activation_in_dygraph( - batch_norm_out, act=self._act, use_mkldnn=self._use_mkldnn + batch_norm_out, act=self._act ) elif in_pir_mode(): batch_norm_out, t1, t2, t3, t4, _ = _C_ops.batch_norm_( @@ -1111,7 +1109,6 @@ def forward(self, input): "epsilon": self._epsilon, "is_test": self._is_test, "data_layout": self._data_layout, - "use_mkldnn": False, "fuse_with_relu": 
self._fuse_with_relu, "use_global_stats": self._use_global_stats, "trainable_statistics": self._trainable_statistics, @@ -1668,7 +1665,6 @@ def forward(self, x): "epsilon": self._epsilon, "is_test": not self.training, "data_layout": self._data_format, - "use_mkldnn": False, "fuse_with_relu": False, "use_global_stats": False, "trainable_statistics": False, diff --git a/python/paddle/nn/utils/dygraph_utils.py b/python/paddle/nn/utils/dygraph_utils.py index f572859497eb0b..f199cdc4126dfe 100644 --- a/python/paddle/nn/utils/dygraph_utils.py +++ b/python/paddle/nn/utils/dygraph_utils.py @@ -17,20 +17,17 @@ @dygraph_only -def _append_bias_in_dygraph(input, bias=None, axis=1, use_mkldnn=False): +def _append_bias_in_dygraph(input, bias=None, axis=1): """Append bias operation in dygraph mode. Args: input: the input variable. bias: the bias to be appended axis: the axis to perform operation - use_mkldnn: whether to use mkldnn Return the Variable after bias operation """ if bias is None: return input - return _legacy_C_ops.elementwise_add( - input, bias, 'axis', axis, 'use_mkldnn', use_mkldnn - ) + return _legacy_C_ops.elementwise_add(input, bias, 'axis', axis) diff --git a/python/paddle/static/nn/common.py b/python/paddle/static/nn/common.py index 41f1b82d7e419f..576d56e86cdb33 100644 --- a/python/paddle/static/nn/common.py +++ b/python/paddle/static/nn/common.py @@ -227,7 +227,7 @@ def fc_base( type="sum", inputs={"X": mul_results}, outputs={"Out": pre_bias}, - attrs={"use_mkldnn": False}, + attrs={}, ) # add bias pre_activation = helper.append_bias_op( @@ -1055,7 +1055,6 @@ def _get_default_param_initializer(): 'dilations': dilation, 'groups': groups, 'use_cudnn': use_cudnn, - 'use_mkldnn': False, 'fuse_relu_before_depthwise_conv': False, "padding_algorithm": padding_algorithm, "data_format": data_format, @@ -1351,7 +1350,6 @@ def _get_default_param_initializer(): 'dilations': dilation, 'groups': groups, 'use_cudnn': use_cudnn, - 'use_mkldnn': False, "padding_algorithm": padding_algorithm, "data_format": data_format, }, @@ -2832,8 +2830,6 @@ def batch_norm( is_test, 'data_layout', data_layout, - 'use_mkldnn', - False, 'fuse_with_relu', False, 'use_global_stats', @@ -2847,8 +2843,6 @@ def batch_norm( is_test, 'data_layout', data_layout, - 'use_mkldnn', - False, 'fuse_with_relu', False, 'use_global_stats', @@ -2880,7 +2874,7 @@ def batch_norm( ) return paddle.base.dygraph_utils._append_activation_in_dygraph( - batch_norm_out, act=act, use_mkldnn=False + batch_norm_out, act=act ) saved_mean = helper.create_variable_for_type_inference( @@ -2912,7 +2906,6 @@ def batch_norm( "epsilon": epsilon, "is_test": is_test, "data_layout": data_layout, - "use_mkldnn": False, "fuse_with_relu": False, "use_global_stats": use_global_stats, } diff --git a/python/paddle/tensor/layer_function_generator.py b/python/paddle/tensor/layer_function_generator.py index 30574b93baf48a..d5a875794fe7d3 100644 --- a/python/paddle/tensor/layer_function_generator.py +++ b/python/paddle/tensor/layer_function_generator.py @@ -97,8 +97,6 @@ def _generate_doc_string_( buf.write('\n') skip_attrs = OpProtoHolder.generated_op_attr_names() - # attr use_mkldnn and is_test also should not be visible to users. 
- skip_attrs.add("use_mkldnn") skip_attrs.add("is_test") skip_attrs.add("use_cudnn") diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 8625c5ae1ecdef..7c39955b84c6c7 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -1043,7 +1043,7 @@ def svd_norm(input, porder, axis=[-1]): type='elementwise_div', inputs={'X': max_out, 'Y': min_out}, outputs={'Out': out}, - attrs={'axis': axis, 'use_mkldnn': False}, + attrs={'axis': axis}, ) return out if porder == -2: @@ -1051,7 +1051,7 @@ type='elementwise_div', inputs={'X': min_out, 'Y': max_out}, outputs={'Out': out}, - attrs={'axis': axis, 'use_mkldnn': False}, + attrs={'axis': axis}, ) return out @@ -3019,7 +3019,7 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None): type='elementwise_mul', inputs={'X': v, 'Y': st}, outputs={'Out': out_1}, - attrs={'axis': -1, 'use_mkldnn': False}, + attrs={'axis': -1}, ) out_1 = helper.append_activation(out_1) @@ -3089,7 +3089,7 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None): type='elementwise_mul', inputs={'X': u, 'Y': st}, outputs={'Out': out_1}, - attrs={'axis': -1, 'use_mkldnn': False}, + attrs={'axis': -1}, ) out_1 = helper.append_activation(out_1) diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 63ed9f1248d54c..688b08c597ce01 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -621,7 +621,6 @@ def _elementwise_op(helper): ) axis = helper.kwargs.get('axis', -1) - use_mkldnn = helper.kwargs.get('use_mkldnn', False) name = helper.kwargs.get('name', None) if out is None: @@ -636,7 +635,7 @@ def _elementwise_op(helper): type=op_type, inputs={'X': x, 'Y': y}, outputs={'Out': out}, - attrs={'axis': axis, 'use_mkldnn': use_mkldnn}, + attrs={'axis': axis}, ) return helper.append_activation(out) @@ -2029,7 +2028,7 @@ def add_n(inputs, name=None): type='sum', inputs={'X': inputs}, outputs={'Out': out}, - attrs={'use_mkldnn': False}, + attrs={}, ) return out diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py index c3ae5079ea67d0..b08723d5ceb547 100644 --- a/python/paddle/tensor/random.py +++ b/python/paddle/tensor/random.py @@ -381,7 +381,6 @@ def gaussian(shape, mean=0.0, std=1.0, seed=0, dtype=None, name=None): 'std': std, 'seed': seed, 'dtype': dtype, - 'use_mkldnn': False, } paddle.utils.get_shape_tensor_inputs( inputs=inputs, attrs=attrs, shape=shape, op_type=op_type_for_check From 50afa0bbc16e8c5c570831aba7c58e8a4f460112 Mon Sep 17 00:00:00 2001 From: Nyakku Shigure Date: Mon, 11 Dec 2023 21:55:21 +0800 Subject: [PATCH 18/28] [Dy2St] Cleanup duplicate dy2st `test_simnet_v2` and `test_ptb_lm_v2` (#59865) --- .../dygraph_to_static/simnet_dygraph_model.py | 12 +- .../simnet_dygraph_model_v2.py | 509 ------------------ test/dygraph_to_static/test_ptb_lm_v2.py | 349 ------------ test/dygraph_to_static/test_simnet.py | 5 +- test/dygraph_to_static/test_simnet_v2.py | 194 ------- tools/parallel_UT_rule.py | 4 - 6 files changed, 7 insertions(+), 1066 deletions(-) delete mode 100644 test/dygraph_to_static/simnet_dygraph_model_v2.py delete mode 100644 test/dygraph_to_static/test_ptb_lm_v2.py delete mode 100644 test/dygraph_to_static/test_simnet_v2.py diff --git a/test/dygraph_to_static/simnet_dygraph_model.py b/test/dygraph_to_static/simnet_dygraph_model.py index 39565ddf87989b..86d3071f616e50 100644 --- a/test/dygraph_to_static/simnet_dygraph_model.py +++ b/test/dygraph_to_static/simnet_dygraph_model.py @@ -15,8 +15,6 @@ from
functools import reduce import paddle -import paddle.base.param_attr as attr -from paddle.nn import Layer class EmbeddingLayer: @@ -44,7 +42,7 @@ def ops(self): self.emb_dim, sparse=True, padding_idx=self.padding_idx, - weight_attr=attr.ParamAttr( + weight_attr=paddle.ParamAttr( name=self.name, initializer=paddle.nn.initializer.XavierUniform(), ), @@ -72,8 +70,8 @@ def ops(self): """ fc = FC( size=self.fc_dim, - param_attr=attr.ParamAttr(name="%s.w" % self.name), - bias_attr=attr.ParamAttr(name="%s.b" % self.name), + param_attr=paddle.ParamAttr(name="%s.w" % self.name), + bias_attr=paddle.ParamAttr(name="%s.b" % self.name), act=self.act, ) return fc @@ -234,7 +232,7 @@ def ops(self, input): return softsign -class FC(Layer): +class FC(paddle.nn.Layer): r""" This interface is used to construct a callable object of the ``FC`` class. For more details, refer to code examples. @@ -461,7 +459,7 @@ def compute(self, pos, neg): return loss -class BOW(Layer): +class BOW(paddle.nn.Layer): """ BOW """ diff --git a/test/dygraph_to_static/simnet_dygraph_model_v2.py b/test/dygraph_to_static/simnet_dygraph_model_v2.py deleted file mode 100644 index e843f6b82e820f..00000000000000 --- a/test/dygraph_to_static/simnet_dygraph_model_v2.py +++ /dev/null @@ -1,509 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from functools import reduce - -import paddle - - -class EmbeddingLayer: - """ - Embedding Layer class - """ - - def __init__(self, dict_size, emb_dim, name="emb", padding_idx=None): - """ - initialize - """ - self.dict_size = dict_size - self.emb_dim = emb_dim - self.name = name - self.padding_idx = padding_idx - - def ops(self): - """ - operation - """ - # TODO(huihuangzheng): The original code set is_sparse=True, but it - # causes a crash in dy2stat. Set it to True after fixing it.
- emb = paddle.nn.Embedding( - self.dict_size, - self.emb_dim, - sparse=True, - padding_idx=self.padding_idx, - weight_attr=paddle.ParamAttr( - name=self.name, - initializer=paddle.nn.initializer.XavierUniform(), - ), - ) - - return emb - - -class FCLayer: - """ - Fully Connect Layer class - """ - - def __init__(self, fc_dim, act, name="fc"): - """ - initialize - """ - self.fc_dim = fc_dim - self.act = act - self.name = name - - def ops(self): - """ - operation - """ - fc = FC( - size=self.fc_dim, - param_attr=paddle.ParamAttr(name="%s.w" % self.name), - bias_attr=paddle.ParamAttr(name="%s.b" % self.name), - act=self.act, - ) - return fc - - -class ConcatLayer: - """ - Connection Layer class - """ - - def __init__(self, axis): - """ - initialize - """ - self.axis = axis - - def ops(self, inputs): - """ - operation - """ - concat = paddle.concat(x=inputs, axis=self.axis) - return concat - - -class ReduceMeanLayer: - """ - Reduce Mean Layer class - """ - - def __init__(self): - """ - initialize - """ - pass - - def ops(self, input): - """ - operation - """ - mean = paddle.mean(input) - return mean - - -class CosSimLayer: - """ - Cos Similarity Calculate Layer - """ - - def __init__(self): - """ - initialize - """ - pass - - def ops(self, x, y): - """ - operation - """ - sim = paddle.nn.functional.cosine_similarity(x, y) - return sim - - -class ElementwiseMaxLayer: - """ - Elementwise Max Layer class - """ - - def __init__(self): - """ - initialize - """ - pass - - def ops(self, x, y): - """ - operation - """ - max = paddle.maximum(x=x, y=y) - return max - - -class ElementwiseAddLayer: - """ - Elementwise Add Layer class - """ - - def __init__(self): - """ - initialize - """ - pass - - def ops(self, x, y): - """ - operation - """ - add = paddle.add(x=x, y=y) - return add - - -class ElementwiseSubLayer: - """ - Elementwise Sub Layer class - """ - - def __init__(self): - """ - initialize - """ - pass - - def ops(self, x, y): - """ - operation - """ - sub = paddle.subtract(x, y) - return sub - - -class ConstantLayer: - """ - Generate A Constant Layer class - """ - - def __init__(self): - """ - initialize - """ - pass - - def ops(self, input, shape, dtype, value): - """ - operation - """ - shape = list(shape) - input_shape = paddle.shape(input) - shape[0] = input_shape[0] - constant = paddle.tensor.fill_constant(shape, dtype, value) - return constant - - -class SoftsignLayer: - """ - Softsign Layer class - """ - - def __init__(self): - """ - initialize - """ - pass - - def ops(self, input): - """ - operation - """ - softsign = paddle.nn.functional.softsign(input) - return softsign - - -class FC(paddle.nn.Layer): - r""" - This interface is used to construct a callable object of the ``FC`` class. - For more details, refer to code examples. - It creates a fully connected layer in the network. It can take - one or multiple ``Tensor`` as its inputs. It creates a Variable called weights for each input tensor, - which represents a fully connected weight matrix from each input unit to - each output unit. The fully connected layer multiplies each input tensor - with its corresponding weight to produce an output Tensor with shape [N, `size`], - where N is batch size. If multiple input tensors are given, the results of - multiple output tensors with shape [N, `size`] will be summed up. If ``bias_attr`` - is not None, a bias variable will be created and added to the output. - Finally, if ``act`` is not None, it will be applied to the output as well. - When the input is single ``Tensor`` : - ..
math:: - Out = Act({XW + b}) - When the input are multiple ``Tensor`` : - .. math:: - Out = Act({\sum_{i=0}^{N-1}X_iW_i + b}) - In the above equation: - * :math:`N`: Number of the input. N equals to len(input) if input is list of ``Tensor`` . - * :math:`X_i`: The i-th input ``Tensor`` . - * :math:`W_i`: The i-th weights matrix corresponding i-th input tensor. - * :math:`b`: The bias parameter created by this layer (if needed). - * :math:`Act`: The activation function. - * :math:`Out`: The output ``Tensor`` . - See below for an example. - .. code-block:: text - Given: - data_1.data = [[[0.1, 0.2]]] - data_1.shape = (1, 1, 2) # 1 is batch_size - data_2.data = [[[0.1, 0.2, 0.3]]] - data_2.shape = (1, 1, 3) # 1 is batch_size - fc = FC("fc", 2, num_flatten_dims=2) - out = fc(input=[data_1, data_2]) - Then: - out.data = [[[0.182996 -0.474117]]] - out.shape = (1, 1, 2) - Parameters: - - size(int): The number of output units in this layer. - num_flatten_dims (int, optional): The fc layer can accept an input tensor with more than - two dimensions. If this happens, the multi-dimension tensor will first be flattened - into a 2-dimensional matrix. The parameter `num_flatten_dims` determines how the input - tensor is flattened: the first `num_flatten_dims` (inclusive, index starts from 1) - dimensions will be flatten to form the first dimension of the final matrix (height of - the matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to - form the second dimension of the final matrix (width of the matrix). For example, suppose - `X` is a 5-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3. - Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30]. Default: 1 - param_attr (ParamAttr or list of ParamAttr, optional): The parameter attribute for learnable - weights(Parameter) of this layer. Default: None. - bias_attr (ParamAttr or list of ParamAttr, optional): The attribute for the bias - of this layer. If it is set to False, no bias will be added to the output units. - If it is set to None, the bias is initialized zero. Default: None. - act (str, optional): Activation to be applied to the output of this layer. Default: None. - is_test(bool, optional): A flag indicating whether execution is in test phase. Default: False. - dtype(str, optional): Dtype used for weight, it can be "float32" or "float64". Default: "float32". - Attribute: - **weight** (list of Parameter): the learnable weights of this layer. - **bias** (Parameter or None): the learnable bias of this layer. 
- Returns: - None - - """ - - def __init__( - self, - size, - num_flatten_dims=1, - param_attr=None, - bias_attr=None, - act=None, - is_test=False, - dtype="float32", - ): - super().__init__(dtype) - - self._size = size - self._num_flatten_dims = num_flatten_dims - self._dtype = dtype - self._param_attr = param_attr - self._bias_attr = bias_attr - self._act = act - self.__w = [] - - def _build_once(self, input): - i = 0 - for inp, param in self._helper.iter_inputs_and_params( - input, self._param_attr - ): - input_shape = inp.shape - - param_shape = [ - reduce( - lambda a, b: a * b, input_shape[self._num_flatten_dims :], 1 - ) - ] + [self._size] - self.__w.append( - self.add_parameter( - '_w%d' % i, - self.create_parameter( - attr=param, - shape=param_shape, - dtype=self._dtype, - is_bias=False, - ), - ) - ) - i += 1 - - size = [self._size] - self._b = self.create_parameter( - attr=self._bias_attr, shape=size, dtype=self._dtype, is_bias=True - ) - - @property - def weight(self): - if len(self.__w) > 1: - return self.__w - else: - return self.__w[0] - - @weight.setter - def weight(self, value): - if len(self.__w) == 1: - self.__w[0] = value - - @property - def bias(self): - return self._b - - @bias.setter - def bias(self, value): - self._b = value - - def forward(self, input): - mul_results = [] - i = 0 - for inp, param in self._helper.iter_inputs_and_params( - input, self._param_attr - ): - tmp = self._helper.create_variable_for_type_inference(self._dtype) - self._helper.append_op( - type="mul", - inputs={"X": inp, "Y": self.__w[i]}, - outputs={"Out": tmp}, - attrs={ - "x_num_col_dims": self._num_flatten_dims, - "y_num_col_dims": 1, - }, - ) - i += 1 - mul_results.append(tmp) - - if len(mul_results) == 1: - pre_bias = mul_results[0] - else: - pre_bias = self._helper.create_variable_for_type_inference( - self._dtype - ) - self._helper.append_op( - type="sum", - inputs={"X": mul_results}, - outputs={"Out": pre_bias}, - attrs={"use_mkldnn": False}, - ) - - if self._b is not None: - pre_activation = self._helper.create_variable_for_type_inference( - dtype=self._dtype - ) - self._helper.append_op( - type='elementwise_add', - inputs={'X': [pre_bias], 'Y': [self._b]}, - outputs={'Out': [pre_activation]}, - attrs={'axis': self._num_flatten_dims}, - ) - else: - pre_activation = pre_bias - # Currently, we don't support inplace in dygraph mode - return self._helper.append_activation(pre_activation, act=self._act) - - -class HingeLoss: - """ - Hinge Loss Calculate class - """ - - def __init__(self, conf_dict): - """ - initialize - """ - self.margin = conf_dict["loss"]["margin"] - - def compute(self, pos, neg): - """ - compute loss - """ - elementwise_max = ElementwiseMaxLayer() - elementwise_add = ElementwiseAddLayer() - elementwise_sub = ElementwiseSubLayer() - constant = ConstantLayer() - reduce_mean = ReduceMeanLayer() - loss = reduce_mean.ops( - elementwise_max.ops( - constant.ops(neg, neg.shape, "float32", 0.0), - elementwise_add.ops( - elementwise_sub.ops(neg, pos), - constant.ops(neg, neg.shape, "float32", self.margin), - ), - ) - ) - return loss - - -class BOW(paddle.nn.Layer): - """ - BOW - """ - - def __init__(self, conf_dict): - """ - initialize - """ - super().__init__() - self.dict_size = conf_dict["dict_size"] - self.task_mode = conf_dict["task_mode"] - self.emb_dim = conf_dict["net"]["emb_dim"] - self.bow_dim = conf_dict["net"]["bow_dim"] - self.seq_len = conf_dict["seq_len"] - self.emb_layer = EmbeddingLayer( - self.dict_size, self.emb_dim, "emb" - ).ops() - self.bow_layer = 
paddle.nn.Linear( - in_features=self.bow_dim, out_features=self.bow_dim - ) - self.bow_layer_po = FCLayer(self.bow_dim, None, "fc").ops() - self.softmax_layer = FCLayer(2, "softmax", "cos_sim").ops() - - @paddle.jit.to_static - def forward(self, left, right): - """ - Forward network - """ - - # embedding layer - left_emb = self.emb_layer(left) - right_emb = self.emb_layer(right) - left_emb = paddle.reshape( - left_emb, shape=[-1, self.seq_len, self.bow_dim] - ) - right_emb = paddle.reshape( - right_emb, shape=[-1, self.seq_len, self.bow_dim] - ) - - bow_left = paddle.sum(left_emb, axis=1) - bow_right = paddle.sum(right_emb, axis=1) - softsign_layer = SoftsignLayer() - left_soft = softsign_layer.ops(bow_left) - right_soft = softsign_layer.ops(bow_right) - - # matching layer - if self.task_mode == "pairwise": - left_bow = self.bow_layer(left_soft) - right_bow = self.bow_layer(right_soft) - cos_sim_layer = CosSimLayer() - pred = cos_sim_layer.ops(left_bow, right_bow) - return left_bow, pred - else: - concat_layer = ConcatLayer(1) - concat = concat_layer.ops([left_soft, right_soft]) - concat_fc = self.bow_layer_po(concat) - pred = self.softmax_layer(concat_fc) - return left_soft, pred diff --git a/test/dygraph_to_static/test_ptb_lm_v2.py b/test/dygraph_to_static/test_ptb_lm_v2.py deleted file mode 100644 index ab89c20afc25fe..00000000000000 --- a/test/dygraph_to_static/test_ptb_lm_v2.py +++ /dev/null @@ -1,349 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
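Stepping back briefly to the model file above: the `HingeLoss.compute` helper removed there wraps `ElementwiseMax`, `ElementwiseAdd`, `ElementwiseSub`, `Constant`, and `ReduceMean` layers around one another, and unrolled it is simply the pairwise ranking hinge loss. A minimal sketch in plain `paddle` ops, shown here only for clarity and not part of any patch:

```python
import paddle

def pairwise_hinge_loss(pos, neg, margin=0.1):
    # mean(max(0, (neg - pos) + margin)): a positive pair must outscore
    # its negative pair by at least `margin` to contribute zero loss.
    zero = paddle.zeros_like(neg)
    return paddle.mean(paddle.maximum(zero, neg - pos + margin))
```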
- -import logging -import time -import unittest - -import numpy as np -from dygraph_to_static_utils import ( - Dy2StTestBase, - test_legacy_and_pt_and_pir, -) - -import paddle - -PRINT_STEP = 20 -SEED = 2020 - - -class SimpleLSTMRNN(paddle.nn.Layer): - def __init__( - self, hidden_size, num_steps, num_layers=2, init_scale=0.1, dropout=None - ): - super().__init__() - self._hidden_size = hidden_size - self._num_layers = num_layers - self._init_scale = init_scale - self._dropout = dropout - self._num_steps = num_steps - self.cell_array = [] - self.hidden_array = [] - - self.weight_1_arr = [] - self.weight_2_arr = [] - self.bias_arr = [] - self.mask_array = [] - - for i in range(self._num_layers): - weight_1 = self.create_parameter( - attr=paddle.ParamAttr( - initializer=paddle.nn.initializer.Uniform( - low=-self._init_scale, high=self._init_scale - ) - ), - shape=[self._hidden_size * 2, self._hidden_size * 4], - dtype="float32", - default_initializer=paddle.nn.initializer.Uniform( - low=-self._init_scale, high=self._init_scale - ), - ) - self.weight_1_arr.append(self.add_parameter('w_%d' % i, weight_1)) - bias_1 = self.create_parameter( - attr=paddle.ParamAttr( - initializer=paddle.nn.initializer.Uniform( - low=-self._init_scale, high=self._init_scale - ) - ), - shape=[self._hidden_size * 4], - dtype="float32", - default_initializer=paddle.nn.initializer.Constant(0.0), - ) - self.bias_arr.append(self.add_parameter('b_%d' % i, bias_1)) - - def forward(self, input_embedding, init_hidden=None, init_cell=None): - cell_array = [] - hidden_array = [] - - for i in range(self._num_layers): - hidden_array.append(init_hidden[i]) - cell_array.append(init_cell[i]) - - res = [] - for index in range(self._num_steps): - step_input = input_embedding[:, index, :] - for k in range(self._num_layers): - pre_hidden = hidden_array[k] - pre_cell = cell_array[k] - weight_1 = self.weight_1_arr[k] - bias = self.bias_arr[k] - - nn = paddle.concat(x=[step_input, pre_hidden], axis=1) - gate_input = paddle.matmul(x=nn, y=weight_1) - - gate_input = paddle.add(x=gate_input, y=bias) - i, j, f, o = paddle.split( - x=gate_input, num_or_sections=4, axis=-1 - ) - c = pre_cell * paddle.nn.functional.sigmoid( - f - ) + paddle.nn.functional.sigmoid(i) * paddle.tanh(j) - m = paddle.tanh(c) * paddle.nn.functional.sigmoid(o) - hidden_array[k] = m - cell_array[k] = c - step_input = m - - if self._dropout is not None and self._dropout > 0.0: - step_input = paddle.nn.functional.dropout( - step_input, - dropout_prob=self._dropout, - dropout_implementation='upscale_in_train', - ) - res.append(step_input) - real_res = paddle.concat(x=res, axis=1) - real_res = paddle.reshape( - real_res, [-1, self._num_steps, self._hidden_size] - ) - last_hidden = paddle.concat(x=hidden_array, axis=1) - last_hidden = paddle.reshape( - last_hidden, shape=[-1, self._num_layers, self._hidden_size] - ) - last_hidden = paddle.transpose(x=last_hidden, perm=[1, 0, 2]) - last_cell = paddle.concat(x=cell_array, axis=1) - last_cell = paddle.reshape( - last_cell, shape=[-1, self._num_layers, self._hidden_size] - ) - last_cell = paddle.transpose(x=last_cell, perm=[1, 0, 2]) - return real_res, last_hidden, last_cell - - -class PtbModel(paddle.nn.Layer): - def __init__( - self, - hidden_size, - vocab_size, - num_layers=2, - num_steps=20, - init_scale=0.1, - dropout=None, - ): - super().__init__() - self.hidden_size = hidden_size - self.vocab_size = vocab_size - self.init_scale = init_scale - self.num_layers = num_layers - self.num_steps = num_steps - self.dropout = 
dropout - self.simple_lstm_rnn = SimpleLSTMRNN( - hidden_size, - num_steps, - num_layers=num_layers, - init_scale=init_scale, - dropout=dropout, - ) - self.embedding = paddle.nn.Embedding( - vocab_size, - hidden_size, - sparse=False, - weight_attr=paddle.ParamAttr( - name='embedding_para', - initializer=paddle.nn.initializer.Uniform( - low=-init_scale, high=init_scale - ), - ), - ) - self.softmax_weight = self.create_parameter( - attr=paddle.ParamAttr(), - shape=[self.hidden_size, self.vocab_size], - dtype="float32", - default_initializer=paddle.nn.initializer.Uniform( - low=-self.init_scale, high=self.init_scale - ), - ) - self.softmax_bias = self.create_parameter( - attr=paddle.ParamAttr(), - shape=[self.vocab_size], - dtype="float32", - default_initializer=paddle.nn.initializer.Uniform( - low=-self.init_scale, high=self.init_scale - ), - ) - - def build_once(self, input, label, init_hidden, init_cell): - pass - - def forward(self, input, label, init_hidden, init_cell): - init_h = paddle.reshape( - init_hidden, shape=[self.num_layers, -1, self.hidden_size] - ) - - init_c = paddle.reshape( - init_cell, shape=[self.num_layers, -1, self.hidden_size] - ) - - x_emb = self.embedding(input) - - x_emb = paddle.reshape( - x_emb, shape=[-1, self.num_steps, self.hidden_size] - ) - if self.dropout is not None and self.dropout > 0.0: - x_emb = paddle.nn.functional.dropout( - x_emb, - dropout_prob=self.dropout, - dropout_implementation='upscale_in_train', - ) - rnn_out, last_hidden, last_cell = self.simple_lstm_rnn( - x_emb, init_h, init_c - ) - - projection = paddle.matmul(x=rnn_out, y=self.softmax_weight) - projection = paddle.add(x=projection, y=self.softmax_bias) - - loss = paddle.nn.functional.softmax_with_cross_entropy( - logits=projection, label=label, soft_label=False - ) - loss = paddle.reshape(loss, shape=[-1, self.num_steps]) - loss = paddle.mean(loss, axis=[0]) - loss = paddle.sum(loss) - - return loss, last_hidden, last_cell - - def debug_emb(self): - np.save("emb_grad", self.x_emb.gradient()) - - -def train(place): - num_layers = 1 - batch_size = 4 - hidden_size = 10 - num_steps = 3 - init_scale = 0.1 - max_epoch = 1 - dropout = 0.0 - vocab_size = 1000 - batch_num = 200 - - paddle.disable_static(place) - paddle.seed(SEED) - paddle.framework.random._manual_program_seed(SEED) - ptb_model = paddle.jit.to_static( - PtbModel( - hidden_size=hidden_size, - vocab_size=vocab_size, - num_layers=num_layers, - num_steps=num_steps, - init_scale=init_scale, - dropout=dropout, - ) - ) - - sgd = paddle.optimizer.SGD( - learning_rate=1e-3, parameters=ptb_model.parameters() - ) - - for epoch_id in range(max_epoch): - total_loss = 0.0 - iters = 0.0 - total_sample = 0 - - init_hidden_data = np.zeros( - (num_layers, batch_size, hidden_size), dtype='float32' - ) - init_cell_data = np.zeros( - (num_layers, batch_size, hidden_size), dtype='float32' - ) - - init_hidden = paddle.to_tensor( - data=init_hidden_data, dtype=None, place=None, stop_gradient=True - ) - init_cell = paddle.to_tensor( - data=init_cell_data, dtype=None, place=None, stop_gradient=True - ) - for step_id in range(batch_num): - x_data = np.arange(12).reshape(4, 3).astype('int64') - y_data = np.arange(1, 13).reshape(4, 3).astype('int64') - y_data = y_data.reshape((-1, 1)) - - x_data = x_data.reshape((-1, num_steps, 1)) - y_data = y_data.reshape((-1, num_steps, 1)) - - x = paddle.to_tensor( - data=x_data, dtype=None, place=None, stop_gradient=True - ) - y = paddle.to_tensor( - data=y_data, dtype=None, place=None, stop_gradient=True - ) - - 
dy_loss, last_hidden, last_cell = ptb_model( - x, y, init_hidden, init_cell - ) - out_loss = dy_loss.numpy() - - dy_loss.backward() - sgd.minimize(dy_loss) - ptb_model.clear_gradients() - - total_loss += out_loss - iters += num_steps - total_sample += 1 - if step_id % PRINT_STEP == 0: - if step_id == 0: - logging.info( - "epoch %d | step %d, loss %0.3f" - % (epoch_id, step_id, total_loss / total_sample) - ) - avg_batch_time = time.time() - else: - speed = PRINT_STEP / (time.time() - avg_batch_time) - logging.info( - "epoch %d | step %d, loss %0.3f, speed %.3f steps/s" - % (epoch_id, step_id, total_loss / total_sample, speed) - ) - avg_batch_time = time.time() - - ret = out_loss, last_hidden.numpy(), last_cell.numpy() - paddle.enable_static() - return ret - - -def train_dygraph(place): - paddle.jit.enable_to_static(False) - return train(place) - - -def train_static(place): - paddle.jit.enable_to_static(True) - return train(place) - - -class TestPtb(Dy2StTestBase): - def setUp(self): - self.place = ( - paddle.CUDAPlace(0) - if paddle.is_compiled_with_cuda() - else paddle.CPUPlace() - ) - - @test_legacy_and_pt_and_pir - def test_check_result(self): - loss_1, hidden_1, cell_1 = train_static(self.place) - loss_2, hidden_2, cell_2 = train_dygraph(self.place) - - np.testing.assert_allclose(loss_1, loss_2, rtol=1e-05) - np.testing.assert_allclose(hidden_1, hidden_2, rtol=1e-05) - np.testing.assert_allclose(cell_1, cell_2, rtol=1e-05) - - -if __name__ == '__main__': - unittest.main() diff --git a/test/dygraph_to_static/test_simnet.py b/test/dygraph_to_static/test_simnet.py index b1dc687abad3b1..91e4c3ee4cfbec 100644 --- a/test/dygraph_to_static/test_simnet.py +++ b/test/dygraph_to_static/test_simnet.py @@ -21,7 +21,6 @@ from simnet_dygraph_model import BOW, HingeLoss import paddle -from paddle import base from paddle.base.framework import unique_name SEED = 102 @@ -180,8 +179,8 @@ def train(conf_dict, to_static): class TestSimnet(Dy2StTestBase): @test_legacy_and_pt_and_pir def test_dygraph_static_same_loss(self): - if base.is_compiled_with_cuda(): - base.set_flags({"FLAGS_cudnn_deterministic": True}) + if paddle.is_compiled_with_cuda(): + paddle.set_flags({"FLAGS_cudnn_deterministic": True}) conf_dict = create_conf_dict() dygraph_loss = train(conf_dict, to_static=False) static_loss = train(conf_dict, to_static=True) diff --git a/test/dygraph_to_static/test_simnet_v2.py b/test/dygraph_to_static/test_simnet_v2.py deleted file mode 100644 index 1c4476a99457d7..00000000000000 --- a/test/dygraph_to_static/test_simnet_v2.py +++ /dev/null @@ -1,194 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
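One more aside on the deleted tests: the `SimpleLSTMRNN` removed in `test_ptb_lm_v2.py` above fuses all four LSTM gates into a single matmul per layer and step. A minimal sketch of one cell update, matching the deleted `forward` logic (illustrative only; the helper name is hypothetical):

```python
import paddle
import paddle.nn.functional as F

def lstm_cell_step(step_input, pre_hidden, pre_cell, weight, bias):
    # One fused-gate step: a single matmul produces i, j, f, o at once.
    nn = paddle.concat([step_input, pre_hidden], axis=1)
    i, j, f, o = paddle.split(
        paddle.matmul(nn, weight) + bias, num_or_sections=4, axis=-1
    )
    c = pre_cell * F.sigmoid(f) + F.sigmoid(i) * paddle.tanh(j)  # new cell
    m = paddle.tanh(c) * F.sigmoid(o)  # new hidden state
    return m, c
```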
- -import argparse -import random -import unittest - -import numpy as np -from dygraph_to_static_utils import Dy2StTestBase, test_legacy_and_pt_and_pir -from simnet_dygraph_model_v2 import BOW, HingeLoss - -import paddle - -SEED = 102 -random.seed(SEED) - - -def create_conf_dict(): - conf_dict = {} - conf_dict["task_mode"] = "pairwise" - conf_dict["net"] = {"emb_dim": 128, "bow_dim": 128, "hidden_dim": 128} - conf_dict["loss"] = {"margin": 0.1} - return conf_dict - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--batch_size", - type=int, - default=32, - help="Total examples' number in batch for training.", - ) - parser.add_argument( - "--seq_len", type=int, default=32, help="The length of each sentence." - ) - parser.add_argument( - "--epoch", type=int, default=1, help="The number of training epoch." - ) - parser.add_argument( - "--fake_sample_size", - type=int, - default=128, - help="The number of samples of fake data.", - ) - args = parser.parse_args([]) - return args - - -args = parse_args() - - -def fake_vocabulary(): - vocab = {} - vocab[""] = 0 - for i in range(26): - c = chr(ord('a') + i) - vocab[c] = i + 1 - return vocab - - -vocab = fake_vocabulary() - - -class FakeReaderProcessor(paddle.io.Dataset): - def __init__(self, args, vocab, length): - self.vocab = vocab - self.seq_len = args.seq_len - self.sample_size = args.fake_sample_size - self.data_samples = [] - for i in range(self.sample_size): - query = [random.randint(0, 26) for i in range(self.seq_len)] - pos_title = query[:] - neg_title = [26 - q for q in query] - self.data_samples.append( - np.array([query, pos_title, neg_title]).astype(np.int64) - ) - self.query = [] - self.pos_title = [] - self.neg_title = [] - self._init_data(length) - - def get_reader(self, mode, epoch=0): - def reader_with_pairwise(): - if mode == "train": - for i in range(self.sample_size): - yield self.data_samples[i] - - return reader_with_pairwise - - def _init_data(self, length): - reader = self.get_reader("train", epoch=args.epoch)() - for i, yield_data in enumerate(reader): - if i >= length: - break - self.query.append(yield_data[0]) - self.pos_title.append(yield_data[1]) - self.neg_title.append(yield_data[2]) - - def __getitem__(self, idx): - return self.query[idx], self.pos_title[idx], self.neg_title[idx] - - def __len__(self): - return len(self.query) - - -simnet_process = FakeReaderProcessor( - args, vocab, args.batch_size * (args.epoch + 1) -) - - -def train(conf_dict, to_static): - """ - train process - """ - paddle.jit.enable_to_static(to_static) - - # Get device - if paddle.is_compiled_with_cuda(): - place = paddle.CUDAPlace(0) - else: - place = paddle.CPUPlace() - - paddle.disable_static(place) - paddle.seed(SEED) - paddle.framework.random._manual_program_seed(SEED) - - conf_dict['dict_size'] = len(vocab) - conf_dict['seq_len'] = args.seq_len - - net = BOW(conf_dict) - loss = HingeLoss(conf_dict) - optimizer = paddle.optimizer.Adam( - learning_rate=0.001, - beta1=0.9, - beta2=0.999, - epsilon=1e-08, - parameters=net.parameters(), - ) - - metric = paddle.metric.Auc(name="auc") - - global_step = 0 - losses = [] - - train_loader = paddle.io.DataLoader( - simnet_process, batch_size=args.batch_size - ) - - for left, pos_right, neg_right in train_loader(): - left = paddle.reshape(left, shape=[-1, 1]) - pos_right = paddle.reshape(pos_right, shape=[-1, 1]) - neg_right = paddle.reshape(neg_right, shape=[-1, 1]) - net.train() - global_step += 1 - left_feat, pos_score = net(left, pos_right) - pred = pos_score - 
_, neg_score = net(left, neg_right)
-        avg_cost = loss.compute(pos_score, neg_score)
-        losses.append(np.mean(avg_cost.numpy()))
-        avg_cost.backward()
-        optimizer.minimize(avg_cost)
-        net.clear_gradients()
-    paddle.enable_static()
-    return losses
-
-
-class TestSimnet(Dy2StTestBase):
-    @test_legacy_and_pt_and_pir
-    def test_dygraph_static_same_loss(self):
-        if paddle.is_compiled_with_cuda():
-            paddle.base.set_flags({"FLAGS_cudnn_deterministic": True})
-        conf_dict = create_conf_dict()
-        dygraph_loss = train(conf_dict, to_static=False)
-        static_loss = train(conf_dict, to_static=True)
-
-        self.assertEqual(len(dygraph_loss), len(static_loss))
-        for i in range(len(dygraph_loss)):
-            self.assertAlmostEqual(dygraph_loss[i], static_loss[i])
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py
index 464fb9cc1cfe46..5b315049c5025c 100755
--- a/tools/parallel_UT_rule.py
+++ b/tools/parallel_UT_rule.py
@@ -1100,7 +1100,6 @@
     'test_rmsprop_op',
     'test_fuse_bn_act_pass',
     'test_inplace_addto_strategy',
-    'test_ptb_lm_v2',
     'test_paddle_save_load',
     'test_prelu_mkldnn_op',
     'test_box_coder_op',
@@ -1149,7 +1148,6 @@
     'test_complex_sum_layer',
     'test_isfinite_v2_op',
     'test_is_empty_op',
-    'test_simnet_v2',
     'beam_search_test',
     'test_randperm_op',
     'test_elementwise_add_op_inplace',
@@ -2331,7 +2329,6 @@
     'test_density_prior_box_op',
     'test_dataloader_keep_order',
     'test_bce_loss',
-    'test_simnet_v2',
     'test_fetch_lod_tensor_array',
     'test_smooth_l1_loss',
     'test_matrix_rank_op',
@@ -2749,7 +2746,6 @@
     'test_expand_v2_op',
     'test_psroi_pool_op',
     'test_expand_as_v2_op',
-    'test_ptb_lm_v2',
     'test_rand_op',
     'test_empty_like_op',
     'test_rank_loss_op',

From 5970a573b42b6002d3bcd99c06981d5ddbe5a289 Mon Sep 17 00:00:00 2001
From: xiaoguoguo626807 <100397923+xiaoguoguo626807@users.noreply.github.com>
Date: Tue, 12 Dec 2023 10:44:11 +0800
Subject: [PATCH 19/28] optimize backward (#59844)

---
 python/paddle/autograd/backward_utils.py | 12 +++++++-
 python/paddle/autograd/ir_backward.py    | 38 ++++++++----------------
 2 files changed, 23 insertions(+), 27 deletions(-)

diff --git a/python/paddle/autograd/backward_utils.py b/python/paddle/autograd/backward_utils.py
index 0b50872f62c3c3..c499a0856b4d09 100644
--- a/python/paddle/autograd/backward_utils.py
+++ b/python/paddle/autograd/backward_utils.py
@@ -22,7 +22,8 @@ class State:
     """
     record relationship of forward op/value and backward op/value
-    one state must be bining with a program
+    one state must be bound to a block; if the block has a parent block,
+    the state will include the parent block's info.
""" @@ -39,6 +40,10 @@ def __init__(self, block): self.sumvaluegrad_to_value = collections.defaultdict(list) # operation -> list(operation) self.opgrad_to_op = collections.defaultdict(list) + # only for controlflow + # inside_value is sub block value, which will yield to parent block, + # parant block value is outside_value + self.inside_value_to_outside_value_map = {} def turn_map(self) -> None: self.valuegrad_to_value = collections.defaultdict(list) @@ -71,6 +76,11 @@ def copy(self, new_block): # operation -> list(operation) state.opgrad_to_op = self.opgrad_to_op.copy() + # only for controlflow + state.inside_value_to_outside_value_map = ( + self.inside_value_to_outside_value_map.copy() + ) + return state diff --git a/python/paddle/autograd/ir_backward.py b/python/paddle/autograd/ir_backward.py index 860b0581a637d7..678d57e5ecb02c 100644 --- a/python/paddle/autograd/ir_backward.py +++ b/python/paddle/autograd/ir_backward.py @@ -239,9 +239,7 @@ def prune_ops(total_ops, inputs_set, outputs_set, no_grad_set): total_ops[i] for i in range(len(total_ops)) if intersection_op_flags[i] ] uneffective_ops = [ - total_ops[i] - for i in reversed(range(len(total_ops))) - if not union_op_flags[i] + total_ops[i] for i in range(len(total_ops)) if not union_op_flags[i] ] return effective_ops, uneffective_ops @@ -337,7 +335,6 @@ def append_backward_ops( no_grad_set, backward_ops, state, - inside_value_to_outside_value_map, ): ''' add grad_op in order of topological inverse sort @@ -351,7 +348,7 @@ def append_backward_ops( v2_g = call_vjp(op3, [[v2]], [[v3]],[[v3_g]], [[v2_stopgradient]]) - special pattern 1: + special pattern: v11 -> combine_op -> v1 -> op -> v3 v12 -> v2 -> @@ -359,7 +356,7 @@ def append_backward_ops( v1 is inside python api, we don't describe it in backward process(state) so v1_grad is inside vjp, we don't describe it in backward process(state) - [[v11_g, v12_g], v2_g] = call_vjp(combine_op, [[v11, v12]], [[v3]],[[v3_g]], [[v11_stopgradient, v12_stopgradient], v2_stop_gradient]) + [[v11_g, v12_g], v2_g] = call_vjp(op, [[v11, v12]], [[v3]],[[v3_g]], [[v11_stopgradient, v12_stopgradient], v2_stop_gradient]) op_vjp is: @@ -380,7 +377,7 @@ def append_backward_ops( ''' def append_add_n(value): - # one value is input of more than one fwd_op, + # value is input of more than one fwd_op, # so more than one bwd_op create input_grad, # need add sum op to accumulate gradient add_n_value = paddle.add_n( @@ -406,8 +403,8 @@ def make_output_with_output_grad(op): if value in control_flow_value_to_copyvalue_map else [value] ) - while value in inside_value_to_outside_value_map: - value = inside_value_to_outside_value_map[value] + while value in state.inside_value_to_outside_value_map: + value = state.inside_value_to_outside_value_map[value] if ( value in state.value_to_valuegrad @@ -425,7 +422,7 @@ def make_output_with_output_grad(op): ): # pattern case: # this fwd_op's output is vectorType, it will split to - # Type by builtin.split op, so need get from split op's ouput + # Type by builtin_split op, so need get from split op's outputs. 
( split_zero_flag, split_outputs, @@ -556,8 +553,8 @@ def append_yield(block, base_inputs, base_inputs_grad): if value_grad is None: continue - while value in inside_value_to_outside_value_map: - value = inside_value_to_outside_value_map[value] + while value in state.inside_value_to_outside_value_map: + value = state.inside_value_to_outside_value_map[value] if value in state.value_to_valuegrad: if len(state.value_to_valuegrad[value]) > 1: @@ -579,8 +576,6 @@ def append_yield(block, base_inputs, base_inputs_grad): # -----------------only for control flow-----------------# # tuple_push value to pop value control_flow_value_to_copyvalue_map = {} - # tuple_push value to pop value - control_flow_copyvalue_to_value_map = {} if ( len(effective_forward_ops) > 1 @@ -590,7 +585,9 @@ def append_yield(block, base_inputs, base_inputs_grad): for outside_output, inside_output in zip( base_op.results(), yield_op.operands_source() ): - inside_value_to_outside_value_map[inside_output] = outside_output + state.inside_value_to_outside_value_map[ + inside_output + ] = outside_output forward_ops = effective_forward_ops[:-1] else: forward_ops = effective_forward_ops @@ -629,9 +626,6 @@ def append_yield(block, base_inputs, base_inputs_grad): control_flow_value_to_copyvalue_map[ output[0] ] = copy_output[0] - control_flow_copyvalue_to_value_map[ - copy_output[0] - ] = output[0] else: # all(zero_flag) support this op has no contribution for grad @@ -658,9 +652,6 @@ def append_yield(block, base_inputs, base_inputs_grad): op.blocks(), grad_op.blocks() ): sub_state = state.copy(sub_fwd_block) - sub_inside_value_to_outside_value_map = ( - inside_value_to_outside_value_map.copy() - ) sub_backward_ops = [] append_backward_ops( op, @@ -672,7 +663,6 @@ def append_yield(block, base_inputs, base_inputs_grad): no_grad_set, sub_backward_ops, sub_state, - sub_inside_value_to_outside_value_map, ) # update input_grad map update_input_grad_map(op, input_grads, origin_inputs) @@ -812,9 +802,6 @@ def calc_gradient_helper(outputs, inputs, grad_outputs, no_grad_set): inputs, complete_outputs ) - # sub_block op output to parent_block op output - inside_value_to_outside_value_map = {} - append_backward_ops( None, None, @@ -825,7 +812,6 @@ def calc_gradient_helper(outputs, inputs, grad_outputs, no_grad_set): no_grad_set, backward_ops, state, - inside_value_to_outside_value_map, ) # now value_to_valuegrad should be value <-> value (add sum op for the same values's gradvalue) outputs_set, inputs_set, no_gradvar_set = create_backward_prune_set( From fa81f64907722de7f8aa65e2186672c68132481c Mon Sep 17 00:00:00 2001 From: xuxinyi389 <104957571+xuxinyi389@users.noreply.github.com> Date: Tue, 12 Dec 2023 11:09:39 +0800 Subject: [PATCH 20/28] fix_test_inference_windows_compile (#59914) --- paddle/fluid/inference/api/demo_ci/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index 0c9e3d34483364..0cca4532a0ce6b 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -241,7 +241,7 @@ if(WITH_GPU) ${TENSORRT_LIB_DIR}/nvinfer${CMAKE_STATIC_LIBRARY_SUFFIX}) set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_STATIC_LIBRARY_SUFFIX}) - if(${TENSORRT_MAJOR_VERSION} GREATER_EQUAL 7) + if(${TENSORRT_MAJOR_VERSION} EQUAL 7) set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/myelin64_1${CMAKE_STATIC_LIBRARY_SUFFIX}) endif() @@ -272,7 +272,7 @@ if(WIN32) 
${CMAKE_COMMAND} -E copy ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX} ${LIB_PATH}) - if(${TENSORRT_MAJOR_VERSION} GREATER_EQUAL 7) + if(${TENSORRT_MAJOR_VERSION} EQUAL 7) add_custom_command( TARGET ${DEMO_NAME} POST_BUILD From f86431e2700d2392d63d57639eefb0994fbf7bc8 Mon Sep 17 00:00:00 2001 From: Ataf Fazledin Ahamed Date: Tue, 12 Dec 2023 09:25:34 +0600 Subject: [PATCH 21/28] Fixed Inappropriate Logical Expression (#59886) Signed-off-by: fazledyn-or --- python/paddle/static/nn/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/static/nn/common.py b/python/paddle/static/nn/common.py index 576d56e86cdb33..558660fcc2a3a0 100644 --- a/python/paddle/static/nn/common.py +++ b/python/paddle/static/nn/common.py @@ -1659,7 +1659,7 @@ def _update_padding(padding, data_format): ) if filter_size is None: - if output_size is []: + if output_size == []: raise ValueError("output_size must be set when filter_size is None") if not in_dygraph_mode(): if isinstance(output_size, Variable) or paddle.utils._contain_var( From 510b2f86bc9fc22ba40d30d9e7ad896283f991db Mon Sep 17 00:00:00 2001 From: co63oc Date: Tue, 12 Dec 2023 11:34:40 +0800 Subject: [PATCH 22/28] Add Tensor.to to check core.Place (#59857) --- python/paddle/base/dygraph/tensor_patch_methods.py | 1 + test/legacy_test/test_Tensor_to.py | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/python/paddle/base/dygraph/tensor_patch_methods.py b/python/paddle/base/dygraph/tensor_patch_methods.py index 674d9c740f1580..1b779ed87cb5db 100644 --- a/python/paddle/base/dygraph/tensor_patch_methods.py +++ b/python/paddle/base/dygraph/tensor_patch_methods.py @@ -455,6 +455,7 @@ def _to(self, device=None, dtype=None, blocking=None): elif isinstance( device, ( + core.Place, core.CPUPlace, core.CUDAPlace, core.CUDAPinnedPlace, diff --git a/test/legacy_test/test_Tensor_to.py b/test/legacy_test/test_Tensor_to.py index c9901cb68d780e..9821fac8616218 100644 --- a/test/legacy_test/test_Tensor_to.py +++ b/test/legacy_test/test_Tensor_to.py @@ -55,6 +55,13 @@ def test_Tensor_to_device(self): else: self.assertTrue(placex_str, "Place(" + place + ")") + def test_Tensor_to_device2(self): + x = paddle.to_tensor([1, 2, 3]) + y = paddle.to_tensor([1, 2, 3], place="cpu") + + y.to(x.place) + self.assertTrue(x.place, y.place) + def test_Tensor_to_device_dtype(self): tensorx = paddle.to_tensor([1, 2, 3]) places = ["cpu"] From ec5e6b000d741b517a8424dbf2600b89c97dd0c8 Mon Sep 17 00:00:00 2001 From: gouzil <66515297+gouzil@users.noreply.github.com> Date: Tue, 12 Dec 2023 11:36:16 +0800 Subject: [PATCH 23/28] [CodeStyle][ruff] clean `python/paddle/__init__.py` and `python/paddle/base/__init__.py` step: 7 (#59628) * clean F401 * clean no qa * fix * fix --- python/paddle/__init__.py | 612 ++++++++++----------- python/paddle/base/__init__.py | 155 +++--- python/paddle/base/layers/math_op_patch.py | 7 +- 3 files changed, 378 insertions(+), 396 deletions(-) diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 3ff2cdad1b23e4..6f1929c64275b0 100644 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -12,9 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
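Two asides on the small fixes above. The `static/nn/common.py` one-liner in patch 21 matters because `is` tests object identity while `==` tests value equality; the literal `[]` creates a fresh list object on every evaluation, so `output_size is []` was always false and the `ValueError` branch was unreachable. A minimal reproduction, illustrative only and not part of any patch:

```python
output_size = []
print(output_size is [])  # False: two distinct list objects
print(output_size == [])  # True: value equality
```

Patch 22 is a similar spot fix: adding `core.Place` to the `isinstance` tuple lets `Tensor.to()` accept the generic place object that `Tensor.place` actually returns, as exercised by `y.to(x.place)` in the new test, instead of falling through to the error path.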
try: - from paddle.version import full_version as __version__ # noqa: F401 - from paddle.version import commit as __git_commit__ # noqa: F401 from paddle.cuda_env import * # noqa: F403 + from paddle.version import ( # noqa: F401 + commit as __git_commit__, + full_version as __version__, + ) except ImportError: import sys @@ -31,145 +33,212 @@ # Do the *DUPLICATED* monkey-patch for the tensor object. # We need remove the duplicated code here once we fix # the illogical implement in the monkey-patch methods later. -from .framework import monkey_patch_variable -from .framework import monkey_patch_math_tensor -from .pir import monkey_patch_value, monkey_patch_program +from .framework import monkey_patch_math_tensor, monkey_patch_variable +from .pir import monkey_patch_program, monkey_patch_value monkey_patch_variable() monkey_patch_math_tensor() monkey_patch_value() monkey_patch_program() +from .base.dataset import * # noqa: F403 from .framework import ( disable_signal_handler, - get_flags, - set_flags, disable_static, enable_static, + get_flags, in_dynamic_mode, + set_flags, ) -from .base.dataset import * # noqa: F403 - from .framework.dtype import ( - iinfo, - finfo, + bfloat16, + bool, + complex64, + complex128, dtype, - uint8, + finfo, + float16, + float32, + float64, + iinfo, int8, int16, int32, int64, - float16, - float32, - float64, - bfloat16, - bool, - complex64, - complex128, + uint8, ) Tensor = framework.core.eager.Tensor Tensor.__qualname__ = 'Tensor' import paddle.distributed.fleet # noqa: F401 +import paddle.text # noqa: F401 +import paddle.vision # noqa: F401 from paddle import ( # noqa: F401 + amp, + audio, + autograd, + dataset, + decomposition, + device, distributed, - sysconfig, distribution, - nn, - optimizer, - metric, - regularizer, + geometric, incubate, - autograd, - device, - decomposition, - jit, - amp, - dataset, inference, io, + jit, + metric, + nn, onnx, + optimizer, + quantization, reader, + regularizer, + sparse, static, + sysconfig, vision, - audio, - geometric, - sparse, - quantization, ) +# high-level api +from . 
import ( # noqa: F401 + _pir_ops, + callbacks, + fft, + hub, + linalg, + signal, +) +from .autograd import ( + enable_grad, + grad, + is_grad_enabled, + no_grad, + set_grad_enabled, +) +from .device import ( # noqa: F401 + get_cudnn_version, + get_device, + is_compiled_with_cinn, + is_compiled_with_cuda, + is_compiled_with_custom_device, + is_compiled_with_distribute, + is_compiled_with_ipu, + is_compiled_with_rocm, + is_compiled_with_xpu, + set_device, +) +from .distributed import DataParallel +from .framework import ( # noqa: F401 # noqa: F401 + CPUPlace, + CUDAPinnedPlace, + CUDAPlace, + CustomPlace, + IPUPlace, + ParamAttr, + XPUPlace, + async_save, + clear_async_save_task_queue, + get_default_dtype, + load, + save, + set_default_dtype, +) +from .framework.random import ( + get_cuda_rng_state, + get_rng_state, + seed, + set_cuda_rng_state, + set_rng_state, +) +from .hapi import ( + Model, + flops, + summary, +) +from .nn.initializer.lazy_init import LazyGuard from .tensor.attribute import ( + imag, is_complex, + is_floating_point, is_integer, rank, - shape, real, - imag, - is_floating_point, + shape, ) - from .tensor.creation import ( + arange, + assign, + cauchy_, + clone, + complex, create_parameter, - to_tensor, diag, diag_embed, diagflat, + empty, + empty_like, eye, + full, + full_like, + geometric_, linspace, logspace, + meshgrid, ones, ones_like, - zeros, - zeros_like, - arange, - full, - full_like, - triu, - triu_, + polar, + to_tensor, tril, tril_, - meshgrid, - empty, - empty_like, - assign, - complex, - clone, tril_indices, + triu, + triu_, triu_indices, - polar, - geometric_, - cauchy_, + zeros, + zeros_like, ) - +from .tensor.einsum import einsum from .tensor.linalg import ( # noqa: F401 - matmul, - dot, - norm, - transpose, - transpose_, - dist, - t, - t_, + bincount, + bmm, cdist, - cross, cholesky, - bmm, + cross, + dist, + dot, + eigvalsh, histogram, histogramdd, - bincount, + matmul, mv, - eigvalsh, + norm, + t, + t_, + transpose, + transpose_, ) - from .tensor.logic import ( # noqa: F401 + allclose, + bitwise_and, + bitwise_and_, + bitwise_not, + bitwise_not_, + bitwise_or, + bitwise_or_, + bitwise_xor, + bitwise_xor_, equal, equal_, + equal_all, greater_equal, greater_equal_, greater_than, greater_than_, is_empty, + is_tensor, + isclose, less_equal, less_equal_, less_than, @@ -182,135 +251,211 @@ logical_or_, logical_xor, logical_xor_, - bitwise_and, - bitwise_and_, - bitwise_not, - bitwise_not_, - bitwise_or, - bitwise_or_, - bitwise_xor, - bitwise_xor_, not_equal, not_equal_, - allclose, - isclose, - equal_all, - is_tensor, ) - - from .tensor.manipulation import ( # noqa: F401 + as_complex, + as_real, + as_strided, atleast_1d, atleast_2d, atleast_3d, + broadcast_tensors, + broadcast_to, cast, cast_, + chunk, concat, - broadcast_tensors, + crop, + diagonal_scatter, expand, - broadcast_to, expand_as, - tile, flatten, + flip, + flip as reverse, gather, gather_nd, + index_add, + index_add_, + index_fill, + index_fill_, + index_put, + index_put_, + masked_fill, + masked_fill_, + moveaxis, + put_along_axis, + select_scatter, + repeat_interleave, reshape, reshape_, - flip as reverse, + roll, + rot90, scatter, scatter_, - scatter_nd_add, scatter_nd, + scatter_nd_add, shard_index, slice, - crop, split, - vsplit, squeeze, squeeze_, stack, strided_slice, + take_along_axis, + tensordot, + tile, + tolist, + unbind, + unflatten, + unfold, unique, unique_consecutive, unsqueeze, unsqueeze_, unstack, - flip, - rot90, - unbind, - roll, - chunk, - tolist, - take_along_axis, - 
put_along_axis, - select_scatter, - tensordot, - as_complex, - as_real, - moveaxis, - repeat_interleave, - index_add, - index_add_, - index_put, - index_put_, - unflatten, - as_strided, view, view_as, - unfold, - masked_fill, - masked_fill_, - index_fill, - index_fill_, - diagonal_scatter, + vsplit, ) - from .tensor.math import ( # noqa: F401 abs, abs_, acos, acos_, + acosh, + acosh_, + add, + add_n, + addmm, + addmm_, + all, + amax, + amin, + angle, + any, asin, asin_, + asinh, + asinh_, atan, - atan_, atan2, + atan_, + atanh, + atanh_, + broadcast_shape, ceil, + clip, + combinations, + conj, cos, cos_, - tan, - tan_, cosh, cosh_, - cumsum, - cumsum_, + count_nonzero, cummax, cummin, cumprod, cumprod_, - logcumsumexp, - logit, - logit_, + cumsum, + cumsum_, + cumulative_trapezoid, + deg2rad, + diagonal, + diff, + digamma, + digamma_, + divide, + divide_, + erf, + erf_, + erfinv, exp, expm1, expm1_, floor, + floor_divide, + floor_divide_, + floor_mod, + floor_mod_, + fmax, + fmin, + frac, + frac_, + frexp, + gcd, + gcd_, + heaviside, + hypot, + hypot_, + i0, + i0_, + i0e, + i1, + i1e, increment, + inner, + inverse, + isfinite, + isinf, + isnan, + kron, + lcm, + lcm_, + ldexp, + ldexp_, + lerp, + lgamma, + lgamma_, log, - log_, - log2_, + log1p, + log1p_, log2, + log2_, log10, log10_, + log_, + logaddexp, + logcumsumexp, + logit, + logit_, + logsumexp, + max, + maximum, + min, + minimum, + mm, + mod, + mod_, + multigammaln, + multigammaln_, multiplex, + multiply, + multiply_, + nan_to_num, + nan_to_num_, + nanmean, + nansum, + neg, + neg_, + nextafter, + outer, + polygamma, + polygamma_, pow, pow_, + prod, + rad2deg, reciprocal, - all, - any, + remainder, + remainder_, + renorm, + renorm_, round, rsqrt, scale, + sgn, sign, sin, sin_, @@ -320,227 +465,62 @@ square, square_, stanh, + subtract, sum, - multigammaln, - multigammaln_, - nan_to_num, - nan_to_num_, - nansum, - nanmean, - count_nonzero, + take, + tan, + tan_, tanh, tanh_, - add_n, - max, - maximum, - amax, - min, - minimum, - amin, - mm, - divide, - divide_, - floor_divide, - floor_divide_, - remainder, - remainder_, - mod, - mod_, - floor_mod, - floor_mod_, - multiply, - multiply_, - renorm, - renorm_, - add, - subtract, - logsumexp, - logaddexp, - inverse, - log1p, - log1p_, - erf, - erf_, - addmm, - addmm_, - clip, trace, - diagonal, - kron, - isfinite, - isinf, - isnan, - prod, - broadcast_shape, - conj, + trapezoid, trunc, trunc_, - digamma, - digamma_, - neg, - neg_, - lgamma, - lgamma_, - acosh, - acosh_, - asinh, - asinh_, - atanh, - atanh_, - lerp, - erfinv, - rad2deg, - deg2rad, - gcd, - gcd_, - lcm, - lcm_, - diff, - angle, - fmax, - fmin, - inner, - outer, - heaviside, - frac, - frac_, - sgn, - take, - frexp, - ldexp, - ldexp_, - trapezoid, - cumulative_trapezoid, vander, - nextafter, - i0, - i0_, - i0e, - i1, - i1e, - polygamma, - polygamma_, - hypot, - hypot_, - combinations, ) - from .tensor.random import ( bernoulli, - poisson, + check_shape, multinomial, - standard_normal, normal, normal_, - uniform, - randn, + poisson, rand, randint, randint_like, + randn, randperm, + standard_normal, + uniform, ) from .tensor.search import ( argmax, argmin, argsort, - searchsorted, bucketize, + index_sample, + index_select, + kthvalue, masked_select, + mode, + nonzero, + searchsorted, + sort, topk, where, where_, - index_select, - nonzero, - sort, - kthvalue, - mode, -) - -from .tensor.to_string import set_printoptions - -from .tensor.einsum import einsum - -from .framework import async_save, clear_async_save_task_queue # noqa: 
F401 - -from .framework.random import ( - seed, - get_cuda_rng_state, - set_cuda_rng_state, - get_rng_state, - set_rng_state, -) -from .framework import ( # noqa: F401 - ParamAttr, - CPUPlace, - IPUPlace, - CUDAPlace, - CUDAPinnedPlace, - CustomPlace, - XPUPlace, -) - -from .autograd import ( - grad, - no_grad, - enable_grad, - set_grad_enabled, - is_grad_enabled, -) -from .framework import ( - save, - load, -) -from .distributed import DataParallel - -from .framework import ( - set_default_dtype, - get_default_dtype, ) - -from .tensor.search import index_sample from .tensor.stat import ( mean, - std, - var, - numel, median, nanmedian, - quantile, nanquantile, + numel, + quantile, + std, + var, ) -from .device import ( # noqa: F401 - get_cudnn_version, - set_device, - get_device, - is_compiled_with_xpu, - is_compiled_with_ipu, - is_compiled_with_cinn, - is_compiled_with_distribute, - is_compiled_with_cuda, - is_compiled_with_rocm, - is_compiled_with_custom_device, -) - -# high-level api -from . import ( # noqa: F401 - callbacks, - hub, - linalg, - fft, - signal, - _pir_ops, -) -from .hapi import ( - Model, - summary, - flops, -) - -import paddle.text # noqa: F401 -import paddle.vision # noqa: F401 - -from .tensor.random import check_shape -from .nn.initializer.lazy_init import LazyGuard +from .tensor.to_string import set_printoptions # CINN has to set a flag to include a lib if is_compiled_with_cinn(): diff --git a/python/paddle/base/__init__.py b/python/paddle/base/__init__.py index 7e5ac9c1d92c44..bb7415fcd63f2c 100644 --- a/python/paddle/base/__init__.py +++ b/python/paddle/base/__init__.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os -import sys import atexit +import os import platform +import sys # The legacy core need to be removed before "import core", # in case of users installing paddlepaddle without -U option @@ -33,99 +33,102 @@ except Exception as e: raise e -from . import core +from .layers.math_op_patch import monkey_patch_variable # import all class inside framework into base module -from . import framework -from .framework import ( +# import all class inside executor into base module +from . 
import ( # noqa: F401 + backward, + compiler, + core, + data_feed_desc, + dataset, + dygraph, + executor, + framework, + incubate, + initializer, + io, + layers, + trainer_desc, + unique_name, +) +from .backward import ( # noqa: F401 + append_backward, + gradients, +) +from .compiler import ( # noqa: F401 + BuildStrategy, + CompiledProgram, + ExecutionStrategy, + IpuCompiledProgram, + IpuStrategy, +) +from .core import ( # noqa: F401 + CPUPlace, + CUDAPinnedPlace, + CUDAPlace, + CustomPlace, + IPUPlace, + LoDTensor, + LoDTensorArray, + Scope, + XPUPlace, + _cuda_synchronize, + _Scope, +) +from .data_feed_desc import DataFeedDesc # noqa: F401 +from .data_feeder import DataFeeder # noqa: F401 +from .dataset import ( # noqa: F401 + DatasetFactory, + InMemoryDataset, +) +from .dygraph.base import disable_dygraph, enable_dygraph +from .dygraph.tensor_patch_methods import monkey_patch_tensor +from .executor import ( # noqa: F401 + Executor, + global_scope, + scope_guard, +) +from .framework import ( # noqa: F401 Program, - default_startup_program, - default_main_program, - program_guard, - name_scope, - ipu_shard_guard, - set_ipu_shard, - cuda_places, + Variable, cpu_places, - xpu_places, cuda_pinned_places, + cuda_places, + default_main_program, + default_startup_program, + device_guard, + get_flags, in_dygraph_mode, - in_pir_mode, in_dynamic_or_pir_mode, + in_pir_mode, + ipu_shard_guard, is_compiled_with_cinn, is_compiled_with_cuda, is_compiled_with_rocm, is_compiled_with_xpu, - Variable, + name_scope, + program_guard, require_version, - device_guard, set_flags, - get_flags, -) - -# import all class inside executor into base module -from . import executor -from .executor import ( - Executor, - global_scope, - scope_guard, -) - -from . import data_feed_desc -from .data_feed_desc import DataFeedDesc - -from . import dataset -from .dataset import ( - DatasetFactory, - InMemoryDataset, -) - -from . import trainer_desc - -from . import io -from . import initializer -from .initializer import set_global_initializer -from . import layers -from . import dygraph -from . import backward -from .backward import gradients -from . import incubate -from .param_attr import ParamAttr, WeightNormParamAttr -from .data_feeder import DataFeeder - -from .core import LoDTensor, LoDTensorArray, Scope, _Scope -from .core import ( - CPUPlace, - XPUPlace, - CUDAPlace, - CUDAPinnedPlace, - IPUPlace, - CustomPlace, + set_ipu_shard, + xpu_places, ) -from .lod_tensor import create_lod_tensor, create_random_int_lodtensor - -from . import unique_name -from . 
import compiler -from .compiler import ( - CompiledProgram, - ExecutionStrategy, - BuildStrategy, - IpuCompiledProgram, - IpuStrategy, +from .initializer import set_global_initializer # noqa: F401 +from .lod_tensor import ( # noqa: F401 + create_lod_tensor, + create_random_int_lodtensor, ) -from paddle.base.layers.math_op_patch import monkey_patch_variable -from .dygraph.base import enable_dygraph, disable_dygraph -from .dygraph.tensor_patch_methods import monkey_patch_tensor -from .core import _cuda_synchronize -from .trainer_desc import ( - TrainerDesc, +from .param_attr import ParamAttr, WeightNormParamAttr # noqa: F401 +from .trainer_desc import ( # noqa: F401 DistMultiTrainer, - PipelineTrainer, HeterPipelineTrainer, - MultiTrainer, HeterXpuTrainer, + MultiTrainer, + PipelineTrainer, + TrainerDesc, ) -from .backward import append_backward Tensor = LoDTensor enable_imperative = enable_dygraph diff --git a/python/paddle/base/layers/math_op_patch.py b/python/paddle/base/layers/math_op_patch.py index bbf17a58a1a77b..c0fd87eaf42587 100644 --- a/python/paddle/base/layers/math_op_patch.py +++ b/python/paddle/base/layers/math_op_patch.py @@ -15,10 +15,9 @@ import inspect import warnings -from paddle.base.dygraph.base import in_to_static_mode - -from .. import core, default_main_program -from ..framework import Variable, static_only +from .. import core +from ..dygraph.base import in_to_static_mode +from ..framework import Variable, default_main_program, static_only from .layer_function_generator import OpProtoHolder _supported_int_dtype_ = [ From 7a8aacf26c269621c8d451ed2fa1b7e43ed61053 Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Tue, 12 Dec 2023 13:42:58 +0800 Subject: [PATCH 24/28] update README (#59883) --- README.md | 18 ++++++++---------- README_cn.md | 14 ++++++-------- README_ja.md | 16 +++++++--------- 3 files changed, 21 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index 8f708334ed28f1..4d732846c64b00 100644 --- a/README.md +++ b/README.md @@ -17,21 +17,22 @@ Welcome to the PaddlePaddle GitHub. PaddlePaddle, as the first independent R&D deep learning platform in China, has been officially open-sourced to professional communities since 2016. It is an industrial platform with advanced technologies and rich features that cover core deep learning frameworks, basic model libraries, end-to-end development kits, tools & components as well as service platforms. PaddlePaddle is originated from industrial practices with dedication and commitments to industrialization. It has been widely adopted by a wide range of sectors including manufacturing, agriculture, enterprise service, and so on while serving more than 8 million developers, 220,000 companies and generating 800,000 models. With such advantages, PaddlePaddle has helped an increasing number of partners commercialize AI. - ## Installation ### Latest PaddlePaddle Release: [v2.5](https://github.com/PaddlePaddle/Paddle/tree/release/2.5) Our vision is to enable deep learning for everyone via PaddlePaddle. Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest features of PaddlePaddle. -### Install Latest Stable Release: -``` + +### Install Latest Stable Release + +``` sh # CPU pip install paddlepaddle # GPU pip install paddlepaddle-gpu - ``` + For more information about installation, please view [Quick Install](https://www.paddlepaddle.org.cn/install/quick) Now our developers can acquire Tesla V100 online computing resources for free. 
If you create a program by AI Studio, you will obtain 8 hours to train models online per day. [Click here to start](https://aistudio.baidu.com/aistudio/index). @@ -42,24 +43,20 @@ Now our developers can acquire Tesla V100 online computing resources for free. I The PaddlePaddle deep learning framework facilitates the development while lowering the technical burden, through leveraging a programmable scheme to architect the neural networks. It supports both declarative programming and imperative programming with both development flexibility and high runtime performance preserved. The neural architectures could be automatically designed by algorithms with better performance than the ones designed by human experts. - -- **Support Ultra-Large-Scale Training of Deep Neural Networks** +- **Support Ultra-Large-Scale Training of Deep Neural Networks** PaddlePaddle has made breakthroughs in ultra-large-scale deep neural networks training. It launched the world's first large-scale open-source training platform that supports the training of deep networks with 100 billion features and trillions of parameters using data sources distributed over hundreds of nodes. PaddlePaddle overcomes the online deep learning challenges for ultra-large-scale deep learning models, and further achieved real-time model updating with more than 1 trillion parameters. [Click here to learn more](https://github.com/PaddlePaddle/Fleet) - - **High-Performance Inference Engines for Comprehensive Deployment Environments** PaddlePaddle is not only compatible with models trained in 3rd party open-source frameworks , but also offers complete inference products for various production scenarios. Our inference product line includes [Paddle Inference](https://paddle-inference.readthedocs.io/en/master/guides/introduction/index_intro.html): Native inference library for high-performance server and cloud inference; [Paddle Serving](https://github.com/PaddlePaddle/Serving): A service-oriented framework suitable for distributed and pipeline productions; [Paddle Lite](https://github.com/PaddlePaddle/Paddle-Lite): Ultra-Lightweight inference engine for mobile and IoT environments; [Paddle.js](https://www.paddlepaddle.org.cn/paddle/paddlejs): A frontend inference engine for browser and mini-apps. Furthermore, by great amounts of optimization with leading hardware in each scenario, Paddle inference engines outperform most of the other mainstream frameworks. - - **Industry-Oriented Models and Libraries with Open Source Repositories** PaddlePaddle includes and maintains more than 100 mainstream models that have been practiced and polished for a long time in the industry. Some of these models have won major prizes from key international competitions. In the meanwhile, PaddlePaddle has further more than 200 pre-training models (some of them with source codes) to facilitate the rapid development of industrial applications. 
[Click here to learn more](https://github.com/PaddlePaddle/models) - ## Documentation We provide [English](https://www.paddlepaddle.org.cn/documentation/docs/en/guides/index_en.html) and @@ -94,7 +91,7 @@ We provide [English](https://www.paddlepaddle.org.cn/documentation/docs/en/guide - Technical Organization: [Paddle Framework Contributor Club, PFCC](https://github.com/PaddlePaddle/community/tree/master/pfcc) - Community Governance Organization: [PaddlePaddle OpenSource Development Working Group, PPOSDWG](https://github.com/PaddlePaddle/community/tree/master/pposdwg) -- Community Blog: https://pfcc.blog/ +- Community Blog: ## Courses @@ -102,4 +99,5 @@ We provide [English](https://www.paddlepaddle.org.cn/documentation/docs/en/guide - [Edge Deployments](https://aistudio.baidu.com/aistudio/course/introduce/22690): Courses introducing edge deployments from mobile, IoT to web and applets. ## Copyright and License + PaddlePaddle is provided under the [Apache-2.0 license](LICENSE). diff --git a/README_cn.md b/README_cn.md index a13fa5ba214503..702aadbcbb32e7 100644 --- a/README_cn.md +++ b/README_cn.md @@ -22,13 +22,15 @@ 跟进PaddlePaddle最新特性请参考我们的[版本说明](https://github.com/PaddlePaddle/Paddle/releases) -### 安装最新稳定版本: -``` +### 安装最新稳定版本 + +``` sh # CPU pip install paddlepaddle # GPU pip install paddlepaddle-gpu ``` + 更多安装信息详见官网 [安装说明](https://www.paddlepaddle.org.cn/install/quick) PaddlePaddle用户可领取**免费Tesla V100在线算力资源**,训练模型更高效。**每日登陆即送8小时**,[前往使用免费算力](https://aistudio.baidu.com/aistudio/index)。 @@ -39,24 +41,20 @@ PaddlePaddle用户可领取**免费Tesla V100在线算力资源**,训练模型 飞桨深度学习框架采用基于编程逻辑的组网范式,对于普通开发者而言更容易上手,符合他们的开发习惯。同时支持声明式和命令式编程,兼具开发的灵活性和高性能。网络结构自动设计,模型效果超越人类专家。 - - **支持超大规模深度学习模型的训练** 飞桨突破了超大规模深度学习模型训练技术,实现了支持千亿特征、万亿参数、数百节点的开源大规模训练平台,攻克了超大规模深度学习模型的在线学习难题,实现了万亿规模参数模型的实时更新。 [查看详情](https://github.com/PaddlePaddle/Fleet) - - **支持多端多平台的高性能推理部署工具** 飞桨不仅广泛兼容第三方开源框架训练的模型部署,并且为不同的场景的生产环境提供了完备的推理引擎,包括适用于高性能服务器及云端推理的原生推理库 [Paddle Inference](https://www.paddlepaddle.org.cn/inference/product_introduction/inference_intro.html),面向分布式、流水线生产环境下自动上云、A/B测试等高阶功能的服务化推理框架 [Paddle Serving](https://github.com/PaddlePaddle/Serving),针对于移动端、物联网场景的轻量化推理引擎 [Paddle Lite](https://github.com/PaddlePaddle/Paddle-Lite),以及在浏览器、小程序等环境下使用的前端推理引擎 [Paddle.js](https://www.paddlepaddle.org.cn/paddle/paddlejs)。同时,透过与不同场景下的主流硬件高度适配优化及异构计算的支持, 飞桨的推理性能也领先绝大部分的主流实现。 - - **面向产业应用,开源开放覆盖多领域的工业级模型库。** 飞桨官方支持100多个经过产业实践长期打磨的主流模型,其中包括在国际竞赛中夺得冠军的模型;同时开源开放200多个预训练模型,助力快速的产业应用。 [查看详情](https://github.com/PaddlePaddle/models) - ## 文档 我们提供 [英文](https://www.paddlepaddle.org.cn/documentation/docs/en/guides/index_en.html) 和 [中文](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/index_cn.html) 文档 @@ -67,7 +65,6 @@ PaddlePaddle用户可领取**免费Tesla V100在线算力资源**,训练模型 - [API 文档](https://www.paddlepaddle.org.cn/documentation/docs/zh/api/index_cn.html):新的 API 支持代码更少更简洁的程序 - - [贡献方式](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/08_contribution/index_cn.html):参与飞桨社区开源贡献的指南 ## 开源社区 @@ -82,7 +79,7 @@ PaddlePaddle用户可领取**免费Tesla V100在线算力资源**,训练模型 - 技术交流组织:[飞桨核心框架贡献者俱乐部 PFCC](https://github.com/PaddlePaddle/community/tree/master/pfcc) - 社区治理组织:[飞桨社区开源发展工作组 PPOSDWG](https://github.com/PaddlePaddle/community/tree/master/pposdwg) -- 社区博客:https://pfcc.blog/ +- 社区博客: ## 课程 @@ -90,4 +87,5 @@ PaddlePaddle用户可领取**免费Tesla V100在线算力资源**,训练模型 - [端侧部署](https://aistudio.baidu.com/aistudio/course/introduce/22690):详细介绍端侧多场景部署实操,从移动端设备、IoT、网页到小程序部署 ## 版权和许可证 + PaddlePaddle由[Apache-2.0 license](LICENSE)提供 diff --git a/README_ja.md b/README_ja.md index 
22c78a1a79bbd9..34cf96185e905a 100644 --- a/README_ja.md +++ b/README_ja.md @@ -17,21 +17,22 @@ PaddlePaddle GitHub へようこそ。 PaddlePaddle は中国初の独立系 R&D ディープラーニングプラットフォームとして、2016年からプロのコミュニティに正式にオープンソース化されました。コアとなる深層学習フレームワーク、基本モデルライブラリ、エンドツーエンドの開発キット、ツール&コンポーネント、さらにサービスプラットフォームを網羅する、高度な技術と豊富な機能を備えた産業プラットフォームです。 PaddlePaddle は、工業化に対するコミットメントを持つ工業的実践から生まれたものです。製造業、農業、企業サービスなど幅広い分野で採用され、800万人以上の開発者、22万以上の企業、80万以上のモデルを生み出しています。それにより PaddlePaddle は、ますます多くのパートナーの AI 商用化を支援しています。 - ## インストール ### PaddlePaddle の最新リリース: [v2.5](https://github.com/PaddlePaddle/Paddle/tree/release/2.5) 私たちのビジョンは、PaddlePaddle を通じて、誰もが深層学習を行えるようにすることです。 PaddlePaddle の最新機能を追跡するために、私たちの[リリースのお知らせ](https://github.com/PaddlePaddle/Paddle/releases)を参照してください。 -### 最新の安定版リリースのインストール: -``` + +### 最新の安定版リリースのインストール + +``` sh # CPU pip install paddlepaddle # GPU pip install paddlepaddle-gpu - ``` + インストール方法については、[クイックインストール](https://www.paddlepaddle.org.cn/install/quick)をご覧ください この度、開発者の皆様が Tesla V100 のオンライン計算資源を無償で取得できるようになりました。AI Studio でプログラムを作成した場合、1日あたり8時間のオンライン学習が可能です。[スタートはこちら](https://aistudio.baidu.com/aistudio/index)。 @@ -42,24 +43,20 @@ pip install paddlepaddle-gpu PaddlePaddle ディープラーニングフレームワークは、ニューラルネットワークをアーキテクトするプログラマブルスキームを活用することで、技術的負担を軽減しながら開発を容易にする。宣言型プログラミングと命令型プログラミングの両方をサポートし、開発の柔軟性と高い実行性能を両立しています。 ニューラル・アーキテクチャは、アルゴリズムによって自動的に設計され、人間の専門家が設計したものよりも優れた性能を発揮する可能性があります。 - -- **ディープニューラルネットワークの超大規模学習をサポート** +- **ディープニューラルネットワークの超大規模学習をサポート** PaddlePaddle は、超大規模なディープニューラルネットワークのトレーニングでブレークスルーを起こしました。数百のノードに分散したデータソースを用いて、1000億の特徴量と数兆のパラメータを持つディープネットワークのトレーニングをサポートする、世界初の大規模オープンソース・トレーニング・プラットフォームを立ち上げたのです。PaddlePaddle は、超大規模ディープラーニングモデルのオンラインディープラーニングの課題を克服し、さらに1兆以上のパラメータでリアルタイムにモデル更新を実現しました。 [詳しくはこちら](https://github.com/PaddlePaddle/Fleet) - - **総合的な展開環境に対応した高性能推論エンジン** PaddlePaddle は、サードパーティのオープンソースフレームワークで学習されたモデルとの互換性があるだけでなく、様々な生産シナリオに対応した完全な推論エンジン、システム、スイートを提供しています。当社の推論エンジン、システム、スイートには、[Paddle Inference](https://paddle-inference.readthedocs.io/en/master/guides/introduction/index_intro.html) があります: [Paddle Serving](https://github.com/PaddlePaddle/Serving): 高性能なサーバーおよびクラウド推論用のネイティブ推論ライブラリ: [Paddle Serving](https://github.com/PaddlePaddle/Paddle-Lite): 分散型やパイプライン型プロダクションに適したサービス指向フレームワーク; [Paddle Lite](https://github.com/PaddlePaddle/Paddle-Lite): モバイルや IoT 環境向けの超軽量推論エンジン; [Paddle.js](https://www.paddlepaddle.org.cn/paddle/paddlejs): ブラウザやミニアプリのためのフロントエンド推論エンジンです。さらに、各シナリオの主要なハードウェアに最適化することで、Paddle の推論エンジンは他の主流フレームワークのほとんどを凌駕しています。 - - **オープンソースリポジトリによる業界指向のモデルやライブラリ** PaddlePaddle は、業界で長い間実践され、磨かれてきた100以上の主流モデルを含み、維持しています。これらのモデルの中には、主要な国際コンペティションで主要な賞を受賞したものもあります。一方、PaddlePaddle は、産業用アプリケーションの迅速な開発を促進するために、200以上のプレトレーニングモデル(そのうちのいくつかはソースコード付き)をさらに整備しています。 [詳しくはこちら](https://github.com/PaddlePaddle/models) - ## ドキュメント [英語](https://www.paddlepaddle.org.cn/documentation/docs/en/guides/index_en.html)と @@ -93,4 +90,5 @@ pip install paddlepaddle-gpu - [Edge Deployments](https://aistudio.baidu.com/aistudio/course/introduce/22690): モバイル、IoT から Web、アプレットまで、エッジの展開を紹介するコース。 ## Copyright とライセンス + PaddlePaddle は [Apache-2.0 license](LICENSE) の下で提供されています。 From 8a086c97df758adc98183b8b13643e592176ab66 Mon Sep 17 00:00:00 2001 From: cyber-pioneer <116002591+cyber-pioneer@users.noreply.github.com> Date: Tue, 12 Dec 2023 13:53:17 +0800 Subject: [PATCH 25/28] [Prim] add eager prim backward blacklist (#59891) * add eager prim backward blacklist * add test case --- .../generator/eager_gen.py | 4 +- .../prim/flags/test_eager_blacklist_flag.py | 64 +++++++++++++++++++ 2 files changed, 67 insertions(+), 1 deletion(-) create mode 100644 
test/prim/prim/flags/test_eager_blacklist_flag.py diff --git a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py index 2a96fddccbce70..60014305469805 100644 --- a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py @@ -2591,7 +2591,9 @@ def GenerateNodeDefinition( """ else: grad_function_call_str = f""" - if (paddle::prim::PrimCommonUtils::IsEagerPrimEnabled()) {{ + std::string grad_op_name = "{composite_grad_api_name}"; + auto need_skip = paddle::prim::StaticCompositeContext::Instance().CheckSkipCompOps(grad_op_name); + if (paddle::prim::PrimCommonUtils::IsEagerPrimEnabled() && !need_skip) {{ {indent}bool original_global_grad = egr::Controller::Instance().HasGrad(); {indent}if(!create_graph){{ {indent}{indent}egr::Controller::Instance().SetHasGrad(create_graph); diff --git a/test/prim/prim/flags/test_eager_blacklist_flag.py b/test/prim/prim/flags/test_eager_blacklist_flag.py new file mode 100644 index 00000000000000..ad21426b79ce07 --- /dev/null +++ b/test/prim/prim/flags/test_eager_blacklist_flag.py @@ -0,0 +1,64 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
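+
+# NOTE: the scenario exercised below: with prim enabled, silu_grad is
+# decomposed into composite ops, so its gradient only approximates the
+# fused kernel (allclose at 1e-6 but not bit-equal); once "silu_grad" is
+# blacklisted via core._set_prim_backward_blacklist, eager autograd falls
+# back to the original grad kernel and the result matches exactly.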
+ +import unittest + +import numpy as np + +import paddle +from paddle.base import core + +# core.set_prim_eager_enabled(True) + + +def fn(primal, cotangent): + primal = paddle.to_tensor(primal) + primal.stop_gradient = False + return paddle.grad( + paddle.nn.functional.silu(primal), primal, paddle.to_tensor(cotangent) + )[0] + + +class TestPrimFlags(unittest.TestCase): + def setUp(self): + paddle.seed(2022) + self.primal = paddle.to_tensor( + np.random.rand(100, 100).astype(np.float32) + ) + self.primal.stop_gradient = False + self.cotangent = paddle.to_tensor( + np.random.rand(100, 100).astype(np.float32) + ) + + def test_prim_flags(self): + origin = fn(self.primal, self.cotangent) + core.set_prim_eager_enabled(True) + actual1 = fn(self.primal, self.cotangent) + np.testing.assert_allclose(origin, actual1, atol=1e-6) + with self.assertRaises(AssertionError): + np.testing.assert_array_equal( + origin, + actual1, + ) + core._set_prim_backward_blacklist("silu_grad") + actual2 = fn(self.primal, self.cotangent) + + np.testing.assert_array_equal( + origin, + actual2, + ) + + +if __name__ == '__main__': + unittest.main() From fd3e9ef40077cae34988b84c05f5366fcd9ada4d Mon Sep 17 00:00:00 2001 From: Shijie Date: Tue, 12 Dec 2023 14:07:33 +0800 Subject: [PATCH 26/28] Integrate cudnn flash attention and add ir pass (#58680) --- paddle/fluid/framework/details/CMakeLists.txt | 4 + .../fluid/framework/details/build_strategy.cc | 7 + .../fluid/framework/details/build_strategy.h | 3 +- .../framework/distributed_strategy.proto | 1 + paddle/fluid/framework/ir/CMakeLists.txt | 7 + .../ir/fuse_dot_product_attention_pass.cc | 493 ++++ .../ir/fuse_dot_product_attention_pass.h | 120 + .../framework/ir/fuse_gemm_epilogue_pass.h | 11 +- .../framework/ir/graph_pattern_detector.cc | 314 +++ .../framework/ir/graph_pattern_detector.h | 86 + .../pir/dialect/op_generator/ops_api_gen.py | 2 + .../op_generator/vjp_interface_black_list.py | 1 + .../fused_dot_product_attention_pass.cc | 639 +++++ .../fusion/fused_dot_product_attention_pass.h | 26 + paddle/fluid/pybind/parallel_executor.cc | 28 + paddle/fluid/pybind/pir.cc | 2 + paddle/phi/api/yaml/fused_backward.yaml | 12 + paddle/phi/api/yaml/fused_ops.yaml | 12 + paddle/phi/infermeta/backward.h | 1 + paddle/phi/infermeta/fusion.cc | 250 +- paddle/phi/infermeta/fusion.h | 14 + paddle/phi/kernels/CMakeLists.txt | 7 +- .../gpu/fused_dot_product_attention_op.cu | 274 +++ .../phi/kernels/gpudnn/mha_cudnn_frontend.cu | 2172 +++++++++++++++++ .../phi/kernels/gpudnn/mha_cudnn_frontend.h | 105 + python/paddle/distributed/passes/cpp_pass.py | 13 + python/paddle/framework/ir.py | 3 + .../paddle/incubate/nn/functional/__init__.py | 1 + .../functional/fused_dot_product_attention.py | 110 + .../test_fused_dot_product_attention_pass.py | 142 ++ test/legacy_test/CMakeLists.txt | 9 + .../test_fuse_dot_product_attention_pass.py | 300 +++ .../test_fused_dot_product_attention_op.py | 388 +++ tools/gpups_test.sh | 3 + 34 files changed, 5457 insertions(+), 103 deletions(-) create mode 100644 paddle/fluid/framework/ir/fuse_dot_product_attention_pass.cc create mode 100644 paddle/fluid/framework/ir/fuse_dot_product_attention_pass.h create mode 100644 paddle/fluid/pir/transforms/fusion/fused_dot_product_attention_pass.cc create mode 100644 paddle/fluid/pir/transforms/fusion/fused_dot_product_attention_pass.h create mode 100644 paddle/phi/kernels/fusion/gpu/fused_dot_product_attention_op.cu create mode 100644 paddle/phi/kernels/gpudnn/mha_cudnn_frontend.cu create mode 100644 
paddle/phi/kernels/gpudnn/mha_cudnn_frontend.h create mode 100644 python/paddle/incubate/nn/functional/fused_dot_product_attention.py create mode 100644 test/ir/pir/fused_pass/test_fused_dot_product_attention_pass.py create mode 100644 test/legacy_test/test_fuse_dot_product_attention_pass.py create mode 100644 test/legacy_test/test_fused_dot_product_attention_op.py diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index f0c2b60f41b69d..cdd4b073d22ac2 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -414,6 +414,10 @@ set(IR_PASS_DEPS fused_feedforward_pass delete_dropout_op_pass) +if(WITH_CUDNN_FRONTEND) + set(IR_PASS_DEPS ${IR_PASS_DEPS} fuse_dot_product_attention_pass) +endif() + if(WITH_CINN) set(IR_PASS_DEPS ${IR_PASS_DEPS} paddle2cinn) endif() diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 5a6f4e6e70d4c1..70ef03cbcc0b39 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -182,6 +182,10 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { "mkldnn_placement_pass"); // 2. trainning pass +#ifdef PADDLE_WITH_CUDNN_FRONTEND + AppendPassWithCheck(strategy_.fuse_dot_product_attention_, + "fuse_dot_product_attention_pass"); +#endif AppendPassWithCheck(strategy_.fuse_relu_depthwise_conv_, "fuse_relu_depthwise_conv_pass"); AppendPassWithCheck(strategy_.fuse_bn_act_ops_, "fuse_bn_act_pass"); @@ -552,3 +556,6 @@ USE_PASS(fusion_group_pass); #if (defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060) USE_PASS(fuse_gemm_epilogue_pass); #endif +#ifdef PADDLE_WITH_CUDNN_FRONTEND +USE_PASS(fuse_dot_product_attention_pass); +#endif diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 203525d5a74821..ed79ab6617528a 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -136,7 +136,8 @@ struct BuildStrategy { // Fused feed forward bool fused_feedforward_{false}; bool sequential_run_{false}; - + // Fuse dot product attention + bool fuse_dot_product_attention_{false}; // mkldnn_enabled_op_types specify the operator type list to // use MKLDNN acceleration. 
It is null in default, means // that all the operators supported by MKLDNN will be diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index 9b57b88bd11f59..3adee2824036d7 100755 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -172,6 +172,7 @@ message BuildStrategy { optional string debug_graphviz_path = 17; optional bool fused_attention = 18 [ default = false]; optional bool fused_feedforward = 19 [ default = false]; + optional bool fuse_dot_product_attention = 20 [ default = false ]; } message ExecutionStrategy { diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 46183fd93e97fd..e1d146c367ae47 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -351,6 +351,13 @@ cc_library( SRCS fused_feedforward_pass.cc DEPS pass graph_pattern_detector) +if(WITH_CUDNN_FRONTEND) + cc_library( + fuse_dot_product_attention_pass + SRCS fuse_dot_product_attention_pass.cc + DEPS pass graph_pattern_detector) +endif() + set(GLOB_PASS_LIB ${INFER_IR_PASSES} CACHE INTERNAL "Global PASS library") diff --git a/paddle/fluid/framework/ir/fuse_dot_product_attention_pass.cc b/paddle/fluid/framework/ir/fuse_dot_product_attention_pass.cc new file mode 100644 index 00000000000000..273adfdcda096a --- /dev/null +++ b/paddle/fluid/framework/ir/fuse_dot_product_attention_pass.cc @@ -0,0 +1,493 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
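+//
+// Sketch of the rewrite implemented below (illustrative only; the exact
+// matching rules live in patterns::DotProductAttention and
+// patterns::DotProductAttentionGrad in graph_pattern_detector.cc):
+//
+//   q/k/v -> transpose2 -> (scale on q) -> matmul_v2(q, k) -> add(mask)
+//         -> softmax [-> dropout] -> matmul_v2(., v) -> transpose2 -> out
+//
+// The whole subgraph is replaced by one fused_dot_product_attention op with
+// inputs (q, k, v, mask) and outputs (out, softmax_out, rng_state); the
+// matching backward subgraph maps onto fused_dot_product_attention_grad.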
+ +#include "paddle/fluid/framework/ir/fuse_dot_product_attention_pass.h" +#include +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +void FuseDotProductAttentionPass::ApplyImpl(ir::Graph *graph) const { + std::unordered_set nodes_to_remove; + QKVCache qkv_cache; + MaskCache mask_cache; + OutputCache output_cache; + + graph = FuseDotProductAttentionFwd( + graph, true, &qkv_cache, &mask_cache, &output_cache, &nodes_to_remove); + + graph = FuseDotProductAttentionFwd( + graph, false, &qkv_cache, &mask_cache, &output_cache, &nodes_to_remove); + + graph = FuseDotProductAttentionBwd( + graph, true, &qkv_cache, &mask_cache, &output_cache, &nodes_to_remove); + + graph = FuseDotProductAttentionBwd( + graph, false, &qkv_cache, &mask_cache, &output_cache, &nodes_to_remove); + + GraphSafeRemoveNodes(graph, nodes_to_remove); +} + +ir::Graph *FuseDotProductAttentionPass::FuseDotProductAttentionFwd( + ir::Graph *graph, + bool with_dropout, + QKVCache *qkv_cache, + MaskCache *mask_cache, + OutputCache *output_cache, + std::unordered_set *nodes_to_remove) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + const std::string scope_name("dot_product_attention"); + FusePassBase::Init(scope_name, graph); + + GraphPatternDetector gpd; + patterns::DotProductAttention dot_product_attention_fwd_pattern( + gpd.mutable_pattern(), "dot_product_attention_fwd"); + + dot_product_attention_fwd_pattern(with_dropout); + + int found_pattern_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + VLOG(4) << "handle dot_product_attention fuse" + << " - with_dropout:" << with_dropout; + + QKVMetaData qkv_meta_data; + GET_IR_NODE_FROM_SUBGRAPH( + attn_q, attn_q, dot_product_attention_fwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + attn_k, attn_k, dot_product_attention_fwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + attn_v, attn_v, dot_product_attention_fwd_pattern); + qkv_meta_data.q_node = attn_q; + qkv_meta_data.k_node = attn_k; + qkv_meta_data.v_node = attn_v; + GET_IR_NODE_FROM_SUBGRAPH( + attn_q_transpose, attn_q_transpose, dot_product_attention_fwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + attn_k_transpose, attn_k_transpose, dot_product_attention_fwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + attn_v_transpose, attn_v_transpose, dot_product_attention_fwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH(attn_q_transpose_out, + attn_q_transpose_out, + dot_product_attention_fwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH(attn_q_transpose_xshape, + attn_q_transpose_xshape, + dot_product_attention_fwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH(attn_k_transpose_out, + attn_k_transpose_out, + dot_product_attention_fwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH(attn_k_transpose_xshape, + attn_k_transpose_xshape, + dot_product_attention_fwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH(attn_v_transpose_out, + attn_v_transpose_out, + dot_product_attention_fwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH(attn_v_transpose_xshape, + attn_v_transpose_xshape, + dot_product_attention_fwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + attn_q_scale, attn_q_scale, dot_product_attention_fwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + attn_q_scale_out, attn_q_scale_out, dot_product_attention_fwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + attn_qk_matmul, attn_qk_matmul, dot_product_attention_fwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH(attn_qk_matmul_out, + attn_qk_matmul_out, + 
dot_product_attention_fwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + attn_mask, attn_mask, dot_product_attention_fwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + attn_mask_cast1, attn_mask_cast1, dot_product_attention_fwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH(attn_mask_cast1_out, + attn_mask_cast1_out, + dot_product_attention_fwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + attn_mask_cast2, attn_mask_cast2, dot_product_attention_fwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH(attn_mask_cast2_out, + attn_mask_cast2_out, + dot_product_attention_fwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + attn_mask_scale1, attn_mask_scale1, dot_product_attention_fwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH(attn_mask_scale1_out, + attn_mask_scale1_out, + dot_product_attention_fwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + attn_mask_scale2, attn_mask_scale2, dot_product_attention_fwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH(attn_mask_scale2_out, + attn_mask_scale2_out, + dot_product_attention_fwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + attn_mask_eleadd, attn_mask_eleadd, dot_product_attention_fwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH(attn_mask_eleadd_out, + attn_mask_eleadd_out, + dot_product_attention_fwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + attn_softmax, attn_softmax, dot_product_attention_fwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + attn_softmax_out, attn_softmax_out, dot_product_attention_fwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH(attn_context_matmul, + attn_context_matmul, + dot_product_attention_fwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH(attn_context_matmul_out, + attn_context_matmul_out, + dot_product_attention_fwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + attn_transpose, attn_transpose, dot_product_attention_fwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH(attn_transpose_out, + attn_transpose_out, + dot_product_attention_fwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH(attn_transpose_xshape, + attn_transpose_xshape, + dot_product_attention_fwd_pattern); + + std::string mha_meta_key = GenerateMetaKey_(attn_q_scale_out->Name(), + attn_k_transpose_out->Name(), + attn_v_transpose_out->Name()); + + // To avoid duplicate conversion. + if (qkv_cache->Exist(mha_meta_key)) { + return; + } + + BlockDesc *block = attn_qk_matmul->Op()->Block(); + Attribute op_role = attn_qk_matmul->Op()->GetAttr("op_role"); + + // `attn_mask_scale2_out` can be easily detected in both fwd/bwd pass + // so we use `attn_mask_scale2_out` instead of `attn_mask` as the key to + // cache the mask_meta_data. 
+    // In bwd pass, its name is `attn_mask_eleadd_grad_mask`
+    MaskMetaData mask_meta_data;
+    auto mask_meta_key = attn_mask_scale2_out->Var()->Name();
+    if (!mask_cache->Exist(mask_meta_key)) {
+      mask_meta_data.mask_node = attn_mask;
+      mask_cache->Insert(mask_meta_key, mask_meta_data);
+    } else {
+      mask_meta_data = mask_cache->Get(mask_meta_key);
+    }
+
+    // create fused_dot_product_attention op
+    VarDesc softmax_out_desc(patterns::PDNodeName(scope_name, "softmax_out"));
+    softmax_out_desc.SetDataType(proto::VarType::FP16);
+    softmax_out_desc.SetLoDLevel(attn_softmax_out->Var()->GetLoDLevel());
+    auto *softmax_out_node = g->CreateVarNode(&softmax_out_desc);
+    VarDesc rng_state_desc(patterns::PDNodeName(scope_name, "rng_state"));
+    rng_state_desc.SetDataType(proto::VarType::INT64);
+    auto *rng_state_node = g->CreateVarNode(&rng_state_desc);
+    OpDesc dot_product_attention_fwd_op_desc(block);
+    dot_product_attention_fwd_op_desc.SetType("fused_dot_product_attention");
+    dot_product_attention_fwd_op_desc.SetInput("q",
+                                               {qkv_meta_data.q_node->Name()});
+    dot_product_attention_fwd_op_desc.SetInput("k",
+                                               {qkv_meta_data.k_node->Name()});
+    dot_product_attention_fwd_op_desc.SetInput("v",
+                                               {qkv_meta_data.v_node->Name()});
+    dot_product_attention_fwd_op_desc.SetInput(
+        "mask", {mask_meta_data.mask_node->Name()});
+    dot_product_attention_fwd_op_desc.SetOutput("out",
+                                                {attn_transpose_out->Name()});
+    dot_product_attention_fwd_op_desc.SetOutput("softmax_out",
+                                                {softmax_out_node->Name()});
+    dot_product_attention_fwd_op_desc.SetOutput("rng_state",
+                                                {rng_state_desc.Name()});
+    dot_product_attention_fwd_op_desc.SetAttr(
+        "scaling_factor",
+        PADDLE_GET_CONST(float, attn_q_scale->Op()->GetAttr("scale")));
+    dot_product_attention_fwd_op_desc.SetAttr("is_causal_masking", false);
+    dot_product_attention_fwd_op_desc.SetAttr("is_training", true);
+    dot_product_attention_fwd_op_desc.SetAttr("op_role", op_role);
+
+    if (with_dropout) {
+      GET_IR_NODE_FROM_SUBGRAPH(
+          attn_dropout, attn_dropout, dot_product_attention_fwd_pattern);
+      GET_IR_NODE_FROM_SUBGRAPH(attn_dropout_out,
+                                attn_dropout_out,
+                                dot_product_attention_fwd_pattern);
+      GET_IR_NODE_FROM_SUBGRAPH(attn_dropout_mask,
+                                attn_dropout_mask,
+                                dot_product_attention_fwd_pattern);
+      dot_product_attention_fwd_op_desc.SetAttr(
+          "dropout_probability",
+          PADDLE_GET_CONST(float, attn_dropout->Op()->GetAttr("dropout_prob")));
+      nodes_to_remove->insert(
+          {attn_dropout, attn_dropout_out, attn_dropout_mask});
+    } else {
+      dot_product_attention_fwd_op_desc.SetAttr("dropout_probability", 0.0f);
+    }
+    auto dot_product_attention_fwd_op_node =
+        g->CreateOpNode(&dot_product_attention_fwd_op_desc);
+
+    IR_NODE_LINK_TO(qkv_meta_data.q_node, dot_product_attention_fwd_op_node);
+    IR_NODE_LINK_TO(qkv_meta_data.k_node, dot_product_attention_fwd_op_node);
+    IR_NODE_LINK_TO(qkv_meta_data.v_node, dot_product_attention_fwd_op_node);
+    IR_NODE_LINK_TO(mask_meta_data.mask_node,
+                    dot_product_attention_fwd_op_node);
+    IR_NODE_LINK_TO(dot_product_attention_fwd_op_node, attn_transpose_out);
+    IR_NODE_LINK_TO(dot_product_attention_fwd_op_node, softmax_out_node);
+    IR_NODE_LINK_TO(dot_product_attention_fwd_op_node, rng_state_node);
+
+    qkv_cache->Insert(mha_meta_key, qkv_meta_data);
+    OutputMetaData output_meta_data;
+    output_meta_data.output_node = attn_transpose_out;
+    output_meta_data.softmax_output_node = softmax_out_node;
+    output_meta_data.rng_state_node = rng_state_node;
+    output_cache->Insert(mha_meta_key, output_meta_data);
+
+    nodes_to_remove->insert({attn_q_transpose,
attn_k_transpose, + attn_v_transpose, attn_q_transpose_out, + attn_k_transpose_out, attn_v_transpose_out, + attn_q_transpose_xshape, attn_k_transpose_xshape, + attn_v_transpose_xshape, attn_q_scale, + attn_q_scale_out, attn_qk_matmul, + attn_qk_matmul_out, attn_mask_cast1, + attn_mask_cast1_out, attn_mask_scale1, + attn_mask_scale1_out, attn_mask_scale2, + attn_mask_scale2_out, attn_mask_cast2, + attn_mask_cast2_out, attn_mask_eleadd, + attn_mask_eleadd_out, attn_softmax, + attn_softmax_out, attn_context_matmul, + attn_context_matmul_out, attn_transpose, + attn_transpose_xshape}); + + found_pattern_count++; + }; + + gpd(graph, handler); + AddStatis(found_pattern_count); + return graph; +} + +ir::Graph *FuseDotProductAttentionPass::FuseDotProductAttentionBwd( + ir::Graph *graph, + bool with_dropout, + QKVCache *qkv_cache, + MaskCache *mask_cache, + OutputCache *output_cache, + std::unordered_set *nodes_to_remove) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + const std::string scope_name("dot_product_attention"); + FusePassBase::Init(scope_name, graph); + + GraphPatternDetector gpd; + patterns::DotProductAttentionGrad dot_product_attention_bwd_pattern( + gpd.mutable_pattern(), "dot_product_attention_bwd"); + + dot_product_attention_bwd_pattern(with_dropout); + + int found_pattern_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + VLOG(4) << "handle MultiHeadAttnBwd fuse" + << " - with_dropout:" << with_dropout; + + GET_IR_NODE_FROM_SUBGRAPH( + attn_dout, attn_dout, dot_product_attention_bwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH(attn_transpose_grad, + attn_transpose_grad, + dot_product_attention_bwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH(attn_transpose_grad_out, + attn_transpose_grad_out, + dot_product_attention_bwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH(attn_context_matmul_grad_x, + attn_context_matmul_grad_x, + dot_product_attention_bwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH(attn_context_matmul_grad_y, + attn_context_matmul_grad_y, + dot_product_attention_bwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH(attn_context_matmul_grad, + attn_context_matmul_grad, + dot_product_attention_bwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH(attn_context_matmul_grad_dx, + attn_context_matmul_grad_dx, + dot_product_attention_bwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH(attn_context_matmul_grad_dy, + attn_context_matmul_grad_dy, + dot_product_attention_bwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH(attn_softmax_grad, + attn_softmax_grad, + dot_product_attention_bwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH(attn_softmax_grad_out, + attn_softmax_grad_out, + dot_product_attention_bwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH(attn_mask_eleadd_grad, + attn_mask_eleadd_grad, + dot_product_attention_bwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH(attn_mask_eleadd_grad_mask, + attn_mask_eleadd_grad_mask, + dot_product_attention_bwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH(attn_mask_eleadd_grad_dx, + attn_mask_eleadd_grad_dx, + dot_product_attention_bwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH(attn_qk_matmul_grad_x, + attn_qk_matmul_grad_x, + dot_product_attention_bwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH(attn_qk_matmul_grad_y, + attn_qk_matmul_grad_y, + dot_product_attention_bwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH(attn_qk_matmul_grad, + attn_qk_matmul_grad, + dot_product_attention_bwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH(attn_qk_matmul_grad_dx, + attn_qk_matmul_grad_dx, + dot_product_attention_bwd_pattern); + 
GET_IR_NODE_FROM_SUBGRAPH(attn_qk_matmul_grad_dy, + attn_qk_matmul_grad_dy, + dot_product_attention_bwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + attn_scale_grad, attn_scale_grad, dot_product_attention_bwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH(attn_scale_grad_out, + attn_scale_grad_out, + dot_product_attention_bwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH(attn_q_transpose_grad, + attn_q_transpose_grad, + dot_product_attention_bwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH(attn_k_transpose_grad, + attn_k_transpose_grad, + dot_product_attention_bwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH(attn_v_transpose_grad, + attn_v_transpose_grad, + dot_product_attention_bwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + attn_dq, attn_dq, dot_product_attention_bwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + attn_dk, attn_dk, dot_product_attention_bwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + attn_dv, attn_dv, dot_product_attention_bwd_pattern); + + std::string mha_meta_key = + GenerateMetaKey_(attn_qk_matmul_grad_x->Name(), + attn_qk_matmul_grad_y->Name(), + attn_context_matmul_grad_y->Name()); + if (!qkv_cache->Exist(mha_meta_key)) { + return; + } + auto mask_meta_data = + mask_cache->Get(attn_mask_eleadd_grad_mask->Var()->Name()); + auto qkv_meta_data = qkv_cache->Get(mha_meta_key); + auto output_meta_data = output_cache->Get(mha_meta_key); + + BlockDesc *block = attn_qk_matmul_grad->Op()->Block(); + Attribute op_role = attn_qk_matmul_grad->Op()->GetAttr("op_role"); + + // create fused_dot_product_attention_grad op + OpDesc dot_product_attention_bwd_op_desc(block); + dot_product_attention_bwd_op_desc.SetType( + "fused_dot_product_attention_grad"); + dot_product_attention_bwd_op_desc.SetInput("q", + {qkv_meta_data.q_node->Name()}); + dot_product_attention_bwd_op_desc.SetInput("k", + {qkv_meta_data.k_node->Name()}); + dot_product_attention_bwd_op_desc.SetInput("v", + {qkv_meta_data.v_node->Name()}); + dot_product_attention_bwd_op_desc.SetInput( + "out", {output_meta_data.output_node->Name()}); + dot_product_attention_bwd_op_desc.SetInput( + "softmax_out", {output_meta_data.softmax_output_node->Name()}); + dot_product_attention_bwd_op_desc.SetInput( + "rng_state", {output_meta_data.rng_state_node->Name()}); + dot_product_attention_bwd_op_desc.SetInput( + "mask", {mask_meta_data.mask_node->Name()}); + dot_product_attention_bwd_op_desc.SetInput(GradVarName("out"), + {attn_dout->Name()}); + dot_product_attention_bwd_op_desc.SetOutput(GradVarName("q"), + {attn_dq->Name()}); + dot_product_attention_bwd_op_desc.SetOutput(GradVarName("k"), + {attn_dk->Name()}); + dot_product_attention_bwd_op_desc.SetOutput(GradVarName("v"), + {attn_dv->Name()}); + dot_product_attention_bwd_op_desc.SetAttr( + "scaling_factor", + PADDLE_GET_CONST(float, attn_scale_grad->Op()->GetAttr("scale"))); + dot_product_attention_bwd_op_desc.SetAttr("is_training", true); + dot_product_attention_bwd_op_desc.SetAttr("is_causal_masking", false); + dot_product_attention_bwd_op_desc.SetAttr("op_role", op_role); + if (with_dropout) { + GET_IR_NODE_FROM_SUBGRAPH(attn_dropout_grad, + attn_dropout_grad, + dot_product_attention_bwd_pattern); + GET_IR_NODE_FROM_SUBGRAPH(attn_dropout_grad_out, + attn_dropout_grad_out, + dot_product_attention_bwd_pattern); + dot_product_attention_bwd_op_desc.SetAttr( + "dropout_probability", + PADDLE_GET_CONST(float, + attn_dropout_grad->Op()->GetAttr("dropout_prob"))); + nodes_to_remove->insert({attn_dropout_grad, attn_dropout_grad_out}); + } else { + dot_product_attention_bwd_op_desc.SetAttr("dropout_probability", 0.0f); + } + auto 
dot_product_attention_bwd_op_node = + g->CreateOpNode(&dot_product_attention_bwd_op_desc); + + IR_NODE_LINK_TO(attn_dout, dot_product_attention_bwd_op_node); + IR_NODE_LINK_TO(qkv_meta_data.q_node, dot_product_attention_bwd_op_node); + IR_NODE_LINK_TO(qkv_meta_data.k_node, dot_product_attention_bwd_op_node); + IR_NODE_LINK_TO(qkv_meta_data.v_node, dot_product_attention_bwd_op_node); + IR_NODE_LINK_TO(output_meta_data.output_node, + dot_product_attention_bwd_op_node); + IR_NODE_LINK_TO(output_meta_data.softmax_output_node, + dot_product_attention_bwd_op_node); + IR_NODE_LINK_TO(output_meta_data.rng_state_node, + dot_product_attention_bwd_op_node); + IR_NODE_LINK_TO(mask_meta_data.mask_node, + dot_product_attention_bwd_op_node); + + IR_NODE_LINK_TO(dot_product_attention_bwd_op_node, attn_dq); + IR_NODE_LINK_TO(dot_product_attention_bwd_op_node, attn_dk); + IR_NODE_LINK_TO(dot_product_attention_bwd_op_node, attn_dv); + + nodes_to_remove->insert( + {attn_transpose_grad, attn_transpose_grad_out, + attn_context_matmul_grad, attn_context_matmul_grad_x, + attn_context_matmul_grad_y, attn_context_matmul_grad_dx, + attn_context_matmul_grad_dy, attn_softmax_grad, + attn_softmax_grad_out, attn_mask_eleadd_grad, + attn_mask_eleadd_grad_mask, attn_mask_eleadd_grad_dx, + attn_qk_matmul_grad, attn_qk_matmul_grad_x, + attn_qk_matmul_grad_y, attn_qk_matmul_grad_dx, + attn_qk_matmul_grad_dy, attn_scale_grad, + attn_scale_grad_out, attn_q_transpose_grad, + attn_k_transpose_grad, attn_v_transpose_grad}); + + qkv_cache->Erase(mha_meta_key); + output_cache->Erase(mha_meta_key); + + found_pattern_count++; + }; + + gpd(graph, handler); + AddStatis(found_pattern_count); + return graph; +} + +std::string FuseDotProductAttentionPass::GenerateMetaKey_( + const std::string &q_name, + const std::string &k_name, + const std::string &v_name) const { + std::string concat_symbol = "|"; + return q_name + concat_symbol + k_name + concat_symbol + v_name; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(fuse_dot_product_attention_pass, + paddle::framework::ir::FuseDotProductAttentionPass); diff --git a/paddle/fluid/framework/ir/fuse_dot_product_attention_pass.h b/paddle/fluid/framework/ir/fuse_dot_product_attention_pass.h new file mode 100644 index 00000000000000..6ad5b41553877b --- /dev/null +++ b/paddle/fluid/framework/ir/fuse_dot_product_attention_pass.h @@ -0,0 +1,120 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
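+//
+// Usage sketch (based on the build_strategy changes in this patch; not a
+// definitive API reference): the pass only runs when the corresponding
+// build-strategy flag is set and Paddle is built with cuDNN Frontend:
+//
+//   paddle::framework::details::BuildStrategy strategy;
+//   strategy.fuse_dot_product_attention_ = true;
+//
+// With the flag set, ParallelExecutorPassBuilder appends
+// "fuse_dot_product_attention_pass" to the training pass list
+// (guarded by PADDLE_WITH_CUDNN_FRONTEND).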
+
+#pragma once
+
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+/*
+ * Fuse the dot product attention subgraph into fused_dot_product_attention
+ */
+class Graph;
+class Node;
+
+enum class AttentionType { kSelfAttention, kCrossAttention };
+
+class MaskMetaData {
+ public:
+  ir::Node *mask_node = nullptr;
+};
+class QKVMetaData {
+ public:
+  ir::Node *q_node = nullptr;
+  ir::Node *k_node = nullptr;
+  ir::Node *v_node = nullptr;
+};
+class OutputMetaData {
+ public:
+  ir::Node *output_node = nullptr;
+  ir::Node *softmax_output_node = nullptr;
+  ir::Node *rng_state_node = nullptr;
+};
+
+template <typename T, const char name[]>
+class OpCache {
+ public:
+  OpCache() {}
+  OpCache(const OpCache &) = delete;
+  void operator=(const OpCache &) = delete;
+
+  bool Exist(const std::string &key) const {
+    std::lock_guard<std::mutex> lock(mtx_);
+    return map_.count(key);
+  }
+
+  T Get(const std::string &key) const {
+    std::lock_guard<std::mutex> lock(mtx_);
+    return map_.find(key)->second;
+  }
+
+  void Insert(const std::string &key, const T &value) {
+    std::lock_guard<std::mutex> lock(mtx_);
+    map_[key] = value;
+  }
+
+  void Erase(const std::string &key) {
+    std::lock_guard<std::mutex> lock(mtx_);
+    map_.erase(key);
+  }
+
+ private:
+  std::unordered_map<std::string, T> map_;
+  mutable std::mutex mtx_;
+};
+
+const char NAME1[] = "MaskMetaData";
+const char NAME2[] = "QKVMetaData";
+const char NAME3[] = "OutputMetaData";
+using MaskCache = OpCache<MaskMetaData, NAME1>;
+using QKVCache = OpCache<QKVMetaData, NAME2>;
+using OutputCache = OpCache<OutputMetaData, NAME3>;
+
+class FuseDotProductAttentionPass : public FusePassBase {
+ public:
+  virtual ~FuseDotProductAttentionPass() {}
+
+ protected:
+  void ApplyImpl(ir::Graph *graph) const override;
+
+  ir::Graph *FuseDotProductAttentionFwd(
+      ir::Graph *graph,
+      bool with_dropout,
+      QKVCache *qkv_cache,
+      MaskCache *mask_cache,
+      OutputCache *output_cache,
+      std::unordered_set<const Node *> *nodes_to_remove) const;
+  ir::Graph *FuseDotProductAttentionBwd(
+      ir::Graph *graph,
+      bool with_dropout,
+      QKVCache *qkv_cache,
+      MaskCache *mask_cache,
+      OutputCache *output_cache,
+      std::unordered_set<const Node *> *nodes_to_remove) const;
+
+ private:
+  std::string GenerateMetaKey_(const std::string &q_name,
+                               const std::string &k_name,
+                               const std::string &v_name) const;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.h b/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.h
index 6ed57621061d0c..4b1b72f2c5fb48 100644
--- a/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.h
+++ b/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.h
@@ -41,7 +41,10 @@ class EpiloguePassActivationCache {
   void operator=(const EpiloguePassActivationCache &) = delete;
 
   bool HasFusedActivation(const std::string &key) const {
-    return fused_activation_space_map_.count(key);
+    mtx_.lock();
+    bool has = fused_activation_space_map_.count(key);
+    mtx_.unlock();
+    return has;
   }
 
   ir::Node *GetFusedActivationSpace(const std::string &key) {
@@ -54,9 +57,9 @@ class EpiloguePassActivationCache {
 
   void InsertFusedActivation(const std::string &key, ir::Node *const value) {
     if (!HasFusedActivation(key)) {
-      mtx.lock();
+      mtx_.lock();
       fused_activation_space_map_.insert({key, value});
-      mtx.unlock();
+      mtx_.unlock();
     } else {
       PADDLE_THROW(platform::errors::AlreadyExists(
           "The key (%d) of EpiloguePassActivationCache already exist.", key));
@@ -65,7 +68,7 @@ class EpiloguePassActivationCache {
 private:
std::unordered_map fused_activation_space_map_; - std::mutex mtx; + mutable std::mutex mtx_; }; class FuseGemmEpiloguePass : public FusePassBase { diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 038fc0d686caa5..ad6b43db840976 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -2443,6 +2443,320 @@ PDNode *patterns::ConvElementwiseaddAct::operator()( return act_out; } +PDNode *patterns::DotProductAttention::operator()(bool with_dropout) { + // Attention Computing + auto *attn_q = pattern->NewNode(attn_q_repr()) + ->AsInput() + ->assert_is_op_output("reshape2", "Out") + ->assert_is_op_input("transpose2", "X"); + auto *attn_k = pattern->NewNode(attn_k_repr()) + ->AsInput() + ->assert_is_op_output("reshape2", "Out") + ->assert_is_op_input("transpose2", "X"); + auto *attn_v = pattern->NewNode(attn_v_repr()) + ->AsInput() + ->assert_is_op_output("reshape2", "Out") + ->assert_is_op_input("transpose2", "X"); + + auto *attn_q_transpose = + pattern->NewNode(attn_q_transpose_repr())->assert_is_op("transpose2"); + auto *attn_k_transpose = + pattern->NewNode(attn_k_transpose_repr())->assert_is_op("transpose2"); + auto *attn_v_transpose = + pattern->NewNode(attn_v_transpose_repr())->assert_is_op("transpose2"); + + auto *attn_q_transpose_out_var = + pattern->NewNode(attn_q_transpose_out_repr()) + ->assert_is_op_output("transpose2", "Out") + ->assert_is_op_input("scale", "X"); + auto *attn_k_transpose_out_var = + pattern->NewNode(attn_k_transpose_out_repr()) + ->assert_is_op_output("transpose2", "Out") + ->assert_is_op_input("matmul_v2", "Y"); + auto *attn_v_transpose_out_var = + pattern->NewNode(attn_v_transpose_out_repr()) + ->assert_is_op_output("transpose2", "Out") + ->assert_is_op_input("matmul_v2", "Y"); + auto *attn_q_transpose_xshape_var = + pattern->NewNode(attn_q_transpose_xshape_repr()) + ->assert_is_op_output("transpose2", "XShape"); + auto *attn_k_transpose_xshape_var = + pattern->NewNode(attn_k_transpose_xshape_repr()) + ->assert_is_op_output("transpose2", "XShape"); + auto *attn_v_transpose_xshape_var = + pattern->NewNode(attn_v_transpose_xshape_repr()) + ->assert_is_op_output("transpose2", "XShape"); + attn_q_transpose->LinksFrom({attn_q}).LinksTo( + {attn_q_transpose_out_var, attn_q_transpose_xshape_var}); + attn_k_transpose->LinksFrom({attn_k}).LinksTo( + {attn_k_transpose_out_var, attn_k_transpose_xshape_var}); + attn_v_transpose->LinksFrom({attn_v}).LinksTo( + {attn_v_transpose_out_var, attn_v_transpose_xshape_var}); + + auto *attn_q_scale = + pattern->NewNode(attn_q_scale_repr())->assert_is_op("scale"); + auto *attn_q_scale_out_var = pattern->NewNode(attn_q_scale_out_repr()) + ->assert_is_op_output("scale", "Out") + ->assert_is_op_input("matmul_v2", "X"); + attn_q_scale->LinksFrom({attn_q_transpose_out_var}) + .LinksTo({attn_q_scale_out_var}); + + auto *attn_qk_matmul = pattern->NewNode(attn_qk_matmul_repr()) + ->assert_is_op("matmul_v2") + ->assert_op_attr("trans_x", false) + ->assert_op_attr("trans_y", true); + auto *attn_qk_matmul_out_var = + pattern->NewNode(attn_qk_matmul_out_repr()) + ->assert_is_op_output("matmul_v2", "Out") + ->assert_is_op_input("elementwise_add", "X"); + attn_qk_matmul->LinksFrom({attn_q_scale_out_var, attn_k_transpose_out_var}) + .LinksTo({attn_qk_matmul_out_var}); + + auto *attn_mask_var = + pattern->NewNode(attn_mask_repr())->assert_is_op_input("cast", "X"); + auto *attn_mask_cast1 = + 
pattern->NewNode(attn_mask_cast1_repr())->assert_is_op("cast"); + auto *attn_mask_cast1_out_var = pattern->NewNode(attn_mask_cast1_out_repr()) + ->assert_is_op_output("cast", "Out") + ->assert_is_op_input("cast", "X"); + attn_mask_cast1->LinksFrom({attn_mask_var}) + .LinksTo({attn_mask_cast1_out_var}); + + auto *attn_mask_cast2 = + pattern->NewNode(attn_mask_cast2_repr())->assert_is_op("cast"); + auto *attn_mask_cast2_out_var = pattern->NewNode(attn_mask_cast2_out_repr()) + ->assert_is_op_output("cast", "Out") + ->assert_is_op_input("scale", "X"); + attn_mask_cast2->LinksFrom({attn_mask_cast1_out_var}) + .LinksTo({attn_mask_cast2_out_var}); + + auto *attn_mask_scale1 = + pattern->NewNode(attn_mask_scale1_repr())->assert_is_op("scale"); + auto *attn_mask_scale1_out_var = pattern->NewNode(attn_mask_scale1_out_repr()) + ->assert_is_op_output("scale", "Out") + ->assert_is_op_input("scale", "X"); + attn_mask_scale1->LinksFrom({attn_mask_cast2_out_var}) + .LinksTo({attn_mask_scale1_out_var}); + + auto *attn_mask_scale2 = + pattern->NewNode(attn_mask_scale2_repr())->assert_is_op("scale"); + auto *attn_mask_scale2_out_var = + pattern->NewNode(attn_mask_scale2_out_repr()) + ->assert_is_op_output("scale", "Out") + ->assert_is_op_input("elementwise_add", "Y"); + attn_mask_scale2->LinksFrom({attn_mask_scale1_out_var}) + .LinksTo({attn_mask_scale2_out_var}); + + auto *attn_mask_eleadd = pattern->NewNode(attn_mask_eleadd_repr()) + ->assert_is_op("elementwise_add"); + auto *attn_mask_eleadd_out_var = + pattern->NewNode(attn_mask_eleadd_out_repr()) + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("softmax", "X"); + attn_mask_eleadd + ->LinksFrom({attn_mask_scale2_out_var, attn_qk_matmul_out_var}) + .LinksTo({attn_mask_eleadd_out_var}); + + auto *attn_softmax = + pattern->NewNode(attn_softmax_repr())->assert_is_op("softmax"); + auto *attn_softmax_out_var = pattern->NewNode(attn_softmax_out_repr()) + ->assert_is_op_output("softmax", "Out"); + attn_softmax->LinksFrom({attn_mask_eleadd_out_var}) + .LinksTo({attn_softmax_out_var}); + + auto *attn_context_matmul_input = attn_softmax_out_var; + if (with_dropout) { + attn_softmax_out_var->assert_is_op_input("dropout", "X"); + auto *attn_dropout = + pattern->NewNode(attn_dropout_repr())->assert_is_op("dropout"); + auto *attn_dropout_out_var = pattern->NewNode(attn_dropout_out_repr()) + ->assert_is_op_output("dropout", "Out") + ->assert_is_op_input("matmul_v2", "X"); + auto *attn_dropout_mask_var = pattern->NewNode(attn_dropout_mask_repr()) + ->assert_is_op_output("dropout", "Mask"); + attn_dropout->LinksFrom({attn_softmax_out_var}) + .LinksTo({attn_dropout_out_var, attn_dropout_mask_var}); + attn_context_matmul_input = attn_dropout_out_var; + } else { + attn_softmax_out_var->assert_is_op_input("matmul_v2", "X"); + } + + auto *attn_context_matmul = + pattern->NewNode(attn_context_matmul_repr())->assert_is_op("matmul_v2"); + auto *attn_context_matmul_out_var = + pattern->NewNode(attn_context_matmul_out_repr()) + ->assert_is_op_output("matmul_v2", "Out") + ->assert_is_op_input("transpose2", "X"); + attn_context_matmul + ->LinksFrom({attn_context_matmul_input, attn_v_transpose_out_var}) + .LinksTo({attn_context_matmul_out_var}); + + auto *attn_transpose = + pattern->NewNode(attn_transpose_repr())->assert_is_op("transpose2"); + auto *attn_transpose_out_var = pattern->NewNode(attn_transpose_out_repr()) + ->assert_is_op_output("transpose2", "Out") + ->assert_is_op_input("reshape2", "X"); + attn_transpose->LinksFrom({attn_context_matmul_out_var}) + 
.LinksTo({attn_transpose_out_var}); + auto *attn_transpose_xshape_var = + pattern->NewNode(attn_transpose_xshape_repr()) + ->assert_is_op_output("transpose2", "XShape"); + attn_transpose->LinksFrom({attn_context_matmul_out_var}) + .LinksTo({attn_transpose_out_var, attn_transpose_xshape_var}); + + return attn_transpose_out_var; +} + +PDNode *patterns::DotProductAttentionGrad::operator()(bool with_dropout) { + auto *attn_dout_var = + pattern->NewNode(attn_dout_repr()) + ->AsInput() + ->assert_is_op_input("transpose2_grad", GradVarName("Out")) + ->assert_is_op_output("reshape2_grad", GradVarName("X")); + auto *attn_transpose_grad = pattern->NewNode(attn_transpose_grad_repr()) + ->assert_is_op("transpose2_grad"); + auto *attn_transpose_grad_out_var = + pattern->NewNode(attn_transpose_grad_out_repr()) + ->assert_is_op_output("transpose2_grad", GradVarName("X")); + attn_transpose_grad->LinksFrom({attn_dout_var}) + .LinksTo({attn_transpose_grad_out_var}); + + attn_transpose_grad_out_var->assert_is_op_input("matmul_v2_grad", + GradVarName("Out")); + auto *attn_context_matmul_grad_x_var = + pattern->NewNode(attn_context_matmul_grad_x_repr()) + ->assert_is_op_input("matmul_v2_grad", "X"); + auto *attn_context_matmul_grad_y_var = + pattern->NewNode(attn_context_matmul_grad_y_repr()) + ->assert_is_op_input("matmul_v2_grad", "Y"); + auto *attn_context_matmul_grad = + pattern->NewNode(attn_context_matmul_grad_repr()) + ->assert_is_op("matmul_v2_grad"); + auto *attn_context_matmul_grad_dx_var = + pattern->NewNode(attn_context_matmul_grad_dx_repr()) + ->assert_is_op_output("matmul_v2_grad", GradVarName("X")); + auto *attn_context_matmul_grad_dy_var = + pattern->NewNode(attn_context_matmul_grad_dy_repr()) + ->assert_is_op_output("matmul_v2_grad", GradVarName("Y")); + attn_context_matmul_grad + ->LinksFrom({attn_transpose_grad_out_var, + attn_context_matmul_grad_x_var, + attn_context_matmul_grad_y_var}) + .LinksTo( + {attn_context_matmul_grad_dx_var, attn_context_matmul_grad_dy_var}); + + PDNode *attn_softmax_grad_input = nullptr; + PDNode *attn_softmax_out_var = nullptr; + if (with_dropout) { + auto *attn_dropout_grad = pattern->NewNode(attn_dropout_grad_repr()) + ->assert_is_op("dropout_grad"); + auto *attn_dropout_grad_out_var = + pattern->NewNode(attn_dropout_grad_out_repr()) + ->assert_is_op_output("dropout_grad", GradVarName("X")); + attn_context_matmul_grad_dx_var->assert_is_op_input("dropout_grad", + GradVarName("Out")); + attn_dropout_grad->LinksFrom({attn_context_matmul_grad_dx_var}) + .LinksTo({attn_dropout_grad_out_var}); + attn_softmax_grad_input = attn_dropout_grad_out_var; + attn_softmax_out_var = pattern->NewNode(attn_softmax_out_repr()); + + } else { + attn_context_matmul_grad_dx_var->assert_is_op_input("softmax_grad", + GradVarName("Out")); + attn_softmax_grad_input = attn_context_matmul_grad_dx_var; + attn_softmax_out_var = attn_context_matmul_grad_x_var; + } + attn_softmax_out_var->assert_is_op_input("softmax_grad", "Out"); + + auto *attn_softmax_grad = + pattern->NewNode(attn_softmax_grad_repr())->assert_is_op("softmax_grad"); + auto *attn_softmax_grad_out_var = + pattern->NewNode(attn_softmax_grad_out_repr()) + ->assert_is_op_output("softmax_grad", GradVarName("X")); + attn_softmax_grad->LinksFrom({attn_softmax_out_var, attn_softmax_grad_input}) + .LinksTo({attn_softmax_grad_out_var}); + + attn_softmax_grad_out_var->assert_is_op_input("elementwise_add_grad", + GradVarName("Out")); + auto *attn_mask_eleadd_grad_mask_var = + pattern->NewNode(attn_mask_eleadd_grad_mask_repr()) + 
->assert_is_op_input("elementwise_add_grad", "Y"); + auto *attn_mask_eleadd_grad = pattern->NewNode(attn_mask_eleadd_grad_repr()) + ->assert_is_op("elementwise_add_grad"); + auto *attn_mask_eleadd_grad_dx_var = + pattern->NewNode(attn_mask_eleadd_grad_dx_repr()) + ->assert_is_op_output("elementwise_add_grad", GradVarName("X")); + attn_mask_eleadd_grad + ->LinksFrom({attn_softmax_grad_out_var, attn_mask_eleadd_grad_mask_var}) + .LinksTo({attn_mask_eleadd_grad_dx_var}); + + attn_mask_eleadd_grad_dx_var->assert_is_op_input("matmul_v2_grad", + GradVarName("Out")); + auto *attn_qk_matmul_grad_x_var = + pattern->NewNode(attn_qk_matmul_grad_x_repr()) + ->assert_is_op_input("matmul_v2_grad", "X"); + auto *attn_qk_matmul_grad_y_var = + pattern->NewNode(attn_qk_matmul_grad_y_repr()) + ->assert_is_op_input("matmul_v2_grad", "Y"); + auto *attn_qk_matmul_grad = pattern->NewNode(attn_qk_matmul_grad_repr()) + ->assert_is_op("matmul_v2_grad"); + auto *attn_qk_matmul_grad_dx_var = + pattern->NewNode(attn_qk_matmul_grad_dx_repr()) + ->assert_is_op_output("matmul_v2_grad", GradVarName("X")); + auto *attn_qk_matmul_grad_dy_var = + pattern->NewNode(attn_qk_matmul_grad_dy_repr()) + ->assert_is_op_output("matmul_v2_grad", GradVarName("Y")); + attn_qk_matmul_grad + ->LinksFrom({attn_mask_eleadd_grad_dx_var, + attn_qk_matmul_grad_x_var, + attn_qk_matmul_grad_y_var}) + .LinksTo({attn_qk_matmul_grad_dx_var, attn_qk_matmul_grad_dy_var}); + + attn_qk_matmul_grad_dx_var->assert_is_op_input("scale", "X"); + auto *attn_scale_grad = + pattern->NewNode(attn_scale_grad_repr())->assert_is_op("scale"); + auto *attn_scale_grad_out_var = pattern->NewNode(attn_scale_grad_out_repr()) + ->assert_is_op_output("scale", "Out"); + attn_scale_grad->LinksFrom({attn_qk_matmul_grad_dx_var}) + .LinksTo({attn_scale_grad_out_var}); + + attn_scale_grad_out_var->assert_is_op_input("transpose2_grad", + GradVarName("Out")); + + // q -> transpose2_grad -> reshape2_grad + auto *attn_q_transpose_grad = pattern->NewNode(attn_q_transpose_grad_repr()) + ->assert_is_op("transpose2_grad"); + auto *attn_dq = pattern->NewNode(attn_dq_repr()) + ->assert_is_op_output("transpose2_grad", GradVarName("X")) + ->assert_is_op_input("reshape2_grad", GradVarName("Out")); + attn_q_transpose_grad->LinksFrom({attn_scale_grad_out_var}) + .LinksTo({attn_dq}); + + // k -> transpose2_grad -> reshape2_grad + attn_qk_matmul_grad_dy_var->assert_is_op_input("transpose2_grad", + GradVarName("Out")); + auto *attn_k_transpose_grad = pattern->NewNode(attn_k_transpose_grad_repr()) + ->assert_is_op("transpose2_grad"); + auto *attn_dk = pattern->NewNode(attn_dk_repr()) + ->assert_is_op_output("transpose2_grad", GradVarName("X")) + ->assert_is_op_input("reshape2_grad", GradVarName("Out")); + attn_k_transpose_grad->LinksFrom({attn_qk_matmul_grad_dy_var}) + .LinksTo({attn_dk}); + + // v -> transpose2_grad -> slice_grad + attn_context_matmul_grad_dy_var->assert_is_op_input("transpose2_grad", + GradVarName("Out")); + auto *attn_v_transpose_grad = pattern->NewNode(attn_v_transpose_grad_repr()) + ->assert_is_op("transpose2_grad"); + auto *attn_dv = pattern->NewNode(attn_dv_repr()) + ->assert_is_op_output("transpose2_grad", GradVarName("X")) + ->assert_is_op_input("reshape2_grad", GradVarName("Out")); + attn_v_transpose_grad->LinksFrom({attn_context_matmul_grad_dy_var}) + .LinksTo({attn_dv}); + + return attn_dq; +} + PDNode *patterns::VitAttention::operator()(PDNode *in) { in->AsInput(); std::unordered_set matmul_ops{"matrix_multiply"}; diff --git 
a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 37b5fbc2ac78e3..6aa9cb54ccbaff 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -987,6 +987,92 @@ struct LinearAct : public PatternBase { PATTERN_DECL_NODE(act_out); }; +struct DotProductAttention : public PatternBase { + DotProductAttention(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "dot_product_attention_fwd") {} + + PDNode* operator()(bool with_dropout); + // declare operator node's name for Attention Computing + PATTERN_DECL_NODE(attn_q_transpose); + PATTERN_DECL_NODE(attn_k_transpose); + PATTERN_DECL_NODE(attn_v_transpose); + PATTERN_DECL_NODE(attn_q_scale); + PATTERN_DECL_NODE(attn_qk_matmul); + PATTERN_DECL_NODE(attn_mask_cast1); + PATTERN_DECL_NODE(attn_mask_scale1); + PATTERN_DECL_NODE(attn_mask_scale2); + PATTERN_DECL_NODE(attn_mask_cast2); + PATTERN_DECL_NODE(attn_mask_eleadd); + PATTERN_DECL_NODE(attn_softmax); + PATTERN_DECL_NODE(attn_dropout); + PATTERN_DECL_NODE(attn_context_matmul); + PATTERN_DECL_NODE(attn_transpose); + // declare variable node's name for Attention Computing + + PATTERN_DECL_NODE(attn_q); + PATTERN_DECL_NODE(attn_k); + PATTERN_DECL_NODE(attn_v); + PATTERN_DECL_NODE(attn_q_transpose_out); + PATTERN_DECL_NODE(attn_q_transpose_xshape); + PATTERN_DECL_NODE(attn_k_transpose_out); + PATTERN_DECL_NODE(attn_k_transpose_xshape); + PATTERN_DECL_NODE(attn_v_transpose_out); + PATTERN_DECL_NODE(attn_v_transpose_xshape); + PATTERN_DECL_NODE(attn_q_scale_out); + PATTERN_DECL_NODE(attn_qk_matmul_out); + PATTERN_DECL_NODE(attn_mask); + PATTERN_DECL_NODE(attn_mask_cast1_out); + PATTERN_DECL_NODE(attn_mask_scale1_out); + PATTERN_DECL_NODE(attn_mask_scale2_out); + PATTERN_DECL_NODE(attn_mask_cast2_out); + PATTERN_DECL_NODE(attn_mask_eleadd_out); + PATTERN_DECL_NODE(attn_softmax_out); + PATTERN_DECL_NODE(attn_dropout_out); + PATTERN_DECL_NODE(attn_dropout_mask); + PATTERN_DECL_NODE(attn_context_matmul_out); + PATTERN_DECL_NODE(attn_transpose_out); + PATTERN_DECL_NODE(attn_transpose_xshape); +}; + +struct DotProductAttentionGrad : public PatternBase { + DotProductAttentionGrad(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "dot_product_attention_bwd") {} + + PDNode* operator()(bool with_dropout); + + // declare operator node's name for grad of Attention Computing + PATTERN_DECL_NODE(attn_transpose_grad); + PATTERN_DECL_NODE(attn_context_matmul_grad); + PATTERN_DECL_NODE(attn_dropout_grad); + PATTERN_DECL_NODE(attn_softmax_grad); + PATTERN_DECL_NODE(attn_mask_eleadd_grad); + PATTERN_DECL_NODE(attn_qk_matmul_grad); + PATTERN_DECL_NODE(attn_scale_grad); + PATTERN_DECL_NODE(attn_q_transpose_grad); + PATTERN_DECL_NODE(attn_k_transpose_grad); + PATTERN_DECL_NODE(attn_v_transpose_grad); + // declare variable node's name for grad of Attention Computing + PATTERN_DECL_NODE(attn_dout); + PATTERN_DECL_NODE(attn_transpose_grad_out); + PATTERN_DECL_NODE(attn_context_matmul_grad_x); + PATTERN_DECL_NODE(attn_context_matmul_grad_y); + PATTERN_DECL_NODE(attn_context_matmul_grad_dx); + PATTERN_DECL_NODE(attn_context_matmul_grad_dy); + PATTERN_DECL_NODE(attn_dropout_grad_out); + PATTERN_DECL_NODE(attn_softmax_out); + PATTERN_DECL_NODE(attn_softmax_grad_out); + PATTERN_DECL_NODE(attn_mask_eleadd_grad_mask); + PATTERN_DECL_NODE(attn_mask_eleadd_grad_dx); + PATTERN_DECL_NODE(attn_qk_matmul_grad_x); + 
PATTERN_DECL_NODE(attn_qk_matmul_grad_y); + PATTERN_DECL_NODE(attn_qk_matmul_grad_dx); + PATTERN_DECL_NODE(attn_qk_matmul_grad_dy); + PATTERN_DECL_NODE(attn_scale_grad_out); + PATTERN_DECL_NODE(attn_dq); + PATTERN_DECL_NODE(attn_dk); + PATTERN_DECL_NODE(attn_dv); +}; + // The following patterns are used to fuse linear_grad and act_grad (ReLu or // GeLU) // formula: the backward of F.linear( act(x) ) diff --git a/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py b/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py index 0b19175c9306db..e669a0bbdf5524 100644 --- a/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py @@ -77,6 +77,7 @@ 'fused_scale_bias_relu_conv_bn', 'fused_scale_bias_add_relu', 'fused_dconv_drelu_dbn', + 'fused_dot_product_attention', 'fusion_transpose_flatten_concat', 'skip_layernorm', 'generate_sequence_xpu', @@ -122,6 +123,7 @@ 'fused_scale_bias_relu_conv_bn', 'fused_scale_bias_add_relu', 'fused_dconv_drelu_dbn', + 'fused_dot_product_attention', 'recv_v2', 'rnn_', 'seed', diff --git a/paddle/fluid/pir/dialect/op_generator/vjp_interface_black_list.py b/paddle/fluid/pir/dialect/op_generator/vjp_interface_black_list.py index faf4df78cbdd71..9551bfc425ebc6 100644 --- a/paddle/fluid/pir/dialect/op_generator/vjp_interface_black_list.py +++ b/paddle/fluid/pir/dialect/op_generator/vjp_interface_black_list.py @@ -28,4 +28,5 @@ 'fused_dropout_add', 'fused_rotary_position_embedding', 'fused_bias_dropout_residual_layer_norm', + 'fused_dot_product_attention', ] diff --git a/paddle/fluid/pir/transforms/fusion/fused_dot_product_attention_pass.cc b/paddle/fluid/pir/transforms/fusion/fused_dot_product_attention_pass.cc new file mode 100644 index 00000000000000..639c0e0e4b4140 --- /dev/null +++ b/paddle/fluid/pir/transforms/fusion/fused_dot_product_attention_pass.cc @@ -0,0 +1,639 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
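+//
+// Schematic of the DRR rewrites in this file (illustrative; tensor and
+// attribute names follow the patterns defined below):
+//
+//   SourcePattern: transpose(q/k/v) -> scale(q) -> matmul(q, k^T)
+//                  -> add(scaled mask) -> softmax -> matmul(., v)
+//                  -> transpose -> out
+//   ResultPattern: fused_dot_product_attention(q, k, v, mask)
+//                  -> {out, softmax_aux, rng_state}
+//
+// The grad pattern additionally matches the corresponding backward ops and
+// rewrites them to fused_dot_product_attention_grad.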
+ +#include "paddle/fluid/pir/transforms/fusion/fused_dot_product_attention_pass.h" +#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" +#include "paddle/fluid/pir/drr/api/drr_pattern_base.h" +#include "paddle/pir/pass/pass.h" +#include "paddle/pir/pass/pass_registry.h" +#include "paddle/pir/pattern_rewrite/pattern_rewrite_driver.h" + +namespace { + +class FusedDotProductAttentionPattern + : public pir::drr::DrrPatternBase { + public: + void operator()(pir::drr::DrrPatternContext *ctx) const override { + pir::drr::SourcePattern src = ctx->SourcePattern(); + + // q[b, s, head, head_dim] -> transpose -> q[b, head, s, head_dim] -> scale + const auto &q_transpose = src.Op("pd_op.transpose"); + src.Tensor("q_transpose_out") = q_transpose(src.Tensor("q")); + // k[b, s, head, head_dim] -> transpose -> k[b, head, s, head_dim] + const auto &k_transpose = src.Op("pd_op.transpose"); + src.Tensor("k_transpose_out") = k_transpose(src.Tensor("k")); + // v[b, s, head, head_dim] -> transpose -> v[b, head, s, head_dim] + const auto &v_transpose = src.Op("pd_op.transpose"); + src.Tensor("v_transpose_out") = v_transpose(src.Tensor("v")); + + const auto &q_scale_full = + src.Op("pd_op.full", {{"value", src.Attr("q_scale_value")}}); + src.Tensor("q_scale_full_out") = q_scale_full(); + const auto &q_scale = src.Op("pd_op.scale"); + src.Tensor("q_scale_out") = + q_scale(src.Tensor("q_transpose_out"), src.Tensor("q_scale_full_out")); + + const auto &qk_matmul = + src.Op("pd_op.matmul", + {{"transpose_x", src.Attr("qk_matmul_transpose_x")}, + {"transpose_y", src.Attr("qk_matmul_transpose_y")}}); + src.Tensor("qk_matmul_out") = + qk_matmul(src.Tensor("q_scale_out"), src.Tensor("k_transpose_out")); + + // mask(int) -> cast -> cast -> scale -> scale -> mask(fp16) + const auto &mask_cast1 = src.Op("pd_op.cast"); + src.Tensor("mask_cast1_out") = mask_cast1(src.Tensor("mask")); + const auto &mask_full1 = + src.Op("pd_op.full", {{"value", src.Attr("mask_scale1_value")}}); + const auto &mask_scale1 = src.Op("pd_op.scale"); + src.Tensor("mask_scale1_out") = + mask_scale1(src.Tensor("mask_cast1_out"), mask_full1()); + const auto &mask_full2 = + src.Op("pd_op.full", {{"value", src.Attr("mask_scale2_value")}}); + const auto &mask_scale2 = src.Op("pd_op.scale"); + src.Tensor("mask_scale2_out") = + mask_scale2(src.Tensor("mask_scale1_out"), mask_full2()); + + // softmax(qk)v + const auto &mask_add = src.Op("pd_op.add"); + src.Tensor("mask_add_out") = + mask_add(src.Tensor("qk_matmul_out"), src.Tensor("mask_scale2_out")); + const auto &softmax = + src.Op("pd_op.softmax", {{"axis", src.Attr("softmax_axis")}}); + src.Tensor("softmax_out") = softmax(src.Tensor("mask_add_out")); + const auto &context_matmul = + src.Op("pd_op.matmul", + {{"transpose_x", src.Attr("context_matmul_transpose_x")}, + {"transpose_y", src.Attr("context_matmul_transpose_y")}}); + src.Tensor("context_matmul_out") = context_matmul( + src.Tensor("softmax_out"), src.Tensor("v_transpose_out")); + const auto &o_transpose = src.Op("pd_op.transpose"); + src.Tensor("out") = o_transpose(src.Tensor("context_matmul_out")); + + // Constraints + src.RequireNativeCall([](const pir::drr::MatchContext &match_ctx) -> bool { + const auto &softmax_axis = match_ctx.Attr("softmax_axis"); + if (softmax_axis != -1 && softmax_axis != 3) return false; + + bool qk_matmul_transpose_x = + match_ctx.Attr("qk_matmul_transpose_x"); + bool qk_matmul_transpose_y = + match_ctx.Attr("qk_matmul_transpose_y"); + if (qk_matmul_transpose_x || !qk_matmul_transpose_y) return false; + + bool 
context_matmul_transpose_x = + match_ctx.Attr("context_matmul_transpose_x"); + bool context_matmul_transpose_y = + match_ctx.Attr("context_matmul_transpose_y"); + if (context_matmul_transpose_x || context_matmul_transpose_y) + return false; + + return true; + }); + + // Result pattern + pir::drr::ResultPattern res = src.ResultPattern(); + const auto &scaling_factor = + res.Attr([](const pir::drr::MatchContext &match_ctx) -> float { + return match_ctx.Attr("q_scale_value"); + }); + const auto &dropout_prob = + res.Attr([](const pir::drr::MatchContext &match_ctx) -> float { + return static_cast(0.0); + }); + const auto &is_training = res.Attr( + [](const pir::drr::MatchContext &match_ctx) -> bool { return true; }); + const auto &is_causal_masking = res.Attr( + [](const pir::drr::MatchContext &match_ctx) -> bool { return false; }); + + const auto &dot_product_attention = + res.Op(paddle::dialect::FusedDotProductAttentionOp::name(), + {{{"scaling_factor", scaling_factor}, + {"dropout_probability", dropout_prob}, + {"is_training", is_training}, + {"is_causal_masking", is_causal_masking}}}); + + dot_product_attention({&res.Tensor("q"), + &res.Tensor("k"), + &res.Tensor("v"), + &res.Tensor("mask")}, + {&res.Tensor("out"), + &res.Tensor("softmax_aux"), + &res.Tensor("rng_state")}); + } +}; + +class FusedDotProductAttentionGradPattern + : public pir::drr::DrrPatternBase { + public: + void operator()(pir::drr::DrrPatternContext *ctx) const override { + pir::drr::SourcePattern src = ctx->SourcePattern(); + + // q[b, s, head, head_dim] -> transpose -> q[b, head, s, head_dim] -> scale + const auto &q_transpose = src.Op("pd_op.transpose"); + src.Tensor("q_transpose_out") = q_transpose(src.Tensor("q")); + // k[b, s, head, head_dim] -> transpose -> k[b, head, s, head_dim] + const auto &k_transpose = src.Op("pd_op.transpose"); + src.Tensor("k_transpose_out") = k_transpose(src.Tensor("k")); + // v[b, s, head, head_dim] -> transpose -> v[b, head, s, head_dim] + const auto &v_transpose = src.Op("pd_op.transpose"); + src.Tensor("v_transpose_out") = v_transpose(src.Tensor("v")); + + const auto &q_scale_full = + src.Op("pd_op.full", {{"value", src.Attr("q_scale_value")}}); + src.Tensor("q_scale_full_out") = q_scale_full(); + const auto &q_scale = src.Op("pd_op.scale"); + src.Tensor("q_scale_out") = + q_scale(src.Tensor("q_transpose_out"), src.Tensor("q_scale_full_out")); + + const auto &qk_matmul = + src.Op("pd_op.matmul", + {{"transpose_x", src.Attr("qk_matmul_transpose_x")}, + {"transpose_y", src.Attr("qk_matmul_transpose_y")}}); + src.Tensor("qk_matmul_out") = + qk_matmul(src.Tensor("q_scale_out"), src.Tensor("k_transpose_out")); + + // mask(int) -> cast -> cast -> scale -> scale -> mask(fp16) + const auto &mask_cast1 = src.Op("pd_op.cast"); + src.Tensor("mask_cast1_out") = mask_cast1(src.Tensor("mask")); + const auto &mask_full1 = + src.Op("pd_op.full", {{"value", src.Attr("mask_scale1_value")}}); + const auto &mask_scale1 = src.Op("pd_op.scale"); + src.Tensor("mask_scale1_out") = + mask_scale1(src.Tensor("mask_cast1_out"), mask_full1()); + const auto &mask_full2 = + src.Op("pd_op.full", {{"value", src.Attr("mask_scale2_value")}}); + const auto &mask_scale2 = src.Op("pd_op.scale"); + src.Tensor("mask_scale2_out") = + mask_scale2(src.Tensor("mask_scale1_out"), mask_full2()); + + // softmax(qk)v + const auto &mask_add = src.Op("pd_op.add"); + src.Tensor("mask_add_out") = + mask_add(src.Tensor("qk_matmul_out"), src.Tensor("mask_scale2_out")); + const auto &softmax = + src.Op("pd_op.softmax", {{"axis", 
src.Attr("softmax_axis")}}); + src.Tensor("softmax_out") = softmax(src.Tensor("mask_add_out")); + const auto &context_matmul = + src.Op("pd_op.matmul", + {{"transpose_x", src.Attr("context_matmul_transpose_x")}, + {"transpose_y", src.Attr("context_matmul_transpose_y")}}); + src.Tensor("context_matmul_out") = context_matmul( + src.Tensor("softmax_out"), src.Tensor("v_transpose_out")); + const auto &o_transpose = src.Op("pd_op.transpose"); + src.Tensor("out") = o_transpose(src.Tensor("context_matmul_out")); + + // backward + const auto &o_transpose_grad = src.Op("pd_op.transpose_grad"); + src.Tensor("o_transpose_grad_out") = + o_transpose_grad(src.Tensor("out_grad")); + const auto &context_matmul_grad = + src.Op("pd_op.matmul_grad", + {{"transpose_x", src.Attr("context_matmul_grad_transpose_x")}, + {"transpose_y", src.Attr("context_matmul_grad_transpose_y")}}); + context_matmul_grad( + {&src.Tensor("softmax_out"), + &src.Tensor("v_transpose_out"), + &src.Tensor("o_transpose_grad_out")}, + {&src.Tensor("softmax_out_grad"), &src.Tensor("v_transpose_out_grad")}); + const auto &softmax_grad = src.Op("pd_op.softmax_grad"); + softmax_grad({&src.Tensor("softmax_out"), &src.Tensor("softmax_out_grad")}, + {&src.Tensor("mask_add_out_grad")}); + const auto &v_transpose_grad = src.Op("pd_op.transpose_grad"); + v_transpose_grad({&src.Tensor("v_transpose_out_grad")}, + {&src.Tensor("v_grad")}); + const auto &mask_add_grad = src.Op("pd_op.add_grad"); + mask_add_grad({&src.Tensor("qk_matmul_out"), + &src.Tensor("mask_scale2_out"), + &src.Tensor("mask_add_out_grad")}, + {&src.Tensor("qk_matmul_out_grad"), + &src.Tensor("mask_scale2_out_grad")}); + const auto &qk_matmul_grad = + src.Op("pd_op.matmul_grad", + {{"transpose_x", src.Attr("qk_matmul_grad_transpose_x")}, + {"transpose_y", src.Attr("qk_matmul_grad_transpose_y")}}); + qk_matmul_grad( + {&src.Tensor("q_scale_out"), + &src.Tensor("k_transpose_out"), + &src.Tensor("qk_matmul_out_grad")}, + {&src.Tensor("q_scale_out_grad"), &src.Tensor("k_transpose_out_grad")}); + const auto &q_scale_grad = src.Op("pd_op.scale"); + src.Tensor("q_transpose_out_grad") = q_scale_grad( + src.Tensor("q_scale_out_grad"), src.Tensor("q_scale_full_out")); + const auto &q_transpose_grad = src.Op("pd_op.transpose_grad"); + q_transpose_grad({&src.Tensor("q_transpose_out_grad")}, + {&src.Tensor("q_grad")}); + const auto &k_transpose_grad = src.Op("pd_op.transpose_grad"); + k_transpose_grad({&src.Tensor("k_transpose_out_grad")}, + {&src.Tensor("k_grad")}); + + // Constraints + src.RequireNativeCall([](const pir::drr::MatchContext &match_ctx) -> bool { + const auto &softmax_axis = match_ctx.Attr("softmax_axis"); + if (softmax_axis != -1 && softmax_axis != 3) return false; + + bool qk_matmul_transpose_x = + match_ctx.Attr("qk_matmul_transpose_x"); + bool qk_matmul_transpose_y = + match_ctx.Attr("qk_matmul_transpose_y"); + if (qk_matmul_transpose_x || !qk_matmul_transpose_y) return false; + + bool context_matmul_transpose_x = + match_ctx.Attr("context_matmul_transpose_x"); + bool context_matmul_transpose_y = + match_ctx.Attr("context_matmul_transpose_y"); + if (context_matmul_transpose_x || context_matmul_transpose_y) + return false; + + return true; + }); + + // Result pattern + pir::drr::ResultPattern res = src.ResultPattern(); + const auto &scaling_factor = + res.Attr([](const pir::drr::MatchContext &match_ctx) -> float { + return match_ctx.Attr("q_scale_value"); + }); + const auto &dropout_prob = + res.Attr([](const pir::drr::MatchContext &match_ctx) -> float { + return 
static_cast(0.0); + }); + const auto &is_training = res.Attr( + [](const pir::drr::MatchContext &match_ctx) -> bool { return true; }); + const auto &is_causal_masking = res.Attr( + [](const pir::drr::MatchContext &match_ctx) -> bool { return false; }); + + const auto &dot_product_attention = + res.Op(paddle::dialect::FusedDotProductAttentionOp::name(), + {{{"scaling_factor", scaling_factor}, + {"dropout_probability", dropout_prob}, + {"is_training", is_training}, + {"is_causal_masking", is_causal_masking}}}); + + dot_product_attention({&res.Tensor("q"), + &res.Tensor("k"), + &res.Tensor("v"), + &res.Tensor("mask")}, + {&res.Tensor("out"), + &res.Tensor("softmax_aux"), + &res.Tensor("rng_state")}); + const auto &dot_product_attention_grad = + res.Op(paddle::dialect::FusedDotProductAttentionGradOp::name(), + {{{"scaling_factor", scaling_factor}, + {"dropout_probability", dropout_prob}, + {"is_causal_masking", is_causal_masking}}}); + dot_product_attention_grad( + {&res.Tensor("q"), + &res.Tensor("k"), + &res.Tensor("v"), + &res.Tensor("out"), + &res.Tensor("softmax_aux"), + &res.Tensor("rng_state"), + &res.Tensor("mask"), + &res.Tensor("out_grad")}, + {&res.Tensor("q_grad"), &res.Tensor("k_grad"), &res.Tensor("v_grad")}); + } +}; + +class FusedDotProductAttentionWithDropoutPattern + : public pir::drr::DrrPatternBase< + FusedDotProductAttentionWithDropoutPattern> { + public: + void operator()(pir::drr::DrrPatternContext *ctx) const override { + pir::drr::SourcePattern src = ctx->SourcePattern(); + + // q[b, s, head, head_dim] -> transpose -> q[b, head, s, head_dim] -> scale + const auto &q_transpose = src.Op("pd_op.transpose"); + src.Tensor("q_transpose_out") = q_transpose(src.Tensor("q")); + // k[b, s, head, head_dim] -> transpose -> k[b, head, s, head_dim] + const auto &k_transpose = src.Op("pd_op.transpose"); + src.Tensor("k_transpose_out") = k_transpose(src.Tensor("k")); + // v[b, s, head, head_dim] -> transpose -> v[b, head, s, head_dim] + const auto &v_transpose = src.Op("pd_op.transpose"); + src.Tensor("v_transpose_out") = v_transpose(src.Tensor("v")); + + const auto &q_scale_full = + src.Op("pd_op.full", {{"value", src.Attr("q_scale_value")}}); + src.Tensor("q_scale_full_out") = q_scale_full(); + const auto &q_scale = src.Op("pd_op.scale"); + src.Tensor("q_scale_out") = + q_scale(src.Tensor("q_transpose_out"), src.Tensor("q_scale_full_out")); + + const auto &qk_matmul = + src.Op("pd_op.matmul", + {{"transpose_x", src.Attr("qk_matmul_transpose_x")}, + {"transpose_y", src.Attr("qk_matmul_transpose_y")}}); + src.Tensor("qk_matmul_out") = + qk_matmul(src.Tensor("q_scale_out"), src.Tensor("k_transpose_out")); + + // mask(int) -> cast -> cast -> scale -> scale -> mask(fp16) + const auto &mask_cast1 = src.Op("pd_op.cast"); + src.Tensor("mask_cast1_out") = mask_cast1(src.Tensor("mask")); + const auto &mask_full1 = + src.Op("pd_op.full", {{"value", src.Attr("mask_scale1_value")}}); + const auto &mask_scale1 = src.Op("pd_op.scale"); + src.Tensor("mask_scale1_out") = + mask_scale1(src.Tensor("mask_cast1_out"), mask_full1()); + const auto &mask_full2 = + src.Op("pd_op.full", {{"value", src.Attr("mask_scale2_value")}}); + const auto &mask_scale2 = src.Op("pd_op.scale"); + src.Tensor("mask_scale2_out") = + mask_scale2(src.Tensor("mask_scale1_out"), mask_full2()); + + // softmax(qk)v + const auto &mask_add = src.Op("pd_op.add"); + src.Tensor("mask_add_out") = + mask_add(src.Tensor("qk_matmul_out"), src.Tensor("mask_scale2_out")); + const auto &softmax = + src.Op("pd_op.softmax", {{"axis", 
src.Attr("softmax_axis")}}); + src.Tensor("softmax_out") = softmax(src.Tensor("mask_add_out")); + const auto &dropout = src.Op("pd_op.dropout", + {{"p", src.Attr("dropout_prob")}, + {"is_test", src.Attr("is_test")}, + {"mode", src.Attr("mode")}, + {"seed", src.Attr("seed")}, + {"fix_seed", src.Attr("fix_seed")}}); + dropout({&src.Tensor("softmax_out"), &src.Tensor("seed_tensor")}, + {&src.Tensor("dropout_out"), &src.Tensor("dropout_mask")}); + const auto &context_matmul = + src.Op("pd_op.matmul", + {{"transpose_x", src.Attr("context_matmul_transpose_x")}, + {"transpose_y", src.Attr("context_matmul_transpose_y")}}); + src.Tensor("context_matmul_out") = context_matmul( + src.Tensor("dropout_out"), src.Tensor("v_transpose_out")); + const auto &o_transpose = src.Op("pd_op.transpose"); + src.Tensor("out") = o_transpose(src.Tensor("context_matmul_out")); + + // Constraints + src.RequireNativeCall([](const pir::drr::MatchContext &match_ctx) -> bool { + const auto &softmax_axis = match_ctx.Attr("softmax_axis"); + if (softmax_axis != -1 && softmax_axis != 3) return false; + + bool qk_matmul_transpose_x = + match_ctx.Attr("qk_matmul_transpose_x"); + bool qk_matmul_transpose_y = + match_ctx.Attr("qk_matmul_transpose_y"); + if (qk_matmul_transpose_x || !qk_matmul_transpose_y) return false; + + bool context_matmul_transpose_x = + match_ctx.Attr("context_matmul_transpose_x"); + bool context_matmul_transpose_y = + match_ctx.Attr("context_matmul_transpose_y"); + if (context_matmul_transpose_x || context_matmul_transpose_y) + return false; + + return true; + }); + + // Result pattern + pir::drr::ResultPattern res = src.ResultPattern(); + const auto &scaling_factor = + res.Attr([](const pir::drr::MatchContext &match_ctx) -> float { + return match_ctx.Attr("q_scale_value"); + }); + const auto &dropout_prob = + res.Attr([](const pir::drr::MatchContext &match_ctx) -> float { + return static_cast(0.0); + }); + const auto &is_training = res.Attr( + [](const pir::drr::MatchContext &match_ctx) -> bool { return true; }); + const auto &is_causal_masking = res.Attr( + [](const pir::drr::MatchContext &match_ctx) -> bool { return false; }); + + const auto &dot_product_attention = + res.Op(paddle::dialect::FusedDotProductAttentionOp::name(), + {{{"scaling_factor", scaling_factor}, + {"dropout_probability", src.Attr("dropout_prob")}, + {"is_training", is_training}, + {"is_causal_masking", is_causal_masking}}}); + + dot_product_attention({&res.Tensor("q"), + &res.Tensor("k"), + &res.Tensor("v"), + &res.Tensor("mask")}, + {&res.Tensor("out"), + &res.Tensor("softmax_aux"), + &res.Tensor("rng_state")}); + } +}; + +class FusedDotProductAttentionGradWithDropoutPattern + : public pir::drr::DrrPatternBase< + FusedDotProductAttentionGradWithDropoutPattern> { + public: + void operator()(pir::drr::DrrPatternContext *ctx) const override { + pir::drr::SourcePattern src = ctx->SourcePattern(); + + // q[b, s, head, head_dim] -> transpose -> q[b, head, s, head_dim] -> scale + const auto &q_transpose = src.Op("pd_op.transpose"); + src.Tensor("q_transpose_out") = q_transpose(src.Tensor("q")); + // k[b, s, head, head_dim] -> transpose -> k[b, head, s, head_dim] + const auto &k_transpose = src.Op("pd_op.transpose"); + src.Tensor("k_transpose_out") = k_transpose(src.Tensor("k")); + // v[b, s, head, head_dim] -> transpose -> v[b, head, s, head_dim] + const auto &v_transpose = src.Op("pd_op.transpose"); + src.Tensor("v_transpose_out") = v_transpose(src.Tensor("v")); + + const auto &q_scale_full = + src.Op("pd_op.full", {{"value", 
src.Attr("q_scale_value")}}); + src.Tensor("q_scale_full_out") = q_scale_full(); + const auto &q_scale = src.Op("pd_op.scale"); + src.Tensor("q_scale_out") = + q_scale(src.Tensor("q_transpose_out"), src.Tensor("q_scale_full_out")); + + const auto &qk_matmul = + src.Op("pd_op.matmul", + {{"transpose_x", src.Attr("qk_matmul_transpose_x")}, + {"transpose_y", src.Attr("qk_matmul_transpose_y")}}); + src.Tensor("qk_matmul_out") = + qk_matmul(src.Tensor("q_scale_out"), src.Tensor("k_transpose_out")); + + // mask(int) -> cast -> cast -> scale -> scale -> mask(fp16) + const auto &mask_cast1 = src.Op("pd_op.cast"); + src.Tensor("mask_cast1_out") = mask_cast1(src.Tensor("mask")); + const auto &mask_full1 = + src.Op("pd_op.full", {{"value", src.Attr("mask_scale1_value")}}); + const auto &mask_scale1 = src.Op("pd_op.scale"); + src.Tensor("mask_scale1_out") = + mask_scale1(src.Tensor("mask_cast1_out"), mask_full1()); + const auto &mask_full2 = + src.Op("pd_op.full", {{"value", src.Attr("mask_scale2_value")}}); + const auto &mask_scale2 = src.Op("pd_op.scale"); + src.Tensor("mask_scale2_out") = + mask_scale2(src.Tensor("mask_scale1_out"), mask_full2()); + + // softmax(qk)v + const auto &mask_add = src.Op("pd_op.add"); + src.Tensor("mask_add_out") = + mask_add(src.Tensor("qk_matmul_out"), src.Tensor("mask_scale2_out")); + const auto &softmax = + src.Op("pd_op.softmax", {{"axis", src.Attr("softmax_axis")}}); + src.Tensor("softmax_out") = softmax(src.Tensor("mask_add_out")); + const auto &dropout = src.Op("pd_op.dropout", + {{"p", src.Attr("dropout_prob")}, + {"is_test", src.Attr("is_test")}, + {"mode", src.Attr("mode")}, + {"seed", src.Attr("seed")}, + {"fix_seed", src.Attr("fix_seed")}}); + dropout({&src.Tensor("softmax_out"), &src.Tensor("seed_tensor")}, + {&src.Tensor("dropout_out"), &src.Tensor("dropout_mask")}); + const auto &context_matmul = + src.Op("pd_op.matmul", + {{"transpose_x", src.Attr("context_matmul_transpose_x")}, + {"transpose_y", src.Attr("context_matmul_transpose_y")}}); + src.Tensor("context_matmul_out") = context_matmul( + src.Tensor("dropout_out"), src.Tensor("v_transpose_out")); + const auto &o_transpose = src.Op("pd_op.transpose"); + src.Tensor("out") = o_transpose(src.Tensor("context_matmul_out")); + + // backward + const auto &o_transpose_grad = src.Op("pd_op.transpose_grad"); + src.Tensor("o_transpose_grad_out") = + o_transpose_grad(src.Tensor("out_grad")); + const auto &context_matmul_grad = + src.Op("pd_op.matmul_grad", + {{"transpose_x", src.Attr("context_matmul_grad_transpose_x")}, + {"transpose_y", src.Attr("context_matmul_grad_transpose_y")}}); + context_matmul_grad( + {&src.Tensor("dropout_out"), + &src.Tensor("v_transpose_out"), + &src.Tensor("o_transpose_grad_out")}, + {&src.Tensor("dropout_out_grad"), &src.Tensor("v_transpose_out_grad")}); + const auto &dropout_grad = src.Op("pd_op.dropout_grad", + {{"p", src.Attr("dropout_prob")}, + {"is_test", src.Attr("is_test")}, + {"mode", src.Attr("mode")}}); + dropout_grad({&src.Tensor("dropout_mask"), &src.Tensor("dropout_out_grad")}, + {&src.Tensor("softmax_out_grad")}); + const auto &softmax_grad = src.Op("pd_op.softmax_grad"); + softmax_grad({&src.Tensor("softmax_out"), &src.Tensor("softmax_out_grad")}, + {&src.Tensor("mask_add_out_grad")}); + const auto &v_transpose_grad = src.Op("pd_op.transpose_grad"); + v_transpose_grad({&src.Tensor("v_transpose_out_grad")}, + {&src.Tensor("v_grad")}); + const auto &mask_add_grad = src.Op("pd_op.add_grad"); + mask_add_grad({&src.Tensor("qk_matmul_out"), + &src.Tensor("mask_scale2_out"), 
+ &src.Tensor("mask_add_out_grad")}, + {&src.Tensor("qk_matmul_out_grad"), + &src.Tensor("mask_scale2_out_grad")}); + const auto &qk_matmul_grad = + src.Op("pd_op.matmul_grad", + {{"transpose_x", src.Attr("qk_matmul_grad_transpose_x")}, + {"transpose_y", src.Attr("qk_matmul_grad_transpose_y")}}); + qk_matmul_grad( + {&src.Tensor("q_scale_out"), + &src.Tensor("k_transpose_out"), + &src.Tensor("qk_matmul_out_grad")}, + {&src.Tensor("q_scale_out_grad"), &src.Tensor("k_transpose_out_grad")}); + const auto &q_scale_grad = src.Op("pd_op.scale"); + src.Tensor("q_transpose_out_grad") = q_scale_grad( + src.Tensor("q_scale_out_grad"), src.Tensor("q_scale_full_out")); + const auto &q_transpose_grad = src.Op("pd_op.transpose_grad"); + q_transpose_grad({&src.Tensor("q_transpose_out_grad")}, + {&src.Tensor("q_grad")}); + const auto &k_transpose_grad = src.Op("pd_op.transpose_grad"); + k_transpose_grad({&src.Tensor("k_transpose_out_grad")}, + {&src.Tensor("k_grad")}); + + // Constraints + src.RequireNativeCall([](const pir::drr::MatchContext &match_ctx) -> bool { + const auto &softmax_axis = match_ctx.Attr("softmax_axis"); + if (softmax_axis != -1 && softmax_axis != 3) return false; + + bool qk_matmul_transpose_x = + match_ctx.Attr("qk_matmul_transpose_x"); + bool qk_matmul_transpose_y = + match_ctx.Attr("qk_matmul_transpose_y"); + if (qk_matmul_transpose_x || !qk_matmul_transpose_y) return false; + + bool context_matmul_transpose_x = + match_ctx.Attr("context_matmul_transpose_x"); + bool context_matmul_transpose_y = + match_ctx.Attr("context_matmul_transpose_y"); + if (context_matmul_transpose_x || context_matmul_transpose_y) + return false; + + return true; + }); + + // Result pattern + pir::drr::ResultPattern res = src.ResultPattern(); + const auto &scaling_factor = + res.Attr([](const pir::drr::MatchContext &match_ctx) -> float { + return match_ctx.Attr("q_scale_value"); + }); + const auto &is_training = res.Attr( + [](const pir::drr::MatchContext &match_ctx) -> bool { return true; }); + const auto &is_causal_masking = res.Attr( + [](const pir::drr::MatchContext &match_ctx) -> bool { return false; }); + + const auto &dot_product_attention = + res.Op(paddle::dialect::FusedDotProductAttentionOp::name(), + {{{"scaling_factor", scaling_factor}, + {"dropout_probability", src.Attr("dropout_prob")}, + {"is_training", is_training}, + {"is_causal_masking", is_causal_masking}}}); + + dot_product_attention({&res.Tensor("q"), + &res.Tensor("k"), + &res.Tensor("v"), + &res.Tensor("mask")}, + {&res.Tensor("out"), + &res.Tensor("softmax_aux"), + &res.Tensor("rng_state")}); + const auto &dot_product_attention_grad = + res.Op(paddle::dialect::FusedDotProductAttentionGradOp::name(), + {{{"scaling_factor", scaling_factor}, + {"dropout_probability", src.Attr("dropout_prob")}, + {"is_causal_masking", is_causal_masking}}}); + dot_product_attention_grad( + {&res.Tensor("q"), + &res.Tensor("k"), + &res.Tensor("v"), + &res.Tensor("out"), + &res.Tensor("softmax_aux"), + &res.Tensor("rng_state"), + &res.Tensor("mask"), + &res.Tensor("out_grad")}, + {&res.Tensor("q_grad"), &res.Tensor("k_grad"), &res.Tensor("v_grad")}); + } +}; + +class FusedDotProductAttentionPass : public pir::PatternRewritePass { + public: + FusedDotProductAttentionPass() + : pir::PatternRewritePass("fused_dot_product_attention_pass", 1) {} + + pir::RewritePatternSet InitializePatterns(pir::IrContext *context) override { + pir::RewritePatternSet ps(context); + ps.Add(FusedDotProductAttentionPattern().Build(context)); + 
ps.Add(FusedDotProductAttentionGradPattern().Build(context));
+    ps.Add(FusedDotProductAttentionWithDropoutPattern().Build(context));
+    ps.Add(FusedDotProductAttentionGradWithDropoutPattern().Build(context));
+    return ps;
+  }
+};
+
+}  // namespace
+
+namespace pir {
+
+std::unique_ptr<Pass> CreateFusedDotProductAttentionPass() {
+  return std::make_unique<FusedDotProductAttentionPass>();
+}
+}  // namespace pir
+
+REGISTER_IR_PASS(fused_dot_product_attention_pass,
+                 FusedDotProductAttentionPass);
diff --git a/paddle/fluid/pir/transforms/fusion/fused_dot_product_attention_pass.h b/paddle/fluid/pir/transforms/fusion/fused_dot_product_attention_pass.h
new file mode 100644
index 00000000000000..55f7914e4d5d98
--- /dev/null
+++ b/paddle/fluid/pir/transforms/fusion/fused_dot_product_attention_pass.h
@@ -0,0 +1,26 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include "paddle/pir/core/dll_decl.h"
+
+namespace pir {
+
+class Pass;
+
+IR_API std::unique_ptr<Pass> CreateFusedDotProductAttentionPass();
+
+}  // namespace pir
diff --git a/paddle/fluid/pybind/parallel_executor.cc b/paddle/fluid/pybind/parallel_executor.cc
index 5b8d169d91f746..e79be5f134a224 100644
--- a/paddle/fluid/pybind/parallel_executor.cc
+++ b/paddle/fluid/pybind/parallel_executor.cc
@@ -587,6 +587,34 @@ void BindParallelExecutor(pybind11::module &m) {  // NOLINT
                 >>> build_strategy = static.BuildStrategy()
                 >>> build_strategy.fuse_gemm_epilogue = True
                 )DOC")
+      .def_property(
+          "fuse_dot_product_attention",
+          [](const BuildStrategy &self) {
+            return self.fuse_dot_product_attention_;
+          },
+          [](BuildStrategy &self, bool b) {
+            PADDLE_ENFORCE_NE(self.IsFinalized(),
+                              true,
+                              platform::errors::PreconditionNotMet(
+                                  "BuildStrategy has been finalized, cannot be "
+                                  "configured again."));
+            self.fuse_dot_product_attention_ = b;
+          },
+          R"DOC((bool, optional): fuse_dot_product_attention indicates whether
+                        to fuse dot-product attention, which can make
+                        execution faster. Default is False.
+
+                        Examples:
+                            ..
code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + build_strategy = static.BuildStrategy() + build_strategy.fuse_dot_product_attention = True + )DOC") .def_property( "fuse_adamw", [](const BuildStrategy &self) { return self.fuse_adamw_; }, diff --git a/paddle/fluid/pybind/pir.cc b/paddle/fluid/pybind/pir.cc index ed22ded0a090ca..73b4051981a04e 100644 --- a/paddle/fluid/pybind/pir.cc +++ b/paddle/fluid/pybind/pir.cc @@ -39,6 +39,7 @@ #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" #include "paddle/fluid/pir/dialect/operator/utils/utils.h" #include "paddle/fluid/pir/transforms/dead_code_elimination_pass.h" +#include "paddle/fluid/pir/transforms/fusion/fused_dot_product_attention_pass.h" #include "paddle/fluid/pir/transforms/fusion/fused_dropout_add_pass.h" #include "paddle/fluid/pir/transforms/fusion/fused_linear_param_grad_add_pass.h" #include "paddle/fluid/pir/transforms/fusion/fused_weight_only_linear_pass.h" @@ -105,6 +106,7 @@ USE_PIR_PASS(replace_fetch_with_shadow_output_pass); USE_PIR_PASS(conv2d_bn_fuse_pass); USE_PIR_PASS(conv2d_add_fuse_pass); USE_PIR_PASS(conv2d_add_act_fuse_pass); +USE_PIR_PASS(fused_dot_product_attention_pass); PHI_DECLARE_bool(print_ir); diff --git a/paddle/phi/api/yaml/fused_backward.yaml b/paddle/phi/api/yaml/fused_backward.yaml index 2552d218db4629..649e427b25a34d 100644 --- a/paddle/phi/api/yaml/fused_backward.yaml +++ b/paddle/phi/api/yaml/fused_backward.yaml @@ -15,6 +15,18 @@ func : fused_bias_dropout_residual_layer_norm_grad data_type : y_grad +- backward_op : fused_dot_product_attention_grad + forward : fused_dot_product_attention (Tensor q, Tensor k, Tensor v, Tensor mask, float scaling_factor, float dropout_probability, bool is_training, bool is_causal_masking) -> Tensor(out), Tensor(softmax_out), Tensor(rng_state) + args : (Tensor q, Tensor k, Tensor v, Tensor out, Tensor softmax_out, Tensor rng_state, Tensor mask, Tensor out_grad, float scaling_factor, float dropout_probability, bool is_causal_masking = false) + output : Tensor(q_grad), Tensor(k_grad), Tensor(v_grad) + infer_meta : + func : FusedDotProductAttentionGradInferMeta + param: [q, k, v] + kernel : + func : fused_dot_product_attention_grad + data_type : q + support_dygraph_mode : true + - backward_op : fused_dropout_add_grad forward : fused_dropout_add (Tensor x, Tensor y, Tensor seed_tensor, Scalar p, bool is_test, str mode, int seed, bool fix_seed) -> Tensor(out), Tensor(seed_offset) args : (Tensor seed_offset, Tensor out_grad, Scalar p, bool is_test, str mode, bool fix_seed) diff --git a/paddle/phi/api/yaml/fused_ops.yaml b/paddle/phi/api/yaml/fused_ops.yaml index 7e5975b42894e5..12310f6731ac60 100644 --- a/paddle/phi/api/yaml/fused_ops.yaml +++ b/paddle/phi/api/yaml/fused_ops.yaml @@ -196,6 +196,18 @@ func : fused_dconv_drelu_dbn data_type : grad_output +- op : fused_dot_product_attention + args : (Tensor q, Tensor k, Tensor v, Tensor mask, float scaling_factor, float dropout_probability, bool is_training = false, bool is_causal_masking = false) + output : Tensor(out), Tensor(softmax_out), Tensor(rng_state) + infer_meta : + func : FusedDotProductAttentionInferMeta + param : [q, k, v] + kernel : + func : fused_dot_product_attention + data_type : q + backward : fused_dot_product_attention_grad + support_dygraph_mode : true + - op : fused_dropout_add args : (Tensor x, Tensor y, Tensor seed_tensor, Scalar p, bool is_test, str mode, int seed = 0, bool fix_seed = false) optional : seed_tensor diff --git 
a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h index c1d79f2378926e..6cf538f255b02c 100644 --- a/paddle/phi/infermeta/backward.h +++ b/paddle/phi/infermeta/backward.h @@ -494,4 +494,5 @@ void SetValueGradInferMeta(const MetaTensor& out_grad, const MetaTensor& values, MetaTensor* x_grad, MetaTensor* value_grad); + } // namespace phi diff --git a/paddle/phi/infermeta/fusion.cc b/paddle/phi/infermeta/fusion.cc index 7cce4584ac78dc..0e23b8b53ef019 100644 --- a/paddle/phi/infermeta/fusion.cc +++ b/paddle/phi/infermeta/fusion.cc @@ -851,6 +851,160 @@ void FusedAttentionGradInferMeta(const MetaTensor& out_grad, } } +void FusedBiasDropoutResidualLnInferMeta( + const MetaTensor& x, + const MetaTensor& residual, + const MetaTensor& bias, + const MetaTensor& ln_scale, + const MetaTensor& ln_bias, + const float dropout_rate, + const bool is_test, + const bool dropout_fix_seed, + const int dropout_seed, + const std::string& dropout_implementation, + const float ln_epsilon, + MetaTensor* y, + MetaTensor* bias_dropout_residual_out, + MetaTensor* dropout_mask_out, + MetaTensor* ln_mean, + MetaTensor* ln_variance) { + PADDLE_ENFORCE_EQ(dropout_rate >= 0.0f && dropout_rate <= 1.0f, + true, + phi::errors::InvalidArgument( + "'dropout_rate' must be between 0.0 and 1.0.")); + PADDLE_ENFORCE_EQ( + dropout_implementation == "downgrade_in_infer" || + dropout_implementation == "upscale_in_train", + true, + phi::errors::InvalidArgument( + "dropout_implementation can only be downgrade_in_infer or " + "upscale_in_train")); + PADDLE_ENFORCE_EQ(ln_epsilon >= 0.0f && ln_epsilon <= 0.001f, + true, + phi::errors::InvalidArgument( + "'epsilon' of the LayerNorm should be between " + "0.0 and 0.001, But received [%s].", + ln_epsilon)); + auto x_dim = x.dims(); + int left = 1; + for (int i = 0; i < x_dim.size() - 1; i++) { + left *= x_dim[i]; + } + bias_dropout_residual_out->set_dims(x.dims()); + if (is_test == false) { + dropout_mask_out->set_dims(x.dims()); + } + ln_mean->set_dims({left}); + ln_variance->set_dims({left}); + y->set_dims(x.dims()); +} + +void FusedBiasDropoutResidualLnGradInferMeta( + const MetaTensor& y_grad, + const MetaTensor& x, + const MetaTensor& residual, + const MetaTensor& bias, + const MetaTensor& ln_scale, + const MetaTensor& ln_bias, + const MetaTensor& ln_mean, + const MetaTensor& ln_variance, + const MetaTensor& bias_dropout_residual_out, + const MetaTensor& dropout_mask_out, + const float dropout_rate, + const bool is_test, + const bool dropout_fix_seed, + const int dropout_seed, + const std::string& dropout_implementation, + const float ln_epsilon, + MetaTensor* x_grad, + MetaTensor* residual_grad, + MetaTensor* bias_grad, + MetaTensor* ln_scale_grad, + MetaTensor* ln_bias_grad) { + PADDLE_ENFORCE_EQ(is_test, + false, + phi::errors::InvalidArgument( + "GradOp is only callable when is_test is false")); + if (ln_scale_grad) { + ln_scale_grad->set_dims(ln_scale.dims()); + ln_scale_grad->set_dtype(y_grad.dtype()); + } + if (ln_bias_grad) { + ln_bias_grad->set_dims(ln_bias.dims()); + ln_bias_grad->set_dtype(y_grad.dtype()); + } + if (residual_grad) { + residual_grad->set_dims(residual.dims()); + residual_grad->set_dtype(y_grad.dtype()); + } + if (bias_grad) { + bias_grad->set_dims(bias.dims()); + bias_grad->set_dtype(y_grad.dtype()); + } + if (x_grad) { + x_grad->set_dims(x.dims()); + x_grad->set_dtype(y_grad.dtype()); + } +} + +void FusedDotProductAttentionInferMeta(const MetaTensor& q, + const MetaTensor& k, + const MetaTensor& v, + MetaTensor* out, + MetaTensor* softmax_out, 
+ MetaTensor* rng_state) { + // q input shape: [batch_size, q_seq_len, num_heads, head_size] + // k, v input shape: [batch_size, kv_seq_len, num_heads, head_size] + auto q_dim = q.dims(); + auto k_dim = k.dims(); + auto v_dim = v.dims(); + + // check shape + PADDLE_ENFORCE(q_dim.size() == 4 && k_dim.size() == 4 && v_dim.size() == 4, + phi::errors::InvalidArgument( + "The dimensions of q, k, v must be 4" + "(batch_size, seq_len, num_heads, head_size)," + "but received dimensions of" + "Input is [%d], [%d], [%d]", + q_dim.size(), + k_dim.size(), + v_dim.size())); + + PADDLE_ENFORCE(q_dim[0] == k_dim[0] && k_dim[0] == v_dim[0], + phi::errors::InvalidArgument( + "The first dimension of q, k, v must be equal" + "but received dimensions of" + "Input is [%d], [%d], [%d]", + q_dim[0], + k_dim[0], + v_dim[0])); + + // [batch_size, num_heads, q_seqlen, 1] + std::vector softmax_out_shape({q_dim[0], q_dim[2], q_dim[1], 1}); + + out->set_dims(q_dim); + softmax_out->set_dims( + DDim(softmax_out_shape.data(), softmax_out_shape.size())); + + // rng_state: {seed, offset} + std::vector rng_state_shape({2}); + rng_state->set_dims(DDim(rng_state_shape.data(), rng_state_shape.size())); +} + +void FusedDotProductAttentionGradInferMeta(const MetaTensor& q, + const MetaTensor& k, + const MetaTensor& v, + MetaTensor* q_grad, + MetaTensor* k_grad, + MetaTensor* v_grad) { + auto q_dim = q.dims(); + auto k_dim = k.dims(); + auto v_dim = v.dims(); + q_grad->set_dims(q_dim); + k_grad->set_dims(k_dim); + v_grad->set_dims(v_dim); +} + void FusedFeedForwardInferMeta(const MetaTensor& x, const MetaTensor& dropout1_seed, const MetaTensor& dropout2_seed, @@ -3332,102 +3486,6 @@ void SelfDPAttenInferMeta(const MetaTensor& x, out->set_dtype(x.dtype()); } -void FusedBiasDropoutResidualLnInferMeta( - const MetaTensor& x, - const MetaTensor& residual, - const MetaTensor& bias, - const MetaTensor& ln_scale, - const MetaTensor& ln_bias, - const float dropout_rate, - const bool is_test, - const bool dropout_fix_seed, - const int dropout_seed, - const std::string& dropout_implementation, - const float ln_epsilon, - MetaTensor* y, - MetaTensor* bias_dropout_residual_out, - MetaTensor* dropout_mask_out, - MetaTensor* ln_mean, - MetaTensor* ln_variance) { - PADDLE_ENFORCE_EQ(dropout_rate >= 0.0f && dropout_rate <= 1.0f, - true, - phi::errors::InvalidArgument( - "'dropout_rate' must be between 0.0 and 1.0.")); - PADDLE_ENFORCE_EQ( - dropout_implementation == "downgrade_in_infer" || - dropout_implementation == "upscale_in_train", - true, - phi::errors::InvalidArgument( - "dropout_implementation can only be downgrade_in_infer or " - "upscale_in_train")); - PADDLE_ENFORCE_EQ(ln_epsilon >= 0.0f && ln_epsilon <= 0.001f, - true, - phi::errors::InvalidArgument( - "'epsilon' of the LayerNorm should be between " - "0.0 and 0.001, But received [%s].", - ln_epsilon)); - auto x_dim = x.dims(); - int left = 1; - for (int i = 0; i < x_dim.size() - 1; i++) { - left *= x_dim[i]; - } - bias_dropout_residual_out->set_dims(x.dims()); - if (is_test == false) { - dropout_mask_out->set_dims(x.dims()); - } - ln_mean->set_dims({left}); - ln_variance->set_dims({left}); - y->set_dims(x.dims()); -} - -void FusedBiasDropoutResidualLnGradInferMeta( - const MetaTensor& y_grad, - const MetaTensor& x, - const MetaTensor& residual, - const MetaTensor& bias, - const MetaTensor& ln_scale, - const MetaTensor& ln_bias, - const MetaTensor& ln_mean, - const MetaTensor& ln_variance, - const MetaTensor& bias_dropout_residual_out, - const MetaTensor& dropout_mask_out, - const 
float dropout_rate, - const bool is_test, - const bool dropout_fix_seed, - const int dropout_seed, - const std::string& dropout_implementation, - const float ln_epsilon, - MetaTensor* x_grad, - MetaTensor* residual_grad, - MetaTensor* bias_grad, - MetaTensor* ln_scale_grad, - MetaTensor* ln_bias_grad) { - PADDLE_ENFORCE_EQ(is_test, - false, - phi::errors::InvalidArgument( - "GradOp is only callable when is_test is false")); - if (ln_scale_grad) { - ln_scale_grad->set_dims(ln_scale.dims()); - ln_scale_grad->set_dtype(y_grad.dtype()); - } - if (ln_bias_grad) { - ln_bias_grad->set_dims(ln_bias.dims()); - ln_bias_grad->set_dtype(y_grad.dtype()); - } - if (residual_grad) { - residual_grad->set_dims(residual.dims()); - residual_grad->set_dtype(y_grad.dtype()); - } - if (bias_grad) { - bias_grad->set_dims(bias.dims()); - bias_grad->set_dtype(y_grad.dtype()); - } - if (x_grad) { - x_grad->set_dims(x.dims()); - x_grad->set_dtype(y_grad.dtype()); - } -} - void SkipLayerNormInferMeta(const MetaTensor& x, const MetaTensor& y, const MetaTensor& scale, diff --git a/paddle/phi/infermeta/fusion.h b/paddle/phi/infermeta/fusion.h index 6fa9c5baab384d..80eec20281da4f 100644 --- a/paddle/phi/infermeta/fusion.h +++ b/paddle/phi/infermeta/fusion.h @@ -761,6 +761,20 @@ void FusedBiasDropoutResidualLnGradInferMeta( MetaTensor* ln_scale_grad, MetaTensor* ln_bias_grad); +void FusedDotProductAttentionInferMeta(const MetaTensor& q, + const MetaTensor& k, + const MetaTensor& v, + MetaTensor* out, + MetaTensor* softmax_out, + MetaTensor* rng_state); + +void FusedDotProductAttentionGradInferMeta(const MetaTensor& q, + const MetaTensor& k, + const MetaTensor& v, + MetaTensor* q_grad, + MetaTensor* k_grad, + MetaTensor* v_grad); + void SkipLayerNormInferMeta(const MetaTensor& x, const MetaTensor& y, const MetaTensor& scale, diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 78c14352dc53a5..f6ed266577bac5 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -217,9 +217,12 @@ endif() if(NOT WITH_CUDNN_FRONTEND) list( - REMOVE_ITEM kernel_cu "fusion/gpu/fused_scale_bias_relu_conv_bn_kernel.cu" + REMOVE_ITEM + kernel_cu + "fusion/gpu/fused_scale_bias_relu_conv_bn_kernel.cu" "fusion/gpu/fused_scale_bias_add_relu_kernel.cu" - "fusion/gpu/fused_dconv_drelu_dbn_kernel.cu") + "fusion/gpu/fused_dconv_drelu_dbn_kernel.cu" + "fusion/gpu/fused_dot_product_attention_op.cu") endif() set(cc_search_pattern diff --git a/paddle/phi/kernels/fusion/gpu/fused_dot_product_attention_op.cu b/paddle/phi/kernels/fusion/gpu/fused_dot_product_attention_op.cu new file mode 100644 index 00000000000000..48cda3e3a88b90 --- /dev/null +++ b/paddle/phi/kernels/fusion/gpu/fused_dot_product_attention_op.cu @@ -0,0 +1,274 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
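[Editor's aside, not part of the patch: the shape contract that FusedDotProductAttentionInferMeta establishes above can be illustrated with a minimal standalone sketch. The Shape4 alias and the concrete dimension values below are assumptions chosen for the example, not names from this patch.]

// Illustrative sketch of the InferMeta shape rules for q/k/v laid out as
// [batch_size, seq_len, num_heads, head_size].
#include <array>
#include <cstdint>

using Shape4 = std::array<int64_t, 4>;

int main() {
  Shape4 q = {8, 128, 16, 64};                   // [b, s_q, h, d]
  Shape4 out = q;                                // out keeps q's full shape
  Shape4 softmax_stats = {q[0], q[2], q[1], 1};  // softmax_out: [b, h, s_q, 1]
  int64_t rng_state_len = 2;                     // rng_state holds {seed, offset}
  (void)out;
  (void)softmax_stats;
  (void)rng_state_len;
  return 0;
}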
+ +#include "paddle/phi/backends/gpu/cuda/cudnn_helper.h" +#include "paddle/phi/backends/gpu/gpu_dnn.h" +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/flags.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpudnn/mha_cudnn_frontend.h" + +namespace phi { +namespace fusion { + +__global__ void set_rng_state(std::pair seed_offset, + int64_t *rng_state_ptr) { + rng_state_ptr[0] = static_cast(seed_offset.first); + rng_state_ptr[1] = static_cast(seed_offset.second); +} + +template +void FusedDotProductAttentionKernel(const Context &dev_ctx, + const DenseTensor &q, + const DenseTensor &k, + const DenseTensor &v, + const DenseTensor &mask, + float scaling_factor, + float dropout_probability, + bool is_training, + bool is_causal_masking, + DenseTensor *out, + DenseTensor *softmax_out, + DenseTensor *rng_state) { + PADDLE_ENFORCE_GE(dev_ctx.GetComputeCapability(), + 80, + phi::errors::PreconditionNotMet( + "This op only supports Ampere and later devices, " + "but got compute capability: %d.", + dev_ctx.GetComputeCapability())); + auto cudnn_version = phi::backends::gpu::DnnVersion(); + PADDLE_ENFORCE_GE(cudnn_version, + 8906, + phi::errors::PreconditionNotMet( + "This op only supports CUDNN version >= 8906, " + "but got %d.", + cudnn_version)); + + // allocate output variables + dev_ctx.template Alloc(out); + dev_ctx.template Alloc(softmax_out); + dev_ctx.template Alloc(rng_state); + + // get handles + auto handle = dev_ctx.cudnn_handle(); + + auto tensor_dtype = phi::backends::gpu::ToCudnnDataType(q.dtype()); + bool is_type_supported = + (tensor_dtype == CUDNN_DATA_HALF || tensor_dtype == CUDNN_DATA_BFLOAT16); + PADDLE_ENFORCE_EQ( + is_type_supported, + true, + phi::errors::InvalidArgument( + "cuDNN fused attention Only supports FP16/BF16 currently")); + auto mha_layout = MHA_Layout::NOT_INTERLEAVED; + auto bias_type = MHA_Bias_Type::NO_BIAS; + auto mask_type = is_causal_masking ? 
MHA_Mask_Type::CAUSAL_MASK + : MHA_Mask_Type::PADDING_MASK; + std::vector all_ops; + std::vector ops; + std::set> data_ptrs; + + // q dim: {b, s_q, h, d}; + // k,v dim: {b, s_kv, h, d}; + auto batch_size = q.dims()[0]; + auto q_seq_len = q.dims()[1]; + auto num_heads = q.dims()[2]; + auto head_size = q.dims()[3]; + auto kv_seq_len = k.dims()[1]; + + // only support seqlen >= 64 and seqlen <= 512 and seqlen % 64 == 0 + // currently + bool can_divide_by_64 = (q_seq_len % 64 == 0 && kv_seq_len % 64 == 0); + PADDLE_ENFORCE_EQ(can_divide_by_64, + true, + phi::errors::InvalidArgument( + "cuDNN FMHA only supports sequence length >= 64," + "and sequence length % 64 == 0, " + "but got sequence length: %d and %d.", + q_seq_len, + kv_seq_len)); + + auto gen_cuda = dev_ctx.GetGenerator(); + // threads per CTA = 128 + auto rng_elts_per_thread = (q_seq_len * kv_seq_len + 128 - 1) / 128; + auto seed_offset = gen_cuda->IncrementOffset(rng_elts_per_thread); + set_rng_state<<<1, 1, 0, dev_ctx.stream()>>>( + seed_offset, static_cast(rng_state->data())); + + void *q_dev_ptr = reinterpret_cast(const_cast(q.data())); + void *k_dev_ptr = reinterpret_cast(const_cast(k.data())); + void *v_dev_ptr = reinterpret_cast(const_cast(v.data())); + void *out_dev_ptr = reinterpret_cast(const_cast(out->data())); + void *softmax_out_dev_ptr = + reinterpret_cast(const_cast(softmax_out->data())); + void *bias_dev_ptr = nullptr; + void *mask_dev_ptr = + reinterpret_cast(const_cast(mask.data())); + // rng_state: {seed, offset} + void *seed_dev_ptr = reinterpret_cast( + const_cast(rng_state->data())); + void *offset_dev_ptr = reinterpret_cast( + const_cast(rng_state->data()) + 1); + + fused_attn_arbitrary_seqlen_fwd(batch_size, + num_heads, + q_seq_len, + kv_seq_len, + head_size, + is_training, + scaling_factor, + dropout_probability, + mha_layout, + mask_type, + q_dev_ptr, + k_dev_ptr, + v_dev_ptr, + softmax_out_dev_ptr, + out_dev_ptr, + mask_dev_ptr, + seed_dev_ptr, + offset_dev_ptr, + tensor_dtype, + dev_ctx.stream(), + handle); +} + +template +void FusedDotProductAttentionGradKernel(const Context &dev_ctx, + const DenseTensor &q, + const DenseTensor &k, + const DenseTensor &v, + const DenseTensor &O, + const DenseTensor &softmax_out, + const DenseTensor &rng_state, + const DenseTensor &mask, + const DenseTensor &dO, + float scaling_factor, + float dropout_probability, + bool is_causal_masking, + DenseTensor *q_grad, + DenseTensor *k_grad, + DenseTensor *v_grad) { + PADDLE_ENFORCE_GE(dev_ctx.GetComputeCapability(), + 80, + phi::errors::PreconditionNotMet( + "This op only supports Ampere and later devices, " + "but got compute capability: %d.", + dev_ctx.GetComputeCapability())); + auto cudnn_version = phi::backends::gpu::DnnVersion(); + PADDLE_ENFORCE_GE(cudnn_version, + 8906, + phi::errors::PreconditionNotMet( + "This op only supports CUDNN version >= 8906, " + "but got %d.", + cudnn_version)); + + // allocate output variables + dev_ctx.template Alloc(q_grad); + dev_ctx.template Alloc(k_grad); + dev_ctx.template Alloc(v_grad); + + // get handles + auto handle = dev_ctx.cudnn_handle(); + + auto tensor_dtype = phi::backends::gpu::ToCudnnDataType(q.dtype()); + bool support_type = + (tensor_dtype == CUDNN_DATA_HALF || tensor_dtype == CUDNN_DATA_BFLOAT16); + PADDLE_ENFORCE_EQ(support_type, + true, + phi::errors::InvalidArgument( + "cuDNN FMHA Only supports FP16/BF16 currently")); + auto mha_layout = MHA_Layout::NOT_INTERLEAVED; + auto mask_type = is_causal_masking ? 
MHA_Mask_Type::CAUSAL_MASK + : MHA_Mask_Type::PADDING_MASK; + std::vector all_ops; + std::vector ops; + std::set> data_ptrs; + + // q dim: {b, s_q, h, d}; + // k, v dim: {b, s_kv, h, d}; + auto batch_size = q.dims()[0]; + auto q_seq_len = q.dims()[1]; + auto num_heads = q.dims()[2]; + auto head_size = q.dims()[3]; + auto kv_seq_len = k.dims()[1]; + + void *q_dev_ptr = reinterpret_cast(const_cast(q.data())); + void *k_dev_ptr = reinterpret_cast(const_cast(k.data())); + void *v_dev_ptr = reinterpret_cast(const_cast(v.data())); + void *dq_dev_ptr = + reinterpret_cast(const_cast(q_grad->data())); + void *dk_dev_ptr = + reinterpret_cast(const_cast(k_grad->data())); + void *dv_dev_ptr = + reinterpret_cast(const_cast(v_grad->data())); + void *o_dev_ptr = reinterpret_cast(const_cast(O.data())); + void *do_dev_ptr = reinterpret_cast(const_cast(dO.data())); + void *softmax_out_dev_ptr = + reinterpret_cast(const_cast(softmax_out.data())); + void *mask_dev_ptr = + reinterpret_cast(const_cast(mask.data())); + void *seed_dev_ptr = reinterpret_cast( + const_cast(rng_state.data())); + void *offset_dev_ptr = reinterpret_cast( + const_cast(rng_state.data()) + 1); + + fused_attn_arbitrary_seqlen_bwd(batch_size, + num_heads, + q_seq_len, + kv_seq_len, + head_size, + scaling_factor, + dropout_probability, + mha_layout, + mask_type, + q_dev_ptr, + k_dev_ptr, + v_dev_ptr, + o_dev_ptr, + softmax_out_dev_ptr, + dq_dev_ptr, + dk_dev_ptr, + dv_dev_ptr, + do_dev_ptr, + mask_dev_ptr, + seed_dev_ptr, + offset_dev_ptr, + tensor_dtype, + dev_ctx.stream(), + handle, + false); +} + +} // namespace fusion +} // namespace phi + +PD_REGISTER_KERNEL(fused_dot_product_attention, + GPU, + ALL_LAYOUT, + phi::fusion::FusedDotProductAttentionKernel, + phi::dtype::float16, + phi::dtype::bfloat16) { + kernel->InputAt(3).SetDataType(phi::DataType::INT32); // mask +} + +PD_REGISTER_KERNEL(fused_dot_product_attention_grad, + GPU, + ALL_LAYOUT, + phi::fusion::FusedDotProductAttentionGradKernel, + phi::dtype::float16, + phi::dtype::bfloat16) { + kernel->InputAt(6).SetDataType(phi::DataType::INT32); // mask +} diff --git a/paddle/phi/kernels/gpudnn/mha_cudnn_frontend.cu b/paddle/phi/kernels/gpudnn/mha_cudnn_frontend.cu new file mode 100644 index 00000000000000..c3276fd2670d71 --- /dev/null +++ b/paddle/phi/kernels/gpudnn/mha_cudnn_frontend.cu @@ -0,0 +1,2172 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
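[Editor's aside, not part of the patch: the dropout RNG bookkeeping in the forward kernel above reserves Philox offsets with a ceiling division over the 128 threads per CTA. A minimal sketch of that arithmetic follows; the ceil_div helper name is my own, the kernel inlines the expression directly.]

#include <cassert>
#include <cstdint>

// Hypothetical helper; the forward kernel writes the expression inline as
// (q_seq_len * kv_seq_len + 128 - 1) / 128.
static int64_t ceil_div(int64_t a, int64_t b) { return (a + b - 1) / b; }

int main() {
  const int64_t q_seq_len = 128;
  const int64_t kv_seq_len = 256;
  // One Philox element per attention score handled by each of 128 threads.
  int64_t rng_elts_per_thread = ceil_div(q_seq_len * kv_seq_len, 128);
  assert(rng_elts_per_thread == 256);  // 128 * 256 / 128
  return 0;
}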
+ +#include "paddle/phi/kernels/gpudnn/mha_cudnn_frontend.h" + +#include +#include +#include +#include + +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/enforce.h" + +#define CUDNN_FRONTEND_UNUSED(X) ((void)X) + +#ifdef PADDLE_WITH_CUDNN_FRONTEND + +namespace phi { +namespace cudnn_fused_attn { + +#define Q_ID 1 +#define K_ID 2 +#define V_ID 3 +#define O_ID 4 +#define S_ID 5 +#define B_ID 6 +#define D_CONST_ID 7 +#define S_CONST_ID 8 +#define Q_SEQLEN_ID 9 +#define K_SEQLEN_ID 10 +#define dQ_ID 11 +#define dK_ID 12 +#define dV_ID 13 +#define dO_ID 14 +#define MASK_VAL_ID 15 +#define dS_ID 16 +#define D_SEED_ID 17 +#define D_OFFSET_ID 18 +#define S_STATS_ID 19 +#define S_SUM_ID 20 +#define SCALE_PROB 21 +#define K_TRANSPOSE_ID 22 +#define dQ_ACCUM_ID 23 + +#define VIRTUAL_ID 30 + +void generateMatrixStrides(int64_t b, + int64_t h, + int64_t s_q, + int64_t s_kv, + int64_t d, + int64_t *strideA, + MHA_Layout layout, + MHA_Matrix matrix) { + constexpr int batch_dim_idx = 0; + constexpr int head_dim_idx = 1; + constexpr int seqlen_dim_idx = 2; + constexpr int hidden_dim_idx = 3; + + constexpr int seqlen_transpose_dim_idx = 3; + constexpr int hidden_transpose_dim_idx = 2; + + constexpr int seqlen_q_dim_idx = 2; + constexpr int seqlen_kv_dim_idx = 3; + + // to be deprecated in the future + switch (matrix) { + case MHA_Matrix::Q_Matrix: + if (layout == MHA_Layout::QKV_INTERLEAVED) { + strideA[hidden_dim_idx] = 1; + strideA[seqlen_dim_idx] = 3 * h * d; + strideA[head_dim_idx] = d; + strideA[batch_dim_idx] = s_q * 3 * h * d; + } else if ((layout == MHA_Layout::KV_INTERLEAVED) || + (layout == MHA_Layout::NOT_INTERLEAVED)) { + strideA[hidden_dim_idx] = 1; + strideA[seqlen_dim_idx] = h * d; + strideA[head_dim_idx] = d; + strideA[batch_dim_idx] = s_q * h * d; + } + break; + case MHA_Matrix::K_Matrix: + if (layout == MHA_Layout::QKV_INTERLEAVED) { + strideA[seqlen_dim_idx] = 3 * h * d; + strideA[hidden_dim_idx] = 1; + strideA[head_dim_idx] = d; + strideA[batch_dim_idx] = s_kv * 3 * h * d; + } else if (layout == MHA_Layout::KV_INTERLEAVED) { + strideA[seqlen_dim_idx] = 2 * h * d; + strideA[hidden_dim_idx] = 1; + strideA[head_dim_idx] = d; + strideA[batch_dim_idx] = s_kv * 2 * h * d; + } else if (layout == MHA_Layout::NOT_INTERLEAVED) { + strideA[seqlen_dim_idx] = h * d; + strideA[hidden_dim_idx] = 1; + strideA[head_dim_idx] = d; + strideA[batch_dim_idx] = s_kv * h * d; + } + break; + case MHA_Matrix::K_Matrix_Transpose: + if (layout == MHA_Layout::QKV_INTERLEAVED) { + strideA[seqlen_transpose_dim_idx] = 3 * h * d; + strideA[hidden_transpose_dim_idx] = 1; + strideA[head_dim_idx] = d; + strideA[batch_dim_idx] = s_kv * 3 * h * d; + } else if (layout == MHA_Layout::KV_INTERLEAVED) { + strideA[seqlen_transpose_dim_idx] = 2 * h * d; + strideA[hidden_transpose_dim_idx] = 1; + strideA[head_dim_idx] = d; + strideA[batch_dim_idx] = s_kv * 2 * h * d; + } else if (layout == MHA_Layout::NOT_INTERLEAVED) { + strideA[seqlen_transpose_dim_idx] = h * d; + strideA[hidden_transpose_dim_idx] = 1; + strideA[head_dim_idx] = d; + strideA[batch_dim_idx] = s_kv * h * d; + } + break; + case MHA_Matrix::V_Matrix: + if (layout == MHA_Layout::QKV_INTERLEAVED) { + strideA[hidden_dim_idx] = 1; + strideA[seqlen_dim_idx] = 3 * h * d; + strideA[head_dim_idx] = d; + strideA[batch_dim_idx] = s_kv * 3 * h * d; + } else if (layout == MHA_Layout::KV_INTERLEAVED) { + strideA[hidden_dim_idx] = 1; + strideA[seqlen_dim_idx] = 2 * h * d; + strideA[head_dim_idx] = d; + 
strideA[batch_dim_idx] = s_kv * 2 * h * d; + } else if (layout == MHA_Layout::NOT_INTERLEAVED) { + strideA[hidden_dim_idx] = 1; + strideA[seqlen_dim_idx] = h * d; + strideA[head_dim_idx] = d; + strideA[batch_dim_idx] = s_kv * h * d; + } + break; + case MHA_Matrix::V_Matrix_Transpose: + if (layout == MHA_Layout::QKV_INTERLEAVED) { + strideA[hidden_transpose_dim_idx] = 1; + strideA[seqlen_transpose_dim_idx] = 3 * h * d; + strideA[head_dim_idx] = d; + strideA[batch_dim_idx] = s_kv * 3 * h * d; + } else if (layout == MHA_Layout::KV_INTERLEAVED) { + strideA[hidden_transpose_dim_idx] = 1; + strideA[seqlen_transpose_dim_idx] = 2 * h * d; + strideA[head_dim_idx] = d; + strideA[batch_dim_idx] = s_kv * 2 * h * d; + } else if (layout == MHA_Layout::NOT_INTERLEAVED) { + strideA[hidden_transpose_dim_idx] = 1; + strideA[seqlen_transpose_dim_idx] = h * d; + strideA[head_dim_idx] = d; + strideA[batch_dim_idx] = s_kv * h * d; + } + break; + case MHA_Matrix::S_Matrix: + strideA[seqlen_kv_dim_idx] = 1; + strideA[seqlen_q_dim_idx] = s_kv; + strideA[head_dim_idx] = s_q * s_kv; + strideA[batch_dim_idx] = h * s_q * s_kv; + break; + case MHA_Matrix::O_Matrix: + strideA[seqlen_kv_dim_idx] = 1; + strideA[seqlen_q_dim_idx] = h * d; + strideA[head_dim_idx] = d; + strideA[batch_dim_idx] = s_q * h * d; + break; + } +} + +static bool allowAllConfig(cudnnBackendDescriptor_t engine_config) { + (void)engine_config; + return false; +} + +static cudnn_frontend::Tensor tensor_create(cudnnDataType_t type, + int64_t id, + int64_t const *dim, + int64_t const *stride, + bool is_virtual, + bool is_value) { + int nbDims = 4; + auto tensor_created = + cudnn_frontend::TensorBuilder() + .setDim(nbDims, dim) + .setStride(nbDims, stride) + .setId(id) + .setAlignment( + 16) // 16B alignment is needed to run a tensor core engine + .setDataType(type) + .setVirtual(is_virtual) + .setByValue(is_value) + .build(); + VLOG(10) << tensor_created.describe(); + return tensor_created; +} + +static cudnn_frontend::PointWiseDesc pw_desc_create(cudnnDataType_t type, + cudnnPointwiseMode_t mode) { + auto pw_desc_created = cudnn_frontend::PointWiseDescBuilder() + .setMode(mode) + .setComputeType(type) + .build(); + + VLOG(10) << pw_desc_created.describe(); + return pw_desc_created; +} + +static cudnn_frontend::Operation unary_pw_op_create( + cudnn_frontend::Tensor const &xDesc, + cudnn_frontend::Tensor const &yDesc, + cudnn_frontend::PointWiseDesc const &pwDesc) { + auto pw_op_created = cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) + .setxDesc(xDesc) + .setyDesc(yDesc) + .setpwDesc(pwDesc) + .build(); + VLOG(10) << pw_op_created.describe(); + return pw_op_created; +} + +static cudnn_frontend::Operation binary_pw_op_create( + cudnn_frontend::Tensor const &xDesc, + cudnn_frontend::Tensor const &bDesc, + cudnn_frontend::Tensor const &yDesc, + cudnn_frontend::PointWiseDesc const &pwDesc) { + auto pw_op_created = cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) + .setxDesc(xDesc) + .setbDesc(bDesc) + .setyDesc(yDesc) + .setpwDesc(pwDesc) + .build(); + VLOG(10) << pw_op_created.describe(); + return pw_op_created; +} + +static cudnn_frontend::Operation ternary_pw_op_create( + cudnn_frontend::Tensor const &xDesc, + cudnn_frontend::Tensor const &bDesc, + cudnn_frontend::Tensor const &tDesc, + cudnn_frontend::Tensor const &yDesc, + cudnn_frontend::PointWiseDesc const &pwDesc) { + auto pw_op_created = cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) + 
.setxDesc(xDesc) + .setbDesc(bDesc) + .settDesc(tDesc) + .setyDesc(yDesc) + .setpwDesc(pwDesc) + .build(); + VLOG(10) << pw_op_created.describe(); + return pw_op_created; +} + +static cudnn_frontend::Tensor createScale( + int64_t b, + int64_t h, + int64_t s_q, + int64_t s_kv, + int64_t d, + MHA_Layout layout, + cudnnDataType_t tensorType, + const cudnn_frontend::Tensor &sTensor, + std::vector *ops) { + // scale + int64_t scale_dim[4] = {1, 1, 1, 1}; + int64_t scale_stride[4] = {1, 1, 1, 1}; + + int64_t s_dim[4] = {b, h, s_q, s_kv}; + int64_t s_stride[4]; + generateMatrixStrides( + b, h, s_q, s_kv, d, s_stride, layout, MHA_Matrix::S_Matrix); + + auto scaleTensor = tensor_create(tensorType, + S_CONST_ID, + scale_dim, + scale_stride, + false, + true); // is by value + auto sScaleTensor = tensor_create(tensorType, + VIRTUAL_ID + 2000, + s_dim, + s_stride, + true, + false); // is virtual + + // Define the scale descriptor + auto scaleDesc = pw_desc_create(CUDNN_DATA_FLOAT, CUDNN_POINTWISE_MUL); + + // Create a scale node + auto scale_op = + binary_pw_op_create(sTensor, scaleTensor, sScaleTensor, scaleDesc); + + ops->push_back(std::move(scale_op)); + return sScaleTensor; +} + +static cudnn_frontend::Tensor createQKBMM( + int64_t b, + int64_t h, + int64_t s_q, + int64_t s_kv, + int64_t d, + bool variable_sequence_length, + MHA_Layout layout, + cudnnDataType_t tensorType, + std::vector *ops) { + // Creates the necessary tensor descriptors + int64_t q_dim[4] = {b, h, s_q, d}; + int64_t q_stride[4]; + generateMatrixStrides( + b, h, s_q, s_kv, d, q_stride, layout, MHA_Matrix::Q_Matrix); + + int64_t k_dim[4] = {b, h, d, s_kv}; + int64_t k_stride[4]; + generateMatrixStrides( + b, h, s_q, s_kv, d, k_stride, layout, MHA_Matrix::K_Matrix_Transpose); + + int64_t s_dim[4] = {b, h, s_q, s_kv}; + int64_t s_stride[4]; + generateMatrixStrides( + b, h, s_q, s_kv, d, s_stride, layout, MHA_Matrix::S_Matrix); + + int64_t seqlen_dim[4] = {b, 1, 1, 1}; + int64_t seqlen_stride[4] = {1, 1, 1, 1}; + + auto qTensor = tensor_create(tensorType, Q_ID, q_dim, q_stride, false, false); + auto kTransposeTensor = tensor_create( + tensorType, K_ID, k_dim, k_stride, false, false); // is virtual + // first GEMM output + auto sTensor = tensor_create(CUDNN_DATA_FLOAT, + VIRTUAL_ID + 1, + s_dim, + s_stride, + true, + false); // is virtual + + // Define the matmul 1 desc + auto matmul_1_Desc = cudnn_frontend::MatMulDescBuilder() + .setComputeType(CUDNN_DATA_FLOAT) + .setPaddingValue(0.0f) + .build(); + + auto seqlenQTensor = tensor_create( + CUDNN_DATA_INT32, Q_SEQLEN_ID, seqlen_dim, seqlen_stride, false, false); + auto seqlenKTensor = tensor_create( + CUDNN_DATA_INT32, K_SEQLEN_ID, seqlen_dim, seqlen_stride, false, false); + + // Create a matmul 1 node + auto &&matmul_op_builder = cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR); + + matmul_op_builder.setaMatDesc(qTensor) + .setbMatDesc(kTransposeTensor) + .setcMatDesc(sTensor) + .setmatmulDesc(matmul_1_Desc); + + if (variable_sequence_length) { + matmul_op_builder.setmOverrideDesc(seqlenQTensor) + .setnOverrideDesc(seqlenKTensor); + } + + auto matmul_op1 = matmul_op_builder.build(); + + ops->push_back(std::move(matmul_op1)); + + return sTensor; +} + +static cudnn_frontend::Tensor createPaddingMask( + int64_t b, + int64_t h, + int64_t s_q, + int64_t s_kv, + int64_t d, + MHA_Layout layout, + cudnnDataType_t tensorType, + std::vector *ops, + const cudnn_frontend::Tensor &prevBlockOutputTensor) { + CUDNN_FRONTEND_UNUSED(d); + CUDNN_FRONTEND_UNUSED(layout); + 
CUDNN_FRONTEND_UNUSED(tensorType); + + PADDLE_ENFORCE_EQ( + (ops->size() != 0), + true, + phi::errors::PreconditionNotMet( + "Padding Mask constructed incorrectly as the first one")); + + // subtraction output + int64_t afterBMM1_dim[4] = {b, h, s_q, s_kv}; + int64_t afterBMM1_stride[4] = {h * s_q * s_kv, s_q * s_kv, s_kv, 1}; + + int64_t maskVal_dim[4] = {1, 1, 1, 1}; + int64_t maskVal_stride[4] = {1, 1, 1, 1}; + + int64_t seqlen_dim[4] = {b, 1, 1, 1}; + int64_t seqlen_stride[4] = {1, 1, 1, 1}; + + // mask value to put in the masked pixels + auto maskValTensor = tensor_create( + CUDNN_DATA_FLOAT, MASK_VAL_ID, maskVal_dim, maskVal_stride, false, true); + auto seqlenQTensor = tensor_create( + CUDNN_DATA_INT32, Q_SEQLEN_ID, seqlen_dim, seqlen_stride, false, false); + auto seqlenKTensor = tensor_create( + CUDNN_DATA_INT32, K_SEQLEN_ID, seqlen_dim, seqlen_stride, false, false); + + // gen index row output + auto rowIndexTensor = tensor_create(CUDNN_DATA_FLOAT, + VIRTUAL_ID + 300, + afterBMM1_dim, + afterBMM1_stride, + true, + false); + // gen index column output + auto columnIndexTensor = tensor_create(CUDNN_DATA_FLOAT, + VIRTUAL_ID + 301, + afterBMM1_dim, + afterBMM1_stride, + true, + false); + // less than row output + auto lessThanRowTensor = tensor_create(CUDNN_DATA_BOOLEAN, + VIRTUAL_ID + 302, + afterBMM1_dim, + afterBMM1_stride, + true, + false); + // less than column output + auto lessThanColTensor = tensor_create(CUDNN_DATA_BOOLEAN, + VIRTUAL_ID + 303, + afterBMM1_dim, + afterBMM1_stride, + true, + false); + // padding mask (lessthanRow && lessthanCol) + auto paddingMaskTensor = tensor_create(CUDNN_DATA_BOOLEAN, + VIRTUAL_ID + 304, + afterBMM1_dim, + afterBMM1_stride, + true, + false); + + // output after masking + auto maskOutputTensor = tensor_create(CUDNN_DATA_FLOAT, + VIRTUAL_ID + 305, + afterBMM1_dim, + afterBMM1_stride, + true, + false); + + // Define the gen index for row descriptor + auto genIndexRowDesc = cudnn_frontend::PointWiseDescBuilder() + .setMode(CUDNN_POINTWISE_GEN_INDEX) + .setAxis(2) + .setComputeType(CUDNN_DATA_FLOAT) + .build(); + + // Create a gen index Node. + auto genIndexRow_op = unary_pw_op_create( + prevBlockOutputTensor, rowIndexTensor, genIndexRowDesc); + + // Define the gen index for row descriptor + auto genIndexColumnDesc = cudnn_frontend::PointWiseDescBuilder() + .setMode(CUDNN_POINTWISE_GEN_INDEX) + .setAxis(3) + .setComputeType(CUDNN_DATA_FLOAT) + .build(); + + // Create a gen index Node. + auto genIndexColumn_op = unary_pw_op_create( + prevBlockOutputTensor, columnIndexTensor, genIndexColumnDesc); + + // Define the less than comparison for row descriptor + auto lessThanRowDesc = + pw_desc_create(CUDNN_DATA_FLOAT, CUDNN_POINTWISE_CMP_LT); + + // Create a less than comparison for row Node. + auto lessThanRow_op = binary_pw_op_create( + rowIndexTensor, seqlenQTensor, lessThanRowTensor, lessThanRowDesc); + + // Define the less than comparison for column descriptor + auto lessThanColDesc = + pw_desc_create(CUDNN_DATA_FLOAT, CUDNN_POINTWISE_CMP_LT); + + // Create a less than comparison for col Node. 
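+  // Note: together with the row comparison above, the column comparison below
+  // feeds the LOGICAL_AND that forms the padding predicate
+  //   keep(row, col) = (row < actual_seqlen_q) && (col < actual_seqlen_kv),
+  // and the final BINARY_SELECT then writes maskVal into every position
+  // outside the valid region while leaving in-range scores untouched.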
+ auto lessThanCol_op = binary_pw_op_create( + columnIndexTensor, seqlenKTensor, lessThanColTensor, lessThanColDesc); + + // Define the less than comparison for column descriptor + auto paddingMaskAndDesc = + pw_desc_create(CUDNN_DATA_BOOLEAN, CUDNN_POINTWISE_LOGICAL_AND); + + // Create a and node for combining lessThanRow and lessThanCol + auto paddingMaskAnd_op = binary_pw_op_create(lessThanRowTensor, + lessThanColTensor, + paddingMaskTensor, + paddingMaskAndDesc); + + /////////////////// Apply the mask ////////////////////////// + + // Define the binary select to perform masking descriptor + auto maskDesc = + pw_desc_create(CUDNN_DATA_FLOAT, CUDNN_POINTWISE_BINARY_SELECT); + + // Create a binary select Node. + auto mask_op = ternary_pw_op_create(prevBlockOutputTensor, + maskValTensor, + paddingMaskTensor, + maskOutputTensor, + maskDesc); + + ops->push_back(std::move(genIndexRow_op)); + ops->push_back(std::move(genIndexColumn_op)); + ops->push_back(std::move(lessThanRow_op)); + ops->push_back(std::move(lessThanCol_op)); + ops->push_back(std::move(paddingMaskAnd_op)); + ops->push_back(std::move(mask_op)); + + return maskOutputTensor; +} + +static cudnn_frontend::Tensor createCausalMask( + int64_t b, + int64_t h, + int64_t s_q, + int64_t s_kv, + int64_t d, + MHA_Layout layout, + cudnnDataType_t tensorType, + std::vector *ops, + const cudnn_frontend::Tensor &prevBlockOutputTensor) { + CUDNN_FRONTEND_UNUSED(d); + CUDNN_FRONTEND_UNUSED(layout); + CUDNN_FRONTEND_UNUSED(tensorType); + + PADDLE_ENFORCE_EQ( + (ops->size() != 0), + true, + phi::errors::PreconditionNotMet( + "Causal Mask constructed incorrectly as the first one")); + + // subtraction output + int64_t afterBMM1_dim[4] = {b, h, s_q, s_kv}; + int64_t afterBMM1_stride[4] = {h * s_q * s_kv, s_q * s_kv, s_kv, 1}; + + int64_t maskVal_dim[4] = {1, 1, 1, 1}; + int64_t maskVal_stride[4] = {1, 1, 1, 1}; + + // mask value to put in the masked pixels + auto maskValTensor = tensor_create(CUDNN_DATA_FLOAT, + MASK_VAL_ID, + maskVal_dim, + maskVal_stride, + false, + true); // is by value + // gen index row output + auto rowIndexTensor = tensor_create(CUDNN_DATA_FLOAT, + VIRTUAL_ID + 100, + afterBMM1_dim, + afterBMM1_stride, + true, + false); // is virtual + // gen index column output + auto columnIndexTensor = tensor_create(CUDNN_DATA_FLOAT, + VIRTUAL_ID + 101, + afterBMM1_dim, + afterBMM1_stride, + true, + false); // is virtual + // create causal mask (row >= col) + auto causalMaskTensor = tensor_create(CUDNN_DATA_BOOLEAN, + VIRTUAL_ID + 106, + afterBMM1_dim, + afterBMM1_stride, + true, + false); // is virtual + + // output after masking + auto maskOutputTensor = tensor_create(CUDNN_DATA_FLOAT, + VIRTUAL_ID + 107, + afterBMM1_dim, + afterBMM1_stride, + true, + false); // is virtual + + // Define the gen index for row descriptor + auto genIndexRowDesc = cudnn_frontend::PointWiseDescBuilder() + .setMode(CUDNN_POINTWISE_GEN_INDEX) + .setAxis(2) + .setComputeType(CUDNN_DATA_FLOAT) + .build(); + + // Create a gen index node + auto genIndexRow_op = unary_pw_op_create( + prevBlockOutputTensor, rowIndexTensor, genIndexRowDesc); + + // Define the gen index for row descriptor + auto genIndexColumnDesc = cudnn_frontend::PointWiseDescBuilder() + .setMode(CUDNN_POINTWISE_GEN_INDEX) + .setAxis(3) + .setComputeType(CUDNN_DATA_FLOAT) + .build(); + + // Create a gen index node + auto genIndexColumn_op = unary_pw_op_create( + prevBlockOutputTensor, columnIndexTensor, genIndexColumnDesc); + + // Define the greater than equal to comparison descriptor + auto 
rowGreaterColDesc = + pw_desc_create(CUDNN_DATA_BOOLEAN, CUDNN_POINTWISE_CMP_GE); + + // Create a greater than equal to node + auto rowGreaterCol_op = binary_pw_op_create( + rowIndexTensor, columnIndexTensor, causalMaskTensor, rowGreaterColDesc); + + // Define the binary select to perform masking descriptor + auto maskDesc = + pw_desc_create(CUDNN_DATA_FLOAT, CUDNN_POINTWISE_BINARY_SELECT); + + // Create a binary select node + auto mask_op = ternary_pw_op_create(prevBlockOutputTensor, + maskValTensor, + causalMaskTensor, + maskOutputTensor, + maskDesc); + + ops->push_back(std::move(genIndexRow_op)); + ops->push_back(std::move(genIndexColumn_op)); + ops->push_back(std::move(rowGreaterCol_op)); + ops->push_back(std::move(mask_op)); + + return maskOutputTensor; +} + +static cudnn_frontend::Tensor createSoftmaxForward( + int64_t b, + int64_t h, + int64_t s_q, + int64_t s_kv, + bool isTraining, + std::vector *ops, + const cudnn_frontend::Tensor &sAfterMaskTensor) { + int64_t afterBMM1_dim[4] = {b, h, s_q, s_kv}; + int64_t afterBMM1_stride[4] = {h * s_q * s_kv, s_q * s_kv, s_kv, 1}; + + int64_t afterReduction_dim[4] = {b, h, s_q, 1}; + int64_t afterReduction_stride[4] = {h * s_q, s_q, 1, 1}; + + // max (x) + auto afterMaxReductionTensor = tensor_create(CUDNN_DATA_FLOAT, + VIRTUAL_ID + 150, + afterReduction_dim, + afterReduction_stride, + true, + false); // is virtual + + // x - max(x) + auto afterSubtractionTensor = tensor_create(CUDNN_DATA_FLOAT, + VIRTUAL_ID + 151, + afterBMM1_dim, + afterBMM1_stride, + true, + false); // is virtual + + // e^(x - max(x)) + auto afterExponentTensor = tensor_create(CUDNN_DATA_FLOAT, + VIRTUAL_ID + 152, + afterBMM1_dim, + afterBMM1_stride, + true, + false); // is virtual; + + // sum (e^(x - max(x))) + auto afterAddReductionTensor = tensor_create(CUDNN_DATA_FLOAT, + VIRTUAL_ID + 153, + afterReduction_dim, + afterReduction_stride, + true, + false); // is virtual + + // log (sum (e^(x - max(x)))) + auto afterLogLTensor = tensor_create(CUDNN_DATA_FLOAT, + VIRTUAL_ID + 154, + afterReduction_dim, + afterReduction_stride, + true, + false); + + // M + log (sum (e^(x - max(x)))) + auto softmaxStatsTensor = tensor_create(CUDNN_DATA_FLOAT, + S_STATS_ID, + afterReduction_dim, + afterReduction_stride, + !isTraining, + false); + // not virtual if training is true, virtual if training is false + + // divide (e/ sum(e)) + auto afterSoftmaxTensor = + cudnn_frontend::TensorBuilder() + .setDim(4, afterBMM1_dim) + .setStride(4, afterBMM1_stride) + .setId(VIRTUAL_ID + 156) + .setAlignment( + 16) // 16B alignment is needed to run a tensor core engine + .setDataType(CUDNN_DATA_FLOAT) + .setVirtual(true) + .setByValue(false) + .setReorderType(cudnn_frontend::cudnnBackendTensorReordering_t:: + CUDNN_TENSOR_REORDERING_F16x16) + .build(); + + // Define the reduction descriptor + auto reductionMaxDesc = cudnn_frontend::ReductionDescBuilder() + .setComputeType(CUDNN_DATA_FLOAT) + .setReductionOp(CUDNN_REDUCE_TENSOR_MAX) + .build(); + + // Create a reduction max node + auto reductionMax_op = cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR) + .setxDesc(sAfterMaskTensor) + .setyDesc(afterMaxReductionTensor) + .setreductionDesc(reductionMaxDesc) + .build(); + + // Define the subtract descriptor + auto subtractDesc = pw_desc_create(CUDNN_DATA_FLOAT, CUDNN_POINTWISE_SUB); + + // Create a subtract node + auto subtract_op = binary_pw_op_create(sAfterMaskTensor, + afterMaxReductionTensor, + afterSubtractionTensor, + subtractDesc); + + // Define the exponent descriptor 
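For reference, a hedged numpy sketch of the numerically stable softmax that createSoftmaxForward assembles, including the softmaxStats row statistic that is materialized (non-virtual) only when training and is consumed by the backward pass; the exponent descriptor definition continues below:

    import numpy as np

    def softmax_forward_reference(x):
        # x: [b, h, s_q, s_kv] masked attention scores
        m = x.max(axis=-1, keepdims=True)      # CUDNN_REDUCE_TENSOR_MAX
        e = np.exp(x - m)                      # e^(x - max(x))
        z = e.sum(axis=-1, keepdims=True)      # CUDNN_REDUCE_TENSOR_ADD
        stats = m + np.log(z)                  # S_STATS_ID: M + log(sum(e^(x - M)))
        return e / z, stats                    # probabilities plus per-row statistics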
+ auto exponentDesc = pw_desc_create(CUDNN_DATA_FLOAT, CUDNN_POINTWISE_EXP); + + // Create a exponent node + auto exponent_op = unary_pw_op_create( + afterSubtractionTensor, afterExponentTensor, exponentDesc); + + // Define the reduction descriptor + auto reductionAddDesc = cudnn_frontend::ReductionDescBuilder() + .setComputeType(CUDNN_DATA_FLOAT) + .setReductionOp(CUDNN_REDUCE_TENSOR_ADD) + .build(); + + // Create a reduction add node + auto reductionAdd_op = cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR) + .setxDesc(afterExponentTensor) + .setyDesc(afterAddReductionTensor) + .setreductionDesc(reductionAddDesc) + .build(); + + // Create log descriptor + auto logDesc = pw_desc_create(CUDNN_DATA_FLOAT, CUDNN_POINTWISE_LOG); + + // Create log node + auto log_op = + unary_pw_op_create(afterAddReductionTensor, afterLogLTensor, logDesc); + + // Create add descriptor + auto addDesc = pw_desc_create(CUDNN_DATA_FLOAT, CUDNN_POINTWISE_ADD); + + // Create add node + auto add_op = binary_pw_op_create( + afterMaxReductionTensor, afterLogLTensor, softmaxStatsTensor, addDesc); + + // Define the division descriptor + auto divisionDesc = pw_desc_create(CUDNN_DATA_FLOAT, CUDNN_POINTWISE_DIV); + + // Create a subtract node + auto division_op = binary_pw_op_create(afterExponentTensor, + afterAddReductionTensor, + afterSoftmaxTensor, + divisionDesc); + + ops->push_back(std::move(reductionMax_op)); + ops->push_back(std::move(subtract_op)); + ops->push_back(std::move(exponent_op)); + ops->push_back(std::move(reductionAdd_op)); + ops->push_back(std::move(log_op)); + ops->push_back(std::move(add_op)); + ops->push_back(std::move(division_op)); + + return afterSoftmaxTensor; +} + +static cudnn_frontend::Tensor createDropoutForward( + int64_t b, + int64_t h, + int64_t s_q, + int64_t s_kv, + int64_t d, + double probability, + cudnnDataType_t tensorType, + std::vector *ops, + const cudnn_frontend::Tensor &afterSoftmaxTensor) { + CUDNN_FRONTEND_UNUSED(d); + + PADDLE_ENFORCE_EQ( + (ops->size() != 0), + true, + phi::errors::PreconditionNotMet( + "Dropout DAG constructed incorrectly as the first one")); + + int64_t afterBMM1_dim[4] = {b, h, s_q, s_kv}; + int64_t afterBMM1_stride[4] = {h * s_q * s_kv, s_q * s_kv, s_kv, 1}; + + int64_t scale_dim[4] = {1, 1, 1, 1}; + int64_t scale_stride[4] = {1, 1, 1, 1}; + + auto dropoutSeed = tensor_create(CUDNN_DATA_INT64, + D_SEED_ID, + scale_dim, + scale_stride, + false, + false); // not virtual + auto dropoutOffset = tensor_create(CUDNN_DATA_INT64, + D_OFFSET_ID, + scale_dim, + scale_stride, + false, + false); // not virtual + + // mask for the dropout + auto dropoutMaskTensor = tensor_create(CUDNN_DATA_FLOAT, + VIRTUAL_ID + 200, + afterBMM1_dim, + afterBMM1_stride, + true, + false); // is virtual + // after dropout tensor + auto afterDropoutTensor = + cudnn_frontend::TensorBuilder() + .setDim(4, afterBMM1_dim) + .setStride(4, afterBMM1_stride) + .setId(VIRTUAL_ID + 201) + .setAlignment( + 16) // 16B alignment is needed to run a tensor core engine + .setDataType(tensorType) + .setVirtual(true) + .setByValue(false) + .setReorderType(cudnn_frontend::cudnnBackendTensorReordering_t:: + CUDNN_TENSOR_REORDERING_F16x16) + .build(); + // scale after dropout + auto scaleDropoutTensor = tensor_create(CUDNN_DATA_FLOAT, + D_CONST_ID, + scale_dim, + scale_stride, + false, + true); // is by value + // after Scale + auto afterScaleTensor = tensor_create(tensorType, + VIRTUAL_ID + 202, + afterBMM1_dim, + afterBMM1_stride, + true, + false); // is virtual + + // 
Define the reduction descriptor + auto rngDesc = cudnn_frontend::RngDescBuilder() + .setRngDistribution(CUDNN_RNG_DISTRIBUTION_BERNOULLI) + .setBernoulliDistProbability(1.0 - probability) + .build(); + + // Create a rng node + auto rng_op = + cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_RNG_DESCRIPTOR) + .setyDesc(dropoutMaskTensor) + .setSeedDesc(dropoutSeed) + .setOffsetDesc(dropoutOffset) + .setRngDesc(rngDesc) + .build(); + + // Define the multiply mask descriptor + auto maskMulDesc = pw_desc_create(CUDNN_DATA_FLOAT, CUDNN_POINTWISE_MUL); + + // Create a multiply mask node + auto maskMul_op = binary_pw_op_create( + afterSoftmaxTensor, dropoutMaskTensor, afterDropoutTensor, maskMulDesc); + + // Define the multiply scale descriptor + auto scaleMulDesc = pw_desc_create(CUDNN_DATA_FLOAT, CUDNN_POINTWISE_MUL); + + // Create a multiply scale node + auto scaleMul_op = binary_pw_op_create( + afterDropoutTensor, scaleDropoutTensor, afterScaleTensor, scaleMulDesc); + + ops->push_back(std::move(rng_op)); + ops->push_back(std::move(maskMul_op)); + ops->push_back(std::move(scaleMul_op)); + + return afterScaleTensor; +} + +static cudnn_frontend::Tensor createDropoutBackward( + int64_t b, + int64_t h, + int64_t s_q, + int64_t s_kv, + int64_t d, + double probability, + cudnnDataType_t tensorType, + std::vector *ops, + const cudnn_frontend::Tensor &afterSoftmaxTensor, + const cudnn_frontend::Tensor &dropoutMaskTensor) { + CUDNN_FRONTEND_UNUSED(d); + + PADDLE_ENFORCE_EQ( + (ops->size() != 0), + true, + phi::errors::PreconditionNotMet( + "Dropout DAG constructed incorrectly as the first one")); + + int64_t afterBMM1_dim[4] = {b, h, s_q, s_kv}; + int64_t afterBMM1_stride[4] = {h * s_q * s_kv, s_q * s_kv, s_kv, 1}; + + int64_t scale_dim[4] = {1, 1, 1, 1}; + int64_t scale_stride[4] = {1, 1, 1, 1}; + + auto dropoutSeed = tensor_create(CUDNN_DATA_INT64, + D_SEED_ID, + scale_dim, + scale_stride, + false, + false); // not virtual + auto dropoutOffset = tensor_create(CUDNN_DATA_INT64, + D_OFFSET_ID, + scale_dim, + scale_stride, + false, + false); // not virtual + + // after dropout tensor + auto afterDropoutTensor = + cudnn_frontend::TensorBuilder() + .setDim(4, afterBMM1_dim) + .setStride(4, afterBMM1_stride) + .setId(VIRTUAL_ID + 201) + .setAlignment( + 16) // 16B alignment is needed to run a tensor core engine + .setDataType(tensorType) + .setVirtual(true) + .setByValue(false) + .setReorderType(cudnn_frontend::cudnnBackendTensorReordering_t:: + CUDNN_TENSOR_REORDERING_F16x16) + .build(); + // scale after dropout + auto scaleDropoutTensor = tensor_create(CUDNN_DATA_FLOAT, + D_CONST_ID, + scale_dim, + scale_stride, + false, + true); // is by value + // after Scale + auto afterScaleTensor = tensor_create(tensorType, + VIRTUAL_ID + 202, + afterBMM1_dim, + afterBMM1_stride, + true, + false); // is virtual + + // Define the reduction descriptor + auto rngDesc = cudnn_frontend::RngDescBuilder() + .setRngDistribution(CUDNN_RNG_DISTRIBUTION_BERNOULLI) + .setBernoulliDistProbability(1.0 - probability) + .build(); + + // Create a rng node + auto rng_op = + cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_RNG_DESCRIPTOR) + .setyDesc(dropoutMaskTensor) + .setSeedDesc(dropoutSeed) + .setOffsetDesc(dropoutOffset) + .setRngDesc(rngDesc) + .build(); + + // Define the multiply mask descriptor + auto maskMulDesc = pw_desc_create(CUDNN_DATA_FLOAT, CUDNN_POINTWISE_MUL); + + // Create a multiply mask node + auto maskMul_op = binary_pw_op_create( + afterSoftmaxTensor, dropoutMaskTensor, afterDropoutTensor, 
maskMulDesc); + + // Define the multiply scale descriptor + auto scaleMulDesc = pw_desc_create(CUDNN_DATA_FLOAT, CUDNN_POINTWISE_MUL); + + // Create a multiply scale node + auto scaleMul_op = binary_pw_op_create( + afterDropoutTensor, scaleDropoutTensor, afterScaleTensor, scaleMulDesc); + + ops->push_back(std::move(rng_op)); + ops->push_back(std::move(maskMul_op)); + ops->push_back(std::move(scaleMul_op)); + + return afterScaleTensor; +} + +static void createSVBMM(int64_t b, + int64_t h, + int64_t s_q, + int64_t s_kv, + int64_t d, + bool variable_sequence_length, + MHA_Layout layout, + cudnnDataType_t tensorType, + std::vector *ops, + cudnn_frontend::Tensor const &afterScaleDropoutTensor) { + PADDLE_ENFORCE_EQ((ops->size() != 0), + true, + phi::errors::PreconditionNotMet( + "SVBMM op constructed incorrectly as the first one")); + + int64_t v_dim[4] = {b, h, s_kv, d}; + int64_t v_stride[4]; + generateMatrixStrides( + b, h, s_q, s_kv, d, v_stride, layout, MHA_Matrix::V_Matrix); + + int64_t o_dim[4] = {b, h, s_q, d}; + int64_t o_stride[4]; + generateMatrixStrides( + b, h, s_q, s_kv, d, o_stride, layout, MHA_Matrix::O_Matrix); + + int64_t seqlen_dim[4] = {b, 1, 1, 1}; + int64_t seqlen_stride[4] = {1, 1, 1, 1}; + + auto seqlenQTensor = tensor_create( + CUDNN_DATA_INT32, Q_SEQLEN_ID, seqlen_dim, seqlen_stride, false, false); + auto seqlenKTensor = tensor_create( + CUDNN_DATA_INT32, K_SEQLEN_ID, seqlen_dim, seqlen_stride, false, false); + + auto vTensor = tensor_create(tensorType, V_ID, v_dim, v_stride, false, false); + // second GEMM output + auto oTensor = tensor_create(tensorType, O_ID, o_dim, o_stride, false, false); + + // Define the matmul 2 desc + auto matmul_2_Desc = cudnn_frontend::MatMulDescBuilder() + .setComputeType(CUDNN_DATA_FLOAT) + .setPaddingValue(0.0f) + .build(); + + // Create a matmul 2 node + auto &&matmul_op_builder = cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR); + + matmul_op_builder.setaMatDesc(afterScaleDropoutTensor) + .setbMatDesc(vTensor) + .setcMatDesc(oTensor) + .setmatmulDesc(matmul_2_Desc); + + if (variable_sequence_length) { + matmul_op_builder.setmOverrideDesc(seqlenQTensor) + .setkOverrideDesc(seqlenKTensor); + } + + auto matmul_op2 = matmul_op_builder.build(); + + ops->push_back(std::move(matmul_op2)); +} + +struct FADescriptor { + std::int64_t b; + std::int64_t h; + std::int64_t s_q; + std::int64_t s_kv; + std::int64_t d; + float attnScale; + bool isTraining; + float dropoutProbability; + MHA_Layout layout; + MHA_Bias_Type bias_type; + MHA_Mask_Type mask_type; + cudnnDataType_t tensor_type; + bool use_workspace_opt; + bool variable_sequence_length; + + bool operator<(const FADescriptor &rhs) const { + return std::tie(b, + h, + s_q, + s_kv, + d, + attnScale, + isTraining, + dropoutProbability, + layout, + mask_type, + bias_type, + tensor_type, + use_workspace_opt, + variable_sequence_length) < + std::tie(rhs.b, + rhs.h, + rhs.s_q, + rhs.s_kv, + rhs.d, + rhs.attnScale, + rhs.isTraining, + rhs.dropoutProbability, + rhs.layout, + rhs.mask_type, + rhs.bias_type, + rhs.tensor_type, + rhs.use_workspace_opt, + rhs.variable_sequence_length); + } +}; +} // namespace cudnn_fused_attn +} // namespace phi + +using namespace phi::cudnn_fused_attn; // NOLINT + +constexpr int BLOCK_SIZE = 512; + +__global__ __launch_bounds__(BLOCK_SIZE) void mask_to_actual_seqlens_kernel( + const int32_t *mask, + int32_t *q_actual_seqlen, + int32_t *kv_actual_seqlen, + int q_seqlen, + int kv_seqlen, + bool need_kv) { + typedef cub::BlockReduce BlockReduce; + 
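Before the kernel body continues below, a hedged numpy equivalent of what mask_to_actual_seqlens_kernel computes, one CUDA block per batch entry; it samples only the first mask column per query row and the first mask row per key column, so it assumes the valid region is a contiguous top-left block of ones:

    import numpy as np

    def mask_to_actual_seqlens_reference(mask):
        # mask: int32 [b, 1, s_q, s_kv], ones over the valid positions
        q_actual = (mask[:, 0, :, 0] != 0).sum(axis=1).astype(np.int32)   # probe column 0
        kv_actual = (mask[:, 0, 0, :] != 0).sum(axis=1).astype(np.int32)  # probe row 0
        return q_actual, kv_actual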
__shared__ typename BlockReduce::TempStorage q_smem; + __shared__ typename BlockReduce::TempStorage kv_smem; + unsigned int tid = threadIdx.x; + unsigned int batch_offset = blockIdx.x * q_seqlen * kv_seqlen; + + // load mask, convert to 1/0, do accumulation + int q = 0, kv = 0; + for (unsigned int q_idx = tid * kv_seqlen; q_idx < q_seqlen * kv_seqlen; + q_idx += BLOCK_SIZE * kv_seqlen) { + q += (mask[q_idx + batch_offset] ? 1 : 0); + } + + if (need_kv) { + for (unsigned int kv_idx = tid; kv_idx < kv_seqlen; kv_idx += BLOCK_SIZE) { + kv += (mask[kv_idx + batch_offset] ? 1 : 0); + } + } + __syncthreads(); + + // compute cub::BlockReduce + int q_sum, kv_sum; + q_sum = BlockReduce(q_smem).Sum(q); + if (need_kv) kv_sum = BlockReduce(kv_smem).Sum(kv); + + // write result for this block to global mem + if (tid == 0) { + q_actual_seqlen[blockIdx.x] = q_sum; + if (need_kv) { + kv_actual_seqlen[blockIdx.x] = kv_sum; + } + } +} + +void fused_attn_arbitrary_seqlen_fwd( + int64_t b, + int64_t h, + int64_t s_q, + int64_t s_kv, + int64_t d, + bool is_training, + float scaling_factor, + float dropout_probability, + MHA_Layout layout, + MHA_Mask_Type mask_type, + void *devPtrQ, + void *devPtrK, + void *devPtrV, + void *devPtrSoftmaxStats, + void *devPtrO, + void *devPtrMask, + // void *devPtrCuSeqlenQ, void *devPtrCuSeqlenKV, + void *devPtrDropoutSeed, + void *devPtrDropoutOffset, + cudnnDataType_t tensorType, + cudaStream_t stream, + cudnnHandle_t handle) { + try { + CUDNN_CALL(phi::dynload::cudnnSetStream(handle, stream)); + + if (!is_training) { + dropout_probability = 0.0f; + } + + bool variable_sequence_length = + CUDNN_VERSION >= 8906 && mask_type == MHA_Mask_Type::PADDING_MASK; + + FADescriptor descriptor{b, + h, + s_q, + s_kv, + d, + scaling_factor, + is_training, + dropout_probability, + layout, + MHA_Bias_Type::NO_BIAS, + mask_type, + tensorType, + false, + variable_sequence_length}; + + using CacheType = std::map; + static thread_local CacheType fmha_fprop_cache; + + // Get plan from cache if cache is available, otherwise create one + auto get_plan = [&](CacheType &cache, const FADescriptor &descriptor) { + // if hit, return + auto it = cache.find(descriptor); + if (it != cache.end()) { + auto plan = it->second; + return plan; + } + + // otherwise, build the op_graph and the plan. 
Then update cache + std::vector all_ops; + std::vector ops; + + // Q * K^T + auto sTensor = createQKBMM(b, + h, + s_q, + s_kv, + d, + variable_sequence_length, + layout, + tensorType, + &ops); + + // Q * K^T * bmmScale + auto sScaleTensor = createScale( + b, h, s_q, s_kv, d, layout, CUDNN_DATA_FLOAT, sTensor, &ops); + + auto &sAfterMaskTensor = sScaleTensor; + + if (mask_type == MHA_Mask_Type::CAUSAL_MASK) { + sAfterMaskTensor = createCausalMask( + b, h, s_q, s_kv, d, layout, tensorType, &ops, sScaleTensor); + } else if (variable_sequence_length) { // padding mask + sAfterMaskTensor = createPaddingMask( + b, h, s_q, s_kv, d, layout, tensorType, &ops, sScaleTensor); + } + + PADDLE_ENFORCE_EQ( + (dropout_probability >= 0.0f && dropout_probability < 1.0f), + true, + phi::errors::PreconditionNotMet( + "dropout_probability should be in the range [0, 1)")); + + auto softmax_output = createSoftmaxForward( + b, h, s_q, s_kv, is_training, &ops, sAfterMaskTensor); + + // Dropout(softmax) + auto dropout_output = createDropoutForward(b, + h, + s_q, + s_kv, + d, + dropout_probability, + tensorType, + &ops, + softmax_output); + + createSVBMM(b, + h, + s_q, + s_kv, + d, + variable_sequence_length, + layout, + tensorType, + &ops, + dropout_output); + + for (unsigned int i = 0; i < ops.size(); i++) { + all_ops.push_back(&ops[i]); + } + + // Create an Operation Graph + auto opGraph = cudnn_frontend::OperationGraphBuilder() + .setHandle(handle) + .setOperationGraph(all_ops.size(), all_ops.data()) + .build(); + + cudnn_frontend::EngineConfigList filtered_configs; + auto statuses = + cudnn_frontend::get_heuristics_list<1>({"heuristics_instant"}, + opGraph, + allowAllConfig, + filtered_configs, + true); + + if (filtered_configs.size() == 0) { + cudnn_frontend::set_error_and_throw_exception( + nullptr, + CUDNN_STATUS_NOT_SUPPORTED, + "run_mha_fprop: No config returned by the heuristics"); + } + + auto plan = cudnn_frontend::ExecutionPlanBuilder() + .setHandle(handle) + .setEngineConfig(filtered_configs[0], opGraph.getTag()) + .build(); + + cache.insert({descriptor, plan}); + return plan; + }; + + auto plan = get_plan(fmha_fprop_cache, descriptor); + VLOG(10) << "Plan tag: " << plan.getTag(); + + auto plan_workspace_size = plan.getWorkspaceSize(); + VLOG(10) << plan.describe() << " plan requires workspace " + << plan_workspace_size; + + size_t actual_seqlen_workspace_size = 2 * b * sizeof(int32_t); + size_t workspace_size = plan_workspace_size + actual_seqlen_workspace_size; + + void *workspace = nullptr; + if (workspace_size > 0) { + PADDLE_ENFORCE_GPU_SUCCESS( + cudaMallocAsync(&workspace, workspace_size, stream)); + } + + // Prepare actual seqlen + constexpr size_t nthreads_per_block = 512; + const size_t grid = b; + void *devActualSeqlenQ = + static_cast(workspace) + plan_workspace_size; + void *devActualSeqlenK = + static_cast(devActualSeqlenQ) + b * sizeof(int32_t); + + if (variable_sequence_length) { + mask_to_actual_seqlens_kernel<<>>( + static_cast(devPtrMask), + static_cast(devActualSeqlenQ), + static_cast(devActualSeqlenK), + s_q, + s_kv, + true); + PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError()); + } + + std::set> data_ptrs; + // Add all the data pointers to be used in the variant pack + float negInfinity = -1.0E+30f; + float scale_dropout = 1.0f / (1.0f - dropout_probability); + + data_ptrs.insert(std::pair(Q_ID, devPtrQ)); + data_ptrs.insert(std::pair(K_ID, devPtrK)); + data_ptrs.insert(std::pair(V_ID, devPtrV)); + data_ptrs.insert(std::pair(MASK_VAL_ID, &negInfinity)); + 
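The scale_dropout constant bound here is the usual inverted-dropout rescale; a hedged sketch of the convention the RNG-based dropout DAGs above implement (the function name and numpy RNG are illustrative only):

    import numpy as np

    def inverted_dropout_reference(x, p, rng):
        # RNG node: Bernoulli keep-mask with P(keep) = 1 - p; the same seed and
        # offset must be replayed in the backward pass so the masks agree
        keep = (rng.random(x.shape) >= p).astype(x.dtype)
        return x * keep * (1.0 / (1.0 - p))   # matches the D_CONST_ID scale

    # e.g. inverted_dropout_reference(probs, 0.1, np.random.default_rng(0))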
data_ptrs.insert(std::pair(S_CONST_ID, &scaling_factor)); + data_ptrs.insert(std::pair(O_ID, devPtrO)); + data_ptrs.insert(std::pair(D_SEED_ID, devPtrDropoutSeed)); + data_ptrs.insert( + std::pair(D_OFFSET_ID, devPtrDropoutOffset)); + data_ptrs.insert(std::pair(D_CONST_ID, &scale_dropout)); + + if (variable_sequence_length) { + data_ptrs.insert( + std::pair(Q_SEQLEN_ID, devActualSeqlenQ)); + data_ptrs.insert( + std::pair(K_SEQLEN_ID, devActualSeqlenK)); + } + + // If training mode, we write out softmax stats + if (is_training) { + data_ptrs.insert( + std::pair(S_STATS_ID, devPtrSoftmaxStats)); + } + + auto variantPack = cudnn_frontend::VariantPackBuilder() + .setWorkspacePointer(workspace) + .setDataPointers(data_ptrs) + .build(); + + CUDNN_CALL(phi::dynload::cudnnBackendExecute( + handle, plan.get_raw_desc(), variantPack.get_raw_desc())); + + if (workspace_size > 0) { + PADDLE_ENFORCE_GPU_SUCCESS(cudaFreeAsync(workspace, stream)); + } + } catch (cudnn_frontend::cudnnException &e) { + struct cudaDeviceProp prop; + PADDLE_ENFORCE_GPU_SUCCESS(cudaGetDeviceProperties(&prop, 0)); + + // cudnn flash attention is only for GA100 cards and GH100 cards + if (!((prop.major == 8 && prop.minor == 0) || + (prop.major == 9 && prop.minor == 0)) && + (e.getCudnnStatus() == CUDNN_STATUS_ARCH_MISMATCH || + e.getCudnnStatus() == CUDNN_STATUS_NOT_SUPPORTED)) { + VLOG(10) << "Only supported for GA100 (cuDNN >= 8900) and " + "GH100 (cuDNN >= 8900) GPUs"; + } else { + VLOG(10) << "[ERROR] Exception " << e.what(); + } + } +} + +void fused_attn_arbitrary_seqlen_bwd( + int64_t b, + int64_t h, + int64_t s_q, + int64_t s_kv, + int64_t d, + float scaling_factor, + float dropout_probability, + MHA_Layout layout, + MHA_Mask_Type mask_type, + void *devPtrQ, + void *devPtrKTranspose, + void *devPtrVTranspose, + void *devPtrO, + void *devPtrSoftmaxStats, + void *devPtrdQ, + void *devPtrdK, + void *devPtrdV, + void *devPtrdO, + void *devPtrMask, + // void *devPtrCuSeqlenQ, void *devPtrCuSeqlenKV, + void *devPtrDropoutSeed, + void *devPtrDropoutOffset, + cudnnDataType_t tensorType, + cudaStream_t stream, + cudnnHandle_t handle, + bool use_workspace_opt) { + try { + CUDNN_CALL(phi::dynload::cudnnSetStream(handle, stream)); + + bool variable_sequence_length = + CUDNN_VERSION >= 8906 && mask_type == MHA_Mask_Type::PADDING_MASK; + + FADescriptor descriptor{b, + h, + s_q, + s_kv, + d, + scaling_factor, + true, + dropout_probability, + layout, + MHA_Bias_Type::NO_BIAS, + mask_type, + tensorType, + use_workspace_opt, + variable_sequence_length}; + + using CacheType = std::map; + static thread_local CacheType fmha_bprop_cache; + + auto get_plan = [&](CacheType &cache, const FADescriptor &descriptor) { + auto it = cache.find(descriptor); + if (it != cache.end()) { + return it->second; + } + + std::vector all_ops; + std::vector ops; + + // Creates the necessary tensor descriptors + int64_t q_dim[4] = {b, h, s_q, d}; + int64_t q_stride[4]; + generateMatrixStrides( + b, h, s_q, s_kv, d, q_stride, layout, MHA_Matrix::Q_Matrix); + + int64_t k_transpose_dim[4] = {b, h, d, s_kv}; + int64_t k_transpose_stride[4]; + generateMatrixStrides(b, + h, + s_q, + s_kv, + d, + k_transpose_stride, + layout, + MHA_Matrix::K_Matrix_Transpose); + + int64_t v_transpose_dim[4] = {b, h, d, s_kv}; + int64_t v_transpose_stride[4]; + generateMatrixStrides(b, + h, + s_q, + s_kv, + d, + v_transpose_stride, + layout, + MHA_Matrix::V_Matrix_Transpose); + + int64_t p_dim[4] = {b, h, s_q, s_kv}; + int64_t p_stride[4]; + generateMatrixStrides( + b, h, s_q, s_kv, d, 
p_stride, layout, MHA_Matrix::S_Matrix); + + int64_t p_transpose_dim[4] = {b, h, s_kv, s_q}; + int64_t p_transpose_stride[4]; + p_transpose_stride[0] = p_stride[0]; + p_transpose_stride[1] = p_stride[1]; + p_transpose_stride[2] = p_stride[3]; + p_transpose_stride[3] = p_stride[2]; + + int64_t o_dim[4] = {b, h, s_q, d}; + int64_t o_stride[4]; + generateMatrixStrides( + b, h, s_q, s_kv, d, o_stride, layout, MHA_Matrix::O_Matrix); + + int64_t dqAccum_dim[4] = {b, h, s_q, d}; + int64_t dqAccum_stride[4]; + generateMatrixStrides( + b, h, s_q, s_kv, d, dqAccum_stride, layout, MHA_Matrix::O_Matrix); + + int64_t seqlen_dim[4] = {b, 1, 1, 1}; + int64_t seqlen_stride[4] = {1, 1, 1, 1}; + + int64_t scale_dim[4] = {1, 1, 1, 1}; + int64_t scale_stride[4] = {1, 1, 1, 1}; + + auto seqlenQTensor = tensor_create(CUDNN_DATA_INT32, + Q_SEQLEN_ID, + seqlen_dim, + seqlen_stride, + false, + false); + auto seqlenKTensor = tensor_create(CUDNN_DATA_INT32, + K_SEQLEN_ID, + seqlen_dim, + seqlen_stride, + false, + false); + + /******************************************************************************* + * Dot product dO * O */ + + // output and gradient of the output + auto oTensor = + tensor_create(tensorType, O_ID, o_dim, o_stride, false, false); + auto dOTensor = + tensor_create(tensorType, dO_ID, o_dim, o_stride, false, false); + + auto dotProductTensor = tensor_create(CUDNN_DATA_FLOAT, + VIRTUAL_ID, + o_dim, + o_stride, + true, + false); // is virtual + + // Create pointwise mul + auto multiplyDesc = pw_desc_create(CUDNN_DATA_FLOAT, CUDNN_POINTWISE_MUL); + + // do * O + auto dotProductOp = binary_pw_op_create( + dOTensor, oTensor, dotProductTensor, multiplyDesc); + ops.push_back(std::move(dotProductOp)); + + /******************************************************************************* + * Reduction(dO * O) */ + + int64_t reduction_dim[4] = {b, h, s_q, 1}; + int64_t reduction_stride[4] = {h * s_q, s_q, 1, 1}; + + // reduction(dO * O) + auto afterReductionTensor = tensor_create(CUDNN_DATA_FLOAT, + VIRTUAL_ID + 1, + reduction_dim, + reduction_stride, + true, + false); // is virtual + auto reductionAddDesc = cudnn_frontend::ReductionDescBuilder() + .setComputeType(CUDNN_DATA_FLOAT) + .setReductionOp(CUDNN_REDUCE_TENSOR_ADD) + .build(); + + // Create a reduction add node + auto reductionAdd_op = cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR) + .setxDesc(dotProductTensor) + .setyDesc(afterReductionTensor) + .setreductionDesc(reductionAddDesc) + .build(); + ops.push_back(std::move(reductionAdd_op)); + + /******************************************************************************* + * reduction(dO * O) * scale prob -> softmaxSum */ + + auto softmaxSumTensor = tensor_create(CUDNN_DATA_FLOAT, + S_SUM_ID, + reduction_dim, + reduction_stride, + false, + false); // not virtual + auto scaleProbTensor = tensor_create(CUDNN_DATA_FLOAT, + SCALE_PROB, + scale_dim, + scale_stride, + false, + true); // not virtual + auto softmaxSumOp = binary_pw_op_create(afterReductionTensor, + scaleProbTensor, + softmaxSumTensor, + multiplyDesc); + ops.push_back(std::move(softmaxSumOp)); + + /******************************************************************************* + * Q @ K.T -> P */ + + // Inputs from fprop + auto qTensor = + tensor_create(tensorType, Q_ID, q_dim, q_stride, false, false); + auto kTransposeTensor = tensor_create( + tensorType, K_ID, k_transpose_dim, k_transpose_stride, false, false); + auto pTensor = tensor_create(CUDNN_DATA_FLOAT, + VIRTUAL_ID + 2, + p_dim, + p_stride, + true, 
+ false); // is virtual + + // matmul to calculate dvTensor + auto matmul_0_Desc = cudnn_frontend::MatMulDescBuilder() + .setComputeType(CUDNN_DATA_FLOAT) + .setPaddingValue(0.0f) + .build(); + + auto &&matmul_op_builder = cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR); + + matmul_op_builder.setaMatDesc(qTensor) + .setbMatDesc(kTransposeTensor) + .setcMatDesc(pTensor) + .setmatmulDesc(matmul_0_Desc); + + if (variable_sequence_length) { + matmul_op_builder.setmOverrideDesc(seqlenQTensor) + .setnOverrideDesc(seqlenKTensor); + } + + auto matmul_op0 = matmul_op_builder.build(); + + ops.push_back(std::move(matmul_op0)); + + /******************************************************************************* + * P * bmmScale -> pAfterScale */ + + auto bmmScaleTensor = tensor_create(CUDNN_DATA_FLOAT, + S_CONST_ID, + scale_dim, + scale_stride, + false, + true); // not virtual and by value + auto pAfterScaleTensor = tensor_create(CUDNN_DATA_FLOAT, + VIRTUAL_ID + 2000, + p_dim, + p_stride, + true, + false); // virtual + auto scaleOp = binary_pw_op_create( + pTensor, bmmScaleTensor, pAfterScaleTensor, multiplyDesc); + ops.push_back(std::move(scaleOp)); + + /******************************************************************************* + * Causal masking -> pAfterMaskTensor */ + + auto &pAfterMaskTensor = pAfterScaleTensor; + if (mask_type == MHA_Mask_Type::CAUSAL_MASK) { // causal mask + pAfterMaskTensor = createCausalMask( + b, h, s_q, s_kv, d, layout, tensorType, &ops, pAfterScaleTensor); + } else if (variable_sequence_length) { // padding mask + pAfterMaskTensor = createPaddingMask( + b, h, s_q, s_kv, d, layout, tensorType, &ops, pAfterScaleTensor); + } + + /******************************************************************************* + * pAfterMaskTensor - softmaxStats -> + * pAfterSubtract */ + + auto pAfterSubtractTensor = tensor_create(CUDNN_DATA_FLOAT, + VIRTUAL_ID + 3, + p_dim, + p_stride, + true, + false); // is virtual + auto softmaxStatsTensor = tensor_create(CUDNN_DATA_FLOAT, + S_STATS_ID, + reduction_dim, + reduction_stride, + false, + false); // not virtual + auto subtractDesc = pw_desc_create(CUDNN_DATA_FLOAT, CUDNN_POINTWISE_SUB); + auto subtract_op = binary_pw_op_create(pAfterMaskTensor, + softmaxStatsTensor, + pAfterSubtractTensor, + subtractDesc); + ops.push_back(std::move(subtract_op)); + + /******************************************************************************* + * e^(pAfterSubtract) -> pAfterSoftmax */ + + auto pAfterSoftmaxTensor = tensor_create(CUDNN_DATA_FLOAT, + VIRTUAL_ID + 4, + p_dim, + p_stride, + true, + false); // is virtual + auto expDesc = pw_desc_create(CUDNN_DATA_FLOAT, CUDNN_POINTWISE_EXP); + auto exp_op = unary_pw_op_create( + pAfterSubtractTensor, pAfterSoftmaxTensor, expDesc); + ops.push_back(std::move(exp_op)); + + /******************************************************************************* + * Dropout -> afterScaleDropout */ + + auto dropoutMaskTensor = tensor_create(CUDNN_DATA_FLOAT, + VIRTUAL_ID + 5, + p_dim, + p_stride, + true, + false); // is virtual + auto afterScaleDropoutTensor = createDropoutBackward(b, + h, + s_q, + s_kv, + d, + dropout_probability, + tensorType, + &ops, + pAfterSoftmaxTensor, + dropoutMaskTensor); + + /******************************************************************************* + * afterScaleDropout -> sTransposeTensor */ + + auto sTransposeTensor = tensor_create(tensorType, + VIRTUAL_ID + 6, + p_transpose_dim, + p_transpose_stride, + true, + false); // is virtual + auto reshape_op = 
cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_RESHAPE_DESCRIPTOR) + .setxDesc(afterScaleDropoutTensor) + .setyDesc(sTransposeTensor) + .build(); + ops.push_back(std::move(reshape_op)); + + // Outputs of bprop + int64_t dq_dim[4] = {b, h, s_q, d}; + int64_t dq_stride[4]; + generateMatrixStrides( + b, h, s_q, s_kv, d, dq_stride, layout, MHA_Matrix::Q_Matrix); + + int64_t dk_dim[4] = {b, h, s_kv, d}; + int64_t dk_stride[4]; + generateMatrixStrides( + b, h, s_q, s_kv, d, dk_stride, layout, MHA_Matrix::K_Matrix); + + int64_t dv_dim[4] = {b, h, s_kv, d}; + int64_t dv_stride[4]; + generateMatrixStrides( + b, h, s_q, s_kv, d, dv_stride, layout, MHA_Matrix::V_Matrix); + + // Outputs of backprop + auto dQTensor = + tensor_create(tensorType, dQ_ID, dq_dim, dq_stride, false, false); + auto dKTensor = + tensor_create(tensorType, dK_ID, dk_dim, dk_stride, false, false); + auto dVTensor = + tensor_create(tensorType, dV_ID, dv_dim, dv_stride, false, false); + // not virtual + + /******************************************************************************* + * sTransposeTensor @ dO -> dV */ + + auto matmul_1_Desc = cudnn_frontend::MatMulDescBuilder() + .setComputeType(CUDNN_DATA_FLOAT) + .setPaddingValue(0.0f) + .build(); + + auto &&matmul_op1_builder = cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR); + + matmul_op1_builder.setaMatDesc(sTransposeTensor) + .setbMatDesc(dOTensor) + .setcMatDesc(dVTensor) + .setmatmulDesc(matmul_1_Desc); + + if (variable_sequence_length) { + matmul_op1_builder.setmOverrideDesc(seqlenKTensor) + .setkOverrideDesc(seqlenQTensor); + } + + auto matmul_op1 = matmul_op1_builder.build(); + + ops.push_back(std::move(matmul_op1)); + + /******************************************************************************* + * dO @ V.T -> dS */ + + auto vTransposeTensor = tensor_create( + tensorType, V_ID, v_transpose_dim, v_transpose_stride, false, false); + auto dSTensor = tensor_create(CUDNN_DATA_FLOAT, + VIRTUAL_ID + 7, + p_dim, + p_stride, + true, + false); // is virtual + + auto matmul_2_Desc = cudnn_frontend::MatMulDescBuilder() + .setComputeType(CUDNN_DATA_FLOAT) + .setPaddingValue(0.0f) + .build(); + + auto &&matmul_op2_builder = cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR); + + matmul_op2_builder.setaMatDesc(dOTensor) + .setbMatDesc(vTransposeTensor) + .setcMatDesc(dSTensor) + .setmatmulDesc(matmul_2_Desc); + + if (variable_sequence_length) { + matmul_op2_builder.setmOverrideDesc(seqlenQTensor) + .setnOverrideDesc(seqlenKTensor); + } + + auto matmul_op2 = matmul_op2_builder.build(); + + ops.push_back(std::move(matmul_op2)); + + /******************************************************************************* + * dS * dropoutMask -> dSAfterDropout */ + + auto dSAfterDropoutTensor = tensor_create(CUDNN_DATA_FLOAT, + VIRTUAL_ID + 8, + p_dim, + p_stride, + true, + false); // is virtual + auto multiply_op = binary_pw_op_create( + dSTensor, dropoutMaskTensor, dSAfterDropoutTensor, multiplyDesc); + ops.push_back(std::move(multiply_op)); + + /******************************************************************************* + * dSAfterDropout - softmaxSum -> dsAfterSubtract + */ + + auto dsAfterSubtractTensor = tensor_create(CUDNN_DATA_FLOAT, + VIRTUAL_ID + 9, + p_dim, + p_stride, + true, + false); // is virtual + auto subtract_op2 = binary_pw_op_create(dSAfterDropoutTensor, + softmaxSumTensor, + dsAfterSubtractTensor, + subtractDesc); + ops.push_back(std::move(subtract_op2)); + + 
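The subtract and multiply on either side of this point implement the standard softmax backward identity, modulo the dropout bookkeeping: with y = softmax(x) and upstream gradient dy, dx = (dy - sum(dy * y)) * y, where the row sum is the softmaxSum cached earlier from rowsum(dO * O) in the flash-attention style. A hedged numpy sketch:

    import numpy as np

    def softmax_backward_reference(y, dy):
        # y = softmax(x); dy = upstream gradient (already dropout-masked here)
        row_sum = (dy * y).sum(axis=-1, keepdims=True)   # cached as S_SUM_ID
        return (dy - row_sum) * y                        # dP, before the const scales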
/******************************************************************************* + * dsAfterSubtract * afterSoftmax -> dP */ + + auto dPTensor = tensor_create(CUDNN_DATA_FLOAT, + VIRTUAL_ID + 10, + p_dim, + p_stride, + true, + false); // is virtual + auto multiply_op2 = binary_pw_op_create( + dsAfterSubtractTensor, pAfterSoftmaxTensor, dPTensor, multiplyDesc); + ops.push_back(std::move(multiply_op2)); + + /******************************************************************************* + * dP * scaleDropout -> dPAfterDropoutScale */ + auto dPAfterDropoutScaleTensor = tensor_create(CUDNN_DATA_FLOAT, + VIRTUAL_ID + 11, + p_dim, + p_stride, + true, + false); // is virtual + auto scaleDropoutTensor = tensor_create(CUDNN_DATA_FLOAT, + D_CONST_ID, + scale_dim, + scale_stride, + false, + true); // is by value + auto multiply_op3 = binary_pw_op_create(dPTensor, + scaleDropoutTensor, + dPAfterDropoutScaleTensor, + multiplyDesc); + ops.push_back(std::move(multiply_op3)); + + /******************************************************************************* + * dPAfterDropoutScale * bmmScale -> + * dPScaledTensor */ + + auto dPScaledTensor = tensor_create(CUDNN_DATA_FLOAT, + VIRTUAL_ID + 12, + p_dim, + p_stride, + true, + false); // is virtual + auto multiply_op4 = binary_pw_op_create(dPAfterDropoutScaleTensor, + bmmScaleTensor, + dPScaledTensor, + multiplyDesc); + ops.push_back(std::move(multiply_op4)); + + /******************************************************************************* + * K.T -> K */ + int64_t kDim[4] = {b, h, s_kv, d}; + int64_t kStride[4]; + generateMatrixStrides( + b, h, s_q, s_kv, d, kStride, layout, MHA_Matrix::K_Matrix); + auto kTensor = tensor_create(tensorType, + VIRTUAL_ID + 13, + kDim, + kStride, + true, + false); // is virtual + auto reshape_op2 = cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_RESHAPE_DESCRIPTOR) + .setxDesc(kTransposeTensor) + .setyDesc(kTensor) + .build(); + ops.push_back(std::move(reshape_op2)); + + /******************************************************************************* + * dP @ K -> dqAccumTensor / dqTensor */ + auto dqAccumTensor = + cudnn_frontend::TensorBuilder() + .setDim(4, dqAccum_dim) + .setStride(4, dqAccum_stride) + .setId(dQ_ACCUM_ID) + .setAlignment( + 16) // 16B alignment is needed to run a tensor core engine + .setDataType(CUDNN_DATA_FLOAT) + .setVirtual(false) + .setByValue(false) + .setReorderType(cudnn_frontend::cudnnBackendTensorReordering_t:: + CUDNN_TENSOR_REORDERING_F16x16) + .build(); + + auto matmul_3_Desc = cudnn_frontend::MatMulDescBuilder() + .setComputeType(CUDNN_DATA_FLOAT) + .setPaddingValue(0.0f) + .build(); + + auto &&matmul_op3_builder = cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR); + + matmul_op3_builder.setaMatDesc(dPScaledTensor) + .setbMatDesc(kTensor) + .setmatmulDesc(matmul_3_Desc); + + if (use_workspace_opt) { + matmul_op3_builder.setcMatDesc(dQTensor); + } else { + matmul_op3_builder.setcMatDesc(dqAccumTensor); + } + + if (variable_sequence_length) { + matmul_op3_builder.setmOverrideDesc(seqlenQTensor) + .setkOverrideDesc(seqlenKTensor); + } + + auto matmul_op3 = matmul_op3_builder.build(); + + ops.push_back(std::move(matmul_op3)); + + /******************************************************************************* + * dP.T @ Q -> dK */ + auto dPTransposeTensor = tensor_create(CUDNN_DATA_FLOAT, + VIRTUAL_ID + 14, + p_transpose_dim, + p_transpose_stride, + true, + false); // is virtual + auto reshape_op3 = cudnn_frontend::OperationBuilder( + 
CUDNN_BACKEND_OPERATION_RESHAPE_DESCRIPTOR) + .setxDesc(dPScaledTensor) + .setyDesc(dPTransposeTensor) + .build(); + ops.push_back(std::move(reshape_op3)); + + auto matmul_4_Desc = cudnn_frontend::MatMulDescBuilder() + .setComputeType(CUDNN_DATA_FLOAT) + .setPaddingValue(0.0f) + .build(); + + auto &&matmul_op4_builder = cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR); + + matmul_op4_builder.setaMatDesc(dPTransposeTensor) + .setbMatDesc(qTensor) + .setcMatDesc(dKTensor) + .setmatmulDesc(matmul_4_Desc); + + if (variable_sequence_length) { + matmul_op4_builder.setmOverrideDesc(seqlenKTensor) + .setkOverrideDesc(seqlenQTensor); + } + + auto matmul_op4 = matmul_op4_builder.build(); + + ops.push_back(std::move(matmul_op4)); + + /******************************************************************************* + * dqAccumTensor @ identity -> dqTensor */ + if (!use_workspace_opt) { + auto identityDesc = + pw_desc_create(CUDNN_DATA_FLOAT, CUDNN_POINTWISE_IDENTITY); + auto identity_op = + unary_pw_op_create(dqAccumTensor, dQTensor, identityDesc); + ops.push_back(std::move(identity_op)); + } + + for (unsigned int i = 0; i < ops.size(); i++) { + all_ops.push_back(&ops[i]); + } + + // Create an Operation Graph + auto opGraph = cudnn_frontend::OperationGraphBuilder() + .setHandle(handle) + .setOperationGraph(all_ops.size(), all_ops.data()) + .build(); + + cudnn_frontend::EngineConfigList filtered_configs; + auto statuses = + cudnn_frontend::get_heuristics_list<1>({"heuristics_instant"}, + opGraph, + allowAllConfig, + filtered_configs, + true); + + if (filtered_configs.size() == 0) { + cudnn_frontend::set_error_and_throw_exception( + nullptr, + CUDNN_STATUS_NOT_SUPPORTED, + "run_mha_bprop: No config returned by the heuristics"); + } + + auto plan = cudnn_frontend::ExecutionPlanBuilder() + .setHandle(handle) + .setEngineConfig(filtered_configs[0], opGraph.getTag()) + .build(); + + cache.insert({descriptor, plan}); + return plan; + }; + + auto plan = get_plan(fmha_bprop_cache, descriptor); + VLOG(10) << "Plan tag: " << plan.getTag(); + + auto plan_workspace_size = plan.getWorkspaceSize(); + size_t softmaxSum_workspace_size = b * h * s_q * sizeof(float); + size_t dqAccum_workspace_size = + use_workspace_opt ? 
0 : b * s_q * h * d * sizeof(float); + size_t actual_seqlen_workspace_size = 2 * b * sizeof(int32_t); + size_t workspace_size = plan_workspace_size + softmaxSum_workspace_size + + dqAccum_workspace_size + + actual_seqlen_workspace_size; + void *workspace = nullptr; + VLOG(10) << "Malloc workspace size: " << workspace_size << " bytes"; + if (workspace_size > 0) { + PADDLE_ENFORCE_GPU_SUCCESS( + cudaMallocAsync(&workspace, workspace_size, stream)); + } + + void *devPtrSoftmaxSum = + static_cast(workspace) + plan_workspace_size; + void *devPtrdQAccumulator = + static_cast(devPtrSoftmaxSum) + softmaxSum_workspace_size; + if (!use_workspace_opt) { + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync( + devPtrdQAccumulator, 0, dqAccum_workspace_size, stream)); + } + + constexpr size_t nthreads_per_block = 512; + const size_t grid = b; + void *devActualSeqlenQ = + static_cast(devPtrdQAccumulator) + dqAccum_workspace_size; + void *devActualSeqlenK = + static_cast(devActualSeqlenQ) + b * sizeof(int32_t); + mask_to_actual_seqlens_kernel<<>>( + static_cast(devPtrMask), + static_cast(devActualSeqlenQ), + static_cast(devActualSeqlenK), + s_q, + s_kv, + true); + PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError()); + + std::set> data_ptrs; + // add all the data pointers to be used in the variant pack + float negInfinity = -1.0E+31f; + float scale_dropout = 1.0f / (1.0f - dropout_probability); + data_ptrs.insert(std::pair(dQ_ID, devPtrdQ)); + if (!use_workspace_opt) { + data_ptrs.insert( + std::pair(dQ_ACCUM_ID, devPtrdQAccumulator)); + } + data_ptrs.insert(std::pair(dK_ID, devPtrdK)); + data_ptrs.insert(std::pair(dV_ID, devPtrdV)); + + data_ptrs.insert(std::pair(Q_ID, devPtrQ)); + data_ptrs.insert(std::pair(K_ID, devPtrKTranspose)); + data_ptrs.insert(std::pair(V_ID, devPtrVTranspose)); + data_ptrs.insert(std::pair(O_ID, devPtrO)); + data_ptrs.insert(std::pair(dO_ID, devPtrdO)); + data_ptrs.insert( + std::pair(S_STATS_ID, devPtrSoftmaxStats)); + data_ptrs.insert(std::pair(S_SUM_ID, devPtrSoftmaxSum)); + data_ptrs.insert(std::pair(D_SEED_ID, devPtrDropoutSeed)); + data_ptrs.insert( + std::pair(D_OFFSET_ID, devPtrDropoutOffset)); + data_ptrs.insert(std::pair(MASK_VAL_ID, &negInfinity)); + if (variable_sequence_length) { + data_ptrs.insert( + std::pair(Q_SEQLEN_ID, devActualSeqlenQ)); + data_ptrs.insert( + std::pair(K_SEQLEN_ID, devActualSeqlenK)); + } + + float scaleProb = 1.0f - dropout_probability; + data_ptrs.insert(std::pair(D_CONST_ID, &scale_dropout)); + data_ptrs.insert(std::pair(S_CONST_ID, &scaling_factor)); + data_ptrs.insert(std::pair(SCALE_PROB, &scaleProb)); + + auto variantPack = cudnn_frontend::VariantPackBuilder() + .setWorkspacePointer(workspace) + .setDataPointers(data_ptrs) + .build(); + + CUDNN_CALL(phi::dynload::cudnnBackendExecute( + handle, plan.get_raw_desc(), variantPack.get_raw_desc())); + if (workspace_size > 0) { + PADDLE_ENFORCE_GPU_SUCCESS(cudaFreeAsync(workspace, stream)); + } + } catch (cudnn_frontend::cudnnException &e) { + struct cudaDeviceProp prop; + PADDLE_ENFORCE_GPU_SUCCESS(cudaGetDeviceProperties(&prop, 0)); + + // cudnn flash attention is only for GA100 cards and GH100 cards + if (!((prop.major == 8 && prop.minor == 0) || + (prop.major == 9 && prop.minor == 0)) && + (e.getCudnnStatus() == CUDNN_STATUS_ARCH_MISMATCH || + e.getCudnnStatus() == CUDNN_STATUS_NOT_SUPPORTED)) { + VLOG(10) << "Only supported for GA100 (cuDNN >= 8900) and " + "GH100 (cuDNN >= 8900) GPUs"; + } else { + VLOG(10) << "[ERROR] Exception " << e.what(); + } + } +} + +#endif diff --git 
a/paddle/phi/kernels/gpudnn/mha_cudnn_frontend.h b/paddle/phi/kernels/gpudnn/mha_cudnn_frontend.h new file mode 100644 index 00000000000000..264491214d2c7f --- /dev/null +++ b/paddle/phi/kernels/gpudnn/mha_cudnn_frontend.h @@ -0,0 +1,105 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#ifdef PADDLE_WITH_CUDNN_FRONTEND +#include "paddle/phi/backends/dynload/cudnn_frontend.h" + +#define CUDNN_CALL(func) \ + { \ + auto status = func; \ + if (status != CUDNN_STATUS_SUCCESS) { \ + LOG(FATAL) << "CUDNN Error : " \ + << phi::dynload::cudnnGetErrorString(status); \ + } \ + } + +enum class MHA_Layout { + NOT_INTERLEAVED = 0, + QKV_INTERLEAVED = 1, + KV_INTERLEAVED = 2 +}; + +enum class MHA_Matrix { + Q_Matrix = 0, // queries + K_Matrix = 1, // keys + K_Matrix_Transpose = 2, // keys transposed + V_Matrix = 3, // values + V_Matrix_Transpose = 4, // value matrix transposed + S_Matrix = 5, // output of GEMM1 + O_Matrix = 6, // final output +}; + +enum class MHA_Mask_Type { NO_MASK = 0, CAUSAL_MASK = 1, PADDING_MASK = 2 }; + +enum class MHA_Bias_Type { + NO_BIAS = 0, + PRE_SCALE_BIAS = 1, + POST_SCALE_BIAS = 2 +}; + +void fused_attn_arbitrary_seqlen_fwd( + int64_t b, + int64_t h, + int64_t s_q, + int64_t s_kv, + int64_t d, + bool is_training, + float scaling_factor, + float dropout_probability, + MHA_Layout layout, + MHA_Mask_Type mask_type, + void* devPtrQ, + void* devPtrK, + void* devPtrV, + void* devPtrSoftmaxStats, + void* devPtrO, + void* devPtrMask, + // void *devPtrCuSeqlenQ, void *devPtrCuSeqlenKV, + void* devPtrDropoutSeed, + void* devPtrDropoutOffset, + cudnnDataType_t tensorType, + cudaStream_t stream, + cudnnHandle_t handle); + +void fused_attn_arbitrary_seqlen_bwd( + int64_t b, + int64_t h, + int64_t s_q, + int64_t s_kv, + int64_t d, + float scaling_factor, + float dropout_probability, + MHA_Layout layout, + MHA_Mask_Type mask_type, + void* devPtrQ, + void* devPtrK, + void* devPtrV, + void* devPtrO, + void* devPtrSoftmaxStats, + void* devPtrdQ, + void* devPtrdK, + void* devPtrdV, + void* devPtrdO, + void* devPtrMask, + // void *devPtrCuSeqlenQ, void *devPtrCuSeqlenKV, + void* devPtrDropoutSeed, + void* devPtrDropoutOffset, + cudnnDataType_t tensorType, + cudaStream_t stream, + cudnnHandle_t handle, + bool use_workspace_opt); + +#endif // PADDLE_WITH_CUDNN_FRONTEND diff --git a/python/paddle/distributed/passes/cpp_pass.py b/python/paddle/distributed/passes/cpp_pass.py index fbb9159d981863..89475810c9e41b 100755 --- a/python/paddle/distributed/passes/cpp_pass.py +++ b/python/paddle/distributed/passes/cpp_pass.py @@ -125,6 +125,19 @@ def _type(self): return PassType.FUSION_OPT +@register_pass("fuse_dot_product_attention") +class FuseDotProductAttentionPass(CPPPassWrapper): + def __init__(self): + super().__init__() + + @property + def cpp_name(self): + return "fuse_dot_product_attention_pass" + + def _type(self): + return PassType.FUSION_OPT + + @register_pass("fuse_optimizer") class 
FuseOptimizerPass(CPPPassWrapper): def __init__(self): diff --git a/python/paddle/framework/ir.py b/python/paddle/framework/ir.py index dff055a1575b14..e19e4e245a5c8f 100644 --- a/python/paddle/framework/ir.py +++ b/python/paddle/framework/ir.py @@ -98,6 +98,9 @@ def apply_pass(name): if build_strategy.fuse_gemm_epilogue: apply_pass("fuse_gemm_epilogue_pass") build_strategy.fuse_gemm_epilogue = False + if build_strategy.fuse_dot_product_attention: + apply_pass("fuse_dot_product_attention_pass") + build_strategy.fuse_dot_product_attention = False if build_strategy.fuse_elewise_add_act_ops: apply_pass("fuse_elewise_add_act_pass") build_strategy.fuse_elewise_add_act_ops = False diff --git a/python/paddle/incubate/nn/functional/__init__.py b/python/paddle/incubate/nn/functional/__init__.py index 9096b3dff5bbbe..3b334b98de56d5 100644 --- a/python/paddle/incubate/nn/functional/__init__.py +++ b/python/paddle/incubate/nn/functional/__init__.py @@ -32,6 +32,7 @@ from .fused_layer_norm import fused_layer_norm from .masked_multihead_attention import masked_multihead_attention from .block_multihead_attention import block_multihead_attention +from .fused_dot_product_attention import fused_dot_product_attention __all__ = [ 'fused_multi_head_attention', diff --git a/python/paddle/incubate/nn/functional/fused_dot_product_attention.py b/python/paddle/incubate/nn/functional/fused_dot_product_attention.py new file mode 100644 index 00000000000000..ff56baac3a5afe --- /dev/null +++ b/python/paddle/incubate/nn/functional/fused_dot_product_attention.py @@ -0,0 +1,110 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle import _C_ops +from paddle.framework import LayerHelper, in_dynamic_mode + + +def fused_dot_product_attention( + q, + k, + v, + mask, + scaling_factor, + dropout_prob, + is_training, + is_causal_masking, + return_softmax=False, +): + r""" + Fused Dot Product Attention. This is a fusion operator to compute scaled dot product attention in the transformer + model architecture. This operator only supports running on Ampere and Hopper GPUs and requires cuDNN version >= 8906. + + Args: + q (Tensor): The query tensor. The data type is bfloat16 or float16. + k (Tensor): The key tensor. The data type is bfloat16 or float16. + v (Tensor): The value tensor. The data type is bfloat16 or float16. + mask (Tensor, optional): The mask tensor. The data type is int32 or bool. + scaling_factor (float): The scaling factor for the attention scores. + dropout_prob (float): The dropout probability. + is_training (bool): A flag indicating whether it is in the training phase or not. + is_causal_masking (bool): A flag indicating whether causal masking is applied or not. If True, the `mask` input will be ignored. + return_softmax (bool, optional): A flag indicating whether to return softmax_output or not. Default: False. + + + Returns: + A Tensor representing the fused dot product attention output, which has the same shape and data type as `q`.
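A hedged usage sketch for this API; the [batch_size, seqlen, num_heads, head_size] input layout is inferred from the unit tests below rather than stated in the docstring:

    import paddle
    from paddle.incubate.nn.functional import fused_dot_product_attention

    q = paddle.randn([2, 128, 8, 64], dtype='float16')
    k = paddle.randn([2, 128, 8, 64], dtype='float16')
    v = paddle.randn([2, 128, 8, 64], dtype='float16')

    out = fused_dot_product_attention(
        q, k, v, mask=None,
        scaling_factor=64 ** -0.5,   # 1 / sqrt(head_size)
        dropout_prob=0.0,
        is_training=True,
        is_causal_masking=True,      # mask is ignored in the causal case
    )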
+ + Warning: + This API needs to be integrated into `paddle.nn.functional.scaled_dot_product_attention` in the future. + + """ + + batch_size = q.shape[0] + q_seqlen = q.shape[1] + k_seqlen = k.shape[1] + mask_shape = [batch_size, 1, q_seqlen, k_seqlen] + + if mask is None or is_causal_masking is True: + mask = paddle.ones(mask_shape, dtype='int32') + else: # mask is not None and is_causal_masking == False + assert mask.dtype in [ + paddle.int32, + paddle.bool, + ], "mask dtype must be int32 or bool" + assert ( + mask.shape == mask_shape + ), "mask shape must be [batch_size, 1, q_seqlen, k_seqlen]" + mask = mask.astype('int32') + + if in_dynamic_mode(): + out, softmax, _ = _C_ops.fused_dot_product_attention( + q, + k, + v, + mask, + scaling_factor, + dropout_prob, + is_training, + is_causal_masking, + ) + return out if return_softmax is False else (out, softmax) + else: + helper = LayerHelper('fused_dot_product_attention', **locals()) + out = helper.create_variable_for_type_inference(dtype=q.dtype) + softmax_out = helper.create_variable_for_type_inference( + dtype=q.dtype, stop_gradient=True + ) + rng_state = helper.create_variable_for_type_inference( + dtype='int64', stop_gradient=True + ) + + attrs = { + "scaling_factor": scaling_factor, + "dropout_probability": dropout_prob, + "is_training": is_training, + "is_causal_masking": is_causal_masking, + } + helper.append_op( + type='fused_dot_product_attention', + inputs={'q': q, 'k': k, 'v': v, 'mask': mask}, + outputs={ + 'out': [out], + 'softmax_out': [softmax_out], + 'rng_state': [rng_state], + }, + attrs=attrs, + ) + return out if return_softmax is False else (out, softmax_out) diff --git a/test/ir/pir/fused_pass/test_fused_dot_product_attention_pass.py b/test/ir/pir/fused_pass/test_fused_dot_product_attention_pass.py new file mode 100644 index 00000000000000..da82fb74b7d181 --- /dev/null +++ b/test/ir/pir/fused_pass/test_fused_dot_product_attention_pass.py @@ -0,0 +1,142 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
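The PIR-pass test that follows builds, op by op, the unfused attention subgraph the pass is expected to rewrite; conceptually the matched pattern is the following (a sketch mirroring the test body, not the pass's actual matcher):

    import paddle
    from paddle.nn.layer.transformer import _convert_attention_mask

    def unfused_attention(q, k, v, mask, head_size):
        qt, kt, vt = (paddle.transpose(x, [0, 2, 1, 3]) for x in (q, k, v))
        scores = paddle.matmul(qt * head_size**-0.5, kt, transpose_y=True)
        scores = scores + _convert_attention_mask(mask, scores.dtype)
        weights = paddle.nn.functional.softmax(scores)
        out = paddle.matmul(weights, vt)
        return paddle.transpose(out, [0, 2, 1, 3])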
+ +import unittest + +import numpy as np + +import paddle +from paddle.base import core +from paddle.nn.layer.transformer import _convert_attention_mask + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or paddle.device.cuda.get_device_capability()[0] < 8 + or paddle.get_cudnn_version() < 8906, + "cudnn flash attn is only supported after Ampere and need version >= 8906", +) +class TestFusedDotProductAttention(unittest.TestCase): + def setUp(self): + self.batch_size = 32 + self.seq_len = 1024 + self.num_heads = 12 + self.head_size = 64 + self.default_dtype = "float16" + + def test_fused_dot_product_attention(self): + paddle.set_default_dtype("float16") + self.qkv_shape = ( + self.batch_size, + self.seq_len, + self.num_heads, + self.head_size, + ) + self.mask_shape = (self.batch_size, 1, self.seq_len, self.seq_len) + q_np = np.random.normal(loc=0, scale=0.02, size=self.qkv_shape).astype( + "float16" + ) + k_np = np.random.normal(loc=0, scale=0.02, size=self.qkv_shape).astype( + "float16" + ) + v_np = np.random.normal(loc=0, scale=0.02, size=self.qkv_shape).astype( + "float16" + ) + mask_np = np.ones(self.mask_shape).astype("int32") + + with paddle.pir_utils.IrGuard(): + main_program = paddle.static.Program() + with paddle.static.program_guard(main_program): + q_ = paddle.static.data( + name="q", shape=self.qkv_shape, dtype="float16" + ) + k_ = paddle.static.data( + name="k", shape=self.qkv_shape, dtype="float16" + ) + v_ = paddle.static.data( + name="v", shape=self.qkv_shape, dtype="float16" + ) + mask = paddle.static.data( + name="mask", shape=self.mask_shape, dtype="int32" + ) + + q_.stop_gradient = False + k_.stop_gradient = False + v_.stop_gradient = False + mask.stop_gradient = True + + qt = paddle.transpose(q_, [0, 2, 1, 3]) + kt = paddle.transpose(k_, [0, 2, 1, 3]) + vt = paddle.transpose(v_, [0, 2, 1, 3]) + + product = paddle.matmul( + x=qt * (self.head_size**-0.5), y=kt, transpose_y=True + ) + attn_mask = _convert_attention_mask(mask, product.dtype) + product = product + attn_mask + weights = paddle.nn.functional.softmax(product) + out = paddle.matmul(weights, vt) + out = paddle.transpose(out, [0, 2, 1, 3]) + res1 = paddle.reshape( + out, + [ + self.batch_size, + self.seq_len, + self.num_heads * self.head_size, + ], + ) + + res2 = paddle.assign(res1) + + res3, res4, res5 = paddle.autograd.ir_backward.grad( + res2, [q_, k_, v_] + ) + res3_ = paddle.assign(res3) + res4_ = paddle.assign(res4) + res5_ = paddle.assign(res5) + + op_names = [op.name() for op in main_program.global_block().ops] + + with paddle.static.scope_guard(paddle.static.Scope()): + exe = paddle.base.Executor(paddle.base.CUDAPlace(0)) + fetches0 = exe.run( + main_program, + feed={"q": q_np, "k": k_np, "v": v_np, "mask": mask_np}, + fetch_list=[res2, res3_, res4_, res5_], + ) + pm = paddle.pir.PassManager() + pm.add_pass('fused_dot_product_attention_pass') + pm.run(main_program) + op_names = [op.name() for op in main_program.global_block().ops] + + self.assertTrue('pd_op.fused_dot_product_attention' in op_names) + self.assertTrue( + 'pd_op.fused_dot_product_attention_grad' in op_names + ) + + with paddle.static.scope_guard(paddle.static.Scope()): + exe = paddle.base.Executor(paddle.base.CUDAPlace(0)) + fetches1 = exe.run( + main_program, + feed={"q": q_np, "k": k_np, "v": v_np, "mask": mask_np}, + fetch_list=[res2, res3_, res4_, res5_], + ) + for i in range(len(fetches0)): + np.testing.assert_allclose( + fetches0[i], fetches1[i], rtol=1e-3, atol=1e-3 + ) + + +if __name__ == "__main__": + unittest.main() diff 
--git a/test/legacy_test/CMakeLists.txt b/test/legacy_test/CMakeLists.txt index e642358fd71cb9..6fe6a90a5dd3da 100644 --- a/test/legacy_test/CMakeLists.txt +++ b/test/legacy_test/CMakeLists.txt @@ -91,6 +91,8 @@ list(REMOVE_ITEM TEST_OPS test_fused_ec_moe_op) list(REMOVE_ITEM TEST_OPS test_fused_gemm_epilogue_op) list(REMOVE_ITEM TEST_OPS test_fused_gemm_epilogue_grad_op) list(REMOVE_ITEM TEST_OPS test_fuse_gemm_epilogue_pass) +list(REMOVE_ITEM TEST_OPS test_fused_dot_product_attention_op) +list(REMOVE_ITEM TEST_OPS test_fuse_dot_product_attention_pass) if(((NOT WITH_ROCM) AND (NOT WITH_GPU)) OR WIN32) list(REMOVE_ITEM TEST_OPS test_c_comm_init_all_op) @@ -619,6 +621,13 @@ if((WITH_GPU) AND (CUDA_VERSION GREATER_EQUAL 11.6)) test_fuse_gemm_epilogue_pass) endif() +if((WITH_GPU) AND (WITH_CUDNN_FRONTEND)) + py_test_modules(test_fuse_dot_product_attention_pass MODULES + test_fuse_dot_product_attention_pass) + py_test_modules(test_fused_dot_product_attention_op MODULES + test_fused_dot_product_attention_op) +endif() + set_tests_properties(test_conv2d_op PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") set_tests_properties(test_conv2d_op_depthwise_conv PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") diff --git a/test/legacy_test/test_fuse_dot_product_attention_pass.py b/test/legacy_test/test_fuse_dot_product_attention_pass.py new file mode 100644 index 00000000000000..939a063aaf8336 --- /dev/null +++ b/test/legacy_test/test_fuse_dot_product_attention_pass.py @@ -0,0 +1,300 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
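Unlike the PIR test above, the legacy-IR test below drives the fusion through BuildStrategy; a minimal sketch of that switch, using the flag this patch wires up in cpp_pass.py and ir.py:

    import paddle

    paddle.enable_static()
    build_strategy = paddle.static.BuildStrategy()
    build_strategy.fuse_dot_product_attention = True   # rewrites eligible MHA blocks
    compiled = paddle.static.CompiledProgram(
        paddle.static.default_main_program(), build_strategy=build_strategy
    )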
+ +import unittest + +import numpy as np + +import paddle + +np.random.seed(0) +paddle.seed(0) + + +def skip_unit_test(): + return ( + not paddle.is_compiled_with_cuda() + or paddle.device.cuda.get_device_capability() != (8, 0) + or paddle.get_cudnn_version() < 8906 + ) + + +skip_msg = ( + "only support with cuda and CUDNN 8.9.6 or later," + " and only Ampere devices are supported" +) + + +def verify_node_count(graph, node_name, target_count): + count = 0 + for node in graph.nodes(): + if node.name() == node_name: + count += 1 + return count == target_count + + +class mha(paddle.nn.Layer): + def __init__( + self, + hidden, + num_heads, + dropout=0.0, + num_layers=1, + ): + super().__init__() + self.mha_layer = paddle.nn.MultiHeadAttention( + hidden, + num_heads, + dropout=dropout, + ) + self.num_layers = num_layers + + def forward(self, q, k, v, mask): + out = q + for _ in range(self.num_layers): + out = self.mha_layer(out, k, v, attn_mask=mask) + loss = paddle.mean(out) + return loss + + +@unittest.skipIf(skip_unit_test(), skip_msg) +class TestFuseDotProductAttention(unittest.TestCase): + def setUp(self): + self.run_steps = 10 + self.num_layers = 3 + self._pre_test_hook() + self.hidden_dim = self.num_heads * self.head_size + paddle.enable_static() + self.place = paddle.CUDAPlace(0) + self._create_input() + self.init_weight = np.random.normal( + loc=0.0, scale=0.01, size=(self.hidden_dim, self.hidden_dim) + ).astype("float32") + self.check_fused_fwd_op_name = "fused_dot_product_attention" + self.check_fused_bwd_op_name = "fused_dot_product_attention_grad" + + def _set_shape(self): + self.batch_size = 8 + self.num_heads = 12 + self.head_size = 64 + self.q_seqlen = 128 + self.kv_seqlen = 128 + + def _pre_test_hook(self): + self._set_shape() + self.dropout = 0.0 + self.atol = 1e-4 + self.rtol = 1e-4 + + def _create_input(self): + q_input = np.random.normal( + loc=0.0, + scale=1, + size=(self.batch_size, self.q_seqlen, self.hidden_dim), + ).astype(np.float32) + k_input = np.random.normal( + loc=0.0, + scale=1, + size=(self.batch_size, self.kv_seqlen, self.hidden_dim), + ).astype(np.float32) + v_input = np.random.normal( + loc=0.0, + scale=1, + size=(self.batch_size, self.kv_seqlen, self.hidden_dim), + ).astype(np.float32) + + q_actual_seqlen = np.full( + shape=(self.batch_size,), fill_value=self.q_seqlen, dtype=np.int32 + ) + kv_actual_seqlen = np.random.randint( + low=5, high=self.kv_seqlen, size=(self.batch_size,), dtype=np.int32 + ) + attn_mask_arr = np.zeros( + shape=(self.batch_size, 1, self.q_seqlen, self.kv_seqlen), + dtype=np.int32, + ) + for i in range(self.batch_size): + attn_mask_arr[i, :, : q_actual_seqlen[i], : kv_actual_seqlen[i]] = 1 + self.feed = { + "_q_input": q_input, + "_k_input": k_input, + "_v_input": v_input, + "_attn_mask": attn_mask_arr, + } + + def _reset_program_state_dict(self, model, hidden_dim): + ''' + Set the weight of q, k, v, o proj to be the same value. 
+        '''
+        state_dict = model.state_dict()
+        reset_state_dict = {}
+        for n, p in state_dict.items():
+            if p.shape == (hidden_dim, hidden_dim):
+                reset_state_dict[p.name] = self.init_weight
+        return reset_state_dict
+
+    def _build_program(self, main_prog, startup_prog):
+        with paddle.static.program_guard(main_prog, startup_prog):
+            q_input = paddle.static.data(
+                name="_q_input",
+                shape=[-1, -1, self.hidden_dim],
+                dtype='float32',
+            )
+            k_input = paddle.static.data(
+                name="_k_input",
+                shape=[-1, -1, self.hidden_dim],
+                dtype='float32',
+            )
+            v_input = paddle.static.data(
+                name="_v_input",
+                shape=[-1, -1, self.hidden_dim],
+                dtype='float32',
+            )
+            attn_mask = paddle.static.data(
+                name="_attn_mask",
+                shape=[-1, 1, self.q_seqlen, self.kv_seqlen],
+                dtype='int32',
+            )
+
+            model = mha(
+                self.hidden_dim,
+                self.num_heads,
+                self.dropout,
+                num_layers=self.num_layers,
+            )
+            loss = model(
+                q_input,
+                k_input,
+                v_input,
+                attn_mask,
+            )
+            opt = paddle.optimizer.SGD(learning_rate=0.1)
+            amp_list = paddle.static.amp.CustomOpLists(
+                custom_white_list=['softmax']
+            )
+            # Only test AMP because cudnn v8 fmha only supports half precision currently.
+            opt = paddle.static.amp.decorate(
+                optimizer=opt,
+                amp_lists=amp_list,
+                init_loss_scaling=128.0,
+                use_dynamic_loss_scaling=True,
+            )
+            opt.minimize(loss)
+        return loss, model
+
+    def _test_ref(self):
+        main_prog = paddle.static.Program()
+        startup_prog = paddle.static.Program()
+        exe = paddle.static.Executor(self.place)
+
+        loss, model = self._build_program(main_prog, startup_prog)
+        exe.run(startup_prog)
+        reset_state_dict = self._reset_program_state_dict(
+            model, self.hidden_dim
+        )
+        paddle.static.set_program_state(main_prog, reset_state_dict)
+        self.reference = []
+        for i in range(self.run_steps):
+            loss_return = exe.run(
+                main_prog,
+                feed=self.feed,
+                fetch_list=[loss.name],
+            )
+            self.reference.append(loss_return[0])
+
+    def _test_ir_pass(self):
+        main_prog = paddle.static.Program()
+        startup_prog = paddle.static.Program()
+        exe = paddle.static.Executor(self.place)
+
+        loss, model = self._build_program(main_prog, startup_prog)
+
+        exe.run(startup_prog)
+        reset_state_dict = self._reset_program_state_dict(
+            model, self.hidden_dim
+        )
+        paddle.static.set_program_state(main_prog, reset_state_dict)
+        build_strategy = paddle.static.BuildStrategy()
+        build_strategy.fuse_dot_product_attention = True
+        self.program = paddle.static.CompiledProgram(
+            main_prog, build_strategy=build_strategy
+        )
+        self.result = []
+        for i in range(self.run_steps):
+            loss_return = exe.run(
+                self.program, feed=self.feed, fetch_list=[loss.name]
+            )
+            self.result.append(loss_return[0])
+
+    def test_compare_results(self):
+        self._test_ref()
+        self._test_ir_pass()
+        np.testing.assert_allclose(
+            self.reference,
+            self.result,
+            atol=self.atol,
+            rtol=self.rtol,
+            equal_nan=True,
+            err_msg=f"[{type(self).__name__}] outputs are mismatched.",
+        )
+        self.assertTrue(
+            verify_node_count(
+                self.program._graph,
+                self.check_fused_fwd_op_name,
+                self.num_layers,
+            ),
+            f"[{type(self).__name__}] The number of {self.check_fused_fwd_op_name} is mismatched in the computing graph.",
+        )
+        self.assertTrue(
+            verify_node_count(
+                self.program._graph,
+                self.check_fused_bwd_op_name,
+                self.num_layers,
+            ),
+            f"[{type(self).__name__}] The number of {self.check_fused_bwd_op_name} is mismatched in the computing graph.",
+        )
+
+
+@unittest.skipIf(skip_unit_test(), skip_msg)
+class TestFuseDotProductAttentionCase2(TestFuseDotProductAttention):
+    def
_set_shape(self): + self.batch_size = 4 + self.num_heads = 12 + self.head_size = 64 + self.q_seqlen = 512 + self.kv_seqlen = 512 + + +@unittest.skipIf(skip_unit_test(), skip_msg) +class TestFuseDotProductAttentionCase3(TestFuseDotProductAttention): + def _set_shape(self): + self.batch_size = 2 + self.num_heads = 12 + self.head_size = 128 + self.q_seqlen = 1024 + self.kv_seqlen = 1024 + + +@unittest.skipIf(skip_unit_test(), skip_msg) +class TestFuseDotProductAttentionCase4(TestFuseDotProductAttention): + def _set_shape(self): + self.batch_size = 2 + self.num_heads = 12 + self.head_size = 128 + self.q_seqlen = 2048 + self.kv_seqlen = 2048 + + +if __name__ == '__main__': + unittest.main() diff --git a/test/legacy_test/test_fused_dot_product_attention_op.py b/test/legacy_test/test_fused_dot_product_attention_op.py new file mode 100644 index 00000000000000..53c4ab1bf7ae78 --- /dev/null +++ b/test/legacy_test/test_fused_dot_product_attention_op.py @@ -0,0 +1,388 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +from op_test import ( + OpTest, + convert_float_to_uint16, + convert_uint16_to_float, +) + +import paddle +import paddle.nn.functional as F +from paddle.incubate.nn.functional import fused_dot_product_attention + +np.random.seed(2023) + + +def skip_unit_test(): + return ( + not paddle.is_compiled_with_cuda() + or paddle.device.cuda.get_device_capability() != (8, 0) + or paddle.get_cudnn_version() < 8906 + ) + + +skip_msg = ( + "only support with cuda and CUDNN 8.9.6 or later," + " and only Ampere and later GPU is supported." +) + + +@unittest.skipIf(skip_unit_test(), skip_msg) +class TestFusedAttentionOpFP16(OpTest): + def _set_shape(self): + self.batch_size = 8 + self.q_seqlen = 128 + self.kv_seqlen = 128 + self.num_heads = 12 + self.head_size = 64 + + def _set_config(self): + self.has_attn_mask = False + self.is_causal_masking = False + self.dropout_prob = 0.0 + self.dtype = "float16" + self.rtol = 5e-4 + self.atol = 5e-4 + + def setUp(self): + self._set_shape() + self._set_config() + # has_attn_mask and is_causal_masking can't be True at the same time + assert not (self.has_attn_mask and self.is_causal_masking) + self.training = True + self.scaling_factor = self.head_size**-0.5 + self.q_shape = ( + self.batch_size, + self.q_seqlen, + self.num_heads, + self.head_size, + ) + self.kv_shape = ( + self.batch_size, + self.kv_seqlen, + self.num_heads, + self.head_size, + ) + self._generate_input_data() + self.__class__.op_type = "fused_dot_product_attention" + # use autograd to check grad in this unittest. + self.__class__.no_need_check_grad = True + + def _generate_input_data(self): + def _random(shape, mask=None): + if self.dtype == "bfloat16": + data = np.random.normal(loc=0.0, scale=0.02, size=shape).astype( + "float32" + ) + # mask has the same shape as data, if the mask value is 0, the + # corresponding data value will be set to 0. 
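+                # numpy has no native bfloat16 dtype, so bfloat16 data is
+                # carried around as uint16 bit patterns and converted with
+                # the op_test helpers convert_float_to_uint16 /
+                # convert_uint16_to_float imported above.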
+                if mask is not None:
+                    data = data * mask
+                return convert_float_to_uint16(data)
+            else:
+                data = np.random.random(shape).astype("float32")
+                if mask is not None:
+                    data = data * mask
+                return data.astype(self.dtype)
+
+        self.q = _random(self.q_shape)
+        self.k = _random(self.kv_shape)
+        self.v = _random(self.kv_shape)
+
+        self.q_actual_seqlen = np.full(
+            shape=(self.batch_size,), fill_value=self.q_seqlen, dtype=np.int32
+        )
+        self.kv_actual_seqlen = np.full(
+            shape=(self.batch_size,), fill_value=self.kv_seqlen, dtype=np.int32
+        )
+        self.attn_mask = np.ones(
+            shape=(self.batch_size, 1, self.q_seqlen, self.kv_seqlen),
+            dtype=np.int32,
+        )
+        if self.has_attn_mask:
+            self.q_actual_seqlen = np.random.randint(
+                low=20,
+                high=self.q_seqlen,
+                size=(self.batch_size,),
+                dtype=np.int32,
+            )
+            self.kv_actual_seqlen = np.random.randint(
+                low=20,
+                high=self.kv_seqlen,
+                size=(self.batch_size,),
+                dtype=np.int32,
+            )
+            self.attn_mask = np.zeros(
+                shape=(self.batch_size, 1, self.q_seqlen, self.kv_seqlen),
+                dtype=np.int32,
+            )
+            for i in range(0, self.batch_size):
+                self.attn_mask[
+                    i,
+                    0,
+                    0 : self.q_actual_seqlen[i],
+                    0 : self.kv_actual_seqlen[i],
+                ] = 1
+
+        # need to set invalid position of dout to 0
+        dout_shape = (
+            self.batch_size,
+            self.q_seqlen,
+            self.num_heads,
+            self.head_size,
+        )
+        dout_mask = None
+        if self.has_attn_mask:
+            dout_mask = np.ones(shape=dout_shape, dtype=np.int32)
+            for i in range(0, self.batch_size):
+                dout_mask[i, self.q_actual_seqlen[i] :, :, :] = 0
+        self.dout = _random(dout_shape, dout_mask)
+
+    def _get_reference_out(self):
+        paddle.disable_static(place=paddle.CUDAPlace(0))
+        q_tensor = paddle.to_tensor(self.q, stop_gradient=False)
+        k_tensor = paddle.to_tensor(self.k, stop_gradient=False)
+        v_tensor = paddle.to_tensor(self.v, stop_gradient=False)
+
+        q_out = paddle.transpose(
+            x=q_tensor, perm=[0, 2, 1, 3]
+        )  # [b, s, h, d] -> [b, h, s, d]
+        k_out = paddle.transpose(
+            x=k_tensor, perm=[0, 2, 1, 3]
+        )  # [b, s, h, d] -> [b, h, s, d]
+        v_out = paddle.transpose(
+            x=v_tensor, perm=[0, 2, 1, 3]
+        )  # [b, s, h, d] -> [b, h, s, d]
+
+        qk_out = paddle.matmul(
+            x=q_out * self.scaling_factor,
+            y=k_out,
+            transpose_x=False,
+            transpose_y=True,
+        )
+
+        if self.is_causal_masking:
+            self.attn_mask = np.tril(self.attn_mask, k=0)
+
+        if self.has_attn_mask or self.is_causal_masking:
+            attn_mask = paddle.to_tensor(self.attn_mask, stop_gradient=True)
+            attn_mask = (paddle.cast(attn_mask, self.dtype) - 1.0) * 1e4
+            attn_mask_out = qk_out + attn_mask
+            softmax_out = F.softmax(attn_mask_out)
+        else:
+            softmax_out = F.softmax(qk_out)
+
+        if self.dropout_prob:
+            dropout_out = F.dropout(
+                softmax_out,
+                self.dropout_prob,
+                training=self.training,
+                mode="upscale_in_train",
+            )
+            qkv_out = paddle.matmul(dropout_out, v_out)
+        else:
+            qkv_out = paddle.matmul(softmax_out, v_out)
+
+        mha_out = paddle.transpose(
+            qkv_out, perm=[0, 2, 1, 3]
+        )  # [b, h, s, d] -> [b, s, h, d]
+
+        paddle.autograd.backward(
+            [mha_out],
+            [paddle.to_tensor(self.dout, dtype=self.dtype)],
+            retain_graph=True,
+        )
+
+        # need to set invalid position of output to 0
+        valid_mha_out = paddle.full_like(mha_out, 0)
+        for i in range(0, self.batch_size):
+            valid_mha_out[i, 0 : self.q_actual_seqlen[i], :, :] = mha_out[
+                i, 0 : self.q_actual_seqlen[i], :, :
+            ]
+
+        return (
+            valid_mha_out,
+            q_tensor.grad,
+            k_tensor.grad,
+            v_tensor.grad,
+            softmax_out,
+        )
+
+    def _get_fused_attn_out(self):
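+        # Mirrors _get_reference_out(): runs the fused cudnn kernel through
+        # the public fused_dot_product_attention API and returns the same
+        # tuple layout (output, dq, dk, dv, softmax output), so the first
+        # four entries can be compared element-wise in _compare_output().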
paddle.disable_static(place=paddle.CUDAPlace(0)) + q_tensor = paddle.to_tensor(self.q, stop_gradient=False) + k_tensor = paddle.to_tensor(self.k, stop_gradient=False) + v_tensor = paddle.to_tensor(self.v, stop_gradient=False) + + attn_mask = paddle.to_tensor(self.attn_mask, stop_gradient=True) + + ( + fmha_out, + softmax_out, + ) = fused_dot_product_attention( + q_tensor, + k_tensor, + v_tensor, + attn_mask, + self.scaling_factor, + self.dropout_prob, + True, + self.is_causal_masking, + True, + ) + + paddle.autograd.backward( + [fmha_out], [paddle.to_tensor(self.dout)], retain_graph=True + ) + + return ( + fmha_out, + q_tensor.grad, + k_tensor.grad, + v_tensor.grad, + softmax_out, + ) + + def _compare_output(self): + def _convert(value): + if self.dtype == "bfloat16": + return convert_uint16_to_float(value) + return value + + output_names = [ + "fmha_out", + "q_grad", + "k_grad", + "v_grad", + ] + + outputs_ref = self._get_reference_out() + outputs_fused = self._get_fused_attn_out() + + for i in range(len(output_names)): + ref_res = outputs_ref[i] + fused_res = outputs_fused[i] + np.testing.assert_allclose( + _convert(ref_res.numpy()), + _convert(fused_res.numpy()), + atol=self.atol, + rtol=self.rtol, + err_msg=f"Checking < {output_names[i]} > failed", + ) + + def test_output(self): + self._compare_output() + + +@unittest.skipIf(skip_unit_test(), skip_msg) +class TestFusedAttentionOpFP16WithPaddingMask(TestFusedAttentionOpFP16): + def _set_config(self): + self.has_attn_mask = True + self.is_causal_masking = False + self.dropout_prob = 0.0 + self.dtype = "float16" + self.rtol = 5e-3 + self.atol = 5e-3 + + +@unittest.skipIf(skip_unit_test(), skip_msg) +class TestFusedAttentionOpFP16WithCausalMask(TestFusedAttentionOpFP16): + def _set_config(self): + self.has_attn_mask = False + self.is_causal_masking = True + self.dropout_prob = 0.0 + self.dtype = "float16" + self.rtol = 5e-3 + self.atol = 5e-3 + + +@unittest.skipIf(skip_unit_test(), skip_msg) +class TestFusedAttentionOpBF16(TestFusedAttentionOpFP16): + def _set_config(self): + self.has_attn_mask = False + self.is_causal_masking = False + self.dropout_prob = 0.0 + self.dtype = "bfloat16" + self.rtol = 5e-3 + self.atol = 5e-3 + + +@unittest.skipIf(skip_unit_test(), skip_msg) +class TestFusedAttentionOpBF16WithPaddingMask(TestFusedAttentionOpFP16): + def _set_config(self): + self.has_attn_mask = True + self.is_causal_masking = False + self.dropout_prob = 0.0 + self.dtype = "bfloat16" + self.rtol = 5e-3 + self.atol = 5e-3 + + +@unittest.skipIf(skip_unit_test(), skip_msg) +class TestFusedAttentionOpBF16WithCausalMask(TestFusedAttentionOpFP16): + def _set_config(self): + self.has_attn_mask = False + self.is_causal_masking = True + self.dropout_prob = 0.0 + self.dtype = "bfloat16" + self.rtol = 5e-3 + self.atol = 5e-3 + + +@unittest.skipIf(skip_unit_test(), skip_msg) +class TestFusedAttentionOpBF16WithPaddingMaskCase2( + TestFusedAttentionOpBF16WithPaddingMask +): + def _set_shape(self): + self.batch_size = 2 + self.q_seqlen = 1024 + self.kv_seqlen = 1024 + self.num_heads = 4 + self.head_size = 64 + + +@unittest.skipIf(skip_unit_test(), skip_msg) +class TestFusedAttentionOpBF16WithPaddingMaskCase3( + TestFusedAttentionOpBF16WithPaddingMask +): + def _set_shape(self): + self.batch_size = 1 + self.q_seqlen = 2048 + self.kv_seqlen = 2048 + self.num_heads = 2 + self.head_size = 128 + + +@unittest.skipIf(skip_unit_test(), skip_msg) +class TestFusedAttentionOpBF16WithCausalMaskCase2( + TestFusedAttentionOpBF16WithCausalMask +): + def _set_shape(self): + 
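+        # Variant with longer sequences (1024) and a larger head size (128)
+        # under causal masking.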
self.batch_size = 2 + self.q_seqlen = 1024 + self.kv_seqlen = 1024 + self.num_heads = 4 + self.head_size = 128 + + +if __name__ == "__main__": + unittest.main() diff --git a/tools/gpups_test.sh b/tools/gpups_test.sh index f3ea012f69c5ec..9807f5f746a690 100644 --- a/tools/gpups_test.sh +++ b/tools/gpups_test.sh @@ -100,6 +100,9 @@ parallel_list="^init_phi_test$|\ ^test_fused_token_prune_op$|\ ^test_fused_transformer_encoder_layer$|\ ^test_fused_transformer_with_amp_decorator$|\ +^test_fused_dot_product_attention_op$|\ +^test_fuse_dot_product_attention_pass$|\ +^test_fused_dot_product_attention_pass$|\ ^test_gather_nd_op$|\ ^test_index_select_op$|\ ^test_pass_base_list$|\ From b2c3d74dd0dd60c82d957bf732a33344fa1e8189 Mon Sep 17 00:00:00 2001 From: danleifeng <52735331+danleifeng@users.noreply.github.com> Date: Tue, 12 Dec 2023 14:26:13 +0800 Subject: [PATCH 27/28] fix cuda117 compile error and rm useless code; test=develop (#59892) --- paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h | 2 ++ paddle/fluid/framework/tensor_util.cc | 2 -- python/paddle/distributed/fleet/base/strategy_compiler.py | 1 - python/paddle/incubate/operators/unzip.py | 1 + 4 files changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h index a7b6b0149282a9..315a9860ed67a2 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h @@ -22,10 +22,12 @@ #include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h" #ifdef PADDLE_WITH_HETERPS +#if defined(__NVCC__) || defined(__HIPCC__) #include #include #include #include +#endif #include "paddle/fluid/framework/fleet/heter_ps/mem_pool.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/dynload/nccl.h" diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 504f8ec4dff8d6..27dc5902c75ba3 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -307,8 +307,6 @@ void TensorCopySync(const phi::DenseTensor& src, return; } - VLOG(0) << "TensorCopySync " << src.dims() << " from " << src.place() - << " to " << dst_place; src.check_memory_size(); dst->Resize(src.dims()); dst->set_layout(src.layout()); diff --git a/python/paddle/distributed/fleet/base/strategy_compiler.py b/python/paddle/distributed/fleet/base/strategy_compiler.py index abd769b1a2d3bd..d66ca5e39ecff8 100644 --- a/python/paddle/distributed/fleet/base/strategy_compiler.py +++ b/python/paddle/distributed/fleet/base/strategy_compiler.py @@ -94,7 +94,6 @@ def maximum_path_len_algo(optimizer_list): edge, indegree = create_graph(optimizer_list) topo_sort(edge, indegree) max_path = floyd(edge) - print("max_path=", max_path) candidate = [] for idx in max_path: candidate.append(optimizer_list[idx]) diff --git a/python/paddle/incubate/operators/unzip.py b/python/paddle/incubate/operators/unzip.py index 7a5173f4cf9d4b..3eb33804153052 100644 --- a/python/paddle/incubate/operators/unzip.py +++ b/python/paddle/incubate/operators/unzip.py @@ -25,6 +25,7 @@ def unzip(input, lod, len): Args: input (Variable): The zipped input + len(int): The second dim length of unzipped output. lod (Variable): The original lod of unzipped input, 1-D LodTensor with shape[K]. 
Returns: From 76ff948dff22d35500128fb2da767dad367600bd Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Tue, 12 Dec 2023 14:45:30 +0800 Subject: [PATCH 28/28] [PIR+CINN]Adjust file path length and Catch Exception (#59898) --- .../paddle/jit/dy2static/export_subgraph.py | 35 +++++++++++-------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/python/paddle/jit/dy2static/export_subgraph.py b/python/paddle/jit/dy2static/export_subgraph.py index 74b305394bd8c3..d77cf9df53ae79 100644 --- a/python/paddle/jit/dy2static/export_subgraph.py +++ b/python/paddle/jit/dy2static/export_subgraph.py @@ -21,7 +21,7 @@ __all__ = [] -MAX_FILE_PATH_LEN = 50 +MAX_FILE_PATH_LEN = 100 class SubGraphRole: @@ -164,9 +164,7 @@ def parse_inout(self): raw_outputs = self.pp_layer._outputs.tolist() inter_outs = { - name - for name in self.raw_inter_outs - if self.program.block(0).has_var(name) + name for name in self.raw_inter_outs if global_block.has_var(name) } for var in raw_inputs: inputs.append(var.name) @@ -219,14 +217,21 @@ def pir_exporter(pp_layer, program, role, shared_inputs=None, inter_outs=None): root_saving_dir = get_saving_dir() if not root_saving_dir: return - copy_program = program.clone() - if role == SubGraphRole.Infer: - InferExporter(pp_layer, copy_program, role).save() - elif role == SubGraphRole.Forward: - TrainFwdExporter(pp_layer, copy_program, role, inter_outs).save() - elif role == SubGraphRole.Backward: - TrainBwdExporter( - pp_layer, copy_program, role, shared_inputs, inter_outs - ).save() - else: - raise RuntimeError("role only support Infer/Forward/Backward") + try: + copy_program = program.clone() + if role == SubGraphRole.Infer: + InferExporter(pp_layer, copy_program, role).save() + elif role == SubGraphRole.Forward: + TrainFwdExporter(pp_layer, copy_program, role, inter_outs).save() + elif role == SubGraphRole.Backward: + TrainBwdExporter( + pp_layer, copy_program, role, shared_inputs, inter_outs + ).save() + else: + raise RuntimeError( + f"role only support Infer/Forward/Backward, but got: {role}" + ) + except Exception as e: + print( + f"Export subgraph failed: {e}\n. Received original program: {str(program)}" + )
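For context, here is a minimal dynamic-graph sketch of the fused API that the
new tests exercise. The argument order mirrors the call made in
test_fused_dot_product_attention_op.py; the semantics of the two trailing
booleans (read here as a training flag and a return-softmax flag) are an
assumption based on that test, not something this series states explicitly,
and the snippet needs a CUDA build with cudnn >= 8.9.6 on an Ampere GPU, as
the skip guards in the tests require.

    import paddle
    from paddle.incubate.nn.functional import fused_dot_product_attention

    # Inputs use the [batch, seqlen, num_heads, head_size] layout and half
    # precision, matching the shapes used throughout the tests above.
    q = paddle.randn([2, 128, 4, 64], dtype="float16")
    k = paddle.randn([2, 128, 4, 64], dtype="float16")
    v = paddle.randn([2, 128, 4, 64], dtype="float16")
    q.stop_gradient = False
    k.stop_gradient = False
    v.stop_gradient = False
    # An all-ones mask means no padding; shape is [batch, 1, q_seqlen, kv_seqlen].
    mask = paddle.ones([2, 1, 128, 128], dtype="int32")

    out, softmax_out = fused_dot_product_attention(
        q, k, v, mask,
        64**-0.5,  # scaling_factor = head_size ** -0.5
        0.0,       # dropout probability
        True,      # assumed: training mode
        False,     # is_causal_masking
        True,      # assumed: whether to return the softmax output
    )
    # out keeps the [batch, seqlen, num_heads, head_size] layout of q, and
    # gradients flow back to q, k, v through the fused kernel.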