Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

【pir】modify add_n in while use blockarg instead of input value #60668

Merged
merged 12 commits into from
Jan 11, 2024
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,13 @@ void ShareVarData(const Variable* src_var, Variable* dst_var) {
}
}

// Element-wise overload of ShareVarData: shares the underlying data of every
// Variable held in `src_var` into the corresponding slot of `dst_var` by
// delegating each element to the scalar ShareVarData(const Variable*, Variable*).
// NOTE(review): assumes dst_var already holds at least src_var->size()
// elements — confirm at call sites.
void ShareVarData(const VariableRefArray* src_var, VariableRefArray* dst_var) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这个函数是不是统一到 void ShareVarData(const Variable* src_var, Variable* dst_var) 即可?

// const_cast is required because VariableRefArray::at() returns a
// const Variable*, while the scalar overload mutates the destination.
for (size_t i = 0; i < src_var->size(); ++i) {
Variable* copy_var = const_cast<Variable*>(dst_var->at(i));
ShareVarData(src_var->at(i), copy_var);
}
}

void TuplePopInstruction::Run() {
VLOG(6) << "run tuple_pop instruction";
if (tuple_pop_op_.tuple_size() == 0) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,53 @@ TuplePushInstruction::TuplePushInstruction(size_t id,
type_ = OpFuncType::kGpuAsync;
break;
}
} else if (inlet_element_value.type()
.isa<paddle::dialect::AllocatedDenseTensorArrayType>()) {
auto place =
inlet_element_value.type()
.dyn_cast<paddle::dialect::AllocatedDenseTensorArrayType>()
.place();
if (place == phi::GPUPlace()) {
type_ = OpFuncType::kGpuAsync;
break;
}
} else if (inlet_element_value.type().isa<pir::VectorType>()) {
pir::VectorType inlet_element_type =
inlet_element_value.type().dyn_cast<pir::VectorType>();
for (size_t i = 0; i < static_cast<size_t>(inlet_element_type.size());
i++) {
if (inlet_element_type[i]
.isa<paddle::dialect::AllocatedDenseTensorType>()) {
auto place =
inlet_element_type[i]
.dyn_cast<paddle::dialect::AllocatedDenseTensorType>()
.place();
if (place == phi::GPUPlace()) {
type_ = OpFuncType::kGpuAsync;
break;
}
} else if (inlet_element_type[i]
.isa<paddle::dialect::AllocatedDenseTensorArrayType>()) {
auto place =
inlet_element_value.type()
.dyn_cast<paddle::dialect::AllocatedDenseTensorArrayType>()
.place();
if (place == phi::GPUPlace()) {
type_ = OpFuncType::kGpuAsync;
break;
}
} else {
PADDLE_THROW(phi::errors::PreconditionNotMet(
"Only support AllocatedDenseTensorType and "
"AllocatedDenseTensorArrayType in vectortype now, but get: %s",
inlet_element_type[i]));
}
}
} else {
PADDLE_THROW(phi::errors::PreconditionNotMet(
"Only support AllocatedDenseTensorType now"));
"Only support AllocatedDenseTensorType and "
"AllocatedDenseTensorArrayType in vectortype now, but get: %s",
inlet_element_value.type()));
}
}
Comment on lines +63 to 111
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

建议这段逻辑抽象一下:

static std::vector<phi::Place> ParsePlace(pir::Type type) {
  std::vector<phi::Place> rtn;
  if (type.isa<paddle::dialect::AllocatedDenseTensorType>()) {
    rtn.push_back(type.dyn_cast<paddle::dialect::AllocatedDenseTensorType>().place());
  } else if (type.isa<paddle::dialect::AllocatedDenseTensorArrayType>()) {
    rtn.push_back(type.dyn_cast<paddle::dialect::AllocatedDenseTensorArrayType>().place());
  } else if (type.isa<pir::VectorType>()) {
    pir::VectorType inlet_element_type = type.dyn_cast<pir::VectorType>();
    for (size_t i = 0; i < static_cast<size_t>(inlet_element_type.size()); i++) {
      auto out = ParsePlace(inlet_element_type[i]);
      rtn.insert(rtn.end(), out.begin(), out.end());
    }
  } else {
    PADDLE_THROW(phi::errors::PreconditionNotMet(
        "Only support AllocatedDenseTensorType and "
        "AllocatedDenseTensorArrayType in vectortype now, but get: %s",
        type));
  }
  return rtn;
}


auto all_place = ParsePlace(inlet_element_value.type());
for (auto p : all_place) {
  if (p == phi::GPUPlace()) {
    type_ = OpFuncType::kGpuAsync;
    break;
  }
}

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

下个pr 修改

}
Expand Down
5 changes: 2 additions & 3 deletions paddle/fluid/pir/dialect/operator/ir/manual_op_vjp.cc
Original file line number Diff line number Diff line change
Expand Up @@ -185,10 +185,9 @@ std::vector<std::vector<pir::OpResult>> Increment_Op::Vjp(

VLOG(6) << "Vjp prepare call increment_'s vjp inteface";

pir::OpResult tensor_res = paddle::dialect::increment_(inputs_[0][0], -value);

std::vector<std::vector<pir::OpResult>> res{{tensor_res}};
paddle::dialect::increment_(inputs_[0][0], -value);

std::vector<std::vector<pir::OpResult>> res;
return res;
}

Expand Down
12 changes: 6 additions & 6 deletions paddle/fluid/pybind/eager_utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2177,14 +2177,14 @@ std::vector<pir::Value> CastPyArg2VectorOfValue(PyObject* obj,
for (Py_ssize_t i = 0; i < len; i++) {
item = PyList_GetItem(obj, i);
item = CastPyArg2ValuePreHook(item);
if (PyObject_TypeCheck(item, g_ir_opresult_pytype)) {
value_list.emplace_back(::pybind11::handle(item).cast<pir::OpResult>());
if (PyObject_TypeCheck(item, g_ir_value_pytype)) {
value_list.emplace_back(::pybind11::handle(item).cast<pir::Value>());
} else if (item == Py_None) {
continue;
} else {
PADDLE_THROW(platform::errors::InvalidType(
"%s(): argument (position %d) must be "
"vector<OpResult>, but got vector<%s>",
"vector<Value>, but got vector<%s>",
op_type,
arg_pos + 1,
reinterpret_cast<PyTypeObject*>(item->ob_type)
Expand All @@ -2197,14 +2197,14 @@ std::vector<pir::Value> CastPyArg2VectorOfValue(PyObject* obj,
for (Py_ssize_t i = 0; i < len; i++) {
item = PyTuple_GetItem(obj, i);
item = CastPyArg2ValuePreHook(item);
if (PyObject_TypeCheck(item, g_ir_opresult_pytype)) {
value_list.emplace_back(::pybind11::handle(item).cast<pir::OpResult>());
if (PyObject_TypeCheck(item, g_ir_value_pytype)) {
value_list.emplace_back(::pybind11::handle(item).cast<pir::Value>());
} else if (item == Py_None) {
continue;
} else {
PADDLE_THROW(platform::errors::InvalidType(
"%s(): argument (position %d) must be "
"vector<OpResult>, but got vector<%s>",
"vector<Value>, but got vector<%s>",
op_type,
arg_pos + 1,
reinterpret_cast<PyTypeObject*>(item->ob_type)
Expand Down
43 changes: 19 additions & 24 deletions python/paddle/autograd/ir_backward.py
Original file line number Diff line number Diff line change
Expand Up @@ -460,14 +460,15 @@ def append_add_n(value):
# value is input of more than one fwd_op,
# so more than one bwd_op create input_grad,
# need add sum op to accumulate gradient
if value.is_tensorarray():
add_n_value = paddle._pir_ops.add_n_array(
[item[0] for item in state.value_to_valuegrad[value]]
add_n_list = []
for item in state.value_to_valuegrad[value]:
add_n_list.append(
return_map_value(item[0], bwd_value_to_block_argument_map)
)
if value.is_tensorarray():
add_n_value = paddle._pir_ops.add_n_array(add_n_list)
else:
add_n_value = paddle.add_n(
[item[0] for item in state.value_to_valuegrad[value]]
)
add_n_value = paddle.add_n(add_n_list)

add_n_op = add_n_value.get_defining_op()
combine_op = add_n_op.operand_source(0).get_defining_op()
Expand Down Expand Up @@ -581,8 +582,6 @@ def get_grad_semantic_info(op):
"pd_op.if",
"pd_op.while",
"cf.tuple_push",
"pd_op.increment_",
"pd_op.increment",
]:
grad_semantic_info = [
True for _ in range(len(get_real_op_inputs(op)))
Expand All @@ -599,7 +598,8 @@ def make_input_with_input_stopgradient(op):
):
if not grad_semantic:
if (
input.get_defining_op() is not None
op.name() != "cf.tuple_push"
and input.get_defining_op() is not None
and input.get_defining_op().name() == "builtin.combine"
):
tmp_input = []
Expand All @@ -621,7 +621,8 @@ def make_input_with_input_stopgradient(op):
continue

if (
input.get_defining_op() is not None
op.name() != "cf.tuple_push"
and input.get_defining_op() is not None
and input.get_defining_op().name() == "builtin.combine"
):
(
Expand Down Expand Up @@ -652,7 +653,8 @@ def update_input_grad_map(op, input_grads, all_inputs):
continue

if (
input.get_defining_op() is not None
op.name() != "cf.tuple_push"
and input.get_defining_op() is not None
and input.get_defining_op().name() == "builtin.combine"
):
update_input_grad_map(
Expand Down Expand Up @@ -785,7 +787,7 @@ def argument_to_value(while_op):
input_grad_stopgradients,
) = make_input_with_input_stopgradient(op)

if op.name() in ["cf.tuple_push", "pd_op.increment_"]:
if op.name() == "cf.tuple_push":
with dynamic_shape_prim_vjp_guard(op, inputs):
copy_out = paddle.framework.core.call_vjp(
op,
Expand All @@ -797,17 +799,7 @@ def argument_to_value(while_op):

pop_op = bwd_block.ops[-1]
bwd_ops = [pop_op]
tmp_inputs = (
inputs
if op.name() == "pd_op.increment_"
else inputs[1:]
)
tmp_copy_out = (
copy_out
if op.name() == "pd_op.increment_"
else copy_out[1:]
)
for output, copy_output in zip(tmp_inputs, tmp_copy_out):
for output, copy_output in zip(inputs[1:], copy_out[1:]):
control_flow_value_to_copyvalue_map[
output[0]
] = copy_output[0]
Expand All @@ -819,7 +811,7 @@ def argument_to_value(while_op):
# should be delete (prune sub_graph)
if (
len(output_grads) == 0 or all(zero_flag)
) and op.name() != "pd_op.while":
) and op.name() not in ["pd_op.while", "pd_op.increment_"]:
continue

if op.name() == "pd_op.if":
Expand Down Expand Up @@ -902,6 +894,9 @@ def argument_to_value(while_op):
_,
sub_bwd_value_to_block_argument_map,
) = argument_to_value(grad_op)
sub_bwd_value_to_block_argument_map.update(
bwd_value_to_block_argument_map
)
while_grad_block = grad_op.as_while_op().body()
sub_backward_ops = []
append_backward_ops(
Expand Down
5 changes: 3 additions & 2 deletions python/paddle/tensor/math.py
Original file line number Diff line number Diff line change
Expand Up @@ -1980,7 +1980,7 @@ def add_n(inputs, name=None):
[14., 16., 18.]])
"""
if in_dynamic_or_pir_mode():
if isinstance(inputs, (Variable, paddle.pir.OpResult)):
if isinstance(inputs, (Variable, paddle.pir.Value)):
inputs = [inputs]
return _C_ops.add_n(inputs)
else:
Expand Down Expand Up @@ -4726,7 +4726,8 @@ def increment(x, value=1.0, name=None):
x, 'x', ['float32', 'float64', 'int32', 'int64'], 'increment'
)
if in_pir_mode():
return _C_ops.increment_(x, value)
_C_ops.increment_(x, value)
return x
else:
helper = LayerHelper("increment", **locals())
helper.append_op(
Expand Down
43 changes: 19 additions & 24 deletions test/legacy_test/test_while_loop_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,7 @@ def body(i, x):
feed_i = np.ones(1).astype('float32')
feed_x = np.ones(1).astype('float32')
data = np.asarray([100]).astype('float32')
i_grad = np.asarray([0]).astype('float32')
i_grad = np.asarray([20]).astype('float32')
x_grad = np.asarray([0]).astype('float32')

for p, g in grad_list:
Expand All @@ -308,6 +308,7 @@ def body(i, x):
feed={'i': feed_i, 'x': feed_x},
fetch_list=[mean, di, dx],
)

np.testing.assert_allclose(np.asarray(res[0]), data, rtol=1e-05)
np.testing.assert_allclose(np.asarray(res[1]), i_grad, rtol=1e-05)
np.testing.assert_allclose(np.asarray(res[2]), x_grad, rtol=1e-05)
Expand Down Expand Up @@ -372,7 +373,8 @@ def body(i, x):

class TestApiWhileLoop_NestedWithBackwardAndLoDTensorArray(unittest.TestCase):
# TODO(zhangbo): Support while grad exe for pir
# @test_with_pir_api

@test_with_pir_api
def test_nested_net_with_backward_and_lodtensor(self):
def external_cond(i, j, x, mem_array):
return paddle.less_than(i, array_len)
Expand Down Expand Up @@ -456,7 +458,7 @@ def internal_body(j, x, mem_array):

d = []
for i in range(3):
d.append(np.random.random(size=[10]).astype('float32'))
d.append(np.ones(10).astype('float32'))
feed_x = np.ones(10).astype('float32')
data_sum = d[0] + d[1] + d[2] + 3 * feed_x
x_grad = [0.3] * 10
Expand All @@ -478,7 +480,7 @@ def internal_body(j, x, mem_array):
np.testing.assert_allclose(res[0], data_sum, rtol=1e-05)
np.testing.assert_allclose(res[1], x_grad, rtol=1e-05)

def _test_while_with_inplace(self):
def test_while_backward_with_inplace(self):
with paddle.pir_utils.IrGuard():

def internal_cond(i, x, mem_array):
Expand Down Expand Up @@ -519,9 +521,9 @@ def internal_body(i, x, mem_array):
j = paddle.increment(j)
dmem1 = paddle.tensor.array_read(dmem_array, j)
j = paddle.increment(j)
dmem2 = paddle.tensor.array_read(mem_array, j)
dmem2 = paddle.tensor.array_read(dmem_array, j)
j = paddle.increment(j)
dmem3 = paddle.tensor.array_read(mem_array, j)
dmem3 = paddle.tensor.array_read(dmem_array, j)
place = (
base.CUDAPlace(0)
if core.is_compiled_with_cuda()
Expand All @@ -531,25 +533,18 @@ def internal_body(i, x, mem_array):

feed_x = np.ones(10).astype('float32')

if paddle.framework.in_pir_mode():
res = exe.run(
main_program,
feed={"x": feed_x},
fetch_list=[out, dx], # dmem0, dmem1, dmem2, dmem3],
)
else:
res = exe.run(
main_program,
feed={"x": feed_x},
fetch_list=[out, dx], # dmem0, dmem1, dmem2, dmem3],
)
res = exe.run(
main_program,
feed={"x": feed_x},
fetch_list=[out, dx, dmem0, dmem1, dmem2, dmem3],
)

# print("out = ", res[0], [3] * 10)
# print("dx = ", res[1], [0.3] * 10)
# print("dmem0 = ", res[2], [0.0] * 10)
# print("dmem1 = ", res[3], [0.0] * 10)
# print("dmem2 = ", res[4], [0.0] * 10)
# print("dmem3 = ", res[5], [0.0] * 10)
np.testing.assert_allclose(res[0], [3] * 10, rtol=1e-05)
np.testing.assert_allclose(res[1], [0.3] * 10, rtol=1e-05)
np.testing.assert_allclose(res[2], [0.0] * 10, rtol=1e-05)
np.testing.assert_allclose(res[3], [0.0] * 10, rtol=1e-05)
np.testing.assert_allclose(res[4], [0.0] * 10, rtol=1e-05)
np.testing.assert_allclose(res[5], [0.0] * 10, rtol=1e-05)


class TestApiWhileLoopWithSwitchCase(unittest.TestCase):
Expand Down
8 changes: 8 additions & 0 deletions test/legacy_test/test_while_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,19 @@ def simple_net(self):
d0 = paddle.static.data("d0", shape=[10], dtype='float32')
d1 = paddle.static.data("d1", shape=[10], dtype='float32')
d2 = paddle.static.data("d2", shape=[10], dtype='float32')
d0.persistable = True
d0.stop_gradient = False
d1.persistable = True
d2.persistable = True
i = paddle.zeros(shape=[1], dtype='int64')
i.stop_gradient = True
i.persistable = True
init = paddle.zeros(shape=[10], dtype='float32')
mem_array = paddle.tensor.array_write(x=init, i=i)
data_array = paddle.tensor.array_write(x=d0, i=i)
mem_array.stop_gradient = False
data_array.stop_gradient = False
mem_array.persistable = True
i = paddle.increment(i)
paddle.tensor.array_write(d1, i, array=data_array)
i = paddle.increment(i)
Expand Down