Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

【pir】modify add_n in while use blockarg instead of input value #60668

Merged
merged 12 commits into from
Jan 11, 2024
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,13 @@ void ShareVarData(const Variable* src_var, Variable* dst_var) {
}
}

// Element-wise overload of ShareVarData: shares the underlying data of every
// Variable held in `src_var` into the corresponding slot of `dst_var` by
// delegating each element to the scalar ShareVarData(const Variable*, Variable*).
// NOTE(review): assumes dst_var already holds at least src_var->size()
// elements — confirm at call sites.
void ShareVarData(const VariableRefArray* src_var, VariableRefArray* dst_var) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这个函数是不是统一到 void ShareVarData(const Variable* src_var, Variable* dst_var) 即可?

// const_cast is required because VariableRefArray::at() returns a
// const Variable*, while the scalar overload mutates the destination.
for (size_t i = 0; i < src_var->size(); ++i) {
Variable* copy_var = const_cast<Variable*>(dst_var->at(i));
ShareVarData(src_var->at(i), copy_var);
}
}

void TuplePopInstruction::Run() {
VLOG(6) << "run tuple_pop instruction";
if (tuple_pop_op_.tuple_size() == 0) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,53 @@ TuplePushInstruction::TuplePushInstruction(size_t id,
type_ = OpFuncType::kGpuAsync;
break;
}
} else if (inlet_element_value.type()
.isa<paddle::dialect::AllocatedDenseTensorArrayType>()) {
auto place =
inlet_element_value.type()
.dyn_cast<paddle::dialect::AllocatedDenseTensorArrayType>()
.place();
if (place == phi::GPUPlace()) {
type_ = OpFuncType::kGpuAsync;
break;
}
} else if (inlet_element_value.type().isa<pir::VectorType>()) {
pir::VectorType inlet_element_type =
inlet_element_value.type().dyn_cast<pir::VectorType>();
for (size_t i = 0; i < static_cast<size_t>(inlet_element_type.size());
i++) {
if (inlet_element_type[i]
.isa<paddle::dialect::AllocatedDenseTensorType>()) {
auto place =
inlet_element_type[i]
.dyn_cast<paddle::dialect::AllocatedDenseTensorType>()
.place();
if (place == phi::GPUPlace()) {
type_ = OpFuncType::kGpuAsync;
break;
}
} else if (inlet_element_type[i]
.isa<paddle::dialect::AllocatedDenseTensorArrayType>()) {
auto place =
inlet_element_value.type()
.dyn_cast<paddle::dialect::AllocatedDenseTensorArrayType>()
.place();
if (place == phi::GPUPlace()) {
type_ = OpFuncType::kGpuAsync;
break;
}
} else {
PADDLE_THROW(phi::errors::PreconditionNotMet(
"Only support AllocatedDenseTensorType and "
"AllocatedDenseTensorArrayType in vectortype now, but get: %s",
inlet_element_type[i]));
}
}
} else {
PADDLE_THROW(phi::errors::PreconditionNotMet(
"Only support AllocatedDenseTensorType now"));
"Only support AllocatedDenseTensorType and "
"AllocatedDenseTensorArrayType in vectortype now, but get: %s",
inlet_element_value.type()));
}
}
Comment on lines +63 to 111
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

建议这段逻辑抽象一下:

static std::vector<phi::Place> ParsePlace(pir::Type type) {
  std::vector<phi::Place> rtn;
  if (type.isa<paddle::dialect::AllocatedDenseTensorType>()) {
    rtn.push_back(type.dyn_cast<paddle::dialect::AllocatedDenseTensorType>().place());
  } else if (type.isa<paddle::dialect::AllocatedDenseTensorArrayType>()) {
    rtn.push_back(type.dyn_cast<paddle::dialect::AllocatedDenseTensorArrayType>().place());
  } else if (type.isa<pir::VectorType>()) {
    pir::VectorType inlet_element_type = type.dyn_cast<pir::VectorType>();
    for (size_t i = 0; i < static_cast<size_t>(inlet_element_type.size()); i++) {
      auto out = ParsePlace(inlet_element_type[i]);
      rtn.insert(rtn.end(), out.begin(), out.end());
    }
  } else {
    PADDLE_THROW(phi::errors::PreconditionNotMet(
        "Only support AllocatedDenseTensorType and "
        "AllocatedDenseTensorArrayType in vectortype now, but get: %s",
        type));
  }
  return rtn;
}


auto all_place = ParsePlace(inlet_element_value.type());
for (auto p : all_place) {
  if (p == phi::GPUPlace()) {
    type_ = OpFuncType::kGpuAsync;
    break;
  }
}

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

下个pr 修改

}
Expand Down
5 changes: 2 additions & 3 deletions paddle/fluid/pir/dialect/operator/ir/manual_op_vjp.cc
Original file line number Diff line number Diff line change
Expand Up @@ -185,10 +185,9 @@ std::vector<std::vector<pir::OpResult>> Increment_Op::Vjp(

VLOG(6) << "Vjp prepare call increment_'s vjp inteface";

pir::OpResult tensor_res = paddle::dialect::increment_(inputs_[0][0], -value);

std::vector<std::vector<pir::OpResult>> res{{tensor_res}};
paddle::dialect::increment_(inputs_[0][0], -value);

std::vector<std::vector<pir::OpResult>> res;
return res;
}

Expand Down
12 changes: 6 additions & 6 deletions paddle/fluid/pybind/eager_utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2177,14 +2177,14 @@ std::vector<pir::Value> CastPyArg2VectorOfValue(PyObject* obj,
for (Py_ssize_t i = 0; i < len; i++) {
item = PyList_GetItem(obj, i);
item = CastPyArg2ValuePreHook(item);
if (PyObject_TypeCheck(item, g_ir_opresult_pytype)) {
value_list.emplace_back(::pybind11::handle(item).cast<pir::OpResult>());
if (PyObject_TypeCheck(item, g_ir_value_pytype)) {
value_list.emplace_back(::pybind11::handle(item).cast<pir::Value>());
} else if (item == Py_None) {
continue;
} else {
PADDLE_THROW(platform::errors::InvalidType(
"%s(): argument (position %d) must be "
"vector<OpResult>, but got vector<%s>",
"vector<Value>, but got vector<%s>",
op_type,
arg_pos + 1,
reinterpret_cast<PyTypeObject*>(item->ob_type)
Expand All @@ -2197,14 +2197,14 @@ std::vector<pir::Value> CastPyArg2VectorOfValue(PyObject* obj,
for (Py_ssize_t i = 0; i < len; i++) {
item = PyTuple_GetItem(obj, i);
item = CastPyArg2ValuePreHook(item);
if (PyObject_TypeCheck(item, g_ir_opresult_pytype)) {
value_list.emplace_back(::pybind11::handle(item).cast<pir::OpResult>());
if (PyObject_TypeCheck(item, g_ir_value_pytype)) {
value_list.emplace_back(::pybind11::handle(item).cast<pir::Value>());
} else if (item == Py_None) {
continue;
} else {
PADDLE_THROW(platform::errors::InvalidType(
"%s(): argument (position %d) must be "
"vector<OpResult>, but got vector<%s>",
"vector<Value>, but got vector<%s>",
op_type,
arg_pos + 1,
reinterpret_cast<PyTypeObject*>(item->ob_type)
Expand Down
43 changes: 19 additions & 24 deletions python/paddle/autograd/ir_backward.py
Original file line number Diff line number Diff line change
Expand Up @@ -460,14 +460,15 @@ def append_add_n(value):
# value is input of more than one fwd_op,
# so more than one bwd_op create input_grad,
# need add sum op to accumulate gradient
if value.is_tensorarray():
add_n_value = paddle._pir_ops.add_n_array(
[item[0] for item in state.value_to_valuegrad[value]]
add_n_list = []
for item in state.value_to_valuegrad[value]:
add_n_list.append(
return_map_value(item[0], bwd_value_to_block_argument_map)
)
if value.is_tensorarray():
add_n_value = paddle._pir_ops.add_n_array(add_n_list)
else:
add_n_value = paddle.add_n(
[item[0] for item in state.value_to_valuegrad[value]]
)
add_n_value = paddle.add_n(add_n_list)

add_n_op = add_n_value.get_defining_op()
combine_op = add_n_op.operand_source(0).get_defining_op()
Expand Down Expand Up @@ -581,8 +582,6 @@ def get_grad_semantic_info(op):
"pd_op.if",
"pd_op.while",
"cf.tuple_push",
"pd_op.increment_",
"pd_op.increment",
]:
grad_semantic_info = [
True for _ in range(len(get_real_op_inputs(op)))
Expand All @@ -599,7 +598,8 @@ def make_input_with_input_stopgradient(op):
):
if not grad_semantic:
if (
input.get_defining_op() is not None
op.name() != "cf.tuple_push"
and input.get_defining_op() is not None
and input.get_defining_op().name() == "builtin.combine"
):
tmp_input = []
Expand All @@ -621,7 +621,8 @@ def make_input_with_input_stopgradient(op):
continue

if (
input.get_defining_op() is not None
op.name() != "cf.tuple_push"
and input.get_defining_op() is not None
and input.get_defining_op().name() == "builtin.combine"
):
(
Expand Down Expand Up @@ -652,7 +653,8 @@ def update_input_grad_map(op, input_grads, all_inputs):
continue

if (
input.get_defining_op() is not None
op.name() != "cf.tuple_push"
and input.get_defining_op() is not None
and input.get_defining_op().name() == "builtin.combine"
):
update_input_grad_map(
Expand Down Expand Up @@ -785,7 +787,7 @@ def argument_to_value(while_op):
input_grad_stopgradients,
) = make_input_with_input_stopgradient(op)

if op.name() in ["cf.tuple_push", "pd_op.increment_"]:
if op.name() == "cf.tuple_push":
with dynamic_shape_prim_vjp_guard(op, inputs):
copy_out = paddle.framework.core.call_vjp(
op,
Expand All @@ -797,17 +799,7 @@ def argument_to_value(while_op):

pop_op = bwd_block.ops[-1]
bwd_ops = [pop_op]
tmp_inputs = (
inputs
if op.name() == "pd_op.increment_"
else inputs[1:]
)
tmp_copy_out = (
copy_out
if op.name() == "pd_op.increment_"
else copy_out[1:]
)
for output, copy_output in zip(tmp_inputs, tmp_copy_out):
for output, copy_output in zip(inputs[1:], copy_out[1:]):
control_flow_value_to_copyvalue_map[
output[0]
] = copy_output[0]
Expand All @@ -819,7 +811,7 @@ def argument_to_value(while_op):
# should be delete (prune sub_graph)
if (
len(output_grads) == 0 or all(zero_flag)
) and op.name() != "pd_op.while":
) and op.name() not in ["pd_op.while", "pd_op.increment_"]:
continue

if op.name() == "pd_op.if":
Expand Down Expand Up @@ -902,6 +894,9 @@ def argument_to_value(while_op):
_,
sub_bwd_value_to_block_argument_map,
) = argument_to_value(grad_op)
sub_bwd_value_to_block_argument_map.update(
bwd_value_to_block_argument_map
)
while_grad_block = grad_op.as_while_op().body()
sub_backward_ops = []
append_backward_ops(
Expand Down
5 changes: 3 additions & 2 deletions python/paddle/tensor/math.py
Original file line number Diff line number Diff line change
Expand Up @@ -1980,7 +1980,7 @@ def add_n(inputs, name=None):
[14., 16., 18.]])
"""
if in_dynamic_or_pir_mode():
if isinstance(inputs, (Variable, paddle.pir.OpResult)):
if isinstance(inputs, (Variable, paddle.pir.Value)):
inputs = [inputs]
return _C_ops.add_n(inputs)
else:
Expand Down Expand Up @@ -4726,7 +4726,8 @@ def increment(x, value=1.0, name=None):
x, 'x', ['float32', 'float64', 'int32', 'int64'], 'increment'
)
if in_pir_mode():
return _C_ops.increment_(x, value)
_C_ops.increment_(x, value)
return x
else:
helper = LayerHelper("increment", **locals())
helper.append_op(
Expand Down
43 changes: 19 additions & 24 deletions test/legacy_test/test_while_loop_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,7 @@ def body(i, x):
feed_i = np.ones(1).astype('float32')
feed_x = np.ones(1).astype('float32')
data = np.asarray([100]).astype('float32')
i_grad = np.asarray([0]).astype('float32')
i_grad = np.asarray([20]).astype('float32')
x_grad = np.asarray([0]).astype('float32')

for p, g in grad_list:
Expand All @@ -308,6 +308,7 @@ def body(i, x):
feed={'i': feed_i, 'x': feed_x},
fetch_list=[mean, di, dx],
)

np.testing.assert_allclose(np.asarray(res[0]), data, rtol=1e-05)
np.testing.assert_allclose(np.asarray(res[1]), i_grad, rtol=1e-05)
np.testing.assert_allclose(np.asarray(res[2]), x_grad, rtol=1e-05)
Expand Down Expand Up @@ -372,7 +373,8 @@ def body(i, x):

class TestApiWhileLoop_NestedWithBackwardAndLoDTensorArray(unittest.TestCase):
# TODO(zhangbo): Support while grad exe for pir
# @test_with_pir_api

@test_with_pir_api
def test_nested_net_with_backward_and_lodtensor(self):
def external_cond(i, j, x, mem_array):
return paddle.less_than(i, array_len)
Expand Down Expand Up @@ -456,7 +458,7 @@ def internal_body(j, x, mem_array):

d = []
for i in range(3):
d.append(np.random.random(size=[10]).astype('float32'))
d.append(np.ones(10).astype('float32'))
feed_x = np.ones(10).astype('float32')
data_sum = d[0] + d[1] + d[2] + 3 * feed_x
x_grad = [0.3] * 10
Expand All @@ -478,7 +480,7 @@ def internal_body(j, x, mem_array):
np.testing.assert_allclose(res[0], data_sum, rtol=1e-05)
np.testing.assert_allclose(res[1], x_grad, rtol=1e-05)

def _test_while_with_inplace(self):
def test_while_backward_with_inplace(self):
with paddle.pir_utils.IrGuard():

def internal_cond(i, x, mem_array):
Expand Down Expand Up @@ -519,9 +521,9 @@ def internal_body(i, x, mem_array):
j = paddle.increment(j)
dmem1 = paddle.tensor.array_read(dmem_array, j)
j = paddle.increment(j)
dmem2 = paddle.tensor.array_read(mem_array, j)
dmem2 = paddle.tensor.array_read(dmem_array, j)
j = paddle.increment(j)
dmem3 = paddle.tensor.array_read(mem_array, j)
dmem3 = paddle.tensor.array_read(dmem_array, j)
place = (
base.CUDAPlace(0)
if core.is_compiled_with_cuda()
Expand All @@ -531,25 +533,18 @@ def internal_body(i, x, mem_array):

feed_x = np.ones(10).astype('float32')

if paddle.framework.in_pir_mode():
res = exe.run(
main_program,
feed={"x": feed_x},
fetch_list=[out, dx], # dmem0, dmem1, dmem2, dmem3],
)
else:
res = exe.run(
main_program,
feed={"x": feed_x},
fetch_list=[out, dx], # dmem0, dmem1, dmem2, dmem3],
)
res = exe.run(
main_program,
feed={"x": feed_x},
fetch_list=[out, dx, dmem0, dmem1, dmem2, dmem3],
)

# print("out = ", res[0], [3] * 10)
# print("dx = ", res[1], [0.3] * 10)
# print("dmem0 = ", res[2], [0.0] * 10)
# print("dmem1 = ", res[3], [0.0] * 10)
# print("dmem2 = ", res[4], [0.0] * 10)
# print("dmem3 = ", res[5], [0.0] * 10)
np.testing.assert_allclose(res[0], [3] * 10, rtol=1e-05)
np.testing.assert_allclose(res[1], [0.3] * 10, rtol=1e-05)
np.testing.assert_allclose(res[2], [0.0] * 10, rtol=1e-05)
np.testing.assert_allclose(res[3], [0.0] * 10, rtol=1e-05)
np.testing.assert_allclose(res[4], [0.0] * 10, rtol=1e-05)
np.testing.assert_allclose(res[5], [0.0] * 10, rtol=1e-05)


class TestApiWhileLoopWithSwitchCase(unittest.TestCase):
Expand Down
8 changes: 8 additions & 0 deletions test/legacy_test/test_while_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,19 @@ def simple_net(self):
d0 = paddle.static.data("d0", shape=[10], dtype='float32')
d1 = paddle.static.data("d1", shape=[10], dtype='float32')
d2 = paddle.static.data("d2", shape=[10], dtype='float32')
d0.persistable = True
d0.stop_gradient = False
d1.persistable = True
d2.persistable = True
i = paddle.zeros(shape=[1], dtype='int64')
i.stop_gradient = True
i.persistable = True
init = paddle.zeros(shape=[10], dtype='float32')
mem_array = paddle.tensor.array_write(x=init, i=i)
data_array = paddle.tensor.array_write(x=d0, i=i)
mem_array.stop_gradient = False
data_array.stop_gradient = False
mem_array.persistable = True
i = paddle.increment(i)
paddle.tensor.array_write(d1, i, array=data_array)
i = paddle.increment(i)
Expand Down