
[pir+auto parallel] add reshard op for input when needed #63072

Merged (4 commits) on Mar 29, 2024

Conversation

@zhiqiu (Contributor) commented Mar 28, 2024

PR Category

Auto Parallel

PR Types

New features

Description

[pir+auto parallel] add reshard op for input when needed

This PR adds a pass named apply_partition_pass, which inserts a reshard op for an input when the value's dist_attr does not match the dist_attr of the consuming op's operand.

Pcard-76459

The program before,

{
    (%0) = "pd_op.data" () {dtype:(pd_op.DataType)float32,name:"learning_rate_1",op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},result(0):{dims_maping:[]}},persistable:[true],place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[],stop_gradient:[true]} : () -> pd_dist.tensor<f32, mesh_shape:[2],dims_mappings:[]>
    (%1) = "builtin.parameter" () {is_distributed:[false],is_parameter:[true],need_clip:[true],op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},result(0):{dims_maping:[0,-1]}},parameter_name:"parameter_1",persistable:[true],stop_gradient:[false],trainable:[true]} : () -> pd_dist.tensor<16x8xf32, mesh_shape:[2],dims_mappings:[0,-1]>
    (%2) = "builtin.parameter" () {is_distributed:[false],is_parameter:[true],need_clip:[true],op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},result(0):{dims_maping:[-1,0]}},parameter_name:"parameter_0",persistable:[true],stop_gradient:[false],trainable:[true]} : () -> pd_dist.tensor<16x16xf32, mesh_shape:[2],dims_mappings:[-1,0]>
    (%3) = "pd_op.data" () {dtype:(pd_op.DataType)float32,name:"input0",op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},result(0):{dims_maping:[-1,-1]}},place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[4,16],stop_gradient:[true]} : () -> pd_dist.tensor<4x16xf32, mesh_shape:[2],dims_mappings:[-1,-1]>
    (%4) = "pd_op.data" () {dtype:(pd_op.DataType)float32,name:"label0",op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},result(0):{dims_maping:[-1,-1]}},place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[4,8],stop_gradient:[true]} : () -> pd_dist.tensor<4x8xf32, mesh_shape:[2],dims_mappings:[-1,-1]>
    (%5) = "pd_op.relu" (%3) {op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},operand(0):{dims_maping:[-1,-1]},result(0):{dims_maping:[-1,-1]}},stop_gradient:[false]} : (pd_dist.tensor<4x16xf32, mesh_shape:[2],dims_mappings:[-1,-1]>) -> pd_dist.tensor<4x16xf32, mesh_shape:[2],dims_mappings:[-1,-1]>
    (%6) = "pd_op.matmul" (%5, %2) {op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},operand(0):{dims_maping:[-1,-1]},operand(1):{dims_maping:[-1,0]},result(0):{dims_maping:[-1,0]}},stop_gradient:[false],transpose_x:false,transpose_y:false} : (pd_dist.tensor<4x16xf32, mesh_shape:[2],dims_mappings:[-1,-1]>, pd_dist.tensor<16x16xf32, mesh_shape:[2],dims_mappings:[-1,0]>) -> pd_dist.tensor<4x16xf32, mesh_shape:[2],dims_mappings:[-1,0]>
    (%7) = "pd_op.relu" (%6) {op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},operand(0):{dims_maping:[-1,0]},result(0):{dims_maping:[-1,0]}},stop_gradient:[false]} : (pd_dist.tensor<4x16xf32, mesh_shape:[2],dims_mappings:[-1,0]>) -> pd_dist.tensor<4x16xf32, mesh_shape:[2],dims_mappings:[-1,0]>
    (%8) = "pd_op.matmul" (%7, %1) {op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},operand(0):{dims_maping:[-1,0]},operand(1):{dims_maping:[0,-1]},result(0):{dims_maping:[-1,-1],partial(0,SUM)}},stop_gradient:[false],transpose_x:false,transpose_y:false} : (pd_dist.tensor<4x16xf32, mesh_shape:[2],dims_mappings:[-1,0]>, pd_dist.tensor<16x8xf32, mesh_shape:[2],dims_mappings:[0,-1]>) -> pd_dist.tensor<4x8xf32, mesh_shape:[2],dims_mappings:[-1,-1], partial(0,SUM)>
    (%9) = "pd_op.relu" (%8) {op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},operand(0):{dims_maping:[-1,-1]},result(0):{dims_maping:[-1,-1]}},stop_gradient:[false]} : (pd_dist.tensor<4x8xf32, mesh_shape:[2],dims_mappings:[-1,-1], partial(0,SUM)>) -> pd_dist.tensor<4x8xf32, mesh_shape:[2],dims_mappings:[-1,-1]>
    (%10) = "pd_op.subtract" (%9, %4) {op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},operand(0):{dims_maping:[-1,-1]},operand(1):{dims_maping:[-1,-1]},result(0):{dims_maping:[-1,-1]}},stop_gradient:[false]} : (pd_dist.tensor<4x8xf32, mesh_shape:[2],dims_mappings:[-1,-1]>, pd_dist.tensor<4x8xf32, mesh_shape:[2],dims_mappings:[-1,-1]>) -> pd_dist.tensor<4x8xf32, mesh_shape:[2],dims_mappings:[-1,-1]>
    (%11) = "pd_op.square" (%10) {op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},operand(0):{dims_maping:[-1,-1]},result(0):{dims_maping:[-1,-1]}},stop_gradient:[false]} : (pd_dist.tensor<4x8xf32, mesh_shape:[2],dims_mappings:[-1,-1]>) -> pd_dist.tensor<4x8xf32, mesh_shape:[2],dims_mappings:[-1,-1]>
    (%12) = "pd_op.mean" (%11) {axis:(pd_op.IntArray)[],keepdim:false,op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},operand(0):{dims_maping:[-1,-1]},result(0):{dims_maping:[]}},stop_gradient:[false]} : (pd_dist.tensor<4x8xf32, mesh_shape:[2],dims_mappings:[-1,-1]>) -> pd_dist.tensor<f32, mesh_shape:[2],dims_mappings:[]>
    () = "builtin.shadow_output" (%12) {op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},operand(0):{dims_maping:[]}},output_name:"loss_0"} : (pd_dist.tensor<f32, mesh_shape:[2],dims_mappings:[]>) -> 
    (%13) = "pd_op.full" () {dtype:(pd_op.DataType)float32,op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},result(0):{dims_maping:[-1]}},place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)1} : () -> pd_dist.tensor<1xf32, mesh_shape:[2],dims_mappings:[-1]>
    (%14) = "pd_op.full_like" (%12, %13) {dtype:(pd_op.DataType)float32,op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},operand(0):{dims_maping:[]},result(0):{dims_maping:[]}},place:(pd_op.Place)Place(undefined:0),stop_gradient:[false]} : (pd_dist.tensor<f32, mesh_shape:[2],dims_mappings:[]>, pd_dist.tensor<1xf32, mesh_shape:[2],dims_mappings:[-1]>) -> pd_dist.tensor<f32, mesh_shape:[2],dims_mappings:[]>
    (%15) = "pd_op.mean_grad" (%11, %14) {axis:(pd_op.IntArray)[],keepdim:false,op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},operand(0):{dims_maping:[-1,-1]},operand(1):{dims_maping:[]},result(0):{dims_maping:[-1,-1]}},reduce_all:false,stop_gradient:[false]} : (pd_dist.tensor<4x8xf32, mesh_shape:[2],dims_mappings:[-1,-1]>, pd_dist.tensor<f32, mesh_shape:[2],dims_mappings:[]>) -> pd_dist.tensor<4x8xf32, mesh_shape:[2],dims_mappings:[-1,-1]>
    (%16) = "pd_op.square_grad" (%10, %15) {op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},operand(0):{dims_maping:[-1,-1]},operand(1):{dims_maping:[-1,-1]},result(0):{dims_maping:[-1,-1]}},stop_gradient:[false]} : (pd_dist.tensor<4x8xf32, mesh_shape:[2],dims_mappings:[-1,-1]>, pd_dist.tensor<4x8xf32, mesh_shape:[2],dims_mappings:[-1,-1]>) -> pd_dist.tensor<4x8xf32, mesh_shape:[2],dims_mappings:[-1,-1]>
    (%17, %18) = "pd_op.subtract_grad" (%9, %4, %16) {axis:(Int32)-1,op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},operand(0):{dims_maping:[-1,-1]},operand(1):{dims_maping:[-1,-1]},operand(2):{dims_maping:[-1,-1]},result(0):{dims_maping:[-1,-1]},result(1):{dims_maping:[-1,-1]}},stop_gradient:[false,false]} : (pd_dist.tensor<4x8xf32, mesh_shape:[2],dims_mappings:[-1,-1]>, pd_dist.tensor<4x8xf32, mesh_shape:[2],dims_mappings:[-1,-1]>, pd_dist.tensor<4x8xf32, mesh_shape:[2],dims_mappings:[-1,-1]>) -> pd_dist.tensor<4x8xf32, mesh_shape:[2],dims_mappings:[-1,-1]>, <<NULL TYPE>>
    (%19) = "pd_op.relu_grad" (%9, %17) {op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},operand(0):{dims_maping:[-1,-1]},operand(1):{dims_maping:[-1,-1]},result(0):{dims_maping:[-1,-1]}},stop_gradient:[false]} : (pd_dist.tensor<4x8xf32, mesh_shape:[2],dims_mappings:[-1,-1]>, pd_dist.tensor<4x8xf32, mesh_shape:[2],dims_mappings:[-1,-1]>) -> pd_dist.tensor<4x8xf32, mesh_shape:[2],dims_mappings:[-1,-1]>
    (%20, %21) = "pd_op.matmul_grad" (%7, %1, %19) {op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},operand(0):{dims_maping:[-1,0]},operand(1):{dims_maping:[0,-1]},operand(2):{dims_maping:[-1,-1]},result(0):{dims_maping:[-1,0]},result(1):{dims_maping:[0,-1]}},stop_gradient:[false,false],transpose_x:false,transpose_y:false} : (pd_dist.tensor<4x16xf32, mesh_shape:[2],dims_mappings:[-1,0]>, pd_dist.tensor<16x8xf32, mesh_shape:[2],dims_mappings:[0,-1]>, pd_dist.tensor<4x8xf32, mesh_shape:[2],dims_mappings:[-1,-1]>) -> pd_dist.tensor<4x16xf32, mesh_shape:[2],dims_mappings:[-1,0]>, pd_dist.tensor<16x8xf32, mesh_shape:[2],dims_mappings:[0,-1]>
    (%22) = "pd_op.relu_grad" (%7, %20) {op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},operand(0):{dims_maping:[-1,0]},operand(1):{dims_maping:[-1,0]},result(0):{dims_maping:[-1,0]}},stop_gradient:[false]} : (pd_dist.tensor<4x16xf32, mesh_shape:[2],dims_mappings:[-1,0]>, pd_dist.tensor<4x16xf32, mesh_shape:[2],dims_mappings:[-1,0]>) -> pd_dist.tensor<4x16xf32, mesh_shape:[2],dims_mappings:[-1,0]>
    (%23, %24) = "pd_op.matmul_grad" (%5, %2, %22) {op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},operand(0):{dims_maping:[-1,-1]},operand(1):{dims_maping:[-1,0]},operand(2):{dims_maping:[-1,0]},result(0):{dims_maping:[-1,-1],partial(0,SUM)},result(1):{dims_maping:[-1,0]}},stop_gradient:[false,false],transpose_x:false,transpose_y:false} : (pd_dist.tensor<4x16xf32, mesh_shape:[2],dims_mappings:[-1,-1]>, pd_dist.tensor<16x16xf32, mesh_shape:[2],dims_mappings:[-1,0]>, pd_dist.tensor<4x16xf32, mesh_shape:[2],dims_mappings:[-1,0]>) -> pd_dist.tensor<4x16xf32, mesh_shape:[2],dims_mappings:[-1,-1], partial(0,SUM)>, pd_dist.tensor<16x16xf32, mesh_shape:[2],dims_mappings:[-1,0]>
    (%25) = "pd_op.relu_grad" (%5, %23) {op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},operand(0):{dims_maping:[-1,-1]},operand(1):{dims_maping:[-1,-1]},result(0):{dims_maping:[-1,-1]}},stop_gradient:[false]} : (pd_dist.tensor<4x16xf32, mesh_shape:[2],dims_mappings:[-1,-1]>, pd_dist.tensor<4x16xf32, mesh_shape:[2],dims_mappings:[-1,-1], partial(0,SUM)>) -> <<NULL TYPE>>
    (%26, %27) = "pd_op.sgd_" (%1, %0, %21, <<NULL VALUE>>) {multi_precision:false,op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},operand(0):{dims_maping:[0,-1]},operand(1):{dims_maping:[]},operand(2):{dims_maping:[0,-1]},operand(3):{null},result(0):{dims_maping:[0,-1]},result(1):{null}},stop_gradient:[false,false]} : (pd_dist.tensor<16x8xf32, mesh_shape:[2],dims_mappings:[0,-1]>, pd_dist.tensor<f32, mesh_shape:[2],dims_mappings:[]>, pd_dist.tensor<16x8xf32, mesh_shape:[2],dims_mappings:[0,-1]>, <<NULL TYPE>>) -> pd_dist.tensor<16x8xf32, mesh_shape:[2],dims_mappings:[0,-1]>, <<NULL TYPE>>
    (%28, %29) = "pd_op.sgd_" (%2, %0, %24, <<NULL VALUE>>) {multi_precision:false,op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},operand(0):{dims_maping:[-1,0]},operand(1):{dims_maping:[]},operand(2):{dims_maping:[-1,0]},operand(3):{null},result(0):{dims_maping:[-1,0]},result(1):{null}},stop_gradient:[false,false]} : (pd_dist.tensor<16x16xf32, mesh_shape:[2],dims_mappings:[-1,0]>, pd_dist.tensor<f32, mesh_shape:[2],dims_mappings:[]>, pd_dist.tensor<16x16xf32, mesh_shape:[2],dims_mappings:[-1,0]>, <<NULL TYPE>>) -> pd_dist.tensor<16x16xf32, mesh_shape:[2],dims_mappings:[-1,0]>, <<NULL TYPE>>
}

The program after,

{
    (%0) = "pd_op.data" () {dtype:(pd_op.DataType)float32,name:"learning_rate_1",op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},result(0):{dims_maping:[]}},persistable:[true],place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[],stop_gradient:[true]} : () -> pd_dist.tensor<f32, mesh_shape:[2],dims_mappings:[]>
    (%1) = "builtin.parameter" () {is_distributed:[false],is_parameter:[true],need_clip:[true],op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},result(0):{dims_maping:[0,-1]}},parameter_name:"parameter_1",persistable:[true],stop_gradient:[false],trainable:[true]} : () -> pd_dist.tensor<16x8xf32, mesh_shape:[2],dims_mappings:[0,-1]>
    (%2) = "builtin.parameter" () {is_distributed:[false],is_parameter:[true],need_clip:[true],op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},result(0):{dims_maping:[-1,0]}},parameter_name:"parameter_0",persistable:[true],stop_gradient:[false],trainable:[true]} : () -> pd_dist.tensor<16x16xf32, mesh_shape:[2],dims_mappings:[-1,0]>
    (%3) = "pd_op.data" () {dtype:(pd_op.DataType)float32,name:"input0",op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},result(0):{dims_maping:[-1,-1]}},place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[4,16],stop_gradient:[true]} : () -> pd_dist.tensor<4x16xf32, mesh_shape:[2],dims_mappings:[-1,-1]>
    (%4) = "pd_op.data" () {dtype:(pd_op.DataType)float32,name:"label0",op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},result(0):{dims_maping:[-1,-1]}},place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[4,8],stop_gradient:[true]} : () -> pd_dist.tensor<4x8xf32, mesh_shape:[2],dims_mappings:[-1,-1]>
    (%5) = "pd_op.relu" (%3) {op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},operand(0):{dims_maping:[-1,-1]},result(0):{dims_maping:[-1,-1]}},stop_gradient:[false]} : (pd_dist.tensor<4x16xf32, mesh_shape:[2],dims_mappings:[-1,-1]>) -> pd_dist.tensor<4x16xf32, mesh_shape:[2],dims_mappings:[-1,-1]>
    (%6) = "pd_op.matmul" (%5, %2) {op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},operand(0):{dims_maping:[-1,-1]},operand(1):{dims_maping:[-1,0]},result(0):{dims_maping:[-1,0]}},stop_gradient:[false],transpose_x:false,transpose_y:false} : (pd_dist.tensor<4x16xf32, mesh_shape:[2],dims_mappings:[-1,-1]>, pd_dist.tensor<16x16xf32, mesh_shape:[2],dims_mappings:[-1,0]>) -> pd_dist.tensor<4x16xf32, mesh_shape:[2],dims_mappings:[-1,0]>
    (%7) = "pd_op.relu" (%6) {op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},operand(0):{dims_maping:[-1,0]},result(0):{dims_maping:[-1,0]}},stop_gradient:[false]} : (pd_dist.tensor<4x16xf32, mesh_shape:[2],dims_mappings:[-1,0]>) -> pd_dist.tensor<4x16xf32, mesh_shape:[2],dims_mappings:[-1,0]>
    (%8) = "pd_op.matmul" (%7, %1) {op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},operand(0):{dims_maping:[-1,0]},operand(1):{dims_maping:[0,-1]},result(0):{dims_maping:[-1,-1],partial(0,SUM)}},stop_gradient:[false],transpose_x:false,transpose_y:false} : (pd_dist.tensor<4x16xf32, mesh_shape:[2],dims_mappings:[-1,0]>, pd_dist.tensor<16x8xf32, mesh_shape:[2],dims_mappings:[0,-1]>) -> pd_dist.tensor<4x8xf32, mesh_shape:[2],dims_mappings:[-1,-1], partial(0,SUM)>
    (%9) = "dist_op.reshard" (%8) {op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},operand(0):{dims_maping:[-1,-1],partial(0,SUM)},result(0):{dims_maping:[-1,-1]}}} : (pd_dist.tensor<4x8xf32, mesh_shape:[2],dims_mappings:[-1,-1], partial(0,SUM)>) -> pd_dist.tensor<4x8xf32, mesh_shape:[2],dims_mappings:[-1,-1]>
    (%10) = "pd_op.relu" (%9) {op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},operand(0):{dims_maping:[-1,-1]},result(0):{dims_maping:[-1,-1]}},stop_gradient:[false]} : (pd_dist.tensor<4x8xf32, mesh_shape:[2],dims_mappings:[-1,-1]>) -> pd_dist.tensor<4x8xf32, mesh_shape:[2],dims_mappings:[-1,-1]>
    (%11) = "pd_op.subtract" (%10, %4) {op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},operand(0):{dims_maping:[-1,-1]},operand(1):{dims_maping:[-1,-1]},result(0):{dims_maping:[-1,-1]}},stop_gradient:[false]} : (pd_dist.tensor<4x8xf32, mesh_shape:[2],dims_mappings:[-1,-1]>, pd_dist.tensor<4x8xf32, mesh_shape:[2],dims_mappings:[-1,-1]>) -> pd_dist.tensor<4x8xf32, mesh_shape:[2],dims_mappings:[-1,-1]>
    (%12) = "pd_op.square" (%11) {op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},operand(0):{dims_maping:[-1,-1]},result(0):{dims_maping:[-1,-1]}},stop_gradient:[false]} : (pd_dist.tensor<4x8xf32, mesh_shape:[2],dims_mappings:[-1,-1]>) -> pd_dist.tensor<4x8xf32, mesh_shape:[2],dims_mappings:[-1,-1]>
    (%13) = "pd_op.mean" (%12) {axis:(pd_op.IntArray)[],keepdim:false,op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},operand(0):{dims_maping:[-1,-1]},result(0):{dims_maping:[]}},stop_gradient:[false]} : (pd_dist.tensor<4x8xf32, mesh_shape:[2],dims_mappings:[-1,-1]>) -> pd_dist.tensor<f32, mesh_shape:[2],dims_mappings:[]>
    () = "builtin.shadow_output" (%13) {op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},operand(0):{dims_maping:[]}},output_name:"loss_0"} : (pd_dist.tensor<f32, mesh_shape:[2],dims_mappings:[]>) -> 
    (%14) = "pd_op.full" () {dtype:(pd_op.DataType)float32,op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},result(0):{dims_maping:[-1]}},place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)1} : () -> pd_dist.tensor<1xf32, mesh_shape:[2],dims_mappings:[-1]>
    (%15) = "pd_op.full_like" (%13, %14) {dtype:(pd_op.DataType)float32,op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},operand(0):{dims_maping:[]},result(0):{dims_maping:[]}},place:(pd_op.Place)Place(undefined:0),stop_gradient:[false]} : (pd_dist.tensor<f32, mesh_shape:[2],dims_mappings:[]>, pd_dist.tensor<1xf32, mesh_shape:[2],dims_mappings:[-1]>) -> pd_dist.tensor<f32, mesh_shape:[2],dims_mappings:[]>
    (%16) = "pd_op.mean_grad" (%12, %15) {axis:(pd_op.IntArray)[],keepdim:false,op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},operand(0):{dims_maping:[-1,-1]},operand(1):{dims_maping:[]},result(0):{dims_maping:[-1,-1]}},reduce_all:false,stop_gradient:[false]} : (pd_dist.tensor<4x8xf32, mesh_shape:[2],dims_mappings:[-1,-1]>, pd_dist.tensor<f32, mesh_shape:[2],dims_mappings:[]>) -> pd_dist.tensor<4x8xf32, mesh_shape:[2],dims_mappings:[-1,-1]>
    (%17) = "pd_op.square_grad" (%11, %16) {op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},operand(0):{dims_maping:[-1,-1]},operand(1):{dims_maping:[-1,-1]},result(0):{dims_maping:[-1,-1]}},stop_gradient:[false]} : (pd_dist.tensor<4x8xf32, mesh_shape:[2],dims_mappings:[-1,-1]>, pd_dist.tensor<4x8xf32, mesh_shape:[2],dims_mappings:[-1,-1]>) -> pd_dist.tensor<4x8xf32, mesh_shape:[2],dims_mappings:[-1,-1]>
    (%18, %19) = "pd_op.subtract_grad" (%10, %4, %17) {axis:(Int32)-1,op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},operand(0):{dims_maping:[-1,-1]},operand(1):{dims_maping:[-1,-1]},operand(2):{dims_maping:[-1,-1]},result(0):{dims_maping:[-1,-1]},result(1):{dims_maping:[-1,-1]}},stop_gradient:[false,false]} : (pd_dist.tensor<4x8xf32, mesh_shape:[2],dims_mappings:[-1,-1]>, pd_dist.tensor<4x8xf32, mesh_shape:[2],dims_mappings:[-1,-1]>, pd_dist.tensor<4x8xf32, mesh_shape:[2],dims_mappings:[-1,-1]>) -> pd_dist.tensor<4x8xf32, mesh_shape:[2],dims_mappings:[-1,-1]>, <<NULL TYPE>>
    (%20) = "pd_op.relu_grad" (%10, %18) {op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},operand(0):{dims_maping:[-1,-1]},operand(1):{dims_maping:[-1,-1]},result(0):{dims_maping:[-1,-1]}},stop_gradient:[false]} : (pd_dist.tensor<4x8xf32, mesh_shape:[2],dims_mappings:[-1,-1]>, pd_dist.tensor<4x8xf32, mesh_shape:[2],dims_mappings:[-1,-1]>) -> pd_dist.tensor<4x8xf32, mesh_shape:[2],dims_mappings:[-1,-1]>
    (%21, %22) = "pd_op.matmul_grad" (%7, %1, %20) {op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},operand(0):{dims_maping:[-1,0]},operand(1):{dims_maping:[0,-1]},operand(2):{dims_maping:[-1,-1]},result(0):{dims_maping:[-1,0]},result(1):{dims_maping:[0,-1]}},stop_gradient:[false,false],transpose_x:false,transpose_y:false} : (pd_dist.tensor<4x16xf32, mesh_shape:[2],dims_mappings:[-1,0]>, pd_dist.tensor<16x8xf32, mesh_shape:[2],dims_mappings:[0,-1]>, pd_dist.tensor<4x8xf32, mesh_shape:[2],dims_mappings:[-1,-1]>) -> pd_dist.tensor<4x16xf32, mesh_shape:[2],dims_mappings:[-1,0]>, pd_dist.tensor<16x8xf32, mesh_shape:[2],dims_mappings:[0,-1]>
    (%23) = "pd_op.relu_grad" (%7, %21) {op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},operand(0):{dims_maping:[-1,0]},operand(1):{dims_maping:[-1,0]},result(0):{dims_maping:[-1,0]}},stop_gradient:[false]} : (pd_dist.tensor<4x16xf32, mesh_shape:[2],dims_mappings:[-1,0]>, pd_dist.tensor<4x16xf32, mesh_shape:[2],dims_mappings:[-1,0]>) -> pd_dist.tensor<4x16xf32, mesh_shape:[2],dims_mappings:[-1,0]>
    (%24, %25) = "pd_op.matmul_grad" (%5, %2, %23) {op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},operand(0):{dims_maping:[-1,-1]},operand(1):{dims_maping:[-1,0]},operand(2):{dims_maping:[-1,0]},result(0):{dims_maping:[-1,-1],partial(0,SUM)},result(1):{dims_maping:[-1,0]}},stop_gradient:[false,false],transpose_x:false,transpose_y:false} : (pd_dist.tensor<4x16xf32, mesh_shape:[2],dims_mappings:[-1,-1]>, pd_dist.tensor<16x16xf32, mesh_shape:[2],dims_mappings:[-1,0]>, pd_dist.tensor<4x16xf32, mesh_shape:[2],dims_mappings:[-1,0]>) -> pd_dist.tensor<4x16xf32, mesh_shape:[2],dims_mappings:[-1,-1], partial(0,SUM)>, pd_dist.tensor<16x16xf32, mesh_shape:[2],dims_mappings:[-1,0]>
    (%26) = "dist_op.reshard" (%24) {op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},operand(0):{dims_maping:[-1,-1],partial(0,SUM)},result(0):{dims_maping:[-1,-1]}}} : (pd_dist.tensor<4x16xf32, mesh_shape:[2],dims_mappings:[-1,-1], partial(0,SUM)>) -> pd_dist.tensor<4x16xf32, mesh_shape:[2],dims_mappings:[-1,-1]>
    (%27) = "pd_op.relu_grad" (%5, %26) {op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},operand(0):{dims_maping:[-1,-1]},operand(1):{dims_maping:[-1,-1]},result(0):{dims_maping:[-1,-1]}},stop_gradient:[false]} : (pd_dist.tensor<4x16xf32, mesh_shape:[2],dims_mappings:[-1,-1]>, pd_dist.tensor<4x16xf32, mesh_shape:[2],dims_mappings:[-1,-1]>) -> <<NULL TYPE>>
    (%28, %29) = "pd_op.sgd_" (%1, %0, %22, <<NULL VALUE>>) {multi_precision:false,op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},operand(0):{dims_maping:[0,-1]},operand(1):{dims_maping:[]},operand(2):{dims_maping:[0,-1]},operand(3):{null},result(0):{dims_maping:[0,-1]},result(1):{null}},stop_gradient:[false,false]} : (pd_dist.tensor<16x8xf32, mesh_shape:[2],dims_mappings:[0,-1]>, pd_dist.tensor<f32, mesh_shape:[2],dims_mappings:[]>, pd_dist.tensor<16x8xf32, mesh_shape:[2],dims_mappings:[0,-1]>, <<NULL TYPE>>) -> pd_dist.tensor<16x8xf32, mesh_shape:[2],dims_mappings:[0,-1]>, <<NULL TYPE>>
    (%30, %31) = "pd_op.sgd_" (%2, %0, %25, <<NULL VALUE>>) {multi_precision:false,op_dist_attr:{mesh:{shape:[2],process_ids:[0,1]},operand(0):{dims_maping:[-1,0]},operand(1):{dims_maping:[]},operand(2):{dims_maping:[-1,0]},operand(3):{null},result(0):{dims_maping:[-1,0]},result(1):{null}},stop_gradient:[false,false]} : (pd_dist.tensor<16x16xf32, mesh_shape:[2],dims_mappings:[-1,0]>, pd_dist.tensor<f32, mesh_shape:[2],dims_mappings:[]>, pd_dist.tensor<16x16xf32, mesh_shape:[2],dims_mappings:[-1,0]>, <<NULL TYPE>>) -> pd_dist.tensor<16x16xf32, mesh_shape:[2],dims_mappings:[-1,0]>, <<NULL TYPE>>
}
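The insertion logic described above (compare each operand's expected dist_attr with the dist_attr the incoming value was produced with, and insert a `dist_op.reshard` on mismatch) can be sketched with a toy IR model. This is an illustrative sketch only: the dict-based op representation and the `apply_partition_pass` signature below are hypothetical, not Paddle's actual pir API.

```python
def apply_partition_pass(ops):
    """Toy model of the pass: `ops` is a list of dicts with keys
    'name', 'inputs', 'outputs', 'operand_dist_attrs', and
    'result_dist_attrs'. Returns a new op list with dist_op.reshard
    ops inserted wherever a value's dist_attr differs from what the
    consuming op expects for that operand."""
    value_dist_attr = {}  # value id -> dist_attr it was produced with
    new_ops = []
    n_reshard = 0

    for op in ops:
        new_inputs = []
        for value, want in zip(op["inputs"], op["operand_dist_attrs"]):
            have = value_dist_attr[value]
            if have != want:
                # Upstream produced e.g. partial(0,SUM) but the consumer
                # expects replicated: insert a reshard converting have -> want.
                reshard_out = f"{value}_reshard_{n_reshard}"
                n_reshard += 1
                new_ops.append({
                    "name": "dist_op.reshard",
                    "inputs": [value],
                    "outputs": [reshard_out],
                    "operand_dist_attrs": [have],
                    "result_dist_attrs": [want],
                })
                value_dist_attr[reshard_out] = want
                new_inputs.append(reshard_out)
            else:
                new_inputs.append(value)
        # Record what this op's results are produced as.
        for out, attr in zip(op["outputs"], op["result_dist_attrs"]):
            value_dist_attr[out] = attr
        new_ops.append({**op, "inputs": new_inputs})
    return new_ops
```

Run on a two-op program mirroring `%8`/`%9` above (a matmul producing a `partial(0,SUM)` tensor, consumed by a relu expecting a replicated one), the pass inserts exactly one reshard between them, as in the "program after" dump.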


paddle-bot bot commented Mar 28, 2024

Your PR has been submitted. Thanks for your contribution!
Please wait for the CI results first; see the Paddle CI Manual for details.

@@ -66,6 +66,7 @@ def __init__(self, mesh):
)

def forward(self, x):
x.stop_gradient = False
Contributor

No need to make x require a gradient; the relu_grad in the backward pass will trigger the partial-to-replicated allreduce.

Contributor Author

It is needed; otherwise, relu_grad is not executed.

op.operands(), op.dist_attr().operand_dist_attrs()
):
if (
var.source().is_dist_dense_tensor_type()
Contributor

In scenarios where src_dist_attr and dst_dist_attr have different meshes (e.g. pipeline parallelism), it would be better to insert two reshard ops:
one reshard op whose mesh = src_dist_attr's mesh,
the other whose mesh = dst_dist_attr's mesh.

Then, in the subsequent (pipeline-stage) pruning pass, each stage keeps the reshard op whose mesh it needs and removes the other.
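The suggestion above can be sketched as follows. This is a minimal illustration of the idea, assuming a hypothetical dict-based op representation; it is not Paddle's actual pir API.

```python
def build_cross_mesh_reshards(value, src_dist_attr, dst_dist_attr):
    """Return the reshard op(s) to insert for `value` when converting
    from src_dist_attr to dst_dist_attr. On the same mesh, a single
    reshard suffices; across meshes, emit one op per mesh so a later
    stage-pruning pass can keep exactly the one matching its stage."""
    if src_dist_attr["mesh"] == dst_dist_attr["mesh"]:
        return [{"name": "dist_op.reshard", "input": value,
                 "mesh": src_dist_attr["mesh"],
                 "src": src_dist_attr, "dst": dst_dist_attr}]
    # Different meshes: one reshard on the source stage's mesh and one on
    # the destination stage's mesh. Each pipeline stage later keeps only
    # the op whose mesh matches its own and prunes the other.
    send = {"name": "dist_op.reshard", "input": value,
            "mesh": src_dist_attr["mesh"],
            "src": src_dist_attr, "dst": dst_dist_attr}
    recv = {"name": "dist_op.reshard", "input": value,
            "mesh": dst_dist_attr["mesh"],
            "src": src_dist_attr, "dst": dst_dist_attr}
    return [send, recv]
```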

Contributor Author

It could be refined in the next PR.

@pkuzyc (Contributor) left a comment

LGTM for spmd rule

@jeff41404 (Contributor) left a comment

LGTM for API

@sunzhongkai588 (Contributor) left a comment

LGTM

@zhiqiu zhiqiu merged commit 70cc347 into PaddlePaddle:develop Mar 29, 2024
29 of 30 checks passed