diff --git a/paddle/fluid/framework/new_executor/pir_interpreter.cc b/paddle/fluid/framework/new_executor/pir_interpreter.cc
index 76a882bf660b85..fb714b1f19039d 100644
--- a/paddle/fluid/framework/new_executor/pir_interpreter.cc
+++ b/paddle/fluid/framework/new_executor/pir_interpreter.cc
@@ -534,11 +534,7 @@ void PirInterpreter::UpdateNcclOpNum() {
   static std::set<std::string> nccl_op_set = {
       "pd_op.c_softmax_with_cross_entropy",
       "pd_op.c_softmax_with_multi_label_cross_entropy",
-      "pd_op.c_allgather",
-      "pd_op.c_allreduce_avg",
-      "pd_op.c_allreduce_min",
       "pd_op.c_allreduce_sum",
-      "pd_op.c_allreduce_prod",
       "pd_op.c_broadcast",
       "pd_op.c_scatter",
       "pd_op.partial_send",
@@ -566,10 +562,7 @@ void PirInterpreter::UpdateNcclOpNum() {
       "pd_op.reduce",
       "pd_op.c_softmax_with_cross_entropy_grad",
       "pd_op.c_softmax_with_multi_label_cross_entropy_grad",
-      "pd_op.c_allgather_grad",
-      "pd_op.c_allreduce_min_grad",
       "pd_op.c_allreduce_sum_grad",
-      "pd_op.c_allreduce_prod_grad",
       "pd_op.c_broadcast_grad",
       "pd_op.c_scatter_grad",
       "pd_op.partial_send_grad",
@@ -598,11 +591,7 @@ void PirInterpreter::UpdateNcclOpNum() {
       "pd_op.reduce_grad",
       "pd_op.c_softmax_with_cross_entropy_",
       "pd_op.c_softmax_with_multi_label_cross_entropy_",
-      "pd_op.c_allgather_",
-      "pd_op.c_allreduce_avg_",
-      "pd_op.c_allreduce_min_",
       "pd_op.c_allreduce_sum_",
-      "pd_op.c_allreduce_prod_",
       "pd_op.c_broadcast_",
       "pd_op.c_scatter_",
       "pd_op.partial_send_",
@@ -630,10 +619,7 @@ void PirInterpreter::UpdateNcclOpNum() {
       "pd_op.reduce_",
       "pd_op.c_softmax_with_cross_entropy_grad_",
       "pd_op.c_softmax_with_multi_label_cross_entropy_grad_",
-      "pd_op.c_allgather_grad_",
-      "pd_op.c_allreduce_min_grad_",
       "pd_op.c_allreduce_sum_grad_",
-      "pd_op.c_allreduce_prod_grad_",
       "pd_op.c_broadcast_grad_",
       "pd_op.c_scatter_grad_",
       "pd_op.partial_send_grad_",
diff --git a/paddle/fluid/inference/tensorrt/convert/c_allreduce_op.cc b/paddle/fluid/inference/tensorrt/convert/c_allreduce_op.cc
index 57951e943dd0f6..3184eee8229b0a 100644
--- a/paddle/fluid/inference/tensorrt/convert/c_allreduce_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/c_allreduce_op.cc
@@ -20,8 +20,7 @@ namespace paddle::inference::tensorrt {
 using ReduceType = paddle::inference::tensorrt::plugin::ReduceType;
 std::map<std::string, ReduceType> op_to_reduce_type = {
     {"c_allreduce_sum", paddle::inference::tensorrt::plugin::kRedSum},
-    {"c_allreduce_min", paddle::inference::tensorrt::plugin::kRedMin},
-    {"c_allreduce_prod", paddle::inference::tensorrt::plugin::kRedProd}};
+};

 class CAllReduceOpConverter : public OpConverter {
  public:
@@ -88,5 +87,3 @@ class CAllReduceOpConverter : public OpConverter {
 }  // namespace paddle::inference::tensorrt

 REGISTER_TRT_OP_CONVERTER(c_allreduce_sum, CAllReduceOpConverter);
-REGISTER_TRT_OP_CONVERTER(c_allreduce_min, CAllReduceOpConverter);
-REGISTER_TRT_OP_CONVERTER(c_allreduce_prod, CAllReduceOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc
index 2895f8a3a448cb..4acc898470f48b 100644
--- a/paddle/fluid/inference/tensorrt/op_teller.cc
+++ b/paddle/fluid/inference/tensorrt/op_teller.cc
@@ -3192,8 +3192,6 @@ struct SimpleOpTypeSetTeller : public Teller {
       "fused_preln_embedding_eltwise_layernorm",
       "fused_bias_dropout_residual_layer_norm",
       "c_allreduce_sum",
-      "c_allreduce_min",
-      "c_allreduce_prod",
       "roll",
       "cast",
       "preln_skip_layernorm",
@@ -3368,8 +3366,6 @@ struct SimpleOpTypeSetTeller : public Teller {
       "preln_skip_layernorm",
       "fused_bias_dropout_residual_layer_norm",
       "c_allreduce_sum",
-      "c_allreduce_min",
-      "c_allreduce_prod",
"roll", "cast", "transformer_input_convert", diff --git a/paddle/phi/backends/xpu/xpu2_op_list.cc b/paddle/phi/backends/xpu/xpu2_op_list.cc index f735aa0ee1f881..6cb330aaa5b66f 100644 --- a/paddle/phi/backends/xpu/xpu2_op_list.cc +++ b/paddle/phi/backends/xpu/xpu2_op_list.cc @@ -117,12 +117,6 @@ XPUOpMap& get_kl2_ops() { {"bitwise_and", XPUKernelSet({phi::DataType::BOOL})}, {"bitwise_or", XPUKernelSet({phi::DataType::BOOL})}, {"broadcast", XPUKernelSet({phi::DataType::FLOAT32})}, - {"c_allgather", - XPUKernelSet({phi::DataType::FLOAT16, - phi::DataType::FLOAT32, - phi::DataType::INT32, - phi::DataType::INT64, - phi::DataType::UINT8})}, {"c_alltoall", XPUKernelSet({phi::DataType::FLOAT16, phi::DataType::FLOAT32, diff --git a/paddle/phi/backends/xpu/xpu3_op_list.cc b/paddle/phi/backends/xpu/xpu3_op_list.cc index 5229ea60849d46..01f2751afe03c9 100644 --- a/paddle/phi/backends/xpu/xpu3_op_list.cc +++ b/paddle/phi/backends/xpu/xpu3_op_list.cc @@ -199,14 +199,6 @@ XPUOpMap& get_kl3_ops() { phi::DataType::FLOAT32, phi::DataType::FLOAT16, phi::DataType::BFLOAT16})}, - {"c_allgather", - XPUKernelSet({phi::DataType::FLOAT16, - phi::DataType::FLOAT32, - phi::DataType::FLOAT64, - phi::DataType::INT32, - phi::DataType::INT64, - phi::DataType::UINT8, - phi::DataType::BOOL})}, {"c_alltoall", XPUKernelSet({phi::DataType::FLOAT16, phi::DataType::FLOAT32, diff --git a/python/paddle/distributed/auto_parallel/static/mapper.py b/python/paddle/distributed/auto_parallel/static/mapper.py index 028af1326ec751..fe5fab03a0d986 100644 --- a/python/paddle/distributed/auto_parallel/static/mapper.py +++ b/python/paddle/distributed/auto_parallel/static/mapper.py @@ -28,8 +28,6 @@ def is_collective_comm_op(op): comm_list = [ "c_allreduce_sum", - "c_allreduce_min", - "c_allreduce_prod", "all_gather", "all_reduce", "broadcast", diff --git a/python/paddle/distributed/auto_parallel/static/operators/common.py b/python/paddle/distributed/auto_parallel/static/operators/common.py index 78f9124c29c103..486c46b413fa33 100644 --- a/python/paddle/distributed/auto_parallel/static/operators/common.py +++ b/python/paddle/distributed/auto_parallel/static/operators/common.py @@ -513,16 +513,6 @@ def sync_and_scale_gradients(dist_ctx, op, groups, allreduce_var_names): allreduce_type = "c_allreduce_sum" need_scale = dist_ctx.gradient_scale - scale_using_allreduce_avg = dist_ctx.gradient_scale_using_allreduce_avg - - # With nccl_version > 2.10.00, we can use c_allreduce_avg to replace c_allreduce_sum and eliminate the scale op. 
-    if (
-        need_scale
-        and scale_using_allreduce_avg
-        and int(paddle.version.nccl()) > 21000
-    ):
-        allreduce_type = "c_allreduce_avg"
-        need_scale = False

     for group in groups:
         group_size = len(group.ranks)
diff --git a/python/paddle/distributed/passes/auto_parallel_data_parallel_optimization.py b/python/paddle/distributed/passes/auto_parallel_data_parallel_optimization.py
index 81b3381448db7e..fbbea34c1adfaa 100644
--- a/python/paddle/distributed/passes/auto_parallel_data_parallel_optimization.py
+++ b/python/paddle/distributed/passes/auto_parallel_data_parallel_optimization.py
@@ -436,7 +436,6 @@ def _update_program(self, grad_groups):
         remove_op_types = [
             'scale',
-            'c_allreduce_avg',
             'c_allreduce_sum',
             'c_wait_compute',
         ]
@@ -492,9 +491,8 @@ def _update_program(self, grad_groups):
             allreduce_op = block.ops[group.allreduce_op_idx]
             assert allreduce_op.type in [
-                'c_allreduce_avg',
                 'c_allreduce_sum',
-            ], f"should found c_allreduce_avg or c_allreduce_sum op but found {allreduce_op}"
+            ], f"should find c_allreduce_sum op but found {allreduce_op}"
             allreduce_op_dist_attr = (
                 self.dist_context.get_op_dist_attr_for_program(allreduce_op)
             )
diff --git a/python/paddle/distributed/passes/auto_parallel_sharding.py b/python/paddle/distributed/passes/auto_parallel_sharding.py
index 1aaaabe9358d0e..cbee05bec2cfe4 100644
--- a/python/paddle/distributed/passes/auto_parallel_sharding.py
+++ b/python/paddle/distributed/passes/auto_parallel_sharding.py
@@ -1035,7 +1035,6 @@ def op_depend_on_group(op, group):
                 ):
                     cur_group.is_in_local_shard = True
                     assert ops[i + 1].type in [
-                        "c_allreduce_avg",
                         "c_allreduce_sum",
                     ], "Sharding should reduce grad first and than allreduce if Hybrid Sharding with Data-Parallel"
                     assert (
@@ -1237,7 +1236,7 @@ def _overlap_grad_comm(
         grad_comm_op_to_stream_idx = {}
         for idx, op in enumerate(ops):
             if is_data_parallel_reduce_op(op):
-                if op.type in ["c_allreduce_avg", "c_allreduce_sum"]:
+                if op.type in ["c_allreduce_sum"]:
                     continue
                 stream_idx = reduce_op_count % self.grad_comm_stream_num
                 grad_comm_op_to_stream_idx[op] = stream_idx
@@ -1291,7 +1290,6 @@ def _overlap_grad_comm(
                 if self.sharding_hybrid_dp and grad_group.is_in_local_shard:
                     next_op = ops[idx + 1]
                     assert next_op.type in [
-                        "c_allreduce_avg",
                         "c_allreduce_sum",
                     ]
                     assert next_op.output("Out")[0] == reduce_varname
diff --git a/python/paddle/distributed/passes/fuse_all_reduce.py b/python/paddle/distributed/passes/fuse_all_reduce.py
index e8f71369b6d090..21138dfbc65721 100755
--- a/python/paddle/distributed/passes/fuse_all_reduce.py
+++ b/python/paddle/distributed/passes/fuse_all_reduce.py
@@ -149,9 +149,6 @@ def filter_all_collective_op_indices(block):
     # NOTE: should add more collective ops
     all_collective_ops = {
         "c_allreduce_sum",
-        "c_allreduce_prod",
-        "c_allreduce_min",
-        "c_allgather",
         "c_broadcast",
         "broadcast",
         "all_gather",
diff --git a/test/ir/inference/test_trt_convert_c_allreduce.py b/test/ir/inference/test_trt_convert_c_allreduce.py
index 6e3bc5ae9a8943..960ebfd1ab4cef 100644
--- a/test/ir/inference/test_trt_convert_c_allreduce.py
+++ b/test/ir/inference/test_trt_convert_c_allreduce.py
@@ -51,21 +51,6 @@ def init_case(self):
         self.precision = "int8"


-# class TestMax(TestDistTRT):
-#
-#     def init_case(self):
-#         self.op_type = "c_allreduce_max"
-#         self.target_value = 2.
-#         self.precision = "fp16"
-#
-#
-# class TestProd(TestDistTRT):
-#
-#     def init_case(self):
-#         self.op_type = "c_allreduce_prod"
-#         self.target_value = 2.
-#         self.precision = "fp16"
-
 if __name__ == '__main__':
     paddle.enable_static()
     unittest.main()