
Commit 216e1c4

[fluid_ops] clean c_allreduce_prod (#72057)
1 parent 91d623b commit 216e1c4

11 files changed: +3 -72 lines

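The removed c_allreduce_prod op (and the c_allreduce_min / c_allreduce_avg / c_allgather entries cleaned up alongside it) belongs to the legacy fluid collective set. For orientation only, a minimal sketch of the same product reduction through the public paddle.distributed API, which is assumed here to be the replacement path rather than anything introduced by this commit:

    import paddle
    import paddle.distributed as dist

    # Illustrative sketch only (not part of this commit); run under a
    # multi-process launcher such as python -m paddle.distributed.launch.
    dist.init_parallel_env()

    x = paddle.to_tensor([2.0, 3.0])
    # Product reduction across all ranks, the role the legacy
    # c_allreduce_prod op used to play.
    dist.all_reduce(x, op=dist.ReduceOp.PROD)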

paddle/fluid/framework/new_executor/pir_interpreter.cc

Lines changed: 0 additions & 14 deletions
@@ -534,11 +534,7 @@ void PirInterpreter::UpdateNcclOpNum() {
   static std::set<std::string> nccl_op_set = {
       "pd_op.c_softmax_with_cross_entropy",
       "pd_op.c_softmax_with_multi_label_cross_entropy",
-      "pd_op.c_allgather",
-      "pd_op.c_allreduce_avg",
-      "pd_op.c_allreduce_min",
       "pd_op.c_allreduce_sum",
-      "pd_op.c_allreduce_prod",
       "pd_op.c_broadcast",
       "pd_op.c_scatter",
       "pd_op.partial_send",
@@ -566,10 +562,7 @@ void PirInterpreter::UpdateNcclOpNum() {
       "pd_op.reduce",
       "pd_op.c_softmax_with_cross_entropy_grad",
       "pd_op.c_softmax_with_multi_label_cross_entropy_grad",
-      "pd_op.c_allgather_grad",
-      "pd_op.c_allreduce_min_grad",
       "pd_op.c_allreduce_sum_grad",
-      "pd_op.c_allreduce_prod_grad",
       "pd_op.c_broadcast_grad",
       "pd_op.c_scatter_grad",
       "pd_op.partial_send_grad",
@@ -598,11 +591,7 @@ void PirInterpreter::UpdateNcclOpNum() {
       "pd_op.reduce_grad",
       "pd_op.c_softmax_with_cross_entropy_",
       "pd_op.c_softmax_with_multi_label_cross_entropy_",
-      "pd_op.c_allgather_",
-      "pd_op.c_allreduce_avg_",
-      "pd_op.c_allreduce_min_",
       "pd_op.c_allreduce_sum_",
-      "pd_op.c_allreduce_prod_",
       "pd_op.c_broadcast_",
       "pd_op.c_scatter_",
       "pd_op.partial_send_",
@@ -630,10 +619,7 @@ void PirInterpreter::UpdateNcclOpNum() {
       "pd_op.reduce_",
       "pd_op.c_softmax_with_cross_entropy_grad_",
       "pd_op.c_softmax_with_multi_label_cross_entropy_grad_",
-      "pd_op.c_allgather_grad_",
-      "pd_op.c_allreduce_min_grad_",
       "pd_op.c_allreduce_sum_grad_",
-      "pd_op.c_allreduce_prod_grad_",
       "pd_op.c_broadcast_grad_",
       "pd_op.c_scatter_grad_",
       "pd_op.partial_send_grad_",

paddle/fluid/inference/tensorrt/convert/c_allreduce_op.cc

Lines changed: 1 addition & 4 deletions
@@ -20,8 +20,7 @@ namespace paddle::inference::tensorrt {
 using ReduceType = paddle::inference::tensorrt::plugin::ReduceType;
 std::map<std::string, ReduceType> op_to_reduce_type = {
     {"c_allreduce_sum", paddle::inference::tensorrt::plugin::kRedSum},
-    {"c_allreduce_min", paddle::inference::tensorrt::plugin::kRedMin},
-    {"c_allreduce_prod", paddle::inference::tensorrt::plugin::kRedProd}};
+};
 
 class CAllReduceOpConverter : public OpConverter {
  public:
@@ -88,5 +87,3 @@ class CAllReduceOpConverter : public OpConverter {
 }  // namespace paddle::inference::tensorrt
 
 REGISTER_TRT_OP_CONVERTER(c_allreduce_sum, CAllReduceOpConverter);
-REGISTER_TRT_OP_CONVERTER(c_allreduce_min, CAllReduceOpConverter);
-REGISTER_TRT_OP_CONVERTER(c_allreduce_prod, CAllReduceOpConverter);

paddle/fluid/inference/tensorrt/op_teller.cc

Lines changed: 0 additions & 4 deletions
@@ -3192,8 +3192,6 @@ struct SimpleOpTypeSetTeller : public Teller {
       "fused_preln_embedding_eltwise_layernorm",
       "fused_bias_dropout_residual_layer_norm",
       "c_allreduce_sum",
-      "c_allreduce_min",
-      "c_allreduce_prod",
       "roll",
       "cast",
       "preln_skip_layernorm",
@@ -3368,8 +3366,6 @@ struct SimpleOpTypeSetTeller : public Teller {
       "preln_skip_layernorm",
       "fused_bias_dropout_residual_layer_norm",
       "c_allreduce_sum",
-      "c_allreduce_min",
-      "c_allreduce_prod",
       "roll",
       "cast",
       "transformer_input_convert",

paddle/phi/backends/xpu/xpu2_op_list.cc

Lines changed: 0 additions & 6 deletions
@@ -117,12 +117,6 @@ XPUOpMap& get_kl2_ops() {
       {"bitwise_and", XPUKernelSet({phi::DataType::BOOL})},
       {"bitwise_or", XPUKernelSet({phi::DataType::BOOL})},
       {"broadcast", XPUKernelSet({phi::DataType::FLOAT32})},
-      {"c_allgather",
-       XPUKernelSet({phi::DataType::FLOAT16,
-                     phi::DataType::FLOAT32,
-                     phi::DataType::INT32,
-                     phi::DataType::INT64,
-                     phi::DataType::UINT8})},
       {"c_alltoall",
        XPUKernelSet({phi::DataType::FLOAT16,
                      phi::DataType::FLOAT32,

paddle/phi/backends/xpu/xpu3_op_list.cc

Lines changed: 0 additions & 8 deletions
@@ -199,14 +199,6 @@ XPUOpMap& get_kl3_ops() {
                      phi::DataType::FLOAT32,
                      phi::DataType::FLOAT16,
                      phi::DataType::BFLOAT16})},
-      {"c_allgather",
-       XPUKernelSet({phi::DataType::FLOAT16,
-                     phi::DataType::FLOAT32,
-                     phi::DataType::FLOAT64,
-                     phi::DataType::INT32,
-                     phi::DataType::INT64,
-                     phi::DataType::UINT8,
-                     phi::DataType::BOOL})},
       {"c_alltoall",
        XPUKernelSet({phi::DataType::FLOAT16,
                      phi::DataType::FLOAT32,

python/paddle/distributed/auto_parallel/static/mapper.py

Lines changed: 0 additions & 2 deletions
@@ -28,8 +28,6 @@
 def is_collective_comm_op(op):
     comm_list = [
         "c_allreduce_sum",
-        "c_allreduce_min",
-        "c_allreduce_prod",
         "all_gather",
         "all_reduce",
         "broadcast",

python/paddle/distributed/auto_parallel/static/operators/common.py

Lines changed: 0 additions & 10 deletions
@@ -513,16 +513,6 @@ def sync_and_scale_gradients(dist_ctx, op, groups, allreduce_var_names):
 
     allreduce_type = "c_allreduce_sum"
     need_scale = dist_ctx.gradient_scale
-    scale_using_allreduce_avg = dist_ctx.gradient_scale_using_allreduce_avg
-
-    # With nccl_version > 2.10.00, we can use c_allreduce_avg to replace c_allreduce_sum and eliminate the scale op.
-    if (
-        need_scale
-        and scale_using_allreduce_avg
-        and int(paddle.version.nccl()) > 21000
-    ):
-        allreduce_type = "c_allreduce_avg"
-        need_scale = False
 
     for group in groups:
         group_size = len(group.ranks)
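The deleted branch above was the only place this helper swapped in c_allreduce_avg; with it gone, sync_and_scale_gradients always emits the sum-then-scale pattern. A minimal sketch of that equivalence using the public paddle.distributed API (an illustration under that assumption, not code from this commit):

    import paddle
    import paddle.distributed as dist

    dist.init_parallel_env()
    nranks = dist.get_world_size()

    grad = paddle.rand([4])
    # Sum the gradient across ranks, then scale by 1/nranks; this is
    # numerically the same as averaging the gradient in a single step.
    dist.all_reduce(grad, op=dist.ReduceOp.SUM)
    grad = grad / nranks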

python/paddle/distributed/passes/auto_parallel_data_parallel_optimization.py

Lines changed: 1 addition & 3 deletions
@@ -436,7 +436,6 @@ def _update_program(self, grad_groups):
 
         remove_op_types = [
             'scale',
-            'c_allreduce_avg',
             'c_allreduce_sum',
             'c_wait_compute',
         ]
@@ -492,9 +491,8 @@ def _update_program(self, grad_groups):
 
             allreduce_op = block.ops[group.allreduce_op_idx]
             assert allreduce_op.type in [
-                'c_allreduce_avg',
                 'c_allreduce_sum',
-            ], f"should found c_allreduce_avg or c_allreduce_sum op but found {allreduce_op}"
+            ], f"should found c_allreduce_sum op but found {allreduce_op}"
             allreduce_op_dist_attr = (
                 self.dist_context.get_op_dist_attr_for_program(allreduce_op)
             )

python/paddle/distributed/passes/auto_parallel_sharding.py

Lines changed: 1 addition & 3 deletions
@@ -1035,7 +1035,6 @@ def op_depend_on_group(op, group):
             ):
                 cur_group.is_in_local_shard = True
                 assert ops[i + 1].type in [
-                    "c_allreduce_avg",
                     "c_allreduce_sum",
                 ], "Sharding should reduce grad first and than allreduce if Hybrid Sharding with Data-Parallel"
                 assert (
@@ -1237,7 +1236,7 @@ def _overlap_grad_comm(
         grad_comm_op_to_stream_idx = {}
         for idx, op in enumerate(ops):
             if is_data_parallel_reduce_op(op):
-                if op.type in ["c_allreduce_avg", "c_allreduce_sum"]:
+                if op.type in ["c_allreduce_sum"]:
                     continue
                 stream_idx = reduce_op_count % self.grad_comm_stream_num
                 grad_comm_op_to_stream_idx[op] = stream_idx
@@ -1291,7 +1290,6 @@ def _overlap_grad_comm(
             if self.sharding_hybrid_dp and grad_group.is_in_local_shard:
                 next_op = ops[idx + 1]
                 assert next_op.type in [
-                    "c_allreduce_avg",
                     "c_allreduce_sum",
                 ]
                 assert next_op.output("Out")[0] == reduce_varname

python/paddle/distributed/passes/fuse_all_reduce.py

Lines changed: 0 additions & 3 deletions
@@ -149,9 +149,6 @@ def filter_all_collective_op_indices(block):
     # NOTE: should add more collective ops
     all_collective_ops = {
         "c_allreduce_sum",
-        "c_allreduce_prod",
-        "c_allreduce_min",
-        "c_allgather",
         "c_broadcast",
         "broadcast",
         "all_gather",

test/ir/inference/test_trt_convert_c_allreduce.py

Lines changed: 0 additions & 15 deletions
@@ -51,21 +51,6 @@ def init_case(self):
         self.precision = "int8"
 
 
-# class TestMax(TestDistTRT):
-#
-#     def init_case(self):
-#         self.op_type = "c_allreduce_max"
-#         self.target_value = 2.
-#         self.precision = "fp16"
-#
-#
-# class TestProd(TestDistTRT):
-#
-#     def init_case(self):
-#         self.op_type = "c_allreduce_prod"
-#         self.target_value = 2.
-#         self.precision = "fp16"
-
 if __name__ == '__main__':
     paddle.enable_static()
     unittest.main()
