
Commit 4a02850

Commit message: tmp
1 parent 182c72b, commit 4a02850

13 files changed, +68 -23 lines changed

paddle/fluid/framework/new_executor/interpreter/execution_config.cc (+2 -2)

@@ -27,8 +27,8 @@
 // FLAGS_force_sync_ops is used to finer control the op-sync in executor.
 // The format is: "micro_batch_id, job_name, op_id, op_name | micro_batch_id,
 // job_name, op_id, op_name | ...". Keep spaces to syncs all name/id. Example:
-// 1. sync the recv_v2 op in the second backward-job of 1F1B scheduling:
-//    FLAGS_force_sync_ops="1, backward, , recv_v2"
+// 1. sync the p_recv op in the second backward-job of 1F1B scheduling:
+//    FLAGS_force_sync_ops="1, backward, , p_recv"
 // 2. sync the full op with op_id=5: FLAGS_force_sync_ops=" , , 5, full"
 // 3. sync all ops in the first default-job: FLAGS_force_sync_ops="0,default,,
 // 4. sync all ops in the forward-job and backward-job: FLAGS_force_sync_ops=" ,
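The comment above documents the FLAGS_force_sync_ops format (micro_batch_id, job_name, op_id, op_name, with blanks acting as wildcards). A minimal sketch of setting the flag for a run, assuming the usual convention that Paddle picks up FLAGS_* environment variables at import time; the value shown is only illustrative:

import os

# Illustrative only: force a sync on the p_recv op in the backward job of
# micro-batch 1, matching example 1 in the comment above. Set the variable
# before importing paddle so the flag is read at import time.
os.environ["FLAGS_force_sync_ops"] = "1, backward, , p_recv"

import paddle  # noqa: E402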

paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc (-2)

@@ -154,8 +154,6 @@ bool IsCommunicationOp(const ::pir::Operation* op) {
       op->attributes().at("op_name").dyn_cast<pir::StrAttribute>().AsString();
 }
 const std::set<std::string> special_comm_op_set = {
-    paddle::dialect::SendV2Op::name(),
-    paddle::dialect::RecvV2Op::name(),
     paddle::dialect::PSendOp::name(),
     paddle::dialect::PRecvOp::name(),
 };

paddle/fluid/framework/new_executor/interpreter/static_build.cc (+1)

@@ -54,6 +54,7 @@ std::set<std::string> OpsCanSkipedFakeAllocInStaticBuild = {
     "fetch_v2",
     "print",
     "send_v2",
+    "p_send",
     "nop"};

 std::set<std::string> StaticBuildBlackList = {

paddle/fluid/framework/new_executor/new_executor_defs.cc (+11 -4)

@@ -328,20 +328,27 @@ void Instruction::UpdateRecordStreamForGcInfo() {
   need_record_stream_for_gc_ = true;

   stream_ = reinterpret_cast<const phi::GPUContext&>(DeviceContext()).stream();
-  // TODO(lizhiyu): Only analyse the 'send_v2' for GPT pp strategy right now.
-  // To support all the operators for communicating in the future.
+  // TODO(lizhiyu): Only analyse the 'p_send' for GPT pp strategy right now.
+  // To support all the operators for communicating in the future.
+  VLOG(0) << "enter new_executor_defs ";
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   auto operator_base_ptr = OpBase();
-  if ((operator_base_ptr->Type() == "p_send") &&
-      (operator_base_ptr->Attr<bool>("use_calc_stream") == false)) {
+  if ((operator_base_ptr->Type() == "p_send")) {
+    VLOG(0) << "enter new_executor_defs func";
     int ring_id = operator_base_ptr->Attr<int>("ring_id");
     if (FLAGS_dynamic_static_unified_comm) {
       const auto& comm_context_manager =
           phi::distributed::CommContextManager::GetInstance();
+      VLOG(0) << "xxx: std::to_string(ring_id): " << std::to_string(ring_id);
+      VLOG(0) << "xxx: distributed::comm_context_manager has: "
+              << comm_context_manager.Has(std::to_string(ring_id));
+
       stream_ = static_cast<phi::distributed::NCCLCommContext*>(
                     comm_context_manager.Get(std::to_string(ring_id)))
                     ->GetStream();
     } else {
+      VLOG(0) << "xxx: std::to_string(ring_id): " << std::to_string(ring_id);
+      VLOG(0) << "xxx: platform::NCCLCommContext has: ";
       stream_ = platform::NCCLCommContext::Instance()
                     .Get(ring_id, DeviceContext().GetPlace())
                     ->stream();

paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/nullary_infer_sym.h (+1)

@@ -33,6 +33,7 @@ OP_DECLARE_INFER_SYMBOLIC_SHAPE(Randperm)
 OP_DECLARE_INFER_SYMBOLIC_SHAPE(ReadFile)
 OP_DECLARE_INFER_SYMBOLIC_SHAPE(Seed)
 OP_DECLARE_INFER_SYMBOLIC_SHAPE(RecvV2)
+OP_DECLARE_INFER_SYMBOLIC_SHAPE(PRecv)
 OP_DECLARE_INFER_SYMBOLIC_SHAPE(TrilIndices)
 OP_DECLARE_INFER_SYMBOLIC_SHAPE(TriuIndices)
 OP_DECLARE_INFER_SYMBOLIC_SHAPE(TruncatedGaussianRandom)

python/paddle/distributed/auto_parallel/static/cost/base_cost.py (+3)

@@ -781,6 +781,7 @@ def comm_count(self):
         from ..reshard import get_var_with_recursion

         if self._comm_count is None:
+            print("---------self._comm_count is None-----------")
            dtype = None
            shape = None
            if self.op is not None:
@@ -802,8 +803,10 @@ def comm_count(self):
                dtype = var.dtype
                shape = var.shape
            elif self.op_desc is not None:
+                print("---------self.op_desc is not None-----------")
                dtype = self.op_desc["inputs"]["X"][0][0]
                shape = self.op_desc["inputs"]["X"][0][1]
+            print(dtype, shape)

            factor = None
            if dtype == paddle.float32 or dtype == paddle.int32:
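comm_count derives a communication volume from the tensor's dtype and shape, scaling the element count by a per-dtype factor. A standalone sketch of that calculation, assuming common byte widths; the helper name and factor table are illustrative and not the actual CommOpCost code:

from functools import reduce

import paddle

# Illustrative bytes-per-element table (assumption, not the real factor table).
_BYTES_PER_ELEMENT = {
    paddle.float32: 4,
    paddle.int32: 4,
    paddle.int64: 8,
    paddle.float16: 2,
}

def estimated_comm_bytes(dtype, shape):
    # Hypothetical helper: element count times the per-dtype byte width.
    numel = reduce(lambda a, b: a * b, shape, 1)
    return numel * _BYTES_PER_ELEMENT.get(dtype, 4)

# Example: a [1024, 4096] float16 activation exchanged between pipeline stages.
print(estimated_comm_bytes(paddle.float16, [1024, 4096]))  # 8388608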

python/paddle/distributed/auto_parallel/static/reshard.py (+6)

@@ -377,9 +377,12 @@ def insert_send_op(block, idx, tensor, src, dst, op_role, sync=True):
            attrs={
                'ring_id': process_group.id,
                'peer': process_group.ranks.index(dst),
+                'op_role': op_role,
                'dynamic_shape': True,
            },
        )
+        print("------reshard------------")
+        print(send_op)
        send_op._set_attr('op_namescope', "/auto_parallel/reshard")

    @staticmethod
@@ -400,10 +403,13 @@ def insert_recv_op(block, idx, tensor, src, dst, op_role, sync=True):
                'ring_id': process_group.id,
                'peer': process_group.ranks.index(src),
                'dtype': tensor.dtype,
+                'op_role': op_role,
                'dynamic_shape': True,
            },
        )
        recv_op._set_attr('op_namescope', "/auto_parallel/reshard")
+        print("------reshard------------")
+        print(recv_op)

    @staticmethod
    def insert_reset_lod_op(block, idx, X, Y, op_role, sync=True):

python/paddle/distributed/fleet/meta_optimizers/sharding/prune.py (+1 -1)

@@ -142,7 +142,7 @@ def should_remove_op(self, op_idx):
        op = self._block.ops[op_idx]

        # NOTE: At present, it is found that the OP without output is
-        # only send_v2 and partial_send op, which will be used in
+        # only p_send and partial_send op, which will be used in
        # all device
        if len(op.desc.output_arg_names()) == 0:
            return False

python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py (+13)

@@ -620,8 +620,11 @@ def _insert_send_recv(cur_id, prev_id):
                attrs={
                    'peer': 1,
                    'ring_id': ring_id,
+                    self._op_role_key: op_role,
                },
            )
+            print("---------hybrid_parallel_inference----------")
+            print(send_op)
            extra_index_info['index'] += 1
            var_shape = list(var.shape)
            if var_shape[0] < 0:
@@ -640,8 +643,11 @@ def _insert_send_recv(cur_id, prev_id):
                    'dtype': var.dtype,
                    'peer': 0,
                    'ring_id': ring_id,
+                    self._op_role_key: op_role,
                },
            )
+            print("---------hybrid_parallel_inference----------")
+            print(recv_op)
            extra_index_info['index'] += 1

        _insert_send_recv(
@@ -712,8 +718,12 @@ def _insert_sendrecv_ops_in_while_block(
                attrs={
                    'peer': 0,
                    'ring_id': ring_id,
+                    self._op_role_key: int(self._op_role.Forward),
                },
            )
+            print("---------hybrid_parallel_inference----------")
+            print(send_op)
+
        else:
            var_shape = list(var.shape)
            print(var_name)
@@ -731,8 +741,11 @@ def _insert_sendrecv_ops_in_while_block(
                    'dtype': var.dtype,
                    'peer': 1,
                    'ring_id': ring_id,
+                    self._op_role_key: int(self._op_role.Forward),
                },
            )
+            print("---------hybrid_parallel_inference----------")
+            print(recv_op)
            index += 1
        block._sync_with_cpp()

python/paddle/distributed/passes/pass_utils.py (+4 -6)

@@ -471,20 +471,18 @@ def _pir_overlap_send_recv(program):
    This function is used to replace the function '_insert_sync_for_fthenb_1f1b'.
    The finally target of this function is as follows:
    1. no need to insert the 'c_sync_calc' and 'c_sync_calc' operators
-    2. 'send_v2' operator uses 'dist_attr.execution_stream' to set stream of its own.
-    3. 'recv_v2' operator uses 'dist_attr.execution_stream' to set stream of its own.
+    2. 'p_send' operator uses 'dist_attr.execution_stream' to set stream of its own.
+    3. 'p_recv' operator uses 'dist_attr.execution_stream' to set stream of its own.
    """
    for block in program.blocks:
        for op in block.ops:
-            if op.name() == "pd_op.send_v2":
+            if op.name() == "pd_op.p_send":
                op.set_bool_attr("dynamic_shape", False)
-                op.set_bool_attr("use_calc_stream", True)
                ring_id = op.attrs()["ring_id"]
                op.set_execution_stream(f"send_stream_{ring_id}")
                op.set_scheduling_priority(0)
-            elif op.name() == "pd_op.recv_v2":
+            elif op.name() == "pd_op.p_recv":
                op.set_bool_attr("dynamic_shape", False)
-                op.set_bool_attr("use_calc_stream", True)
                op.set_execution_stream("recv_stream")
                op.set_scheduling_priority(0)

python/paddle/incubate/optimizer/pipeline.py (+22 -4)

@@ -804,10 +804,13 @@ def _insert_send_recv(cur_id, prev_id):
                    inputs={'x': var},
                    attrs={
                        'peer': 1,
+                        self._op_role_key: op_role,
                        'ring_id': ring_id,
                    },
                )
-                send_op.dist_attr.execution_stream = "default"
+                print("---------pipeline----------")
+                print(send_op)
+                # send_op.dist_attr.execution_stream = "default"
                extra_index_info['index'] += 1
                var_shape = list(var.shape)
                var_shape[0] = (
@@ -823,9 +826,12 @@ def _insert_send_recv(cur_id, prev_id):
                        'dtype': var.dtype,
                        'peer': 0,
                        'ring_id': ring_id,
+                        self._op_role_key: op_role,
                    },
                )
-                recv_op.dist_attr.execution_stream = "default"
+                print("---------pipeline----------")
+                print(recv_op)
+                # recv_op.dist_attr.execution_stream = "default"
                extra_index_info['index'] += 1
            elif self.schedule_mode == '1F1B':  # 1F1B
                var_shape = list(var.shape)
@@ -892,9 +898,12 @@ def _insert_send_recv(cur_id, prev_id):
                    attrs={
                        'ring_id': ring_id,
                        'peer': 1,
+                        self._op_role_key: op_role,
                    },
                )
-                send_op.dist_attr.execution_stream = "default"
+                print("---------pipeline----------")
+                print(send_op)
+                # send_op.dist_attr.execution_stream = "default"
            else:
                block._insert_op_without_sync(
                    index=index + extra_index_info['index'],
@@ -906,6 +915,7 @@ def _insert_send_recv(cur_id, prev_id):
                        'use_calc_stream': False,
                        'num': self.mp_degree,
                        'id': self.mp_rank,
+                        self._op_role_key: op_role,
                    },
                )
                extra_index_info['index'] += 1
@@ -941,9 +951,12 @@ def _insert_send_recv(cur_id, prev_id):
                        'dtype': var.dtype,
                        'peer': 0,
                        'ring_id': ring_id,
+                        self._op_role_key: op_role,
                    },
                )
-                recv_op.dist_attr.execution_stream = "default"
+                print("---------pipeline----------")
+                print(recv_op)
+                # recv_op.dist_attr.execution_stream = "default"
            else:
                block._insert_op_without_sync(
                    index=index + extra_index_info['index'],
@@ -956,6 +969,7 @@ def _insert_send_recv(cur_id, prev_id):
                        'out_shape': var_shape,
                        'num': self.mp_degree,
                        'id': self.mp_rank,
+                        self._op_role_key: op_role,
                    },
                )
                extra_index_info['index'] += 1
@@ -1622,8 +1636,11 @@ def _process_persistable_vars_in_multi_sections(
                    # microbatch
                    'peer': read_dev_index,
                    'ring_id': ring_id,
+                    self._op_role_key: self._op_role.LRSched,
                },
            )
+            print("---------pipeline----------")
+            # print(recv_op)
            read_block._insert_op(
                index=0,
                type='p_recv',
@@ -1634,6 +1651,7 @@ def _process_persistable_vars_in_multi_sections(
                    # microbatch
                    'peer': write_dev_index,
                    'ring_id': ring_id,
+                    self._op_role_key: self._op_role.LRSched,
                },
            )
            read_block._insert_op(

test/auto_parallel/hybrid_strategy/test_semi_auto_parallel_llama_model_vpp.py (+1 -1)

@@ -42,7 +42,7 @@ def setUp(self):
            "use_param_group": ["true"],
            "recompute": ["true"],
            "recompute_granularity": ["full"],
-            "virtual_pp_degree": ["2"],
+            "virtual_pp_degree": ["1"],
        }

    def test_simple_net_hybrid_strategy(self):

tools/enforce/grep_invalid_enforce.sh (+3 -3)

@@ -17,14 +17,14 @@
 # This script is used to grep invalid PADDLE checks by directory or file in the paddle/fluid/,
 # the result show all invalid PADDLE checks in specified directory or file.

-# Usage:
+# Usage:
 #     - bash grep_invalid_enforce.sh [target directory or file] (run in tools directory)
 #     - The default check path is paddle/fluid/operators

 # Result Examples:
 #     1. grep invalid PADDLE checks in directory

-#         - Command: /work/paddle/tools {develop} bash grep_invalid_enforce.sh ../paddle/fluid/imperative
+#         - Command: /work/paddle/tools {develop} bash grep_invalid_enforce.sh ../paddle/fluid/imperative
 #         - Results:
 #             - paddle/fluid/imperative/gradient_accumulator.cc
 #                   PADDLE_ENFORCE_EQ(dst_tensor->numel() == numel, true,
@@ -60,7 +60,7 @@
 #                   "Place cannot be CUDAPlace when use_double_buffer is False");
 #             PADDLE_ENFORCE_NOT_NULL(exceptions_[i]);
 #             PADDLE_ENFORCE_EQ(status, Status::kException);
-#             PADDLE_ENFORCE_EQ(status, Status::kSuccess);
+#             PADDLE_ENFORCE_EQ(status, Status::kSuccess);

 . ./count_enforce_by_file.sh --source-only