Commit 3139a0a

[Distributed] destroy && recreate ncclComm (#72626) (#72648)
1 parent 53eb4cd commit 3139a0a

13 files changed: +275 -9 lines

paddle/fluid/distributed/collective/process_group_nccl.cc (+31 -2)

@@ -20,7 +20,6 @@
 #include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/core/distributed/check/nccl_dynamic_check.h"
 #include "paddle/phi/core/distributed/check/static_check.h"
-#include "paddle/phi/core/distributed/comm_context_manager.h"
 #include "paddle/phi/core/distributed/comm_task_manager.h"
 #include "paddle/phi/core/distributed/nccl_comm_task.h"
 #include "paddle/phi/core/distributed/nccl_tools.h"
@@ -146,7 +145,9 @@ ProcessGroupNCCL::ProcessGroupNCCL(
       place_to_group_key_(),
       pg_timeout_(timeout),
       nccl_comm_init_option_(nccl_comm_init_option),
-      allocation_stream_pairs_() {
+      allocation_stream_pairs_(),
+      place_to_p2p_opts_(),
+      create_count_(0) {
   LOG(INFO) << "ProcessGroupNCCL pg_timeout_ " << pg_timeout_;
   LOG(INFO) << "ProcessGroupNCCL nccl_comm_init_option_ "
             << nccl_comm_init_option_;
@@ -948,12 +949,40 @@ void ProcessGroupNCCL::CreateNCCLEnvCache(const Place& place,
       platform::DeviceEvent(place, platform::GenerateDeviceEventFlag()));
   place_to_calc_ctx_.emplace(place_key, calc_ctx);
   place_to_comm_ctx_.emplace(place_key, std::move(comm_ctx));
+  place_to_p2p_opts_.emplace(place_key, std::move(p2p_opts));

   for (size_t i = 0; i < s_group_call_counter; ++i) {
     NCCL_CHECK(phi::dynload::ncclGroupStart());
   }
 }

+void ProcessGroupNCCL::Shutdown() {
+  for (size_t i = 0; i < s_group_call_counter; ++i) {
+    NCCL_CHECK(phi::dynload::ncclGroupEnd());
+  }
+
+  for (auto key_iter = place_to_group_key_.begin();
+       key_iter != place_to_group_key_.end();
+       ++key_iter) {
+    std::string store_key = key_iter->second;
+    auto nccl_comm_ctx = this->GetCommContext(&store_key);
+    nccl_comm_ctx->DestroyNCCLComm();
+  }
+}
+
+void ProcessGroupNCCL::Restart() {
+  for (auto key_iter = place_to_group_key_.begin();
+       key_iter != place_to_group_key_.end();
+       ++key_iter) {
+    std::string place_key = key_iter->first;
+    std::string store_key = key_iter->second;
+    phi::distributed::P2POption p2p_opts = place_to_p2p_opts_.at(place_key);
+    phi::distributed::CommContextManager::RecreateNCCLComm(
+        store_, store_key, rank_, std::to_string(create_count_), &p2p_opts);
+    create_count_++;
+  }
+}
+
 void ProcessGroupNCCL::SyncCalcStream(const Place& place,
                                       const std::string& place_key) {
   auto& calc_event = place_to_calc_event_.at(place_key);

paddle/fluid/distributed/collective/process_group_nccl.h (+8)

@@ -25,6 +25,7 @@
 #include "paddle/phi/backends/gpu/forwards.h"
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/device_context.h"
+#include "paddle/phi/core/distributed/comm_context_manager.h"
 #include "paddle/phi/core/distributed/nccl_comm_context.h"
 #include "paddle/phi/core/distributed/store/store.h"
 #include "paddle/phi/core/platform/device_event.h"
@@ -190,6 +191,9 @@ class ProcessGroupNCCL final : public ProcessGroupWithStream {
   phi::distributed::NCCLCommContext* GetOrCreateCommContext(
       const Place& place, CommType comm_type = CommType::UNKNOWN);

+  void Shutdown();
+  void Restart();
+
  private:
   std::shared_ptr<ProcessGroupNCCL::NCCLTask> CreateTask(const Place& place,
                                                          int rank,
@@ -287,6 +291,10 @@ class ProcessGroupNCCL final : public ProcessGroupWithStream {
   bool is_coalescing_{false};
   std::vector<std::shared_ptr<phi::DenseTensor>> coalescing_tensors_;
   std::vector<std::string> coalescing_place_keys_;
+
+  std::unordered_map<std::string, phi::distributed::P2POption>
+      place_to_p2p_opts_;
+  int64_t create_count_;
 };

 }  // namespace distributed

paddle/fluid/pybind/distributed_py.cc (+3 -1)

@@ -1247,7 +1247,9 @@ void BindDistributed(py::module *m) {
            py::arg("nccl_comm_init_option") = 0,
            py::call_guard<py::gil_scoped_release>())
       .def_static("group_start", distributed::ProcessGroupNCCL::GroupStart)
-      .def_static("group_end", distributed::ProcessGroupNCCL::GroupEnd);
+      .def_static("group_end", distributed::ProcessGroupNCCL::GroupEnd)
+      .def("shutdown", &distributed::ProcessGroupNCCL::Shutdown)
+      .def("restart", &distributed::ProcessGroupNCCL::Restart);

   py::class_<distributed::AsyncLoad::Task,
              std::shared_ptr<distributed::AsyncLoad::Task>>(*m, "AsyncLoadTask")
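These bindings expose the new C++ Shutdown/Restart methods on the process-group object that Python already holds. A minimal sketch of driving them directly, assuming a two-rank job started with the usual distributed launcher (the shutdown_process_group/restart_process_group wrappers added in collective.py below are the intended public entry points):

import paddle.distributed as dist

dist.init_parallel_env()
group = dist.new_group([0, 1])

# Group.process_group wraps the bound ProcessGroupNCCL: shutdown() destroys
# its cached ncclComm handles, restart() rebuilds them via RecreateNCCLComm.
group.process_group.shutdown()
group.process_group.restart()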

paddle/phi/core/distributed/comm_context_manager.cc (+32)

@@ -126,6 +126,38 @@ void CommContextManager::CreateNCCLCommContext(
   comm_context_manager.SetStore(store);
   comm_context_manager.Emplace(unique_comm_key, std::move(nccl_comm_context));
 }
+
+void CommContextManager::RecreateNCCLComm(const std::shared_ptr<Store>& store,
+                                          const std::string& unique_comm_key,
+                                          int rank,
+                                          const std::string& hash_key,
+                                          const P2POption* p2p_opt) {
+  auto& comm_context_manager = CommContextManager::GetInstance();
+
+  ncclUniqueId nccl_id;
+  if (rank == 0 || (p2p_opt && p2p_opt->is_p2p_op && p2p_opt->p2p_rank == 0)) {
+    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclGetUniqueId(&nccl_id));
+  }
+
+  std::string unique_key = "NCCLCommContext/" + unique_comm_key + hash_key;
+  if (rank == 0 || (p2p_opt && p2p_opt->is_p2p_op && p2p_opt->p2p_rank == 0)) {
+    std::vector<uint8_t> nccl_id_wrapper(
+        reinterpret_cast<uint8_t*>(&nccl_id),
+        reinterpret_cast<uint8_t*>(&nccl_id) + NCCL_UNIQUE_ID_BYTES);
+    store->set(unique_key, nccl_id_wrapper);
+  } else {
+    const auto& nccl_id_wrapper = store->get(unique_key);
+    std::memcpy(&nccl_id, nccl_id_wrapper.data(), nccl_id_wrapper.size());
+  }
+
+  VLOG(3) << "RecreateNCCLComm nccl_id: " << SerializeNCCLUniqueId(nccl_id);
+
+  auto comm_context = static_cast<phi::distributed::NCCLCommContext*>(
+      comm_context_manager.Get(unique_comm_key));
+  comm_context->CreateNCCLComm(nccl_id);
+
+  comm_context_manager.SetStore(store);
+}
 #endif

 #if defined(PADDLE_WITH_GLOO)
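RecreateNCCLComm follows the same store-based rendezvous as CreateNCCLCommContext: the root rank (rank 0, or the p2p root when p2p_opt says so) calls ncclGetUniqueId, publishes the id bytes in the store under a key that includes the per-recreate hash_key, and every other rank reads those bytes back before CreateNCCLComm is called on the existing context. A minimal Python sketch of that exchange, assuming a hypothetical blocking dict-like store shared by all ranks and using random bytes as a stand-in for ncclGetUniqueId:

import os

NCCL_UNIQUE_ID_BYTES = 128  # size of ncclUniqueId

def exchange_unique_id(store, rank, unique_comm_key, hash_key):
    # hash_key changes on every recreate, so a stale id is never reused.
    unique_key = "NCCLCommContext/" + unique_comm_key + hash_key
    if rank == 0:
        nccl_id = os.urandom(NCCL_UNIQUE_ID_BYTES)  # stand-in for ncclGetUniqueId()
        store[unique_key] = nccl_id                 # root publishes the id
        return nccl_id
    return store[unique_key]                        # other ranks read the same bytes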

paddle/phi/core/distributed/comm_context_manager.h (+5)

@@ -83,6 +83,11 @@ class CommContextManager {
       const std::string& hash_key = "",
       const P2POption* opt = nullptr,
       int nccl_comm_init_option = 0);
+  static void RecreateNCCLComm(const std::shared_ptr<Store>& store,
+                               const std::string& unique_comm_key,
+                               int rank,
+                               const std::string& hash_key = "",
+                               const P2POption* opt = nullptr);
 #endif

 #if defined(PADDLE_WITH_GLOO)

paddle/phi/core/distributed/nccl_comm_context.cc (+22 -6)

@@ -33,19 +33,35 @@ NCCLCommContext::NCCLCommContext(int rank,
                                  int size,
                                  ncclUniqueId nccl_id,
                                  int nccl_comm_init_option)
-    : CommContext(rank, size), nccl_version_(0), nccl_comm_(nullptr) {
-  if (nccl_comm_init_option > 0 && phi::dynload::ncclCommInitRank2.IsValid()) {
+    : CommContext(rank, size),
+      nccl_version_(0),
+      nccl_comm_(nullptr),
+      nranks(size_),
+      myrank(rank_),
+      param(nccl_comm_init_option) {
+  this->CreateNCCLComm(nccl_id);
+  NCCL_CHECK(phi::dynload::ncclGetVersion(&nccl_version_));
+}
+
+void NCCLCommContext::CreateNCCLComm(ncclUniqueId nccl_id) {
+  if (param > 0 && phi::dynload::ncclCommInitRank2.IsValid()) {
     LOG(WARNING) << "Creating modified qp with ncclCommInitRank2.";
     NCCL_CHECK(phi::dynload::ncclCommInitRank2(
-        &nccl_comm_, size_, nccl_id, rank_, nccl_comm_init_option));
+        &nccl_comm_, nranks, nccl_id, myrank, param));
   } else {
-    if (nccl_comm_init_option > 0) {
+    if (param > 0) {
       LOG(WARNING) << "ncclCommInitRank2 is not supported.";
     }
     NCCL_CHECK(
-        phi::dynload::ncclCommInitRank(&nccl_comm_, size_, nccl_id, rank_));
+        phi::dynload::ncclCommInitRank(&nccl_comm_, nranks, nccl_id, myrank));
+  }
+}
+
+void NCCLCommContext::DestroyNCCLComm() {
+  if (nccl_comm_ != nullptr) {
+    NCCL_CHECK(phi::dynload::ncclCommDestroy(nccl_comm_));
+    nccl_comm_ = nullptr;
   }
-  NCCL_CHECK(phi::dynload::ncclGetVersion(&nccl_version_));
 }

 int NCCLCommContext::GetNcclVersion() { return nccl_version_; }

paddle/phi/core/distributed/nccl_comm_context.h (+8)

@@ -49,6 +49,10 @@ class NCCLCommContext final : public CommContext {

   ncclComm_t GetNcclComm();

+  void CreateNCCLComm(ncclUniqueId nccl_id);
+
+  void DestroyNCCLComm();
+
   gpuStream_t GetStream();

   gpuEvent_t GetComputeEvent();
@@ -132,6 +136,10 @@ class NCCLCommContext final : public CommContext {

   // used for compute wait comm, comm_stream-->event-->compute_stream
   std::shared_ptr<std::remove_pointer<phi::gpuEvent_t>::type> comm_event_;
+
+  int nranks;
+  int myrank;
+  int param;
 };

 }  // namespace distributed

python/paddle/distributed/__init__.py (+4)

@@ -71,6 +71,8 @@
 from .collective import (
     is_available,
     new_group,
+    restart_process_group,
+    shutdown_process_group,
     split,
 )
 from .communication import (  # noqa: F401
@@ -137,6 +139,8 @@
     "broadcast_object_list",
     "ParallelEnv",
     "new_group",
+    "shutdown_process_group",
+    "restart_process_group",
     "init_parallel_env",
     "gloo_init_parallel_env",
     "gloo_barrier",

python/paddle/distributed/collective.py (+58)

@@ -388,3 +388,61 @@ def _init_parallel_env(backend: _BackendList) -> None:
         core.CommContextManager.create_bkcl_comm_context(
             store, "0", rank, world_size, endpoints_str_hash
         )
+
+
+_shutdown_group_map_by_name = {}
+
+
+def _get_shutdown_group_map_by_name():
+    global _shutdown_group_map_by_name
+    return _shutdown_group_map_by_name
+
+
+def _update_shutdown_group_map_by_name(pg_name, group):
+    global _shutdown_group_map_by_name
+    _shutdown_group_map_by_name[pg_name] = group
+
+
+def _delete_shutdown_group_map_by_name(pg_name):
+    global _shutdown_group_map_by_name
+    del _shutdown_group_map_by_name[pg_name]
+
+
+def _clear_shutdown_group_map_by_name():
+    global _shutdown_group_map_by_name
+    _shutdown_group_map_by_name.clear()
+
+
+def shutdown_process_group(group: Group | None = None) -> None:
+    shutdown_groups = _get_shutdown_group_map_by_name()
+
+    if group is None:
+        global _default_group_name
+        for pg_name, pg in _get_group_map_by_name().items():
+            if (
+                pg.process_group is not None
+                and pg_name not in shutdown_groups
+                and pg_name != _default_group_name
+            ):
+                pg.process_group.shutdown()
+                _update_shutdown_group_map_by_name(pg_name, pg)
+    else:
+        if (
+            group.process_group is not None
+            and group.name not in shutdown_groups
+        ):
+            group.process_group.shutdown()
+            _update_shutdown_group_map_by_name(group.name, group)
+
+
+def restart_process_group(group: Group | None = None) -> None:
+    shutdown_groups = _get_shutdown_group_map_by_name()
+
+    if group is None:
+        for pg in shutdown_groups.values():
+            pg.process_group.restart()
+        _clear_shutdown_group_map_by_name()
+    else:
+        if group.process_group is not None and group.name in shutdown_groups:
+            group.process_group.restart()
+            _delete_shutdown_group_map_by_name(group.name)
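The _shutdown_group_map_by_name bookkeeping makes both calls idempotent per group: a group is shut down at most once, and only previously shut-down groups are restarted. A short usage sketch of the public API, assuming a two-rank job (the new unit test further down exercises the same flow):

import paddle
import paddle.distributed as dist

dist.init_parallel_env()
pg = dist.new_group([0, 1])

x = paddle.to_tensor([float(dist.get_rank())])
dist.all_reduce(x, group=pg)        # uses the original ncclComm

dist.shutdown_process_group(pg)     # destroy this group's communicator
dist.restart_process_group(pg)      # recreate it from a fresh ncclUniqueId

dist.all_reduce(x, group=pg)        # runs again on the recreated ncclComm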

test/collective/fleet/CMakeLists.txt (+14)

@@ -822,3 +822,17 @@ if(LOCAL_ALL_ARCH AND (LINUX OR APPLE))
     "PADDLE_DIST_UT_PORT=21212;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
   )
 endif()
+if((WITH_GPU) AND LOCAL_ALL_PLAT)
+  bash_test_modules(
+    test_shutdown_process_group
+    START_BASH
+    ../../legacy_test/dist_test.sh
+    TIMEOUT
+    "200"
+    LABELS
+    "RUN_TYPE=DIST"
+    ENVS
+    "PADDLE_DIST_UT_PORT=22024;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
+  )
+  set_tests_properties(test_shutdown_process_group PROPERTIES TIMEOUT "200")
+endif()
New file (+61)

@@ -0,0 +1,61 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+import paddle
+import paddle.distributed as dist
+
+
+class TestShutdownProcessGroupAPI:
+    def __init__(self):
+        dist.init_parallel_env()
+        if dist.get_rank() == 0:
+            self.data = paddle.to_tensor([[7, 8, 9], [10, 11, 12]])
+        else:
+            self.data = paddle.to_tensor([[1, 2, 3], [4, 5, 6]])
+
+    def test_shutdown_and_recreate_all(self):
+        pg = paddle.distributed.new_group([0, 1])
+
+        result_base = self.data.clone()
+        dist.all_reduce(result_base, group=pg)
+
+        paddle.distributed.shutdown_process_group()
+        paddle.distributed.restart_process_group()
+
+        result_test = self.data.clone()
+        dist.all_reduce(result_test, group=pg)
+
+        np.testing.assert_array_equal(result_base.numpy(), result_test.numpy())
+
+    def test_shutdown_and_recreate_single(self):
+        pg = paddle.distributed.new_group([0, 1])
+
+        result_base = self.data.clone()
+        dist.all_reduce(result_base, group=pg)
+
+        paddle.distributed.shutdown_process_group(pg)
+        paddle.distributed.restart_process_group(pg)
+
+        result_test = self.data.clone()
+        dist.all_reduce(result_test, group=pg)
+
+        np.testing.assert_array_equal(result_base.numpy(), result_test.numpy())
+
+
+if __name__ == "__main__":
+    test_case = TestShutdownProcessGroupAPI()
+    test_case.test_shutdown_and_recreate_all()
+    test_case.test_shutdown_and_recreate_single()
