
Commit 56d4e92

【Comm】Fix new comm_ctx and dev_ctx in dy (#68610)
1 parent d711d9e · commit 56d4e92

13 files changed: +376 −363 lines

paddle/fluid/distributed/collective/process_group_nccl.cc

+11
@@ -217,6 +217,17 @@ ncclComm_t ProcessGroupNCCL::NCCLComm(const Place& place) const {
   return iter->second->nccl_comm();
 }
 
+phi::distributed::NCCLCommContext* ProcessGroupNCCL::GetOrCreateCommContext(
+    const Place& place, CommType comm_type) {
+  const auto& key = GetKeyFromPlace(place);
+  std::string store_key;
+  GetStoreKey(key, comm_type, &store_key);
+  if (place_to_comm_ctx_.find(key) == place_to_comm_ctx_.end()) {
+    CreateNCCLEnvCache(place, key, store_key, comm_type);
+  }
+  return GetCommContext(&store_key);
+}
+
 std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllGather(
     phi::DenseTensor* out_tensor,
     const phi::DenseTensor& in_tensor,

paddle/fluid/distributed/collective/process_group_nccl.h

+3
@@ -181,6 +181,9 @@ class ProcessGroupNCCL final : public ProcessGroupWithStream {
 
   const bool GetNCCLCommInitOption() { return nccl_comm_init_option_; }
 
+  phi::distributed::NCCLCommContext* GetOrCreateCommContext(
+      const Place& place, CommType comm_type = CommType::UNKNOWN);
+
  private:
   std::shared_ptr<ProcessGroupNCCL::NCCLTask> CreateTask(const Place& place,
                                                          int rank,
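
Together, these two hunks expose lazy creation of the per-place NCCL comm context as a public API: the first call for a place builds the NCCL env cache, later calls reuse it. A minimal usage sketch, not part of the commit — the warm-up helper and device index are illustrative assumptions:

// Hypothetical helper, not in the commit: assumes `pg` is a live
// ProcessGroupNCCL for this rank and GPU 0 is the local device.
#include "paddle/fluid/distributed/collective/process_group_nccl.h"

void WarmUpCommContext(paddle::distributed::ProcessGroupNCCL* pg) {
  phi::GPUPlace place(0);  // assumed local device
  // First call creates the NCCL env cache for `place`; later calls reuse it.
  auto* comm_ctx = pg->GetOrCreateCommContext(place);  // defaults to CommType::UNKNOWN
  // The comm context carries its own device context, which the dygraph
  // change below swaps in at op-preparation time.
  auto* comm_dev_ctx = comm_ctx->GetDevContext();
  (void)comm_dev_ctx;
}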

paddle/fluid/imperative/CMakeLists.txt

+16
@@ -21,6 +21,22 @@ if(WITH_XPU)
     phi
     common
     var_helper)
+elseif((WITH_GPU OR WITH_ROCM) AND NOT WIN32)
+  cc_library(
+    prepared_operator
+    SRCS prepared_operator.cc
+    DEPS proto_desc
+         operator
+         device_context
+         lod_tensor
+         selected_rows_utils
+         var_type_traits
+         op_kernel_type
+         data_transform
+         phi
+         common
+         var_helper
+         process_group_nccl)
 else()
   cc_library(
     prepared_operator

The new process_group_nccl dependency backs the includes added to prepared_operator.cc below.

paddle/fluid/imperative/prepared_operator.cc

+41
@@ -28,6 +28,10 @@
 #ifdef PADDLE_WITH_DNNL
 #include "paddle/phi/core/platform/onednn_op_list.h"
 #endif
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+#include "paddle/fluid/distributed/collective/process_group.h"
+#include "paddle/fluid/distributed/collective/process_group_nccl.h"
+#endif
 #include "paddle/common/flags.h"
 #include "paddle/fluid/framework/library_type.h"
 #include "paddle/fluid/platform/profiler/supplement_tracing.h"

@@ -296,6 +300,43 @@ PreparedOp PrepareImpl(
           phi::TransToPhiBackend(dev_ctx->GetPlace()))) {
     dev_ctx = pool.Get(phi::TransToPhiPlace(expected_kernel_key.backend()));
   }
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+  if (attrs.find("ring_id") != attrs.end()) {
+    auto ring_id_attr = attrs.at("ring_id");
+    int ring_id = PADDLE_GET(int, ring_id_attr);
+    auto map = distributed::ProcessGroupMapFromGid::getInstance();
+    if (map->has(ring_id)) {
+      distributed::ProcessGroup* pg = map->get(ring_id);
+      auto comm_context =
+          static_cast<paddle::distributed::ProcessGroupNCCL*>(pg)
+              ->GetOrCreateCommContext(place);
+      auto original_stream =
+          static_cast<phi::GPUContext*>(dev_ctx)->cuda_stream();
+      dev_ctx =
+          static_cast<phi::distributed::NCCLCommContext*>(comm_context)
+              ->GetDevContext();
+      dev_ctx->SetCommContext(comm_context);
+      // Note(lizhenxing): In dynamic mode, c_softmax_with_cross_entropy
+      // needs the global calculation stream (original_stream). Using the
+      // comm_ctx's stream leads to synchronization issues, causing an
+      // accuracy diff in test_parallel_dygraph_mp_layers.
+      if (phi::is_gpu_place(place) &&
+          ((attrs.find("use_calc_stream") != attrs.end() &&
+            PADDLE_GET_CONST(bool, attrs.at("use_calc_stream"))) ||
+           phi_kernel_name == "c_softmax_with_cross_entropy")) {
+        static_cast<phi::GPUContext*>(dev_ctx)->SetCUDAStream(
+            original_stream, false);
+        auto& instance =
+            paddle::memory::allocation::AllocatorFacade::Instance();
+        dev_ctx->SetAllocator(
+            instance
+                .GetAllocator(
+                    place, static_cast<phi::GPUContext*>(dev_ctx)->stream())
+                .get());
+      }
+    }
+  }
+#endif
   return PreparedOp(op,
                     empty_ctx,
                     expected_kernel_key,
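
This hunk is the core of the fix: in dygraph mode, when an op carries a ring_id that maps to a registered ProcessGroup, PrepareImpl swaps in the comm context's device context so the kernel picks up the right communicator, then restores the original calculation stream (and the matching allocator) for ops that must stay on it. A minimal sketch of the stream-selection rule, using illustrative names that are not Paddle APIs:

#include <string>

// Mirrors the condition added above: stay on the global calc stream when the
// op requests it via use_calc_stream, or when the kernel is
// c_softmax_with_cross_entropy (which otherwise hits sync/accuracy issues).
bool UseOriginalCalcStream(bool is_gpu_place,
                           bool has_use_calc_stream_attr,
                           bool use_calc_stream_value,
                           const std::string& kernel_name) {
  return is_gpu_place &&
         ((has_use_calc_stream_attr && use_calc_stream_value) ||
          kernel_name == "c_softmax_with_cross_entropy");
}

Resetting the allocator after SetCUDAStream matters because AllocatorFacade hands out stream-aware allocators: the device context should allocate on the stream it will actually run on.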

paddle/fluid/operators/collective/c_concat_op.cc

-10
@@ -117,13 +117,3 @@ REGISTER_OPERATOR(c_concat,
                   ops::CConcatOpGradMaker<paddle::framework::OpDesc>,
                   ops::CConcatOpGradMaker<paddle::imperative::OpBase>,
                   ops::CConcatOpMaker);
-
-PD_REGISTER_STRUCT_KERNEL(c_concat,
-                          CPU,
-                          ALL_LAYOUT,
-                          ops::CConcatOpCPUKernel,
-                          float,
-                          double,
-                          int,
-                          int64_t,
-                          phi::dtype::float16) {}

paddle/fluid/operators/collective/c_concat_op.cu.cc

-181
This file was deleted.
