PaddlePaddle
diff --git a/‎cmake/external/gloo.cmake
+17 b/‎cmake/external/gloo.cmake
+17
diff --git a/‎paddle/fluid/distributed/collective/CMakeLists.txt
+3-1 b/‎paddle/fluid/distributed/collective/CMakeLists.txt
+3-1
diff --git a/‎paddle/fluid/distributed/collective/process_group_custom.cc
+111-67 b/‎paddle/fluid/distributed/collective/process_group_custom.cc
+111-67
diff --git a/‎paddle/fluid/distributed/collective/process_group_custom.h
+41-19 b/‎paddle/fluid/distributed/collective/process_group_custom.h
+41-19
diff --git a/‎paddle/fluid/eager/auto_code_generator/generator/eager_gen.py
+2-2 b/‎paddle/fluid/eager/auto_code_generator/generator/eager_gen.py
+2-2
diff --git a/‎paddle/fluid/framework/ir/graph_pattern_detector.cc
+4-3 b/‎paddle/fluid/framework/ir/graph_pattern_detector.cc
+4-3
@@ -42,6 +42,23 @@ if(WITH_GPU)
   endif()
 endif()
 
+if(CMAKE_COMPILER_IS_GNUCC)
+  execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpfullversion -dumpversion
+                  OUTPUT_VARIABLE GCC_VERSION)
+  string(REGEX MATCHALL "[0-9]+" GCC_VERSION_COMPONENTS ${GCC_VERSION})
+  list(GET GCC_VERSION_COMPONENTS 0 GCC_MAJOR)
+  list(GET GCC_VERSION_COMPONENTS 1 GCC_MINOR)
+  set(GCC_VERSION "${GCC_MAJOR}.${GCC_MINOR}")
+  if(GCC_VERSION GREATER_EQUAL "12.0")
+    # GCC >= 12: run both gloo patches as one command (set() twice overwrites).
+    file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/gloo/device.cc.patch
+         native_dst)
+    file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/gloo/types.h.patch
+         types_header)
+    set(GLOO_PATCH_COMMAND patch -d ${GLOO_SOURCE_DIR}/gloo/transport/tcp <
+        ${native_dst} && patch -d ${GLOO_SOURCE_DIR}/gloo/ < ${types_header})
+  endif()
+endif()
 include_directories(${GLOO_INCLUDE_DIR})
 
 if(WITH_ASCEND OR WITH_ASCEND_CL)
 
@@ -61,5 +61,7 @@ if(WITH_CUSTOM_DEVICE)
          place
          enforce
          collective_helper
-         device_context)
+         device_context
+         comm_static_check
+         dense_tensor)
 endif()
@@ -16,11 +16,13 @@
 
 #include "paddle/fluid/distributed/collective/common.h"
 #include "paddle/fluid/distributed/collective/custom_ccl_tools.h"
+#include "paddle/fluid/distributed/collective/utils.h"
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/phi/api/lib/utils/allocator.h"
 #include "paddle/phi/common/place.h"
+#include "paddle/phi/core/distributed/check/static_check.h"
 
 DECLARE_bool(xccl_blocking_wait);
 
@@ -234,10 +236,21 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::AllGather(
     const phi::DenseTensor& in_tensor,
     int64_t offset,
     int64_t numel,
-    bool sync_op  // for compatibility, no use now
-) {
-  std::vector<phi::DenseTensor> in_wrapper{in_tensor};
+    bool sync_op,  // for compatibility, no use now
+    bool use_calc_stream) {
+  // numel > 0 indicates the tensor need to be sliced
+  const phi::DenseTensor& in_tensor_maybe_partial =
+      numel > 0
+          ? paddle::distributed::GetPartialTensor(in_tensor, offset, numel)
+          : in_tensor;
+  phi::distributed::CommStaticCheck::GatherLikeShape(*out_tensor,
+                                                     in_tensor_maybe_partial,
+                                                     /*dst_rank*/ rank_,
+                                                     /*cur_rank*/ rank_,
+                                                     size_);
+  std::vector<phi::DenseTensor> in_wrapper{in_tensor_maybe_partial};
   std::vector<phi::DenseTensor> out_wrapper{*out_tensor};
+
   return Collective(
       in_wrapper,
       out_wrapper,
@@ -247,80 +260,23 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::AllGather(
           const phi::stream::Stream& stream) {
         return phi::DeviceManager::CCLAllGather(
             device_type_,
-            XcclGetPointerByOffset(input.data(), offset, input.dtype()),
+            input.data(),
             output.data(),
-            numel,
+            input.numel(),
             phi::ccl::ToCCLDataType(input.dtype()),
             comm,
             stream);
       },
       CommType::ALLGATHER);
 }
 
-std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::AllReduce(
-    phi::DenseTensor* out_tensor,
-    const phi::DenseTensor& in_tensor,
-    const AllreduceOptions& opts,
-    bool sync_op  // for compatibility, no use now
-) {
-  std::vector<phi::DenseTensor> in_wrapper{in_tensor};
-  std::vector<phi::DenseTensor> out_wrapper{*out_tensor};
-  return AllReduce(in_wrapper, out_wrapper, opts);
-}
-
-std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::Broadcast(
+// Compatibility overload without `use_calc_stream`: forwards to the
+// six-argument AllGather with use_calc_stream = false.
+std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::AllGather(
     phi::DenseTensor* out_tensor,
     const phi::DenseTensor& in_tensor,
-    const BroadcastOptions& opts,
-    bool sync_op  // for compatibility, no use now
-) {
-  std::vector<phi::DenseTensor> in_wrapper{in_tensor};
-  std::vector<phi::DenseTensor> out_wrapper{*out_tensor};
-  return Broadcast(in_wrapper, out_wrapper, opts);
-}
-
-std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::Barrier(
-    const BarrierOptions& opts) {
-  // Only support single card single process
-  PADDLE_ENFORCE_GE(opts.device_id,
-                    0,
-                    platform::errors::PreconditionNotMet(
-                        "The barrier device id must greater or equal than 0."));
-  platform::CustomPlace place(device_type_, opts.device_id);
-  auto allocator = std::unique_ptr<phi::Allocator>(
-      new paddle::experimental::DefaultAllocator(place));
-  phi::DenseTensorMeta meta(phi::DataType::FLOAT32, phi::DDim{1});
-  phi::DenseTensor barrier_tensor{allocator.get(), meta};
-
-  auto task = ProcessGroupCustom::AllReduce(&barrier_tensor,
-                                            barrier_tensor,
-                                            {},
-                                            /*sync_op*/ true);
-  auto xccl_task = dynamic_cast<ProcessGroupCustom::CustomTask*>(task.get());
-  xccl_task->barrierTensors_ = {barrier_tensor};
-  return task;
-}
-
-phi::DeviceContext* ProcessGroupCustom::GetDeviceContext(
-    const Place& place) const {
-  const std::string key = GetKeyFromPlace(place);
-  const auto& iter = places_to_ctx_.find(key);
-  PADDLE_ENFORCE_NE(
-      iter,
-      places_to_ctx_.end(),
-      platform::errors::NotFound(
-          "Cannot find the device context in this process group."));
-  return iter->second[0].get();
-}
-
-phi::ccl::CCLComm ProcessGroupCustom::CustomCCLComm(const Place& place) const {
-  std::vector<Place> places = {place};
-  const auto& iter = places_to_customcomm_.find(GetKeyFromPlaces(places));
-  PADDLE_ENFORCE_NE(iter,
-                    places_to_customcomm_.end(),
-                    platform::errors::InvalidArgument(
-                        "Cannot find nccl comm in process group."));
-  return iter->second[0]->GetCustomCCLComm();
+    int64_t offset,
+    int64_t numel,
+    bool sync_op) {
+  // The trailing `false` is required: a five-argument call here would resolve
+  // to this very overload and recurse without end.
+  return AllGather(out_tensor, in_tensor, offset, numel, sync_op, false);
 }
 
 // TODO(sunyilun): methods below will be removed later
@@ -356,6 +312,28 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::AllGather(
       CommType::ALLGATHER);
 }
 
+// Single-tensor AllReduce: wraps each tensor in a one-element vector and
+// forwards to the vector overload. `sync_op` and `use_calc_stream` are accepted
+// but not read in this body.
+std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::AllReduce(
+    phi::DenseTensor* out_tensor,
+    const phi::DenseTensor& in_tensor,
+    const AllreduceOptions& opts,
+    bool sync_op,  // for compatibility, no use now
+    bool use_calc_stream) {
+  // The vector overload takes non-const lvalue references, so both wrappers
+  // have to be named locals.
+  std::vector<phi::DenseTensor> outputs(1, *out_tensor);
+  std::vector<phi::DenseTensor> inputs(1, in_tensor);
+  return AllReduce(inputs, outputs, opts);
+}
+
+// AllReduce overload without `use_calc_stream`: packs the single input and
+// output tensors into one-element vectors and calls the vector overload.
+std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::AllReduce(
+    phi::DenseTensor* out_tensor,
+    const phi::DenseTensor& in_tensor,
+    const AllreduceOptions& opts,
+    bool sync_op  // for compatibility, no use now
+) {
+  std::vector<phi::DenseTensor> src_tensors(1, in_tensor);
+  std::vector<phi::DenseTensor> dst_tensors(1, *out_tensor);
+  auto task = AllReduce(src_tensors, dst_tensors, opts);
+  return task;
+}
+
 std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::AllReduce(
     std::vector<phi::DenseTensor>& in_tensors,   // NOLINT
     std::vector<phi::DenseTensor>& out_tensors,  // NOLINT
@@ -390,6 +368,72 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::AllReduce(
       CommType::ALLREDUCE);
 }
 
+// Single-tensor Broadcast: wraps each tensor in a one-element vector and
+// forwards to the vector overload. `sync_op` and `use_calc_stream` are accepted
+// but not read in this body.
+std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::Broadcast(
+    phi::DenseTensor* out_tensor,
+    const phi::DenseTensor& in_tensor,
+    const BroadcastOptions& opts,
+    bool sync_op,  // for compatibility, no use now
+    bool use_calc_stream) {
+  // The vector overload takes non-const lvalue references, so both wrappers
+  // have to be named locals.
+  std::vector<phi::DenseTensor> outputs(1, *out_tensor);
+  std::vector<phi::DenseTensor> inputs(1, in_tensor);
+  return Broadcast(inputs, outputs, opts);
+}
+
+// Broadcast overload without `use_calc_stream`: packs the single input and
+// output tensors into one-element vectors and calls the vector overload.
+// `sync_op` is not read in this body.
+std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::Broadcast(
+    phi::DenseTensor* out_tensor,
+    const phi::DenseTensor& in_tensor,
+    const BroadcastOptions& opts,
+    bool sync_op) {
+  std::vector<phi::DenseTensor> src_tensors(1, in_tensor);
+  std::vector<phi::DenseTensor> dst_tensors(1, *out_tensor);
+  auto task = Broadcast(src_tensors, dst_tensors, opts);
+  return task;
+}
+
+// Barrier implemented as an AllReduce over a one-element FLOAT32 tensor
+// allocated for the custom device selected by `opts.device_id`.
+// Requires opts.device_id >= 0 (PreconditionNotMet otherwise).
+std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::Barrier(
+    const BarrierOptions& opts) {
+  // Only support single card single process
+  PADDLE_ENFORCE_GE(opts.device_id,
+                    0,
+                    platform::errors::PreconditionNotMet(
+                        "The barrier device id must greater or equal than 0."));
+
+  // Build the scratch tensor that is reduced in place.
+  platform::CustomPlace device_place(device_type_, opts.device_id);
+  std::unique_ptr<phi::Allocator> tensor_allocator(
+      new paddle::experimental::DefaultAllocator(device_place));
+  phi::DenseTensorMeta tensor_meta(phi::DataType::FLOAT32, phi::DDim{1});
+  phi::DenseTensor barrier_tensor{tensor_allocator.get(), tensor_meta};
+
+  // Default AllreduceOptions, sync_op = true, use_calc_stream = false.
+  auto barrier_task = ProcessGroupCustom::AllReduce(
+      &barrier_tensor, barrier_tensor, {}, /*sync_op*/ true, false);
+
+  // Store the tensor on the task (presumably so it stays alive until the task
+  // completes -- confirm against CustomTask).
+  auto* custom_task =
+      dynamic_cast<ProcessGroupCustom::CustomTask*>(barrier_task.get());
+  custom_task->barrierTensors_ = {barrier_tensor};
+  return barrier_task;
+}
+
+// Looks up the device contexts stored under the key GetKeyFromPlace(place)
+// and returns the first one. Enforces that an entry exists (NotFound error
+// otherwise).
+phi::DeviceContext* ProcessGroupCustom::GetDeviceContext(
+    const Place& place) const {
+  const auto ctx_entry = places_to_ctx_.find(GetKeyFromPlace(place));
+  PADDLE_ENFORCE_NE(
+      ctx_entry,
+      places_to_ctx_.end(),
+      platform::errors::NotFound(
+          "Cannot find the device context in this process group."));
+  const auto& contexts = ctx_entry->second;
+  return contexts[0].get();
+}
+
+// Returns the CCL communicator of the first entry stored under the key
+// GetKeyFromPlaces({place}). Enforces that an entry exists (InvalidArgument
+// error otherwise).
+phi::ccl::CCLComm ProcessGroupCustom::CustomCCLComm(const Place& place) const {
+  std::vector<Place> single_place(1, place);
+  const auto comm_entry =
+      places_to_customcomm_.find(GetKeyFromPlaces(single_place));
+  // NOTE(review): the message says "nccl" although this is the custom-device
+  // process group; text kept verbatim here.
+  PADDLE_ENFORCE_NE(comm_entry,
+                    places_to_customcomm_.end(),
+                    platform::errors::InvalidArgument(
+                        "Cannot find nccl comm in process group."));
+  const auto& comms = comm_entry->second;
+  return comms[0]->GetCustomCCLComm();
+}
+
 std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::Broadcast(
     std::vector<phi::DenseTensor>& in_tensors,   // NOLINT
     std::vector<phi::DenseTensor>& out_tensors,  // NOLINT
 
@@ -80,25 +80,6 @@ class ProcessGroupCustom : public ProcessGroupWithoutStream {
 
   std::string GetBackendName() const override { return "XCCL_" + device_type_; }
 
-  std::shared_ptr<ProcessGroup::Task> AllGather(
-      phi::DenseTensor* out_tensor,
-      const phi::DenseTensor& in_tensor,
-      int64_t offset,
-      int64_t numel,
-      bool sync_op) override;
-
-  std::shared_ptr<ProcessGroup::Task> AllReduce(
-      phi::DenseTensor* out_tensor,
-      const phi::DenseTensor& in_tensor,
-      const AllreduceOptions& opts,
-      bool sync_op) override;
-
-  std::shared_ptr<ProcessGroup::Task> Broadcast(
-      phi::DenseTensor* out_tensor,
-      const phi::DenseTensor& in_tensor,
-      const BroadcastOptions& opts,
-      bool sync_op) override;
-
   std::shared_ptr<ProcessGroup::Task> Barrier(
       const BarrierOptions& = BarrierOptions()) override;
 
@@ -111,16 +92,57 @@ class ProcessGroupCustom : public ProcessGroupWithoutStream {
       std::vector<phi::DenseTensor>& in_tensors,
       std::vector<phi::DenseTensor>& out_tensors) override;
 
+  // Single-tensor AllGather of `in_tensor` into `*out_tensor`.
+  // Per the definition in process_group_custom.cc: when numel > 0 the input is
+  // first narrowed with GetPartialTensor(in_tensor, offset, numel); otherwise
+  // the whole tensor is used. Shapes are validated with
+  // CommStaticCheck::GatherLikeShape before the collective is issued.
+  // `sync_op` is kept for compatibility and is not used.
+  std::shared_ptr<ProcessGroup::Task> AllGather(
+      phi::DenseTensor* out_tensor,
+      const phi::DenseTensor& in_tensor,
+      int64_t offset,
+      int64_t numel,
+      bool sync_op,
+      bool use_calc_stream) override;
+
+  // Overload of the single-tensor AllGather without the `use_calc_stream`
+  // argument.
+  std::shared_ptr<ProcessGroup::Task> AllGather(
+      phi::DenseTensor* out_tensor,
+      const phi::DenseTensor& in_tensor,
+      int64_t offset,
+      int64_t numel,
+      bool sync_op) override;
+
   std::shared_ptr<ProcessGroup::Task> AllReduce(
       std::vector<phi::DenseTensor>& in_tensors,
       std::vector<phi::DenseTensor>& out_tensors,
       const AllreduceOptions& = AllreduceOptions()) override;
 
+  // Single-tensor AllReduce. The definition wraps `in_tensor` and
+  // `*out_tensor` in one-element vectors and calls the vector overload with
+  // `opts`; `sync_op` and `use_calc_stream` are not read there.
+  std::shared_ptr<ProcessGroup::Task> AllReduce(
+      phi::DenseTensor* out_tensor,
+      const phi::DenseTensor& in_tensor,
+      const AllreduceOptions& opts,
+      bool sync_op,
+      bool use_calc_stream) override;
+
+  // Single-tensor AllReduce without the `use_calc_stream` argument. The
+  // definition wraps both tensors in one-element vectors and calls the vector
+  // overload; `sync_op` is kept for compatibility and is not used.
+  std::shared_ptr<ProcessGroup::Task> AllReduce(
+      phi::DenseTensor* out_tensor,
+      const phi::DenseTensor& in_tensor,
+      const AllreduceOptions& opts,
+      bool sync_op) override;
+
   std::shared_ptr<ProcessGroup::Task> Broadcast(
       std::vector<phi::DenseTensor>& in_tensors,
       std::vector<phi::DenseTensor>& out_tensors,
       const BroadcastOptions& = BroadcastOptions()) override;
 
+  // Single-tensor Broadcast. The definition wraps `in_tensor` and
+  // `*out_tensor` in one-element vectors and calls the vector overload with
+  // `opts`; `sync_op` and `use_calc_stream` are not read there.
+  std::shared_ptr<ProcessGroup::Task> Broadcast(
+      phi::DenseTensor* out_tensor,
+      const phi::DenseTensor& in_tensor,
+      const BroadcastOptions& opts,
+      bool sync_op,
+      bool use_calc_stream) override;
+
+  // Single-tensor Broadcast without the `use_calc_stream` argument. The
+  // definition wraps both tensors in one-element vectors and calls the vector
+  // overload; `sync_op` is not read there.
+  std::shared_ptr<ProcessGroup::Task> Broadcast(
+      phi::DenseTensor* out_tensor,
+      const phi::DenseTensor& in_tensor,
+      const BroadcastOptions& opts,
+      bool sync_op) override;
+
  protected:
   virtual std::shared_ptr<ProcessGroupCustom::CustomTask> CreateTask(
       std::vector<Place> places,
 
@@ -1840,7 +1840,7 @@ def GenerateHigherOrderNodeCreationCode(self):
 
         if is_composite_grad_api and next_grad_node_creation_str != '':
             next_grad_node_creation_str = f"""
- if (!paddle::prim::PrimCommonUtils::IsBwdPrimEnabled()) {{
+ if (!paddle::prim::PrimCommonUtils::IsEagerPrimEnabled()) {{
     {next_grad_node_creation_str}
  }}
   """
@@ -2260,7 +2260,7 @@ def GenerateNodeDefinition(
         # TODO(Ruting):using composite only when we don't have backward kernel in the future.
         elif is_composite_grad_api:
             grad_function_call_str = f"""
-  if (paddle::prim::PrimCommonUtils::IsBwdPrimEnabled()) {{
+  if (paddle::prim::PrimCommonUtils::IsEagerPrimEnabled()) {{
   {indent}{composite_grad_api_namespace}{composite_grad_api_name}{composite_template_name}({composite_grad_api_args_str});
   VLOG(4) << "Composite api {composite_grad_api_name} is called ";
   }}else{{
 
@@ -2346,16 +2346,16 @@ PDNode *patterns::ScaleQuant::operator()() {
   return quant_op;
 }
 
-PDNode *patterns::QuantConv::operator()() {
+PDNode *patterns::QuantConv::operator()(const std::string &conv_type) {
   auto quant_in = pattern->NewNode(quant_in_repr())
                       ->AsInput()
                       ->assert_is_op_input("quantize", "Input");
   auto quant_op = pattern->NewNode(quant_op_repr())->assert_is_op("quantize");
 
   auto conv_in = pattern->NewNode(conv_in_repr())
                      ->AsInput()
-                     ->assert_is_op_input("conv2d", "Input");
-  auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d");
+                     ->assert_is_op_input(conv_type, "Input");
+  auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op(conv_type);
   conv_op->assert_more([&](Node *node) {
     return node->Op()->GetAttrIfExists<std::string>("mkldnn_data_type") ==
            "bfloat16";
@@ -2845,6 +2845,7 @@ PDNode *patterns::Bfloat16Placement::operator()(
                                        "clip",
                                        "concat",
                                        "conv2d",
+                                       "fused_conv2d",
                                        "conv2d_transpose",
                                        "elementwise_add",
                                        "elementwise_mul",