
[Cherry-pick] cherry-pick PR 71919 to release/3.0 (#71919) #71964


Closed · wants to merge 24 commits into release/3.0
Commits (24)
599dc84
update (#71552) (#71583)
chen2016013 Mar 12, 2025
9b805c4
[CINN] Fix horizontal fusion with empty loop (#71550) (#71574)
huangjiyi Mar 12, 2025
9863408
Revert "[PHI]fix 0 size error (#71485)" (#71568) (#71576)
phlrain Mar 12, 2025
6320b01
fix bug (#71591)
gongshaotian Mar 13, 2025
aa72149
[PIR-Auto-Parallel] fix comm group hang in sync shared param pass (#…
waliwali777 Mar 13, 2025
3eb020a
[CherryPick][AutoParallel] Fix inplace op in grad clip (#71565) (#71584)
Waynezee Mar 13, 2025
aac2570
[3.0][Dy2St] Support `element_size` method for value and breakgraph w…
SigureMo Mar 13, 2025
6470d80
【Paddle Tensor】fix converter old ir issues -1 (#70849) (#71643)
PolaKuma Mar 14, 2025
bbc3129
setup (#71351) (#71630)
swgu98 Mar 14, 2025
dbaed63
[CINN] Create new AxisTransform when substitute dimexpr (#71587) (#71…
huangjiyi Mar 17, 2025
8513fe6
[AutoParallel] Fix pipeline visualization tool (cherry-pick from f086…
AndSonder Mar 17, 2025
9931609
[XPU] fix xpu grad merge bug when using amp master_grad (cherry-pick …
AndSonder Mar 17, 2025
d33a367
【Paddle TensorRT】Resolved the precision issue of pd_op.slice (#71655)…
lizexu123 Mar 18, 2025
40b25b1
[cherry-pick]fix DeepEP compile in infer mode (#71590) (#71656)
Hongqing-work Mar 18, 2025
079767e
【Paddle Tensor】Fix bugs related to converting unit tests about collec…
PolaKuma Mar 18, 2025
5aab1be
[cherry-pick] [AutoParallel] Add expand spmd #71603 (#71744)
Xing-lil Mar 19, 2025
34a136e
[Comm] Fix_NPU_Comm (#71723) (#71742)
Xing-lil Mar 19, 2025
0aa35fa
fix avx512 error (#71759)
XieYunshen Mar 19, 2025
7776754
[CherryPick][Auto Parallel] fix loss scale in xpu (#71698) (#71765)
Waynezee Mar 19, 2025
9b637a7
cherry-pick PR 71507 to release/3.0 (#71732)
mikethegoblin Mar 19, 2025
42c47d9
[AutoParallel] Update dense_tensor_idx des (#71571) (#71740)
Xing-lil Mar 19, 2025
6af9534
【Paddle TensorRT】Modified the serialization save path for TensorRT an…
lizexu123 Mar 19, 2025
6ed5dd3
update (#71784)
Xing-lil Mar 19, 2025
fc814e4
[inference]Fix FP16 precision BLHA accumulation overflow. (#71919)
bukejiyu Mar 27, 2025
6 changes: 6 additions & 0 deletions CMakeLists.txt
@@ -313,6 +313,7 @@ option(
OFF)
option(WITH_CINN "Compile PaddlePaddle with CINN" OFF)
option(WITH_NCCL "Compile PaddlePaddle with NCCL support" ON)
option(WITH_FLAGCX "Compile PaddlePaddle with FLAGCX support" OFF)
option(WITH_RCCL "Compile PaddlePaddle with RCCL support" ON)
option(WITH_XPU_BKCL "Compile PaddlePaddle with BAIDU KUNLUN XPU BKCL" OFF)
option(WITH_CRYPTO "Compile PaddlePaddle with crypto support" ON)
@@ -538,6 +539,11 @@ else()
endif()
endif()

if(WITH_FLAGCX)
add_definitions("-DPADDLE_WITH_FLAGCX")
include(flagcx)
endif()

if(WITH_HETERPS AND WITH_PSLIB)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
endif()
24 changes: 24 additions & 0 deletions cmake/flagcx.cmake
@@ -0,0 +1,24 @@
set(CMAKE_FIND_DEBUG_MODE ON)
# flagcx.cmake
if(NOT WITH_FLAGCX)
return()
endif()

if(WITH_FLAGCX)
set(FLAGCX_ROOT
$ENV{FLAGCX_ROOT}
CACHE PATH "FLAGCX_ROOT")
message(STATUS "FLAGCX_ROOT is ${FLAGCX_ROOT}")
find_path(
FLAGCX_INCLUDE_DIR flagcx.h
PATHS ${FLAGCX_ROOT}/flagcx/include
NO_DEFAULT_PATH)
message(STATUS "FLAGCX_INCLUDE_DIR is ${FLAGCX_INCLUDE_DIR}")
include_directories(SYSTEM ${FLAGCX_INCLUDE_DIR})
set(FLAGCX_LIB
"${FLAGCX_ROOT}/build/lib/libflagcx.so"
CACHE FILEPATH "flagcx library." FORCE)
generate_dummy_static_lib(LIB_NAME "flagcx" GENERATOR "flagcx.cmake")
target_link_libraries(flagcx ${FLAGCX_LIB})
message(STATUS "FLAGCX_LIB is ${FLAGCX_LIB}")
endif()
4 changes: 2 additions & 2 deletions cmake/simd.cmake
@@ -13,7 +13,7 @@ if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
set(AVX512F_FLAG "-mavx512f")
set(Wno_Maybe_Uninitialized "-Wno-maybe-uninitialized")
set(FMA_FLAG "-mfma")
if(${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0)
if(${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER_EQUAL 11.0)
set(NO_INLINE "-fno-inline")
else()
set(NO_INLINE "")
@@ -27,7 +27,7 @@ elseif(MSVC)
set(AVX512F_FLAG "/arch:AVX512")
set(Wno_Maybe_Uninitialized "/wd4701")
set(FMA_FLAG "/arch:AVX2")
if(${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0)
if(${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER_EQUAL 11.0)
set(NO_INLINE "/Ob0")
else()
set(NO_INLINE "")
3 changes: 1 addition & 2 deletions paddle/cinn/backends/nvrtc/header_generator.cc
@@ -49,8 +49,7 @@ std::string read_file_as_string(const std::string& file_path) {
std::ifstream file(cinn_path + '/' + file_path);

if (!file.is_open()) {
LOG_FIRST_N(INFO, 1) << "Unable to open file : " << cinn_path << '/'
<< file_path;
VLOG(1) << "Unable to open file : " << cinn_path << '/' << file_path;
return "";
}
std::stringstream buffer;
@@ -49,7 +49,7 @@ class FoldFullWithReshapeOpPattern : public pir::OpRewritePattern<OPTYPE> {
phi::IntArray(out_shape.Get(), out_shape.size()));

auto new_full_op = rewriter.Build<paddle::dialect::FullOp>(attrs);

new_full_op->result(0).set_type(op->result(0).type());
rewriter.ReplaceAllUsesWith(op->result(0), new_full_op->result(0));
rewriter.EraseOp(op);
if (pre_op->use_empty()) {
56 changes: 33 additions & 23 deletions paddle/cinn/operator_fusion/fusion_tracker/interpreter.cc
@@ -161,31 +161,41 @@ void RunItersTransformInstr(const std::shared_ptr<ItersTransformInstr>& instr,

void RunAxisTransformInstr(const std::shared_ptr<AxisTransformInstr>& instr,
FusionInterpreter* interpreter) {
auto substitute_dimexpr_for_shape = [&](std::vector<symbol::DimExpr>& shape) {
for (auto& dim_expr : shape) {
if (dim_expr.isa<std::int64_t>()) continue;
symbol::DimExpr origin_dim_expr = dim_expr;
while (true) {
dim_expr = symbol::SubstituteDimExpr(
dim_expr, interpreter->substitute_dimexpr_map);
if (dim_expr == origin_dim_expr || dim_expr.isa<std::int64_t>()) break;
origin_dim_expr = dim_expr;
}
}
};
auto substitute_dimexpr_for_transform =
adt::match{[&](const AppendAxisTransformPtr& transform) {
substitute_dimexpr_for_shape(transform->shape);
},
[&](const ReshapeTransformPtr& transform) {
substitute_dimexpr_for_shape(transform->in_shape);
substitute_dimexpr_for_shape(transform->out_shape);
},
[&](const auto& transform) {}};
auto substitute_dimexpr_for_shape =
[&](const std::vector<symbol::DimExpr>& shape) {
std::vector<symbol::DimExpr> result;
for (const auto& dim_expr : shape) {
symbol::DimExpr substituted = dim_expr;
while (true) {
if (substituted.isa<std::int64_t>()) break;
auto tmp_substituted = symbol::SubstituteDimExpr(
substituted, interpreter->substitute_dimexpr_map);
if (tmp_substituted == substituted) break;
substituted = tmp_substituted;
}
result.emplace_back(substituted);
}
return result;
};
auto substitute_dimexpr_for_transform = adt::match{
[&](const AppendAxisTransformPtr& trans) -> AxisTransform {
auto substituted_shape = substitute_dimexpr_for_shape(trans->shape);
return std::make_shared<AppendAxisTransform>(trans->axis,
substituted_shape);
},
[&](const ReshapeTransformPtr& trans) -> AxisTransform {
auto substituted_in_shape =
substitute_dimexpr_for_shape(trans->in_shape);
auto substituted_out_shape =
substitute_dimexpr_for_shape(trans->out_shape);
return std::make_shared<ReshapeTransform>(substituted_in_shape,
substituted_out_shape);
},
[&](const auto& trans) -> AxisTransform { return trans; }};
auto axis_transform = [&](ir::Expr op_expr) -> ir::Expr {
for (auto trans : instr->axis_transform_route_) {
std::visit(substitute_dimexpr_for_transform, trans);
op_expr = std::visit(ApplyAxisTransform(op_expr), trans);
auto new_trans = std::visit(substitute_dimexpr_for_transform, trans);
op_expr = std::visit(ApplyAxisTransform(op_expr), new_trans);
}
return op_expr;
};
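The reworked helpers above run the DimExpr substitution to a fixed point and return fresh `AppendAxisTransform`/`ReshapeTransform` objects instead of mutating the (potentially shared) transforms in place. Below is a minimal standalone sketch of the same fixed-point pattern, with `std::string` and `std::map` standing in for CINN's `symbol::DimExpr` and substitution map; all names are illustrative, not CINN's API.

```cpp
#include <map>
#include <string>
#include <vector>

// Rewrite one symbolic dimension through the rules until it stops changing,
// mirroring the while-loop added in RunAxisTransformInstr.
std::string SubstituteToFixedPoint(
    std::string dim, const std::map<std::string, std::string>& rules) {
  while (true) {
    auto it = rules.find(dim);
    if (it == rules.end() || it->second == dim) break;
    dim = it->second;
  }
  return dim;
}

// Build and return a new shape; the input is never modified, so a transform
// shared by several instructions cannot be corrupted by one interpreter run.
std::vector<std::string> SubstituteShape(
    const std::vector<std::string>& shape,
    const std::map<std::string, std::string>& rules) {
  std::vector<std::string> result;
  result.reserve(shape.size());
  for (const auto& dim : shape) {
    result.push_back(SubstituteToFixedPoint(dim, rules));
  }
  return result;
}
```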
1 change: 1 addition & 0 deletions paddle/cinn/operator_fusion/pattern_fuser.h
@@ -365,6 +365,7 @@ static bool IsLoopFrameworkEqual(const StmtPattern& lhs,
const auto& rhs_loops = GetLoopFramework(rhs);
VLOG(4) << "lhs " << lhs_loops.DebugStr();
VLOG(4) << "rhs " << rhs_loops.DebugStr();
if (lhs_loops.loop.empty() || rhs_loops.loop.empty()) return false;

// TODO(huangjiyi): support horizontal fusion without reduce dims equal.
const auto get_reduce_loop = [](const MaybeLoopFramework& loop) {
@@ -563,7 +563,7 @@ LoopAxisMapping CreateLoopAxisMappingForReshape(pir::Operation* op) {
result.output_values.push_back(op->result(0));
result.loop2output.resize(1);
auto in_shape = GetCompatibleValueAllDims(op->operand_source(0));
auto out_shape = GetValueAllDims(op->result(0));
auto out_shape = GetCompatibleValueAllDims(op->result(0));
result.loop = out_shape;

if (!ShapeProductEqual(in_shape, out_shape)) {
34 changes: 34 additions & 0 deletions paddle/common/flags.cc
@@ -1261,6 +1261,19 @@ PHI_DEFINE_EXPORTED_bool(multi_node_sample_use_gpu_table,
PHI_DEFINE_EXPORTED_bool(nccl_blocking_wait, false, "nccl blocking wait");
#endif

/**
* ProcessGroupFlagCX related FLAG
* Name: flagcx_blocking_wait
* Since Version:
* Value Range: bool, default=false
* Example:
* Note: flagcx blocking wait.
* blocks host thread until collective operation completes
*/
#if defined(PADDLE_WITH_FLAGCX)
PHI_DEFINE_EXPORTED_bool(flagcx_blocking_wait, false, "flagcx blocking wait");
#endif

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PHI_DEFINE_EXPORTED_bool(benchmark_nccl,
false,
@@ -1766,6 +1779,13 @@ PHI_DEFINE_EXPORTED_string(
"For instance, /usr/local/cuda/lib64. If default, "
"dlopen will search cuda from LD_LIBRARY_PATH");

PHI_DEFINE_EXPORTED_string(
flagcx_dir, // NOLINT
"",
"Specify path for loading libflagcx.so. For instance, "
"For instance, /usr/local/flagcx/lib. If default, "
"dlopen will search flagcx from LD_LIBRARY_PATH");

PHI_DEFINE_EXPORTED_string(cupti_dir,
"",
"Specify path for loading cupti.so."); // NOLINT
@@ -1873,6 +1893,20 @@ PHI_DEFINE_EXPORTED_bool(
false,
"Enable xqa optim in block_multihead_attention kernel (GQA).");

/**
* Whether to use FP32 for accumulation of QK output in
* block_multihead_attention kernel (fp16)
* Name: blha_use_fp32_qk_sum
* Since Version: 3.0.0
* Value Range: bool, default=false
* Example:
* Note: If TRUE, FP32 will be used for accumulation of the QK output
* in block_multihead_attention kernel (fp16).
*/
PHI_DEFINE_EXPORTED_bool(blha_use_fp32_qk_sum,
false,
"use FP32 for accumulation of QK output in "
"block_multihead_attention kernel(fp16).");

PHI_DEFINE_EXPORTED_bool(cuda_core_int8_gemm,
false,
"Enable speed up int8 gemm calculations when m<=4");
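The `blha_use_fp32_qk_sum` flag is the switch for the fix named in this PR's title: when the QK partial products of an FP16 attention kernel are also summed in FP16, a long sequence can push the accumulator past FP16's largest finite value (65504). The sketch below emulates that failure mode in plain C++; the sequence length and partial-product value are illustrative, not taken from the block_multihead_attention kernel.

```cpp
#include <cstdio>

int main() {
  const int seq_len = 8192;     // hypothetical sequence length
  const float partial = 16.0f;  // hypothetical q[i] * k[i] partial product

  float fp32_sum = 0.0f;        // what FLAGS_blha_use_fp32_qk_sum enables
  float fp16_range_sum = 0.0f;  // emulate an FP16 accumulator's limited range
  const float kFp16Max = 65504.0f;

  for (int i = 0; i < seq_len; ++i) {
    fp32_sum += partial;
    fp16_range_sum += partial;
    // A real half-precision accumulator overflows to +inf here; we saturate
    // so the printout stays readable.
    if (fp16_range_sum > kFp16Max) fp16_range_sum = kFp16Max;
  }

  // FP32 holds the exact sum (131072); an FP16-range accumulator cannot.
  std::printf("fp32 sum = %.0f, fp16-range sum = %.0f\n",
              fp32_sum, fp16_range_sum);
  return 0;
}
```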
5 changes: 5 additions & 0 deletions paddle/common/macros.h
@@ -30,6 +30,8 @@ limitations under the License. */
#define COMM_CONTEXT phi::distributed::NCCLCommContext
#elif (defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL))
#define COMM_CONTEXT phi::distributed::BKCLCommContext
#elif defined(PADDLE_WITH_CUSTOM_DEVICE)
#define COMM_CONTEXT phi::distributed::XCCLCommContext
#endif

#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
@@ -38,6 +40,9 @@
#elif defined(PADDLE_WITH_XPU_BKCL)
#define CREATE_COMM_CONTEXT \
phi::distributed::CommContextManager::CreateBKCLCommContext
#elif defined(PADDLE_WITH_CUSTOM_DEVICE)
#define CREATE_COMM_CONTEXT \
phi::distributed::CommContextManager::CreateXCCLCommContext
#endif

namespace common {
10 changes: 10 additions & 0 deletions paddle/fluid/distributed/collective/CMakeLists.txt
@@ -36,6 +36,13 @@ if(WITH_NCCL OR WITH_RCCL)

endif()

if(WITH_FLAGCX)
cc_library(
process_group_flagcx
SRCS process_group_flagcx.cc common.cc
DEPS process_group phi)
endif()

if(WITH_XPU_BKCL)
cc_library(
process_group_bkcl
@@ -66,6 +73,9 @@ set(COMM_UTILS_DEPS process_group)
if(WITH_NCCL OR WITH_RCCL)
set(COMM_UTILS_DEPS ${PROCESS_GROUP_UTILS_DEPS} process_group_nccl)
endif()
if(WITH_FLAGCX)
set(COMM_UTILS_DEPS ${PROCESS_GROUP_UTILS_DEPS} process_group_flagcx)
endif()
if(WITH_CUSTOM_DEVICE)
set(COMM_UTILS_DEPS ${PROCESS_GROUP_UTILS_DEPS} process_group_custom)
endif()
3 changes: 2 additions & 1 deletion paddle/fluid/distributed/collective/deep_ep/deep_ep.cpp
@@ -18,7 +18,6 @@
// https://github.com/deepseek-ai/DeepEP/blob/main/LICENSE

#include <cuda_runtime.h>
#include <pybind11/functional.h>
#include <atomic>
#include <chrono>
#include <memory>
@@ -209,6 +208,7 @@ int Buffer::get_root_rdma_rank(bool global) const {

int Buffer::get_local_device_id() const { return device_id; }

#ifndef PADDLE_NO_PYTHON
pybind11::bytearray Buffer::get_local_ipc_handle() const {
return {ipc_handles[nvl_rank].reserved, CUDA_IPC_HANDLE_SIZE};
}
@@ -301,6 +301,7 @@ void Buffer::sync(
// Ready to use
available = true;
}
#endif

std::tuple<deep_ep::detail::Tensor,
std::optional<deep_ep::detail::Tensor>,
5 changes: 5 additions & 0 deletions paddle/fluid/distributed/collective/deep_ep/deep_ep.hpp
@@ -24,8 +24,11 @@
#undef NDEBUG
#endif

#ifndef PADDLE_NO_PYTHON
#include <pybind11/pybind11.h>
#include <pybind11/pytypes.h>
#endif
#include <optional>
#include <tuple>
#include <vector>
#include "paddle/fluid/distributed/collective/deep_ep/include/types.h"
@@ -119,6 +122,7 @@ struct Buffer {

int get_local_device_id() const;

#ifndef PADDLE_NO_PYTHON
pybind11::bytearray get_local_ipc_handle() const;

pybind11::bytearray get_local_nvshmem_unique_id() const;
@@ -127,6 +131,7 @@
const std::vector<std::optional<pybind11::bytearray>>&
all_gathered_handles,
const std::optional<pybind11::bytearray>& root_unique_id_opt);
#endif

std::tuple<deep_ep::detail::Tensor,
std::optional<deep_ep::detail::Tensor>,
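These guards let DeepEP build in inference mode, where Python and pybind11 are not linked: `PADDLE_NO_PYTHON` strips every pybind11-dependent declaration while the rest of `Buffer` stays available. A reduced sketch of the same pattern follows; only the macro name comes from this diff, `Widget` and its members are illustrative.

```cpp
#ifndef PADDLE_NO_PYTHON
#include <pybind11/pybind11.h>
#endif

// Methods that hand IPC handles back to Python exist only in binding builds;
// pure C++ consumers (e.g. an inference runtime) see a pybind11-free class.
struct Widget {
  int device_id = 0;

  int local_device() const { return device_id; }  // always available

#ifndef PADDLE_NO_PYTHON
  pybind11::bytearray ipc_handle() const {
    return pybind11::bytearray("handle-bytes", 12);
  }
#endif
};
```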