PaddlePaddle
diff --git a/‎cmake/external/xpu.cmake
+7-7 b/‎cmake/external/xpu.cmake
+7-7
diff --git a/‎paddle/fluid/distributed/collective/deep_ep/deep_ep.cpp
+2 b/‎paddle/fluid/distributed/collective/deep_ep/deep_ep.cpp
+2
diff --git a/‎paddle/fluid/distributed/collective/deep_ep/deep_ep.hpp
+1 b/‎paddle/fluid/distributed/collective/deep_ep/deep_ep.hpp
+1
diff --git a/‎paddle/fluid/distributed/collective/deep_ep/kernels/api.cuh
+1 b/‎paddle/fluid/distributed/collective/deep_ep/kernels/api.cuh
+1
diff --git a/‎paddle/fluid/distributed/collective/deep_ep/kernels/ibgda_device.cuh
+63 b/‎paddle/fluid/distributed/collective/deep_ep/kernels/ibgda_device.cuh
+63
diff --git a/‎paddle/fluid/distributed/collective/deep_ep/kernels/internode_ll.cu
+33-41 b/‎paddle/fluid/distributed/collective/deep_ep/kernels/internode_ll.cu
+33-41
diff --git a/‎paddle/fluid/framework/ir/xpu/weight_only_linear_xpu_pass.cc
+6-6 b/‎paddle/fluid/framework/ir/xpu/weight_only_linear_xpu_pass.cc
+6-6
diff --git a/‎paddle/fluid/framework/new_executor/instruction/instruction_util.cc
+18-3 b/‎paddle/fluid/framework/new_executor/instruction/instruction_util.cc
+18-3
diff --git a/‎paddle/fluid/pybind/deep_ep_api.cc
+1 b/‎paddle/fluid/pybind/deep_ep_api.cc
+1
diff --git a/‎paddle/phi/backends/xpu/xpu2_op_list.cc
+1-1 b/‎paddle/phi/backends/xpu/xpu2_op_list.cc
+1-1
diff --git a/‎paddle/phi/backends/xpu/xpu3_op_list.cc
+2 b/‎paddle/phi/backends/xpu/xpu3_op_list.cc
+2
@@ -34,12 +34,12 @@ if(NOT DEFINED XPU_XHPC_BASE_DATE)
 endif()
 set(XPU_XCCL_BASE_VERSION "3.0.2.5") # For XRE5
 if(NOT DEFINED XPU_XFT_BASE_VERSION)
-  set(XPU_XFT_BASE_VERSION "20230602")
+  set(XPU_XFT_BASE_VERSION "20250402/xpu3")
 endif()
 
 if(NOT DEFINED XPU_XRE_BASE_VERSION)
   if(WITH_XPU_XRE5)
-    set(XPU_XRE_BASE_VERSION "5.0.21.18")
+    set(XPU_XRE_BASE_VERSION "5.0.21.19")
   else()
     set(XPU_XRE_BASE_VERSION "4.32.0.1")
   endif()
@@ -61,7 +61,7 @@ set(XPU_XCCL_BASE_URL
 
 if(NOT XPU_XFT_BASE_URL)
   set(XPU_XFT_BASE_URL
-      "https://klx-sdk-release-public.su.bcebos.com/xft/dev/${XPU_XFT_BASE_VERSION}"
+      "https://klx-sdk-release-public.su.bcebos.com/xft_internal/dev/${XPU_XFT_BASE_VERSION}"
   )
 endif()
 
@@ -112,7 +112,7 @@ else()
     set(XPU_XHPC_DIR_NAME "xhpc-ubuntu1604_x86_64")
   endif()
   set(XPU_XCCL_DIR_NAME "xccl_Linux_x86_64")
-  set(XPU_XFT_DIR_NAME "xft_ubuntu1604_x86_64")
+  set(XPU_XFT_DIR_NAME "xft_internal_ubuntu2004")
 endif()
 
 set(XPU_XRE_URL
@@ -187,9 +187,9 @@ if(DEFINED ENV{XPU_LIB_ROOT})
   endif()
 
   # XCCL
-  if(DEFINED ENV{XCCL_DIR_NAME})
-    set(XPU_XCCL_URL "${XPU_LIB_ROOT}/$ENV{XCCL_DIR_NAME}")
-    set(XCCL_DIR_NAME "$ENV{XCCL_DIR_NAME}")
+  if(DEFINED ENV{XPU_XCCL_DIR_NAME})
+    set(XPU_XCCL_URL "${XPU_LIB_ROOT}/$ENV{XPU_XCCL_DIR_NAME}")
+    set(XPU_XCCL_DIR_NAME "$ENV{XPU_XCCL_DIR_NAME}")
   endif()
 
   # XHPC
 
@@ -1625,6 +1625,8 @@ void Buffer::clean_low_latency_buffer(int num_max_dispatch_tokens_per_rank,
 #endif
 }
 
+void Buffer::barrier_all() { internode_ll::barrier_all(calc_ctx->stream()); }
+
 #ifdef PADDLE_WITH_NVSHMEM
 std::tuple<deep_ep::detail::Tensor,
            std::optional<deep_ep::detail::Tensor>,
 
@@ -251,6 +251,7 @@ struct Buffer {
   void clean_low_latency_buffer(int num_max_dispatch_tokens_per_rank,
                                 int hidden,
                                 int num_experts);
+  void barrier_all();
 
 #ifdef PADDLE_WITH_NVSHMEM
   std::tuple<deep_ep::detail::Tensor,
 
@@ -288,6 +288,7 @@ void combine(cudaDataType_t type,
 // Internode low-latency kernels
 namespace internode_ll {
 
+void barrier_all(cudaStream_t stream);
 void clean_low_latency_buffer(int* clean_0,
                               int num_clean_int_0,
                               int* clean_1,
 
@@ -82,6 +82,12 @@ uint16_t HtoBE16(uint16_t x) {
 
 typedef struct mlx5_wqe_ctrl_seg __attribute__((__aligned__(8))) ibgda_ctrl_seg_t;
 
+typedef struct {
+    uint32_t add_data;
+    uint32_t field_boundary;
+    uint64_t reserved;
+} __attribute__((__packed__)) ibgda_atomic_32_masked_fa_seg_t;
+
 __device__ static __forceinline__
 nvshmemi_ibgda_device_state_t* ibgda_get_state() {
     return &nvshmemi_ibgda_device_state_d;
@@ -439,4 +445,61 @@ nvshmemi_ibgda_put_nbi_warp(uint64_t req_rptr, uint64_t req_lptr, size_t bytes,
     __syncwarp();
 }
 
+__device__ static __forceinline__ void ibgda_write_amo_add_wqe(
+        nvshmemi_ibgda_device_qp_t *qp, const int &value,
+        uint64_t laddr, __be32 lkey, uint64_t raddr, __be32 rkey,
+        uint16_t wqe_idx, void **out_wqes) {
+    ibgda_ctrl_seg_t ctrl_seg = {0};
+    struct mlx5_wqe_raddr_seg raddr_seg;
+    struct mlx5_wqe_atomic_seg atomic_seg_1;
+    struct mlx5_wqe_data_seg data_seg;
+
+    auto ctrl_seg_ptr = reinterpret_cast<ibgda_ctrl_seg_t*>(out_wqes[0]);
+    auto raddr_seg_ptr = reinterpret_cast<mlx5_wqe_raddr_seg*>(reinterpret_cast<uintptr_t>(ctrl_seg_ptr) + sizeof(*ctrl_seg_ptr));
+    auto atomic_seg_ptr = reinterpret_cast<mlx5_wqe_atomic_seg*>(reinterpret_cast<uintptr_t>(raddr_seg_ptr) + sizeof(*raddr_seg_ptr));
+    auto data_seg_ptr = reinterpret_cast<mlx5_wqe_data_seg*>(reinterpret_cast<uintptr_t>(atomic_seg_ptr) + sizeof(*atomic_seg_ptr));
+
+    raddr_seg.raddr = HtoBE64(raddr);
+    raddr_seg.rkey = rkey;
+    raddr_seg.reserved = 0;
+
+    // NOTES: `0x08000000` means `IBGDA_4_BYTE_EXT_AMO_OPMOD`
+    ctrl_seg.opmod_idx_opcode = HtoBE32(MLX5_OPCODE_ATOMIC_MASKED_FA | (wqe_idx << 8) | 0x08000000);
+    auto atomic_32_masked_fa_seg = reinterpret_cast<ibgda_atomic_32_masked_fa_seg_t*>(&atomic_seg_1);
+    atomic_32_masked_fa_seg->add_data = HtoBE32(value);
+    atomic_32_masked_fa_seg->field_boundary = 0;
+
+    ctrl_seg.qpn_ds = HtoBE32((qp->qpn << 8) | 4);
+    ctrl_seg.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
+
+    data_seg.byte_count = HtoBE32(sizeof(int));
+    data_seg.lkey = lkey;
+    data_seg.addr = HtoBE64(laddr);
+
+    EP_STATIC_ASSERT(sizeof(*ctrl_seg_ptr) == sizeof(int4), "Invalid vectorization");
+    EP_STATIC_ASSERT(sizeof(*raddr_seg_ptr) == sizeof(int4), "Invalid vectorization");
+    EP_STATIC_ASSERT(sizeof(*atomic_seg_ptr) == sizeof(int4), "Invalid vectorization");
+    EP_STATIC_ASSERT(sizeof(*data_seg_ptr) == sizeof(int4), "Invalid vectorization");
+    st_na_relaxed(reinterpret_cast<int4*>(ctrl_seg_ptr), *reinterpret_cast<int4*>(&ctrl_seg));
+    st_na_relaxed(reinterpret_cast<int4*>(raddr_seg_ptr), *reinterpret_cast<int4*>(&raddr_seg));
+    st_na_relaxed(reinterpret_cast<int4*>(atomic_seg_ptr), *reinterpret_cast<int4*>(&atomic_seg_1));
+    st_na_relaxed(reinterpret_cast<int4*>(data_seg_ptr), *reinterpret_cast<int4*>(&data_seg));
+}
+
+__device__ __forceinline__ void nvshmemi_ibgda_amo_nonfetch_add(void *rptr, const int& value, int pe, int qp_id) {
+    nvshmemi_ibgda_device_qp_t *qp = ibgda_get_rc(pe, qp_id);
+
+    __be32 rkey;
+    uint64_t raddr;
+    ibgda_get_rkey(reinterpret_cast<uint64_t>(rptr), pe, &raddr, &rkey);
+
+    uint64_t my_wqe_idx = ibgda_reserve_wqe_slots(qp, 1);
+    void *wqe_ptrs = ibgda_get_wqe_ptr(qp, my_wqe_idx);
+
+    ibgda_write_amo_add_wqe(qp, value, reinterpret_cast<uint64_t>(qp->ibuf.buf),
+                            qp->ibuf.lkey, raddr, rkey, my_wqe_idx, &wqe_ptrs);
+
+    ibgda_submit_requests<true>(qp, my_wqe_idx, 1);
+}
+
 } // namespace deep_ep
@@ -34,6 +34,15 @@ namespace deep_ep {
 
 namespace internode_ll {
 
+__global__ void barrier_all() { nvshmemx_barrier_all_block(); }
+
+void barrier_all(cudaStream_t stream) {
+  constexpr int kNumThreads = 1;
+
+  SETUP_LAUNCH_CONFIG(1, kNumThreads, stream);
+  LAUNCH_KERNEL(&cfg, barrier_all);
+}
+
 template <int kNumThreads>
 __launch_bounds__(kNumThreads, 1) __global__ void clean_low_latency_buffer(
     int* clean_0, int num_clean_int_0, int* clean_1, int num_clean_int_1) {
@@ -112,7 +121,6 @@ __global__ __launch_bounds__(
 
   // Message package: hidden data, FP8 scales, index at source
   // NOTES: currently we have 3 reserved int fields for future use
-
   using vec_t = typename std::conditional<kUseFP8, int2, int4>::type;
   const size_t num_bytes_per_msg =
       sizeof(int4) + (kUseFP8 ? (kHidden + num_scales * sizeof(float))
@@ -305,13 +313,11 @@ __global__ __launch_bounds__(
                              responsible_expert_idx) != FINISHED_SUM_TAG * 2) {
     }
     if (dst_rank != rank) {
-      nvshmemi_ibgda_rma_p(
+      nvshmemi_ibgda_amo_nonfetch_add(
           rdma_recv_count + dst_expert_local_idx * num_ranks + rank,
           -num_tokens_sent - 1,
           dst_rank,
-          dst_expert_local_idx,
-          0);
-      nvshmemi_ibgda_prepare_recvs(dst_rank, dst_expert_local_idx);
+          dst_expert_local_idx);
     } else {
       st_na_release(rdma_recv_count + dst_expert_local_idx * num_ranks + rank,
                     -num_tokens_sent - 1);
@@ -366,16 +372,9 @@ LOW_LATENCY_DISPATCH_RECV:
     EP_STATIC_ASSERT(kNumWarpsPerGroup > 1,
                      "Requires more than one warp per group");
     if (sub_warp_id == 1 && lane_id == 0) {
-      if (src_rank != rank) {
-        nvshmemi_ibgda_poll_recv(src_rank, local_expert_idx);
-        num_recv_tokens = ld_acquire_sys_global(
-            rdma_recv_count + local_expert_idx * num_ranks + src_rank);
-        EP_DEVICE_ASSERT(num_recv_tokens != 0);
-      } else {
-        while ((num_recv_tokens = ld_acquire_global(
-                    rdma_recv_count + local_expert_idx * num_ranks +
-                    src_rank)) == 0) {
-        }
+      while ((num_recv_tokens = ld_acquire_global(
+                  rdma_recv_count + local_expert_idx * num_ranks + src_rank)) ==
+             0) {
       }
       num_recv_tokens = -num_recv_tokens - 1;
       recv_token_begin_idx =
@@ -539,7 +538,8 @@ __global__ __launch_bounds__(
                     int num_experts,
                     int rank,
                     int num_ranks,
-                    int phases) {
+                    int phases,
+                    bool zero_copy) {
   const auto sm_id = static_cast<int>(blockIdx.x);
   const auto num_sms = static_cast<int>(gridDim.x);
   const auto thread_id = static_cast<int>(threadIdx.x);
@@ -580,7 +580,9 @@ __global__ __launch_bounds__(
     const auto local_expert_idx = responsible_expert_idx % num_local_experts;
     const auto global_expert_idx = rank * num_local_experts + local_expert_idx;
     const auto layout =
-        __ldg(layout_range + local_expert_idx * num_ranks + dst_rank);
+        __ldg(layout_range + local_expert_idx * num_ranks +
+              dst_rank);  // num_recv_tokens, recv_token_begin_idx
+
     const auto local_x = reinterpret_cast<const int4*>(x) +
                          local_expert_idx * num_ranks *
                              num_max_dispatch_tokens_per_rank *
@@ -625,13 +627,14 @@ __global__ __launch_bounds__(
                            st_na_global);
       } else {
         const auto buf_int4_ptr = reinterpret_cast<int4*>(buf_ptr);
-        UNROLLED_WARP_COPY(7,
-                           lane_id,
-                           hidden_bf16_int4,
-                           buf_int4_ptr,
-                           x_int4,
-                           ld_nc_global,
-                           st_na_global);
+        if (!zero_copy)
+          UNROLLED_WARP_COPY(7,
+                             lane_id,
+                             hidden_bf16_int4,
+                             buf_int4_ptr,
+                             x_int4,
+                             ld_nc_global,
+                             st_na_global);
         nvshmemi_ibgda_put_nbi_warp(dst_ptr,
                                     buf_ptr,
                                     hidden * sizeof(nv_bfloat16),
@@ -651,11 +654,8 @@ __global__ __launch_bounds__(
       while (ld_acquire_global(atomic_clean_flag) == 0) {
       }
       if (dst_rank != rank) {
-        nvshmemi_ibgda_rma_p(rdma_recv_flag + global_expert_idx,
-                             1,
-                             dst_rank,
-                             local_expert_idx,
-                             0);
+        nvshmemi_ibgda_amo_nonfetch_add(
+            rdma_recv_flag + global_expert_idx, 1, dst_rank, local_expert_idx);
       } else {
         st_na_release(rdma_recv_flag + global_expert_idx, 1);
       }
@@ -672,18 +672,9 @@ LOW_LATENCY_COMBINE_RECV:
   if (responsible_expert_idx < num_experts) {
     EP_STATIC_ASSERT(kNumWarpsPerGroup > 1,
                      "Invalid number of warps per group");
-    if (sub_warp_id == 0 && lane_id == 0) {
-      // TODO(Xreki): refactor QP indices
-      auto src_rank = responsible_expert_idx / num_local_experts;
-      auto src_expert_idx = responsible_expert_idx % num_local_experts;
-      if (src_rank != rank) {
-        nvshmemi_ibgda_poll_recv(src_rank, src_expert_idx);
-      } else {
-        while (ld_acquire_global(rdma_recv_flag + responsible_expert_idx) ==
-               0) {
-        }
+    if (sub_warp_id == 0 && lane_id == 0)
+      while (ld_acquire_global(rdma_recv_flag + responsible_expert_idx) == 0) {
       }
-    }
   }
   cg::this_grid().sync();
 
@@ -796,7 +787,8 @@ void combine(void* combined_x,
                   num_experts,                                           \
                   rank,                                                  \
                   num_ranks,                                             \
-                  phases);                                               \
+                  phases,                                                \
+                  false);                                                \
   }                                                                      \
   break
 
 
@@ -44,20 +44,20 @@ PermuteINT8WeightOnlyPattern::PermuteINT8WeightOnlyPattern(
     PDPattern* pattern, const std::string& name_scope)
     : PatternBase(pattern, name_scope, name_scope) {
   auto* input = pattern->NewNode(input_repr())
-                    ->assert_is_op_input("weight_only_linear_xpu", "x")
+                    ->assert_is_op_input("weight_only_linear", "x")
                     ->AsInput();
   auto* weight = pattern->NewNode(weight_repr())
-                     ->assert_is_op_input("weight_only_linear_xpu", "weight")
+                     ->assert_is_op_input("weight_only_linear", "weight")
                      ->AsInput();
   auto* weight_scale =
       pattern->NewNode(weight_scale_repr())
-          ->assert_is_op_input("weight_only_linear_xpu", "weight_scale")
+          ->assert_is_op_input("weight_only_linear", "weight_scale")
           ->AsInput();
   auto* out = pattern->NewNode(out_repr())
-                  ->assert_is_op_output("weight_only_linear_xpu", "out")
+                  ->assert_is_op_output("weight_only_linear", "out")
                   ->AsOutput();
   auto* weight_only_linear = pattern->NewNode(weight_only_linear_repr())
-                                 ->assert_is_op("weight_only_linear_xpu");
+                                 ->assert_is_op("weight_only_linear");
 
   std::vector<PDNode*> input_vars{input, weight, weight_scale};
   std::vector<PDNode*> output_vars{out};
@@ -236,4 +236,4 @@ REGISTER_PASS(weight_only_linear_xpu_pass,
 REGISTER_PASS_CAPABILITY(weight_only_linear_xpu_pass)
     .AddCombination(
         paddle::framework::compatible::OpVersionComparatorCombination().EQ(
-            "weight_only_linear_xpu", 0));
+            "weight_only_linear", 0));
@@ -107,7 +107,8 @@ phi::DeviceContext* ParseDeviceContext(pir::Operation* op,
 
   // only gpu need update. xpu not need, because xpu memcpy op kernel is
   // synchronous.
-  if (phi::is_gpu_place(place) || phi::is_custom_place(place)) {
+  if (phi::is_gpu_place(place) || phi::is_custom_place(place) ||
+      phi::is_xpu_place(place)) {
     VLOG(6) << "Parse DeviceContext for " << op_name
             << ", execution stream = " << execution_stream;
     if (execution_stream != kDefaultStream) {
@@ -136,7 +137,7 @@ phi::DeviceContext* ParseDeviceContext(pir::Operation* op,
     }
 
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
-    defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_CUSTOM_DEVICE)
+    defined(PADDLE_WITH_CUSTOM_DEVICE) || defined(PADDLE_WITH_XPU_BKCL)
     // NOTE(Ruibiao): Here supports multi-stream overlap for c_allreduce_sum
     // with use_cal_stream==false by returning a device context getting from the
     // global NCCLCommContext instance. Because when use_calc_stream==false, in
@@ -205,7 +206,21 @@ phi::DeviceContext* ParseDeviceContext(pir::Operation* op,
             op_name.compare(paddle::dialect::AllToAllOp::name()) == 0 ||
             op_name.compare(
                 paddle::dialect::CSoftmaxWithCrossEntropyOp::name()) == 0) {
-#ifdef PADDLE_WITH_CUSTOM_DEVICE
+#if defined(PADDLE_WITH_XPU_BKCL)
+          if (phi::is_xpu_place(place) && execution_stream == kDefaultStream) {
+            VLOG(3) << "set stream for " << op_name << "in XPU device";
+            if (origin_dev_ctx != nullptr) {
+              // set stream
+              auto default_stream =
+                  static_cast<DEVICE_CONTEXT*>(origin_dev_ctx)->stream();
+              static_cast<DEVICE_CONTEXT*>(dev_ctx)->SetStream(default_stream);
+              // todo set allocator
+            } else {
+              VLOG(3) << "CUSTOM DEVICE op " << op_name << " ring_id "
+                      << ring_id << " origin_dev_ctx is nullptr";
+            }
+          }
+#elif defined(PADDLE_WITH_CUSTOM_DEVICE)
           if (phi::is_custom_place(place) &&
               execution_stream == kDefaultStream) {
             VLOG(3) << "set stream for " << op_name << "in Custom device";
 
@@ -89,6 +89,7 @@ void BindDeepEPApi(pybind11::module *m) {
       .def("intranode_combine", &deep_ep::Buffer::intranode_combine_api)
       .def("internode_dispatch", &deep_ep::Buffer::internode_dispatch_api)
       .def("internode_combine", &deep_ep::Buffer::internode_combine_api)
+      .def("barrier_all", &deep_ep::Buffer::barrier_all)
       .def("clean_low_latency_buffer",
            &deep_ep::Buffer::clean_low_latency_buffer)
       .def("low_latency_dispatch", &deep_ep::Buffer::low_latency_dispatch_api)
 
@@ -1220,7 +1220,7 @@ XPUOpMap& get_kl2_ops() {
                      phi::DataType::FLOAT32})},
       {"warpctc_grad", XPUKernelSet({phi::DataType::FLOAT32})},
       {"warpctc", XPUKernelSet({phi::DataType::FLOAT32})},
-      {"weight_only_linear_xpu",
+      {"weight_only_linear",
        XPUKernelSet({phi::DataType::FLOAT16, phi::DataType::BFLOAT16})},
       {"where_index",
        XPUKernelSet({phi::DataType::INT32,
 
@@ -1690,6 +1690,8 @@ XPUOpMap& get_kl3_ops() {
                      phi::DataType::BOOL,
                      phi::DataType::FLOAT32,
                      phi::DataType::INT64})},
+      {"weight_quantize",
+       XPUKernelSet({phi::DataType::FLOAT16, phi::DataType::BFLOAT16})},
       {"where_grad",
        XPUKernelSet({phi::DataType::INT32,
                      phi::DataType::INT64,
Original file line number	Diff line number	Diff line change
`@@ -1625,6 +1625,8 @@ void Buffer::clean_low_latency_buffer(int num_max_dispatch_tokens_per_rank,`
`1625`	`1625`	`#endif`
`1626`	`1626`	`}`
`1627`	`1627`
	`1628`	`+void Buffer::barrier_all() { internode_ll::barrier_all(calc_ctx->stream()); }`
	`1629`	`+`
`1628`	`1630`	`#ifdef PADDLE_WITH_NVSHMEM`
`1629`	`1631`	`std::tuple<deep_ep::detail::Tensor,`
`1630`	`1632`	`std::optional<deep_ep::detail::Tensor>,`