Commit 361ff30

Update deep_ep::internode_ll and add barrier function
1 parent 585e852 commit 361ff30

File tree: 7 files changed, +105 -41 lines changed


paddle/fluid/distributed/collective/deep_ep/deep_ep.cpp
Lines changed: 2 additions & 0 deletions

@@ -1625,6 +1625,8 @@ void Buffer::clean_low_latency_buffer(int num_max_dispatch_tokens_per_rank,
 #endif
 }
 
+void Buffer::barrier_all() { internode_ll::barrier_all(calc_ctx->stream()); }
+
 #ifdef PADDLE_WITH_NVSHMEM
 std::tuple<deep_ep::detail::Tensor,
            std::optional<deep_ep::detail::Tensor>,

paddle/fluid/distributed/collective/deep_ep/deep_ep.hpp
Lines changed: 1 addition & 0 deletions

@@ -251,6 +251,7 @@ struct Buffer {
   void clean_low_latency_buffer(int num_max_dispatch_tokens_per_rank,
                                 int hidden,
                                 int num_experts);
+  void barrier_all();
 
 #ifdef PADDLE_WITH_NVSHMEM
   std::tuple<deep_ep::detail::Tensor,

paddle/fluid/distributed/collective/deep_ep/kernels/api.cuh
Lines changed: 1 addition & 0 deletions

@@ -288,6 +288,7 @@ void combine(cudaDataType_t type,
 // Internode low-latency kernels
 namespace internode_ll {
 
+void barrier_all(cudaStream_t stream);
 void clean_low_latency_buffer(int* clean_0,
                               int num_clean_int_0,
                               int* clean_1,
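
The three hunks above wire a single call chain: Buffer::barrier_all() forwards to internode_ll::barrier_all(calc_ctx->stream()), which launches a kernel that runs a device-side NVSHMEM barrier (see the internode_ll.cu diff below). A minimal standalone sketch of that launch, assuming an initialized NVSHMEM job and substituting a plain kernel launch for the repo's SETUP_LAUNCH_CONFIG/LAUNCH_KERNEL macros:

#include <cuda_runtime.h>
#include <nvshmemx.h>

// One block, one thread: nvshmemx_barrier_all_block() synchronizes this PE
// with every other PE in the job once the block reaches the call.
__global__ void barrier_all_kernel() { nvshmemx_barrier_all_block(); }

// Host wrapper mirroring internode_ll::barrier_all(cudaStream_t): launching
// on `stream` orders the barrier after all work previously queued there.
void barrier_all_sketch(cudaStream_t stream) {
  barrier_all_kernel<<<1, 1, 0, stream>>>();
}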

paddle/fluid/distributed/collective/deep_ep/kernels/ibgda_device.cuh
Lines changed: 63 additions & 0 deletions

@@ -82,6 +82,12 @@ uint16_t HtoBE16(uint16_t x) {
 
 typedef struct mlx5_wqe_ctrl_seg __attribute__((__aligned__(8))) ibgda_ctrl_seg_t;
 
+typedef struct {
+  uint32_t add_data;
+  uint32_t field_boundary;
+  uint64_t reserved;
+} __attribute__((__packed__)) ibgda_atomic_32_masked_fa_seg_t;
+
 __device__ static __forceinline__
 nvshmemi_ibgda_device_state_t* ibgda_get_state() {
   return &nvshmemi_ibgda_device_state_d;
@@ -439,4 +445,61 @@ nvshmemi_ibgda_put_nbi_warp(uint64_t req_rptr, uint64_t req_lptr, size_t bytes,
   __syncwarp();
 }
 
+__device__ static __forceinline__ void ibgda_write_amo_add_wqe(
+    nvshmemi_ibgda_device_qp_t *qp, const int &value,
+    uint64_t laddr, __be32 lkey, uint64_t raddr, __be32 rkey,
+    uint16_t wqe_idx, void **out_wqes) {
+  ibgda_ctrl_seg_t ctrl_seg = {0};
+  struct mlx5_wqe_raddr_seg raddr_seg;
+  struct mlx5_wqe_atomic_seg atomic_seg_1;
+  struct mlx5_wqe_data_seg data_seg;
+
+  auto ctrl_seg_ptr = reinterpret_cast<ibgda_ctrl_seg_t*>(out_wqes[0]);
+  auto raddr_seg_ptr = reinterpret_cast<mlx5_wqe_raddr_seg*>(
+      reinterpret_cast<uintptr_t>(ctrl_seg_ptr) + sizeof(*ctrl_seg_ptr));
+  auto atomic_seg_ptr = reinterpret_cast<mlx5_wqe_atomic_seg*>(
+      reinterpret_cast<uintptr_t>(raddr_seg_ptr) + sizeof(*raddr_seg_ptr));
+  auto data_seg_ptr = reinterpret_cast<mlx5_wqe_data_seg*>(
+      reinterpret_cast<uintptr_t>(atomic_seg_ptr) + sizeof(*atomic_seg_ptr));
+
+  raddr_seg.raddr = HtoBE64(raddr);
+  raddr_seg.rkey = rkey;
+  raddr_seg.reserved = 0;
+
+  // NOTES: `0x08000000` means `IBGDA_4_BYTE_EXT_AMO_OPMOD`
+  ctrl_seg.opmod_idx_opcode =
+      HtoBE32(MLX5_OPCODE_ATOMIC_MASKED_FA | (wqe_idx << 8) | 0x08000000);
+  auto atomic_32_masked_fa_seg =
+      reinterpret_cast<ibgda_atomic_32_masked_fa_seg_t*>(&atomic_seg_1);
+  atomic_32_masked_fa_seg->add_data = HtoBE32(value);
+  atomic_32_masked_fa_seg->field_boundary = 0;
+
+  ctrl_seg.qpn_ds = HtoBE32((qp->qpn << 8) | 4);
+  ctrl_seg.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
+
+  data_seg.byte_count = HtoBE32(sizeof(int));
+  data_seg.lkey = lkey;
+  data_seg.addr = HtoBE64(laddr);
+
+  EP_STATIC_ASSERT(sizeof(*ctrl_seg_ptr) == sizeof(int4), "Invalid vectorization");
+  EP_STATIC_ASSERT(sizeof(*raddr_seg_ptr) == sizeof(int4), "Invalid vectorization");
+  EP_STATIC_ASSERT(sizeof(*atomic_seg_ptr) == sizeof(int4), "Invalid vectorization");
+  EP_STATIC_ASSERT(sizeof(*data_seg_ptr) == sizeof(int4), "Invalid vectorization");
+  st_na_relaxed(reinterpret_cast<int4*>(ctrl_seg_ptr), *reinterpret_cast<int4*>(&ctrl_seg));
+  st_na_relaxed(reinterpret_cast<int4*>(raddr_seg_ptr), *reinterpret_cast<int4*>(&raddr_seg));
+  st_na_relaxed(reinterpret_cast<int4*>(atomic_seg_ptr), *reinterpret_cast<int4*>(&atomic_seg_1));
+  st_na_relaxed(reinterpret_cast<int4*>(data_seg_ptr), *reinterpret_cast<int4*>(&data_seg));
+}
+
+__device__ __forceinline__ void nvshmemi_ibgda_amo_nonfetch_add(
+    void *rptr, const int& value, int pe, int qp_id) {
+  nvshmemi_ibgda_device_qp_t *qp = ibgda_get_rc(pe, qp_id);
+
+  __be32 rkey;
+  uint64_t raddr;
+  ibgda_get_rkey(reinterpret_cast<uint64_t>(rptr), pe, &raddr, &rkey);
+
+  uint64_t my_wqe_idx = ibgda_reserve_wqe_slots(qp, 1);
+  void *wqe_ptrs = ibgda_get_wqe_ptr(qp, my_wqe_idx);
+
+  ibgda_write_amo_add_wqe(qp, value, reinterpret_cast<uint64_t>(qp->ibuf.buf),
+                          qp->ibuf.lkey, raddr, rkey, my_wqe_idx, &wqe_ptrs);
+
+  ibgda_submit_requests<true>(qp, my_wqe_idx, 1);
+}
+
 } // namespace deep_ep
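
The new WQE writer lays four 16-byte segments into the queue slot: control, remote address, atomic argument, and local data segment. The EP_STATIC_ASSERTs pin each to sizeof(int4), which is also why qpn_ds ends in `| 4` (a DS count of four 16-byte units). A host-side sketch of the two control-word packings, assuming MLX5_OPCODE_ATOMIC_MASKED_FA carries the upstream mlx5 value 0x15 (the 0x08000000 opmod comes from the diff's own comment):

#include <cstdint>
#include <cstdio>

// Assumption: 0x15 is MLX5_OPCODE_ATOMIC_MASKED_FA in the mlx5 headers.
static uint32_t pack_opmod_idx_opcode(uint16_t wqe_idx) {
  const uint32_t kOpcodeMaskedFetchAdd = 0x15;    // assumed opcode value
  const uint32_t k4ByteExtAmoOpmod = 0x08000000;  // from the diff's comment
  return kOpcodeMaskedFetchAdd | (uint32_t{wqe_idx} << 8) | k4ByteExtAmoOpmod;
}

// DS count = 4: ctrl + raddr + atomic + data, 16 bytes each = one 64B WQE.
static uint32_t pack_qpn_ds(uint32_t qpn) { return (qpn << 8) | 4; }

int main() {
  printf("opmod_idx_opcode = 0x%08x\n", pack_opmod_idx_opcode(7));  // 0x08000715
  printf("qpn_ds           = 0x%08x\n", pack_qpn_ds(0x1234));       // 0x00123404
}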

paddle/fluid/distributed/collective/deep_ep/kernels/internode_ll.cu
Lines changed: 34 additions & 41 deletions

@@ -34,6 +34,15 @@ namespace deep_ep {
 
 namespace internode_ll {
 
+__global__ void barrier_all() { nvshmemx_barrier_all_block(); }
+
+void barrier_all(cudaStream_t stream) {
+  constexpr int kNumThreads = 1;
+
+  SETUP_LAUNCH_CONFIG(1, kNumThreads, stream);
+  LAUNCH_KERNEL(&cfg, barrier_all);
+}
+
 template <int kNumThreads>
 __launch_bounds__(kNumThreads, 1) __global__ void clean_low_latency_buffer(
     int* clean_0, int num_clean_int_0, int* clean_1, int num_clean_int_1) {
@@ -112,7 +121,6 @@ __global__ __launch_bounds__(
 
   // Message package: hidden data, FP8 scales, index at source
   // NOTES: currently we have 3 reserved int fields for future use
-
   using vec_t = typename std::conditional<kUseFP8, int2, int4>::type;
   const size_t num_bytes_per_msg =
       sizeof(int4) + (kUseFP8 ? (kHidden + num_scales * sizeof(float))
@@ -305,13 +313,11 @@ __global__ __launch_bounds__(
                 responsible_expert_idx) != FINISHED_SUM_TAG * 2) {
       }
       if (dst_rank != rank) {
-        nvshmemi_ibgda_rma_p(
+        nvshmemi_ibgda_amo_nonfetch_add(
             rdma_recv_count + dst_expert_local_idx * num_ranks + rank,
             -num_tokens_sent - 1,
             dst_rank,
-            dst_expert_local_idx,
-            0);
-        nvshmemi_ibgda_prepare_recvs(dst_rank, dst_expert_local_idx);
+            dst_expert_local_idx);
       } else {
         st_na_release(rdma_recv_count + dst_expert_local_idx * num_ranks + rank,
                       -num_tokens_sent - 1);
@@ -366,16 +372,10 @@ LOW_LATENCY_DISPATCH_RECV:
     EP_STATIC_ASSERT(kNumWarpsPerGroup > 1,
                      "Requires more than one warp per group");
     if (sub_warp_id == 1 && lane_id == 0) {
-      if (src_rank != rank) {
-        nvshmemi_ibgda_poll_recv(src_rank, local_expert_idx);
-        num_recv_tokens = ld_acquire_sys_global(
-            rdma_recv_count + local_expert_idx * num_ranks + src_rank);
-        EP_DEVICE_ASSERT(num_recv_tokens != 0);
-      } else {
-        while ((num_recv_tokens = ld_acquire_global(
-                    rdma_recv_count + local_expert_idx * num_ranks +
-                    src_rank)) == 0) {
-        }
+      // printf("enter recv meta\n");
+      while ((num_recv_tokens = ld_acquire_global(
+                  rdma_recv_count + local_expert_idx * num_ranks + src_rank)) ==
+             0) {
       }
       num_recv_tokens = -num_recv_tokens - 1;
       recv_token_begin_idx =
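
These two hunks change the dispatch completion protocol: the sender now bumps the receiver's rdma_recv_count slot with an IBGDA atomic add instead of an rma_p write paired with prepare_recvs/poll_recv CQ polling, so local and remote sources are drained by the same acquire-load spin. A self-contained CUDA sketch of that wait, with a volatile load plus fence standing in for the repo's ld_acquire_global (an assumption about its semantics):

#include <cuda_runtime.h>

// Assumption: ld_acquire_global is an acquire load from global memory; a
// volatile load followed by a fence is used here as a stand-in.
__device__ __forceinline__ int acquire_load_sketch(const int* ptr) {
  int v = *reinterpret_cast<const volatile int*>(ptr);
  __threadfence();  // keep later reads ordered after the flag observation
  return v;
}

// Spin until the counter slot (bumped remotely by
// nvshmemi_ibgda_amo_nonfetch_add, or locally by st_na_release) is nonzero.
__device__ int wait_recv_count(const int* slot) {
  int v;
  while ((v = acquire_load_sketch(slot)) == 0) {
  }
  return -v - 1;  // the sender encodes the token count as -(n + 1)
}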
@@ -539,7 +539,8 @@ __global__ __launch_bounds__(
     int num_experts,
     int rank,
     int num_ranks,
-    int phases) {
+    int phases,
+    bool zero_copy) {
   const auto sm_id = static_cast<int>(blockIdx.x);
   const auto num_sms = static_cast<int>(gridDim.x);
   const auto thread_id = static_cast<int>(threadIdx.x);
@@ -580,7 +581,9 @@ __global__ __launch_bounds__(
     const auto local_expert_idx = responsible_expert_idx % num_local_experts;
     const auto global_expert_idx = rank * num_local_experts + local_expert_idx;
     const auto layout =
-        __ldg(layout_range + local_expert_idx * num_ranks + dst_rank);
+        __ldg(layout_range + local_expert_idx * num_ranks +
+              dst_rank);  // num_recv_tokens, recv_token_begin_idx
+
     const auto local_x = reinterpret_cast<const int4*>(x) +
                          local_expert_idx * num_ranks *
                              num_max_dispatch_tokens_per_rank *
@@ -625,13 +628,14 @@ __global__ __launch_bounds__(
                            st_na_global);
       } else {
         const auto buf_int4_ptr = reinterpret_cast<int4*>(buf_ptr);
-        UNROLLED_WARP_COPY(7,
-                           lane_id,
-                           hidden_bf16_int4,
-                           buf_int4_ptr,
-                           x_int4,
-                           ld_nc_global,
-                           st_na_global);
+        if (!zero_copy)
+          UNROLLED_WARP_COPY(7,
+                             lane_id,
+                             hidden_bf16_int4,
+                             buf_int4_ptr,
+                             x_int4,
+                             ld_nc_global,
+                             st_na_global);
         nvshmemi_ibgda_put_nbi_warp(dst_ptr,
                                     buf_ptr,
                                     hidden * sizeof(nv_bfloat16),
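
The new zero_copy flag guards only the staging copy: when set, the payload is assumed to already reside in the RDMA send buffer, and the kernel goes straight to the network put, which always reads from that buffer. A minimal CUDA sketch of the branch (names hypothetical, the put is stubbed):

#include <cuda_runtime.h>

__device__ void rdma_put_stub(const int4* /*buf*/, int /*n*/) {
  // Stand-in for nvshmemi_ibgda_put_nbi_warp; always sources from rdma_buf.
}

__global__ void send_sketch(int4* rdma_buf, const int4* x, int n,
                            bool zero_copy) {
  if (!zero_copy) {
    // Staging copy striped across the block (UNROLLED_WARP_COPY analogue).
    for (int i = threadIdx.x; i < n; i += blockDim.x) rdma_buf[i] = x[i];
  }
  rdma_put_stub(rdma_buf, n);
}

Note that the host-side combine launcher (last hunk of this file) currently hard-wires the flag to false, so the staging copy stays on by default.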
@@ -651,11 +655,8 @@ __global__ __launch_bounds__(
       while (ld_acquire_global(atomic_clean_flag) == 0) {
       }
       if (dst_rank != rank) {
-        nvshmemi_ibgda_rma_p(rdma_recv_flag + global_expert_idx,
-                             1,
-                             dst_rank,
-                             local_expert_idx,
-                             0);
+        nvshmemi_ibgda_amo_nonfetch_add(
+            rdma_recv_flag + global_expert_idx, 1, dst_rank, local_expert_idx);
       } else {
         st_na_release(rdma_recv_flag + global_expert_idx, 1);
       }
@@ -672,18 +673,9 @@ LOW_LATENCY_COMBINE_RECV:
   if (responsible_expert_idx < num_experts) {
     EP_STATIC_ASSERT(kNumWarpsPerGroup > 1,
                      "Invalid number of warps per group");
-    if (sub_warp_id == 0 && lane_id == 0) {
-      // TODO(Xreki): refactor QP indices
-      auto src_rank = responsible_expert_idx / num_local_experts;
-      auto src_expert_idx = responsible_expert_idx % num_local_experts;
-      if (src_rank != rank) {
-        nvshmemi_ibgda_poll_recv(src_rank, src_expert_idx);
-      } else {
-        while (ld_acquire_global(rdma_recv_flag + responsible_expert_idx) ==
-               0) {
-        }
+    if (sub_warp_id == 0 && lane_id == 0)
+      while (ld_acquire_global(rdma_recv_flag + responsible_expert_idx) == 0) {
       }
-    }
   }
   cg::this_grid().sync();
 
@@ -796,7 +788,8 @@ void combine(void* combined_x,
                      num_experts, \
                      rank, \
                      num_ranks, \
-                     phases); \
+                     phases, \
+                     false); \
   } \
   break

paddle/fluid/pybind/deep_ep_api.cc
Lines changed: 1 addition & 0 deletions

@@ -89,6 +89,7 @@ void BindDeepEPApi(pybind11::module *m) {
       .def("intranode_combine", &deep_ep::Buffer::intranode_combine_api)
       .def("internode_dispatch", &deep_ep::Buffer::internode_dispatch_api)
       .def("internode_combine", &deep_ep::Buffer::internode_combine_api)
+      .def("barrier_all", &deep_ep::Buffer::barrier_all)
       .def("clean_low_latency_buffer",
            &deep_ep::Buffer::clean_low_latency_buffer)
       .def("low_latency_dispatch", &deep_ep::Buffer::low_latency_dispatch_api)

python/paddle/distributed/communication/deep_ep/buffer.py
Lines changed: 3 additions & 0 deletions

@@ -736,6 +736,9 @@ def internode_combine(
         )
         return combined_x, combined_topk_weights, EventOverlap(event)
 
+    def barrier_all(self):
+        self.runtime.barrier_all()
+
     def clean_low_latency_buffer(
         self,
         num_max_dispatch_tokens_per_rank: int,
