
Commit 422850e

fix moe_combine bug.
1 parent 89402c3 · commit 422850e

File tree

6 files changed: +10 −11 lines


paddle/phi/kernels/gpu/moe_combine_grad_kernel.cu

Lines changed: 3 additions & 0 deletions
@@ -1,6 +1,7 @@
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/moe_combine_grad_kernel.h"
+#include "paddle/phi/kernels/full_kernel.h"
 namespace phi {
 
 template <typename T>
@@ -129,6 +130,8 @@ void MoeCombineGradKernel(const Context& dev_ctx,
                           DenseTensor* grad_combine_weights_helper) {
   dev_ctx.template Alloc<T>(grad_x);
   dev_ctx.template Alloc<T>(grad_combine_weights_helper);
+  phi::Full<T, Context>(dev_ctx, phi::IntArray(common::vectorize(grad_x->dims())), 0, grad_x);
+  phi::Full<T, Context>(dev_ctx, phi::IntArray(common::vectorize(grad_combine_weights_helper->dims())), 0, grad_combine_weights_helper);
   auto x_shape = x.dims();
   auto combine_weights_shape = combine_weights.dims();
   moe_combine_bwd<T, Context>(dev_ctx,
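
Alloc only reserves device memory; it does not clear it. The two added phi::Full calls make the zero fill explicit, which matters if the backward kernel accumulates into grad_x rather than overwriting every element; rows that belong to dropped tokens, for example, may never be touched at all. A toy numpy sketch of that failure mode (the shapes and the accumulation loop are illustrative assumptions, not Paddle's actual kernel):

import numpy as np

def combine_bwd(grad_y, w, scatter_index, grad_x):
    # backward of y[t] = sum_j w[t, j] * x[scatter_index[t, j]]:
    # each referenced row of grad_x receives one weighted copy of grad_y
    for t in range(w.shape[0]):
        for j in range(w.shape[1]):
            grad_x[scatter_index[t, j]] += w[t, j] * grad_y[t]
    return grad_x

s, k, d, rows = 3, 2, 4, 8  # rows 6 and 7 are never referenced ("dropped")
grad_y = np.random.rand(s, d).astype("float32")
w = np.random.rand(s, k).astype("float32")
scatter_index = np.arange(s * k).reshape(s, k)

zeroed = combine_bwd(grad_y, w, scatter_index, np.zeros((rows, d), "float32"))
stale = combine_bwd(grad_y, w, scatter_index, np.full((rows, d), np.nan, "float32"))
print(np.isnan(zeroed).any(), np.isnan(stale).any())  # False True: untouched rows stay stale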

paddle/phi/kernels/gpu/moe_combine_kernel.cu

Lines changed: 2 additions & 0 deletions
@@ -1,6 +1,7 @@
 #include "paddle/phi/kernels/moe_combine_kernel.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/full_kernel.h"
 
 namespace phi {
 
@@ -91,6 +92,7 @@ void moe_combine_fwd(const Context& dev_ctx,
                      DenseTensor* y) {
   dev_ctx.template Alloc<T>(y);  // T cannot support phi::dtype::float8 very
                                  // well, maybe replaced with x.dtype();
+  phi::Full<T, Context>(dev_ctx, phi::IntArray(common::vectorize(y->dims())), 0, y);
   auto combine_weights_shape = combine_weights.dims();
   auto x_shape = x.dims();
   moe_combine_fwd<T, Context>(dev_ctx,
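
The forward output gets the same treatment. If the kernel accumulates the k weighted expert rows into y in global memory, the first add reads whatever Alloc left behind. Again a toy numpy stand-in under the same assumptions, not Paddle's kernel:

import numpy as np

def combine_fwd(x, w, scatter_index, y):
    # y[t] += w[t, j] * x[scatter_index[t, j]] for each of the token's k rows
    for t in range(w.shape[0]):
        for j in range(w.shape[1]):
            y[t] += w[t, j] * x[scatter_index[t, j]]
    return y

s, k, d = 4, 2, 8
x = np.random.rand(s * k, d).astype("float32")
w = np.random.rand(s, k).astype("float32")
scatter_index = np.arange(s * k).reshape(s, k)

clean = combine_fwd(x, w, scatter_index, np.zeros((s, d), "float32"))
dirty = combine_fwd(x, w, scatter_index, np.full((s, d), 123.0, "float32"))
print(np.allclose(clean, dirty))  # False: the stale 123s leak into every row of y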

paddle/phi/kernels/gpu/moe_ops_partial_nosoftmaxtopk_grad_kernel.cu

Lines changed: 0 additions & 1 deletion
@@ -110,7 +110,6 @@ void MoeGateDispatchPartialNoSoftMaxTopkGradKernel(const Context& dev_ctx,
                                                    int64_t expert_end_index,
                                                    DenseTensor* x_grad,
                                                    DenseTensor* combine_weights_grad){
-  printf("MoeGateDispatchPartialNoSoftMaxTopkGradKernel begin\n");
   dev_ctx.template Alloc<T>(x_grad);
   dev_ctx.template Alloc<float>(combine_weights_grad);
   // DenseTensor t_scatter_index;

test/legacy_test/ernie_utils/moe_layer_uneven.py

Lines changed: 3 additions & 5 deletions
@@ -218,7 +218,6 @@ def forward(ctx, x, combine_weights, scatter_index):
 
     @staticmethod
     def backward(ctx, grad_y, *_):
-        '''
         """
         Input:
             grad_y: [seqlen, hidden_size]
@@ -243,10 +242,9 @@ def backward(ctx, grad_y, *_):
         # grad_combine_weight_helper is the same shape with grad x [seqlen * K, dim]
         # reduce the hidden shape
         # TODO: implement reduce in cuda ops
-        #grad_combine_weight = grad_combine_weight_helper.sum(-1)
-        #return grad_x, grad_combine_weight.reshape(ctx.combine_weights.shape), None
-        return grad_x, grad_combine_weight_helper
-        '''
+        grad_combine_weight = grad_combine_weight_helper.sum(-1)
+        return grad_x, grad_combine_weight.reshape(ctx.combine_weights.shape), None
+        #return grad_x, grad_combine_weight_helper
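
The re-enabled branch turns the helper back into the combine-weights gradient. Per the comment above it, grad_combine_weight_helper has the same shape as grad_x ([seqlen * K, dim]), so each weight's gradient is a dot product over the hidden dimension, recovered here by sum(-1) plus a reshape. A numpy sketch of that shape algebra (the helper construction is an assumption read off the comments):

import numpy as np

s, k, d = 4, 2, 8
grad_y = np.random.rand(s, d).astype("float32")
x = np.random.rand(s * k, d).astype("float32")
scatter_index = np.arange(s * k).reshape(s, k)

# helper row i pairs token i // k with slot i % k: grad_y[token] * x[row]
helper = grad_y.repeat(k, axis=0) * x[scatter_index.reshape(-1)]  # [s * k, d]

# what the patch computes: reduce the hidden dim, reshape to combine_weights.shape
grad_w = helper.sum(-1).reshape(s, k)

# the same quantity directly: d y[t] / d w[t, j] = <grad_y[t], x[scatter[t, j]]>
print(np.allclose(grad_w, np.einsum("td,tjd->tj", grad_y, x[scatter_index])))  # True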

test/legacy_test/test_incubate_moe_combine.py

Lines changed: 2 additions & 2 deletions
@@ -66,8 +66,8 @@ def test_moe_combine(x_numpy, combine_weights_numpy, scatter_index_numpy, grad_n
     grad = paddle.to_tensor(grad_numpy).cast("float32")
 
     y = GateCombine.apply(x, combine_weights, scatter_index)
-    #paddle.autograd.backward([y], [grad], True)
-    grad.backward()
+    paddle.autograd.backward([y], [grad], True)
+    #grad.backward()
     return [x.grad, combine_weights.grad, y]
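
The swap in the test is about where the autograd graph lives: grad is a constant built from numpy, so grad.backward() differentiates grad's own empty graph and never reaches x, while paddle.autograd.backward([y], [grad], True) seeds dy with grad and back-propagates through y. A standalone toy version (the two-line graph stands in for GateCombine.apply):

import paddle

x = paddle.randn([4, 8])
x.stop_gradient = False
y = (x * 2).sum(-1)         # stand-in for GateCombine.apply(...)
seed = paddle.ones_like(y)  # plays the role of `grad`: a constant with no graph

paddle.autograd.backward([y], [seed], True)  # seeds dy, reaches x through y
print(x.grad.shape)  # [4, 8]; backward through `seed` alone never updates x.grad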

test/legacy_test/test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk.py

Lines changed: 0 additions & 3 deletions
@@ -12,7 +12,6 @@
 
 
 def test_moe_dispatch_partial_nosoftmaxtopk_nonepad_op():
-    import moe_ops_partial_nosoftmaxtopk
 
     s, d, e = 4, 100, 8
     k, cap = 4, 3
@@ -137,7 +136,6 @@ def check_ascend(index_rev, chunks):
 
 
 def test_moe_ops_partial_nosoftmaxtopk_w_reverse_token_drop():
-    import moe_ops_partial_nosoftmaxtopk
 
     S, E, D = 3, 4, 3
     k = 2
@@ -162,7 +160,6 @@ def test_moe_ops_partial_nosoftmaxtopk_w_reverse_token_drop():
 
 
 def test_moe_ops_partial_nosoftmax_topk_empty_output():
-    import moe_ops_partial_nosoftmaxtopk
 
     S, E, D = 3, 4, 3
     k = 2

0 commit comments
