OP move task from ernie-core to framework #72957


Merged — 79 commits, merged Jun 4, 2025.

Commits (changes from all commits):
77b7b8a
init
pesionzhao May 20, 2025
492ac04
insert moe_combine
pesionzhao May 21, 2025
22e3643
init
pesionzhao May 21, 2025
bebaf44
update yaml
pesionzhao May 21, 2025
df96e81
update python API
pesionzhao May 21, 2025
4c93cdd
Merge branch 'develop' into zps_insert
pesionzhao May 21, 2025
9c7cf25
delete useless header file
pesionzhao May 21, 2025
81efa89
remove supported by DCU
pesionzhao May 21, 2025
f694787
merge from zps_insert
pesionzhao May 21, 2025
56ce7d2
add expand_modality_expert_id kernel
pesionzhao May 21, 2025
e2e6ac7
reorder the new code and refine OP type
pesionzhao May 21, 2025
ddf247c
add unit test
pesionzhao May 22, 2025
ad82cc8
add cal_aux_loss_op and build_src_rank_and_local_expert_id_op
feixi21 May 22, 2025
8ade3ac
moegatedispatch init
pesionzhao May 23, 2025
b8c9636
insert moegatedispatch
pesionzhao May 23, 2025
e5bfdc9
remove DCU support
pesionzhao May 23, 2025
81d9fbc
fix-bugs
feixi21 May 22, 2025
4c04429
fix log2 in windows maybe
pesionzhao May 23, 2025
fb784e3
update header file format
pesionzhao May 23, 2025
d25d23e
fix-bugs
feixi21 May 23, 2025
47f010d
delete op test for pass CI
pesionzhao May 23, 2025
80bd65f
add cmath header
pesionzhao May 23, 2025
9e889aa
tmp
pesionzhao May 23, 2025
c78a3cb
pass int_bincount
May 23, 2025
19085da
add moe_dispatch_bwd
feixi21 May 23, 2025
aca0c5d
add moe_gate_dispatch
feixi21 May 23, 2025
3e4e392
fix-bugs
feixi21 May 23, 2025
0242f9c
fix optional Tensor
feixi21 May 25, 2025
08a93f8
Merge commit 'refs/pull/72835/head (moe_combine and expand_modality_e…
A-nnonymous May 26, 2025
9b09cff
update cal_aux_loss_kernel
feixi21 May 26, 2025
3b3c98c
Finished moe_combine & expand_modality_expert_id integrate and optests.
A-nnonymous May 26, 2025
9e84e5b
add python interface
feixi21 May 26, 2025
09ea5b4
Add Li Zhou's stage 1 PR
A-nnonymous May 26, 2025
6b81588
nosoftmax forward has finished
pesionzhao May 26, 2025
829bad3
finishi fused_rms_norm fwd
May 26, 2025
4f5ede6
finish rms_norm bwd
May 26, 2025
8797589
finish rms norm bwd
May 26, 2025
b321097
add optional in ops.yaml
feixi21 May 27, 2025
3e406ca
nosoftmax bwd has finished
pesionzhao May 27, 2025
9105be0
update python api
pesionzhao May 27, 2025
0f09636
Verified cal_aux_loss op and bwd.
A-nnonymous May 27, 2025
a12713b
Verified build_src_rank_and_local_expert_id
A-nnonymous May 27, 2025
aef7e62
Merged Huaijin Zheng's PR#72909
A-nnonymous May 27, 2025
8aa9b33
gate_dispatch_permute has finished
pesionzhao May 27, 2025
cdece27
finished all work
pesionzhao May 27, 2025
7e529c3
Verified fused_rms_norm_ext(with bwd) and int_bincount.
A-nnonymous May 28, 2025
0d16972
Merge commit 'refs/pull/72835/head(peisen's stage2)' of https://githu…
A-nnonymous May 28, 2025
f53449f
Fix conflict
A-nnonymous May 28, 2025
5d2e65d
Add stage2 fwd and bwd optests.
A-nnonymous May 28, 2025
b0dc90c
Clean print
A-nnonymous May 28, 2025
9cd02e8
Fix conflict, move some headers.
A-nnonymous May 28, 2025
e8e494e
sync with dev
A-nnonymous May 28, 2025
f3d7320
Add incubate port.
A-nnonymous May 28, 2025
c76a357
fix miscs
A-nnonymous May 28, 2025
d4a3472
Fix module issue
A-nnonymous May 28, 2025
36a450f
Add missing yamls
A-nnonymous May 28, 2025
89402c3
Fix stale package problems
A-nnonymous May 28, 2025
422850e
fix moe_combine bug.
A-nnonymous May 28, 2025
4c6ab13
Fix miscs
A-nnonymous May 28, 2025
81aac42
Align with original initializations.
A-nnonymous May 28, 2025
82acb48
fix typos and pre-commit warnings
A-nnonymous May 28, 2025
bba0636
Fix miscs
A-nnonymous May 28, 2025
fcc4f81
try to pass CI
pesionzhao May 29, 2025
29e0b02
format header file
pesionzhao May 29, 2025
c5cab08
remove win32 supported
pesionzhao May 29, 2025
6a1a318
check OP type
pesionzhao May 30, 2025
d290b3b
remove optest for WIN & APPLE
pesionzhao May 30, 2025
31d6465
fix bug for (int32_t and int)
pesionzhao May 30, 2025
0b990d6
rename fused_rms_norm op
pesionzhao May 30, 2025
2fe0ef3
select op test env not for Volta
pesionzhao May 30, 2025
e8a30df
fix openblas mistake
pesionzhao May 30, 2025
4fd6186
CMake code format
pesionzhao Jun 3, 2025
ebd2244
fix bugs in CPU
pesionzhao Jun 3, 2025
10f6058
CodeStyle format
pesionzhao Jun 3, 2025
0637a02
fix bugs in CPU
pesionzhao Jun 3, 2025
e4ecf9b
fix bugs in CPU
pesionzhao Jun 3, 2025
54dda45
skip some op when CUDA<12.0
pesionzhao Jun 3, 2025
0d1b3d0
skip op when CUDA<12.0
pesionzhao Jun 3, 2025
8e0817a
fix bugs in CPU
pesionzhao Jun 4, 2025
185 changes: 185 additions & 0 deletions paddle/phi/infermeta/backward.cc
@@ -1217,6 +1217,106 @@ void MeshgridGradInferMeta(const std::vector<const MetaTensor*>& inputs,
}
}

void MoeCombineGradInferMeta(const MetaTensor& x,
const MetaTensor& combine_weights,
const MetaTensor& scatter_index,
const MetaTensor& y,
MetaTensor* grad_x,
MetaTensor* grad_combine_weights_helper) {
auto x_dim = x.dims();
auto combine_weights_shape = combine_weights.dims();
PADDLE_ENFORCE_EQ(
x_dim.size(),
2,
      errors::InvalidArgument("The input X should have 2 dimensions, "
                              "but received X's dimension = %d.",
                              x_dim.size()));
PADDLE_ENFORCE_EQ(
(scatter_index.dtype() == phi::DataType::INT32),
true,
      errors::InvalidArgument("The input scatter_index type should be int32, "
                              "but received scatter_index type = %s.",
                              scatter_index.dtype()));
grad_x->set_dims(common::make_ddim({x_dim[0], x_dim[1]}));
grad_x->set_dtype(x.dtype());
grad_combine_weights_helper->set_dims(common::make_ddim(
{combine_weights_shape[0], combine_weights_shape[1], x_dim[1]}));
grad_combine_weights_helper->set_dtype(x.dtype());
}
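The infer-meta above only propagates shapes and dtypes: `grad_x` keeps `x`'s 2-D shape, and the combine-weights helper gains a trailing hidden dimension. A small Python mirror of that shape arithmetic (hypothetical helper name; it restates the C++ logic above, not a Paddle API):

```python
def moe_combine_grad_infer_shapes(x_shape, combine_weights_shape):
    # Mirrors MoeCombineGradInferMeta: grad_x keeps x's shape; the helper
    # for combine_weights gets an extra trailing hidden dimension.
    assert len(x_shape) == 2, "The input X should have 2 dimensions"
    grad_x_shape = [x_shape[0], x_shape[1]]
    grad_cw_helper_shape = [combine_weights_shape[0],
                            combine_weights_shape[1],
                            x_shape[1]]
    return grad_x_shape, grad_cw_helper_shape

# e.g. 8 dispatched tokens with hidden size 16 and top-k = 2 routing
gx, gw = moe_combine_grad_infer_shapes([8, 16], [8, 2])
```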

void MoeGateDispatchPartialNoSoftmaxTopkGradInferMeta(
const MetaTensor& combine_weights_out,
const MetaTensor& scatter_index,
const MetaTensor& scatter_index_rev,
const MetaTensor& expert_offset,
const MetaTensor& expert_offset_local,
const MetaTensor& y_grad,
const MetaTensor& combine_weights_out_grad,
int64_t k,
int64_t capacity,
bool use_pad,
int64_t expert_start_index,
int64_t expert_end_index,
MetaTensor* x_grad,
MetaTensor* combine_weights_grad) {
int64_t num_experts = expert_offset.dims()[0];
int64_t hidden_size = y_grad.dims()[1];
int64_t num_rows = scatter_index.dims()[1];
PADDLE_ENFORCE_GT(num_experts,
0,
common::errors::InvalidArgument(
"Input num_experts should be greater than 0"));
PADDLE_ENFORCE_EQ((expert_offset.dtype() == phi::DataType::INT64),
true,
common::errors::InvalidArgument(
"Input expert_offset type should be int64"));
if (use_pad) {
PADDLE_ENFORCE_GE(num_experts,
y_grad.dims()[0] / capacity,
common::errors::InvalidArgument(
"Number of experts should be greater than or equal "
"to y_grad.dims()[0]/capacity"));
} else {
PADDLE_ENFORCE_GT(y_grad.dims()[0],
0,
common::errors::InvalidArgument(
"Input y_grad.dims()[0] should be greater than 0"));
}
combine_weights_grad->set_dims(combine_weights_out_grad.dims());
combine_weights_grad->set_dtype(phi::DataType::FLOAT32);
x_grad->set_dims({num_rows, hidden_size});
x_grad->set_dtype(y_grad.dtype());
}

void MoeGateDispatchPermuteGradInferMeta(const MetaTensor& combine_weights,
const MetaTensor& scatter_index,
const MetaTensor& expert_id,
const MetaTensor& y_grad,
const MetaTensor& combine_weights_grad,
int64_t k,
int64_t capacity,
int64_t world_size,
MetaTensor* x_grad,
MetaTensor* gate_logits_grad) {
auto y_grad_dims = y_grad.dims();
PADDLE_ENFORCE_EQ(
y_grad_dims[1],
world_size,
common::errors::InvalidArgument(
"The second dimension of y_grad should be equal to world_size, but "
"received y_grad_dims[1] = %d, world_size = %d",
y_grad_dims[1],
world_size));
int64_t num_local_experts = y_grad_dims[0];
int64_t num_experts = world_size * num_local_experts;
int64_t hidden_size = y_grad_dims[y_grad_dims.size() - 1];
int64_t num_rows = scatter_index.dims()[1];
x_grad->set_dims({num_rows, hidden_size});
x_grad->set_dtype(y_grad.dtype());
gate_logits_grad->set_dims({num_rows, num_experts});
gate_logits_grad->set_dtype(phi::DataType::FLOAT32);
}
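The permute-variant gradient derives every output shape from `y_grad` and `scatter_index`. A Python sketch of that derivation (hypothetical helper name; assumes, per the C++ above, that `y_grad` is laid out as `[num_local_experts, world_size, ..., hidden]` and `scatter_index` as `[k, num_rows]`):

```python
def moe_gate_dispatch_permute_grad_shapes(y_grad_shape, scatter_index_shape,
                                          world_size):
    # Mirrors MoeGateDispatchPermuteGradInferMeta's shape logic.
    assert y_grad_shape[1] == world_size, \
        "second dim of y_grad must equal world_size"
    num_local_experts = y_grad_shape[0]
    num_experts = world_size * num_local_experts
    hidden_size = y_grad_shape[-1]
    num_rows = scatter_index_shape[1]        # scatter_index is [k, num_rows]
    x_grad_shape = [num_rows, hidden_size]
    gate_logits_grad_shape = [num_rows, num_experts]   # always float32
    return x_grad_shape, gate_logits_grad_shape

# e.g. 4 local experts, world_size 2, capacity 8, hidden 64, 32 rows
xg, gl = moe_gate_dispatch_permute_grad_shapes([4, 2, 8, 64], [2, 32], 2)
```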

void MultiDotGradInferMeta(const std::vector<const MetaTensor*>& x,
const MetaTensor& out_grad,
std::vector<MetaTensor*> x_grad) {
@@ -1887,4 +1987,89 @@ void SetValueGradInferMeta(const MetaTensor& out_grad,
value_grad->share_lod(values);
}
}

void CalAuxLossGradInferMeta(const MetaTensor& gate_prob,
const MetaTensor& seqlen_float,
const MetaTensor& ce,
const MetaTensor& l_aux_loss_grad,
const int64_t num_experts,
const bool use_group,
const int64_t moe_k,
MetaTensor* gate_prob_grad) {
auto gate_prob_dims = gate_prob.dims();

  PADDLE_ENFORCE_EQ(
      gate_prob.dtype(),
      l_aux_loss_grad.dtype(),
      errors::InvalidArgument(
          "The input l_aux_loss_grad dtype should be equal to gate_prob dtype"));

  gate_prob_grad->set_dims(gate_prob_dims);
gate_prob_grad->set_dtype(gate_prob.dtype());
}

void MoeGateDispatchGradInferMeta(const MetaTensor& combine_weights,
const MetaTensor& scatter_index,
const MetaTensor& expert_id,
const MetaTensor& y_grad,
const MetaTensor& combine_weights_grad,
const int64_t k,
const int64_t capacity,
const bool use_pad,
MetaTensor* x_grad,
MetaTensor* gate_logits_grad) {
auto combine_weights_dims = combine_weights.dims();
auto scatter_index_dims = scatter_index.dims();
auto expert_id_dims = expert_id.dims();
auto y_grad_dims = y_grad.dims();
auto combine_weights_grad_dims = combine_weights_grad.dims();

PADDLE_ENFORCE_EQ(combine_weights_dims.size(),
2,
errors::InvalidArgument(
"Input combine_weights should have 2 dimensions"));

PADDLE_ENFORCE_EQ(
scatter_index_dims.size(),
2,
errors::InvalidArgument("Input scatter_index should have 2 dimensions"));

PADDLE_ENFORCE_EQ(
expert_id_dims.size(),
2,
errors::InvalidArgument("Input expert_id should have 2 dimensions"));

PADDLE_ENFORCE_EQ(
y_grad_dims.size(),
2,
errors::InvalidArgument("Input y_grad should have 2 dimensions"));

PADDLE_ENFORCE_EQ(combine_weights_grad_dims.size(),
2,
errors::InvalidArgument(
"Input combine_weights_grad should have 2 dimensions"));

int64_t num_experts = y_grad_dims[0] / capacity;
int64_t hidden_size = y_grad_dims[1];

int64_t num_rows = scatter_index_dims[1];

gate_logits_grad->set_dims(common::make_ddim({num_rows, num_experts}));
gate_logits_grad->set_dtype(phi::DataType::FLOAT32);

x_grad->set_dims(common::make_ddim({num_rows, hidden_size}));
x_grad->set_dtype(y_grad.dtype());
}

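Unlike the permute variant, this gradient recovers the expert count by dividing `y_grad`'s row count by `capacity`. A Python mirror of the shape logic (hypothetical helper name, restating the C++ above):

```python
def moe_gate_dispatch_grad_shapes(y_grad_shape, scatter_index_shape, capacity):
    # Mirrors MoeGateDispatchGradInferMeta: y_grad is [num_experts * capacity,
    # hidden], so experts are recovered by dividing rows by capacity.
    num_experts = y_grad_shape[0] // capacity
    hidden_size = y_grad_shape[1]
    num_rows = scatter_index_shape[1]
    x_grad_shape = [num_rows, hidden_size]
    gate_logits_grad_shape = [num_rows, num_experts]   # always float32
    return x_grad_shape, gate_logits_grad_shape

# e.g. 8 experts * capacity 8 = 64 rows of y_grad, hidden 128, 32 tokens
xg, gl = moe_gate_dispatch_grad_shapes([64, 128], [2, 32], capacity=8)
```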
void FusedRMSNormGradInferMeta(const MetaTensor& x,
const MetaTensor& scale,
const MetaTensor& invvar,
const MetaTensor& dy,
float epsilon,
MetaTensor* x_grad,
MetaTensor* scale_grad) {
x_grad->set_dims(x.dims());
x_grad->set_dtype(x.dtype());
scale_grad->set_dims(scale.dims());
scale_grad->set_dtype(scale.dtype());
}
} // namespace phi
65 changes: 65 additions & 0 deletions paddle/phi/infermeta/backward.h
@@ -462,6 +462,44 @@ void MemoryEfficientAttentionGradInferMeta(const MetaTensor& query,
MetaTensor* value_grad,
MetaTensor* bias_grad);

void MoeCombineGradInferMeta(const MetaTensor& x,
const MetaTensor& combine_weights,
const MetaTensor& scatter_index,
                             const MetaTensor& y,
MetaTensor* grad_x,
MetaTensor* grad_combine_weights_helper);
// Tensor combine_weights_out, Tensor scatter_index, Tensor scatter_index_rev,
// Tensor expert_offset, Tensor expert_offset_local, Tensor y_grad, Tensor
// combine_weights_out_grad, int64_t k, int64_t capacity, bool use_pad, int64_t
// expert_start_index, int64_t expert_end_index)
// output : Tensor(x_grad), Tensor(combine_weights_grad)
void MoeGateDispatchPartialNoSoftmaxTopkGradInferMeta(
const MetaTensor& combine_weights_out,
const MetaTensor& scatter_index,
const MetaTensor& scatter_index_rev,
const MetaTensor& expert_offset,
const MetaTensor& expert_offset_local,
const MetaTensor& y_grad,
const MetaTensor& combine_weights_out_grad,
int64_t k,
int64_t capacity,
bool use_pad,
int64_t expert_start_index,
int64_t expert_end_index,
MetaTensor* x_grad,
MetaTensor* combine_weights_grad);

void MoeGateDispatchPermuteGradInferMeta(const MetaTensor& combine_weights,
const MetaTensor& scatter_index,
const MetaTensor& expert_id,
const MetaTensor& y_grad,
const MetaTensor& combine_weights_grad,
int64_t k,
int64_t capacity,
int64_t world_size,
MetaTensor* x_grad,
MetaTensor* gate_logits_grad);

void MultiDotGradInferMeta(const std::vector<const MetaTensor*>& x,
const MetaTensor& out_grad,
std::vector<MetaTensor*> x_grad);
@@ -680,4 +718,31 @@ void SetValueGradInferMeta(const MetaTensor& out_grad,
MetaTensor* x_grad,
MetaTensor* value_grad);

void CalAuxLossGradInferMeta(const MetaTensor& gate_prob,
const MetaTensor& seqlen_float,
const MetaTensor& ce,
const MetaTensor& l_aux_loss_grad,
const int64_t num_experts,
const bool use_group,
const int64_t moe_k,
MetaTensor* gate_prob_grad);

void MoeGateDispatchGradInferMeta(const MetaTensor& combine_weights,
const MetaTensor& scatter_index,
const MetaTensor& expert_id,
const MetaTensor& y_grad,
const MetaTensor& combine_weights_grad,
const int64_t k,
const int64_t capacity,
const bool use_pad,
MetaTensor* x_grad,
MetaTensor* gate_logits_grad);

void FusedRMSNormGradInferMeta(const MetaTensor& x,
const MetaTensor& scale,
const MetaTensor& invvar,
const MetaTensor& dy,
float epsilon,
MetaTensor* x_grad,
MetaTensor* scale_grad);
} // namespace phi
14 changes: 14 additions & 0 deletions paddle/phi/infermeta/binary.cc
@@ -4592,6 +4592,20 @@ void WeightDequantizeInferMeta(const MetaTensor& x,
out->set_dtype(scale.dtype());
}

void FusedRMSNormInferMeta(const MetaTensor& x,
const MetaTensor& scale,
float epsilon,
MetaTensor* y,
MetaTensor* invvar) {
// Y: same shape, dtype, layout as X
y->set_dims(x.dims());
y->set_dtype(x.dtype());
  // invvar: 1-D float32, length = x.dims()[0] (rows)
int64_t rows = x.dims()[0];
invvar->set_dims(DDim({rows}));
invvar->set_dtype(DataType::FLOAT32);
}
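`FusedRMSNormInferMeta` only fixes shapes and dtypes; the actual kernel lives in the PR's CUDA sources, which are not shown here. As a hedged reference for what an RMSNorm forward with an `invvar` side output computes (a sketch assuming a 2-D `[rows, hidden]` input, consistent with `invvar`'s length of `x.dims()[0]`):

```python
import numpy as np

def fused_rms_norm_reference(x, scale, epsilon=1e-6):
    # NumPy sketch of an RMSNorm forward: normalize each row by its RMS,
    # then apply the per-column scale. invvar = 1/sqrt(mean(x^2) + eps).
    mean_sq = np.mean(x.astype(np.float32) ** 2, axis=-1, keepdims=True)
    invvar = 1.0 / np.sqrt(mean_sq + epsilon)       # [rows, 1], float32
    y = (x * invvar * scale).astype(x.dtype)        # same shape/dtype as x
    return y, invvar.squeeze(-1)                    # invvar: 1-D, length rows

x = np.ones((4, 8), dtype=np.float32)
scale = np.ones(8, dtype=np.float32)
y, invvar = fused_rms_norm_reference(x, scale)
```

The shapes match the infer-meta above: `y` mirrors `x`, and `invvar` is a float32 vector of length `x.dims()[0]`.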

} // namespace phi

PD_REGISTER_INFER_META_FN(add_raw, phi::ElementwiseRawInferMeta);
5 changes: 5 additions & 0 deletions paddle/phi/infermeta/binary.h
@@ -790,5 +790,10 @@ void WeightDequantizeInferMeta(const MetaTensor& x,
const std::string& algo,
const int32_t group_size,
MetaTensor* out);

void FusedRMSNormInferMeta(const MetaTensor& x,
const MetaTensor& scale,
float epsilon,
MetaTensor* y,
MetaTensor* invvar);

} // namespace phi