PaddlePaddle · luotao1 · May 12, 2025 · Apr 25, 2025 · Apr 25, 2025 · Apr 29, 2025
@@ -719,6 +719,11 @@ PD_REGISTER_SPMD_RULE(
     PD_INFER_SPMD(phi::distributed::ArgMaxInferSpmdBase),
     PD_INFER_SPMD(phi::distributed::ArgMaxInferSpmdReverseBase));
 
+// topk: registers TopkInferSpmd and TopkGradInferSpmd as the pair of SPMD
+// inference functions looked up under the rule name `topk`.
+PD_REGISTER_SPMD_RULE(topk,
+                      PD_INFER_SPMD(phi::distributed::TopkInferSpmd),
+                      PD_INFER_SPMD(phi::distributed::TopkGradInferSpmd));
+
 // unbind
 PD_REGISTER_SPMD_RULE(unbind,
                       PD_INFER_SPMD(phi::distributed::UnbindInferSpmd),

@@ -68,6 +68,7 @@ limitations under the License. */
 #include "paddle/phi/infermeta/spmd_rules/squeeze.h"
 #include "paddle/phi/infermeta/spmd_rules/stack.h"
 #include "paddle/phi/infermeta/spmd_rules/tile.h"
+#include "paddle/phi/infermeta/spmd_rules/topk.h"
 #include "paddle/phi/infermeta/spmd_rules/transpose.h"
 #include "paddle/phi/infermeta/spmd_rules/triu.h"
 #include "paddle/phi/infermeta/spmd_rules/unbind.h"

@@ -0,0 +1,164 @@
+/* Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/infermeta/spmd_rules/topk.h"
+#include "glog/logging.h"
+#include "paddle/phi/infermeta/spmd_rules/spmd_rule_macro_define.h"
+#include "paddle/phi/infermeta/spmd_rules/utils.h"
+
+namespace phi {
+namespace distributed {
+
+// Infers the SPMD distributed attributes for the forward topk op.
+//
+// The dimension selected by `axis` is forced to dims_mapping -1 on the input
+// and on both outputs; every other dimension keeps the mapping carried by `x`.
+// (Presumably the axis is un-sharded because picking the top-k elements needs
+// the whole axis on one rank -- confirm against the kernel.)
+//
+// Args:
+//   x:       input tensor meta together with its source dist attr.
+//   k:       unused by the inference; kept for signature compatibility.
+//   axis:    dimension topk runs along; negative values count from the end.
+//   largest: unused by the inference.
+//   sorted:  unused by the inference.
+// Returns:
+//   {{x}, {out, indices}} destination dist attrs.
+// Raises:
+//   InvalidArgument if `axis` is outside [-ndim, ndim).
+SpmdInfo TopkInferSpmd(
+    const DistMetaTensor& x, int k, int axis, bool largest, bool sorted) {
+  // Verify input args
+  EXTRACT_SHAPE_AND_DIST_ATTR(x);
+  // Remember the caller-supplied value so that the error below reports what
+  // was actually passed instead of the already-shifted index.
+  const int origin_axis = axis;
+  axis = axis < 0 ? x_ndim + axis : axis;
+  PADDLE_ENFORCE_EQ(
+      0 <= axis && axis < x_ndim,
+      true,
+      phi::errors::InvalidArgument(
+          "The axis of topk should be in range [-%d, %d), but got %d.",
+          x_ndim,
+          x_ndim,
+          origin_axis));
+
+  // Infer dims_mapping: identical to the input except that the topk axis is
+  // un-sharded. The same mapping is shared by x, out and indices.
+  std::vector<int64_t> x_dims_mapping_dst = x_dims_mapping_src;
+  x_dims_mapping_dst[axis] = -1;
+
+  // Create destination dist attrs and apply the inferred mapping
+  TensorDistAttr x_dist_attr_dst = CopyTensorDistAttrForOutput(x_dist_attr_src);
+  TensorDistAttr out_dist_attr_dst =
+      CopyTensorDistAttrForOutput(x_dist_attr_src);
+  TensorDistAttr indices_dist_attr_dst =
+      CopyTensorDistAttrForOutput(x_dist_attr_src);
+  x_dist_attr_dst.set_dims_mapping(x_dims_mapping_dst);
+  out_dist_attr_dst.set_dims_mapping(x_dims_mapping_dst);
+  indices_dist_attr_dst.set_dims_mapping(x_dims_mapping_dst);
+
+  VLOG(4) << "TopkInferSpmd: Done.";
+  LOG_SPMD_INPUT(x);
+  LOG_SPMD_OUTPUT(out_dist_attr_dst);
+  LOG_SPMD_OUTPUT(indices_dist_attr_dst);
+
+  return {{x_dist_attr_dst}, {out_dist_attr_dst, indices_dist_attr_dst}};
+}
+
+// Infers the SPMD distributed attributes for topk_grad.
+//
+// The shardings of `x`, `indices` and `out_grad` are merged dimension by
+// dimension, with the topk axis excluded from the merge and forced to -1.
+// The merged mapping is applied to all three inputs and to x_grad.
+//
+// Args:
+//   x, indices, out_grad: tensor metas with their source dist attrs; all
+//                         three must have the same rank.
+//   k:       unused by the inference; kept for signature compatibility.
+//   axis:    dimension topk ran along; negative values count from the end.
+//   largest: unused by the inference.
+//   sorted:  unused by the inference.
+// Returns:
+//   {{x, indices, out_grad}, {x_grad}} destination dist attrs.
+// Raises:
+//   InvalidArgument if the ranks differ or `axis` is outside [-ndim, ndim).
+SpmdInfo TopkGradInferSpmd(const DistMetaTensor& x,
+                           const DistMetaTensor& indices,
+                           const DistMetaTensor& out_grad,
+                           int k,
+                           int axis,
+                           bool largest,
+                           bool sorted) {
+  // Verify input args
+  EXTRACT_SHAPE_AND_DIST_ATTR(x);
+  EXTRACT_SHAPE_AND_DIST_ATTR(indices);
+  EXTRACT_SHAPE_AND_DIST_ATTR(out_grad);
+  PADDLE_ENFORCE_EQ(indices_ndim,
+                    out_grad_ndim,
+                    common::errors::InvalidArgument(
+                        "TopKGrad: The rank of Indices [%d] and OutGrad [%d] "
+                        "must be the same.",
+                        indices_ndim,
+                        out_grad_ndim));
+  PADDLE_ENFORCE_EQ(x_ndim,
+                    indices_ndim,
+                    common::errors::InvalidArgument(
+                        "TopKGrad: The rank of Input [%d] and Indices [%d] "
+                        "must be the same.",
+                        x_ndim,
+                        indices_ndim));
+  // Remember the caller-supplied value so that the error below reports what
+  // was actually passed instead of the already-shifted index.
+  const int origin_axis = axis;
+  axis = axis < 0 ? x_ndim + axis : axis;
+  PADDLE_ENFORCE_EQ(0 <= axis && axis < x_ndim,
+                    true,
+                    common::errors::InvalidArgument(
+                        "The axis of topk_grad should be in range [-%d, %d), "
+                        "but got %d.",
+                        x_ndim,
+                        x_ndim,
+                        origin_axis));
+
+  // Build einsum notation: one distinct letter per tensor dimension, so that
+  // position i of every dims_mapping is paired with letter i. The letters are
+  // only used as unique labels.
+  std::string alphabet = "abcdefghijlopqrstuvwxyz";
+  std::string x_axes = alphabet.substr(0, x_ndim);
+  std::string indices_axes = x_axes;
+  std::string out_grad_axes = x_axes;
+
+  // The topk axis must not take part in the merge: mask it to -1 on every
+  // input first, so a sharding on that axis can neither survive nor win a
+  // mesh-dimension conflict against another tensor dimension.
+  std::vector<int64_t> x_dims_mapping_masked = x_dims_mapping_src;
+  std::vector<int64_t> indices_dims_mapping_masked = indices_dims_mapping_src;
+  std::vector<int64_t> out_grad_dims_mapping_masked = out_grad_dims_mapping_src;
+  x_dims_mapping_masked[axis] = -1;
+  indices_dims_mapping_masked[axis] = -1;
+  out_grad_dims_mapping_masked[axis] = -1;
+
+  // Merge sharding
+  std::pair<std::string, std::vector<int64_t>> indices_pair(
+      indices_axes, indices_dims_mapping_masked);
+  std::pair<std::string, std::vector<int64_t>> out_grad_pair(
+      out_grad_axes, out_grad_dims_mapping_masked);
+  std::pair<std::string, std::vector<int64_t>> x_pair(x_axes,
+                                                      x_dims_mapping_masked);
+  auto axis_to_dim_map =
+      ShardingMergeForTensors({x_pair, indices_pair, out_grad_pair});
+
+  // Infer dims mapping: read the merged mapping back position by position and
+  // keep the topk axis explicitly un-sharded.
+  std::vector<int64_t> x_grad_dims_mapping_dst =
+      GetDimsMappingForAxes(x_axes, axis_to_dim_map);
+  x_grad_dims_mapping_dst[axis] = -1;
+  std::vector<int64_t> x_dims_mapping_dst = x_grad_dims_mapping_dst;
+  std::vector<int64_t> indices_dims_mapping_dst = x_grad_dims_mapping_dst;
+  std::vector<int64_t> out_grad_dims_mapping_dst = x_grad_dims_mapping_dst;
+
+  // Set the dims mapping
+  TensorDistAttr x_grad_dist_attr_dst =
+      CopyTensorDistAttrForOutput(out_grad_dist_attr_src);
+  TensorDistAttr x_dist_attr_dst =
+      CopyTensorDistAttrForOutput(out_grad_dist_attr_src);
+  TensorDistAttr indices_dist_attr_dst =
+      CopyTensorDistAttrForOutput(out_grad_dist_attr_src);
+  TensorDistAttr out_grad_dist_attr_dst =
+      CopyTensorDistAttrForOutput(out_grad_dist_attr_src);
+
+  x_grad_dist_attr_dst.set_dims_mapping(x_grad_dims_mapping_dst);
+  x_dist_attr_dst.set_dims_mapping(x_dims_mapping_dst);
+  indices_dist_attr_dst.set_dims_mapping(indices_dims_mapping_dst);
+  out_grad_dist_attr_dst.set_dims_mapping(out_grad_dims_mapping_dst);
+
+  VLOG(4) << "TopkGradInferSpmd: Done.";
+  LOG_SPMD_INPUT(x);
+  LOG_SPMD_INPUT(indices);
+  LOG_SPMD_INPUT(out_grad);
+  LOG_SPMD_OUTPUT(x_grad_dist_attr_dst);
+
+  return {{x_dist_attr_dst, indices_dist_attr_dst, out_grad_dist_attr_dst},
+          {x_grad_dist_attr_dst}};
+}
+// Scalar-`k` entry point for the forward rule: converts `k` to int and
+// delegates to TopkInferSpmd with the remaining arguments unchanged.
+SpmdInfo TopkInferSpmdDynamic(const DistMetaTensor& x,
+                              const Scalar& k,
+                              int axis,
+                              bool largest,
+                              bool sorted) {
+  const int k_value = k.to<int>();
+  return TopkInferSpmd(x, k_value, axis, largest, sorted);
+}
+
+// Scalar-`k` entry point for the grad rule: converts `k` to int and delegates
+// to TopkGradInferSpmd with the remaining arguments unchanged.
+SpmdInfo TopkGradInferSpmdDynamic(const DistMetaTensor& x,
+                                  const DistMetaTensor& indices,
+                                  const DistMetaTensor& out_grad,
+                                  const Scalar& k,
+                                  int axis,
+                                  bool largest,
+                                  bool sorted) {
+  const int k_value = k.to<int>();
+  return TopkGradInferSpmd(
+      x, indices, out_grad, k_value, axis, largest, sorted);
+}
+
+}  // namespace distributed
+}  // namespace phi
@@ -0,0 +1,50 @@
+/* Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/phi/common/scalar.h"
+#include "paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.h"
+#include "paddle/phi/core/distributed/type_defs.h"
+
+namespace phi {
+namespace distributed {
+
+// Forward SPMD rule for topk. Returns the destination dist attr for `x` and
+// for the two outputs (out, indices); the `axis` dimension is mapped to -1 in
+// all of them. `k`, `largest` and `sorted` are not used by the inference.
+SpmdInfo TopkInferSpmd(
+    const DistMetaTensor& x, int k, int axis, bool largest, bool sorted);
+
+// SPMD rule for topk_grad. Requires `x`, `indices` and `out_grad` to have the
+// same rank. Returns the destination dist attrs for the three inputs and for
+// x_grad; the `axis` dimension is mapped to -1 in all of them. `k`, `largest`
+// and `sorted` are not used by the inference.
+SpmdInfo TopkGradInferSpmd(const DistMetaTensor& x,
+                           const DistMetaTensor& indices,
+                           const DistMetaTensor& out_grad,
+                           int k,
+                           int axis,
+                           bool largest,
+                           bool sorted);
+
+// Same as TopkInferSpmd, but takes `k` as a Scalar that is converted to int.
+SpmdInfo TopkInferSpmdDynamic(const DistMetaTensor& x,
+                              const Scalar& k,
+                              int axis,
+                              bool largest,
+                              bool sorted);
+
+// Same as TopkGradInferSpmd, but takes `k` as a Scalar that is converted to
+// int.
+SpmdInfo TopkGradInferSpmdDynamic(const DistMetaTensor& x,
+                                  const DistMetaTensor& indices,
+                                  const DistMetaTensor& out_grad,
+                                  const Scalar& k,
+                                  int axis,
+                                  bool largest,
+                                  bool sorted);
+
+}  // namespace distributed
+}  // namespace phi
diff --git a/paddle/phi/ops/yaml/backward.yaml b/paddle/phi/ops/yaml/backward.yaml
@@ -3519,6 +3519,7 @@
   infer_meta :
     func : UnchangedInferMeta
     param : [x]
+    spmd_rule: TopkGradInferSpmdDynamic
   kernel :
     func : topk_grad
     data_type : out_grad

diff --git a/paddle/phi/ops/yaml/ops.yaml b/paddle/phi/ops/yaml/ops.yaml
@@ -5144,6 +5144,7 @@
   output : Tensor(out), Tensor(indices)
   infer_meta :
     func : TopKInferMeta
+    spmd_rule: TopkInferSpmdDynamic
   kernel :
     func : topk
     data_type : x

diff --git a/test/auto_parallel/spmd_rules/CMakeLists.txt b/test/auto_parallel/spmd_rules/CMakeLists.txt
@@ -44,6 +44,7 @@ if(WITH_DISTRIBUTE)
   py_test_modules(test_logsumexp_rule MODULES test_logsumexp_rule)
   py_test_modules(test_nonzero_rule MODULES test_nonzero_rule)
   if(NOT WITH_ROCM)
+    py_test_modules(test_topk_rule MODULES test_topk_rule)
     py_test_modules(test_add_n_rule MODULES test_add_n_rule)
     py_test_modules(test_mean_all_rule MODULES test_mean_all_rule)
     py_test_modules(test_argmin_rule MODULES test_argmin_rule)

diff --git a/test/auto_parallel/spmd_rules/test_topk_rule.py b/test/auto_parallel/spmd_rules/test_topk_rule.py
@@ -0,0 +1,105 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+from collections import OrderedDict
+
+from paddle.distributed.auto_parallel.static.dist_attribute import (
+    DistTensorSpec,
+    TensorDistAttr,
+)
+from paddle.distributed.fleet import auto
+from paddle.framework import core
+
+
+class TestTopkSPMDRule(unittest.TestCase):
+    def setUp(self):
+        x_shape = [16, 16, 16]
+        out_shape = [16, 2, 16]
+        process_mesh = auto.ProcessMesh(mesh=[[0, 1], [2, 3]])
+
+        x_tensor_dist_attr = TensorDistAttr()
+        x_tensor_dist_attr.dims_mapping = [-1, -1, -1]
+        x_tensor_dist_attr.process_mesh = process_mesh
+        self.x_dist_tensor_spec = DistTensorSpec(x_shape, x_tensor_dist_attr)
+        out_tensor_dist_attr = TensorDistAttr()
+        out_tensor_dist_attr.dims_mapping = [-1, -1, -1]
+        out_tensor_dist_attr.process_mesh = process_mesh
+        self.out_dist_tensor_spec = DistTensorSpec(
+            out_shape, x_tensor_dist_attr
+        )
+
+        self.rule = core.get_phi_spmd_rule("topk")
+        self.attrs = OrderedDict()
+        self.attrs['k'] = 2
+        self.attrs['axis'] = 1
+        self.attrs['largest'] = True
+        self.attrs['sorted'] = True
+
+    def test_topk_forward(self):
+        # axis = 1
+        # [0, 1, -1] --> [0, -1, -1], [0, -1, -1]
+        self.attrs['axis'] = 1
+        self.x_dist_tensor_spec.set_dims_mapping([0, 1, -1])
+        result_dist_attrs = self.rule.infer_forward(
+            self.x_dist_tensor_spec,
+            self.attrs['k'],
+            self.attrs['axis'],
+            self.attrs['largest'],
+            self.attrs['sorted'],
+        )
+
+        self.assertEqual(len(result_dist_attrs), 2)
+        inferred_input_dist_attrs = result_dist_attrs[0]
+        inferred_output_dist_attrs = result_dist_attrs[1]
+
+        self.assertEqual(len(inferred_input_dist_attrs), 1)
+        self.assertEqual(len(inferred_output_dist_attrs), 2)
+
+        self.assertEqual(inferred_input_dist_attrs[0].dims_mapping, [0, -1, -1])
+        self.assertEqual(inferred_input_dist_attrs[0].dims_mapping, [0, -1, -1])
+        self.assertEqual(
+            inferred_output_dist_attrs[0].dims_mapping, [0, -1, -1]
+        )
+
+    def test_topk_backward(self):
+        # axis = 1
+        # [0, -1, 1], [0, -1, 1], [-1, 1, -1] --> [0, -1, 1], [0, -1, 1], [0, -1, 1], [0, -1, 1]
+        self.attrs['axis'] = 1
+        self.x_dist_tensor_spec.set_dims_mapping([0, -1, 1])
+        self.out_dist_tensor_spec.shape = [16, 2, 16]
+        self.out_dist_tensor_spec.set_dims_mapping([-1, 1, -1])
+        result_dist_attrs = self.rule.infer_backward(
+            self.x_dist_tensor_spec,
+            self.out_dist_tensor_spec,
+            self.out_dist_tensor_spec,
+            self.attrs['k'],
+            self.attrs['axis'],
+            self.attrs['largest'],
+            self.attrs['sorted'],
+        )
+
+        self.assertEqual(len(result_dist_attrs), 2)
+        inferred_input_dist_attrs = result_dist_attrs[0]
+        inferred_output_dist_attrs = result_dist_attrs[1]
+        self.assertEqual(len(inferred_input_dist_attrs), 3)
+        self.assertEqual(len(inferred_output_dist_attrs), 1)
+        self.assertEqual(inferred_input_dist_attrs[0].dims_mapping, [0, -1, 1])
+        self.assertEqual(inferred_input_dist_attrs[0].dims_mapping, [0, -1, 1])
+        self.assertEqual(inferred_input_dist_attrs[0].dims_mapping, [0, -1, 1])
+        self.assertEqual(inferred_output_dist_attrs[0].dims_mapping, [0, -1, 1])
+
+
+# Run the test case through unittest when this file is executed directly.
+if __name__ == "__main__":
+    unittest.main()