Commit 71ee897

[0-size Tensor No.81、138、141] Add 0-size Tensor support for blha_get_max_len (#72937)
1 parent 5c1b862 commit 71ee897
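
In brief: this commit adds explicit handling for 0-size tensors (numel() == 0) in three areas: the blha_get_max_len fused kernels (GPU and XPU), the nanmedian forward/backward kernels together with their InferMeta and symbolic-shape rules, and the mv forward/backward kernels. A zero-batch unit test for blha_get_max_len is added as well. The per-file diffs follow.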

File tree

14 files changed: +270 −13 lines

paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.cc

Lines changed: 33 additions & 8 deletions

@@ -2314,15 +2314,40 @@ bool NanmedianOpInferSymbolicShape(
   if (mode == "avg") {
     median_shape.emplace_back(2);
   }
-  infer_context->SetShapeOrDataForValue(
-      op->result(0),
-      symbol::ShapeOrDataDimExprs{
-          symbol::TensorShapeOrDataDimExprs(out_shape)});
-  infer_context->SetShapeOrDataForValue(
-      op->result(1),
-      symbol::ShapeOrDataDimExprs{
-          symbol::TensorShapeOrDataDimExprs(median_shape)});
 
+  const auto &IsZero = [&](const symbol::DimExpr &dim_expr) {
+    if (dim_expr.isa<int64_t>()) {
+      return dim_expr.dyn_cast<int64_t>() == static_cast<int64_t>(0);
+    }
+    return false;
+  };
+  bool size_0 = false;
+  for (size_t i = 0; i < x_shape.size(); i++) {
+    if (IsZero(x_shape.at(i))) {
+      size_0 = true;
+      break;
+    }
+  }
+  if (size_0) {
+    std::vector<symbol::DimExpr> x_numel_0_shape = {};
+    infer_context->SetShapeOrDataForValue(
+        op->result(0),
+        symbol::ShapeOrDataDimExprs{
+            symbol::TensorShapeOrDataDimExprs(x_numel_0_shape)});
+    infer_context->SetShapeOrDataForValue(
+        op->result(1),
+        symbol::ShapeOrDataDimExprs{
+            symbol::TensorShapeOrDataDimExprs(x_numel_0_shape)});
+  } else {
+    infer_context->SetShapeOrDataForValue(
+        op->result(0),
+        symbol::ShapeOrDataDimExprs{
+            symbol::TensorShapeOrDataDimExprs(out_shape)});
+    infer_context->SetShapeOrDataForValue(
+        op->result(1),
+        symbol::ShapeOrDataDimExprs{
+            symbol::TensorShapeOrDataDimExprs(median_shape)});
+  }
   return true;
 }
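
To illustrate the intent of the zero-size branch, a minimal Python-level sketch (not part of the commit; the expected behavior is an assumption based on this rule plus the InferMeta and kernel changes below):

import paddle

# A 0-size input: one dimension is 0, so x.numel() == 0.
x = paddle.full([0, 4], 0.0)
out = paddle.nanmedian(x)
# Per the branch above, the output shape is inferred as empty (a 0-D
# tensor); per the kernel change below, its value is filled with NaN.
print(out.shape)  # expected: []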

paddle/phi/infermeta/unary.cc

Lines changed: 5 additions & 0 deletions

@@ -2854,6 +2854,11 @@ void NanmedianInferMeta(const MetaTensor& x,
   }
   median_index->set_dtype(DataType::INT64);
   median_index->set_dims(make_ddim(median_dim));
+
+  if (x.numel() == 0) {
+    out->set_dims(make_ddim({}));
+    median_index->set_dims(make_ddim({}));
+  }
 }
 
 void NMSInferMeta(const MetaTensor& x, float threshold, MetaTensor* out) {
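
Note that this fallback deliberately produces 0-D outputs (empty dims) rather than 0-size ones, mirroring the empty DimExpr vector used in the symbolic-shape rule above.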

paddle/phi/kernels/cpu/mv_grad_kernel.cc

Lines changed: 16 additions & 0 deletions

@@ -16,6 +16,7 @@
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/full_kernel.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 
 namespace phi {
@@ -30,6 +31,21 @@ void MvGradKernel(const Context& dev_ctx,
   auto dout = out_grad;
   auto dx = x_grad;
   auto dvec = vec_grad;
+  if (x.numel() == 0 || vec.numel() == 0) {
+    if (dx) {
+      phi::Full<T, Context>(dev_ctx,
+                            phi::IntArray(common::vectorize(dx->dims())),
+                            static_cast<T>(0),
+                            dx);
+    }
+    if (dvec) {
+      phi::Full<T, Context>(dev_ctx,
+                            phi::IntArray(common::vectorize(dvec->dims())),
+                            static_cast<T>(0),
+                            dvec);
+    }
+    return;
+  }
 
   const auto& dim_x = x.dims();
   int m = static_cast<int>(dim_x[0]);
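
As a sketch of the behavior this early return targets (an assumption, not from the commit): gradients of zero-size operands are materialized as zero-filled tensors of the right shape instead of reaching the BLAS calls below, which would otherwise see empty operands.

import paddle

# Zero-size operands for mv: x has shape [10, 0], vec has shape [0].
x = paddle.zeros([10, 0])
x.stop_gradient = False
vec = paddle.zeros([0])
vec.stop_gradient = False
out = paddle.mv(x, vec)  # forward zero-fills out, shape [10]
out.sum().backward()
# The kernel above zero-fills dx and dvec and returns early.
print(x.grad.shape, vec.grad.shape)  # expected: [10, 0] and [0]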

paddle/phi/kernels/cpu/nanmedian_grad_kernel.cc

Lines changed: 4 additions & 0 deletions

@@ -96,6 +96,10 @@ void NanmedianGradKernel(const Context& dev_ctx,
                          bool keepdim UNUSED,
                          const std::string& mode,
                          DenseTensor* x_grad) {
+  if (x_grad && x_grad->numel() == 0) {
+    dev_ctx.template Alloc<T>(x_grad);
+    return;
+  }
   DenseTensor tmp_x;
   auto rank = x.dims().size();
   if ((axes.size() == 0) || rank <= 1) {
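
Here the early return only needs an Alloc: when x_grad itself is 0-size there are no elements to write, so no fill is required.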

paddle/phi/kernels/cpu/nanmedian_kernel.cc

Lines changed: 11 additions & 0 deletions

@@ -16,6 +16,7 @@
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/full_kernel.h"
 #include "paddle/phi/kernels/funcs/nanmedian_utils.h"
 #include "paddle/phi/kernels/top_k_kernel.h"
 
@@ -218,6 +219,16 @@ void NanmedianKernel(const Context& dev_ctx,
                      const std::string& mode,
                      DenseTensor* out,
                      DenseTensor* median_index) {
+  if (x.numel() == 0) {
+    phi::Full<T, Context>(
+        dev_ctx, phi::IntArray(common::vectorize(out->dims())), NAN, out);
+    phi::Full<int64_t, Context>(
+        dev_ctx,
+        phi::IntArray(common::vectorize(median_index->dims())),
+        0,
+        median_index);
+    return;
+  }
   DenseTensor tmp_x;
   auto rank = x.dims().size();
   if ((axes.size() == 0) || rank <= 1) {
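
For a 0-size input, the median is defined here as NaN and the accompanying index as 0, written into the 0-D outputs that NanmedianInferMeta sets up above.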

paddle/phi/kernels/fusion/gpu/blha_get_max_len.cu

Lines changed: 23 additions & 2 deletions

@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/phi/backends/context_pool.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/flash_attn_kernel.h"
@@ -49,13 +50,33 @@ void BlhaGetMaxLenKernel(const Context& dev_ctx,
                          const phi::DenseTensor& batch_size,
                          DenseTensor* max_enc_len_this_time,
                          DenseTensor* max_dec_len_this_time) {
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto& dev_ctx_cpu = *pool.Get(phi::CPUPlace());
   // decoder
   max_dec_len_this_time->Resize({{1}});
-  GetMaxLenTensor(dev_ctx, seq_lens_decoder, batch_size, max_dec_len_this_time);
+  if (seq_lens_decoder.numel() > 0) {
+    GetMaxLenTensor(
+        dev_ctx, seq_lens_decoder, batch_size, max_dec_len_this_time);
+  } else {
+    phi::Full<int, phi::CPUContext>(
+        reinterpret_cast<const phi::CPUContext&>(dev_ctx_cpu),
+        phi::IntArray(common::vectorize(max_dec_len_this_time->dims())),
+        0,
+        max_dec_len_this_time);
+  }
 
   // encoder
   max_enc_len_this_time->Resize({{1}});
-  GetMaxLenTensor(dev_ctx, seq_lens_encoder, batch_size, max_enc_len_this_time);
+  if (seq_lens_encoder.numel() > 0) {
+    GetMaxLenTensor(
+        dev_ctx, seq_lens_encoder, batch_size, max_enc_len_this_time);
+  } else {
+    phi::Full<int, phi::CPUContext>(
+        reinterpret_cast<const phi::CPUContext&>(dev_ctx_cpu),
+        phi::IntArray(common::vectorize(max_enc_len_this_time->dims())),
+        0,
+        max_enc_len_this_time);
+  }
 }
 }  // namespace fusion
 }  // namespace phi
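
When a sequence-length tensor is empty, the corresponding maximum length is defined as 0. The fallback writes it through a CPU Full kernel obtained from the DeviceContextPool, which suggests these {1}-shaped outputs live on the host; GetMaxLenTensor is only invoked when there is at least one element to reduce over.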

paddle/phi/kernels/fusion/xpu/blha_get_max_len_kernel.cc

Lines changed: 24 additions & 2 deletions

@@ -14,9 +14,11 @@
 
 #include <paddle/phi/backends/xpu/xpu_context.h>
 #include "glog/logging.h"
+#include "paddle/phi/backends/context_pool.h"
 #include "paddle/phi/backends/xpu/enforce_xpu.h"
 #include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/full_kernel.h"
 #include "paddle/phi/kernels/memcpy_kernel.h"
 #include "xpu/xdnn.h"
 
@@ -49,13 +51,33 @@ void BlhaGetMaxLenKernel(const Context& dev_ctx,
                          const phi::DenseTensor& batch_size,
                          DenseTensor* max_enc_len_this_time,
                          DenseTensor* max_dec_len_this_time) {
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto& dev_ctx_cpu = *pool.Get(phi::CPUPlace());
   // decoder
   max_dec_len_this_time->Resize({{1}});
-  GetMaxLenTensor(dev_ctx, seq_lens_decoder, batch_size, max_dec_len_this_time);
+  if (seq_lens_decoder.numel() > 0) {
+    GetMaxLenTensor(
+        dev_ctx, seq_lens_decoder, batch_size, max_dec_len_this_time);
+  } else {
+    phi::Full<int, phi::CPUContext>(
+        reinterpret_cast<const phi::CPUContext&>(dev_ctx_cpu),
+        phi::IntArray(common::vectorize(max_dec_len_this_time->dims())),
+        0,
+        max_dec_len_this_time);
+  }
 
   // encoder
   max_enc_len_this_time->Resize({{1}});
-  GetMaxLenTensor(dev_ctx, seq_lens_encoder, batch_size, max_enc_len_this_time);
+  if (seq_lens_encoder.numel() > 0) {
+    GetMaxLenTensor(
+        dev_ctx, seq_lens_encoder, batch_size, max_enc_len_this_time);
+  } else {
+    phi::Full<int, phi::CPUContext>(
+        reinterpret_cast<const phi::CPUContext&>(dev_ctx_cpu),
+        phi::IntArray(common::vectorize(max_enc_len_this_time->dims())),
+        0,
+        max_enc_len_this_time);
+  }
 }
 }  // namespace fusion
 }  // namespace phi

paddle/phi/kernels/gpu/mv_grad_kernel.cu

Lines changed: 16 additions & 0 deletions

@@ -16,6 +16,7 @@
 
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/full_kernel.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 
 namespace phi {
@@ -41,6 +42,21 @@ void MvGradKernel(const Context &dev_ctx,
   auto dout = out_grad;
   auto dx = x_grad;
   auto dvec = vec_grad;
+  if (x.numel() == 0 || vec.numel() == 0) {
+    if (dx) {
+      phi::Full<T, Context>(dev_ctx,
+                            phi::IntArray(common::vectorize(dx->dims())),
+                            static_cast<T>(0),
+                            dx);
+    }
+    if (dvec) {
+      phi::Full<T, Context>(dev_ctx,
+                            phi::IntArray(common::vectorize(dvec->dims())),
+                            static_cast<T>(0),
+                            dvec);
+    }
+    return;
+  }
 
   auto dim_x = x.dims();
   int m = dim_x[0];

paddle/phi/kernels/gpu/nanmedian_grad_kernel.cu

Lines changed: 4 additions & 0 deletions

@@ -111,6 +111,10 @@ void NanmedianGradKernel(const Context& dev_ctx,
                          bool keepdim UNUSED,
                          const std::string& mode,
                          DenseTensor* x_grad) {
+  if (x_grad && x_grad->numel() == 0) {
+    dev_ctx.template Alloc<T>(x_grad);
+    return;
+  }
   DenseTensor tmp_x;
   auto rank = x.dims().size();
   if ((axes.size() == 0) || rank <= 1) {

paddle/phi/kernels/gpu/nanmedian_kernel.cu

Lines changed: 10 additions & 0 deletions

@@ -356,6 +356,16 @@ void NanmedianKernel(const Context& dev_ctx,
                      const std::string& mode,
                      DenseTensor* out,
                      DenseTensor* median_index) {
+  if (x.numel() == 0) {
+    phi::Full<T, Context>(
+        dev_ctx, phi::IntArray(common::vectorize(out->dims())), NAN, out);
+    phi::Full<int64_t, Context>(
+        dev_ctx,
+        phi::IntArray(common::vectorize(median_index->dims())),
+        0,
+        median_index);
+    return;
+  }
   DenseTensor tmp_x;
   auto rank = x.dims().size();
   if ((axes.size() == 0) || rank <= 1) {

paddle/phi/kernels/impl/mv_kernel_impl.h

Lines changed: 12 additions & 1 deletion

@@ -14,8 +14,8 @@
 
 #pragma once
 
+#include "paddle/phi/kernels/full_kernel.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
-
 namespace phi {
 
 template <typename T, typename Context>
@@ -29,6 +29,17 @@ void MvKernel(const Context& dev_ctx,
   const T* x_data = x.data<T>();
   const T* vec_data = vec.data<T>();
   T* out_data = dev_ctx.template Alloc<T>(out);
+  if (out && out->numel() == 0) {
+    return;
+  }
+  // x.shape [10, 0], vec.shape [0], out.shape [10]
+  if (vec.numel() == 0) {
+    phi::Full<T, Context>(dev_ctx,
+                          phi::IntArray(common::vectorize(out->dims())),
+                          static_cast<T>(0),
+                          out);
+    return;
+  }
 
   auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
 
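Following the shape comment in the diff, a minimal sketch of the forward behavior (the expected output is an assumption, not from the commit):

import paddle

# Mirrors the comment above: x.shape [10, 0], vec.shape [0] -> out.shape [10].
x = paddle.zeros([10, 0])
vec = paddle.zeros([0])
out = paddle.mv(x, vec)
# Each output element is a sum over an empty contraction dimension, so the
# kernel zero-fills out instead of entering the BLAS path below.
print(out.shape)    # expected: [10]
print(out.numpy())  # expected: all zeros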
test/legacy_test/test_blha_get_max_len_op.py

Lines changed: 77 additions & 0 deletions

@@ -108,5 +108,82 @@ def test_static_api(self):
         )
 
 
+@unittest.skipIf(
+    not core.is_compiled_with_cuda() and not core.is_compiled_with_xpu(),
+    "Only support XPU or GPU in CUDA mode.",
+)
+class TestBlhaGetMaxLenOp_ZeroSize(unittest.TestCase):
+    def setUp(self):
+        self.name = "TestBlhaGetMaxLenOpDynamic_ZeroSize"
+        if paddle.is_compiled_with_cuda():
+            place = paddle.CUDAPlace(0)
+        elif paddle.device.is_compiled_with_xpu():
+            place = paddle.device.XPUPlace(0)
+        else:
+            raise ValueError("Only support CUDA or XPU Place.")
+        self.batch_size = 0
+        self.test_encoder_data = np.random.randint(
+            1, 100, size=self.batch_size
+        ).astype("int32")
+        self.test_decoder_data = np.random.randint(
+            1, 100, size=self.batch_size
+        ).astype("int32")
+
+    def test_dynamic_api(self):
+        paddle.disable_static()
+        seq_lens_encoder = paddle.to_tensor(
+            self.test_encoder_data,
+            "int32",
+        )
+        seq_lens_decoder = paddle.to_tensor(
+            self.test_decoder_data,
+            "int32",
+        )
+        batch_size_tensor = paddle.ones([self.batch_size])
+        max_enc_len_this_time, max_dec_len_this_time = blha_get_max_len(
+            seq_lens_encoder,
+            seq_lens_decoder,
+            batch_size_tensor,
+        )
+        assert tuple(max_enc_len_this_time.shape) == (1,) and tuple(
+            max_dec_len_this_time.shape
+        ) == (1,)
+
+    def test_static_api(self):
+        paddle.enable_static()
+
+        if paddle.is_compiled_with_cuda():
+            place = paddle.CUDAPlace(0)
+        elif paddle.device.is_compiled_with_xpu():
+            place = paddle.device.XPUPlace(0)
+        else:
+            raise ValueError("Only support CUDA or XPU Place.")
+
+        with paddle.static.program_guard(paddle.static.Program()):
+            seq_lens_encoder = paddle.static.data(
+                "seq_lens_encoder", self.test_encoder_data.shape, "int32"
+            )
+            seq_lens_decoder = paddle.static.data(
+                "seq_lens_decoder", self.test_decoder_data.shape, "int32"
+            )
+            batch_size_tensor = paddle.ones([self.batch_size], "int32")
+            max_enc_len_this_time, max_dec_len_this_time = blha_get_max_len(
+                seq_lens_encoder,
+                seq_lens_decoder,
+                batch_size_tensor,
+            )
+            exe = paddle.static.Executor(place)
+            res_max_enc_len_this_time, res_max_dec_len_this_time = exe.run(
+                feed={
+                    "seq_lens_encoder": self.test_encoder_data,
+                    "seq_lens_decoder": self.test_decoder_data,
+                },
+                fetch_list=[max_enc_len_this_time, max_dec_len_this_time],
+            )
+            assert tuple(res_max_enc_len_this_time.shape) == (1,) and tuple(
+                res_max_dec_len_this_time.shape
+            ) == (1,)
+
+
 if __name__ == '__main__':
     unittest.main()
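
The new test drives both code paths with batch_size = 0, so seq_lens_encoder, seq_lens_decoder, and batch_size_tensor are all 0-size; the assertions pin down that both outputs keep their (1,)-shaped form, matching the Resize({{1}}) plus Full fallback in the kernels above.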
