Commit 3dffe11

[NPU] Fix concat while inputs size is 1 (PaddlePaddle#237)
* fix concat while inputs size is 1
* fix concat kernel
* fix bug
1 parent 1dcf7c1 commit 3dffe11

2 files changed: +27 -22 lines

backends/npu/kernels/concat_kernel.cc

Lines changed: 8 additions & 6 deletions
@@ -61,7 +61,8 @@ void ConcatKernel(const Context& dev_ctx,
       }
     }
     if (inputs.size() == 1) {
-      *out = inputs[0];
+      out->ResizeAndAllocate(inputs[0].dims());
+      TensorCopy(dev_ctx, inputs[0], true, out);
       return;
     }
     NpuOpRunner runner;
@@ -72,7 +73,7 @@ void ConcatKernel(const Context& dev_ctx,
         .AddAttr("N", static_cast<int>(inputs.size()));
     runner.AddInputNames(names);
     runner.Run(stream);
-
+
   } else {
     // TODO(songkai05): In CANN512, Concat doesn't support dtype double,
     // so cast double to float32 temporarily until it supports double.
@@ -98,12 +99,13 @@ void ConcatKernel(const Context& dev_ctx,
     out_fp32.set_meta(meta_fp32);
     dev_ctx.template Alloc<float>(&out_fp32);

-    if(inputs.size() == 1) {
-      int index = std::stoi(names[0].substr(1, names[0].size()-1));
-      *out = *ins[index];
+    if (inputs.size() == 1) {
+      int index = std::stoi(names[0].substr(1, names[0].size() - 1));
+      out->ResizeAndAllocate(ins[index].dims());
+      TensorCopy(dev_ctx, *ins[index], true, out);
       return;
     }
-
+
     NpuOpRunner runner;
     runner.SetType("Concat")
         .AddInput(dev_ctx, std::move(std::vector<int>(1, axis)))
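The substantive change in this file is the single-input fast path: the old code assigned *out = inputs[0], which makes the output tensor share the input's storage, while the new code gives the output its own allocation (ResizeAndAllocate) and copies the data with TensorCopy (the true argument is presumably this backend's blocking flag). The snippet below is a standalone illustration of the difference, not Paddle code; MiniTensor is a hypothetical stand-in whose assignment shares the underlying buffer, roughly the way a tensor shares its holder.

// Standalone illustration (hypothetical MiniTensor, not Paddle code):
// assignment shares storage, allocate-and-copy does not.
#include <cassert>
#include <memory>
#include <vector>

struct MiniTensor {
  std::shared_ptr<std::vector<float>> data;  // shared buffer, like a tensor holder
};

int main() {
  MiniTensor in{std::make_shared<std::vector<float>>(4, 1.0f)};

  // Old fast path (*out = inputs[0]): the output aliases the input.
  MiniTensor out_alias = in;
  (*out_alias.data)[0] = 42.0f;    // writing through the output...
  assert((*in.data)[0] == 42.0f);  // ...also mutates the input buffer.

  // New fast path: allocate the output and copy, so the buffers stay independent.
  MiniTensor out_copy{std::make_shared<std::vector<float>>(*in.data)};
  (*out_copy.data)[1] = 7.0f;
  assert((*in.data)[1] == 1.0f);   // the input is untouched.
  return 0;
}

With the alias, any later in-place write through the output would also modify the input; the allocate-and-copy version keeps the two tensors independent at the cost of one extra device copy in the single-input case.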

backends/npu/kernels/funcs/npu_funcs.h

Lines changed: 19 additions & 16 deletions
@@ -86,6 +86,9 @@ inline void TensorCopy(const Context& dev_ctx,
   C_Stream stream = static_cast<C_Stream>(dev_ctx.stream());

   auto size = src.numel() * paddle::experimental::SizeOf(src.dtype());
+  if (UNLIKELY(size) == 0) {
+    return;
+  }

   if (src_place.GetType() == phi::AllocationType::CPU &&
       dst_place_.GetType() == phi::AllocationType::CUSTOM) {
@@ -108,17 +111,16 @@ inline void TensorCopy(const Context& dev_ctx,
         dev_ctx.Wait();
       }
     } else {
-      PADDLE_THROW(phi::errors::Unimplemented(
-          "TensorCopy is not supported."));
+      PADDLE_THROW(
+          phi::errors::Unimplemented("TensorCopy is not supported."));
     }
   } else {
-    PADDLE_THROW(phi::errors::Unimplemented(
-        "TensorCopy is not supported."));
+    PADDLE_THROW(phi::errors::Unimplemented("TensorCopy is not supported."));
   }
 } else if (src_place.GetType() == phi::AllocationType::CPU &&
-           dst_place_.GetType() == phi::AllocationType::CPU) {
-  std::memcpy(dst_ptr, src_ptr, size);
-}
+           dst_place_.GetType() == phi::AllocationType::CPU) {
+  std::memcpy(dst_ptr, src_ptr, size);
+}
 }

 /**
@@ -357,9 +359,9 @@ inline void NpuBroadcast(const Context& dev_ctx,
       dev_ctx.template Alloc<T>(&tmp_tensor);
       NpuOpRunner runner;
       runner.SetType("Expand")
-          .AddInput(tmp_src)
-          .AddInput(dev_ctx, phi::vectorize<int64_t>(tmp_tensor_dims))
-          .AddOutput(tmp_tensor);
+          .AddInput(tmp_src)
+          .AddInput(dev_ctx, phi::vectorize<int64_t>(tmp_tensor_dims))
+          .AddOutput(tmp_tensor);
       auto stream = dev_ctx.stream();
       runner.Run(stream);
       tmp_src = tmp_tensor;
@@ -421,12 +423,13 @@ inline void NpuElementWiseOpBroadcast(const Context& dev_ctx,
       phi::errors::InvalidArgument(
          "Axis should be great than or equal to 0, but received axis is %d.",
          axis));
-  PADDLE_ENFORCE_LE(axis,
-      max_dim,
-      phi::errors::InvalidArgument(
-          "Axis should be less than or equal to %d, but received axis is %d.",
-          max_dim,
-          axis));
+  PADDLE_ENFORCE_LE(
+      axis,
+      max_dim,
+      phi::errors::InvalidArgument(
+          "Axis should be less than or equal to %d, but received axis is %d.",
+          max_dim,
+          axis));

   for (int i = 0; i < x_dims.size(); ++i) {
     dst_dims_vec[i + x_axis] =
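The behavioural change in this file is the early return added to TensorCopy: when the source tensor occupies zero bytes, the helper now returns before issuing any host or device copy, so empty tensors (including a possible empty input hitting the new concat fast path) become a no-op. The spelling UNLIKELY(size) == 0 puts the closing parenthesis in an unusual place, but assuming UNLIKELY is the usual __builtin_expect-style hint it is still true exactly when size == 0. The remaining hunks (PADDLE_THROW, the Expand runner chain, PADDLE_ENFORCE_LE) only reflow existing code. Below is a minimal, self-contained sketch of the guard; the UNLIKELY defined here is a local stand-in, not Paddle's macro.

#include <cstddef>
#include <cstring>
#include <vector>

// Local stand-in for the branch-prediction hint; an assumption, not Paddle's macro.
#if defined(__GNUC__) || defined(__clang__)
#define UNLIKELY(x) __builtin_expect(static_cast<bool>(x), 0)
#else
#define UNLIKELY(x) (x)
#endif

// Copy `size` bytes, skipping the call entirely for empty buffers so that
// zero-element tensors never reach the (potentially device-side) memcpy.
void CopyBytes(void* dst, const void* src, std::size_t size) {
  if (UNLIKELY(size) == 0) {  // same spelling as the diff; true only when size == 0
    return;
  }
  std::memcpy(dst, src, size);
}

int main() {
  std::vector<char> src(8, 'x'), dst(8, 0);
  CopyBytes(dst.data(), src.data(), src.size());  // normal 8-byte copy
  CopyBytes(dst.data(), src.data(), 0);           // early return, no memcpy issued
  return 0;
}

CopyBytes(dst, src, 0) simply returns; only non-empty buffers reach std::memcpy.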

0 commit comments
