Skip to content

Commit 9f32642

Browse files
authored
Merge branch 'develop' into fix_logit_bt
2 parents 6ccac7c + 81ef31d commit 9f32642

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

47 files changed

+676
-196
lines changed

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ repos:
6262
- repo: https://github.com/astral-sh/ruff-pre-commit
6363
rev: v0.11.11
6464
hooks:
65-
- id: ruff
65+
- id: ruff-check
6666
args: [--fix, --exit-non-zero-on-fix, --no-cache]
6767
# For C++ files
6868
- repo: local

SECURITY.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,11 @@ These tools include adversarial example evaluation test, pseudo-natural environm
1919
Always load and execute untrusted models inside a sandbox and be sure to know the security impacts.
2020
There are several ways in which a model could become untrusted. PaddlePaddle has enough features to impact the system. (e.g. `paddle.load` uses [pickle](https://docs.python.org/3/library/pickle.html) implicitly, which may cause malformed models to achieve arbitrary code execution). So we recommend when using the untrusted models, you need to carefully audit it and run PaddlePaddle inside a sandbox.
2121

22+
### Using distributed features
23+
PaddlePaddle offers distributed computing capabilities through the paddle.distributed package. These distributed features are meant for secure, trusted environments only, not for use on public or untrusted networks.
24+
25+
For efficiency, PaddlePaddle Distributed (e.g. RPC) does not use encryption or authentication. Messages are sent in plain text, and connections from any source are accepted. This means if you run a PaddlePaddle Distributed program on your network, anyone who can access that network could send tasks to PaddlePaddle, and those tasks will be executed without any security checks, using the same permissions as the PaddlePaddle process.
26+
2227
## PaddlePaddle Code Security
2328

2429
PaddlePaddle always takes code security seriously. However, due to the complexity of the framework and its dependence on other third-party open source libraries, there may still be some security issues undetected. Therefore, we hope that more security researchers and PaddlePaddle developers can participate in the code security program. We encourage responsible disclosure of security issues, as well as contributing code to improve our vulnerability finding tools to make PaddlePaddle safer.

cmake/cudnn.cmake

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,20 +10,24 @@ else()
1010
CACHE PATH "CUDNN ROOT")
1111
endif()
1212

13+
set(TARGET_ARCH "x86_64")
14+
if(NOT ${CMAKE_SYSTEM_PROCESSOR})
15+
set(TARGET_ARCH ${CMAKE_SYSTEM_PROCESSOR})
16+
endif()
17+
1318
find_path(
1419
CUDNN_INCLUDE_DIR cudnn.h
15-
PATHS ${CUDNN_ROOT} ${CUDNN_ROOT}/include $ENV{CUDNN_ROOT}
16-
$ENV{CUDNN_ROOT}/include ${CUDA_TOOLKIT_INCLUDE}
20+
PATHS ${CUDNN_ROOT}
21+
${CUDNN_ROOT}/include
22+
${CUDNN_ROOT}/include/${TARGET_ARCH}-linux-gnu
23+
$ENV{CUDNN_ROOT}
24+
$ENV{CUDNN_ROOT}/include
25+
${CUDA_TOOLKIT_INCLUDE}
1726
/usr/local/lib/python${PY_VERSION}/dist-packages/nvidia/cudnn/include/
1827
NO_DEFAULT_PATH)
1928

2029
get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH)
2130

22-
set(TARGET_ARCH "x86_64")
23-
if(NOT ${CMAKE_SYSTEM_PROCESSOR})
24-
set(TARGET_ARCH ${CMAKE_SYSTEM_PROCESSOR})
25-
endif()
26-
2731
list(
2832
APPEND
2933
CUDNN_CHECK_LIBRARY_DIRS

paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1003,13 +1003,13 @@ bool WhileOp::InferSymbolicShape(
10031003
auto yield_input_data_opt = yield_input_shape_or_data.data();
10041004
auto input_data_opt =
10051005
infer_context->GetShapeOrDataForValue(body_args[i]).data();
1006-
bool const_data_not_euqal =
1006+
bool const_data_not_equal =
10071007
is_all_const_data(yield_input_data_opt) &&
10081008
(!is_all_const_data(input_data_opt) ||
10091009
is_all_const_data(input_data_opt) &&
10101010
yield_input_data_opt.value() != input_data_opt.value());
10111011
auto result_shape_or_data =
1012-
const_data_not_euqal
1012+
const_data_not_equal
10131013
? symbol::TensorShapeOrDataDimExprs(
10141014
yield_input_shape_or_data.shape(),
10151015
creat_new_data(yield_input_data_opt.value().size()))

paddle/fluid/pir/serialize_deserialize/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@ endif()
1313

1414
file(GLOB_RECURSE YAML_PATCH_FILES "*.yaml")
1515
# change pir version when new patches are added
16-
add_definitions(-DDEVELOP_VERSION=0)
17-
add_definitions(-DRELEASE_VERSION=1)
16+
add_definitions(-DDEVELOP_VERSION=2)
17+
add_definitions(-DRELEASE_VERSION=2)
1818
set(TEMPLATE_FILE ${CMAKE_CURRENT_SOURCE_DIR}/patch/template.h.in)
1919
set(PATCH_HEADER ${CMAKE_CURRENT_BINARY_DIR}/patch/patch.h)
2020

paddle/fluid/primitive/decomp_rule/decomp_vjp/details.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2307,15 +2307,18 @@ void group_norm_grad(const Tensor& x,
23072307
auto tmp1 = out_grad_data * (x_data - mean_new) * sqrt_var_1;
23082308

23092309
auto scale_grad_tmp = reshape<T>(
2310-
tmp1.sum(reduce_axis_except_channel, scale->dtype(), false), {-1});
2310+
tmp1.sum(reduce_axis_except_channel, x_data.dtype(), false), {-1});
2311+
scale_grad_tmp = ConvertToOrig<T>(scale_grad_tmp, scale->dtype());
2312+
23112313
set_output<T>(scale_grad_tmp, scale_grad);
23122314
}
23132315
}
23142316

23152317
if (bias_grad) {
23162318
if (bias) {
23172319
auto bias_grad_tmp =
2318-
out_grad_data.sum(reduce_axis_except_channel, bias->dtype(), false);
2320+
out_grad_data.sum(reduce_axis_except_channel, x_data.dtype(), false);
2321+
bias_grad_tmp = ConvertToOrig<T>(bias_grad_tmp, bias->dtype());
23192322

23202323
set_output<T>(reshape<T>(bias_grad_tmp, {-1}), bias_grad);
23212324
}

paddle/fluid/pybind/eager_method.cc

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ typedef SSIZE_T ssize_t;
2929
#include "paddle/fluid/eager/hooks.h"
3030
#include "paddle/fluid/eager/utils.h"
3131
#include "paddle/fluid/framework/convert_utils.h"
32+
#include "paddle/fluid/framework/tensor_util.h"
3233
#include "paddle/fluid/platform/enforce.h"
3334
#include "paddle/fluid/pybind/eager.h"
3435
#include "paddle/fluid/pybind/eager_utils.h"
@@ -1398,6 +1399,61 @@ static PyObject* tensor_method_get_underline_tensor(TensorObject* self,
13981399
EAGER_CATCH_AND_THROW_RETURN_NULL
13991400
}
14001401

1402+
static PyObject* tensor_method_set_underline_tensor(TensorObject* self,
1403+
PyObject* args,
1404+
PyObject* kwargs) {
1405+
EAGER_TRY
1406+
auto& value = GetTensorFromArgs("set_tensor", "value", args, 0, false);
1407+
if (!value.defined()) {
1408+
PADDLE_THROW(
1409+
common::errors::Unavailable("The `set_tensor()` method of (Dist)Tensor "
1410+
"get a non initialized src value"));
1411+
} else if (value.is_dense_tensor()) {
1412+
auto* src_tensor = static_cast<phi::DenseTensor*>(value.impl().get());
1413+
if (self->tensor.is_dense_tensor()) {
1414+
auto* dst_tensor =
1415+
static_cast<phi::DenseTensor*>(self->tensor.impl().get());
1416+
framework::TensorCopy(*src_tensor, dst_tensor->place(), dst_tensor);
1417+
} else {
1418+
PADDLE_THROW(common::errors::Unavailable(
1419+
"The `set_tensor()` method of non DenseTensor get a DenseTensor src "
1420+
"value"));
1421+
}
1422+
1423+
} else if (value.is_dist_tensor()) {
1424+
#ifdef PADDLE_WITH_DISTRIBUTE
1425+
auto* src_tensor =
1426+
static_cast<phi::distributed::DistTensor*>(value.impl().get());
1427+
if (self->tensor.is_dist_tensor()) {
1428+
auto* dst_tensor =
1429+
static_cast<phi::distributed::DistTensor*>(self->tensor.impl().get());
1430+
framework::TensorCopy(*(src_tensor->unsafe_mutable_value()),
1431+
dst_tensor->place(),
1432+
dst_tensor->unsafe_mutable_value());
1433+
1434+
// TensorCopyFrom(dst_tensor->unsafe_mutable_value(),
1435+
// *(src_tensor->unsafe_mutable_value()), dst_tensor->place(), -1);
1436+
} else {
1437+
PADDLE_THROW(
1438+
common::errors::Unavailable("The `set_tensor()` method of non "
1439+
"DistTensor get a DistTensor src value"));
1440+
}
1441+
#else
1442+
PADDLE_THROW(common::errors::Unavailable(
1443+
"The `set_tensor()` method of (Dist)Tensor is not supported in the "
1444+
"current PaddlePaddle, please recompile and installPaddlePaddle "
1445+
"with the option of `WITH_DISTRIBUTE=ON`."));
1446+
#endif
1447+
1448+
} else {
1449+
PADDLE_THROW(common::errors::Unavailable(
1450+
"The `set_tensor()` method of (Dist)Tensor get a non "
1451+
"DenseTensor/DistTensor src value"));
1452+
}
1453+
RETURN_PY_NONE
1454+
EAGER_CATCH_AND_THROW_RETURN_NULL
1455+
}
1456+
14011457
static PyObject* tensor_method_get_underline_selected_rows(TensorObject* self,
14021458
PyObject* args,
14031459
PyObject* kwargs) {
@@ -3643,6 +3699,10 @@ PyMethodDef variable_methods[] = { // NOLINT
36433699
(PyCFunction)(void (*)())tensor_method__get_tensor_from_selected_rows,
36443700
METH_VARARGS | METH_KEYWORDS,
36453701
nullptr},
3702+
{"set_tensor",
3703+
(PyCFunction)(void (*)())tensor_method_set_underline_tensor,
3704+
METH_VARARGS | METH_KEYWORDS,
3705+
nullptr},
36463706
{"_getitem_dygraph",
36473707
(PyCFunction)(void (*)())tensor__getitem_dygraph,
36483708
METH_VARARGS | METH_KEYWORDS,

paddle/phi/core/memory/allocation/allocator_facade.cc

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -791,6 +791,17 @@ class AllocatorFacadePrivate {
791791
}
792792
}
793793

794+
void EraseStream(std::shared_ptr<phi::Allocation> allocation,
795+
phi::stream::stream_t stream) {
796+
if (auto stream_safe_cuda_allocation =
797+
std::dynamic_pointer_cast<StreamSafeCustomDeviceAllocation>(
798+
allocation)) {
799+
stream_safe_cuda_allocation->EraseStream(stream);
800+
} else {
801+
VLOG(6) << "EraseStream for a non-StreamSafeCUDAAllocation";
802+
}
803+
}
804+
794805
phi::stream::stream_t GetStream(
795806
const std::shared_ptr<phi::Allocation>& allocation) const {
796807
const std::shared_ptr<StreamSafeCustomDeviceAllocation>
@@ -1787,11 +1798,17 @@ AllocationPtr AllocatorFacade::Alloc(const phi::Place& place,
17871798
bool AllocatorFacade::InSameStream(
17881799
const std::shared_ptr<phi::Allocation>& allocation,
17891800
const phi::Stream& stream) {
1790-
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
1801+
#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \
1802+
!defined(PADDLE_WITH_CUSTOM_DEVICE)
17911803
gpuStream_t s = reinterpret_cast<gpuStream_t>(stream.id()); // NOLINT
17921804
return s == GetStream(allocation);
1805+
#elif defined(PADDLE_WITH_CUSTOM_DEVICE)
1806+
phi::stream::stream_t s =
1807+
reinterpret_cast<phi::stream::stream_t>(stream.id()); // NOLINT
1808+
return s == GetStream(allocation);
17931809
#else
1794-
PADDLE_THROW(common::errors::PreconditionNotMet("Not compiled with GPU."));
1810+
PADDLE_THROW(common::errors::PreconditionNotMet(
1811+
"Not compiled with GPU or CUDA backend."));
17951812
#endif
17961813
}
17971814

@@ -1946,6 +1963,11 @@ bool AllocatorFacade::RecordStream(std::shared_ptr<phi::Allocation> allocation,
19461963
return GetPrivate()->RecordStream(allocation, stream);
19471964
}
19481965

1966+
void AllocatorFacade::EraseStream(std::shared_ptr<phi::Allocation> allocation,
1967+
phi::stream::stream_t stream) {
1968+
GetPrivate()->EraseStream(allocation, stream);
1969+
}
1970+
19491971
const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
19501972
const phi::Place& place, phi::stream::stream_t stream) {
19511973
AllocatorFacadePrivate* m = GetPrivate();

paddle/phi/core/memory/allocation/allocator_facade.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,8 @@ class AllocatorFacade {
109109
uint64_t Release(const phi::CustomPlace& place, phi::stream::stream_t stream);
110110
bool RecordStream(std::shared_ptr<Allocation> allocation,
111111
phi::stream::stream_t stream);
112+
void EraseStream(std::shared_ptr<Allocation> allocation,
113+
phi::stream::stream_t stream);
112114
TEST_API const std::shared_ptr<Allocator>& GetAllocator(
113115
const phi::Place& place, phi::stream::stream_t stream);
114116
phi::stream::stream_t GetStream(

paddle/phi/core/memory/allocation/stream_safe_custom_device_allocator.cc

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,18 @@ bool StreamSafeCustomDeviceAllocation::RecordStream(
5858
return true;
5959
}
6060

61+
void StreamSafeCustomDeviceAllocation::EraseStream(
62+
phi::stream::stream_t stream) {
63+
VLOG(8) << "Try remove stream " << stream << " for address " << ptr();
64+
std::lock_guard<SpinLock> lock_guard(outstanding_event_map_lock_);
65+
auto it = outstanding_event_map_.find(stream);
66+
if (it == outstanding_event_map_.end()) {
67+
return;
68+
}
69+
it->second->Destroy();
70+
outstanding_event_map_.erase(it);
71+
}
72+
6173
bool StreamSafeCustomDeviceAllocation::CanBeFreed() {
6274
std::lock_guard<SpinLock> lock_guard(outstanding_event_map_lock_);
6375
if (!phi::DeviceManager::HasDeviceType(place_.GetDeviceType())) {
@@ -191,7 +203,8 @@ uint64_t StreamSafeCustomDeviceAllocator::ReleaseImpl(const phi::Place& place) {
191203

192204
void StreamSafeCustomDeviceAllocator::ProcessUnfreedAllocations() {
193205
// NOTE(Ruibiao): This condition is to reduce lock completion. It does not
194-
// need to be thread-safe since here occasional misjudgments are permissible.
206+
// need to be thread-safe since here occasional misjudgments are
207+
// permissible.
195208
if (unfreed_allocations_.empty()) {
196209
return;
197210
}

paddle/phi/core/memory/allocation/stream_safe_custom_device_allocator.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ class StreamSafeCustomDeviceAllocation : public Allocation {
3636
StreamSafeCustomDeviceAllocator *allocator);
3737

3838
bool RecordStream(phi::stream::stream_t stream);
39+
void EraseStream(phi::stream::stream_t stream);
3940
bool CanBeFreed();
4041
phi::stream::stream_t GetOwningStream() const;
4142
void SetOwningStream(phi::stream::stream_t s);

paddle/phi/core/memory/malloc.cc

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,10 +77,24 @@ gpuStream_t GetStream(const std::shared_ptr<Allocation>& allocation) {
7777
#endif
7878

7979
#ifdef PADDLE_WITH_CUSTOM_DEVICE
80+
uint64_t Release(const phi::CustomPlace& place, phi::stream::stream_t stream) {
81+
return allocation::AllocatorFacade::Instance().Release(place, stream);
82+
}
83+
8084
bool RecordStream(std::shared_ptr<Allocation> allocation,
8185
phi::stream::stream_t stream) {
8286
return allocation::AllocatorFacade::Instance().RecordStream(allocation,
8387
stream);
8488
}
89+
90+
void EraseStream(std::shared_ptr<Allocation> allocation,
91+
phi::stream::stream_t stream) {
92+
return allocation::AllocatorFacade::Instance().EraseStream(allocation,
93+
stream);
94+
}
95+
96+
phi::stream::stream_t GetStream(const std::shared_ptr<Allocation>& allocation) {
97+
return allocation::AllocatorFacade::Instance().GetStream(allocation);
98+
}
8599
#endif
86100
} // namespace paddle::memory

paddle/phi/core/memory/malloc.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,8 +59,16 @@ void EraseStream(std::shared_ptr<Allocation> allocation, gpuStream_t stream);
5959
gpuStream_t GetStream(const std::shared_ptr<Allocation>& allocation);
6060
#endif
6161
#ifdef PADDLE_WITH_CUSTOM_DEVICE
62+
extern uint64_t Release(const phi::CustomPlace& place,
63+
phi::stream::stream_t stream);
64+
6265
bool RecordStream(std::shared_ptr<Allocation> allocation,
6366
phi::stream::stream_t stream);
67+
68+
void EraseStream(std::shared_ptr<Allocation> allocation,
69+
phi::stream::stream_t stream);
70+
71+
phi::stream::stream_t GetStream(const std::shared_ptr<Allocation>& allocation);
6472
#endif
6573

6674
template <typename StreamType>

paddle/phi/infermeta/unary.cc

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4520,31 +4520,31 @@ void SplitInferMeta(const MetaTensor& x,
45204520
} else {
45214521
auto input_axis_dim = x.dims().at(axis_value);
45224522
std::vector<int64_t> sections_vec;
4523-
const int unknow_dim_val = -1;
4524-
int unknow_dim_idx = -1;
4525-
int num_of_unknow = 0;
4523+
const int unknown_dim_val = -1;
4524+
int unknown_dim_idx = -1;
4525+
int num_of_unknown = 0;
45264526
int64_t sum_of_section = 0;
45274527

45284528
for (int i = 0; i < static_cast<int>(sections_data.size()); ++i) {
45294529
sections_vec.push_back(sections_data[i]);
45304530

4531-
if (sections_data[i] == unknow_dim_val) {
4532-
num_of_unknow++;
4533-
unknow_dim_idx = i;
4531+
if (sections_data[i] == unknown_dim_val) {
4532+
num_of_unknown++;
4533+
unknown_dim_idx = i;
45344534
} else {
45354535
sum_of_section += static_cast<int64_t>(sections_data[i]);
45364536
}
45374537
}
45384538

4539-
PADDLE_ENFORCE_LE(num_of_unknow,
4539+
PADDLE_ENFORCE_LE(num_of_unknown,
45404540
1,
45414541
common::errors::InvalidArgument(
45424542
"Only one dimension value of Attr(num_or_sections) "
45434543
"in SplitOp can be -1. "
45444544
"But received Attr(num_or_sections) = [%s].",
45454545
common::make_ddim(sections_data)));
45464546

4547-
if (unknow_dim_idx != -1) {
4547+
if (unknown_dim_idx != -1) {
45484548
// for example, input shape = [4 ,5], axis = 1, sections = [2, 3, -1].
45494549
// input_axis_dim = 5, sum_of_sections = 5.
45504550
// the following check will fail.
@@ -4561,7 +4561,7 @@ void SplitInferMeta(const MetaTensor& x,
45614561
x.dims(),
45624562
axis_value));
45634563

4564-
sections_vec[unknow_dim_idx] = input_axis_dim - sum_of_section;
4564+
sections_vec[unknown_dim_idx] = input_axis_dim - sum_of_section;
45654565
} else {
45664566
PADDLE_ENFORCE_EQ(
45674567
sum_of_section,

paddle/phi/kernels/cpu/elementwise_add_kernel.cc

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,6 @@ void AddKernel(const Context& dev_ctx,
5252
const DenseTensor& y,
5353
DenseTensor* out) {
5454
if (x.numel() == 0 || y.numel() == 0) {
55-
out->Resize(out->dims());
5655
dev_ctx.template Alloc<T>(out);
5756
return;
5857
}

paddle/phi/kernels/cpu/elementwise_divide_kernel.cc

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@ void DivideKernel(const Context& dev_ctx,
2828
const DenseTensor& y,
2929
DenseTensor* out) {
3030
if (x.numel() == 0 || y.numel() == 0) {
31-
out->Resize(out->dims());
3231
dev_ctx.template Alloc<T>(out);
3332
return;
3433
}

paddle/phi/kernels/cpu/elementwise_multiply_kernel.cc

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@ void MultiplyKernel(const Context& dev_ctx,
2828
const DenseTensor& y,
2929
DenseTensor* out) {
3030
if (x.numel() == 0 || y.numel() == 0) {
31-
out->Resize(out->dims());
3231
dev_ctx.template Alloc<T>(out);
3332
return;
3433
}

0 commit comments

Comments
 (0)