Commit 26414ec

[MLU] change log of cncl & fix kernels. (PaddlePaddle#565)
1 parent 3c3c9e5 · commit 26414ec

File tree: 5 files changed, +58 -29 lines

backends/mlu/kernels/adam_kernel.cc (+16 -22)
@@ -231,29 +231,23 @@ void AdamWKernel(const Context& dev_ctx,
   VLOG(3) << "Skip update" << skip_update_ << ", With decay: " << with_decay;

   if (!skip_update_ && with_decay) {
-    if (master_param.is_initialized()) {
-      PADDLE_THROW(
-          phi::errors::Unimplemented("Master Param is not supported on MLU"));
-    } else {
-      // update param with decay coeff: mul(-1 * lr, coeff * param) + param
-      MLUCnnlTensorDesc lr_desc(learning_rate);
-      MLUCnnlTensorDesc param_desc(param);
-      MLUCnnlOpTensorDesc mul_op_desc(
-          CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
+    MLUCnnlTensorDesc lr_desc(learning_rate);
+    MLUCnnlTensorDesc param_desc(param);
+    MLUCnnlOpTensorDesc mul_op_desc(
+        CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);

-      MLUCnnl::OpTensor(dev_ctx,
-                        mul_op_desc.get(),
-                        lr_desc.get(),
-                        GetBasePtr(&learning_rate),
-                        param_desc.get(),
-                        GetBasePtr(&param),
-                        param_desc.get(),
-                        const_cast<void*>(GetBasePtr(&param)),
-                        ToCnnlDataType<T>(),
-                        /*alpha1*/ -1.f,
-                        /*alpha2*/ coeff,
-                        /*beta*/ 1.f);
-    }
+    MLUCnnl::OpTensor(dev_ctx,
+                      mul_op_desc.get(),
+                      lr_desc.get(),
+                      GetBasePtr(&learning_rate),
+                      param_desc.get(),
+                      GetBasePtr(&param),
+                      param_desc.get(),
+                      const_cast<void*>(GetBasePtr(&param)),
+                      ToCnnlDataType<T>(),
+                      /*alpha1*/ -1.f,
+                      /*alpha2*/ coeff,
+                      /*beta*/ 1.f);
   }

   custom_kernel::AdamKernel<T, Context>(dev_ctx,
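The retained OpTensor call implements the decay step described by the removed comment, mul(-1 * lr, coeff * param) + param, i.e. param is scaled by (1 - lr * coeff) before the plain Adam update. A minimal host-side sketch of that arithmetic (plain C++, illustrative names only, not MLU/CNNL code):

// Host-side sketch of the decay step the OpTensor call performs:
// out = (alpha1 * lr) * (alpha2 * param) + beta * param with
// alpha1 = -1, alpha2 = coeff, beta = 1, i.e. param -= lr * coeff * param.
#include <vector>

void ApplyDecoupledWeightDecay(std::vector<float>& param, float lr, float coeff) {
  for (float& p : param) {
    p = (-1.0f * lr) * (coeff * p) + p;  // same formula, one element at a time
  }
}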

backends/mlu/kernels/elementwise_div_kernel.cc (+8 -3)
@@ -43,8 +43,13 @@ void DivideGradKernel(const Context& dev_ctx,
                       int axis,
                       phi::DenseTensor* dx,
                       phi::DenseTensor* dy) {
-  const auto& x_dims = x.dims();
-  const auto& y_dims = y.dims();
+  Tensor x_t, y_t;
+  x_t = x;
+  y_t = y;
+  if (x.dims().size() == 0) x_t.Resize(phi::make_ddim({1}));
+  if (y.dims().size() == 0) y_t.Resize(phi::make_ddim({1}));
+  const auto& x_dims = x_t.dims();
+  const auto& y_dims = y_t.dims();
   axis =
       (axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1) : axis);
   int max_dim = std::max(x_dims.size(), y_dims.size());
@@ -74,7 +79,7 @@ void DivideGradKernel(const Context& dev_ctx,
                     dout_desc.get(),
                     GetBasePtr(&dout),
                     y_desc.get(),
-                    GetBasePtr(&y),
+                    GetBasePtr(&y_t),
                     dout_desc.get(),
                     GetBasePtr(&dout_div_y));
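The x_t / y_t copies exist only so rank-0 inputs get a shape of {1} before the axis and broadcast arithmetic below them. A standalone sketch of that normalization (plain C++, hypothetical helper names):

#include <cstdint>
#include <cstdlib>
#include <vector>

// Treat a rank-0 (scalar) shape as {1}: same element count, rank >= 1.
std::vector<int64_t> PromoteScalarShape(std::vector<int64_t> dims) {
  if (dims.empty()) dims.push_back(1);
  return dims;
}

// Same axis normalization as in DivideGradKernel, valid once both ranks >= 1.
int NormalizeAxis(int axis, int x_rank, int y_rank) {
  return axis < 0 ? std::abs(x_rank - y_rank) + axis + 1 : axis;
}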

backends/mlu/kernels/funcs/reduce_op.h (+5 -0)
@@ -15,6 +15,7 @@
 #pragma once

 #include "kernels/funcs/mlu_baseop.h"
+#include "kernels/funcs/mlu_funcs.h"

 namespace custom_kernel {

@@ -27,6 +28,10 @@ void MLUReduceOp(const Context& dev_ctx,
                  const std::string& reduce_name,
                  phi::DenseTensor* out) {
   dev_ctx.template Alloc<T>(out);
+  if (x.dims().size() == 0) {
+    TensorCopy(dev_ctx, x, true, out);
+    return;
+  }

   auto dims = axes;
   auto input_dims = phi::vectorize(x.dims());

backends/mlu/kernels/split_kernel.cc (+26 -0)
@@ -69,6 +69,23 @@ void SplitKernel(const Context& dev_ctx,
                 vct_tensor.data());
 }

+template <typename T, typename Context>
+void SplitWithNumKernel(const Context& dev_ctx,
+                        const phi::DenseTensor& x,
+                        int num,
+                        const phi::Scalar& axis_scalar,
+                        std::vector<phi::DenseTensor*> outs) {
+  int axis_value = axis_scalar.to<int>();
+  auto input_axis_dim = x.dims().at(axis_value);
+  std::vector<int64_t> sections_vec;
+  for (int i = 0; i < num; ++i) {
+    sections_vec.push_back(input_axis_dim / num);
+  }
+  phi::IntArray sections(sections_vec);
+  custom_kernel::SplitKernel<T, Context>(
+      dev_ctx, x, sections, axis_scalar, outs);
+}
+
 }  // namespace custom_kernel

 PD_REGISTER_PLUGIN_KERNEL(split,
@@ -80,3 +97,12 @@ PD_REGISTER_PLUGIN_KERNEL(split,
                           int,
                           bool,
                           phi::dtype::float16) {}
+PD_REGISTER_PLUGIN_KERNEL(split_with_num,
+                          mlu,
+                          ALL_LAYOUT,
+                          custom_kernel::SplitWithNumKernel,
+                          float,
+                          int64_t,
+                          int,
+                          bool,
+                          phi::dtype::float16) {}
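SplitWithNumKernel only turns the requested number of outputs into an even sections list and then delegates to the existing SplitKernel. A standalone sketch of that computation (plain C++, hypothetical helper name), assuming the axis extent is divisible by num:

#include <cstdint>
#include <vector>

// e.g. EvenSections(9, 3) -> {3, 3, 3}; mirrors the loop in SplitWithNumKernel.
std::vector<int64_t> EvenSections(int64_t axis_dim, int num) {
  std::vector<int64_t> sections(num, axis_dim / num);
  return sections;
}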

backends/mlu/runtime/runtime.cc (+3 -4)
@@ -324,17 +324,16 @@ C_Status XcclCommInitRank(size_t nranks,
   PADDLE_ENFORCE_MLU_SUCCESS(cnrtGetDevice(&dev_id));
   int dev_list[] = {dev_id};
   int rank_list[] = {rank};
-  VLOG(4) << "[CNCL] create comm.";
+  VLOG(4) << "[CNCL] create comm with rank: " << rank << " clique: "
+          << reinterpret_cast<cnclCliqueId *>(unique_id->data)->hash;
   PADDLE_ENFORCE_MLU_SUCCESS(
       cnclInitComms(reinterpret_cast<cnclComm_t *>(comm),
                     1,
                     dev_list,
                     rank_list,
                     nranks,
                     reinterpret_cast<cnclCliqueId *>(unique_id->data)));
-  VLOG(4) << "[CNCL] comm inited: " << reinterpret_cast<cnclComm_t>(*comm)
-          << " clique: "
-          << reinterpret_cast<cnclCliqueId *>(unique_id->data)->hash;
+  VLOG(4) << "[CNCL] comm inited: " << reinterpret_cast<cnclComm_t>(*comm);
   return C_SUCCESS;
 }