Skip to content

Commit 978113d

Browse files
authored
XPU Update xft and weight_quant (PaddlePaddle#72053)
1 parent 9f42e45 commit 978113d

File tree

10 files changed

+129
-40
lines changed

10 files changed

+129
-40
lines changed

cmake/external/xpu.cmake

+7-7
Original file line numberDiff line numberDiff line change
@@ -34,12 +34,12 @@ if(NOT DEFINED XPU_XHPC_BASE_DATE)
3434
endif()
3535
set(XPU_XCCL_BASE_VERSION "3.0.2.5") # For XRE5
3636
if(NOT DEFINED XPU_XFT_BASE_VERSION)
37-
set(XPU_XFT_BASE_VERSION "20230602")
37+
set(XPU_XFT_BASE_VERSION "20250402/xpu3")
3838
endif()
3939

4040
if(NOT DEFINED XPU_XRE_BASE_VERSION)
4141
if(WITH_XPU_XRE5)
42-
set(XPU_XRE_BASE_VERSION "5.0.21.18")
42+
set(XPU_XRE_BASE_VERSION "5.0.21.19")
4343
else()
4444
set(XPU_XRE_BASE_VERSION "4.32.0.1")
4545
endif()
@@ -61,7 +61,7 @@ set(XPU_XCCL_BASE_URL
6161

6262
if(NOT XPU_XFT_BASE_URL)
6363
set(XPU_XFT_BASE_URL
64-
"https://klx-sdk-release-public.su.bcebos.com/xft/dev/${XPU_XFT_BASE_VERSION}"
64+
"https://klx-sdk-release-public.su.bcebos.com/xft_internal/dev/${XPU_XFT_BASE_VERSION}"
6565
)
6666
endif()
6767

@@ -112,7 +112,7 @@ else()
112112
set(XPU_XHPC_DIR_NAME "xhpc-ubuntu1604_x86_64")
113113
endif()
114114
set(XPU_XCCL_DIR_NAME "xccl_Linux_x86_64")
115-
set(XPU_XFT_DIR_NAME "xft_ubuntu1604_x86_64")
115+
set(XPU_XFT_DIR_NAME "xft_internal_ubuntu2004")
116116
endif()
117117

118118
set(XPU_XRE_URL
@@ -187,9 +187,9 @@ if(DEFINED ENV{XPU_LIB_ROOT})
187187
endif()
188188

189189
# XCCL
190-
if(DEFINED ENV{XCCL_DIR_NAME})
191-
set(XPU_XCCL_URL "${XPU_LIB_ROOT}/$ENV{XCCL_DIR_NAME}")
192-
set(XCCL_DIR_NAME "$ENV{XCCL_DIR_NAME}")
190+
if(DEFINED ENV{XPU_XCCL_DIR_NAME})
191+
set(XPU_XCCL_URL "${XPU_LIB_ROOT}/$ENV{XPU_XCCL_DIR_NAME}")
192+
set(XPU_XCCL_DIR_NAME "$ENV{XPU_XCCL_DIR_NAME}")
193193
endif()
194194

195195
# XHPC

paddle/fluid/framework/ir/xpu/weight_only_linear_xpu_pass.cc

+6-6
Original file line numberDiff line numberDiff line change
@@ -44,20 +44,20 @@ PermuteINT8WeightOnlyPattern::PermuteINT8WeightOnlyPattern(
4444
PDPattern* pattern, const std::string& name_scope)
4545
: PatternBase(pattern, name_scope, name_scope) {
4646
auto* input = pattern->NewNode(input_repr())
47-
->assert_is_op_input("weight_only_linear_xpu", "x")
47+
->assert_is_op_input("weight_only_linear", "x")
4848
->AsInput();
4949
auto* weight = pattern->NewNode(weight_repr())
50-
->assert_is_op_input("weight_only_linear_xpu", "weight")
50+
->assert_is_op_input("weight_only_linear", "weight")
5151
->AsInput();
5252
auto* weight_scale =
5353
pattern->NewNode(weight_scale_repr())
54-
->assert_is_op_input("weight_only_linear_xpu", "weight_scale")
54+
->assert_is_op_input("weight_only_linear", "weight_scale")
5555
->AsInput();
5656
auto* out = pattern->NewNode(out_repr())
57-
->assert_is_op_output("weight_only_linear_xpu", "out")
57+
->assert_is_op_output("weight_only_linear", "out")
5858
->AsOutput();
5959
auto* weight_only_linear = pattern->NewNode(weight_only_linear_repr())
60-
->assert_is_op("weight_only_linear_xpu");
60+
->assert_is_op("weight_only_linear");
6161

6262
std::vector<PDNode*> input_vars{input, weight, weight_scale};
6363
std::vector<PDNode*> output_vars{out};
@@ -236,4 +236,4 @@ REGISTER_PASS(weight_only_linear_xpu_pass,
236236
REGISTER_PASS_CAPABILITY(weight_only_linear_xpu_pass)
237237
.AddCombination(
238238
paddle::framework::compatible::OpVersionComparatorCombination().EQ(
239-
"weight_only_linear_xpu", 0));
239+
"weight_only_linear", 0));

paddle/fluid/framework/new_executor/instruction/instruction_util.cc

+18-3
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,8 @@ phi::DeviceContext* ParseDeviceContext(pir::Operation* op,
107107

108108
// only gpu need update. xpu not need, because xpu memcpy op kernel is
109109
// synchronous.
110-
if (phi::is_gpu_place(place) || phi::is_custom_place(place)) {
110+
if (phi::is_gpu_place(place) || phi::is_custom_place(place) ||
111+
phi::is_xpu_place(place)) {
111112
VLOG(6) << "Parse DeviceContext for " << op_name
112113
<< ", execution stream = " << execution_stream;
113114
if (execution_stream != kDefaultStream) {
@@ -136,7 +137,7 @@ phi::DeviceContext* ParseDeviceContext(pir::Operation* op,
136137
}
137138

138139
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
139-
defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_CUSTOM_DEVICE)
140+
defined(PADDLE_WITH_CUSTOM_DEVICE) || defined(PADDLE_WITH_XPU_BKCL)
140141
// NOTE(Ruibiao): Here supports multi-stream overlap for c_allreduce_sum
141142
// with use_cal_stream==false by returning a device context getting from the
142143
// global NCCLCommContext instance. Because when use_calc_stream==false, in
@@ -205,7 +206,21 @@ phi::DeviceContext* ParseDeviceContext(pir::Operation* op,
205206
op_name.compare(paddle::dialect::AllToAllOp::name()) == 0 ||
206207
op_name.compare(
207208
paddle::dialect::CSoftmaxWithCrossEntropyOp::name()) == 0) {
208-
#ifdef PADDLE_WITH_CUSTOM_DEVICE
209+
#if defined(PADDLE_WITH_XPU_BKCL)
210+
if (phi::is_xpu_place(place) && execution_stream == kDefaultStream) {
211+
VLOG(3) << "set stream for " << op_name << "in XPU device";
212+
if (origin_dev_ctx != nullptr) {
213+
// set stream
214+
auto default_stream =
215+
static_cast<DEVICE_CONTEXT*>(origin_dev_ctx)->stream();
216+
static_cast<DEVICE_CONTEXT*>(dev_ctx)->SetStream(default_stream);
217+
// todo set allocator
218+
} else {
219+
VLOG(3) << "CUSTOM DEVICE op " << op_name << " ring_id "
220+
<< ring_id << " origin_dev_ctx is nullptr";
221+
}
222+
}
223+
#elif defined(PADDLE_WITH_CUSTOM_DEVICE)
209224
if (phi::is_custom_place(place) &&
210225
execution_stream == kDefaultStream) {
211226
VLOG(3) << "set stream for " << op_name << "in Custom device";

paddle/phi/backends/xpu/xpu2_op_list.cc

+1-1
Original file line numberDiff line numberDiff line change
@@ -1220,7 +1220,7 @@ XPUOpMap& get_kl2_ops() {
12201220
phi::DataType::FLOAT32})},
12211221
{"warpctc_grad", XPUKernelSet({phi::DataType::FLOAT32})},
12221222
{"warpctc", XPUKernelSet({phi::DataType::FLOAT32})},
1223-
{"weight_only_linear_xpu",
1223+
{"weight_only_linear",
12241224
XPUKernelSet({phi::DataType::FLOAT16, phi::DataType::BFLOAT16})},
12251225
{"where_index",
12261226
XPUKernelSet({phi::DataType::INT32,

paddle/phi/backends/xpu/xpu3_op_list.cc

+2
Original file line numberDiff line numberDiff line change
@@ -1690,6 +1690,8 @@ XPUOpMap& get_kl3_ops() {
16901690
phi::DataType::BOOL,
16911691
phi::DataType::FLOAT32,
16921692
phi::DataType::INT64})},
1693+
{"weight_quantize",
1694+
XPUKernelSet({phi::DataType::FLOAT16, phi::DataType::BFLOAT16})},
16931695
{"where_grad",
16941696
XPUKernelSet({phi::DataType::INT32,
16951697
phi::DataType::INT64,

paddle/phi/kernels/fusion/xpu/spatial_transformer_resblock_xpu_kernel.cc

+2-1
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,8 @@ void SpatialTransformerResblockXPUKernel(
6262
bool include_silu,
6363
DenseTensor* out,
6464
DenseTensor* out_max) {
65-
#ifdef PADDLE_WITH_XPU_XFT
65+
// not supported in current xft
66+
#if defined(PADDLE_WITH_XPU_XFT_NOT_SUPPORT)
6667
using XPUType = typename XPUTypeTrait<T>::Type;
6768

6869
auto* in1 = reinterpret_cast<const XPUType*>(x.data<T>());

paddle/phi/kernels/xpu/top_p_sampling_kernel.cc

+6-6
Original file line numberDiff line numberDiff line change
@@ -58,13 +58,13 @@ void TopPSamplingKernel(const Context& dev_ctx,
5858
auto x_dims = x.dims();
5959
int bs = x_dims[0];
6060
int vocab_size = x_dims[1];
61-
int p_num = ps.numel();
61+
// int p_num = ps.numel();
6262

63-
PADDLE_ENFORCE_EQ(
64-
p_num,
65-
bs,
66-
common::errors::PreconditionNotMet(
67-
"Expected bs == p_num, but got bs=%d, p_num=%d.", bs, p_num));
63+
// PADDLE_ENFORCE_EQ(
64+
// p_num,
65+
// bs,
66+
// common::errors::PreconditionNotMet(
67+
// "Expected bs == p_num, but got bs=%d, p_num=%d.", bs, p_num));
6868

6969
std::vector<int64_t> infer_seed(bs, random_seed);
7070
if (topp_seed.get_ptr() != nullptr) {
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#if defined(PADDLE_WITH_XPU_XFT)
#include <xft/xdnn_plugin.h>
#endif
#include "paddle/common/enforce.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/cast_kernel.h"
#include "paddle/phi/kernels/transpose_kernel.h"

namespace phi {

// XPU implementation of the `weight_quantize` op.
//
// Quantizes a 2-D weight matrix `x` of shape [k, n] into int8 with one
// float scale per output channel (per-column), via XFT's
// xft_quant2d_per_channel kernel.
//
// Parameters:
//   dev_ctx    - XPU device context used for allocation and kernel launch.
//   x          - input weight, expected shape [k, n], dtype T
//                (float16/bfloat16 per the registration below).
//   algo       - quantization algorithm; only "weight_only_int8" is
//                supported here.
//   arch       - target architecture id; currently unused on XPU
//                (kept for interface parity with the GPU kernel).
//   group_size - group quantization size; only -1 (per-channel) is
//                supported here.
//   out        - output int8 tensor, resized to [k, n].
//   scale      - output float scales, resized to [n] (one per column).
//
// Throws Unimplemented for unsupported `algo`/`group_size`, or when Paddle
// was built without PADDLE_WITH_XPU_XFT.
template <typename T, typename Context>
void WeightQuantizeKernel(const Context& dev_ctx,
                          const DenseTensor& x,
                          const std::string& algo,
                          const int32_t arch,
                          const int32_t group_size,
                          DenseTensor* out,
                          DenseTensor* scale) {
#if defined(PADDLE_WITH_XPU_XFT)
  using XPUType = typename XPUTypeTrait<T>::Type;
  auto xpu_ctx = static_cast<const phi::XPUContext*>(&dev_ctx);

  // The XFT quantizer operates on a [k, n] matrix; reject other ranks
  // instead of silently mis-sizing the outputs.
  PADDLE_ENFORCE_EQ(
      x.dims().size(),
      2,
      common::errors::InvalidArgument(
          "weight_quantize on XPU expects a 2-D weight, but got rank %d.",
          x.dims().size()));
  int k = static_cast<int>(x.dims()[0]);
  int n = static_cast<int>(x.dims()[1]);

  // One float scale per output channel (column).
  scale->Resize({static_cast<int64_t>(n)});
  dev_ctx.template Alloc<float>(scale);

  if (algo == "weight_only_int8") {
    // Only per-channel quantization is wired up; fail loudly rather than
    // ignore a grouped-quantization request.
    PADDLE_ENFORCE_EQ(
        group_size,
        -1,
        common::errors::Unimplemented(
            "weight_quantize on XPU only supports per-channel quantization "
            "(group_size == -1), but got group_size=%d.",
            group_size));
    out->Resize({static_cast<int64_t>(k), static_cast<int64_t>(n)});
    dev_ctx.template Alloc<int8_t>(out);

    int ret = baidu::xpu::xftkernel::xft_quant2d_per_channel<XPUType, float>(
        xpu_ctx->x_context(),
        reinterpret_cast<const XPUType*>(x.template data<T>()),
        nullptr,  // no precomputed max; let the kernel derive scales
        out->data<int8_t>(),
        scale->data<float>(),
        k,
        n);
    PADDLE_ENFORCE_XDNN_SUCCESS(ret, "quant2d");
  } else {
    PADDLE_THROW(common::errors::Unimplemented(
        "Weight quantize only supports weight_only_int8 on XPU now."));
  }
#else
  PADDLE_THROW(common::errors::Unimplemented(
      "weight_quantize is not supported since it's not "
      "compiled with XPU_XFT"));
#endif
}
}  // namespace phi

PD_REGISTER_KERNEL(weight_quantize,
                   XPU,
                   ALL_LAYOUT,
                   phi::WeightQuantizeKernel,
                   phi::dtype::float16,
                   phi::dtype::bfloat16) {}

paddle/phi/ops/yaml/fused_ops.yaml

-10
Original file line numberDiff line numberDiff line change
@@ -828,16 +828,6 @@
828828
optional : mask
829829
support_dygraph_mode : true
830830

831-
- op : weight_only_linear_xpu
832-
args : (Tensor x, Tensor weight, Tensor bias, Tensor weight_scale, str weight_dtype, int arch = 80, int group_size = -1)
833-
output : Tensor(out)
834-
infer_meta :
835-
func : WeightOnlyLinearInferMeta
836-
kernel :
837-
func : weight_only_linear_xpu
838-
data_type : x
839-
optional : bias
840-
841831
- op : yolo_box_xpu
842832
args : (Tensor x, Tensor x_max, Tensor grid, Tensor stride, Tensor anchor_grid, float offset)
843833
output : Tensor(out), Tensor(out_max)

0 commit comments

Comments
 (0)