
Commit a94bd18

Merge branch 'PaddlePaddle:develop' into develop
2 parents: ac57ded + b8455e0

203 files changed: +4019 additions, -3226 deletions


.github/workflows/_Api-Benchmark.yml

Lines changed: 3 additions & 0 deletions
@@ -102,6 +102,8 @@ jobs:
       tar -zvxf PaddleTest.tar.gz 1>/dev/null 2>&1
       # git submodule foreach "git config --global --add safe.directory \$toplevel/\$sm_path"
       source ${{ github.workspace }}/../../../proxy
+      mkdir -p ${{ github.workspace }}/../../../pip
+      ${python} -m pip config set global.cache-dir ${{ github.workspace }}/../../../pip
       ${python} -m pip install -r ./PaddleTest/framework/e2e/api_benchmark_new/requirement.txt
       '
@@ -113,6 +115,7 @@ jobs:
       export LD_LIBRARY_PATH=/usr/local/cuda-11.8/compat:/usr/local/cuda/compat:/usr/local/cuda/lib:/usr/local/cuda/lib64:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}
       cd ./PaddleTest/framework/e2e/api_benchmark_new
       cp /paddle/PTSTools/Uploader/apibm_config.yml .
+      source ${{ github.workspace }}/../../../proxy
       ${python} -m pip install https://paddle-github-action.bj.bcebos.com/PR/build/${PR_ID}/${COMMIT_ID}/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
       if [ ${core_index} -eq -1 ];then
       ${python} runner_ci_action.py --yaml ../yaml/api_benchmark_fp32.yml --core_index 2
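The two added `pip` lines give the benchmark job a persistent download cache outside the checkout, so repeated runs can reuse previously downloaded wheels. As a hedged illustration (the cache path below is a made-up stand-in, not the runner's real `${{ github.workspace }}/../../../pip` location), the same option can be set and read back through pip's config interface:

```python
# Illustrative sketch: mirrors the workflow's `pip config set global.cache-dir <dir>`
# step and then reads the option back to confirm what pip will use.
import subprocess
import sys

cache_dir = "/tmp/ci-pip-cache"  # hypothetical stand-in for the workflow's cache directory

subprocess.run(
    [sys.executable, "-m", "pip", "config", "set", "global.cache-dir", cache_dir],
    check=True,
)
result = subprocess.run(
    [sys.executable, "-m", "pip", "config", "get", "global.cache-dir"],
    capture_output=True, text=True, check=True,
)
print(result.stdout.strip())  # expected to echo the configured directory
```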

.github/workflows/_Slice.yml

Lines changed: 1 addition & 1 deletion
@@ -72,8 +72,8 @@ jobs:
       mkdir -p ${{ github.workspace }}/../../../.cache/pip
       source ${{ github.workspace }}/../../../proxy
       python3.10 -m pip config set global.cache-dir ${{ github.workspace }}/../../../.cache/pip
-      python3.10 -m pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu118
       python3.10 -m pip install $wheel_link
+      python3.10 -m pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu118
       python3.10 test_slice_float32.py
       '

.pre-commit-config.yaml

Lines changed: 2 additions & 2 deletions
@@ -56,11 +56,11 @@ repos:
       args: [--force-exclude]
   # For Python files
   - repo: https://github.com/psf/black-pre-commit-mirror
-    rev: 24.8.0
+    rev: 25.1.0
     hooks:
       - id: black
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.11.0
+    rev: v0.11.11
     hooks:
       - id: ruff
         args: [--fix, --exit-non-zero-on-fix, --no-cache]

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -66,7 +66,7 @@ option(WITH_SETUP_INSTALL "Compile PaddlePaddle with setup.py" OFF)
 option(WITH_SHARED_PHI "Compile PaddlePaddle with SHARED LIB of PHI" ON)
 option(CINN_WITH_CUDNN "Compile CINN with CUDNN support" ON)
 option(WITH_PIP_CUDA_LIBRARIES
-       "Paddle uses the CUDA library provided by NVIDIA" OFF)
+       "Paddle uses the CUDA library provided by NVIDIA" ON)
 option(WITH_PIP_TENSORRT "Paddle uses the tensorrt provided by NVIDIA" OFF)
 option(WITH_NIGHTLY_BUILD
        "Compile nightly paddle whl package of the develop branch" OFF)

ci/coverage_diff.py

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@ def get_diff_file_lines(diff_file):
         line = line.strip()

         if line.startswith('+++ '):
-            current_file = line.lstrip('+++ ')
+            current_file = line.removeprefix('+++ ')

             diff_file_lines[current_file] = []
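The switch from `lstrip` to `removeprefix` is a correctness fix, not a style change: `str.lstrip('+++ ')` strips any leading run of the characters `+` and space, whereas `str.removeprefix('+++ ')` (Python 3.9+) removes exactly that literal prefix. A small illustration with a hypothetical file whose name happens to start with `+`:

```python
# lstrip() strips a *character set*; removeprefix() strips a literal prefix.
header = "+++ +plus_variant.py"  # hypothetical diff header, file name starts with '+'

print(header.lstrip('+++ '))       # 'plus_variant.py'  -- keeps eating '+' and ' ' characters
print(header.removeprefix('+++ ')) # '+plus_variant.py' -- removes only the '+++ ' prefix
```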

ci/coverage_gcda_clean.py

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" usage: gcda_clean.py pull_id. """
+"""usage: gcda_clean.py pull_id."""

 import os
 import sys

ci/slice/test_slice_float32.py

Lines changed: 2 additions & 2 deletions
@@ -697,9 +697,9 @@ def main():
         S,
         ", B = ",
         B,
-        ", score <2 : ",
+        ", score <=2 : ",
         B2,
-        ", score > 3 : ",
+        ", score >2 : ",
         Bother,
     )

cmake/cudnn.cmake

Lines changed: 5 additions & 0 deletions
@@ -14,6 +14,7 @@ find_path(
   CUDNN_INCLUDE_DIR cudnn.h
   PATHS ${CUDNN_ROOT} ${CUDNN_ROOT}/include $ENV{CUDNN_ROOT}
         $ENV{CUDNN_ROOT}/include ${CUDA_TOOLKIT_INCLUDE}
+        /usr/local/lib/python${PY_VERSION}/dist-packages/nvidia/cudnn/include/
   NO_DEFAULT_PATH)

 get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH)
@@ -43,6 +44,9 @@ set(CUDNN_LIB_NAME "")

 if(LINUX)
   set(CUDNN_LIB_NAME "libcudnn.so")
+  if(${CUDA_VERSION} GREATER_EQUAL 12.6)
+    set(CUDNN_LIB_NAME "libcudnn.so.9")
+  endif()
 endif()

 if(WIN32)
@@ -58,6 +62,7 @@ find_library(
   CUDNN_LIBRARY
   NAMES ${CUDNN_LIB_NAME} # libcudnn_static.a
   PATHS ${CUDNN_CHECK_LIBRARY_DIRS} ${CUDNN_INCLUDE_DIR} ${__libpath_hist}
+        /usr/local/lib/python${PY_VERSION}/dist-packages/nvidia/cudnn/lib/
   NO_DEFAULT_PATH
   DOC "Path to cuDNN library.")
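Both new hint paths point at the directory layout used by the pip-distributed NVIDIA cuDNN wheels. A small Python sketch of the same path construction, useful only as a local sanity check; the `/usr/local` prefix and `dist-packages` layout are assumptions carried over from the CMake lines above, not something this change guarantees on every system:

```python
# Mirrors the CMake hint paths
#   /usr/local/lib/python${PY_VERSION}/dist-packages/nvidia/cudnn/{include,lib}
# to check whether a pip-installed cuDNN is present on this machine.
import sys
from pathlib import Path

def pip_cudnn_dirs(prefix: str = "/usr/local") -> tuple[Path, Path]:
    py_version = f"{sys.version_info.major}.{sys.version_info.minor}"
    base = Path(prefix, "lib", f"python{py_version}", "dist-packages", "nvidia", "cudnn")
    return base / "include", base / "lib"

include_dir, lib_dir = pip_cudnn_dirs()
print(include_dir, "exists" if include_dir.exists() else "missing")
print(lib_dir, "exists" if lib_dir.exists() else "missing")
```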

cmake/external/pocketfft.cmake

Lines changed: 0 additions & 2 deletions
@@ -45,8 +45,6 @@ ExternalProject_Add(
   UPDATE_COMMAND ""
   CONFIGURE_COMMAND ""
   BUILD_COMMAND
-  COMMAND ${CMAKE_COMMAND} -E remove_directory ${POCKETFFT_SOURCE_DIR}
-  COMMAND ${CMAKE_COMMAND} -E make_directory ${POCKETFFT_SOURCE_DIR}
   COMMAND ${CMAKE_COMMAND} -E copy_directory ${SOURCE_DIR}
           ${POCKETFFT_SOURCE_DIR}
   INSTALL_COMMAND ""

cmake/generic.cmake

Lines changed: 4 additions & 0 deletions
@@ -726,6 +726,10 @@ function(nv_test TARGET_NAME)
     target_link_libraries(${TARGET_NAME} ${PYTHON_LIBRARIES})
   else()
     target_link_libraries(${TARGET_NAME} python)
+    if(WITH_SHARED_PHI)
+      target_link_libraries(${TARGET_NAME} -Wl,--as-needed phi_core phi_gpu
+                            -Wl,--no-as-needed)
+    endif()
   endif()
   add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main)
   common_link(${TARGET_NAME})

paddle/cinn/hlir/dialect/operator/transforms/fuse_parallel_matmul_pass.cc

Lines changed: 162 additions & 0 deletions
@@ -22,6 +22,7 @@
 #include "paddle/fluid/pir/dialect/operator/ir/op_type.h"
 #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h"
 #include "paddle/fluid/pir/transforms/sub_graph_detector.h"
+#include "paddle/fluid/pir/utils/general_functions.h"
 #include "paddle/pir/include/core/builtin_dialect.h"
 #include "paddle/pir/include/pass/pass.h"
 #include "paddle/pir/include/pattern_rewrite/frozen_rewrite_pattern_set.h"
@@ -183,6 +184,166 @@ class MergeParallelMatmulPattern
   }
 };

+class MergeParallelLinearPattern
+    : public pir::OpRewritePattern<paddle::dialect::FusedGemmEpilogueOp> {
+ public:
+  using pir::OpRewritePattern<
+      paddle::dialect::FusedGemmEpilogueOp>::OpRewritePattern;
+
+  bool MatchAndRewrite(paddle::dialect::FusedGemmEpilogueOp fused_gemm_op,
+                       pir::PatternRewriter& rewriter) const override {
+    auto ValidFusedGemmAttr = [](pir::Operation* op) -> bool {
+      if (!op->isa<paddle::dialect::FusedGemmEpilogueOp>()) {
+        return false;
+      }
+      bool trans_x =
+          op->attribute("trans_x").dyn_cast<pir::BoolAttribute>().data();
+      bool trans_y =
+          op->attribute("trans_y").dyn_cast<pir::BoolAttribute>().data();
+
+      // only support trans_x and trans_y are false
+      if (trans_x || trans_y) return false;
+
+      std::string activation =
+          op->attribute<pir::StrAttribute>("activation").AsString();
+      if (activation != "none") return false;
+      return true;
+    };
+    if (!ValidFusedGemmAttr(fused_gemm_op)) {
+      return false;
+    }
+
+    auto IsFirstInput = [&](pir::Operation* op, pir::Value in_x) -> bool {
+      return in_x == op->operand_source(0);
+    };
+
+    auto VectorPrefixEqual = [](const std::vector<std::int64_t>& a,
+                                const std::vector<std::int64_t>& b) {
+      return std::vector<std::int64_t>(a.begin(), a.end() - 1) ==
+             std::vector<std::int64_t>(b.begin(), b.end() - 1);
+    };
+
+    auto IsDynamicShape = [&](const std::vector<int64_t>& dims) {
+      return std::any_of(
+          dims.begin(), dims.end(), [](int64_t dim) { return dim < 0; });
+    };
+
+    auto input_x = fused_gemm_op.operand_source(0);
+    std::vector<pir::Operation*> merge_ops = [&]() {
+      std::vector<pir::Operation*> ret;
+      std::optional<std::vector<std::int64_t>> pre_w_dim;
+      std::optional<std::vector<std::int64_t>> pre_bias_dim;
+      std::vector<std::int64_t> cur_w_dim;
+      std::vector<std::int64_t> cur_bias_dim;
+      for (auto it = input_x.use_begin(); it != input_x.use_end(); ++it) {
+        if (!ValidFusedGemmAttr(it->owner())) {
+          continue;
+        }
+
+        if (!IsFirstInput(it->owner(), input_x)) {
+          continue;
+        }
+        if (!pre_w_dim.has_value()) {
+          pre_w_dim = pir::GetShapeFromValue(it->owner()->operand_source(1));
+        }
+        if (!pre_bias_dim.has_value()) {
+          pre_bias_dim = pir::GetShapeFromValue(it->owner()->operand_source(2));
+        }
+        cur_w_dim = pir::GetShapeFromValue(it->owner()->operand_source(1));
+        cur_bias_dim = pir::GetShapeFromValue(it->owner()->operand_source(2));
+
+        if (IsDynamicShape(cur_w_dim) || IsDynamicShape(cur_bias_dim)) {
+          continue;
+        }
+        if (VectorPrefixEqual(pre_w_dim.value(), cur_w_dim)) {
+          ret.push_back(it->owner());
+        }
+      }
+      return ret;
+    }();
+    if (merge_ops.size() <= 1) {
+      return false;
+    }
+    std::sort(
+        merge_ops.begin(),
+        merge_ops.end(),
+        [&](pir::Operation* a, pir::Operation* b) {
+          int a_distance = std::distance(a->GetParent()->begin(),
+                                         a->operator pir::Block::Iterator());
+          int b_distance = std::distance(b->GetParent()->begin(),
+                                         b->operator pir::Block::Iterator());
+          return a_distance < b_distance;
+        });
+
+    const auto [combine_w_ins, combine_bias_ins] = [&]() {
+      std::vector<pir::Value> weight_list, bias_list;
+      for (pir::Operation* op : merge_ops) {
+        weight_list.push_back(op->operand_source(1));
+        bias_list.push_back(op->operand_source(2));
+      }
+      return std::make_tuple(weight_list, bias_list);
+    }();
+
+    const std::vector<std::int64_t> combine_shapes = [&]() {
+      std::vector<std::int64_t> ret{0};
+      std::int64_t accumulate = 0;
+      for (pir::Value input : combine_w_ins) {
+        const auto& shape = pir::GetShapeFromValue(input);
+        accumulate += shape.back();
+        ret.push_back(accumulate);
+      }
+      return ret;
+    }();
+    const std::vector<pir::Value> outputs = [&]() {
+      std::vector<pir::Value> ret;
+      for (pir::Operation* fused_gemm_op : merge_ops) {
+        ret.push_back(fused_gemm_op->result(0));
+      }
+      return ret;
+    }();
+
+    auto* insert_point = FindInsertPoint(merge_ops, outputs);
+    MoveUpstreamOpBeforeGroup(
+        merge_ops, merge_ops.back()->GetParent(), insert_point);
+    rewriter.set_insertion_point(insert_point);
+
+    auto combine_w = rewriter.Build<pir::CombineOp>(combine_w_ins).result(0);
+    auto combine_bias =
+        rewriter.Build<pir::CombineOp>(combine_bias_ins).result(0);
+    auto concat_w =
+        rewriter.Build<paddle::dialect::ConcatOp>(combine_w, -1).result(0);
+    auto concat_b =
+        rewriter.Build<paddle::dialect::ConcatOp>(combine_bias, -1).result(0);
+    auto new_fused_gemm_out =
+        rewriter
+            .Build<paddle::dialect::FusedGemmEpilogueOp>(
+                input_x, concat_w, concat_b, fused_gemm_op.attributes())
+            .result(0);
+
+    const auto& out_rank = new_fused_gemm_out.type()
+                               .dyn_cast<paddle::dialect::DenseTensorType>()
+                               .dims()
+                               .size();
+
+    for (size_t i = 0; i < merge_ops.size(); ++i) {
+      auto split_out = rewriter
+                           .Build<paddle::dialect::SliceOp>(
+                               new_fused_gemm_out,
+                               std::vector<std::int64_t>{out_rank - 1},
+                               std::vector<std::int64_t>{combine_shapes[i]},
+                               std::vector<int64_t>{combine_shapes[i + 1]},
+                               std::vector<std::int64_t>{},
+                               std::vector<std::int64_t>{})
+                           .result(0);
+
+      rewriter.ReplaceAllUsesWith(merge_ops[i]->result(0), split_out);
+      rewriter.EraseOp(merge_ops[i]);
+    }
+
+    return true;
+  }
+};
+
 class FuseParallelMatmulPass : public pir::PatternRewritePass {
  public:
   FuseParallelMatmulPass()
@@ -191,6 +352,7 @@ class FuseParallelMatmulPass : public pir::PatternRewritePass {
   pir::RewritePatternSet InitializePatterns(pir::IrContext* context) override {
     pir::RewritePatternSet ps(context);
     ps.Add<MergeParallelMatmulPattern>(context);
+    ps.Add<MergeParallelLinearPattern>(context);
     return ps;
   }
 };
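The new `MergeParallelLinearPattern` rewrites several `fused_gemm_epilogue` (linear) ops that share the same input into a single GEMM over concatenated weights and biases, then slices the fused output back apart for the original consumers. The rewrite rests on a simple identity, sketched here in NumPy with arbitrary example shapes (this is only a numerical illustration of the transformation, not the pass itself):

```python
# NumPy sketch of the identity behind MergeParallelLinearPattern:
# x @ W1 + b1 and x @ W2 + b2 equal one GEMM against [W1 | W2] and [b1 | b2],
# with the individual outputs recovered by slicing the fused result.
import numpy as np

rng = np.random.default_rng(0)
x = rng.standard_normal((4, 16))                       # shared input
w1, b1 = rng.standard_normal((16, 8)), rng.standard_normal(8)
w2, b2 = rng.standard_normal((16, 24)), rng.standard_normal(24)

# Two "parallel" linears on the same input.
y1 = x @ w1 + b1
y2 = x @ w2 + b2

# One fused linear over concatenated weights/biases, then slice
# (what the pass emits as combine/concat -> fused_gemm_epilogue -> slice).
w = np.concatenate([w1, w2], axis=-1)
b = np.concatenate([b1, b2], axis=-1)
y = x @ w + b

assert np.allclose(y[:, :8], y1) and np.allclose(y[:, 8:], y2)
```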

paddle/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh

Lines changed: 11 additions & 2 deletions
@@ -100,6 +100,15 @@ __device__ inline float FN_FP32(rcp)(float x) {
   asm("rcp.approx.ftz.f32 %0, %1;" : "=f"(res) : "f"(x));
   return res;
 }
+__device__ inline float FN_FP32(tanh_approx)(float x) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 750
+  float res;
+  asm("tanh.approx.f32 %0, %1;" : "=f"(res) : "f"(x));
+  return res;
+#else
+  return tanh(x);
+#endif
+}

 // *************************************************************** //
 // float64 unary and binary operator
@@ -426,7 +435,7 @@ __device__ inline bfloat16 FN_BF16(erf)(bfloat16 x) { return bfloat16(FN_FP32(erf)(
 __device__ inline bfloat16 FN_BF16(tan)(bfloat16 x) { return bfloat16(FN_FP32(tan)(static_cast<float>(x))); }
 __device__ inline bfloat16 FN_BF16(sinh)(bfloat16 x) { return bfloat16(FN_FP32(sinh)(static_cast<float>(x))); }
 __device__ inline bfloat16 FN_BF16(cosh)(bfloat16 x) { return bfloat16(FN_FP32(cosh)(static_cast<float>(x))); }
-__device__ inline bfloat16 FN_BF16(tanh)(bfloat16 x) { return bfloat16(FN_FP32(tanh)(static_cast<float>(x))); }
+__device__ inline bfloat16 FN_BF16(tanh)(bfloat16 x) { return bfloat16(FN_FP32(tanh_approx)(static_cast<float>(x))); }
 __device__ inline bfloat16 FN_BF16(asin)(bfloat16 x) { return bfloat16(FN_FP32(asin)(static_cast<float>(x))); }
 __device__ inline bfloat16 FN_BF16(acos)(bfloat16 x) { return bfloat16(FN_FP32(acos)(static_cast<float>(x))); }
 __device__ inline bfloat16 FN_BF16(atan)(bfloat16 x) { return bfloat16(FN_FP32(atan)(static_cast<float>(x))); }
@@ -480,7 +489,7 @@ __device__ inline float16 FN_FP16(erf)(float16 x) { return float16(FN_FP32(erf)(
 __device__ inline float16 FN_FP16(tan)(float16 x) { return float16(FN_FP32(tan)(static_cast<float>(x))); }
 __device__ inline float16 FN_FP16(sinh)(float16 x) { return float16(FN_FP32(sinh)(static_cast<float>(x))); }
 __device__ inline float16 FN_FP16(cosh)(float16 x) { return float16(FN_FP32(cosh)(static_cast<float>(x))); }
-__device__ inline float16 FN_FP16(tanh)(float16 x) { return float16(FN_FP32(tanh)(static_cast<float>(x))); }
+__device__ inline float16 FN_FP16(tanh)(float16 x) { return float16(FN_FP32(tanh_approx)(static_cast<float>(x))); }
 __device__ inline float16 FN_FP16(asin)(float16 x) { return float16(FN_FP32(asin)(static_cast<float>(x))); }
 __device__ inline float16 FN_FP16(acos)(float16 x) { return float16(FN_FP32(acos)(static_cast<float>(x))); }
 __device__ inline float16 FN_FP16(atan)(float16 x) { return float16(FN_FP32(atan)(static_cast<float>(x))); }

paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py

Lines changed: 1 addition & 1 deletion
@@ -534,7 +534,7 @@ def GeneratePythonCFunction(self):
         )

         # Set prefix of forward_api_name to avoid conflicts
-        prefix = self.namespace.strip("::")
+        prefix = self.namespace.removeprefix("::").removesuffix("::")
         forward_api_name_prefix = "" if prefix == "" else prefix + "_"

         # Generate Python-C Function Registration

paddle/fluid/prim/api/auto_code_generated/static_gen.py

Lines changed: 0 additions & 1 deletion
@@ -20,7 +20,6 @@
 import jinja2
 import yaml

-# fmt: off
 # import from paddle/fluid/operators/generator
 sys.path.append(
     str(pathlib.Path(__file__).parents[3].joinpath('operators/generator'))

paddle/fluid/primitive/codegen/decomp_rule_gen.py

Lines changed: 4 additions & 4 deletions
@@ -20,11 +20,9 @@
 import jinja2
 import yaml

-# fmt: off
 # import from paddle/fluid/operators/generator
 sys.path.insert(
-    0,
-    str(pathlib.Path(__file__).resolve().parents[2] / 'operators/generator')
+    0, str(pathlib.Path(__file__).resolve().parents[2] / 'operators/generator')
 )
 import filters as op_gen_filters
 import tests_utils as op_gen_tests
@@ -35,7 +33,9 @@
 # import from paddle/fluid/pir/dialect/op_generator/api_gen.py
 sys.path.insert(
     0,
-    str(pathlib.Path(__file__).resolve().parents[2] / 'pir/dialect/op_generator')
+    str(
+        pathlib.Path(__file__).resolve().parents[2] / 'pir/dialect/op_generator'
+    ),
 )

 from decomp_interface_gen_op_list import (
