
Commit a94bd18

Merge branch 'PaddlePaddle:develop' into develop
2 parents: ac57ded + b8455e0

203 files changed: +4019 additions, -3226 deletions


.github/workflows/_Api-Benchmark.yml

Lines changed: 3 additions & 0 deletions
@@ -102,6 +102,8 @@ jobs:
       tar -zvxf PaddleTest.tar.gz 1>/dev/null 2>&1
       # git submodule foreach "git config --global --add safe.directory \$toplevel/\$sm_path"
       source ${{ github.workspace }}/../../../proxy
+      mkdir -p ${{ github.workspace }}/../../../pip
+      ${python} -m pip config set global.cache-dir ${{ github.workspace }}/../../../pip
       ${python} -m pip install -r ./PaddleTest/framework/e2e/api_benchmark_new/requirement.txt
       '
@@ -113,6 +115,7 @@ jobs:
       export LD_LIBRARY_PATH=/usr/local/cuda-11.8/compat:/usr/local/cuda/compat:/usr/local/cuda/lib:/usr/local/cuda/lib64:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}
       cd ./PaddleTest/framework/e2e/api_benchmark_new
       cp /paddle/PTSTools/Uploader/apibm_config.yml .
+      source ${{ github.workspace }}/../../../proxy
       ${python} -m pip install https://paddle-github-action.bj.bcebos.com/PR/build/${PR_ID}/${COMMIT_ID}/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
       if [ ${core_index} -eq -1 ];then
       ${python} runner_ci_action.py --yaml ../yaml/api_benchmark_fp32.yml --core_index 2
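The two added `pip` lines give the benchmark job a persistent download cache outside the checkout, so repeated runs can reuse previously downloaded wheels. As a hedged illustration (the cache path below is a made-up stand-in, not the runner's real `${{ github.workspace }}/../../../pip` location), the same option can be set and read back through pip's config interface:

```python
# Illustrative sketch: mirrors the workflow's `pip config set global.cache-dir <dir>`
# step and then reads the option back to confirm what pip will use.
import subprocess
import sys

cache_dir = "/tmp/ci-pip-cache"  # hypothetical stand-in for the workflow's cache directory

subprocess.run(
    [sys.executable, "-m", "pip", "config", "set", "global.cache-dir", cache_dir],
    check=True,
)
result = subprocess.run(
    [sys.executable, "-m", "pip", "config", "get", "global.cache-dir"],
    capture_output=True, text=True, check=True,
)
print(result.stdout.strip())  # expected to echo the configured directory
```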

.github/workflows/_Slice.yml

Lines changed: 1 addition & 1 deletion
@@ -72,8 +72,8 @@ jobs:
       mkdir -p ${{ github.workspace }}/../../../.cache/pip
       source ${{ github.workspace }}/../../../proxy
       python3.10 -m pip config set global.cache-dir ${{ github.workspace }}/../../../.cache/pip
-      python3.10 -m pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu118
       python3.10 -m pip install $wheel_link
+      python3.10 -m pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu118
       python3.10 test_slice_float32.py
       '

.pre-commit-config.yaml

Lines changed: 2 additions & 2 deletions
@@ -56,11 +56,11 @@ repos:
       args: [--force-exclude]
   # For Python files
   - repo: https://github.com/psf/black-pre-commit-mirror
-    rev: 24.8.0
+    rev: 25.1.0
     hooks:
       - id: black
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.11.0
+    rev: v0.11.11
     hooks:
       - id: ruff
         args: [--fix, --exit-non-zero-on-fix, --no-cache]

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -66,7 +66,7 @@ option(WITH_SETUP_INSTALL "Compile PaddlePaddle with setup.py" OFF)
 option(WITH_SHARED_PHI "Compile PaddlePaddle with SHARED LIB of PHI" ON)
 option(CINN_WITH_CUDNN "Compile CINN with CUDNN support" ON)
 option(WITH_PIP_CUDA_LIBRARIES
-       "Paddle uses the CUDA library provided by NVIDIA" OFF)
+       "Paddle uses the CUDA library provided by NVIDIA" ON)
 option(WITH_PIP_TENSORRT "Paddle uses the tensorrt provided by NVIDIA" OFF)
 option(WITH_NIGHTLY_BUILD
        "Compile nightly paddle whl package of the develop branch" OFF)

ci/coverage_diff.py

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@ def get_diff_file_lines(diff_file):
         line = line.strip()

         if line.startswith('+++ '):
-            current_file = line.lstrip('+++ ')
+            current_file = line.removeprefix('+++ ')

             diff_file_lines[current_file] = []
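The switch from `lstrip` to `removeprefix` is a correctness fix, not a style change: `str.lstrip('+++ ')` strips any leading run of the characters `+` and space, whereas `str.removeprefix('+++ ')` (Python 3.9+) removes exactly that literal prefix. A small illustration with a hypothetical file whose name happens to start with `+`:

```python
# lstrip() strips a *character set*; removeprefix() strips a literal prefix.
header = "+++ +plus_variant.py"  # hypothetical diff header, file name starts with '+'

print(header.lstrip('+++ '))       # 'plus_variant.py'  -- keeps eating '+' and ' ' characters
print(header.removeprefix('+++ ')) # '+plus_variant.py' -- removes only the '+++ ' prefix
```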

ci/coverage_gcda_clean.py

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" usage: gcda_clean.py pull_id. """
+"""usage: gcda_clean.py pull_id."""

 import os
 import sys

ci/slice/test_slice_float32.py

Lines changed: 2 additions & 2 deletions
@@ -697,9 +697,9 @@ def main():
         S,
         ", B = ",
         B,
-        ", score <2 : ",
+        ", score <=2 : ",
         B2,
-        ", score > 3 : ",
+        ", score >2 : ",
         Bother,
     )

cmake/cudnn.cmake

Lines changed: 5 additions & 0 deletions
@@ -14,6 +14,7 @@ find_path(
   CUDNN_INCLUDE_DIR cudnn.h
   PATHS ${CUDNN_ROOT} ${CUDNN_ROOT}/include $ENV{CUDNN_ROOT}
         $ENV{CUDNN_ROOT}/include ${CUDA_TOOLKIT_INCLUDE}
+        /usr/local/lib/python${PY_VERSION}/dist-packages/nvidia/cudnn/include/
   NO_DEFAULT_PATH)

 get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH)
@@ -43,6 +44,9 @@ set(CUDNN_LIB_NAME "")

 if(LINUX)
   set(CUDNN_LIB_NAME "libcudnn.so")
+  if(${CUDA_VERSION} GREATER_EQUAL 12.6)
+    set(CUDNN_LIB_NAME "libcudnn.so.9")
+  endif()
 endif()

 if(WIN32)
@@ -58,6 +62,7 @@ find_library(
   CUDNN_LIBRARY
   NAMES ${CUDNN_LIB_NAME} # libcudnn_static.a
   PATHS ${CUDNN_CHECK_LIBRARY_DIRS} ${CUDNN_INCLUDE_DIR} ${__libpath_hist}
+        /usr/local/lib/python${PY_VERSION}/dist-packages/nvidia/cudnn/lib/
   NO_DEFAULT_PATH
   DOC "Path to cuDNN library.")
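Both new hint paths point at the directory layout used by the pip-distributed NVIDIA cuDNN wheels. A small Python sketch of the same path construction, useful only as a local sanity check; the `/usr/local` prefix and `dist-packages` layout are assumptions carried over from the CMake lines above, not something this change guarantees on every system:

```python
# Mirrors the CMake hint paths
#   /usr/local/lib/python${PY_VERSION}/dist-packages/nvidia/cudnn/{include,lib}
# to check whether a pip-installed cuDNN is present on this machine.
import sys
from pathlib import Path

def pip_cudnn_dirs(prefix: str = "/usr/local") -> tuple[Path, Path]:
    py_version = f"{sys.version_info.major}.{sys.version_info.minor}"
    base = Path(prefix, "lib", f"python{py_version}", "dist-packages", "nvidia", "cudnn")
    return base / "include", base / "lib"

include_dir, lib_dir = pip_cudnn_dirs()
print(include_dir, "exists" if include_dir.exists() else "missing")
print(lib_dir, "exists" if lib_dir.exists() else "missing")
```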

cmake/external/pocketfft.cmake

Lines changed: 0 additions & 2 deletions
@@ -45,8 +45,6 @@ ExternalProject_Add(
   UPDATE_COMMAND ""
   CONFIGURE_COMMAND ""
   BUILD_COMMAND
-  COMMAND ${CMAKE_COMMAND} -E remove_directory ${POCKETFFT_SOURCE_DIR}
-  COMMAND ${CMAKE_COMMAND} -E make_directory ${POCKETFFT_SOURCE_DIR}
   COMMAND ${CMAKE_COMMAND} -E copy_directory ${SOURCE_DIR}
           ${POCKETFFT_SOURCE_DIR}
   INSTALL_COMMAND ""

cmake/generic.cmake

Lines changed: 4 additions & 0 deletions
@@ -726,6 +726,10 @@ function(nv_test TARGET_NAME)
     target_link_libraries(${TARGET_NAME} ${PYTHON_LIBRARIES})
   else()
     target_link_libraries(${TARGET_NAME} python)
+    if(WITH_SHARED_PHI)
+      target_link_libraries(${TARGET_NAME} -Wl,--as-needed phi_core phi_gpu
+                            -Wl,--no-as-needed)
+    endif()
   endif()
   add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main)
   common_link(${TARGET_NAME})

paddle/cinn/hlir/dialect/operator/transforms/fuse_parallel_matmul_pass.cc

Lines changed: 162 additions & 0 deletions
@@ -22,6 +22,7 @@
 #include "paddle/fluid/pir/dialect/operator/ir/op_type.h"
 #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h"
 #include "paddle/fluid/pir/transforms/sub_graph_detector.h"
+#include "paddle/fluid/pir/utils/general_functions.h"
 #include "paddle/pir/include/core/builtin_dialect.h"
 #include "paddle/pir/include/pass/pass.h"
 #include "paddle/pir/include/pattern_rewrite/frozen_rewrite_pattern_set.h"
@@ -183,6 +184,166 @@ class MergeParallelMatmulPattern
   }
 };

+class MergeParallelLinearPattern
+    : public pir::OpRewritePattern<paddle::dialect::FusedGemmEpilogueOp> {
+ public:
+  using pir::OpRewritePattern<
+      paddle::dialect::FusedGemmEpilogueOp>::OpRewritePattern;
+
+  bool MatchAndRewrite(paddle::dialect::FusedGemmEpilogueOp fused_gemm_op,
+                       pir::PatternRewriter& rewriter) const override {
+    auto ValidFusedGemmAttr = [](pir::Operation* op) -> bool {
+      if (!op->isa<paddle::dialect::FusedGemmEpilogueOp>()) {
+        return false;
+      }
+      bool trans_x =
+          op->attribute("trans_x").dyn_cast<pir::BoolAttribute>().data();
+      bool trans_y =
+          op->attribute("trans_y").dyn_cast<pir::BoolAttribute>().data();
+
+      // only support trans_x and trans_y are false
+      if (trans_x || trans_y) return false;
+
+      std::string activation =
+          op->attribute<pir::StrAttribute>("activation").AsString();
+      if (activation != "none") return false;
+      return true;
+    };
+    if (!ValidFusedGemmAttr(fused_gemm_op)) {
+      return false;
+    }
+
+    auto IsFirstInput = [&](pir::Operation* op, pir::Value in_x) -> bool {
+      return in_x == op->operand_source(0);
+    };
+
+    auto VectorPrefixEqual = [](const std::vector<std::int64_t>& a,
+                                const std::vector<std::int64_t>& b) {
+      return std::vector<std::int64_t>(a.begin(), a.end() - 1) ==
+             std::vector<std::int64_t>(b.begin(), b.end() - 1);
+    };
+
+    auto IsDynamicShape = [&](const std::vector<int64_t>& dims) {
+      return std::any_of(
+          dims.begin(), dims.end(), [](int64_t dim) { return dim < 0; });
+    };
+
+    auto input_x = fused_gemm_op.operand_source(0);
+    std::vector<pir::Operation*> merge_ops = [&]() {
+      std::vector<pir::Operation*> ret;
+      std::optional<std::vector<std::int64_t>> pre_w_dim;
+      std::optional<std::vector<std::int64_t>> pre_bias_dim;
+      std::vector<std::int64_t> cur_w_dim;
+      std::vector<std::int64_t> cur_bias_dim;
+      for (auto it = input_x.use_begin(); it != input_x.use_end(); ++it) {
+        if (!ValidFusedGemmAttr(it->owner())) {
+          continue;
+        }
+
+        if (!IsFirstInput(it->owner(), input_x)) {
+          continue;
+        }
+        if (!pre_w_dim.has_value()) {
+          pre_w_dim = pir::GetShapeFromValue(it->owner()->operand_source(1));
+        }
+        if (!pre_bias_dim.has_value()) {
+          pre_bias_dim = pir::GetShapeFromValue(it->owner()->operand_source(2));
+        }
+        cur_w_dim = pir::GetShapeFromValue(it->owner()->operand_source(1));
+        cur_bias_dim = pir::GetShapeFromValue(it->owner()->operand_source(2));
+
+        if (IsDynamicShape(cur_w_dim) || IsDynamicShape(cur_bias_dim)) {
+          continue;
+        }
+        if (VectorPrefixEqual(pre_w_dim.value(), cur_w_dim)) {
+          ret.push_back(it->owner());
+        }
+      }
+      return ret;
+    }();
+    if (merge_ops.size() <= 1) {
+      return false;
+    }
+    std::sort(
+        merge_ops.begin(),
+        merge_ops.end(),
+        [&](pir::Operation* a, pir::Operation* b) {
+          int a_distance = std::distance(a->GetParent()->begin(),
+                                         a->operator pir::Block::Iterator());
+          int b_distance = std::distance(b->GetParent()->begin(),
+                                         b->operator pir::Block::Iterator());
+          return a_distance < b_distance;
+        });
+
+    const auto [combine_w_ins, combine_bias_ins] = [&]() {
+      std::vector<pir::Value> weight_list, bias_list;
+      for (pir::Operation* op : merge_ops) {
+        weight_list.push_back(op->operand_source(1));
+        bias_list.push_back(op->operand_source(2));
+      }
+      return std::make_tuple(weight_list, bias_list);
+    }();
+
+    const std::vector<std::int64_t> combine_shapes = [&]() {
+      std::vector<std::int64_t> ret{0};
+      std::int64_t accumulate = 0;
+      for (pir::Value input : combine_w_ins) {
+        const auto& shape = pir::GetShapeFromValue(input);
+        accumulate += shape.back();
+        ret.push_back(accumulate);
+      }
+      return ret;
+    }();
+    const std::vector<pir::Value> outputs = [&]() {
+      std::vector<pir::Value> ret;
+      for (pir::Operation* fused_gemm_op : merge_ops) {
+        ret.push_back(fused_gemm_op->result(0));
+      }
+      return ret;
+    }();
+
+    auto* insert_point = FindInsertPoint(merge_ops, outputs);
+    MoveUpstreamOpBeforeGroup(
+        merge_ops, merge_ops.back()->GetParent(), insert_point);
+    rewriter.set_insertion_point(insert_point);
+
+    auto combine_w = rewriter.Build<pir::CombineOp>(combine_w_ins).result(0);
+    auto combine_bias =
+        rewriter.Build<pir::CombineOp>(combine_bias_ins).result(0);
+    auto concat_w =
+        rewriter.Build<paddle::dialect::ConcatOp>(combine_w, -1).result(0);
+    auto concat_b =
+        rewriter.Build<paddle::dialect::ConcatOp>(combine_bias, -1).result(0);
+    auto new_fused_gemm_out =
+        rewriter
+            .Build<paddle::dialect::FusedGemmEpilogueOp>(
+                input_x, concat_w, concat_b, fused_gemm_op.attributes())
+            .result(0);
+
+    const auto& out_rank = new_fused_gemm_out.type()
+                               .dyn_cast<paddle::dialect::DenseTensorType>()
+                               .dims()
+                               .size();
+
+    for (size_t i = 0; i < merge_ops.size(); ++i) {
+      auto split_out = rewriter
+                           .Build<paddle::dialect::SliceOp>(
+                               new_fused_gemm_out,
+                               std::vector<std::int64_t>{out_rank - 1},
+                               std::vector<std::int64_t>{combine_shapes[i]},
+                               std::vector<int64_t>{combine_shapes[i + 1]},
+                               std::vector<std::int64_t>{},
+                               std::vector<std::int64_t>{})
+                           .result(0);
+
+      rewriter.ReplaceAllUsesWith(merge_ops[i]->result(0), split_out);
+      rewriter.EraseOp(merge_ops[i]);
+    }
+
+    return true;
+  }
+};
+
 class FuseParallelMatmulPass : public pir::PatternRewritePass {
  public:
   FuseParallelMatmulPass()
@@ -191,6 +352,7 @@ class FuseParallelMatmulPass : public pir::PatternRewritePass {
   pir::RewritePatternSet InitializePatterns(pir::IrContext* context) override {
     pir::RewritePatternSet ps(context);
     ps.Add<MergeParallelMatmulPattern>(context);
+    ps.Add<MergeParallelLinearPattern>(context);
     return ps;
   }
 };
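The new `MergeParallelLinearPattern` rewrites several `fused_gemm_epilogue` (linear) ops that share the same input into a single GEMM over concatenated weights and biases, then slices the fused output back apart for the original consumers. The rewrite rests on a simple identity, sketched here in NumPy with arbitrary example shapes (this is only a numerical illustration of the transformation, not the pass itself):

```python
# NumPy sketch of the identity behind MergeParallelLinearPattern:
# x @ W1 + b1 and x @ W2 + b2 equal one GEMM against [W1 | W2] and [b1 | b2],
# with the individual outputs recovered by slicing the fused result.
import numpy as np

rng = np.random.default_rng(0)
x = rng.standard_normal((4, 16))                       # shared input
w1, b1 = rng.standard_normal((16, 8)), rng.standard_normal(8)
w2, b2 = rng.standard_normal((16, 24)), rng.standard_normal(24)

# Two "parallel" linears on the same input.
y1 = x @ w1 + b1
y2 = x @ w2 + b2

# One fused linear over concatenated weights/biases, then slice
# (what the pass emits as combine/concat -> fused_gemm_epilogue -> slice).
w = np.concatenate([w1, w2], axis=-1)
b = np.concatenate([b1, b2], axis=-1)
y = x @ w + b

assert np.allclose(y[:, :8], y1) and np.allclose(y[:, 8:], y2)
```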

paddle/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh

Lines changed: 11 additions & 2 deletions
@@ -100,6 +100,15 @@ __device__ inline float FN_FP32(rcp)(float x) {
   asm("rcp.approx.ftz.f32 %0, %1;" : "=f"(res) : "f"(x));
   return res;
 }
+__device__ inline float FN_FP32(tanh_approx)(float x) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 750
+  float res;
+  asm("tanh.approx.f32 %0, %1;" : "=f"(res) : "f"(x));
+  return res;
+#else
+  return tanh(x);
+#endif
+}

 // *************************************************************** //
 // float64 unary and binary operator
@@ -426,7 +435,7 @@ __device__ inline bfloat16 FN_BF16(erf)(bfloat16 x) { return bfloat16(FN_FP32(erf)(
 __device__ inline bfloat16 FN_BF16(tan)(bfloat16 x) { return bfloat16(FN_FP32(tan)(static_cast<float>(x))); }
 __device__ inline bfloat16 FN_BF16(sinh)(bfloat16 x) { return bfloat16(FN_FP32(sinh)(static_cast<float>(x))); }
 __device__ inline bfloat16 FN_BF16(cosh)(bfloat16 x) { return bfloat16(FN_FP32(cosh)(static_cast<float>(x))); }
-__device__ inline bfloat16 FN_BF16(tanh)(bfloat16 x) { return bfloat16(FN_FP32(tanh)(static_cast<float>(x))); }
+__device__ inline bfloat16 FN_BF16(tanh)(bfloat16 x) { return bfloat16(FN_FP32(tanh_approx)(static_cast<float>(x))); }
 __device__ inline bfloat16 FN_BF16(asin)(bfloat16 x) { return bfloat16(FN_FP32(asin)(static_cast<float>(x))); }
 __device__ inline bfloat16 FN_BF16(acos)(bfloat16 x) { return bfloat16(FN_FP32(acos)(static_cast<float>(x))); }
 __device__ inline bfloat16 FN_BF16(atan)(bfloat16 x) { return bfloat16(FN_FP32(atan)(static_cast<float>(x))); }
@@ -480,7 +489,7 @@ __device__ inline float16 FN_FP16(erf)(float16 x) { return float16(FN_FP32(erf)(
 __device__ inline float16 FN_FP16(tan)(float16 x) { return float16(FN_FP32(tan)(static_cast<float>(x))); }
 __device__ inline float16 FN_FP16(sinh)(float16 x) { return float16(FN_FP32(sinh)(static_cast<float>(x))); }
 __device__ inline float16 FN_FP16(cosh)(float16 x) { return float16(FN_FP32(cosh)(static_cast<float>(x))); }
-__device__ inline float16 FN_FP16(tanh)(float16 x) { return float16(FN_FP32(tanh)(static_cast<float>(x))); }
+__device__ inline float16 FN_FP16(tanh)(float16 x) { return float16(FN_FP32(tanh_approx)(static_cast<float>(x))); }
 __device__ inline float16 FN_FP16(asin)(float16 x) { return float16(FN_FP32(asin)(static_cast<float>(x))); }
 __device__ inline float16 FN_FP16(acos)(float16 x) { return float16(FN_FP32(acos)(static_cast<float>(x))); }
 __device__ inline float16 FN_FP16(atan)(float16 x) { return float16(FN_FP32(atan)(static_cast<float>(x))); }

paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py

Lines changed: 1 addition & 1 deletion
@@ -534,7 +534,7 @@ def GeneratePythonCFunction(self):
         )

         # Set prefix of forward_api_name to avoid conflicts
-        prefix = self.namespace.strip("::")
+        prefix = self.namespace.removeprefix("::").removesuffix("::")
         forward_api_name_prefix = "" if prefix == "" else prefix + "_"

         # Generate Python-C Function Registration

paddle/fluid/prim/api/auto_code_generated/static_gen.py

Lines changed: 0 additions & 1 deletion
@@ -20,7 +20,6 @@
 import jinja2
 import yaml

-# fmt: off
 # import from paddle/fluid/operators/generator
 sys.path.append(
     str(pathlib.Path(__file__).parents[3].joinpath('operators/generator'))

paddle/fluid/primitive/codegen/decomp_rule_gen.py

Lines changed: 4 additions & 4 deletions
@@ -20,11 +20,9 @@
 import jinja2
 import yaml

-# fmt: off
 # import from paddle/fluid/operators/generator
 sys.path.insert(
-    0,
-    str(pathlib.Path(__file__).resolve().parents[2] / 'operators/generator')
+    0, str(pathlib.Path(__file__).resolve().parents[2] / 'operators/generator')
 )
 import filters as op_gen_filters
 import tests_utils as op_gen_tests
@@ -35,7 +33,9 @@
 # import from paddle/fluid/pir/dialect/op_generator/api_gen.py
 sys.path.insert(
     0,
-    str(pathlib.Path(__file__).resolve().parents[2] / 'pir/dialect/op_generator')
+    str(
+        pathlib.Path(__file__).resolve().parents[2] / 'pir/dialect/op_generator'
+    ),
 )

 from decomp_interface_gen_op_list import (
