Skip to content

Commit 215d621

Browse files
authored
Merge branch 'PaddlePaddle:develop' into develop
2 parents 01b2f6e + feebd79 commit 215d621

File tree

374 files changed

+20068
-6163
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

374 files changed

+20068
-6163
lines changed

ci/run_sot_test.sh

+2-2
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,10 @@ function run_sot_test() {
1818
PY_VERSION_NO_DOT=$(echo $PY_VERSION | sed 's/\.//g')
1919

2020
export STRICT_MODE=1
21-
export COST_MODEL=False
2221
export MIN_GRAPH_SIZE=0
2322
export SOT_LOG_LEVEL=0
2423
export FLAGS_cudnn_deterministic=True
24+
export SOT_ENABLE_STRICT_GUARD_CHECK=True
2525

2626
# Install PaddlePaddle
2727
echo "::group::Installing paddle wheel..."
@@ -54,7 +54,7 @@ function run_sot_test() {
5454
echo "skip ${PY_VERSION_NO_DOT} ${file}"
5555
continue
5656
fi
57-
echo Running:" STRICT_MODE=1 COST_MODEL=False MIN_GRAPH_SIZE=0 SOT_LOG_LEVEL=0 FLAGS_cudnn_deterministic=True python " $file
57+
echo Running:" STRICT_MODE=1 MIN_GRAPH_SIZE=0 SOT_LOG_LEVEL=0 FLAGS_cudnn_deterministic=True SOT_ENABLE_STRICT_GUARD_CHECK=True python " $file
5858
# run unittests
5959
python_output=$($PYTHON_WITH_SPECIFY_VERSION $file 2>&1)
6060

cmake/external/nvshmem.cmake

+106
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Downloads, patches and builds NVSHMEM 3.1.7 as an ExternalProject, then
# exposes it to the rest of the build as the imported static target `nvshmem`.

include(ExternalProject)

# GDRCopy install prefix, taken from the environment by default. NVSHMEM's
# GDRCopy/IBGDA support (enabled below) needs it at configure time.
set(GDRCOPY_HOME
    $ENV{GDRCOPY_HOME}
    CACHE PATH "Path to GDRCOPY installation")
if(GDRCOPY_HOME)
  message(STATUS "GDRCOPY_HOME: ${GDRCOPY_HOME}")
else()
  message(
    WARNING
      "Set the GDRCOPY_HOME environment variable or CMake option to specify the GDRCOPY install path."
  )
endif()

set(NVSHMEM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/nvshmem)
set(NVSHMEM_PREFIX_DIR ${THIRD_PARTY_PATH}/nvshmem)
set(NVSHMEM_SOURCE_DIR ${NVSHMEM_PREFIX_DIR}/src/extern_nvshmem)
message(STATUS "NVSHMEM_INSTALL_DIR: ${NVSHMEM_INSTALL_DIR}")

set(NVSHMEM_INCLUDE_DIR
    "${NVSHMEM_INSTALL_DIR}/include"
    CACHE PATH "nvshmem include directory." FORCE)

# Directory-scoped include kept for compatibility with existing targets; the
# imported `nvshmem` target below also carries this path as a usage
# requirement so modern target-based consumers work without it.
include_directories(${NVSHMEM_INCLUDE_DIR})

# Keep the archive name in one place so a version bump only touches here.
set(NVSHMEM_TAR_NAME "nvshmem_src_3.1.7-1.txz")

# Prefer a pre-downloaded tarball (NVSHMEM_SRC_TAR_PATH) over fetching from
# NVIDIA's server — useful for offline/CI builds.
# NOTE(review): neither path verifies a checksum of the archive — consider
# adding one.
if(NVSHMEM_SRC_TAR_PATH)
  set(NVSHMEM_DOWNLOAD_COMMAND
      rm -rf extern_nvshmem ${NVSHMEM_TAR_NAME} && cp ${NVSHMEM_SRC_TAR_PATH}
      . && tar xf ${NVSHMEM_TAR_NAME} && mv nvshmem_src extern_nvshmem)
else()
  set(NVSHMEM_URL
      "https://developer.download.nvidia.com/compute/redist/nvshmem/3.1.7/source/nvshmem_src_3.1.7-1.txz"
      CACHE STRING "" FORCE)
  set(NVSHMEM_DOWNLOAD_COMMAND
      rm -rf extern_nvshmem ${NVSHMEM_TAR_NAME} && wget --no-check-certificate
      -q ${NVSHMEM_URL} && tar xf ${NVSHMEM_TAR_NAME} && mv nvshmem_src
      extern_nvshmem)
endif()

# The unpacked source is turned into a throwaway git repo so Paddle's patch
# can be applied with `git apply` (which handles renames/mode bits).
set(NVSHMEM_PATCH_PATH ${PADDLE_SOURCE_DIR}/third_party/nvshmem.patch)
set(NVSHMEM_PATCH_COMMAND
    git init && git config user.name "PaddlePaddle" && git config user.email
    "paddle@baidu.com" && git config --add safe.directory . && git add . && git
    commit -m "init" && git apply ${NVSHMEM_PATCH_PATH})

# Artifacts produced by the NVSHMEM build: the main static library plus the
# bootstrap/transport plugins loaded at runtime.
set(NVSHMEM_LIB ${NVSHMEM_INSTALL_DIR}/lib/libnvshmem.a)
set(NVSHMEM_BOOTSTRAP_UID_LIB
    ${NVSHMEM_INSTALL_DIR}/lib/nvshmem_bootstrap_uid.so)
set(NVSHMEM_BOOTSTRAP_MPI_LIB
    ${NVSHMEM_INSTALL_DIR}/lib/nvshmem_bootstrap_mpi.so)
set(NVSHMEM_BOOTSTRAP_PMI_LIB
    ${NVSHMEM_INSTALL_DIR}/lib/nvshmem_bootstrap_pmi.so)
set(NVSHMEM_BOOTSTRAP_PMI2_LIB
    ${NVSHMEM_INSTALL_DIR}/lib/nvshmem_bootstrap_pmi2.so)
set(NVSHMEM_TRANSPORT_IBRC_LIB
    ${NVSHMEM_INSTALL_DIR}/lib/nvshmem_transport_ibrc.so.3)
set(NVSHMEM_TRANSPORT_IBGDA_LIB
    ${NVSHMEM_INSTALL_DIR}/lib/nvshmem_transport_ibgda.so.3)

# only compile nvshmem for sm90 (Hopper)
set(CUDA_ARCHITECTURES "90")

ExternalProject_Add(
  extern_nvshmem
  ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
  PREFIX ${NVSHMEM_PREFIX_DIR}
  SOURCE_DIR ${NVSHMEM_SOURCE_DIR}
  DOWNLOAD_DIR ${NVSHMEM_PREFIX_DIR}/src
  DOWNLOAD_COMMAND ${NVSHMEM_DOWNLOAD_COMMAND}
  PATCH_COMMAND ${NVSHMEM_PATCH_COMMAND}
  UPDATE_COMMAND ""
  CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${NVSHMEM_INSTALL_DIR}
             -DGDRCOPY_HOME:PATH=${GDRCOPY_HOME}
             -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCHITECTURES}
             -DNVSHMEM_ENABLE_ALL_DEVICE_INLINING=0
             -DNVSHMEM_SHMEM_SUPPORT=0
             -DNVSHMEM_UCX_SUPPORT=0
             -DNVSHMEM_USE_NCCL=0
             -DNVSHMEM_IBGDA_SUPPORT=1
             -DNVSHMEM_PMIX_SUPPORT=0
             -DNVSHMEM_TIMEOUT_DEVICE_POLLING=0
             -DNVSHMEM_USE_GDRCOPY=1
             -DNVSHMEM_IBRC_SUPPORT=1
             -DNVSHMEM_BUILD_TESTS=0
  CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${NVSHMEM_INSTALL_DIR}
  BUILD_BYPRODUCTS ${NVSHMEM_LIB})

add_definitions(-DPADDLE_WITH_NVSHMEM)
add_library(nvshmem STATIC IMPORTED GLOBAL)
# Attach the include dir as a usage requirement so targets that link
# `nvshmem` get it transitively, instead of relying on the directory-scoped
# include_directories() above.
set_target_properties(
  nvshmem PROPERTIES IMPORTED_LOCATION ${NVSHMEM_LIB}
                     INTERFACE_INCLUDE_DIRECTORIES ${NVSHMEM_INCLUDE_DIR})
add_dependencies(nvshmem extern_nvshmem)

cmake/external/xpu.cmake

+2-2
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,9 @@ set(XPU_XFA_LIB_NAME "libxpu_flash_attention.so")
3030
set(XPU_XPUDNN_LIB_NAME "libxpu_dnn.so")
3131

3232
if(NOT DEFINED XPU_XHPC_BASE_DATE)
33-
set(XPU_XHPC_BASE_DATE "dev/20250304")
33+
set(XPU_XHPC_BASE_DATE "dev/20250306")
3434
endif()
35-
set(XPU_XCCL_BASE_VERSION "3.0.2.3") # For XRE5
35+
set(XPU_XCCL_BASE_VERSION "3.0.2.5") # For XRE5
3636
if(NOT DEFINED XPU_XFT_BASE_VERSION)
3737
set(XPU_XFT_BASE_VERSION "20230602")
3838
endif()

cmake/third_party.cmake

+11
Original file line numberDiff line numberDiff line change
@@ -693,4 +693,15 @@ if(WITH_OPENVINO)
693693
list(APPEND third_party_deps extern_openvino)
694694
endif()
695695

696+
string(FIND "${CUDA_ARCH_BIN}" "90" ARCH_BIN_CONTAINS_90)
697+
if(NOT WITH_GPU
698+
OR NOT WITH_DISTRIBUTE
699+
OR (ARCH_BIN_CONTAINS_90 EQUAL -1))
700+
set(WITH_NVSHMEM OFF)
701+
endif()
702+
if(WITH_NVSHMEM)
703+
include(external/nvshmem)
704+
list(APPEND third_party_deps extern_nvshmem)
705+
endif()
706+
696707
add_custom_target(third_party ALL DEPENDS ${third_party_deps})

paddle/cinn/backends/codegen_cuda_dev.cc

+29-5
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,10 @@
1616

1717
namespace cinn {
1818
namespace backends {
19-
20-
const std::string CodeGenCudaDev::source_header_ = // NOLINT
21-
R"(#include <cstdint>
22-
19+
const std::string CodeGenCudaDev::general_source_header_ = // NOLINT
20+
R"(
21+
#pragma once
22+
#include <cstdint>
2323
#define CINN_WITH_CUDA
2424
#include "bfloat16.h"
2525
#include "float16.h"
@@ -34,11 +34,35 @@ using cinn::common::float162;
3434
using cinn::common::bfloat168;
3535
using cinn::common::bfloat164;
3636
using cinn::common::bfloat162;
37-
3837
#include "cinn_cuda_runtime_source.cuh"
38+
)";
39+
const std::string CodeGenCudaDev::source_header_ = // NOLINT
40+
R"(
41+
#pragma once
42+
#include <cinn_with_cuda_h>
43+
44+
#include <bfloat16_h>
45+
#include <cstdint>
46+
#include <float16_h>
47+
using cinn::common::bfloat16;
48+
using cinn::common::float16;
49+
using cinn::common::float8;
50+
using cinn::common::half4;
51+
using cinn::common::half8;
52+
using cinn::common::float168;
53+
using cinn::common::float164;
54+
using cinn::common::float162;
55+
using cinn::common::bfloat168;
56+
using cinn::common::bfloat164;
57+
using cinn::common::bfloat162;
58+
#include <cinn_cuda_runtime_source_h>
59+
3960
)";
4061

4162
const std::string &CodeGenCudaDev::GetSourceHeader() { return source_header_; }
63+
const std::string &CodeGenCudaDev::GetGeneralSourceHeader() {
64+
return general_source_header_;
65+
}
4266

4367
CodeGenCudaDev::CodeGenCudaDev(Target target) : CodeGenGpuDev(target) {}
4468

paddle/cinn/backends/codegen_cuda_dev.h

+5
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,15 @@ class CodeGenCudaDev : public CodeGenGpuDev {
3131
public:
3232
explicit CodeGenCudaDev(Target target);
3333
static const std::string& GetSourceHeader();
34+
static const std::string& GetGeneralSourceHeader();
3435
void PrintIncludes() override;
3536

3637
private:
3738
static const std::string source_header_;
39+
// general_source_header_ is used for the more general situation, which load
40+
// some header files while compiling but not set them into header files while
41+
// creating the kernel function.
42+
static const std::string general_source_header_;
3843
};
3944

4045
} // namespace backends

paddle/cinn/backends/nvrtc/header_generator.cc

+50
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,12 @@
1414

1515
#include "paddle/cinn/backends/nvrtc/header_generator.h"
1616

17+
#include <fstream>
1718
#include "glog/logging.h"
1819
#include "jitify.hpp" // NOLINT
20+
#include "paddle/cinn/common/common.h"
1921
#include "paddle/common/enforce.h"
22+
2023
namespace cinn {
2124
namespace backends {
2225
namespace nvrtc {
@@ -34,12 +37,59 @@ const size_t JitSafeHeaderGenerator::size() const {
3437
return include_names_.size();
3538
}
3639

40+
std::string read_file_as_string(const std::string& file_path) {
41+
#ifdef RUNTIME_INCLUDE_DIR
42+
static constexpr char* defined_runtime_include_dir = RUNTIME_INCLUDE_DIR;
43+
#else
44+
static constexpr char* defined_runtime_include_dir = nullptr;
45+
#endif
46+
47+
#ifdef CINN_WITH_CUDA
48+
std::string cinn_path = defined_runtime_include_dir;
49+
std::ifstream file(cinn_path + '/' + file_path);
50+
51+
if (!file.is_open()) {
52+
VLOG(1) << "Unable to open file : " << cinn_path << '/' << file_path;
53+
return "";
54+
}
55+
std::stringstream buffer;
56+
buffer << file.rdbuf();
57+
file.close();
58+
return buffer.str();
59+
#else
60+
return "";
61+
#endif
62+
}
63+
#ifdef CINN_WITH_CUDA
64+
65+
static const std::string cinn_float16_header = // NOLINT
66+
read_file_as_string("float16.h");
67+
static const std::string cinn_bfloat16_header = // NOLINT
68+
read_file_as_string("bfloat16.h");
69+
static const std::string cinn_with_cuda_header = // NOLINT
70+
R"(
71+
#pragma once
72+
#define CINN_WITH_CUDA
73+
)";
74+
static const std::string cinn_cuda_runtime_source_header = // NOLINT
75+
read_file_as_string("cinn_cuda_runtime_source.cuh");
76+
#endif
3777
JitSafeHeaderGenerator::JitSafeHeaderGenerator() {
3878
const auto& headers_map = ::jitify::detail::get_jitsafe_headers_map();
3979
for (auto& pair : headers_map) {
4080
include_names_.emplace_back(pair.first.data());
4181
headers_.emplace_back(pair.second.data());
4282
}
83+
#ifdef CINN_WITH_CUDA
84+
include_names_.emplace_back("float16_h");
85+
headers_.emplace_back(cinn_float16_header.data());
86+
include_names_.emplace_back("bfloat16_h");
87+
headers_.emplace_back(cinn_bfloat16_header.data());
88+
include_names_.emplace_back("cinn_with_cuda_h");
89+
headers_.emplace_back(cinn_with_cuda_header.data());
90+
include_names_.emplace_back("cinn_cuda_runtime_source_h");
91+
headers_.emplace_back(cinn_cuda_runtime_source_header.data());
92+
#endif
4393
}
4494

4595
} // namespace nvrtc

paddle/cinn/backends/nvrtc/nvrtc_util.cc

+12
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#include <fstream>
2525
#include <iostream>
2626

27+
#include "paddle/cinn/backends/codegen_cuda_dev.h"
2728
#include "paddle/cinn/backends/cuda_util.h"
2829
#include "paddle/cinn/backends/nvrtc/header_generator.h"
2930
#include "paddle/cinn/common/common.h"
@@ -181,6 +182,17 @@ std::string Compiler::CompileCudaSource(const std::string& code,
181182
nvrtcResult compile_res =
182183
nvrtcCompileProgram(prog, param_cstrings.size(), param_cstrings.data());
183184

185+
if (compile_res != NVRTC_SUCCESS) {
186+
std::string new_code = CodeGenCudaDev::GetGeneralSourceHeader() + code;
187+
NVRTC_CALL(nvrtcCreateProgram(&prog,
188+
new_code.c_str(),
189+
nullptr,
190+
header_gen.size(),
191+
header_gen.headers().data(),
192+
header_gen.include_names().data()));
193+
compile_res =
194+
nvrtcCompileProgram(prog, param_cstrings.size(), param_cstrings.data());
195+
}
184196
{ // get log
185197
size_t log_size;
186198
NVRTC_CALL(nvrtcGetProgramLogSize(prog, &log_size));

paddle/cinn/common/simplify_special_pattern.cc

+12-12
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,10 @@
2323
#include "paddle/cinn/optim/simplify_util.h"
2424
namespace cinn {
2525
namespace common {
26-
using cinn::optim::CheckPattern;
2726
using cinn::optim::GetFlattenExprs;
2827
using cinn::optim::IsNegatedIndexExpr;
2928
using cinn::optim::IsSumPartialBySymbol;
29+
using cinn::optim::MatchPattern;
3030
using cinn::optim::ProveDivisible;
3131
using cinn::optim::SimplifySymbolicAdd;
3232

@@ -51,7 +51,7 @@ static void MergeMulModInsertElements(
5151
*has_mult = true;
5252
mult_exprs->emplace_back(ele);
5353
} else {
54-
*no_opt_sum = no_opt_sum->get() ? *no_opt_sum + ele : ele;
54+
*no_opt_sum = no_opt_sum->get() ? ir::Add::Make(*no_opt_sum, ele) : ele;
5555
}
5656
}
5757
}
@@ -250,24 +250,24 @@ std::optional<ir::IndexExpr> AddMulCornerCase(
250250
// S0 / (S1 * S2) * S2 + S0 % (S1 * S2) / S1 ===> S0 / S1
251251
std::optional<ir::IndexExpr> DivMulAddModDivCase(const ir::IndexExpr& lhs,
252252
const ir::IndexExpr& rhs) {
253-
ir::Var a = ir::Var("a");
254-
ir::Var b = ir::Var("b");
255-
ir::Var c = ir::Var("c");
256-
ir::Var f = ir::Var("f");
257-
std::unordered_map<std::string, ir::IndexExpr> map;
258-
259-
ir::IndexExpr pattern = f / c * a + f % c / b;
253+
if (!MatchPattern(rhs, "f % c / b")) return std::nullopt;
260254

261255
auto flatten = GetFlattenExprs<ir::Add>(lhs);
262256
ir::IndexExpr res;
263257
bool find = false;
264258
for (const auto& expr : flatten) {
265259
if (!find) {
266260
ir::IndexExpr cand = ir::Add::Make(expr, rhs);
267-
map.clear();
261+
268262
// Check if the pattern is matched
269-
if (CheckPattern(cand, pattern, &map) &&
270-
map.at("c") == map.at("a") * map.at("b")) {
263+
auto opt_map = MatchPattern(
264+
cand,
265+
"f / c * a + f % c / b",
266+
[](const std::unordered_map<std::string, ir::IndexExpr>& m) {
267+
return m.at("c") == m.at("a") * m.at("b");
268+
});
269+
if (opt_map) {
270+
auto map = opt_map.value();
271271
ir::IndexExpr simplified = map.at("f") / map.at("b");
272272
res = res.defined() ? res + simplified : simplified;
273273
find = true;

paddle/cinn/hlir/dialect/operator/transforms/add_cinn_pass.cc

-3
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,6 @@
3131
#include "paddle/cinn/hlir/dialect/operator/ir/op_dialect.h"
3232
#include "paddle/cinn/hlir/dialect/operator/transforms/accuracy_check_pass.h"
3333
#include "paddle/cinn/hlir/dialect/operator/transforms/add_broadcast_to_elementwise_pass.h"
34-
#include "paddle/cinn/hlir/dialect/operator/transforms/add_store_in_fusion_op_pass.h"
35-
#include "paddle/cinn/hlir/dialect/operator/transforms/add_store_in_group_op_pass.h"
3634
#include "paddle/cinn/hlir/dialect/operator/transforms/cinn_group_cluster_pass.h"
3735
#include "paddle/cinn/hlir/dialect/operator/transforms/conv2d_transpose_filter_pass.h"
3836
#include "paddle/cinn/hlir/dialect/operator/transforms/convert_fa_to_qkvmha_pass.h"
@@ -201,7 +199,6 @@ void ApplyDivideGroupOpToFusionOpPass(
201199
std::shared_ptr<pir::PassManager> pass_manager = CreatePassManager();
202200
pass_manager->AddPass(
203201
cinn::dialect::ir::CreateRemoveRedundantGroupOutputPass());
204-
pass_manager->AddPass(cinn::dialect::ir::CreateAddStoreInGroupOpPass());
205202
pass_manager->AddPass(cinn::dialect::ir::CreateCinnGroupClusterPass());
206203

207204
pass_manager->AddPass(cinn::dialect::ir::CreateSingleOpFallbackToPhiPass());

0 commit comments

Comments (0)