
[Cherry-pick] cherry-pick PR 71919 to release/3.0 (#71919) #71964


Closed · wants to merge 24 commits into release/3.0
Commits (24)
599dc84
update (#71552) (#71583)
chen2016013 Mar 12, 2025
9b805c4
[CINN] Fix horizontal fusion with empty loop (#71550) (#71574)
huangjiyi Mar 12, 2025
9863408
Revert "[PHI]fix 0 size error (#71485)" (#71568) (#71576)
phlrain Mar 12, 2025
6320b01
fix bug (#71591)
gongshaotian Mar 13, 2025
aa72149
[PIR-Auto-Parallel] fix comm group hang in sync shared param pass (#…
waliwali777 Mar 13, 2025
3eb020a
[CherryPick][AutoParallel] Fix inplace op in grad clip (#71565) (#71584)
Waynezee Mar 13, 2025
aac2570
[3.0][Dy2St] Support `element_size` method for value and breakgraph w…
SigureMo Mar 13, 2025
6470d80
【Paddle Tensor】fix converter old ir issues -1 (#70849) (#71643)
PolaKuma Mar 14, 2025
bbc3129
setup (#71351) (#71630)
swgu98 Mar 14, 2025
dbaed63
[CINN] Create new AxisTransform when substitute dimexpr (#71587) (#71…
huangjiyi Mar 17, 2025
8513fe6
[AutoParallel] Fix pipeline visualization tool (cherry-pick from f086…
AndSonder Mar 17, 2025
9931609
[XPU] fix xpu grad merge bug when using amp master_grad (cherry-pick …
AndSonder Mar 17, 2025
d33a367
【Paddle TensorRT】Resolved the precision issue of pd_op.slice (#71655)…
lizexu123 Mar 18, 2025
40b25b1
[cherry-pick]fix DeepEP compile in infer mode (#71590) (#71656)
Hongqing-work Mar 18, 2025
079767e
【Paddle Tensor】Fix bugs related to converting unit tests about collec…
PolaKuma Mar 18, 2025
5aab1be
[cherry-pick] [AutoParallel] Add expand spmd #71603 (#71744)
Xing-lil Mar 19, 2025
34a136e
[Comm] Fix_NPU_Comm (#71723) (#71742)
Xing-lil Mar 19, 2025
0aa35fa
fix avx512 error (#71759)
XieYunshen Mar 19, 2025
7776754
[CherryPick][Auto Parallel] fix loss scale in xpu (#71698) (#71765)
Waynezee Mar 19, 2025
9b637a7
cherry-pick PR 71507 to release/3.0 (#71732)
mikethegoblin Mar 19, 2025
42c47d9
[AutoParallel] Update dense_tensor_idx des (#71571) (#71740)
Xing-lil Mar 19, 2025
6af9534
【Paddle TensorRT】Modified the serialization save path for TensorRT an…
lizexu123 Mar 19, 2025
6ed5dd3
update (#71784)
Xing-lil Mar 19, 2025
fc814e4
[inference]Fix FP16 precision BLHA accumulation overflow. (#71919)
bukejiyu Mar 27, 2025
6 changes: 6 additions & 0 deletions CMakeLists.txt
@@ -313,6 +313,7 @@ option(
OFF)
option(WITH_CINN "Compile PaddlePaddle with CINN" OFF)
option(WITH_NCCL "Compile PaddlePaddle with NCCL support" ON)
option(WITH_FLAGCX "Compile PaddlePaddle with FLAGCX support" OFF)
option(WITH_RCCL "Compile PaddlePaddle with RCCL support" ON)
option(WITH_XPU_BKCL "Compile PaddlePaddle with BAIDU KUNLUN XPU BKCL" OFF)
option(WITH_CRYPTO "Compile PaddlePaddle with crypto support" ON)
@@ -538,6 +539,11 @@ else()
endif()
endif()

if(WITH_FLAGCX)
add_definitions("-DPADDLE_WITH_FLAGCX")
include(flagcx)
endif()

if(WITH_HETERPS AND WITH_PSLIB)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
endif()
24 changes: 24 additions & 0 deletions cmake/flagcx.cmake
@@ -0,0 +1,24 @@
set(CMAKE_FIND_DEBUG_MODE ON)
# flagcx.cmake
if(NOT WITH_FLAGCX)
return()
endif()

if(WITH_FLAGCX)
set(FLAGCX_ROOT
$ENV{FLAGCX_ROOT}
CACHE PATH "FLAGCX_ROOT")
message(STATUS "FLAGCX_ROOT is ${FLAGCX_ROOT}")
find_path(
FLAGCX_INCLUDE_DIR flagcx.h
PATHS ${FLAGCX_ROOT}/flagcx/include
NO_DEFAULT_PATH)
message(STATUS "FLAGCX_INCLUDE_DIR is ${FLAGCX_INCLUDE_DIR}")
include_directories(SYSTEM ${FLAGCX_INCLUDE_DIR})
set(FLAGCX_LIB
"${FLAGCX_ROOT}/build/lib/libflagcx.so"
CACHE FILEPATH "flagcx library." FORCE)
generate_dummy_static_lib(LIB_NAME "flagcx" GENERATOR "flagcx.cmake")
target_link_libraries(flagcx ${FLAGCX_LIB})
message(STATUS "FLAGCX_LIB is ${FLAGCX_LIB}")
endif()
4 changes: 2 additions & 2 deletions cmake/simd.cmake
@@ -13,7 +13,7 @@ if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
set(AVX512F_FLAG "-mavx512f")
set(Wno_Maybe_Uninitialized "-Wno-maybe-uninitialized")
set(FMA_FLAG "-mfma")
if(${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0)
if(${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER_EQUAL 11.0)
set(NO_INLINE "-fno-inline")
else()
set(NO_INLINE "")
@@ -27,7 +27,7 @@ elseif(MSVC)
set(AVX512F_FLAG "/arch:AVX512")
set(Wno_Maybe_Uninitialized "/wd4701")
set(FMA_FLAG "/arch:AVX2")
if(${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0)
if(${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER_EQUAL 11.0)
set(NO_INLINE "/Ob0")
else()
set(NO_INLINE "")
3 changes: 1 addition & 2 deletions paddle/cinn/backends/nvrtc/header_generator.cc
@@ -49,8 +49,7 @@ std::string read_file_as_string(const std::string& file_path) {
std::ifstream file(cinn_path + '/' + file_path);

if (!file.is_open()) {
LOG_FIRST_N(INFO, 1) << "Unable to open file : " << cinn_path << '/'
<< file_path;
VLOG(1) << "Unable to open file : " << cinn_path << '/' << file_path;
return "";
}
std::stringstream buffer;
@@ -49,7 +49,7 @@ class FoldFullWithReshapeOpPattern : public pir::OpRewritePattern<OPTYPE> {
phi::IntArray(out_shape.Get(), out_shape.size()));

auto new_full_op = rewriter.Build<paddle::dialect::FullOp>(attrs);

new_full_op->result(0).set_type(op->result(0).type());
rewriter.ReplaceAllUsesWith(op->result(0), new_full_op->result(0));
rewriter.EraseOp(op);
if (pre_op->use_empty()) {
56 changes: 33 additions & 23 deletions paddle/cinn/operator_fusion/fusion_tracker/interpreter.cc
@@ -161,31 +161,41 @@ void RunItersTransformInstr(const std::shared_ptr<ItersTransformInstr>& instr,

void RunAxisTransformInstr(const std::shared_ptr<AxisTransformInstr>& instr,
FusionInterpreter* interpreter) {
auto substitute_dimexpr_for_shape = [&](std::vector<symbol::DimExpr>& shape) {
for (auto& dim_expr : shape) {
if (dim_expr.isa<std::int64_t>()) continue;
symbol::DimExpr origin_dim_expr = dim_expr;
while (true) {
dim_expr = symbol::SubstituteDimExpr(
dim_expr, interpreter->substitute_dimexpr_map);
if (dim_expr == origin_dim_expr || dim_expr.isa<std::int64_t>()) break;
origin_dim_expr = dim_expr;
}
}
};
auto substitute_dimexpr_for_transform =
adt::match{[&](const AppendAxisTransformPtr& transform) {
substitute_dimexpr_for_shape(transform->shape);
},
[&](const ReshapeTransformPtr& transform) {
substitute_dimexpr_for_shape(transform->in_shape);
substitute_dimexpr_for_shape(transform->out_shape);
},
[&](const auto& transform) {}};
auto substitute_dimexpr_for_shape =
[&](const std::vector<symbol::DimExpr>& shape) {
std::vector<symbol::DimExpr> result;
for (const auto& dim_expr : shape) {
symbol::DimExpr substituted = dim_expr;
while (true) {
if (substituted.isa<std::int64_t>()) break;
auto tmp_substituted = symbol::SubstituteDimExpr(
substituted, interpreter->substitute_dimexpr_map);
if (tmp_substituted == substituted) break;
substituted = tmp_substituted;
}
result.emplace_back(substituted);
}
return result;
};
auto substitute_dimexpr_for_transform = adt::match{
[&](const AppendAxisTransformPtr& trans) -> AxisTransform {
auto substituted_shape = substitute_dimexpr_for_shape(trans->shape);
return std::make_shared<AppendAxisTransform>(trans->axis,
substituted_shape);
},
[&](const ReshapeTransformPtr& trans) -> AxisTransform {
auto substituted_in_shape =
substitute_dimexpr_for_shape(trans->in_shape);
auto substituted_out_shape =
substitute_dimexpr_for_shape(trans->out_shape);
return std::make_shared<ReshapeTransform>(substituted_in_shape,
substituted_out_shape);
},
[&](const auto& trans) -> AxisTransform { return trans; }};
auto axis_transform = [&](ir::Expr op_expr) -> ir::Expr {
for (auto trans : instr->axis_transform_route_) {
std::visit(substitute_dimexpr_for_transform, trans);
op_expr = std::visit(ApplyAxisTransform(op_expr), trans);
auto new_trans = std::visit(substitute_dimexpr_for_transform, trans);
op_expr = std::visit(ApplyAxisTransform(op_expr), new_trans);
}
return op_expr;
};
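The reworked helpers above run the DimExpr substitution to a fixed point and return fresh `AppendAxisTransform`/`ReshapeTransform` objects instead of mutating the (potentially shared) transforms in place. Below is a minimal standalone sketch of the same fixed-point pattern, with `std::string` and `std::map` standing in for CINN's `symbol::DimExpr` and substitution map; all names are illustrative, not CINN's API.

```cpp
#include <map>
#include <string>
#include <vector>

// Rewrite one symbolic dimension through the rules until it stops changing,
// mirroring the while-loop added in RunAxisTransformInstr.
std::string SubstituteToFixedPoint(
    std::string dim, const std::map<std::string, std::string>& rules) {
  while (true) {
    auto it = rules.find(dim);
    if (it == rules.end() || it->second == dim) break;
    dim = it->second;
  }
  return dim;
}

// Build and return a new shape; the input is never modified, so a transform
// shared by several instructions cannot be corrupted by one interpreter run.
std::vector<std::string> SubstituteShape(
    const std::vector<std::string>& shape,
    const std::map<std::string, std::string>& rules) {
  std::vector<std::string> result;
  result.reserve(shape.size());
  for (const auto& dim : shape) {
    result.push_back(SubstituteToFixedPoint(dim, rules));
  }
  return result;
}
```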
1 change: 1 addition & 0 deletions paddle/cinn/operator_fusion/pattern_fuser.h
@@ -365,6 +365,7 @@ static bool IsLoopFrameworkEqual(const StmtPattern& lhs,
const auto& rhs_loops = GetLoopFramework(rhs);
VLOG(4) << "lhs " << lhs_loops.DebugStr();
VLOG(4) << "rhs " << rhs_loops.DebugStr();
if (lhs_loops.loop.empty() || rhs_loops.loop.empty()) return false;

// TODO(huangjiyi): support horizontal fusion without reduce dims equal.
const auto get_reduce_loop = [](const MaybeLoopFramework& loop) {
@@ -563,7 +563,7 @@ LoopAxisMapping CreateLoopAxisMappingForReshape(pir::Operation* op) {
result.output_values.push_back(op->result(0));
result.loop2output.resize(1);
auto in_shape = GetCompatibleValueAllDims(op->operand_source(0));
auto out_shape = GetValueAllDims(op->result(0));
auto out_shape = GetCompatibleValueAllDims(op->result(0));
result.loop = out_shape;

if (!ShapeProductEqual(in_shape, out_shape)) {
34 changes: 34 additions & 0 deletions paddle/common/flags.cc
@@ -1261,6 +1261,19 @@ PHI_DEFINE_EXPORTED_bool(multi_node_sample_use_gpu_table,
PHI_DEFINE_EXPORTED_bool(nccl_blocking_wait, false, "nccl blocking wait");
#endif

/**
* ProcessGroupFlagCX related FLAG
* Name: flagcx_blocking_wait
* Since Version:
* Value Range: bool, default=false
* Example:
* Note: flagcx blocking wait.
* blocks host thread until collective operation completes
*/
#if defined(PADDLE_WITH_FLAGCX)
PHI_DEFINE_EXPORTED_bool(flagcx_blocking_wait, false, "flagcx blocking wait");
#endif

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PHI_DEFINE_EXPORTED_bool(benchmark_nccl,
false,
@@ -1766,6 +1779,13 @@ PHI_DEFINE_EXPORTED_string(
"For instance, /usr/local/cuda/lib64. If default, "
"dlopen will search cuda from LD_LIBRARY_PATH");

PHI_DEFINE_EXPORTED_string(
flagcx_dir, // NOLINT
"",
"Specify path for loading libflagcx.so. For instance, "
"For instance, /usr/local/flagcx/lib. If default, "
"dlopen will search flagcx from LD_LIBRARY_PATH");

PHI_DEFINE_EXPORTED_string(cupti_dir,
"",
"Specify path for loading cupti.so."); // NOLINT
@@ -1873,6 +1893,20 @@ PHI_DEFINE_EXPORTED_bool(
false,
"Enable xqa optim in block_multihead_attention kernel (GQA).");

/**
* Whether to use FP32 for accumulation of QK output in
* block_multihead_attention kernel (fp16)
* Name: blha_use_fp32_qk_sum
* Since Version: 3.0.0
* Value Range: bool, default=false
* Example:
* Note: If TRUE, FP32 will be used for accumulation of the QK output
* in block_multihead_attention kernel (fp16).
*/
PHI_DEFINE_EXPORTED_bool(blha_use_fp32_qk_sum,
false,
"use FP32 for accumulation of QK output in "
"block_multihead_attention kernel(fp16).");

PHI_DEFINE_EXPORTED_bool(cuda_core_int8_gemm,
false,
"Enable speed up int8 gemm calculations when m<=4");
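The `blha_use_fp32_qk_sum` flag is the switch for the fix named in this PR's title: when the QK partial products of an FP16 attention kernel are also summed in FP16, a long sequence can push the accumulator past FP16's largest finite value (65504). The sketch below emulates that failure mode in plain C++; the sequence length and partial-product value are illustrative, not taken from the block_multihead_attention kernel.

```cpp
#include <cstdio>

int main() {
  const int seq_len = 8192;     // hypothetical sequence length
  const float partial = 16.0f;  // hypothetical q[i] * k[i] partial product

  float fp32_sum = 0.0f;        // what FLAGS_blha_use_fp32_qk_sum enables
  float fp16_range_sum = 0.0f;  // emulate an FP16 accumulator's limited range
  const float kFp16Max = 65504.0f;

  for (int i = 0; i < seq_len; ++i) {
    fp32_sum += partial;
    fp16_range_sum += partial;
    // A real half-precision accumulator overflows to +inf here; we saturate
    // so the printout stays readable.
    if (fp16_range_sum > kFp16Max) fp16_range_sum = kFp16Max;
  }

  // FP32 holds the exact sum (131072); an FP16-range accumulator cannot.
  std::printf("fp32 sum = %.0f, fp16-range sum = %.0f\n",
              fp32_sum, fp16_range_sum);
  return 0;
}
```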
5 changes: 5 additions & 0 deletions paddle/common/macros.h
@@ -30,6 +30,8 @@ limitations under the License. */
#define COMM_CONTEXT phi::distributed::NCCLCommContext
#elif (defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL))
#define COMM_CONTEXT phi::distributed::BKCLCommContext
#elif defined(PADDLE_WITH_CUSTOM_DEVICE)
#define COMM_CONTEXT phi::distributed::XCCLCommContext
#endif

#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
@@ -38,6 +40,9 @@
#elif defined(PADDLE_WITH_XPU_BKCL)
#define CREATE_COMM_CONTEXT \
phi::distributed::CommContextManager::CreateBKCLCommContext
#elif defined(PADDLE_WITH_CUSTOM_DEVICE)
#define CREATE_COMM_CONTEXT \
phi::distributed::CommContextManager::CreateXCCLCommContext
#endif

namespace common {
10 changes: 10 additions & 0 deletions paddle/fluid/distributed/collective/CMakeLists.txt
@@ -36,6 +36,13 @@ if(WITH_NCCL OR WITH_RCCL)

endif()

if(WITH_FLAGCX)
cc_library(
process_group_flagcx
SRCS process_group_flagcx.cc common.cc
DEPS process_group phi)
endif()

if(WITH_XPU_BKCL)
cc_library(
process_group_bkcl
@@ -66,6 +73,9 @@ set(COMM_UTILS_DEPS process_group)
if(WITH_NCCL OR WITH_RCCL)
set(COMM_UTILS_DEPS ${PROCESS_GROUP_UTILS_DEPS} process_group_nccl)
endif()
if(WITH_FLAGCX)
set(COMM_UTILS_DEPS ${PROCESS_GROUP_UTILS_DEPS} process_group_flagcx)
endif()
if(WITH_CUSTOM_DEVICE)
set(COMM_UTILS_DEPS ${PROCESS_GROUP_UTILS_DEPS} process_group_custom)
endif()
3 changes: 2 additions & 1 deletion paddle/fluid/distributed/collective/deep_ep/deep_ep.cpp
@@ -18,7 +18,6 @@
// https://github.com/deepseek-ai/DeepEP/blob/main/LICENSE

#include <cuda_runtime.h>
#include <pybind11/functional.h>
#include <atomic>
#include <chrono>
#include <memory>
@@ -209,6 +208,7 @@ int Buffer::get_root_rdma_rank(bool global) const {

int Buffer::get_local_device_id() const { return device_id; }

#ifndef PADDLE_NO_PYTHON
pybind11::bytearray Buffer::get_local_ipc_handle() const {
return {ipc_handles[nvl_rank].reserved, CUDA_IPC_HANDLE_SIZE};
}
@@ -301,6 +301,7 @@ void Buffer::sync(
// Ready to use
available = true;
}
#endif

std::tuple<deep_ep::detail::Tensor,
std::optional<deep_ep::detail::Tensor>,
5 changes: 5 additions & 0 deletions paddle/fluid/distributed/collective/deep_ep/deep_ep.hpp
@@ -24,8 +24,11 @@
#undef NDEBUG
#endif

#ifndef PADDLE_NO_PYTHON
#include <pybind11/pybind11.h>
#include <pybind11/pytypes.h>
#endif
#include <optional>
#include <tuple>
#include <vector>
#include "paddle/fluid/distributed/collective/deep_ep/include/types.h"
@@ -119,6 +122,7 @@ struct Buffer {

int get_local_device_id() const;

#ifndef PADDLE_NO_PYTHON
pybind11::bytearray get_local_ipc_handle() const;

pybind11::bytearray get_local_nvshmem_unique_id() const;
@@ -127,6 +131,7 @@
const std::vector<std::optional<pybind11::bytearray>>&
all_gathered_handles,
const std::optional<pybind11::bytearray>& root_unique_id_opt);
#endif

std::tuple<deep_ep::detail::Tensor,
std::optional<deep_ep::detail::Tensor>,
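These guards let DeepEP build in inference mode, where Python and pybind11 are not linked: `PADDLE_NO_PYTHON` strips every pybind11-dependent declaration while the rest of `Buffer` stays available. A reduced sketch of the same pattern follows; only the macro name comes from this diff, `Widget` and its members are illustrative.

```cpp
#ifndef PADDLE_NO_PYTHON
#include <pybind11/pybind11.h>
#endif

// Methods that hand IPC handles back to Python exist only in binding builds;
// pure C++ consumers (e.g. an inference runtime) see a pybind11-free class.
struct Widget {
  int device_id = 0;

  int local_device() const { return device_id; }  // always available

#ifndef PADDLE_NO_PYTHON
  pybind11::bytearray ipc_handle() const {
    return pybind11::bytearray("handle-bytes", 12);
  }
#endif
};
```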