Skip to content

Integrate FlagCX into Paddle #71507

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 28 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
2246208
initial commit for flagcx integration
mikethegoblin Mar 9, 2025
ccfeb9e
fix compilation issue related to dynamic loader
mikethegoblin Mar 9, 2025
85baaf7
fix compilation issue
mikethegoblin Mar 9, 2025
c9b186e
fix bugs and test distributed api functions
mikethegoblin Mar 10, 2025
d6d34a9
set WITH_FLAGCX flag to off by default
mikethegoblin Mar 14, 2025
b8ed1c6
fix compilation issue when not compiling with flagcx
mikethegoblin Mar 14, 2025
ede5be9
add unit test and remove redundant code
mikethegoblin Mar 15, 2025
9a8d1d6
remove redundant code in comm_context_manager.cc
mikethegoblin Mar 15, 2025
69a74d6
remove debug log
mikethegoblin Mar 15, 2025
aaa9b92
remove unit tests
mikethegoblin Mar 17, 2025
f6b78a3
restore test file format
mikethegoblin Mar 17, 2025
95e8b1d
modify code according to review comments
mikethegoblin Mar 18, 2025
bb71a32
fix code format
mikethegoblin Mar 18, 2025
698de0a
fix code format
mikethegoblin Mar 18, 2025
51d03f4
add flagcx as submodule and build it during cmake
mikethegoblin Mar 20, 2025
ce9e449
modify cmake
mikethegoblin Mar 20, 2025
6fccbdf
added some unit tests
mikethegoblin Mar 21, 2025
d7f92be
resolve conflicts with develop branch
mikethegoblin Mar 21, 2025
10d66d2
update cmake and unit test
mikethegoblin Mar 21, 2025
bcbcbff
remove flagcx from commutils deps
mikethegoblin Mar 21, 2025
af1a7e4
copy flagcx so to system lib path
mikethegoblin Mar 22, 2025
bea6809
modify unit test
mikethegoblin Mar 24, 2025
fdd9595
update unit test and packing
mikethegoblin Mar 25, 2025
660521f
fix typo in env_dict
mikethegoblin Mar 25, 2025
edeb718
change timeout for unittest
mikethegoblin Mar 25, 2025
a2b67d9
remove redundant comments
mikethegoblin Mar 25, 2025
3479cbd
skip flagcx tests when not compiled with flagcx
mikethegoblin Mar 26, 2025
b64dc67
skip tests
mikethegoblin Mar 26, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -129,3 +129,7 @@
path = third_party/openvino
url = https://github.com/openvinotoolkit/openvino.git
ignore = dirty
[submodule "third_party/flagcx"]
path = third_party/flagcx
url = https://github.com/FlagOpen/FlagCX.git
ignore = dirty
6 changes: 6 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,7 @@ option(
OFF)
option(WITH_CINN "Compile PaddlePaddle with CINN" OFF)
option(WITH_NCCL "Compile PaddlePaddle with NCCL support" ON)
option(WITH_FLAGCX "Compile PaddlePaddle with FLAGCX support" OFF)
option(WITH_RCCL "Compile PaddlePaddle with RCCL support" ON)
option(WITH_XPU_BKCL "Compile PaddlePaddle with BAIDU KUNLUN XPU BKCL" OFF)
option(WITH_CRYPTO "Compile PaddlePaddle with crypto support" ON)
Expand Down Expand Up @@ -538,6 +539,11 @@ else()
endif()
endif()

# When FlagCX support is requested, define PADDLE_WITH_FLAGCX for every
# compilation unit so FlagCX-specific sources are compiled in.
if(WITH_FLAGCX)
add_definitions("-DPADDLE_WITH_FLAGCX")
# The FlagCX build itself is pulled in from cmake/third_party.cmake via
# include(external/flagcx), so nothing is included here.
endif()

if(WITH_HETERPS AND WITH_PSLIB)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
endif()
Expand Down
47 changes: 47 additions & 0 deletions cmake/external/flagcx.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# flagcx.cmake
#
# Copies the vendored FlagCX sources into the build tree, builds them with
# FlagCX's own Makefile at configure time, and publishes the results:
#
#   FLAGCX_INCLUDE_DIR  directory that contains flagcx.h
#   FLAGCX_LIB          full path of the flagcx library that was found
#   flagcx              INTERFACE target (registered as a third-party dep by
#                       cmake/third_party.cmake)
#
# The whole file is a no-op unless WITH_FLAGCX is enabled.
if(NOT WITH_FLAGCX)
  return()
endif()

set(FLAGCX_SOURCE_DIR "${PADDLE_SOURCE_DIR}/third_party/flagcx")
# NOTE(review): the two paths below assume the build tree is
# ${PADDLE_SOURCE_DIR}/build. They are kept unchanged because other scripts
# may rely on this location -- confirm before switching to CMAKE_BINARY_DIR.
set(FLAGCX_BINARY_DIR "${PADDLE_SOURCE_DIR}/build/third_party/flagcx")
set(THIRD_PARTY_DIR "${PADDLE_SOURCE_DIR}/build/third_party")
set(FLAGCX_ROOT "/usr/local/flagcx")
set(FLAGCX_LIB_DIR "${FLAGCX_BINARY_DIR}/build/lib")
set(USR_LOCAL_DIR "/usr/local")

# Always start from a clean copy so stale objects from a previous configure
# cannot leak into the new build.
file(REMOVE_RECURSE "${FLAGCX_BINARY_DIR}")
message(STATUS "removed old flagcx dir")
message(STATUS "Copying third-party source to build directory")
# `cmake -E copy_directory` is portable and creates missing parent
# directories, so the copy always lands exactly in FLAGCX_BINARY_DIR.
execute_process(
  COMMAND "${CMAKE_COMMAND}" -E copy_directory "${FLAGCX_SOURCE_DIR}"
          "${FLAGCX_BINARY_DIR}"
  RESULT_VARIABLE COPY_RESULT)

if(NOT COPY_RESULT EQUAL 0)
  message(FATAL_ERROR "Failed to copy third-party source to build directory")
endif()

# Build FlagCX now, during configuration, using its own Makefile.
message(STATUS "Building third-party library with its Makefile")
execute_process(
  COMMAND make
  WORKING_DIRECTORY "${FLAGCX_BINARY_DIR}"
  RESULT_VARIABLE BUILD_RESULT)

# Stop immediately on a failed build instead of continuing with a missing or
# half-built library.
if(NOT BUILD_RESULT EQUAL 0)
  message(
    FATAL_ERROR
      "Failed to build FlagCX in ${FLAGCX_BINARY_DIR} (make returned ${BUILD_RESULT})"
  )
endif()

find_path(
  FLAGCX_INCLUDE_DIR flagcx.h
  PATHS "${FLAGCX_SOURCE_DIR}/flagcx/include"
  NO_DEFAULT_PATH)

if(NOT FLAGCX_INCLUDE_DIR)
  message(
    FATAL_ERROR
      "flagcx.h not found under ${FLAGCX_SOURCE_DIR}/flagcx/include; "
      "is the third_party/flagcx submodule checked out?")
endif()

message(STATUS "FLAGCX_INCLUDE_DIR is ${FLAGCX_INCLUDE_DIR}")
include_directories(SYSTEM "${FLAGCX_INCLUDE_DIR}")

add_library(flagcx INTERFACE)

# Prefer the library that was just built; only fall back to the default
# search locations when the build tree does not contain it. The second call
# is skipped by CMake when the first one already succeeded.
find_library(
  FLAGCX_LIB
  NAMES flagcx libflagcx
  PATHS "${FLAGCX_LIB_DIR}"
  NO_DEFAULT_PATH
  DOC "Path to the FlagCX library")
find_library(
  FLAGCX_LIB
  NAMES flagcx libflagcx
  DOC "Path to the FlagCX library")

if(NOT FLAGCX_LIB)
  message(FATAL_ERROR "FlagCX library not found in ${FLAGCX_LIB_DIR}")
endif()

# FLAGCX_LIB is a file path, not a target, so it must not be passed to
# add_dependencies(); the library already exists once configuration finishes.
message(STATUS "FLAGCX_LIB is ${FLAGCX_LIB}")
4 changes: 4 additions & 0 deletions cmake/third_party.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -469,6 +469,10 @@ if(WITH_TESTING OR WITH_DISTRIBUTE)
include(external/gtest) # download, build, install gtest
list(APPEND third_party_deps extern_gtest)
endif()
# FlagCX is opt-in: include its build script and register the `flagcx`
# target it defines in the third_party_deps list.
if(WITH_FLAGCX)
include(external/flagcx)
list(APPEND third_party_deps flagcx)
endif()

if(WITH_ONNXRUNTIME)
include(external/onnxruntime
Expand Down
20 changes: 20 additions & 0 deletions paddle/common/flags.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1265,6 +1265,19 @@ PHI_DEFINE_EXPORTED_bool(multi_node_sample_use_gpu_table,
PHI_DEFINE_EXPORTED_bool(nccl_blocking_wait, false, "nccl blocking wait");
#endif

/**
 * ProcessGroupFlagCX related FLAG
 * Name: flagcx_blocking_wait
 * Since Version:
 * Value Range: bool, default=false
 * Example:
 * Note: flagcx blocking wait.
 * blocks host thread until collective operation completes
 */
#if defined(PADDLE_WITH_FLAGCX)
PHI_DEFINE_EXPORTED_bool(flagcx_blocking_wait, false, "flagcx blocking wait");
#endif

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PHI_DEFINE_EXPORTED_bool(benchmark_nccl,
false,
Expand Down Expand Up @@ -1770,6 +1783,13 @@ PHI_DEFINE_EXPORTED_string(
"For instance, /usr/local/cuda/lib64. If default, "
"dlopen will search cuda from LD_LIBRARY_PATH");

// Directory searched for libflagcx.so; defaults to the location used by a
// FlagCX build under /usr/local/flagcx.
PHI_DEFINE_EXPORTED_string(
    flagcx_dir,  // NOLINT
    "/usr/local/flagcx/build/lib",
    "Specify path for loading libflagcx.so. "
    "For instance, /usr/local/flagcx/lib. If default, "
    "dlopen will search flagcx from LD_LIBRARY_PATH");

PHI_DEFINE_EXPORTED_string(cupti_dir,
"",
"Specify path for loading cupti.so."); // NOLINT
Expand Down
7 changes: 7 additions & 0 deletions paddle/fluid/distributed/collective/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,13 @@ if(WITH_NCCL OR WITH_RCCL)

endif()

# FlagCX-backed process group library; only defined when WITH_FLAGCX is
# enabled.
if(WITH_FLAGCX)
cc_library(
process_group_flagcx
SRCS process_group_flagcx.cc common.cc
DEPS process_group phi)
endif()

if(WITH_XPU_BKCL)
cc_library(
process_group_bkcl
Expand Down
Loading
Loading