
Commit 2fad9d0

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into support_scalar_type

2 parents 195c87e + b703025

330 files changed: +20009 additions, -3848 deletions


cmake/external/protobuf.cmake

Lines changed: 3 additions & 0 deletions
@@ -326,6 +326,9 @@ elseif(WITH_IPU)
   set(PROTOBUF_VERSION 21.12)
 elseif(WITH_ARM_BRPC)
   set(PROTOBUF_VERSION 21.12-baidu-ee-common)
+elseif(WIN32)
+  # Lower version protobuf is used for Windows
+  set(PROTOBUF_VERSION 3.2)
 else()
   set(PROTOBUF_VERSION 21.12)
   if(WITH_GPU)

cmake/external/xpu.cmake

Lines changed: 2 additions & 2 deletions
@@ -7,8 +7,8 @@ set(XPU_PROJECT "extern_xpu")
 set(XPU_API_LIB_NAME "libxpuapi.so")
 set(XPU_RT_LIB_NAME "libxpurt.so")
 
-set(XPU_BASE_DATE "20230119")
-set(XPU_XCCL_BASE_VERSION "1.0.7")
+set(XPU_BASE_DATE "20230215")
+set(XPU_XCCL_BASE_VERSION "1.0.8")
 
 if(NOT DEFINED XPU_BASE_URL)
   set(XPU_BASE_URL_WITHOUT_DATE

cmake/flags.cmake

Lines changed: 1 addition & 0 deletions
@@ -149,6 +149,7 @@ if(NOT WIN32)
       -Wno-unused-parameter
       -Wno-unused-function
       -Wno-error=literal-suffix
+      -Wno-error=array-bounds # Warning in Eigen, gcc 12.2
       -Wno-error=ignored-attributes # Warnings in Eigen, gcc 6.3
       -Wno-error=terminate # Warning in PADDLE_ENFORCE
       -Wno-error=int-in-bool-context # Warning in Eigen gcc 7.2

cmake/inference_lib.cmake

Lines changed: 2 additions & 2 deletions
@@ -439,7 +439,7 @@ copy(
   DSTS ${dst_dir}/${module}/allocation)
 
 set(module "platform")
-set(platform_lib_deps profiler_proto errors)
+set(platform_lib_deps phi_profiler_proto errors)
 if(WITH_GPU)
   set(platform_lib_deps ${platform_lib_deps} external_error_proto)
 endif()
@@ -449,7 +449,7 @@ copy(
   fluid_lib_dist
   SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h
        ${src_dir}/${module}/details/*.h
-       ${PADDLE_BINARY_DIR}/paddle/fluid/platform/*.pb.h
+       ${PADDLE_BINARY_DIR}/paddle/phi/api/profiler/*.pb.h
   DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload
        ${dst_dir}/${module}/details ${dst_dir}/${module})

paddle/fluid/distributed/CMakeLists.txt

Lines changed: 2 additions & 5 deletions
@@ -4,11 +4,8 @@ add_subdirectory(fleet_executor)
 if(WITH_PYTHON)
   py_proto_compile(pslib_py_proto SRCS ps.proto)
   py_proto_compile(ps_py_proto SRCS the_one_ps.proto)
-  add_custom_target(
-    ps_py_proto_init ALL
-    COMMAND ${CMAKE_COMMAND} -E make_directory
-            ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto)
-  add_dependencies(ps_py_proto ps_py_proto_init)
+  file(MAKE_DIRECTORY
+       ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto)
   set(PSLIB_PROTO_DSTPATH
       "${PADDLE_SOURCE_DIR}/python/paddle/fluid/incubate/fleet/parameter_server/pslib/"
   )

paddle/fluid/distributed/fleet_executor/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -44,6 +44,7 @@ cc_library(
   message_bus.cc
   dist_model_tensor_wrapper.cc
   DEPS proto_desc
+       standalone_executor
        fleet_executor_desc_proto
        interceptor_message_proto
        task_loop_thread_pool

paddle/fluid/distributed/fleet_executor/carrier.cc

Lines changed: 52 additions & 3 deletions
@@ -28,6 +28,13 @@
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/framework/variable_helper.h"
 
+PADDLE_DEFINE_EXPORTED_bool(
+    fleet_executor_with_standalone,
+    false,
+    "Use standalone executor to run ops. Temporary FLAGS, will be removed "
+    "after all fleet executor cases are modified to run ops with standalone "
+    "executor.");
+
 namespace paddle {
 namespace distributed {
 
@@ -95,7 +102,7 @@ void Carrier::Init(
   thread_pool_.SetThreadNum(thread_num_);
   thread_pool_.Start();
 
-  CreateInterceptors();
+  CreateInterceptors(inference_root_scope_vars);
   is_init_ = true;
 }
 
@@ -279,7 +286,8 @@ static std::shared_ptr<framework::GarbageCollector> GetGC(
   return gc;
 }
 
-void Carrier::CreateInterceptors() {
+void Carrier::CreateInterceptors(
+    const std::vector<std::string>& inference_root_scope_vars) {
   if (interceptor_id_to_node_.empty()) return;
 
   auto gc = GetGC(place_);
@@ -343,7 +351,48 @@ void Carrier::CreateInterceptors() {
     interceptor->SetMiniBatchScope(minibatch_scope_);
     interceptor->SetMicroBatchScope(microbatch_scopes_);
     interceptor->SetRootScope(root_scope_);
-    interceptor->SetGC(gc);
+
+    if (FLAGS_fleet_executor_with_standalone &&
+        (task_node->type() == "Amplifier" || task_node->type() == "Compute")) {
+      std::vector<std::shared_ptr<InterpreterCore>> cores;
+      framework::interpreter::ExecutionConfig execution_config;
+      execution_config.create_local_scope = false;
+      execution_config.force_root_scope_vars = std::set<std::string>(
+          inference_root_scope_vars.begin(), inference_root_scope_vars.end());
+
+      const framework::ProgramDesc* program = task_node->program();
+      PADDLE_ENFORCE_NOT_NULL(
+          program,
+          phi::errors::InvalidArgument("TaskNode %d's program is not set.",
+                                       interceptor_id));
+      std::vector<framework::VarDesc*> all_vars = program->Block(0).AllVars();
+      for (framework::VarDesc* var : all_vars) {
+        execution_config.skip_gc_vars.insert(var->Name());
+      }
+
+      // ONLY unused vars can be GCed.
+      const std::unordered_map<const framework::OperatorBase*,
+                               std::vector<std::string>>& unused_vars =
+          task_node->unused_vars();
+      for (auto& item : unused_vars) {
+        for (const std::string& unused_var : item.second) {
+          execution_config.skip_gc_vars.erase(unused_var);
+        }
+      }
+
+      for (framework::Scope* scope : microbatch_scopes_) {
+        cores.push_back(std::make_shared<InterpreterCore>(
+            place_, task_node->program()->Block(0), scope, execution_config));
+      }
+
+      for (size_t i = 1; i < cores.size(); ++i) {
+        cores[i]->ShareWorkQueueFrom(cores[i - 1]);
+      }
+
+      interceptor->SetInterpreterCore(cores);
+    } else {
+      interceptor->SetGC(gc);
+    }
 
     SetInterceptor(interceptor_id, std::move(interceptor));
     VLOG(3) << "Create Interceptor with interceptor id: " << interceptor_id
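
Note on the GC whitelist in the hunk above: skip_gc_vars starts out holding
every variable in the block, and the per-op unused vars are then erased from
it, so ONLY variables reported as unused stay eligible for collection. (Like
other Paddle exported flags, fleet_executor_with_standalone can typically be
toggled through the matching FLAGS_fleet_executor_with_standalone environment
variable.) A minimal standalone sketch of just that set arithmetic, with
hypothetical variable and op names rather than Paddle API:

#include <iostream>
#include <set>
#include <string>
#include <unordered_map>
#include <vector>

int main() {
  // Stand-ins for program->Block(0).AllVars() and task_node->unused_vars().
  std::vector<std::string> all_vars = {"x", "w", "tmp", "y"};
  std::unordered_map<std::string, std::vector<std::string>> unused_vars = {
      {"matmul", {"tmp"}}};

  // Start by protecting every variable from garbage collection ...
  std::set<std::string> skip_gc_vars(all_vars.begin(), all_vars.end());

  // ... then drop the vars each op reports as unused, so that only unused
  // vars can be GCed -- the same invariant as in CreateInterceptors.
  for (const auto& item : unused_vars) {
    for (const std::string& unused_var : item.second) {
      skip_gc_vars.erase(unused_var);
    }
  }

  for (const std::string& name : skip_gc_vars) {
    std::cout << name << " survives GC\n";  // prints w, x, y
  }
  return 0;
}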

paddle/fluid/distributed/fleet_executor/carrier.h

Lines changed: 2 additions & 1 deletion
@@ -94,7 +94,8 @@ class Carrier final {
   Carrier() = delete;
 
   // create each Interceptor
-  void CreateInterceptors();
+  void CreateInterceptors(
+      const std::vector<std::string>& inference_root_scope_vars = {});
 
   int64_t GetRank(int64_t interceptor_id) const;
paddle/fluid/distributed/fleet_executor/compute_interceptor.cc

Lines changed: 14 additions & 7 deletions
@@ -187,20 +187,27 @@ void ComputeInterceptor::ReplyCompletedToUpStream() {
 }
 
 void ComputeInterceptor::RunOps() {
-  for (auto op : node_->ops()) {
+  if (!cores_.empty() || !node_->ops().empty()) {
     PADDLE_ENFORCE_LT(cur_scope_id_,
                       microbatch_scopes_.size(),
                       platform::errors::InvalidArgument(
                           "Step out of range. There are %ld "
                           "microbatch_scopes, but receive scope index %ld",
                           microbatch_scopes_.size(),
                           cur_scope_id_));
-    op->Run(*microbatch_scopes_[cur_scope_id_], place_);
-    if (gc_) {
-      framework::DeleteUnusedTensors(*microbatch_scopes_[cur_scope_id_],
-                                     op,
-                                     node_->unused_vars(),
-                                     gc_.get());
+  }
+
+  if (!cores_.empty()) {
+    cores_[cur_scope_id_]->Run(/*feed_names=*/{}, /*need_fetch=*/false);
+  } else {
+    for (auto op : node_->ops()) {
+      op->Run(*microbatch_scopes_[cur_scope_id_], place_);
+      if (gc_) {
+        framework::DeleteUnusedTensors(*microbatch_scopes_[cur_scope_id_],
+                                       op,
+                                       node_->unused_vars(),
+                                       gc_.get());
+      }
     }
   }
 }
paddle/fluid/distributed/fleet_executor/interceptor.h

Lines changed: 8 additions & 0 deletions
@@ -24,6 +24,7 @@
 
 #include "paddle/fluid/distributed/fleet_executor/interceptor_message.pb.h"
 #include "paddle/fluid/framework/blocking_queue.h"
+#include "paddle/fluid/framework/new_executor/interpretercore.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/errors.h"
 #include "paddle/fluid/platform/macros.h"
@@ -40,6 +41,8 @@ class TaskNode;
 class Carrier;
 class TaskLoop;
 
+using InterpreterCore = framework::InterpreterCore;
+
 constexpr int64_t SOURCE_ID = -1;
 constexpr int64_t SINK_ID = -2;
 
@@ -75,6 +78,10 @@ class Interceptor {
   void SetMicroBatchScope(const std::vector<framework::Scope*>& scopes) {
     microbatch_scopes_ = scopes;
   }
+  void SetInterpreterCore(
+      const std::vector<std::shared_ptr<InterpreterCore>> cores) {
+    cores_ = cores;
+  }
   void SetGC(const std::shared_ptr<framework::GarbageCollector>& gc) {
     gc_ = gc;
   }
@@ -100,6 +107,7 @@ class Interceptor {
   framework::Scope* root_scope_{nullptr};
   framework::Scope* minibatch_scope_{nullptr};
   std::vector<framework::Scope*> microbatch_scopes_{};
+  std::vector<std::shared_ptr<InterpreterCore>> cores_{};
   std::shared_ptr<framework::GarbageCollector> gc_{nullptr};
 
   Carrier* carrier_;

paddle/fluid/distributed/fleet_executor/task_node.h

Lines changed: 1 addition & 1 deletion
@@ -128,7 +128,7 @@ class TaskNode final {
   // task_id-->type
   std::unordered_map<int64_t, DependType> id_to_dep_type_;
 
-  framework::ProgramDesc* program_;
+  framework::ProgramDesc* program_{nullptr};
   std::string cond_var_;
   std::vector<std::unique_ptr<OperatorBase>> ops_vec_;
   std::unordered_map<const OperatorBase*, std::vector<std::string>>

paddle/fluid/eager/amp_utils.h

Lines changed: 46 additions & 9 deletions
@@ -85,6 +85,39 @@ static inline paddle::experimental::DataType GetPromoteType(
   return dst_type;
 }
 
+inline paddle::experimental::DataType GetDtypeWithPlace(
+    const std::string& op_name,
+    const paddle::small_vector<std::vector<paddle::experimental::Tensor>,
+                               kSlotSmallVectorSize>& amp_tensors_vector,
+    const paddle::experimental::DataType amp_dtype) {
+  if (amp_dtype == paddle::experimental::DataType::FLOAT32) {
+    return amp_dtype;
+  }
+  bool is_right_place = false;
+  for (const auto& tensors : amp_tensors_vector) {
+    for (const auto& tensor : tensors) {
+      auto place = tensor.place();
+      is_right_place = (paddle::platform::is_gpu_place(place) ||
+                        paddle::platform::is_cuda_pinned_place(place) ||
+                        paddle::platform::is_xpu_place(place) ||
+                        paddle::platform::is_mlu_place(place) ||
+                        paddle::platform::is_npu_place(place) ||
+                        paddle::platform::is_npu_pinned_place(place) ||
+                        paddle::platform::is_custom_place(place));
+      if (is_right_place) {
+        break;
+      }
+    }
+  }
+
+  if (!is_right_place) {
+    VLOG(6) << "Change " << op_name << "'s AMP type from " << amp_dtype
+            << " to FP32";
+    return paddle::experimental::DataType::FLOAT32;
+  }
+  return amp_dtype;
+}
+
 inline paddle::experimental::DataType GetAmpDestDtype(
     const std::string& op_name,
     const paddle::small_vector<std::vector<paddle::experimental::Tensor>,
@@ -95,19 +128,21 @@ inline paddle::experimental::DataType GetAmpDestDtype(
   VLOG(6) << "AMP GetAmpDestDtype:"
           << " op(" << op_name << ") amp_dtype(" << amp_dtype << ") amp_level("
           << static_cast<int>(amp_level) << ").";
+  auto return_amp_type = paddle::experimental::DataType::FLOAT16;
+
   if (amp_dtype == "float16") {
     if (amp_level == paddle::imperative::AmpLevel::O1) {
       if (paddle::imperative::AmpOperators::Instance()
              .GetMutableAllowOps()
              ->count(op_name)) {
-        return paddle::experimental::DataType::FLOAT16;
+        return_amp_type = paddle::experimental::DataType::FLOAT16;
       } else if (paddle::imperative::AmpOperators::Instance()
                      .GetMutableBlockOps()
                      ->count(op_name) ||
                  paddle::imperative::AmpOperators::Instance()
                      .GetMutableUnsupportedFp16Ops()
                      ->count(op_name)) {
-        return paddle::experimental::DataType::FLOAT32;
+        return_amp_type = paddle::experimental::DataType::FLOAT32;
       } else {
         auto dst_type = GetPromoteType(op_name,
                                        amp_tensors_vector,
@@ -118,7 +153,7 @@ inline paddle::experimental::DataType GetAmpDestDtype(
                 ->count(op_name)) {
           dst_type = paddle::experimental::DataType::FLOAT32;
         }
-        return dst_type;
+        return_amp_type = dst_type;
       }
     } else if (amp_level == paddle::imperative::AmpLevel::O2) {
       auto dst_type = paddle::experimental::DataType::FLOAT16;
@@ -130,18 +165,18 @@ inline paddle::experimental::DataType GetAmpDestDtype(
               ->count(op_name)) {
         dst_type = paddle::experimental::DataType::FLOAT32;
       }
-      return dst_type;
+      return_amp_type = dst_type;
     }
   } else if (amp_dtype == "bfloat16") {
     if (amp_level == paddle::imperative::AmpLevel::O1) {
       if (paddle::imperative::AmpOperators::Instance()
              .GetMutableAllowOps()
              ->count(op_name)) {
-        return paddle::experimental::DataType::BFLOAT16;
+        return_amp_type = paddle::experimental::DataType::BFLOAT16;
       } else if (paddle::imperative::AmpOperators::Instance()
                      .GetMutableBlockOps()
                      ->count(op_name)) {
-        return paddle::experimental::DataType::FLOAT32;
+        return_amp_type = paddle::experimental::DataType::FLOAT32;
       } else {
         auto dst_type =
             GetPromoteType(op_name,
@@ -153,7 +188,7 @@ inline paddle::experimental::DataType GetAmpDestDtype(
                 ->count(op_name)) {
           dst_type = paddle::experimental::DataType::FLOAT32;
         }
-        return dst_type;
+        return_amp_type = dst_type;
      }
     } else if (amp_level == paddle::imperative::AmpLevel::O2) {
       auto dst_type = paddle::experimental::DataType::BFLOAT16;
@@ -165,10 +200,12 @@ inline paddle::experimental::DataType GetAmpDestDtype(
               ->count(op_name)) {
        dst_type = paddle::experimental::DataType::FLOAT32;
       }
-      return dst_type;
+      return_amp_type = dst_type;
     }
+  } else {
+    return_amp_type = paddle::experimental::DataType::FLOAT32;
   }
-  return paddle::experimental::DataType::FLOAT32;
+  return GetDtypeWithPlace(op_name, amp_tensors_vector, return_amp_type);
 }
 
 }  // namespace egr
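
GetDtypeWithPlace, added above, acts as a final place-based filter on the AMP
decision: a low-precision dtype survives only if at least one input tensor
lives on a place that supports it (GPU, CUDA-pinned, XPU, MLU, NPU,
NPU-pinned, or a custom place); otherwise the op is demoted to FP32. A
simplified model of that rule, using local stand-in types rather than Paddle's
DataType and Place:

#include <cassert>
#include <vector>

// Local stand-ins; the real code inspects paddle::platform::is_*_place().
enum class DataType { FLOAT32, FLOAT16 };
enum class Place { CPU, GPU };

DataType GetDtypeWithPlace(const std::vector<Place>& input_places,
                           DataType amp_dtype) {
  if (amp_dtype == DataType::FLOAT32) return amp_dtype;  // nothing to demote
  for (Place place : input_places) {
    if (place == Place::GPU) {  // the real check accepts several places
      return amp_dtype;         // one supported input keeps the AMP dtype
    }
  }
  return DataType::FLOAT32;  // all inputs on unsupported places: run in FP32
}

int main() {
  assert(GetDtypeWithPlace({Place::CPU}, DataType::FLOAT16) ==
         DataType::FLOAT32);  // demoted: no accelerator input
  assert(GetDtypeWithPlace({Place::CPU, Place::GPU}, DataType::FLOAT16) ==
         DataType::FLOAT16);  // kept: one GPU input
  return 0;
}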

paddle/fluid/eager/eager_amp_auto_cast.h

Lines changed: 6 additions & 1 deletion
@@ -22,14 +22,19 @@ static inline bool NeedCast(const paddle::experimental::Tensor& tensor,
                             const paddle::experimental::DataType& dst_dtype) {
   auto place = tensor.place();
   auto data_type = tensor.dtype();
+  // Except for the CPU check, the conditions here should be consistent with
+  // the checks in amp_utils.h
   if (paddle::platform::is_gpu_place(place) ||
       paddle::platform::is_cuda_pinned_place(place) ||
       paddle::platform::is_xpu_place(place) ||
       paddle::platform::is_mlu_place(place) ||
       paddle::platform::is_npu_place(place) ||
       paddle::platform::is_npu_pinned_place(place) ||
-      paddle::platform::is_custom_place(place)) {
+      paddle::platform::is_custom_place(place) ||
+      paddle::platform::is_cpu_place(place)) {
     // CudaPinnedPlace is added for varbase created by dataloader
+    // CPU place is for tensors on different places, e.g. when input1 is on
+    // CPU and input2 is on GPU
     if ((data_type == paddle::experimental::DataType::FLOAT32 ||
          data_type == paddle::experimental::DataType::FLOAT16 ||
          data_type == paddle::experimental::DataType::BFLOAT16) &&

paddle/fluid/eager/eager_tensor.h

Lines changed: 0 additions & 1 deletion
@@ -19,7 +19,6 @@
 #include "paddle/fluid/framework/variable.h"
 // Phi deps
 #include "paddle/phi/api/include/tensor.h"
-#include "paddle/phi/api/lib/utils/tensor_utils.h"
 #include "paddle/phi/core/compat/convert_utils.h"
 
 namespace egr {
