
Commit 3627147

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into eager_pickler

2 parents: d1b2a84 + 30c7758

627 files changed: +2517 -3235 lines. (Large commit: only a subset of the changed files is shown below.)

paddle/fluid/framework/CMakeLists.txt
Lines changed: 0 additions & 13 deletions

```diff
@@ -1166,19 +1166,6 @@ cc_test_old(
   string_helper
   glog)
 
-cc_library(
-  save_load_util
-  SRCS save_load_util.cc
-  DEPS tensor scope layer)
-cc_test_old(
-  save_load_util_test
-  SRCS
-  save_load_util_test.cc
-  DEPS
-  save_load_util
-  tensor
-  scope
-  layer)
 cc_library(
   generator
   SRCS generator.cc
```

paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc
File mode changed: 100755 → 100644
Lines changed: 39 additions & 18 deletions

```diff
@@ -336,27 +336,46 @@ void ComputePropagateScalesMkldnnPass::ComputeWeightScales(
   ComputeLstmWeightScales(graph, scope, "WeightX", "WeightH", var_quant_scales);
 }
 
-void ComputePropagateScalesMkldnnPass::UpdateScaleOpInScale(
+void ComputePropagateScalesMkldnnPass::UpdateScaleOpInOutScales(
     Node* op_node,
     const std::string& input_name,
     const std::string& output_name,
    StringPairMap* var_quant_scales) const {
-  auto iter = var_quant_scales->find(output_name);
-  if (iter != var_quant_scales->end()) {
-    auto pair = iter->second;
-    const auto tensor = pair.second;
-
-    const auto scale = PADDLE_GET_CONST(float, op_node->Op()->GetAttr("scale"));
-    phi::DenseTensor tmp_tensor;
-    tmp_tensor.Resize(tensor.dims());
-    auto* data = tmp_tensor.mutable_data<float>(platform::CPUPlace());
-    for (int i = 0; i < tensor.numel(); i++) {
-      data[i] = data[i] * scale;
-    }
+  auto out_iter = var_quant_scales->find(output_name);
+  auto input_iter = var_quant_scales->find(input_name);
+  // All the input and output have scales
+  if (out_iter != var_quant_scales->end() &&
+      input_iter != var_quant_scales->end()) {
+    return;
+  }
 
-    auto new_pair = std::make_pair(pair.first, tmp_tensor);
-    var_quant_scales->insert(std::make_pair(input_name, new_pair));
+  const auto scale = PADDLE_GET_CONST(float, op_node->Op()->GetAttr("scale"));
+  if (std::abs(scale) < 1e-6 && out_iter != var_quant_scales->end()) {
+    return;
   }
+
+  std::string name = input_name;
+  auto iter = out_iter;
+  if (input_iter != var_quant_scales->end()) {
+    iter = input_iter;
+    name = output_name;
+  }
+
+  phi::DenseTensor tmp_tensor;
+  auto pair = iter->second;
+  const auto tensor = pair.second;
+  tmp_tensor.Resize(tensor.dims());
+  auto* data = tmp_tensor.mutable_data<float>(platform::CPUPlace());
+  auto* src_data = tensor.data<float>();
+  for (int i = 0; i < tensor.numel(); i++) {
+    if (out_iter != var_quant_scales->end()) {
+      data[i] = src_data[i] / scale;
+    } else {
+      data[i] = src_data[i] * scale;
+    }
+  }
+  auto new_pair = std::make_pair(pair.first, tmp_tensor);
+  var_quant_scales->insert(std::make_pair(name, new_pair));
 }
 
 std::unordered_set<std::string> ComputePropagateScalesMkldnnPass::UpdateScales(
@@ -403,10 +422,12 @@ std::unordered_set<std::string> ComputePropagateScalesMkldnnPass::UpdateScales(
       }
     } else if (op_name == "scale") {
       const std::string output_name = op_node->Op()->Output("Out")[0];
+      const std::string input_name = op_node->Op()->Input("X")[0];
       auto out_iter = var_quant_scales->find(output_name);
-      if (out_iter != var_quant_scales->end()) {
-        const std::string input_name = op_node->Op()->Input("X")[0];
-        UpdateScaleOpInScale(
+      auto input_iter = var_quant_scales->find(input_name);
+      if (out_iter != var_quant_scales->end() ||
+          input_iter != var_quant_scales->end()) {
+        UpdateScaleOpInOutScales(
             op_node, input_name, output_name, var_quant_scales);
       }
     }
```
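
For readers skimming the hunk above: the renamed UpdateScaleOpInOutScales now propagates quantization scales through a scale op in either direction. Below is a minimal standalone sketch of just that arithmetic, using plain std::vector in place of phi::DenseTensor; the helper name and tensor type are illustrative assumptions, not Paddle API.

```cpp
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

// Stand-in for the per-variable scale tensors kept in var_quant_scales.
using Scales = std::vector<float>;

// out = in * scale, hence in = out / scale. `have_out` selects which side is
// already known, mirroring the out_iter/input_iter branch in the pass.
Scales PropagateThroughScaleOp(const Scales& known, float scale, bool have_out) {
  Scales result(known.size());
  for (std::size_t i = 0; i < known.size(); ++i) {
    result[i] = have_out ? known[i] / scale   // recover the input scale
                         : known[i] * scale;  // derive the output scale
  }
  return result;
}

int main() {
  const Scales out_scales = {0.5f, 1.0f};
  const float scale = 2.0f;
  if (std::abs(scale) >= 1e-6f) {  // same near-zero guard as the pass
    const Scales in = PropagateThroughScaleOp(out_scales, scale, true);
    std::printf("in[0]=%.2f in[1]=%.2f\n", in[0], in[1]);  // 0.25 0.50
  }
  return 0;
}
```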

paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h
Lines changed: 4 additions & 4 deletions

```diff
@@ -79,10 +79,10 @@ class ComputePropagateScalesMkldnnPass : public FusePassBase {
   void UpdateReluOutputScales(ir::Graph* graph,
                               StringPairMap* var_quant_scales) const;
 
-  void UpdateScaleOpInScale(Node* op_node,
-                            const std::string& input_name,
-                            const std::string& output_name,
-                            StringPairMap* var_quant_scales) const;
+  void UpdateScaleOpInOutScales(Node* op_node,
+                                const std::string& input_name,
+                                const std::string& output_name,
+                                StringPairMap* var_quant_scales) const;
 
   std::unordered_set<std::string> UpdateScales(
       ir::Graph* graph,
```

paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc
Lines changed: 1 addition & 1 deletion

```diff
@@ -23,7 +23,7 @@
 #include "paddle/phi/core/kernel_registry.h"
 
 USE_OP_ITSELF(softmax);
-USE_OP_DEVICE_KERNEL(softmax, MKLDNN);
+PD_DECLARE_KERNEL(softmax, OneDNN, ONEDNN);
 USE_OP_ITSELF(elementwise_add);
 USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN);
 USE_OP_ITSELF(leaky_relu);
```

paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc
Lines changed: 0 additions & 1 deletion

```diff
@@ -84,7 +84,6 @@ bool DependencyBuilder::OpHappensBefore(int prior_op_idx,
 }
 
 void DependencyBuilder::AddDependencyForCoalesceTensorOp() {
-  const std::string kCoalesceTensor = "coalesce_tensor";
   for (size_t op_idx = 0; op_idx < op_num_; ++op_idx) {
     if (instructions_->at(op_idx).OpBase()->Type() == kCoalesceTensor) {
       VLOG(4) << "Add depend for " << kCoalesceTensor << " " << op_idx;
```

paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc
Lines changed: 28 additions & 16 deletions

```diff
@@ -300,6 +300,30 @@ void BuildVariableScope(const framework::BlockDesc& block,
   }
 }
 
+OpFuncType AnalyseOpFuncType(const OpFuncNode& op_func_node,
+                             const platform::Place& place) {
+  if (platform::is_cpu_place(place)) {
+    return OpFuncType::kQueueSync;
+  }
+
+  PADDLE_ENFORCE_EQ(IsSupportedHeterPlace(place),
+                    true,
+                    phi::errors::Fatal("Unsupported current place %s", place));
+
+  // Some GPU OPs do not launch CUDA Kernel, but spend a lot of time on CPU
+  // computing. They execute serially in device thread and block CUDA kernel
+  // launching in other GPU OPs. To improve performance, set them as kQueueSync
+  // and so that they would be dispatched to host thread.
+  std::shared_ptr<OperatorBase> op = op_func_node.operator_base_;
+  if (op->Type() == kCoalesceTensor &&
+      op->Attr<bool>("set_constant") == false &&
+      op->Attr<bool>("copy_data") == false) {
+    return OpFuncType::kQueueSync;
+  }
+
+  return OpFuncType::kQueueAsync;
+}
+
 void CreateAllOps(const framework::BlockDesc& block,
                   std::vector<std::unique_ptr<OperatorBase>>* ops) {
   for (auto& op : block.AllOps()) {
@@ -448,14 +472,7 @@ void HandleOperatorBase(const platform::Place& place,
   auto* dev_ctx = pool.Get(place);
   // input, output is prepared. set the other attributes.
   op_func_node->operator_base_ = op_base;
-  if (IsSupportedHeterPlace(place)) {
-    op_func_node->type_ = OpFuncType::kQueueAsync;
-  } else if (platform::is_cpu_place(place)) {
-    op_func_node->type_ = OpFuncType::kQueueSync;
-  } else {
-    PADDLE_THROW(
-        platform::errors::Fatal("Unsupported current place %s", place));
-  }
+  op_func_node->type_ = AnalyseOpFuncType(*op_func_node, place);
   op_func_node->kernel_func_ = nullptr;
   op_base->Run(*local_scope, place);  // Run without data transformer.
   std::unordered_set<int> no_data_transform_index;
@@ -663,14 +680,9 @@ void BuildOpFuncList(const platform::Place& place,
       dev_ctx = pool.Get(kernel_type.place_);
     }
     op_func_node.dev_ctx_ = dev_ctx;
-    if (IsSupportedHeterPlace(kernel_type.place_)) {
-      op_func_node.type_ = OpFuncType::kQueueAsync;
-    } else if (platform::is_cpu_place(kernel_type.place_)) {
-      op_func_node.type_ = OpFuncType::kQueueSync;
-    } else {
-      PADDLE_THROW(platform::errors::Fatal("Unsupported current place %s",
-                                           kernel_type.place_));
-    }
+    op_func_node.type_ =
+        AnalyseOpFuncType(op_func_node, kernel_type.place_);
+
     VLOG(3) << op_with_kernel->Type()
             << " : finally selected kernel_key: " << kernel_type;
```
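
Since the same queue-routing decision now lives in one helper, here is a compilable toy version of just the policy. OpInfo and the boolean place flag are stand-ins, not Paddle types; the real function additionally enforces that a non-CPU place is a supported heterogeneous place.

```cpp
#include <cstdio>
#include <string>

enum class OpFuncType { kQueueSync, kQueueAsync };

// Stand-in for the attributes the helper inspects.
struct OpInfo {
  std::string type;
  bool set_constant = false;
  bool copy_data = false;
};

// CPU ops always go to the host (sync) queue. On a device, a coalesce_tensor
// that neither fills constants nor copies data does all its work on the CPU,
// so it is routed to the sync queue too; everything else stays async.
OpFuncType AnalyseOpFuncType(const OpInfo& op, bool is_cpu_place) {
  if (is_cpu_place) return OpFuncType::kQueueSync;
  if (op.type == "coalesce_tensor" && !op.set_constant && !op.copy_data) {
    return OpFuncType::kQueueSync;
  }
  return OpFuncType::kQueueAsync;
}

int main() {
  const OpInfo coalesce{"coalesce_tensor"};
  const OpInfo matmul{"matmul"};
  std::printf("coalesce_tensor on GPU: %s\n",
              AnalyseOpFuncType(coalesce, false) == OpFuncType::kQueueSync
                  ? "sync"
                  : "async");  // sync: it launches no device kernel
  std::printf("matmul on GPU: %s\n",
              AnalyseOpFuncType(matmul, false) == OpFuncType::kQueueSync
                  ? "sync"
                  : "async");  // async
  return 0;
}
```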

paddle/fluid/framework/new_executor/interpretercore.cc
Lines changed: 9 additions & 8 deletions

```diff
@@ -420,7 +420,7 @@ void InterpreterCore::BuildInplace() {
   std::set<std::string> skip_inplace_outvars;
   for (Instruction& instr : vec_instruction_) {
     OperatorBase* op = instr.OpBase();
-    if (op->Type() == "coalesce_tensor") {
+    if (op->Type() == kCoalesceTensor) {
       const std::vector<std::string>& outputs =
           op->OutputVars(/*has_intermediate=*/false);
       skip_inplace_outvars.insert(outputs.begin(), outputs.end());
@@ -897,8 +897,9 @@ void InterpreterCore::RunNextInstructions(
   int64_t first_op = -1;
   for (auto next_id : direct_run_ops) {
     if (IsReady(next_id)) {
-      // only keep one op running in current thread
-      if (first_op == -1) {
+      // only keep one sync op running in current thread
+      if (first_op == -1 &&
+          vec_instruction_[next_id].KernelType() == OpFuncType::kQueueSync) {
         first_op = next_id;
         continue;
       }
@@ -935,11 +936,11 @@ void InterpreterCore::RunInstructionAsync(size_t instr_id) {
   try {
     interpreter::WaitEvent(instr_node, place_);
 
-    RunInstruction(instr_node);
-
-    CheckGC(instr_node);
-
-    interpreter::LogDeviceMemoryStats(place_);
+    if (!instr_node.IsArtificial()) {
+      RunInstruction(instr_node);
+      CheckGC(instr_node);
+      interpreter::LogDeviceMemoryStats(place_);
+    }
 
     interpreter::RecordEvent(instr_node, place_);
   } catch (platform::EnforceNotMet& ex) {
```
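
The RunNextInstructions change is subtle: previously the first ready op was kept on the current thread; now only a sync op is. A small self-contained sketch of the loop's shape, where Instr and the queue are hypothetical stand-ins:

```cpp
#include <cstdio>
#include <vector>

// Hypothetical instruction record; `sync` corresponds to kQueueSync.
struct Instr {
  bool sync;
};

int main() {
  const std::vector<Instr> instrs = {{false}, {true}, {true}};
  const std::vector<int> ready = {0, 1, 2};

  // As in RunNextInstructions after this patch: keep at most one *sync* op to
  // run inline on the current thread; everything else goes to worker queues.
  int first_op = -1;
  for (int id : ready) {
    if (first_op == -1 && instrs[id].sync) {
      first_op = id;
      continue;
    }
    std::printf("enqueue instruction %d\n", id);  // dispatched to a queue
  }
  if (first_op != -1) {
    std::printf("run instruction %d inline\n", first_op);  // instruction 1
  }
  return 0;
}
```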

paddle/fluid/framework/new_executor/new_executor_defs.cc
Lines changed: 2 additions & 1 deletion

```diff
@@ -675,7 +675,8 @@ Instruction::Instruction(size_t id,
                          OpFuncNode&& op_func_node,
                          const platform::DeviceContext& dev_ctx,
                          const Priority priority)
-    : id_(id),
+    : is_artificial_(op_func_node.operator_base_->Type() == "depend"),
+      id_(id),
       op_func_node_(op_func_node),
       dev_ctx_(dev_ctx),
       priority_(priority) {
```

paddle/fluid/framework/new_executor/new_executor_defs.h
Lines changed: 8 additions & 1 deletion

```diff
@@ -32,14 +32,16 @@ namespace framework {
 
 using OpKernelComputeFunc = std::function<void(const ExecutionContext&)>;
 
-constexpr int kEmptyVarIndex = 0;
+constexpr const char* kCoalesceTensor = "coalesce_tensor";
 
 // stream types
 constexpr const char* kCustomStream = "CustromStream";
 constexpr const char* kDefaultStream = "DefaultStream";
 constexpr const char* kD2HStream = "D2HStream";
 constexpr const char* kH2DStream = "H2DStream";
 
+constexpr int kEmptyVarIndex = 0;
+
 enum class Priority { kLowest, kNormal };
 
 class InterpretercoreInferShapeContext : public InferShapeContext {
@@ -305,6 +307,8 @@ class Instruction {
                      const platform::DeviceContext& dev_ctx,
                      const Priority priority);
 
+  bool IsArtificial() const { return is_artificial_; }
+
   size_t Id() const;
 
   const std::map<std::string, std::vector<int>>& Inputs() const;
@@ -368,6 +372,9 @@ class Instruction {
   Priority GetPriority() const { return priority_; }
 
  private:
+  bool is_artificial_;  // Instruction is artificial means that it is only used
+                        // to assist scheduling and no need to be executed.
+
   size_t id_;
   OpFuncNode op_func_node_;
   const platform::DeviceContext& dev_ctx_;  // not owned
```
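
Taken together with the interpretercore.cc hunk above, the new flag works roughly like this end-to-end sketch. The class is a stand-in; only the `Type() == "depend"` tagging and the skip-on-execute check mirror the patch.

```cpp
#include <cstdio>
#include <string>
#include <utility>

// Stand-in Instruction: a "depend" op exists only to order other ops, so it
// is tagged artificial at construction time, as in the patch.
class Instruction {
 public:
  explicit Instruction(std::string op_type)
      : is_artificial_(op_type == "depend"), op_type_(std::move(op_type)) {}

  bool IsArtificial() const { return is_artificial_; }
  const std::string& Type() const { return op_type_; }

 private:
  bool is_artificial_;  // assists scheduling only; never actually executed
  std::string op_type_;
};

int main() {
  const Instruction dep("depend");
  const Instruction mm("matmul");
  const Instruction* all[] = {&dep, &mm};
  for (const Instruction* in : all) {
    // Mirrors RunInstructionAsync: events would still be waited on and
    // recorded, but run/GC/logging is skipped for artificial instructions.
    if (!in->IsArtificial()) {
      std::printf("execute %s\n", in->Type().c_str());  // only matmul runs
    }
  }
  return 0;
}
```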

paddle/fluid/framework/new_executor/stream_analyzer.cc
Lines changed: 6 additions & 2 deletions

```diff
@@ -239,11 +239,15 @@ platform::DeviceContext* StreamAnalyzer::ParseDeviceContext(
  */
 bool StreamAnalyzer::IsDirectRun(Instruction& cur_instr,
                                  const Instruction& next_instr) {
-  if (&cur_instr.DeviceContext() == &next_instr.DeviceContext()) return true;
+  if (cur_instr.KernelType() == next_instr.KernelType() &&
+      (&cur_instr.DeviceContext() == &next_instr.DeviceContext())) {
+    return true;
+  }
 
   // xpu&ipu memcpy kerenl is synchronous.
-  if (platform::is_ipu_place(place_) || platform::is_xpu_place(place_))
+  if (platform::is_ipu_place(place_) || platform::is_xpu_place(place_)) {
     return true;
+  }
 
   // npu d2h kernel is asynchronous.
   if (platform::is_npu_place(place_) || platform::is_custom_place(place_)) {
```
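
A toy illustration of the tightened fast path: sharing a device context is no longer sufficient on its own, the two instructions must also have the same queue kind. The types below are stand-ins for the real Instruction and DeviceContext.

```cpp
#include <cstdio>

enum class OpFuncType { kQueueSync, kQueueAsync };

// Stand-in carrying only the fields the check consults.
struct Instr {
  OpFuncType type;
  const void* dev_ctx;  // device-context identity, compared by address
};

// Stricter fast path: same queue kind *and* same device context.
bool IsDirectRun(const Instr& cur, const Instr& next) {
  return cur.type == next.type && cur.dev_ctx == next.dev_ctx;
}

int main() {
  int ctx = 0;  // dummy object standing in for a DeviceContext
  const Instr a{OpFuncType::kQueueAsync, &ctx};
  const Instr b{OpFuncType::kQueueSync, &ctx};  // same context, other kind
  std::printf("%d\n", IsDirectRun(a, b));  // 0: falls through to event sync
  return 0;
}
```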

paddle/fluid/framework/operator.cc
Lines changed: 2 additions & 8 deletions

```diff
@@ -1414,16 +1414,10 @@ bool OperatorWithKernel::SupportsKernelType(
 
 bool OperatorWithKernel::CanMKLDNNBeUsed(const framework::ExecutionContext& ctx,
                                          proto::VarType::Type data_type) const {
-  // NOTE(jiahongyu): Only mkldnn kernels need to check "use_mkldnn" attribute,
-  // hence we first call function SupportsMKLDNN. If we check "use_mkldnn"
-  // attribute first, it will cause error because some codes add "use_mkldnn"
-  // attribute to non-mkldnn ops.
-  if (!this->SupportsMKLDNN(data_type)) {
-    return false;
-  }
   const std::string use_mkldnn_attr = "use_mkldnn";
   return ctx.HasAttr(use_mkldnn_attr) && ctx.Attr<bool>(use_mkldnn_attr) &&
-         platform::is_cpu_place(ctx.GetPlace());
+         platform::is_cpu_place(ctx.GetPlace()) &&
+         this->SupportsMKLDNN(data_type);
 }
 
 void OperatorWithKernel::InferShape(InferShapeContext* ctx) const {
```
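
The rewrite folds everything into one short-circuiting conjunction, so SupportsMKLDNN is only evaluated once the attribute and place checks have passed. A standalone sketch of that evaluation order, where every name is a logging stand-in rather than the real operator API:

```cpp
#include <cstdio>

// Stand-ins that log when they are evaluated.
bool HasAttr(bool present) { std::puts("HasAttr"); return present; }
bool AttrValue() { std::puts("Attr<bool>"); return true; }
bool IsCpuPlace() { std::puts("is_cpu_place"); return true; }
bool SupportsMKLDNN() { std::puts("SupportsMKLDNN"); return true; }

// Same shape as the rewritten check: && stops at the first false operand,
// so SupportsMKLDNN is reached only when all earlier tests already passed.
bool CanMKLDNNBeUsed(bool has_attr) {
  return HasAttr(has_attr) && AttrValue() && IsCpuPlace() && SupportsMKLDNN();
}

int main() {
  CanMKLDNNBeUsed(false);  // prints only "HasAttr"
  return 0;
}
```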
