
Commit 3627147

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into eager_pickler

2 parents: d1b2a84 + 30c7758

627 files changed: +2517 -3235 lines. (Large commit: only a subset of the changed files is shown below.)

paddle/fluid/framework/CMakeLists.txt
Lines changed: 0 additions & 13 deletions

```diff
@@ -1166,19 +1166,6 @@ cc_test_old(
   string_helper
   glog)
 
-cc_library(
-  save_load_util
-  SRCS save_load_util.cc
-  DEPS tensor scope layer)
-cc_test_old(
-  save_load_util_test
-  SRCS
-  save_load_util_test.cc
-  DEPS
-  save_load_util
-  tensor
-  scope
-  layer)
 cc_library(
   generator
   SRCS generator.cc
```

paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc
File mode changed: 100755 → 100644
Lines changed: 39 additions & 18 deletions

```diff
@@ -336,27 +336,46 @@ void ComputePropagateScalesMkldnnPass::ComputeWeightScales(
   ComputeLstmWeightScales(graph, scope, "WeightX", "WeightH", var_quant_scales);
 }
 
-void ComputePropagateScalesMkldnnPass::UpdateScaleOpInScale(
+void ComputePropagateScalesMkldnnPass::UpdateScaleOpInOutScales(
     Node* op_node,
     const std::string& input_name,
     const std::string& output_name,
    StringPairMap* var_quant_scales) const {
-  auto iter = var_quant_scales->find(output_name);
-  if (iter != var_quant_scales->end()) {
-    auto pair = iter->second;
-    const auto tensor = pair.second;
-
-    const auto scale = PADDLE_GET_CONST(float, op_node->Op()->GetAttr("scale"));
-    phi::DenseTensor tmp_tensor;
-    tmp_tensor.Resize(tensor.dims());
-    auto* data = tmp_tensor.mutable_data<float>(platform::CPUPlace());
-    for (int i = 0; i < tensor.numel(); i++) {
-      data[i] = data[i] * scale;
-    }
+  auto out_iter = var_quant_scales->find(output_name);
+  auto input_iter = var_quant_scales->find(input_name);
+  // All the input and output have scales
+  if (out_iter != var_quant_scales->end() &&
+      input_iter != var_quant_scales->end()) {
+    return;
+  }
 
-    auto new_pair = std::make_pair(pair.first, tmp_tensor);
-    var_quant_scales->insert(std::make_pair(input_name, new_pair));
+  const auto scale = PADDLE_GET_CONST(float, op_node->Op()->GetAttr("scale"));
+  if (std::abs(scale) < 1e-6 && out_iter != var_quant_scales->end()) {
+    return;
   }
+
+  std::string name = input_name;
+  auto iter = out_iter;
+  if (input_iter != var_quant_scales->end()) {
+    iter = input_iter;
+    name = output_name;
+  }
+
+  phi::DenseTensor tmp_tensor;
+  auto pair = iter->second;
+  const auto tensor = pair.second;
+  tmp_tensor.Resize(tensor.dims());
+  auto* data = tmp_tensor.mutable_data<float>(platform::CPUPlace());
+  auto* src_data = tensor.data<float>();
+  for (int i = 0; i < tensor.numel(); i++) {
+    if (out_iter != var_quant_scales->end()) {
+      data[i] = src_data[i] / scale;
+    } else {
+      data[i] = src_data[i] * scale;
+    }
+  }
+  auto new_pair = std::make_pair(pair.first, tmp_tensor);
+  var_quant_scales->insert(std::make_pair(name, new_pair));
 }
 
 std::unordered_set<std::string> ComputePropagateScalesMkldnnPass::UpdateScales(
@@ -403,10 +422,12 @@ std::unordered_set<std::string> ComputePropagateScalesMkldnnPass::UpdateScales(
       }
     } else if (op_name == "scale") {
       const std::string output_name = op_node->Op()->Output("Out")[0];
+      const std::string input_name = op_node->Op()->Input("X")[0];
       auto out_iter = var_quant_scales->find(output_name);
-      if (out_iter != var_quant_scales->end()) {
-        const std::string input_name = op_node->Op()->Input("X")[0];
-        UpdateScaleOpInScale(
+      auto input_iter = var_quant_scales->find(input_name);
+      if (out_iter != var_quant_scales->end() ||
+          input_iter != var_quant_scales->end()) {
+        UpdateScaleOpInOutScales(
             op_node, input_name, output_name, var_quant_scales);
       }
     }
```
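
For readers skimming the hunk above: the renamed UpdateScaleOpInOutScales now propagates quantization scales through a scale op in either direction. Below is a minimal standalone sketch of just that arithmetic, using plain std::vector in place of phi::DenseTensor; the helper name and tensor type are illustrative assumptions, not Paddle API.

```cpp
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

// Stand-in for the per-variable scale tensors kept in var_quant_scales.
using Scales = std::vector<float>;

// out = in * scale, hence in = out / scale. `have_out` selects which side is
// already known, mirroring the out_iter/input_iter branch in the pass.
Scales PropagateThroughScaleOp(const Scales& known, float scale, bool have_out) {
  Scales result(known.size());
  for (std::size_t i = 0; i < known.size(); ++i) {
    result[i] = have_out ? known[i] / scale   // recover the input scale
                         : known[i] * scale;  // derive the output scale
  }
  return result;
}

int main() {
  const Scales out_scales = {0.5f, 1.0f};
  const float scale = 2.0f;
  if (std::abs(scale) >= 1e-6f) {  // same near-zero guard as the pass
    const Scales in = PropagateThroughScaleOp(out_scales, scale, true);
    std::printf("in[0]=%.2f in[1]=%.2f\n", in[0], in[1]);  // 0.25 0.50
  }
  return 0;
}
```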

paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h
Lines changed: 4 additions & 4 deletions

```diff
@@ -79,10 +79,10 @@ class ComputePropagateScalesMkldnnPass : public FusePassBase {
   void UpdateReluOutputScales(ir::Graph* graph,
                               StringPairMap* var_quant_scales) const;
 
-  void UpdateScaleOpInScale(Node* op_node,
-                            const std::string& input_name,
-                            const std::string& output_name,
-                            StringPairMap* var_quant_scales) const;
+  void UpdateScaleOpInOutScales(Node* op_node,
+                                const std::string& input_name,
+                                const std::string& output_name,
+                                StringPairMap* var_quant_scales) const;
 
   std::unordered_set<std::string> UpdateScales(
       ir::Graph* graph,
```

paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc
Lines changed: 1 addition & 1 deletion

```diff
@@ -23,7 +23,7 @@
 #include "paddle/phi/core/kernel_registry.h"
 
 USE_OP_ITSELF(softmax);
-USE_OP_DEVICE_KERNEL(softmax, MKLDNN);
+PD_DECLARE_KERNEL(softmax, OneDNN, ONEDNN);
 USE_OP_ITSELF(elementwise_add);
 USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN);
 USE_OP_ITSELF(leaky_relu);
```

paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc
Lines changed: 0 additions & 1 deletion

```diff
@@ -84,7 +84,6 @@ bool DependencyBuilder::OpHappensBefore(int prior_op_idx,
 }
 
 void DependencyBuilder::AddDependencyForCoalesceTensorOp() {
-  const std::string kCoalesceTensor = "coalesce_tensor";
   for (size_t op_idx = 0; op_idx < op_num_; ++op_idx) {
     if (instructions_->at(op_idx).OpBase()->Type() == kCoalesceTensor) {
       VLOG(4) << "Add depend for " << kCoalesceTensor << " " << op_idx;
```

paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc
Lines changed: 28 additions & 16 deletions

```diff
@@ -300,6 +300,30 @@ void BuildVariableScope(const framework::BlockDesc& block,
   }
 }
 
+OpFuncType AnalyseOpFuncType(const OpFuncNode& op_func_node,
+                             const platform::Place& place) {
+  if (platform::is_cpu_place(place)) {
+    return OpFuncType::kQueueSync;
+  }
+
+  PADDLE_ENFORCE_EQ(IsSupportedHeterPlace(place),
+                    true,
+                    phi::errors::Fatal("Unsupported current place %s", place));
+
+  // Some GPU OPs do not launch CUDA Kernel, but spend a lot of time on CPU
+  // computing. They execute serially in device thread and block CUDA kernel
+  // launching in other GPU OPs. To improve performance, set them as kQueueSync
+  // and so that they would be dispatched to host thread.
+  std::shared_ptr<OperatorBase> op = op_func_node.operator_base_;
+  if (op->Type() == kCoalesceTensor &&
+      op->Attr<bool>("set_constant") == false &&
+      op->Attr<bool>("copy_data") == false) {
+    return OpFuncType::kQueueSync;
+  }
+
+  return OpFuncType::kQueueAsync;
+}
+
 void CreateAllOps(const framework::BlockDesc& block,
                   std::vector<std::unique_ptr<OperatorBase>>* ops) {
   for (auto& op : block.AllOps()) {
@@ -448,14 +472,7 @@ void HandleOperatorBase(const platform::Place& place,
   auto* dev_ctx = pool.Get(place);
   // input, output is prepared. set the other attributes.
   op_func_node->operator_base_ = op_base;
-  if (IsSupportedHeterPlace(place)) {
-    op_func_node->type_ = OpFuncType::kQueueAsync;
-  } else if (platform::is_cpu_place(place)) {
-    op_func_node->type_ = OpFuncType::kQueueSync;
-  } else {
-    PADDLE_THROW(
-        platform::errors::Fatal("Unsupported current place %s", place));
-  }
+  op_func_node->type_ = AnalyseOpFuncType(*op_func_node, place);
   op_func_node->kernel_func_ = nullptr;
   op_base->Run(*local_scope, place);  // Run without data transformer.
   std::unordered_set<int> no_data_transform_index;
@@ -663,14 +680,9 @@ void BuildOpFuncList(const platform::Place& place,
       dev_ctx = pool.Get(kernel_type.place_);
     }
     op_func_node.dev_ctx_ = dev_ctx;
-    if (IsSupportedHeterPlace(kernel_type.place_)) {
-      op_func_node.type_ = OpFuncType::kQueueAsync;
-    } else if (platform::is_cpu_place(kernel_type.place_)) {
-      op_func_node.type_ = OpFuncType::kQueueSync;
-    } else {
-      PADDLE_THROW(platform::errors::Fatal("Unsupported current place %s",
-                                           kernel_type.place_));
-    }
+    op_func_node.type_ =
+        AnalyseOpFuncType(op_func_node, kernel_type.place_);
+
     VLOG(3) << op_with_kernel->Type()
             << " : finally selected kernel_key: " << kernel_type;
```
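
Since the same queue-routing decision now lives in one helper, here is a compilable toy version of just the policy. OpInfo and the boolean place flag are stand-ins, not Paddle types; the real function additionally enforces that a non-CPU place is a supported heterogeneous place.

```cpp
#include <cstdio>
#include <string>

enum class OpFuncType { kQueueSync, kQueueAsync };

// Stand-in for the attributes the helper inspects.
struct OpInfo {
  std::string type;
  bool set_constant = false;
  bool copy_data = false;
};

// CPU ops always go to the host (sync) queue. On a device, a coalesce_tensor
// that neither fills constants nor copies data does all its work on the CPU,
// so it is routed to the sync queue too; everything else stays async.
OpFuncType AnalyseOpFuncType(const OpInfo& op, bool is_cpu_place) {
  if (is_cpu_place) return OpFuncType::kQueueSync;
  if (op.type == "coalesce_tensor" && !op.set_constant && !op.copy_data) {
    return OpFuncType::kQueueSync;
  }
  return OpFuncType::kQueueAsync;
}

int main() {
  const OpInfo coalesce{"coalesce_tensor"};
  const OpInfo matmul{"matmul"};
  std::printf("coalesce_tensor on GPU: %s\n",
              AnalyseOpFuncType(coalesce, false) == OpFuncType::kQueueSync
                  ? "sync"
                  : "async");  // sync: it launches no device kernel
  std::printf("matmul on GPU: %s\n",
              AnalyseOpFuncType(matmul, false) == OpFuncType::kQueueSync
                  ? "sync"
                  : "async");  // async
  return 0;
}
```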

paddle/fluid/framework/new_executor/interpretercore.cc
Lines changed: 9 additions & 8 deletions

```diff
@@ -420,7 +420,7 @@ void InterpreterCore::BuildInplace() {
   std::set<std::string> skip_inplace_outvars;
   for (Instruction& instr : vec_instruction_) {
     OperatorBase* op = instr.OpBase();
-    if (op->Type() == "coalesce_tensor") {
+    if (op->Type() == kCoalesceTensor) {
       const std::vector<std::string>& outputs =
           op->OutputVars(/*has_intermediate=*/false);
       skip_inplace_outvars.insert(outputs.begin(), outputs.end());
@@ -897,8 +897,9 @@ void InterpreterCore::RunNextInstructions(
   int64_t first_op = -1;
   for (auto next_id : direct_run_ops) {
     if (IsReady(next_id)) {
-      // only keep one op running in current thread
-      if (first_op == -1) {
+      // only keep one sync op running in current thread
+      if (first_op == -1 &&
+          vec_instruction_[next_id].KernelType() == OpFuncType::kQueueSync) {
         first_op = next_id;
         continue;
       }
@@ -935,11 +936,11 @@ void InterpreterCore::RunInstructionAsync(size_t instr_id) {
   try {
     interpreter::WaitEvent(instr_node, place_);
 
-    RunInstruction(instr_node);
-
-    CheckGC(instr_node);
-
-    interpreter::LogDeviceMemoryStats(place_);
+    if (!instr_node.IsArtificial()) {
+      RunInstruction(instr_node);
+      CheckGC(instr_node);
+      interpreter::LogDeviceMemoryStats(place_);
+    }
 
     interpreter::RecordEvent(instr_node, place_);
   } catch (platform::EnforceNotMet& ex) {
```
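
The RunNextInstructions change is subtle: previously the first ready op was kept on the current thread; now only a sync op is. A small self-contained sketch of the loop's shape, where Instr and the queue are hypothetical stand-ins:

```cpp
#include <cstdio>
#include <vector>

// Hypothetical instruction record; `sync` corresponds to kQueueSync.
struct Instr {
  bool sync;
};

int main() {
  const std::vector<Instr> instrs = {{false}, {true}, {true}};
  const std::vector<int> ready = {0, 1, 2};

  // As in RunNextInstructions after this patch: keep at most one *sync* op to
  // run inline on the current thread; everything else goes to worker queues.
  int first_op = -1;
  for (int id : ready) {
    if (first_op == -1 && instrs[id].sync) {
      first_op = id;
      continue;
    }
    std::printf("enqueue instruction %d\n", id);  // dispatched to a queue
  }
  if (first_op != -1) {
    std::printf("run instruction %d inline\n", first_op);  // instruction 1
  }
  return 0;
}
```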

paddle/fluid/framework/new_executor/new_executor_defs.cc
Lines changed: 2 additions & 1 deletion

```diff
@@ -675,7 +675,8 @@ Instruction::Instruction(size_t id,
                          OpFuncNode&& op_func_node,
                          const platform::DeviceContext& dev_ctx,
                          const Priority priority)
-    : id_(id),
+    : is_artificial_(op_func_node.operator_base_->Type() == "depend"),
+      id_(id),
       op_func_node_(op_func_node),
       dev_ctx_(dev_ctx),
       priority_(priority) {
```

paddle/fluid/framework/new_executor/new_executor_defs.h
Lines changed: 8 additions & 1 deletion

```diff
@@ -32,14 +32,16 @@ namespace framework {
 
 using OpKernelComputeFunc = std::function<void(const ExecutionContext&)>;
 
-constexpr int kEmptyVarIndex = 0;
+constexpr const char* kCoalesceTensor = "coalesce_tensor";
 
 // stream types
 constexpr const char* kCustomStream = "CustromStream";
 constexpr const char* kDefaultStream = "DefaultStream";
 constexpr const char* kD2HStream = "D2HStream";
 constexpr const char* kH2DStream = "H2DStream";
 
+constexpr int kEmptyVarIndex = 0;
+
 enum class Priority { kLowest, kNormal };
 
 class InterpretercoreInferShapeContext : public InferShapeContext {
@@ -305,6 +307,8 @@ class Instruction {
                      const platform::DeviceContext& dev_ctx,
                      const Priority priority);
 
+  bool IsArtificial() const { return is_artificial_; }
+
   size_t Id() const;
 
   const std::map<std::string, std::vector<int>>& Inputs() const;
@@ -368,6 +372,9 @@ class Instruction {
   Priority GetPriority() const { return priority_; }
 
  private:
+  bool is_artificial_;  // Instruction is artificial means that it is only used
+                        // to assist scheduling and no need to be executed.
+
   size_t id_;
   OpFuncNode op_func_node_;
   const platform::DeviceContext& dev_ctx_;  // not owned
```
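
Taken together with the interpretercore.cc hunk above, the new flag works roughly like this end-to-end sketch. The class is a stand-in; only the `Type() == "depend"` tagging and the skip-on-execute check mirror the patch.

```cpp
#include <cstdio>
#include <string>
#include <utility>

// Stand-in Instruction: a "depend" op exists only to order other ops, so it
// is tagged artificial at construction time, as in the patch.
class Instruction {
 public:
  explicit Instruction(std::string op_type)
      : is_artificial_(op_type == "depend"), op_type_(std::move(op_type)) {}

  bool IsArtificial() const { return is_artificial_; }
  const std::string& Type() const { return op_type_; }

 private:
  bool is_artificial_;  // assists scheduling only; never actually executed
  std::string op_type_;
};

int main() {
  const Instruction dep("depend");
  const Instruction mm("matmul");
  const Instruction* all[] = {&dep, &mm};
  for (const Instruction* in : all) {
    // Mirrors RunInstructionAsync: events would still be waited on and
    // recorded, but run/GC/logging is skipped for artificial instructions.
    if (!in->IsArtificial()) {
      std::printf("execute %s\n", in->Type().c_str());  // only matmul runs
    }
  }
  return 0;
}
```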

paddle/fluid/framework/new_executor/stream_analyzer.cc
Lines changed: 6 additions & 2 deletions

```diff
@@ -239,11 +239,15 @@ platform::DeviceContext* StreamAnalyzer::ParseDeviceContext(
  */
 bool StreamAnalyzer::IsDirectRun(Instruction& cur_instr,
                                  const Instruction& next_instr) {
-  if (&cur_instr.DeviceContext() == &next_instr.DeviceContext()) return true;
+  if (cur_instr.KernelType() == next_instr.KernelType() &&
+      (&cur_instr.DeviceContext() == &next_instr.DeviceContext())) {
+    return true;
+  }
 
   // xpu&ipu memcpy kerenl is synchronous.
-  if (platform::is_ipu_place(place_) || platform::is_xpu_place(place_))
+  if (platform::is_ipu_place(place_) || platform::is_xpu_place(place_)) {
     return true;
+  }
 
   // npu d2h kernel is asynchronous.
   if (platform::is_npu_place(place_) || platform::is_custom_place(place_)) {
```
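
A toy illustration of the tightened fast path: sharing a device context is no longer sufficient on its own, the two instructions must also have the same queue kind. The types below are stand-ins for the real Instruction and DeviceContext.

```cpp
#include <cstdio>

enum class OpFuncType { kQueueSync, kQueueAsync };

// Stand-in carrying only the fields the check consults.
struct Instr {
  OpFuncType type;
  const void* dev_ctx;  // device-context identity, compared by address
};

// Stricter fast path: same queue kind *and* same device context.
bool IsDirectRun(const Instr& cur, const Instr& next) {
  return cur.type == next.type && cur.dev_ctx == next.dev_ctx;
}

int main() {
  int ctx = 0;  // dummy object standing in for a DeviceContext
  const Instr a{OpFuncType::kQueueAsync, &ctx};
  const Instr b{OpFuncType::kQueueSync, &ctx};  // same context, other kind
  std::printf("%d\n", IsDirectRun(a, b));  // 0: falls through to event sync
  return 0;
}
```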

paddle/fluid/framework/operator.cc
Lines changed: 2 additions & 8 deletions

```diff
@@ -1414,16 +1414,10 @@ bool OperatorWithKernel::SupportsKernelType(
 
 bool OperatorWithKernel::CanMKLDNNBeUsed(const framework::ExecutionContext& ctx,
                                          proto::VarType::Type data_type) const {
-  // NOTE(jiahongyu): Only mkldnn kernels need to check "use_mkldnn" attribute,
-  // hence we first call function SupportsMKLDNN. If we check "use_mkldnn"
-  // attribute first, it will cause error because some codes add "use_mkldnn"
-  // attribute to non-mkldnn ops.
-  if (!this->SupportsMKLDNN(data_type)) {
-    return false;
-  }
   const std::string use_mkldnn_attr = "use_mkldnn";
   return ctx.HasAttr(use_mkldnn_attr) && ctx.Attr<bool>(use_mkldnn_attr) &&
-         platform::is_cpu_place(ctx.GetPlace());
+         platform::is_cpu_place(ctx.GetPlace()) &&
+         this->SupportsMKLDNN(data_type);
 }
 
 void OperatorWithKernel::InferShape(InferShapeContext* ctx) const {
```
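
The rewrite folds everything into one short-circuiting conjunction, so SupportsMKLDNN is only evaluated once the attribute and place checks have passed. A standalone sketch of that evaluation order, where every name is a logging stand-in rather than the real operator API:

```cpp
#include <cstdio>

// Stand-ins that log when they are evaluated.
bool HasAttr(bool present) { std::puts("HasAttr"); return present; }
bool AttrValue() { std::puts("Attr<bool>"); return true; }
bool IsCpuPlace() { std::puts("is_cpu_place"); return true; }
bool SupportsMKLDNN() { std::puts("SupportsMKLDNN"); return true; }

// Same shape as the rewritten check: && stops at the first false operand,
// so SupportsMKLDNN is reached only when all earlier tests already passed.
bool CanMKLDNNBeUsed(bool has_attr) {
  return HasAttr(has_attr) && AttrValue() && IsCpuPlace() && SupportsMKLDNN();
}

int main() {
  CanMKLDNNBeUsed(false);  // prints only "HasAttr"
  return 0;
}
```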
