Commit bed700f

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into zyf_slice

2 parents d10c4be + a957734
76 files changed, +4951 −1816 lines


cmake/external/xbyak.cmake (+1 −1)

@@ -44,7 +44,7 @@ ExternalProject_Add(
   DEPENDS ""
   PREFIX ${XBYAK_PREFIX_DIR}
   SOURCE_DIR ${XBYAK_SOURCE_DIR}
-  # UPDATE_COMMAND ""
+  UPDATE_COMMAND ""
   CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${XBYAK_INSTALL_ROOT}
              -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
   CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${XBYAK_INSTALL_ROOT}
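
A note on this one-line change: in ExternalProject_Add, passing an empty string as a step command replaces that step with a no-op, so uncommenting UPDATE_COMMAND "" stops CMake from re-running the update step (a fetch/re-checkout for git-sourced projects) on every build. A minimal sketch of the pattern, with a hypothetical target name and pin:

include(ExternalProject)
ExternalProject_Add(
  extern_xbyak                 # hypothetical target name
  GIT_REPOSITORY https://github.com/herumi/xbyak.git
  GIT_TAG        v5.81         # hypothetical pinned tag
  UPDATE_COMMAND ""            # empty string: skip the update step entirely
  CONFIGURE_COMMAND ""         # xbyak is header-only, so these can be no-ops too
  BUILD_COMMAND ""
  INSTALL_COMMAND "")

With the update step disabled, the pinned checkout is reused as-is, which keeps incremental builds deterministic.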

paddle/fluid/framework/distributed_strategy.proto (+2)

@@ -43,6 +43,8 @@ message ShardingConfig {
   optional bool pp_allreduce_in_optimize = 10 [ default = false ];
   optional int32 pp_degree = 11 [ default = 1 ];
   optional bool optimize_cast = 12 [ default = false ];
+  // Optimizer sharding. Temporary plans and may be deprecated
+  optional bool _dp_as_optimizer_sharding = 13 [ default = false ];
 }
 
 message HybridConfig {
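
The leading underscore and the inline comment both mark _dp_as_optimizer_sharding as an internal, experimental knob rather than public API. As a hedged sketch of how the generated code would expose it (protoc derives accessors by prefixing the verbatim field name, hence the double underscore; the C++ namespace here is an assumption):

#include "paddle/fluid/framework/distributed_strategy.pb.h"  // generated header

int main() {
  paddle::fleet::ShardingConfig cfg;        // namespace is an assumption
  cfg.set__dp_as_optimizer_sharding(true);  // set_ + _dp_as_optimizer_sharding
  return cfg._dp_as_optimizer_sharding() ? 0 : 1;
}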

paddle/fluid/framework/fleet/ps_gpu_wrapper.cc (+5 −1)

@@ -235,13 +235,15 @@ void PSGPUWrapper::BuildTask(std::shared_ptr<HeterContext> gpu_task) {
 
   timeline.Start();
   std::vector<std::vector<std::pair<uint64_t, char*>>> pass_values;
-  uint16_t pass_id = 0;
 
   bool record_status = false;
+#ifdef PADDLE_WITH_PSLIB
+  uint16_t pass_id = 0;
   if (multi_node_) {
     record_status = fleet_ptr->pslib_ptr_->_worker_ptr->take_sparse_record(
         table_id_, pass_id, pass_values);
   }
+#endif
   auto build_func = [device_num, record_status, &pass_values, &local_keys,
                      &local_ptr, &device_keys, &device_vals,
                      &device_mutex](int i) {
@@ -260,6 +262,7 @@ void PSGPUWrapper::BuildTask(std::shared_ptr<HeterContext> gpu_task) {
       task_keys[shard].push_back(local_keys[i][j]);
       task_ptrs[shard].push_back(local_ptr[i][j]);
     }
+#ifdef PADDLE_WITH_PSLIB
     if (record_status) {
       size_t local_keys_size = local_keys.size();
       size_t pass_values_size = pass_values.size();
@@ -275,6 +278,7 @@ void PSGPUWrapper::BuildTask(std::shared_ptr<HeterContext> gpu_task) {
       }
     }
   }
+#endif
   for (int dev = 0; dev < device_num; dev++) {
     device_mutex[dev]->lock();

paddle/fluid/framework/ir/simplify_with_basic_ops_pass.cc (+1 −1)

@@ -157,7 +157,7 @@ bool SimplifyWithBasicOpsPass::SimplifyDropout(
     float scale =
         1.0f - BOOST_GET_CONST(float, dropout_op_desc->GetAttr("dropout_prob"));
 
-    framework::OpDesc new_op_desc;
+    framework::OpDesc new_op_desc(dropout_op_desc->Block());
     new_op_desc.SetType("scale");
     new_op_desc.SetInput("X", {dropout_x->Name()});
     new_op_desc.SetOutput("Out", {dropout_out->Name()});
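
The fix matters because a default-constructed OpDesc is not attached to any BlockDesc, so later code that calls Block() on the replacement scale op can dereference null; passing dropout_op_desc->Block() attaches the new op to the same block as the op it replaces. A minimal sketch of the pitfall, with simplified stand-ins for framework::OpDesc and framework::BlockDesc:

#include <cassert>

struct BlockDesc {};  // simplified stand-in

class OpDesc {        // simplified stand-in
 public:
  OpDesc() : block_(nullptr) {}
  explicit OpDesc(BlockDesc* block) : block_(block) {}
  BlockDesc* Block() const { return block_; }
 private:
  BlockDesc* block_;
};

int main() {
  BlockDesc block;
  OpDesc dropout(&block);
  OpDesc detached;                   // old code: no owning block
  OpDesc attached(dropout.Block());  // new code: inherits dropout's block
  assert(detached.Block() == nullptr);  // any Block()->... here would crash
  assert(attached.Block() == &block);
  return 0;
}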

paddle/fluid/framework/new_executor/interpretercore.cc (+24 −2)

@@ -188,6 +188,8 @@ void InterpreterCore::Convert() {
     BuildAndCacheInstructionCtx(&vec_instruction_[i], *global_scope_, place_);
   }
 
+  BuildSkipShareLoDInfo();
+
   for (size_t i = 0; i < vec_instruction_.size(); ++i) {
     gc_event_.emplace_back(vec_instruction_[i].execution_ctx_.get()->GetPlace(),
                            platform::GenerateDeviceEventFlag());
@@ -225,8 +227,8 @@ void InterpreterCore::BuildAndCacheInstructionCtx(
   instr_node->runtime_ctx_->inputs.swap(ins_map);
   instr_node->runtime_ctx_->outputs.swap(outs_map);
 
-  instr_node->infershape_ctx_.reset(
-      new RuntimeInferShapeContext(*op_base, *instr_node->runtime_ctx_.get()));
+  instr_node->infershape_ctx_.reset(new InterpretercoreInferShapeContext(
+      *op_base, *instr_node->runtime_ctx_.get()));
 
   auto* dev_ctx = instr_node->dev_ctx_;
   Scope scope;
@@ -235,6 +237,26 @@ void InterpreterCore::BuildAndCacheInstructionCtx(
       *op_base, scope, *dev_ctx, *instr_node->runtime_ctx_.get()));
 }
 
+void InterpreterCore::BuildSkipShareLoDInfo() {
+  for (size_t i = 0; i < vec_instruction_.size(); ++i) {
+    bool can_skip_lod = true;
+    for (auto& input : vec_instruction_[i].runtime_ctx_.get()->inputs) {
+      for (auto& var : input.second) {
+        if (var->IsType<LoDTensor>()) {
+          if (var->Get<LoDTensor>().lod().size() != 0) {
+            can_skip_lod = false;
+            break;
+          }
+        } else {
+          can_skip_lod = false;
+          break;
+        }
+      }
+    }
+    vec_instruction_[i].infershape_ctx_.get()->SetSkipLoD(can_skip_lod);
+  }
+}
+
 void InterpreterCore::RunInstruction(const Instruction& instr_node) {
   VLOG(3) << "RunInstruction: "
           << instr_node.kernel_func_.operator_base_->Type();
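
BuildSkipShareLoDInfo runs once, after the instruction list is built: an instruction may skip LoD propagation during InferShape only if every one of its input variables is a LoDTensor whose LoD is empty; any non-LoDTensor input or non-empty LoD forces the flag off. A standalone sketch of the predicate (variable and container types are simplified stand-ins):

#include <map>
#include <string>
#include <vector>

// Simplified stand-in: a variable either holds a LoDTensor (with a
// possibly empty LoD) or some other type.
struct Variable {
  bool is_lod_tensor = true;
  std::vector<std::vector<size_t>> lod;  // empty => nothing to share
};

bool CanSkipShareLoD(
    const std::map<std::string, std::vector<Variable>>& inputs) {
  for (const auto& kv : inputs) {
    for (const auto& var : kv.second) {
      if (!var.is_lod_tensor || !var.lod.empty()) return false;
    }
  }
  return true;
}

Precomputing this flag per instruction moves the check out of the per-step InferShape path, which the new executor runs on every instruction of every iteration.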

paddle/fluid/framework/new_executor/interpretercore.h (+2)

@@ -68,6 +68,8 @@ class InterpreterCore {
 
   void AddFetch(const std::vector<std::string>& fetch_names);
 
+  void BuildSkipShareLoDInfo();
+
   bool is_build_;
 
   const platform::Place& place_;

paddle/fluid/framework/new_executor/interpretercore_util.cc (+3 −3)

@@ -206,7 +206,7 @@ void build_op_func_list(const platform::Place& place,
     RuntimeContext runtime_context({}, {});
     runtime_context.inputs.swap(ins_map);
     runtime_context.outputs.swap(outs_map);
-    RuntimeInferShapeContext infer_shape_ctx(*op_base, runtime_context);
+    InterpretercoreInferShapeContext infer_shape_ctx(*op_base, runtime_context);
     static_cast<const framework::OperatorWithKernel*>(op_base)->InferShape(
         &infer_shape_ctx);
     auto kernels_iter = all_op_kernels.find(op->Type());
@@ -320,8 +320,8 @@ void build_op_func_list(const platform::Place& place,
       RuntimeContext copy_runtime_context({}, {});
       copy_runtime_context.inputs.swap(copy_ins_value_map);
       copy_runtime_context.outputs.swap(copy_outs_value_map);
-      RuntimeInferShapeContext copy_infer_shape_ctx(*copy_op,
-                                                    copy_runtime_context);
+      InterpretercoreInferShapeContext copy_infer_shape_ctx(
+          *copy_op, copy_runtime_context);
       static_cast<const framework::OperatorWithKernel*>(copy_op)
           ->InferShape(&copy_infer_shape_ctx);
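
Together with the interpretercore.cc change above, every InferShape call in the new executor now goes through InterpretercoreInferShapeContext, the executor's own context type, which is what carries the SetSkipLoD hook set by BuildSkipShareLoDInfo. A hypothetical sketch of what such a hook enables (only SetSkipLoD appears in the diff; the class shape and the ShareLoD short-circuit are assumptions):

// Hypothetical sketch, not the real class.
class InterpretercoreInferShapeContext {
 public:
  void SetSkipLoD(bool skip) { can_skip_lod_ = skip; }

  void ShareLoD(/* input/output variable handles elided */) {
    if (can_skip_lod_) return;  // fast path: no LoD to copy at all
    // ... actual LoD propagation elided ...
  }

 private:
  bool can_skip_lod_ = false;
};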
