
Commit 2fad9d0

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into support_scalar_type

2 parents 195c87e + b703025

330 files changed: +20009 additions, -3848 deletions


cmake/external/protobuf.cmake

Lines changed: 3 additions & 0 deletions
@@ -326,6 +326,9 @@ elseif(WITH_IPU)
   set(PROTOBUF_VERSION 21.12)
 elseif(WITH_ARM_BRPC)
   set(PROTOBUF_VERSION 21.12-baidu-ee-common)
+elseif(WIN32)
+  # Lower version protobuf is used for Windows
+  set(PROTOBUF_VERSION 3.2)
 else()
   set(PROTOBUF_VERSION 21.12)
   if(WITH_GPU)

cmake/external/xpu.cmake

Lines changed: 2 additions & 2 deletions
@@ -7,8 +7,8 @@ set(XPU_PROJECT "extern_xpu")
 set(XPU_API_LIB_NAME "libxpuapi.so")
 set(XPU_RT_LIB_NAME "libxpurt.so")
 
-set(XPU_BASE_DATE "20230119")
-set(XPU_XCCL_BASE_VERSION "1.0.7")
+set(XPU_BASE_DATE "20230215")
+set(XPU_XCCL_BASE_VERSION "1.0.8")
 
 if(NOT DEFINED XPU_BASE_URL)
   set(XPU_BASE_URL_WITHOUT_DATE

cmake/flags.cmake

Lines changed: 1 addition & 0 deletions
@@ -149,6 +149,7 @@ if(NOT WIN32)
       -Wno-unused-parameter
       -Wno-unused-function
       -Wno-error=literal-suffix
+      -Wno-error=array-bounds # Warning in Eigen, gcc 12.2
       -Wno-error=ignored-attributes # Warnings in Eigen, gcc 6.3
       -Wno-error=terminate # Warning in PADDLE_ENFORCE
       -Wno-error=int-in-bool-context # Warning in Eigen gcc 7.2

cmake/inference_lib.cmake

Lines changed: 2 additions & 2 deletions
@@ -439,7 +439,7 @@ copy(
   DSTS ${dst_dir}/${module}/allocation)
 
 set(module "platform")
-set(platform_lib_deps profiler_proto errors)
+set(platform_lib_deps phi_profiler_proto errors)
 if(WITH_GPU)
   set(platform_lib_deps ${platform_lib_deps} external_error_proto)
 endif()
@@ -449,7 +449,7 @@ copy(
   fluid_lib_dist
   SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h
        ${src_dir}/${module}/details/*.h
-       ${PADDLE_BINARY_DIR}/paddle/fluid/platform/*.pb.h
+       ${PADDLE_BINARY_DIR}/paddle/phi/api/profiler/*.pb.h
   DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload
        ${dst_dir}/${module}/details ${dst_dir}/${module})

paddle/fluid/distributed/CMakeLists.txt

Lines changed: 2 additions & 5 deletions
@@ -4,11 +4,8 @@ add_subdirectory(fleet_executor)
 if(WITH_PYTHON)
   py_proto_compile(pslib_py_proto SRCS ps.proto)
   py_proto_compile(ps_py_proto SRCS the_one_ps.proto)
-  add_custom_target(
-    ps_py_proto_init ALL
-    COMMAND ${CMAKE_COMMAND} -E make_directory
-            ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto)
-  add_dependencies(ps_py_proto ps_py_proto_init)
+  file(MAKE_DIRECTORY
+       ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto)
   set(PSLIB_PROTO_DSTPATH
       "${PADDLE_SOURCE_DIR}/python/paddle/fluid/incubate/fleet/parameter_server/pslib/"
   )

paddle/fluid/distributed/fleet_executor/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -44,6 +44,7 @@ cc_library(
   message_bus.cc
   dist_model_tensor_wrapper.cc
   DEPS proto_desc
+       standalone_executor
        fleet_executor_desc_proto
        interceptor_message_proto
        task_loop_thread_pool

paddle/fluid/distributed/fleet_executor/carrier.cc

Lines changed: 52 additions & 3 deletions
@@ -28,6 +28,13 @@
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/framework/variable_helper.h"
 
+PADDLE_DEFINE_EXPORTED_bool(
+    fleet_executor_with_standalone,
+    false,
+    "Use standalone executor to run ops. Temporary FLAGS, will be removed "
+    "after all fleet executor cases are modified to run ops with standalone "
+    "executor.");
+
 namespace paddle {
 namespace distributed {
 
@@ -95,7 +102,7 @@ void Carrier::Init(
   thread_pool_.SetThreadNum(thread_num_);
   thread_pool_.Start();
 
-  CreateInterceptors();
+  CreateInterceptors(inference_root_scope_vars);
   is_init_ = true;
 }
 
@@ -279,7 +286,8 @@ static std::shared_ptr<framework::GarbageCollector> GetGC(
   return gc;
 }
 
-void Carrier::CreateInterceptors() {
+void Carrier::CreateInterceptors(
+    const std::vector<std::string>& inference_root_scope_vars) {
   if (interceptor_id_to_node_.empty()) return;
 
   auto gc = GetGC(place_);
@@ -343,7 +351,48 @@ void Carrier::CreateInterceptors() {
     interceptor->SetMiniBatchScope(minibatch_scope_);
     interceptor->SetMicroBatchScope(microbatch_scopes_);
     interceptor->SetRootScope(root_scope_);
-    interceptor->SetGC(gc);
+
+    if (FLAGS_fleet_executor_with_standalone &&
+        (task_node->type() == "Amplifier" || task_node->type() == "Compute")) {
+      std::vector<std::shared_ptr<InterpreterCore>> cores;
+      framework::interpreter::ExecutionConfig execution_config;
+      execution_config.create_local_scope = false;
+      execution_config.force_root_scope_vars = std::set<std::string>(
+          inference_root_scope_vars.begin(), inference_root_scope_vars.end());
+
+      const framework::ProgramDesc* program = task_node->program();
+      PADDLE_ENFORCE_NOT_NULL(
+          program,
+          phi::errors::InvalidArgument("TaskNode %d's program is not set.",
+                                       interceptor_id));
+      std::vector<framework::VarDesc*> all_vars = program->Block(0).AllVars();
+      for (framework::VarDesc* var : all_vars) {
+        execution_config.skip_gc_vars.insert(var->Name());
+      }
+
+      // ONLY unused vars can be GCed.
+      const std::unordered_map<const framework::OperatorBase*,
+                               std::vector<std::string>>& unused_vars =
+          task_node->unused_vars();
+      for (auto& item : unused_vars) {
+        for (const std::string& unused_var : item.second) {
+          execution_config.skip_gc_vars.erase(unused_var);
+        }
+      }
+
+      for (framework::Scope* scope : microbatch_scopes_) {
+        cores.push_back(std::make_shared<InterpreterCore>(
+            place_, task_node->program()->Block(0), scope, execution_config));
+      }
+
+      for (size_t i = 1; i < cores.size(); ++i) {
+        cores[i]->ShareWorkQueueFrom(cores[i - 1]);
+      }
+
+      interceptor->SetInterpreterCore(cores);
+    } else {
+      interceptor->SetGC(gc);
+    }
 
     SetInterceptor(interceptor_id, std::move(interceptor));
     VLOG(3) << "Create Interceptor with interceptor id: " << interceptor_id
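
Note on the GC whitelist in the hunk above: skip_gc_vars starts out holding
every variable in the block, and the per-op unused vars are then erased from
it, so ONLY variables reported as unused stay eligible for collection. (Like
other Paddle exported flags, fleet_executor_with_standalone can typically be
toggled through the matching FLAGS_fleet_executor_with_standalone environment
variable.) A minimal standalone sketch of just that set arithmetic, with
hypothetical variable and op names rather than Paddle API:

#include <iostream>
#include <set>
#include <string>
#include <unordered_map>
#include <vector>

int main() {
  // Stand-ins for program->Block(0).AllVars() and task_node->unused_vars().
  std::vector<std::string> all_vars = {"x", "w", "tmp", "y"};
  std::unordered_map<std::string, std::vector<std::string>> unused_vars = {
      {"matmul", {"tmp"}}};

  // Start by protecting every variable from garbage collection ...
  std::set<std::string> skip_gc_vars(all_vars.begin(), all_vars.end());

  // ... then drop the vars each op reports as unused, so that only unused
  // vars can be GCed -- the same invariant as in CreateInterceptors.
  for (const auto& item : unused_vars) {
    for (const std::string& unused_var : item.second) {
      skip_gc_vars.erase(unused_var);
    }
  }

  for (const std::string& name : skip_gc_vars) {
    std::cout << name << " survives GC\n";  // prints w, x, y
  }
  return 0;
}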

paddle/fluid/distributed/fleet_executor/carrier.h

Lines changed: 2 additions & 1 deletion
@@ -94,7 +94,8 @@ class Carrier final {
   Carrier() = delete;
 
   // create each Interceptor
-  void CreateInterceptors();
+  void CreateInterceptors(
+      const std::vector<std::string>& inference_root_scope_vars = {});
 
   int64_t GetRank(int64_t interceptor_id) const;
paddle/fluid/distributed/fleet_executor/compute_interceptor.cc

Lines changed: 14 additions & 7 deletions
@@ -187,20 +187,27 @@ void ComputeInterceptor::ReplyCompletedToUpStream() {
 }
 
 void ComputeInterceptor::RunOps() {
-  for (auto op : node_->ops()) {
+  if (!cores_.empty() || !node_->ops().empty()) {
     PADDLE_ENFORCE_LT(cur_scope_id_,
                       microbatch_scopes_.size(),
                       platform::errors::InvalidArgument(
                           "Step out of range. There are %ld "
                           "microbatch_scopes, but receive scope index %ld",
                           microbatch_scopes_.size(),
                           cur_scope_id_));
-    op->Run(*microbatch_scopes_[cur_scope_id_], place_);
-    if (gc_) {
-      framework::DeleteUnusedTensors(*microbatch_scopes_[cur_scope_id_],
-                                     op,
-                                     node_->unused_vars(),
-                                     gc_.get());
+  }
+
+  if (!cores_.empty()) {
+    cores_[cur_scope_id_]->Run(/*feed_names=*/{}, /*need_fetch=*/false);
+  } else {
+    for (auto op : node_->ops()) {
+      op->Run(*microbatch_scopes_[cur_scope_id_], place_);
+      if (gc_) {
+        framework::DeleteUnusedTensors(*microbatch_scopes_[cur_scope_id_],
+                                       op,
+                                       node_->unused_vars(),
+                                       gc_.get());
+      }
     }
   }
 }
paddle/fluid/distributed/fleet_executor/interceptor.h

Lines changed: 8 additions & 0 deletions
@@ -24,6 +24,7 @@
 
 #include "paddle/fluid/distributed/fleet_executor/interceptor_message.pb.h"
 #include "paddle/fluid/framework/blocking_queue.h"
+#include "paddle/fluid/framework/new_executor/interpretercore.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/errors.h"
 #include "paddle/fluid/platform/macros.h"
@@ -40,6 +41,8 @@ class TaskNode;
 class Carrier;
 class TaskLoop;
 
+using InterpreterCore = framework::InterpreterCore;
+
 constexpr int64_t SOURCE_ID = -1;
 constexpr int64_t SINK_ID = -2;
 
@@ -75,6 +78,10 @@ class Interceptor {
   void SetMicroBatchScope(const std::vector<framework::Scope*>& scopes) {
     microbatch_scopes_ = scopes;
   }
+  void SetInterpreterCore(
+      const std::vector<std::shared_ptr<InterpreterCore>> cores) {
+    cores_ = cores;
+  }
   void SetGC(const std::shared_ptr<framework::GarbageCollector>& gc) {
     gc_ = gc;
   }
@@ -100,6 +107,7 @@ class Interceptor {
   framework::Scope* root_scope_{nullptr};
   framework::Scope* minibatch_scope_{nullptr};
   std::vector<framework::Scope*> microbatch_scopes_{};
+  std::vector<std::shared_ptr<InterpreterCore>> cores_{};
   std::shared_ptr<framework::GarbageCollector> gc_{nullptr};
 
   Carrier* carrier_;

paddle/fluid/distributed/fleet_executor/task_node.h

Lines changed: 1 addition & 1 deletion
@@ -128,7 +128,7 @@ class TaskNode final {
   // task_id-->type
   std::unordered_map<int64_t, DependType> id_to_dep_type_;
 
-  framework::ProgramDesc* program_;
+  framework::ProgramDesc* program_{nullptr};
   std::string cond_var_;
   std::vector<std::unique_ptr<OperatorBase>> ops_vec_;
   std::unordered_map<const OperatorBase*, std::vector<std::string>>

paddle/fluid/eager/amp_utils.h

Lines changed: 46 additions & 9 deletions
@@ -85,6 +85,39 @@ static inline paddle::experimental::DataType GetPromoteType(
   return dst_type;
 }
 
+inline paddle::experimental::DataType GetDtypeWithPlace(
+    const std::string& op_name,
+    const paddle::small_vector<std::vector<paddle::experimental::Tensor>,
+                               kSlotSmallVectorSize>& amp_tensors_vector,
+    const paddle::experimental::DataType amp_dtype) {
+  if (amp_dtype == paddle::experimental::DataType::FLOAT32) {
+    return amp_dtype;
+  }
+  bool is_right_place = false;
+  for (const auto& tensors : amp_tensors_vector) {
+    for (const auto& tensor : tensors) {
+      auto place = tensor.place();
+      is_right_place = (paddle::platform::is_gpu_place(place) ||
+                        paddle::platform::is_cuda_pinned_place(place) ||
+                        paddle::platform::is_xpu_place(place) ||
+                        paddle::platform::is_mlu_place(place) ||
+                        paddle::platform::is_npu_place(place) ||
+                        paddle::platform::is_npu_pinned_place(place) ||
+                        paddle::platform::is_custom_place(place));
+      if (is_right_place) {
+        break;
+      }
+    }
+  }
+
+  if (!is_right_place) {
+    VLOG(6) << "Change " << op_name << "'s AMP type from " << amp_dtype
+            << " to FP32";
+    return paddle::experimental::DataType::FLOAT32;
+  }
+  return amp_dtype;
+}
+
 inline paddle::experimental::DataType GetAmpDestDtype(
     const std::string& op_name,
     const paddle::small_vector<std::vector<paddle::experimental::Tensor>,
@@ -95,19 +128,21 @@ inline paddle::experimental::DataType GetAmpDestDtype(
   VLOG(6) << "AMP GetAmpDestDtype:"
           << " op(" << op_name << ") amp_dtype(" << amp_dtype << ") amp_level("
           << static_cast<int>(amp_level) << ").";
+  auto return_amp_type = paddle::experimental::DataType::FLOAT16;
+
   if (amp_dtype == "float16") {
     if (amp_level == paddle::imperative::AmpLevel::O1) {
       if (paddle::imperative::AmpOperators::Instance()
              .GetMutableAllowOps()
              ->count(op_name)) {
-        return paddle::experimental::DataType::FLOAT16;
+        return_amp_type = paddle::experimental::DataType::FLOAT16;
       } else if (paddle::imperative::AmpOperators::Instance()
                      .GetMutableBlockOps()
                      ->count(op_name) ||
                  paddle::imperative::AmpOperators::Instance()
                      .GetMutableUnsupportedFp16Ops()
                      ->count(op_name)) {
-        return paddle::experimental::DataType::FLOAT32;
+        return_amp_type = paddle::experimental::DataType::FLOAT32;
       } else {
         auto dst_type = GetPromoteType(op_name,
                                        amp_tensors_vector,
@@ -118,7 +153,7 @@ inline paddle::experimental::DataType GetAmpDestDtype(
                 ->count(op_name)) {
           dst_type = paddle::experimental::DataType::FLOAT32;
         }
-        return dst_type;
+        return_amp_type = dst_type;
       }
     } else if (amp_level == paddle::imperative::AmpLevel::O2) {
       auto dst_type = paddle::experimental::DataType::FLOAT16;
@@ -130,18 +165,18 @@ inline paddle::experimental::DataType GetAmpDestDtype(
               ->count(op_name)) {
         dst_type = paddle::experimental::DataType::FLOAT32;
       }
-      return dst_type;
+      return_amp_type = dst_type;
     }
   } else if (amp_dtype == "bfloat16") {
     if (amp_level == paddle::imperative::AmpLevel::O1) {
       if (paddle::imperative::AmpOperators::Instance()
              .GetMutableAllowOps()
              ->count(op_name)) {
-        return paddle::experimental::DataType::BFLOAT16;
+        return_amp_type = paddle::experimental::DataType::BFLOAT16;
       } else if (paddle::imperative::AmpOperators::Instance()
                      .GetMutableBlockOps()
                      ->count(op_name)) {
-        return paddle::experimental::DataType::FLOAT32;
+        return_amp_type = paddle::experimental::DataType::FLOAT32;
       } else {
         auto dst_type =
             GetPromoteType(op_name,
@@ -153,7 +188,7 @@ inline paddle::experimental::DataType GetAmpDestDtype(
                 ->count(op_name)) {
           dst_type = paddle::experimental::DataType::FLOAT32;
         }
-        return dst_type;
+        return_amp_type = dst_type;
      }
     } else if (amp_level == paddle::imperative::AmpLevel::O2) {
       auto dst_type = paddle::experimental::DataType::BFLOAT16;
@@ -165,10 +200,12 @@ inline paddle::experimental::DataType GetAmpDestDtype(
               ->count(op_name)) {
        dst_type = paddle::experimental::DataType::FLOAT32;
       }
-      return dst_type;
+      return_amp_type = dst_type;
     }
+  } else {
+    return_amp_type = paddle::experimental::DataType::FLOAT32;
   }
-  return paddle::experimental::DataType::FLOAT32;
+  return GetDtypeWithPlace(op_name, amp_tensors_vector, return_amp_type);
 }
 
 }  // namespace egr
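
GetDtypeWithPlace, added above, acts as a final place-based filter on the AMP
decision: a low-precision dtype survives only if at least one input tensor
lives on a place that supports it (GPU, CUDA-pinned, XPU, MLU, NPU,
NPU-pinned, or a custom place); otherwise the op is demoted to FP32. A
simplified model of that rule, using local stand-in types rather than Paddle's
DataType and Place:

#include <cassert>
#include <vector>

// Local stand-ins; the real code inspects paddle::platform::is_*_place().
enum class DataType { FLOAT32, FLOAT16 };
enum class Place { CPU, GPU };

DataType GetDtypeWithPlace(const std::vector<Place>& input_places,
                           DataType amp_dtype) {
  if (amp_dtype == DataType::FLOAT32) return amp_dtype;  // nothing to demote
  for (Place place : input_places) {
    if (place == Place::GPU) {  // the real check accepts several places
      return amp_dtype;         // one supported input keeps the AMP dtype
    }
  }
  return DataType::FLOAT32;  // all inputs on unsupported places: run in FP32
}

int main() {
  assert(GetDtypeWithPlace({Place::CPU}, DataType::FLOAT16) ==
         DataType::FLOAT32);  // demoted: no accelerator input
  assert(GetDtypeWithPlace({Place::CPU, Place::GPU}, DataType::FLOAT16) ==
         DataType::FLOAT16);  // kept: one GPU input
  return 0;
}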

paddle/fluid/eager/eager_amp_auto_cast.h

Lines changed: 6 additions & 1 deletion
@@ -22,14 +22,19 @@ static inline bool NeedCast(const paddle::experimental::Tensor& tensor,
                             const paddle::experimental::DataType& dst_dtype) {
   auto place = tensor.place();
   auto data_type = tensor.dtype();
+  // Except for the CPU check, the conditions here should be consistent with
+  // the checks in amp_utils.h
   if (paddle::platform::is_gpu_place(place) ||
       paddle::platform::is_cuda_pinned_place(place) ||
       paddle::platform::is_xpu_place(place) ||
       paddle::platform::is_mlu_place(place) ||
       paddle::platform::is_npu_place(place) ||
       paddle::platform::is_npu_pinned_place(place) ||
-      paddle::platform::is_custom_place(place)) {
+      paddle::platform::is_custom_place(place) ||
+      paddle::platform::is_cpu_place(place)) {
     // CudaPinnedPlace is added for varbase created by dataloader
+    // CPU place is for tensors on different places, e.g. when input1 is on
+    // CPU and input2 is on GPU
     if ((data_type == paddle::experimental::DataType::FLOAT32 ||
          data_type == paddle::experimental::DataType::FLOAT16 ||
          data_type == paddle::experimental::DataType::BFLOAT16) &&

paddle/fluid/eager/eager_tensor.h

Lines changed: 0 additions & 1 deletion
@@ -19,7 +19,6 @@
 #include "paddle/fluid/framework/variable.h"
 // Phi deps
 #include "paddle/phi/api/include/tensor.h"
-#include "paddle/phi/api/lib/utils/tensor_utils.h"
 #include "paddle/phi/core/compat/convert_utils.h"
 
 namespace egr {
