PaddlePaddle · chengduoZH · May 30, 2019 · May 27, 2019 · May 27, 2019 · May 28, 2019
diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
@@ -20,6 +20,7 @@
 #include "paddle/fluid/framework/details/fetch_op_handle.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace framework {
@@ -50,6 +51,8 @@ FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor(
 FeedFetchList FastThreadedSSAGraphExecutor::Run(
     const std::vector<std::string> &fetch_tensors) {
   VLOG(3) << "enter FastThreadedSSAGraphExecutor Run";
+  std::unique_ptr<platform::RecordEvent> event(
+      new platform::RecordEvent("FastThreadedSSAGraphExecutorPrepare"));
   std::unique_ptr<std::unordered_map<OpHandleBase *, std::atomic<int>>>
       op_deps = atomic_op_deps_.get();
   PrepareAtomicOpDeps();
@@ -64,7 +67,7 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
 
   InsertFetchOps(fetch_tensors, &fetches, &fetched_vars, op_deps.get(),
                  &fetch_ops, &ready_fetch_ops);
-
+  event.reset(nullptr);
   if (strategy_.num_threads_ == 1 && traced_ops_.size() == num_ops) {
     // If the num_threads is 1, we can record the order of operator's
     // execution in the first iteration, and in subsequent iterations,

diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
@@ -31,31 +31,18 @@ ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor(
       underlying_executor_(std::move(underlying_executor)),
       local_scopes_(std::move(local_scopes)),
       var_infos_(std::move(var_infos)),
-      places_(std::move(places)) {}
+      places_(std::move(places)),
+      threads_pool_(1) {
+  PrepareLocalExeScopes();
+}
 
 FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
     const std::vector<std::string> &fetch_tensors) {
   if (drop_scope_counter_ == 0) {
-    // Create local scopes.
-    for (auto it = local_scopes_.rbegin(); it != local_scopes_.rend(); ++it) {
-      auto &scope = *it;
-      Scope &local_scope = scope->NewScope();
-      *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>() =
-          &local_scope;
-
-      for (auto &info : var_infos_) {
-        if (scope->FindVar(info.name_) != nullptr) {
-          continue;
-        }
-
-        if (info.persistable_) {  // Persistable
-          InitializeVariable(scope->Var(info.name_), info.type_);
-        } else {
-          InitializeVariable(local_scope.Var(info.name_), info.type_);
-        }
-      }
-    }
+    platform::RecordEvent e("InitLocalExeScopes");
+    run_op_future_.get();
   }
+
   std::vector<framework::LoDTensor> fetch_data;
   std::exception_ptr eptr = nullptr;
   try {
@@ -64,9 +51,7 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
     eptr = std::current_exception();
   }
 
-  platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun");
   ++drop_scope_counter_;
-
   if (drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
     DropLocalExeScopes();
   }
@@ -78,17 +63,45 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
 }
 
 void ScopeBufferedSSAGraphExecutor::DropLocalExeScopes() {
+  std::unique_ptr<platform::RecordEvent> drop_scope_event(
+      new platform::RecordEvent("DropLocalExeScopes"));  // destroyed at reset()
   drop_scope_counter_ = 0;
   for (auto p : places_) {
     platform::DeviceContextPool::Instance().Get(p)->Wait();
   }
-
   for (auto &scope : local_scopes_) {
     auto &local_scope =
         *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>();
     scope->DeleteScope(local_scope);
     VLOG(3) << "Drop local execution scope: " << local_scope;
   }
+  drop_scope_event.reset(nullptr);  // event object spans only the drop phase
+  // NOTE(review): the call below only enqueues work, so `e` may not time it.
+  platform::RecordEvent e("InitLocalExeScopes");
+  PrepareLocalExeScopes();  // queue re-creation; Run() waits on the future
+}
+
+void ScopeBufferedSSAGraphExecutor::PrepareLocalExeScopes() {
+  run_op_future_ = threads_pool_.enqueue([&] {  // queued on threads_pool_
+    // Reverse walk: give each scope a new child scope and publish its pointer.
+    for (auto it = local_scopes_.rbegin(); it != local_scopes_.rend(); ++it) {
+      auto &scope = *it;
+      Scope &local_scope = scope->NewScope();
+      *scope->Var(kLocalExecScopeName)->GetMutable<Scope *>() = &local_scope;
+      // Initialize vars that scope->FindVar() does not already find.
+      for (auto &info : var_infos_) {
+        if (scope->FindVar(info.name_) != nullptr) {
+          continue;
+        }
+        if (info.persistable_) {  // persistable: created in scope itself
+          InitializeVariable(scope->Var(info.name_), info.type_);
+        } else {  // otherwise: created in the new local scope
+          InitializeVariable(local_scope.Var(info.name_), info.type_);
+        }
+      }
+    }
+    return 1;  // dummy result; Run() calls get() and ignores the value
+  });  // NOTE(review): [&] captures this; confirm task cannot outlive *this
 }
 
 bool ScopeBufferedSSAGraphExecutor::NeedCreateLocalExeScope() {

diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
@@ -13,7 +13,8 @@
 // limitations under the License.
 
 #pragma once
-
+#include <ThreadPool.h>
+#include <list>
 #include <memory>
 #include <string>
 #include <vector>
@@ -51,13 +52,17 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor {
 
   bool NeedCreateLocalExeScope();
 
+  void PrepareLocalExeScopes();
+
  private:
   size_t drop_scope_counter_{0};
   ExecutionStrategy strategy_;
   std::unique_ptr<SSAGraphExecutor> underlying_executor_;
   std::vector<Scope*> local_scopes_;
   std::vector<VariableInfo> var_infos_;
   std::vector<platform::Place> places_;
+  std::future<int> run_op_future_;
+  ::ThreadPool threads_pool_;
 };
 }  // namespace details
 }  // namespace framework

diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
@@ -580,6 +580,7 @@ void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
 
   platform::RecordBlock b(0);
   if (member_->HasGarbageCollectors()) {
+    platform::RecordEvent event("PrepareGarbageCollectors");
     member_->ResetRuntimeReferenceCount(fetch_tensors, fetched_var_name);
   }